# CMLAB-GPU-Mailer/main.py

import requests
import json
import smtplib
import time
from bs4 import BeautifulSoup
from email.mime.text import MIMEText
import os
from mailer import Mailer


# When True, the per-user mail cooldown is bypassed and every notification is
# sent to a fixed test address instead of the violator (see mail_notify()).
DEBUG_MODE = False

# Page that get_server_gpu_status() downloads and parses.
URL = 'https://www.cmlab.csie.ntu.edu.tw/status-gpu/'
# Per-user GPU count limit (number of cards).
GPU_LIMIT = 2
# The two GOOGLE_* values are handed to Mailer(); presumably the sending
# account and its credential -- TODO confirm against mailer.py.
GOOGLE_OWNER = "snsd0805@cmlab.csie.ntu.edu.tw"
# Read from the environment; raises KeyError at import time if GOOGLE_CODE is unset.
GOOGLE_CODE = os.environ['GOOGLE_CODE']
# Minimum number of hours between two mails to the same user.
MAIL_CD_HOUR = 12
# HTML body of the notification mail. The two positional `{}` placeholders are
# filled by mail_notify() with the username and the usage description, in that
# order.
# This is a raw string on purpose: the ASCII-art signature contains backslash
# sequences such as `\/` and `\_`, which are unrecognised escapes in a normal
# string literal (SyntaxWarning on recent Python versions). The resulting text
# is identical to the non-raw form because no valid escape occurs in it.
MAIL_MESSAGE = r'''
Hi, {}<br>
<br>
提醒您，您目前{}，<br>
依照 CMLab 規定，每人在 CML workstation 使用的 GPU 之上限 VRAM 的總和不得超過 50GB<br>
詳細請參考 <a href='https://www.cmlab.csie.ntu.edu.tw/wiki/doku.php?id=workstation:rule#rule_1_%E5%9F%BA%E6%9C%AC%E8%A6%8F%E5%89%87'>wiki 工作站規定</a>
<br>
<br>

為了公平起見，建議您降低 GPU 使用量！<br>
雖然我們不會直接處理，但若有人檢舉，我們會停止您運行的程式！<br>
若有其他特殊需求請來信說明！謝謝您的配合！<br>
<br>
* 信件由網管自動化工具寄出，若為錯誤寄送請忽略此信件。<br>
<br>
Best,<br>
CMLab Unix Manager, Ting-Jun Wang<br>
CMLab, National Taiwan University<br>
Email: unix_manager@cmlab.csie.ntu.edu.tw<br>
<pre>
            __  __     _
           / / / /__  (_)_ __
   __  ___/ /_/ / _ \/ /\ \ /
  /  |/  /\____/_//_/_//_\_\____ ____
 / /|_/ / _ `/ _ \/ _ `/ _ `/ -_) __/
/_/  /_/\_,_/_//_/\_,_/\_, /\__/_/
                      /___/
</pre>
'''

# Module-level Mailer instance; mail_notify() calls its send() method.
mailer = Mailer(GOOGLE_OWNER, GOOGLE_CODE)

def get_server_gpu_status(timeout: float = 30.0) -> list:
    '''
        Download the CMLab GPU status page and extract, for every server,
        its GPUs and the users found on each GPU.

        Input:
            timeout (float): seconds to wait for the HTTP request (default: 30.0)
        Output:
            servers(list):
                [
                    {
                        'name': 'cml5',
                        'users': [
                            ['snsd0805', 'timmy'], ['timmy']
                        ],
                        'gpus': [
                            {'name': 'V100', 'vram': 32768},
                            {'name': 'V100', 'vram': 32768}
                        ]
                    },
                    ...
                ]

            'users'[i] is the sorted, de-duplicated list of users on 'gpus'[i]
            ('gdm' and 'root' are ignored). The example means that 'cml5' has
            2 'V100', the user 'snsd0805' is on 1 GPU and 'timmy' is on 2 GPUs.
            'vram' is the integer parsed from the GPU row; the rest of this
            file treats it as the card's maximum VRAM in MB.
        Raises:
            requests.RequestException: the request failed, timed out or
                returned an HTTP error status.
            RuntimeError: the page contains no <pre> block to parse.
    '''
    system_accounts = {'gdm', 'root'}

    # A timeout keeps an unattended run from hanging on a stalled connection,
    # and an HTTP error page must not be parsed as if it were the status table.
    response = requests.get(URL, timeout=timeout)
    response.raise_for_status()
    page = BeautifulSoup(response.text, 'html.parser')

    # the GPU status is rendered inside the first <pre> element
    table = page.find('pre')
    if table is None:
        raise RuntimeError(f'no <pre> GPU status block found at {URL}')

    servers = []

    # Boxes are separated by blank lines, one box per server; the chunk after
    # the last blank line is skipped.
    for box in table.prettify().split('\n\n')[:-1]:
        # the server name is the span with class 'f7'; boxes without one are ignored
        server = BeautifulSoup(box, 'html.parser').find('span', class_='f7')
        if server is None:
            continue
        server_name = server.text.replace(' ', '')

        gpu_infos = []
        users = []
        for line in box.split('\n'):
            # only lines mentioning 'f6' describe a single GPU
            if 'f6' not in line:
                continue
            row = BeautifulSoup(line, 'html.parser')

            # the VRAM figure is the second span with class 'f3' on the row
            max_vram = int(row.find_all('span', class_='f3')[1].text)

            # GPU name is the span with class 'f4'
            gpu_name = row.find('span', class_='f4').text.replace(' ', '')
            print(gpu_name, max_vram)

            # every span with class 'f0' is a username; drop system accounts
            gpu_users = {span.text for span in row.find_all('span', class_='f0')}
            gpu_users -= system_accounts

            # keep 'gpus' and 'users' index-aligned: users[i] belongs to gpus[i]
            gpu_infos.append({'name': gpu_name, 'vram': max_vram})
            users.append(sorted(gpu_users))

        servers.append({
            'name': server_name,
            'users': users,
            'gpus': gpu_infos,
        })
    return servers

def filter(server_status: list, limit: int = 2, vram_limit_mb: int = 50000) -> list:
    '''
        Apply the usage rules and return the users who violate them.
        Rule currently enforced:
            - the summed VRAM of all GPUs a user occupies (counted across all
              servers) must stay below `vram_limit_mb`

        This function is the single place for such rules, so more of them
        can be added here easily.

        NOTE: the name shadows the builtin `filter`; it is kept because
        callers use it.

        Input:
            server_status (list): from get_server_gpu_status()
            limit (int): per-user GPU count limit (default: 2). Kept for
                backward compatibility; the count-based rule is not enforced,
                so this value is currently unused.
            vram_limit_mb (int): a user whose summed VRAM is greater than or
                equal to this value is reported (default: 50000)
        Output:
            violators (list): one entry per violator, in order of first
                appearance, e.g.
                [
                    {
                        'username': 'snsd0805',
                        'usage': [
                            {'server': 'cml5', 'gpu': 'V100', 'vram': 32768},
                            ...
                        ]
                    }
                ]
                'usage' has one item for every GPU the user occupies.
    '''
    usage = {}

    for server in server_status:
        print(server['name'])
        for gpu_index, gpu_info in enumerate(server['gpus']):
            # 'users' is index-aligned with 'gpus'
            gpu_users = server['users'][gpu_index]
            print("    ", gpu_index, gpu_info['name'], gpu_users)
            for user in gpu_users:
                usage.setdefault(user, []).append({
                    'server': server['name'],
                    'gpu': gpu_info['name'],
                    'vram': gpu_info['vram'],
                })

        print('-')

    return [
        {'username': user, 'usage': state}
        for user, state in usage.items()
        if sum(gpu['vram'] for gpu in state) >= vram_limit_mb
    ]

def mail_notify(violators: list) -> None:
    '''
        Mail a reminder to every violator that is not inside the cooldown window.

        The time of the last mail per user is kept in 'send_log.json' (in the
        current working directory) as
            { username: {'time': <time.time() value>, 'week_count': <int>} }
        'week_count' is increased by one each time a mail is attempted for
        that user. A missing log file is treated as an empty log.

        Input:
            violators (list): from filter(), items shaped like
                {'username': str, 'usage': [{'server', 'gpu', 'vram'}, ...]}
        Output:
            None
    '''
    def check_send(log: dict, username: str) -> bool:
        # In debug mode, or for a user never mailed before, always send.
        if DEBUG_MODE or username not in log:
            return True
        # otherwise remind at most once every MAIL_CD_HOUR hours
        return time.time() - log[username]['time'] >= (MAIL_CD_HOUR*60*60)

    def get_usage_msg(usage: list) -> str:
        # Count cards per server and per GPU model, keeping first-seen order.
        server_usage = {}
        vrams = 0
        for gpu in usage:
            vrams += gpu['vram']
            per_server = server_usage.setdefault(gpu['server'], {})
            per_server[gpu['gpu']] = per_server.get(gpu['gpu'], 0) + 1

        parts = [
            f"在 {server} 上使用 {gpu} * {count}"
            for server, gpus in server_usage.items()
            for gpu, count in gpus.items()
        ]
        return ", ".join(parts) + f'，GPU VRAM 上限之總和為 {vrams} MB VRAM。'

    print(" ===== 寄送 ===== ")

    # get last send time; on the very first run there is no log file yet
    try:
        with open('send_log.json') as fp:
            send_log = json.load(fp)
    except FileNotFoundError:
        send_log = {}

    for violator in violators:
        username = violator['username']

        if not check_send(send_log, username):
            continue

        week_count = 1 if username not in send_log else send_log[username]['week_count']+1
        # update log
        # NOTE(review): the log is written before the mail is sent, so a failed
        # send still starts the cooldown. Kept as-is because what mailer.send()
        # returns is not visible from this file -- confirm before reordering.
        send_log[username] = {
            'time': time.time(),
            'week_count': week_count
        }
        with open('send_log.json', 'w') as fp:
            json.dump(send_log, fp)

        usage_msg = get_usage_msg(violator['usage'])
        print(f'    {username} {usage_msg}')
        msg = MAIL_MESSAGE.format(username, usage_msg)

        # in debug mode the mail goes to the test address, not the violator
        if DEBUG_MODE:
            recipient = 'snsd0805@cmlab.csie.ntu.edu.tw'
        else:
            recipient = f'{username}@cmlab.csie.ntu.edu.tw'

        status = mailer.send(
            'unix_manager@cmlab.csie.ntu.edu.tw',
            recipient,
            f'[網管通知] 提醒您已經超過 CMLab GPU 使用限制！({username})',
            msg,
        )

        print(status)

        if status:
            print(f'    {username}, 郵件傳送成功！')
        else:
            print(f'    {username}, 郵件傳送失敗...')


if __name__ == '__main__':
    # Scrape the current GPU usage, select the users over the limit and mail them.
    server_status = get_server_gpu_status()

    # use the module-level constant instead of repeating the literal
    violators = filter(server_status, GPU_LIMIT)
    mail_notify(violators)