CMLAB-GPU-Mailer/main.py

249 lines
8.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
import json
import smtplib
import time
from bs4 import BeautifulSoup
from email.mime.text import MIMEText
import os
from mailer import Mailer
# When True, the cooldown check in mail_notify() is bypassed and every mail is
# sent to the maintainer's own address instead of the violator.
DEBUG_MODE = False
# Status page scraped by get_server_gpu_status().
URL = 'https://www.cmlab.csie.ntu.edu.tw/status-gpu/'
# Per-user GPU card limit.
# NOTE(review): not referenced anywhere in the visible code; the __main__
# block passes a literal 2 to filter() instead.
GPU_LIMIT = 2
# Credentials handed to Mailer below. GOOGLE_CODE is read from the environment
# at import time, so a missing variable raises KeyError before anything runs.
# NOTE(review): presumably an account/app password; confirm in the mailer module.
GOOGLE_OWNER = "snsd0805@cmlab.csie.ntu.edu.tw"
GOOGLE_CODE = os.environ['GOOGLE_CODE']
# Minimum number of hours between two reminder mails to the same user
# (enforced by check_send() inside mail_notify()).
MAIL_CD_HOUR = 12
# HTML mail body. It is filled with str.format(username, usage_msg), so the
# two '{}' placeholders are positional and the template must not contain any
# other literal braces.
MAIL_MESSAGE = '''
Hi, {}<br>
<br>
提醒您,您目前{}<br>
依照 CMLab 規定,每人在 CML workstation 使用的 GPU 之上限 VRAM 的總和不得超過 50GB<br>
詳細請參考 <a href='https://www.cmlab.csie.ntu.edu.tw/wiki/doku.php?id=workstation:rule#rule_1_%E5%9F%BA%E6%9C%AC%E8%A6%8F%E5%89%87'>wiki 工作站規定</a>
<br>
<br>
為了公平起見,建議您降低 GPU 使用量!<br>
雖然我們不會直接處理,但若有人檢舉,我們會停止您運行的程式!<br>
若有其他特殊需求請來信說明!謝謝您的配合!<br>
<br>
* 信件由網管自動化工具寄出,若為錯誤寄送請忽略此信件。<br>
<br>
Best,<br>
CMLab Unix Manager, Ting-Jun Wang<br>
CMLab, National Taiwan University<br>
Email: unix_manager@cmlab.csie.ntu.edu.tw<br>
<pre>
__ __ _
/ / / /__ (_)_ __
__ ___/ /_/ / _ \/ /\ \ /
/ |/ /\____/_//_/_//_\_\____ ____
/ /|_/ / _ `/ _ \/ _ `/ _ `/ -_) __/
/_/ /_/\_,_/_//_/\_,_/\_, /\__/_/
/___/
</pre>
'''
# Shared Mailer instance used by mail_notify(); constructed at import time.
mailer = Mailer(GOOGLE_OWNER, GOOGLE_CODE)
def get_server_gpu_status() -> list:
    '''
    Fetch the CMLab GPU status page and extract, for every server, its GPUs
    and the users currently running on each GPU.

    The first <pre> element of the page is split on blank lines; every chunk
    ("box") describes one server. Inside a box, each line containing 'f6' is
    treated as one GPU row. The span classes read from the page are:
        f7 - server name
        f4 - GPU name
        f3 - the second such span on a GPU row holds the max VRAM (MB)
        f0 - one span per user on that GPU
    The accounts 'gdm' and 'root' are ignored.

    Input:
        None
    Output:
        servers (list): one dict per server, e.g.
            [
                {
                    'name': 'cml5',
                    'users': [['snsd0805', 'timmy'], ['timmy']],
                    'gpus': [
                        {'name': 'V100', 'vram': 32768},
                        {'name': 'V100', 'vram': 32768},
                    ],
                },
                ...
            ]
        'users'[i] is the sorted list of distinct users on 'gpus'[i]. The
        example means 'cml5' has two V100; 'snsd0805' is on one of them and
        'timmy' is on both.
    Raises:
        requests.RequestException: the page could not be fetched (network
            error, timeout or non-2xx status).
        RuntimeError: the page contains no <pre> status table.
    '''
    ignored_users = {'gdm', 'root'}

    # A timeout keeps an unattended run from hanging forever on a dead server;
    # raise_for_status() surfaces HTTP errors instead of parsing an error page.
    response = requests.get(URL, timeout=30)
    response.raise_for_status()

    page = BeautifulSoup(response.text, 'html.parser')
    table = page.find('pre')
    if table is None:
        raise RuntimeError(f'No <pre> GPU status table found at {URL}')

    servers = []
    # One box per server; the chunk after the last blank line is not a server.
    for box in table.prettify().split('\n\n')[:-1]:
        server = BeautifulSoup(box, 'html.parser').find('span', class_='f7')
        if server is None:
            continue
        server_name = server.text.replace(' ', '')

        gpu_infos = []
        users = []
        for line in box.split('\n'):
            if 'f6' not in line:  # only lines carrying 'f6' describe a GPU
                continue
            row = BeautifulSoup(line, 'html.parser')

            name_span = row.find('span', class_='f4')
            try:
                # the second 'f3' span is the max VRAM in MB
                max_vram = int(row.find_all('span', class_='f3')[1].text)
            except (IndexError, ValueError):
                max_vram = None
            if name_span is None or max_vram is None:
                # A single odd line must not abort the whole scan.
                print(f'[warn] skip unparsable GPU line on {server_name}')
                continue

            gpu_name = name_span.text.replace(' ', '')
            gpu_users = {
                span.text for span in row.find_all('span', class_='f0')
            } - ignored_users
            print(gpu_name, max_vram)

            # gpu_infos and users stay index-aligned: users[i] <-> gpu_infos[i]
            gpu_infos.append({'name': gpu_name, 'vram': max_vram})
            users.append(sorted(gpu_users))

        servers.append({
            'name': server_name,
            'users': users,
            'gpus': gpu_infos,
        })
    return servers
def filter(server_status: list, limit: int = 2, vram_limit_mb: int = 50000) -> list:
    '''
    Return the users whose GPU usage breaks the lab rules.

    Rule currently enforced:
        - the summed max VRAM of every GPU a user occupies (over all servers)
          must stay below `vram_limit_mb`; a sum >= `vram_limit_mb` is a
          violation.
    Rule from 2024/03/22, no longer enforced here:
        - at most `limit` cards per person.
    All rules live in this one function so that new ones are easy to add.

    Input:
        server_status (list): result of get_server_gpu_status()
        limit (int): per-user card limit (default: 2). Kept for backward
            compatibility; the card-count rule is disabled, so it is unused.
        vram_limit_mb (int): VRAM threshold in MB (default: 50000)
    Output:
        violators (list): one dict per violating user, in order of first
            appearance, e.g.
            [
                {
                    'username': 'snsd0805',
                    'usage': [
                        {'server': 'cml5', 'gpu': 'V100', 'vram': 32768},
                        ...
                    ],
                },
            ]
            'usage' holds one entry per GPU the user is running on.
    '''
    usage = {}
    for server in server_status:
        print(server['name'])
        for gpu_index, gpu_info in enumerate(server['gpus']):
            gpu_users = server['users'][gpu_index]
            print(" ", gpu_index, gpu_info['name'], gpu_users)
            for user in gpu_users:
                usage.setdefault(user, []).append({
                    'server': server['name'],
                    'gpu': gpu_info['name'],
                    'vram': gpu_info['vram'],
                })
        print('-')

    return [
        {'username': user, 'usage': state}
        for user, state in usage.items()
        if sum(gpu['vram'] for gpu in state) >= vram_limit_mb
    ]
def mail_notify(violators: list) -> None:
    '''
    Mail a reminder to every violator who is not in the cooldown period.

    The time of the last reminder per user is kept in 'send_log.json' in the
    current working directory. A user is mailed again only after
    MAIL_CD_HOUR hours have passed, unless DEBUG_MODE is on. In DEBUG_MODE
    the mail goes to the maintainer's address instead of the violator.

    The log entry is written only after the mail was sent successfully, so a
    failed send is retried on the next run instead of being silenced for a
    whole cooldown period. A missing log file is treated as an empty log.

    Input:
        violators (list): result of filter(); each item has the keys
            'username' and 'usage'.
    Output:
        None
    '''
    log_path = 'send_log.json'

    def check_send(log: dict, username: str) -> bool:
        '''Return True when `username` may be mailed now.'''
        if DEBUG_MODE or username not in log:
            return True
        # remind at most once every MAIL_CD_HOUR hours
        return time.time() - log[username]['time'] >= MAIL_CD_HOUR * 60 * 60

    def get_usage_msg(usage: list) -> str:
        '''Build the "<server> uses <gpu> * <n>, ..., total VRAM" sentence.'''
        per_server = {}
        for gpu in usage:
            models = per_server.setdefault(gpu['server'], {})
            models[gpu['gpu']] = models.get(gpu['gpu'], 0) + 1

        parts = [
            f"{server} 上使用 {gpu} * {count}"
            for server, gpus in per_server.items()
            for gpu, count in gpus.items()
        ]
        vrams = sum(gpu['vram'] for gpu in usage)
        parts.append(f'GPU VRAM 上限之總和為 {vrams} MB VRAM。')
        # Joining keeps a separator between the last GPU item and the summary.
        return ", ".join(parts)

    print(" ===== 寄送 ===== ")

    # get last send time; the very first run has no log file yet
    try:
        with open(log_path, encoding='utf-8') as fp:
            send_log = json.load(fp)
    except FileNotFoundError:
        send_log = {}

    for violator in violators:
        username = violator['username']
        if not check_send(send_log, username):
            continue

        usage_msg = get_usage_msg(violator['usage'])
        print(f' {username} {usage_msg}')

        if DEBUG_MODE:
            recipient = 'snsd0805@cmlab.csie.ntu.edu.tw'
        else:
            recipient = f'{username}@cmlab.csie.ntu.edu.tw'
        status = mailer.send(
            'unix_manager@cmlab.csie.ntu.edu.tw',
            recipient,
            f'[網管通知] 提醒您已經超過 CMLab GPU 使用限制!({username})',
            MAIL_MESSAGE.format(username, usage_msg),
        )
        print(status)

        if not status:
            print(f' {username}, 郵件傳送失敗...')
            continue
        print(f' {username}, 郵件傳送成功!')

        # update log right away so progress survives a crash later in the loop
        if username in send_log:
            week_count = send_log[username]['week_count'] + 1
        else:
            week_count = 1
        send_log[username] = {
            'time': time.time(),
            'week_count': week_count,
        }
        with open(log_path, 'w', encoding='utf-8') as fp:
            json.dump(send_log, fp)
if __name__ == '__main__':
    # Scrape the status page, pick the users over the limit, then mail them.
    server_status = get_server_gpu_status()
    violators = filter(server_status, GPU_LIMIT)
    mail_notify(violators)