249 lines
8.4 KiB
Python
249 lines
8.4 KiB
Python
import requests
|
||
import json
|
||
import smtplib
|
||
import time
|
||
from bs4 import BeautifulSoup
|
||
from email.mime.text import MIMEText
|
||
import os
|
||
from mailer import Mailer
|
||
|
||
|
||
# When True, mail_notify() skips the cooldown check and sends every mail to
# the maintainer address instead of the actual user.
DEBUG_MODE = False

# Status page that get_server_gpu_status() downloads and parses.
URL = 'https://www.cmlab.csie.ntu.edu.tw/status-gpu/'
# Per-user GPU card limit (same value as the default of filter()'s `limit`).
GPU_LIMIT = 2
# Account handed to Mailer as its first argument.
GOOGLE_OWNER = "snsd0805@cmlab.csie.ntu.edu.tw"
# Secret handed to Mailer as its second argument; read from the environment,
# so importing this module raises KeyError when GOOGLE_CODE is not set.
GOOGLE_CODE = os.environ['GOOGLE_CODE']
# Minimum number of hours between two notification mails to the same user.
MAIL_CD_HOUR = 12
|
||
MAIL_MESSAGE = '''
|
||
Hi, {}<br>
|
||
<br>
|
||
提醒您,您目前{},<br>
|
||
依照 CMLab 規定,每人在 CML workstation 使用的 GPU 之上限 VRAM 的總和不得超過 50GB<br>
|
||
詳細請參考 <a href='https://www.cmlab.csie.ntu.edu.tw/wiki/doku.php?id=workstation:rule#rule_1_%E5%9F%BA%E6%9C%AC%E8%A6%8F%E5%89%87'>wiki 工作站規定</a>
|
||
<br>
|
||
<br>
|
||
|
||
為了公平起見,建議您降低 GPU 使用量!<br>
|
||
雖然我們不會直接處理,但若有人檢舉,我們會停止您運行的程式!<br>
|
||
若有其他特殊需求請來信說明!謝謝您的配合!<br>
|
||
<br>
|
||
* 信件由網管自動化工具寄出,若為錯誤寄送請忽略此信件。<br>
|
||
<br>
|
||
Best,<br>
|
||
CMLab Unix Manager, Ting-Jun Wang<br>
|
||
CMLab, National Taiwan University<br>
|
||
Email: unix_manager@cmlab.csie.ntu.edu.tw<br>
|
||
<pre>
|
||
__ __ _
|
||
/ / / /__ (_)_ __
|
||
__ ___/ /_/ / _ \/ /\ \ /
|
||
/ |/ /\____/_//_/_//_\_\____ ____
|
||
/ /|_/ / _ `/ _ \/ _ `/ _ `/ -_) __/
|
||
/_/ /_/\_,_/_//_/\_,_/\_, /\__/_/
|
||
/___/
|
||
</pre>
|
||
'''
|
||
|
||
# Module-level Mailer instance, built once at import time from the sender
# account and its secret; mail_notify() sends every notification through it.
mailer = Mailer(GOOGLE_OWNER, GOOGLE_CODE)
|
||
|
||
def _parse_gpu_status_html(html: str) -> list:
    '''
    Turn the HTML of the GPU status page into one record per server.

    The page is expected to contain one <pre> element in which the servers
    are separated by blank lines. Inside a server box the CSS classes are
    used as markers: 'f7' is the server name, a line containing 'f6' is one
    GPU, 'f4' is the GPU model, the second 'f3' span is the GPU's maximum
    VRAM in MB, and every 'f0' span is a user running on that GPU.

    Input:
        html (str): full HTML text of the status page.
    Output:
        servers (list): see get_server_gpu_status().
    Raises:
        ValueError: the page has no <pre> element.
    '''
    table = BeautifulSoup(html, 'html.parser').find('pre')
    if table is None:
        raise ValueError('GPU status table (<pre>) not found in the page')

    servers = []
    # One blank-line separated chunk is one server; the chunk after the last
    # blank line is dropped (presumably the page trailer, not a server).
    for box in table.prettify().split('\n\n')[:-1]:
        header = BeautifulSoup(box, 'html.parser').find('span', class_='f7')
        if header is None:
            # Not a server box.
            continue
        server_name = header.text.replace(' ', '')

        gpu_infos = []
        users = []
        for line in box.split('\n'):
            if 'f6' not in line:
                # Only lines carrying the 'f6' class describe a single GPU.
                continue
            row = BeautifulSoup(line, 'html.parser')

            # Maximum VRAM (MB) is the second 'f3' span on the line.
            max_vram = int(row.find_all('span', class_='f3')[1].text)

            # Distinct users on this GPU; system accounts are ignored.
            users_on_gpu = {
                span.text
                for span in row.find_all('span', class_='f0')
                if span.text not in ('gdm', 'root')
            }

            gpu_name = row.find('span', class_='f4').text.replace(' ', '')
            print(gpu_name, max_vram)

            # gpu_infos[i] and users[i] describe the same GPU.
            gpu_infos.append({'name': gpu_name, 'vram': max_vram})
            users.append(list(users_on_gpu))

        servers.append({
            'name': server_name,
            'users': users,
            'gpus': gpu_infos,
        })
    return servers


def get_server_gpu_status(timeout: float = 10.0) -> list:
    '''
    Download the CMLab GPU status page and extract, for every server, its
    GPUs and the users currently running on each of them.

    Input:
        timeout (float): seconds to wait for the web server before giving up
            (default: 10). Without a timeout a stalled server would block
            the script forever.
    Output:
        servers(list):
            [
                {
                    'name': 'cml5',
                    'users': [
                        ['snsd0805', 'timmy'], ['timmy']
                    ],
                    'gpus': [
                        {'name': 'V100', 'vram': 32768},
                        {'name': 'V100', 'vram': 32768}
                    ]
                },
                ...
            ]

            means that the server 'cml5' has 2 'V100' (max VRAM in MB each),
            the user 'snsd0805' is using 1 GPU and the user 'timmy' is using
            2 GPUs now. users[i] belongs to gpus[i].
    Raises:
        requests.RequestException: the page could not be fetched (network
            error, timeout or HTTP error status).
        ValueError: the page does not contain the status table.
    '''
    response = requests.get(URL, timeout=timeout)
    # Do not try to parse an error page as if it were the status table.
    response.raise_for_status()
    return _parse_gpu_status_html(response.text)
|
||
|
||
def filter(server_status: list, limit: int = 2, vram_limit_mb: int = 50000) -> list:
    '''
    Apply the usage rules and return the users who violate them.

    Rule currently enforced:
        - The summed maximum VRAM of all GPUs a user is running on must stay
          below `vram_limit_mb`; a user is reported once the sum reaches it.

    The older "at most `limit` cards per user" rule is no longer applied.
    All rules live in this one function so that new ones can be added
    easily.

    Input:
        server_status (list): from get_server_gpu_status()
        limit (int): GPU card limit (default: 2). Accepted for backward
            compatibility only; it is not used by the current rule.
        vram_limit_mb (int): VRAM budget per user in MB (default: 50000).
    Output:
        violators (list): one entry per violating user, e.g.
            [
                {
                    'username': 'snsd0805',
                    'usage': [
                        {'server': 'cml5', 'gpu': 'V100', 'vram': 32768},
                        ...
                    ]
                }
            ]
            'usage' holds one item per GPU the user occupies.
    '''
    # username -> list of the GPUs (one dict per GPU) the user is running on
    usage = {}

    for server in server_status:
        print(server['name'])
        for gpu_index, gpu_info in enumerate(server['gpus']):
            # users[i] lists the users of gpus[i]
            users_on_gpu = server['users'][gpu_index]
            print(" ", gpu_index, gpu_info['name'], users_on_gpu)
            for user in users_on_gpu:
                usage.setdefault(user, []).append({
                    'server': server['name'],
                    'gpu': gpu_info['name'],
                    'vram': gpu_info['vram'],
                })

        print('-')

    violators = []
    for user, state in usage.items():
        total_vram = sum(gpu['vram'] for gpu in state)
        if total_vram >= vram_limit_mb:
            violators.append({'username': user, 'usage': state})

    return violators
|
||
|
||
def mail_notify(violators: list) -> None:
    '''
    Mail every violator a reminder, at most once per MAIL_CD_HOUR hours.

    The time of the last successful mail per user is kept in
    'send_log.json' (relative to the current working directory). A missing
    log file is treated as "nobody has been notified yet". The log is only
    updated after the mail was sent successfully, so a failed delivery is
    retried on the next run instead of being silenced for a whole cooldown
    period.

    Input:
        violators (list): from filter(); each item has 'username' and
            'usage' (list of {'server', 'gpu', 'vram'} dicts).
    Output:
        None
    '''
    def check_send(log: dict, username: str) -> bool:
        '''Return True when `username` may be mailed now.'''
        if DEBUG_MODE:
            # Debugging: always send, ignore the cooldown.
            return True
        if username not in log:
            return True
        # Remind the same user at most once every MAIL_CD_HOUR hours.
        return time.time() - log[username]['time'] >= (MAIL_CD_HOUR * 60 * 60)

    def get_usage_msg(usage: list) -> str:
        '''Describe the user's GPUs per server plus the summed max VRAM.'''
        # server name -> {gpu name -> number of such GPUs in use}
        server_usage = {}
        vrams = 0
        for gpu in usage:
            vrams += gpu['vram']
            per_server = server_usage.setdefault(gpu['server'], {})
            per_server[gpu['gpu']] = per_server.get(gpu['gpu'], 0) + 1

        parts = [
            f"在 {server} 上使用 {gpu} * {count}"
            for server, gpus in server_usage.items()
            for gpu, count in gpus.items()
        ]
        return ", ".join(parts) + f',GPU VRAM 上限之總和為 {vrams} MB VRAM。'

    print(" ===== 寄送 ===== ")

    # Load the time of the last mail per user; start empty on the first run.
    try:
        with open('send_log.json') as fp:
            send_log = json.load(fp)
    except FileNotFoundError:
        send_log = {}

    for violator in violators:
        username = violator['username']

        if not check_send(send_log, username):
            continue

        usage_msg = get_usage_msg(violator['usage'])
        print(f' {username} {usage_msg}')
        msg = MAIL_MESSAGE.format(username, usage_msg)

        # In debug mode the mail goes to the maintainer, not to the user.
        if DEBUG_MODE:
            receiver = 'snsd0805@cmlab.csie.ntu.edu.tw'
        else:
            receiver = f'{username}@cmlab.csie.ntu.edu.tw'

        status = mailer.send(
            'unix_manager@cmlab.csie.ntu.edu.tw',
            receiver,
            f'[網管通知] 提醒您已經超過 CMLab GPU 使用限制!({username})',
            msg,
        )

        print(status)

        if status:
            # Start the cooldown only now that the mail really went out, and
            # persist right away so a later crash cannot lose the record.
            send_log[username] = {
                'time': time.time(),
                'week_count': send_log.get(username, {}).get('week_count', 0) + 1,
            }
            with open('send_log.json', 'w') as fp:
                json.dump(send_log, fp)
            print(f' {username}, 郵件傳送成功!')
        else:
            print(f' {username}, 郵件傳送失敗...')
|
||
|
||
|
||
if __name__ == '__main__':
    # Scrape the current GPU usage, pick the users over the limit and
    # notify them by mail.
    server_status = get_server_gpu_status()

    # Use the module-level constant rather than a second hard-coded copy.
    violators = filter(server_status, GPU_LIMIT)
    mail_notify(violators)
|