feat: detect users using 3+ cards across multiple servers (rule)

This commit is contained in:
Ting-Jun Wang 2024-03-22 19:52:06 +08:00
parent 66e4b9ec51
commit a9c3e63b44
Signed by: snsd0805
GPG Key ID: D175E969960C4B16

82
main.py
View File

@ -6,7 +6,7 @@ from bs4 import BeautifulSoup
from email.mime.text import MIMEText from email.mime.text import MIMEText
import os import os
DEBUG_MODE = True DEBUG_MODE = False
URL = 'https://www.cmlab.csie.ntu.edu.tw/status-gpu/' URL = 'https://www.cmlab.csie.ntu.edu.tw/status-gpu/'
GPU_LIMIT = 2 GPU_LIMIT = 2
@ -15,12 +15,12 @@ MAIL_CD_HOUR = 6
MAIL_MESSAGE = ''' MAIL_MESSAGE = '''
Hi, {}<br> Hi, {}<br>
<br> <br>
提醒您您目前 {} 伺服器上已經使用了 {} {}<br> 提醒您您目前{}<br>
依照 CMLab 規定每人在每台 CML workstation 上至多使用 2 GPU詳細規定請參閱 https://www.cmlab.csie.ntu.edu.tw/wiki/doku.php?id=workstation:rule <br> 依照 CMLab 規定每人在 CML workstation 上至多使用 2 GPU <br>
<br> <br>
為了公平起見建議您降低 GPU 使用量<br> 為了公平起見建議您降低 GPU 使用量<br>
雖然我們不會直接處理但若有人檢舉我們會停止您運行的程式<br> 雖然我們不會直接處理但若有人檢舉我們會停止您運行的程式<br>
若有其他問題歡迎來信討論謝謝您的配合<br> 若有其他特殊需求請來信說明謝謝您的配合<br>
<br> <br>
* 信件由網管自動化工具寄出若為錯誤寄送請忽略此信件<br> * 信件由網管自動化工具寄出若為錯誤寄送請忽略此信件<br>
<br> <br>
@ -106,7 +106,7 @@ def get_server_gpu_status() -> list:
servers.append({ servers.append({
'name': server_name, 'name': server_name,
'users': users, 'users': users,
'gpu': gpu_names 'gpus': gpu_names
}) })
return servers return servers
@ -114,8 +114,8 @@ def filter(server_status: list, limit: int = 2) -> list:
''' '''
You can set some rule here. You can set some rule here.
It will return a list which contains the usernames that exceed the limit. It will return a list which contains the usernames that exceed the limit.
Rules now (2024/03/11): Rules now (2024/03/22):
- 每人每台僅限使用 2 張卡 - 每人僅限使用 2 張卡
I added this function because we may need more filter rules. We can easily add more rules in this function. I added this function because we may need more filter rules. We can easily add more rules in this function.
@ -125,29 +125,28 @@ def filter(server_status: list, limit: int = 2) -> list:
Output: Output:
violators (list): a violator list(set) e.g. [ {'user': 'snsd0805', 'usage': [{'server': 'cml5', 'gpu': 'NVDIA3090', 'number': 3}...] }] means that the violator 'snsd0805' is using 3 GPU. violators (list): a violator list(set) e.g. [ {'user': 'snsd0805', 'usage': [{'server': 'cml5', 'gpu': 'NVDIA3090', 'number': 3}...] }] means that the violator 'snsd0805' is using 3 GPU.
''' '''
violators = [] usage = {}
counter = {}
for server in server_status: for server in server_status:
print(server) print(server['name'])
''' for gpu_index, gpu_name in enumerate(server['gpus']):
for user in server_status['users']: print(" ", gpu_index, gpu_name, server['users'][gpu_index])
if user not in counter: for user in server['users'][gpu_index]:
counter[user] = 1 if user not in usage:
else: usage[user] = [{'server': server['name'], 'gpu': gpu_name}]
counter[user] += 1 else:
usage[user].append({'server': server['name'], 'gpu': gpu_name})
for k, v in counter.items():
if v > limit: print('-')
violators.append({
'user': k, violators = []
'gpu': v, for user, state in usage.items():
}) if len(state) > limit:
''' violators.append({'username': user, 'usage': state})
return violators return violators
def mail_notify(server_name: str, gpu_name: str, violators: list) -> None: def mail_notify(violators: list) -> None:
def check_send(log: dict, username: str) -> bool: def check_send(log: dict, username: str) -> bool:
if DEBUG_MODE: if DEBUG_MODE:
return True return True
@ -160,21 +159,40 @@ def mail_notify(server_name: str, gpu_name: str, violators: list) -> None:
else: else:
return False return False
def get_usage_msg(usage: list) -> str:
    """Summarize a violator's GPU usage as one human-readable string.

    Args:
        usage: one entry per occupied GPU; each entry is a dict with the
            keys 'server' (server name) and 'gpu' (GPU model name).

    Returns:
        Comma-separated fragments of the form
        "<server> 上使用 <gpu> * <count>", grouped by server in order of
        first appearance (and by GPU model in order of first appearance
        within each server). An empty string when `usage` is empty.
    """
    # server name -> {gpu name -> number of entries seen}
    server_usage = {}
    for entry in usage:
        per_server = server_usage.setdefault(entry['server'], {})
        per_server[entry['gpu']] = per_server.get(entry['gpu'], 0) + 1

    # join() places the ", " separator only between fragments, so no
    # trailing separator has to be trimmed afterwards.
    return ", ".join(
        f"{server} 上使用 {gpu} * {count}"
        for server, gpus in server_usage.items()
        for gpu, count in gpus.items()
    )
print(" ===== 寄送 ===== ")
# get last send time # get last send time
with open('send_log.json') as fp: with open('send_log.json') as fp:
send_log = json.load(fp) send_log = json.load(fp)
for violator in violators: for violator in violators:
username = violator['user'] username = violator['username']
print(f' {username}')
if check_send(send_log, username): if check_send(send_log, username):
# update log # update log
send_log[username] = time.time() send_log[username] = time.time()
with open('send_log.json', 'w') as fp: with open('send_log.json', 'w') as fp:
json.dump(send_log, fp) json.dump(send_log, fp)
usage = violator['gpu'] usage_msg = get_usage_msg(violator['usage'])
msg = MAIL_MESSAGE.format(username, server_name, usage, gpu_name) print(f' {username} {usage_msg}')
msg = MAIL_MESSAGE.format(username, usage_msg)
msg = MIMEText(msg, 'html') # 郵件內文 msg = MIMEText(msg, 'html') # 郵件內文
msg['Subject'] = f'[網管通知] 提醒您已經超過 CMLab GPU 使用限制!({username})' + "(DEBUG MODE)" if DEBUG_MODE else "" msg['Subject'] = f'[網管通知] 提醒您已經超過 CMLab GPU 使用限制!({username})' + "(DEBUG MODE)" if DEBUG_MODE else ""
msg['From'] = 'unix_manager@cmlab.csie.ntu.edu.tw' msg['From'] = 'unix_manager@cmlab.csie.ntu.edu.tw'
@ -196,10 +214,10 @@ def mail_notify(server_name: str, gpu_name: str, violators: list) -> None:
if __name__ == '__main__': if __name__ == '__main__':
server_status = get_server_gpu_status() server_status = get_server_gpu_status()
for i in server_status:
print(i)
violators = filter(server_status, 2) violators = filter(server_status, 2)
mail_notify(violators)
''' '''
for server in server_status: for server in server_status:
print(server['name'], server['gpu']) print(server['name'], server['gpu'])