From a9c3e63b4498dc490a07378900229014aff937bb Mon Sep 17 00:00:00 2001 From: Ting-Jun Wang Date: Fri, 22 Mar 2024 19:52:06 +0800 Subject: [PATCH] feat: detect use 3 card on multi server (rule) --- main.py | 82 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 50 insertions(+), 32 deletions(-) diff --git a/main.py b/main.py index 7d0b495..183fb50 100644 --- a/main.py +++ b/main.py @@ -6,7 +6,7 @@ from bs4 import BeautifulSoup from email.mime.text import MIMEText import os -DEBUG_MODE = True +DEBUG_MODE = False URL = 'https://www.cmlab.csie.ntu.edu.tw/status-gpu/' GPU_LIMIT = 2 @@ -15,12 +15,12 @@ MAIL_CD_HOUR = 6 MAIL_MESSAGE = ''' Hi, {}

-提醒您,您目前在 {} 伺服器上已經使用了 {} 張 {},
-依照 CMLab 規定,每人在每台 CML workstation 上至多使用 2 顆 GPU,詳細規定請參閱 https://www.cmlab.csie.ntu.edu.tw/wiki/doku.php?id=workstation:rule
+提醒您,您目前{},
+依照 CMLab 規定,每人在 CML workstation 上至多使用 2 顆 GPU

為了公平起見,建議您降低 GPU 使用量!
雖然我們不會直接處理,但若有人檢舉,我們會停止您運行的程式!
-若有其他問題歡迎來信討論!謝謝您的配合!
+若有其他特殊需求請來信說明!謝謝您的配合!

* 信件由網管自動化工具寄出,若為錯誤寄送請忽略此信件。

@@ -106,7 +106,7 @@ def get_server_gpu_status() -> list: servers.append({ 'name': server_name, 'users': users, - 'gpu': gpu_names + 'gpus': gpu_names }) return servers @@ -114,8 +114,8 @@ def filter(server_status: list, limit: int = 2) -> list: ''' You can set some rule here. it will return the a list which contains the username that exceed the limit. - Rules now (2024/03/11): - - 每人每台僅限使用 2 張卡 + Rules now (2024/03/22): + - 每人僅限使用 2 張卡 I add this function beacause that we may need more filter rules. we can add some rules in this function easily @@ -125,29 +125,28 @@ def filter(server_status: list, limit: int = 2) -> list: Output: violators (list): a violator list(set) e.g. [ {'user': 'snsd0805', 'usage': [{'server': 'cml5', 'gpu': 'NVDIA3090', 'number': 3}...] }] means that the violator 'snsd0805' is using 3 GPU. ''' - violators = [] - - counter = {} + usage = {} for server in server_status: - print(server) - ''' - for user in server_status['users']: - if user not in counter: - counter[user] = 1 - else: - counter[user] += 1 - - for k, v in counter.items(): - if v > limit: - violators.append({ - 'user': k, - 'gpu': v, - }) - ''' + print(server['name']) + for gpu_index, gpu_name in enumerate(server['gpus']): + print(" ", gpu_index, gpu_name, server['users'][gpu_index]) + for user in server['users'][gpu_index]: + if user not in usage: + usage[user] = [{'server': server['name'], 'gpu': gpu_name}] + else: + usage[user].append({'server': server['name'], 'gpu': gpu_name}) + + print('-') + + violators = [] + for user, state in usage.items(): + if len(state) > limit: + violators.append({'username': user, 'usage': state}) + return violators -def mail_notify(server_name: str, gpu_name: str, violators: list) -> None: +def mail_notify(violators: list) -> None: def check_send(log: dict, username: str) -> bool: if DEBUG_MODE: return True @@ -160,21 +159,40 @@ def mail_notify(server_name: str, gpu_name: str, violators: list) -> None: else: return False + def get_usage_msg(usage: list) -> str: + server_usage = {} + ans = "" + for gpu in usage: + if gpu['server'] not in server_usage: + server_usage[gpu['server']] = { gpu['gpu']: 1 } + else: + if gpu['gpu'] not in server_usage[gpu['server']]: + server_usage[gpu['server']][gpu['gpu']] = 1 + else: + server_usage[gpu['server']][gpu['gpu']] += 1 + + for server, gpus in server_usage.items(): + for gpu, count in gpus.items(): + ans += f"在 {server} 上使用 {gpu} * {count}, " + return ans[:-2] + + print(" ===== 寄送 ===== ") + # get last send time with open('send_log.json') as fp: send_log = json.load(fp) for violator in violators: - username = violator['user'] - print(f' {username}') + username = violator['username'] if check_send(send_log, username): # update log send_log[username] = time.time() with open('send_log.json', 'w') as fp: json.dump(send_log, fp) - usage = violator['gpu'] - msg = MAIL_MESSAGE.format(username, server_name, usage, gpu_name) + usage_msg = get_usage_msg(violator['usage']) + print(f' {username} {usage_msg}') + msg = MAIL_MESSAGE.format(username, usage_msg) msg = MIMEText(msg, 'html') # 郵件內文 msg['Subject'] = f'[網管通知] 提醒您已經超過 CMLab GPU 使用限制!({username})' + "(DEBUG MODE)" if DEBUG_MODE else "" msg['From'] = 'unix_manager@cmlab.csie.ntu.edu.tw' @@ -196,10 +214,10 @@ def mail_notify(server_name: str, gpu_name: str, violators: list) -> None: if __name__ == '__main__': server_status = get_server_gpu_status() - for i in server_status: - print(i) violators = filter(server_status, 2) + mail_notify(violators) + ''' for server in server_status: print(server['name'], server['gpu'])