From 26137b8a15214ee453be3259bf1d26822d770ae6 Mon Sep 17 00:00:00 2001 From: Ting-Jun Wang Date: Mon, 11 Nov 2024 16:49:56 +0800 Subject: [PATCH] feat: change to new rules --- main.py | 42 ++++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/main.py b/main.py index 48c7f11..fcfa5da 100644 --- a/main.py +++ b/main.py @@ -19,8 +19,11 @@ MAIL_MESSAGE = ''' Hi, {}

提醒您,您目前{},
-依照 CMLab 規定,每人在 CML workstation 上至多使用 2 顆 GPU
+依照 CMLab 規定,每人在 CML workstation 使用的 GPU 之上限 VRAM 的總和不得超過 50GB
+詳細請參考 wiki 工作站規定
+
+ 為了公平起見,建議您降低 GPU 使用量!
雖然我們不會直接處理,但若有人檢舉,我們會停止您運行的程式!
若有其他特殊需求請來信說明!謝謝您的配合!
@@ -84,7 +87,7 @@ def get_server_gpu_status() -> list: server = soup.find('span', class_='f7') if server: server_name = server.text.replace(' ', '') - gpu_names = [] + gpu_infos = [] users = [] gpus = box.split('\n') @@ -92,6 +95,9 @@ def get_server_gpu_status() -> list: if 'f6' in i: # if this line is for a single GPU informations soup = BeautifulSoup(i, 'html.parser') + # get max VRAM MB number + max_VRAM = int(soup.findAll('span', class_='f3')[1].text) + # get all users who is using this server's GPU resources. user_objs = soup.findAll('span', class_='f0') user_in_this_gpu = set() @@ -105,15 +111,16 @@ def get_server_gpu_status() -> list: # get gpu names gpu_name = soup.find('span', class_='f4').text.replace(' ', '') + print(gpu_name, max_VRAM) # log gpu name & users on this GPU - gpu_names.append(gpu_name) + gpu_infos.append({'name': gpu_name, 'vram': max_VRAM}) users.append(list(user_in_this_gpu)) servers.append({ 'name': server_name, 'users': users, - 'gpus': gpu_names + 'gpus': gpu_infos, }) return servers @@ -136,20 +143,25 @@ def filter(server_status: list, limit: int = 2) -> list: for server in server_status: print(server['name']) - for gpu_index, gpu_name in enumerate(server['gpus']): - print(" ", gpu_index, gpu_name, server['users'][gpu_index]) + for gpu_index, gpu_info in enumerate(server['gpus']): + print(" ", gpu_index, gpu_info['name'], server['users'][gpu_index]) for user in server['users'][gpu_index]: if user not in usage: - usage[user] = [{'server': server['name'], 'gpu': gpu_name}] + usage[user] = [{'server': server['name'], 'gpu': gpu_info['name'], 'vram': gpu_info['vram']}] else: - usage[user].append({'server': server['name'], 'gpu': gpu_name}) + usage[user].append({'server': server['name'], 'gpu': gpu_info['name'], 'vram': gpu_info['vram']}) print('-') violators = [] for user, state in usage.items(): - if len(state) > limit: - violators.append({'username': user, 'usage': state}) + # if len(state) > limit: + # violators.append({'username': user, 'usage': state}) + vrams = 0 + for i in state: + vrams += i['vram'] + if vrams >= 50000: + violators.append({'username': user, 'usage': state}) return violators @@ -169,7 +181,9 @@ def mail_notify(violators: list) -> None: def get_usage_msg(usage: list) -> str: server_usage = {} ans = "" + vrams = 0 for gpu in usage: + vrams += gpu['vram'] if gpu['server'] not in server_usage: server_usage[gpu['server']] = { gpu['gpu']: 1 } else: @@ -181,7 +195,9 @@ def mail_notify(violators: list) -> None: for server, gpus in server_usage.items(): for gpu, count in gpus.items(): ans += f"在 {server} 上使用 {gpu} * {count}, " - return ans[:-2] + ans = ans[:-2] + ans += f',GPU VRAM 上限之總和為 {vrams} MB VRAM。' + return ans print(" ===== 寄送 ===== ") @@ -208,7 +224,9 @@ def mail_notify(violators: list) -> None: status = mailer.send('unix_manager@cmlab.csie.ntu.edu.tw', f'{username}@cmlab.csie.ntu.edu.tw' if not DEBUG_MODE else 'snsd0805@cmlab.csie.ntu.edu.tw', \ f'[網管通知] 提醒您已經超過 CMLab GPU 使用限制!({username})', msg) - if status == {}: + print(status) + + if status: print(f' {username}, 郵件傳送成功!') else: print(f' {username}, 郵件傳送失敗...')