Compare commits

..

4 Commits

Author SHA1 Message Date
a9c3e63b44
feat: detect use of 3 cards across multiple servers (rule) 2024-03-22 19:52:06 +08:00
66e4b9ec51
docs: update comment 2024-03-22 17:40:17 +08:00
e907d42043
fix: get all users on the same GPU
Problem:
    Sometimes multiple users may run different programs on the same
    GPU if there is enough VRAM.
    But the original program only got the first username on the webpage,
    so we could miss some users.

Solution:
    For each server, I get every GPU's name and all the usernames on its line
    on the webpage.

    So I will return a dict that contains all GPU names and the usernames on every
    GPU.
2024-03-22 17:34:44 +08:00
834bed6eb0
feat: debug mode & fix 'unix manager' ASCII art in mail
- Add debug mode. In debug mode, mails are sent without
  any CD (cooldown) time, and only my Gmail account is used as the receiver so that
  the other admins will not receive the DEBUG mail.
- Send the mail with an HTML header so that we can use the <pre> tag to show
  the ASCII art
2024-03-22 16:31:37 +08:00

148
main.py
View File

@ -6,27 +6,29 @@ from bs4 import BeautifulSoup
from email.mime.text import MIMEText from email.mime.text import MIMEText
import os import os
DEBUG_MODE = False
URL = 'https://www.cmlab.csie.ntu.edu.tw/status-gpu/' URL = 'https://www.cmlab.csie.ntu.edu.tw/status-gpu/'
GPU_LIMIT = 2 GPU_LIMIT = 2
GOOGLE_CODE = os.environ['GOOGLE_CODE'] GOOGLE_CODE = os.environ['GOOGLE_CODE']
MAIL_CD_HOUR = 6 MAIL_CD_HOUR = 6
MAIL_MESSAGE = ''' MAIL_MESSAGE = '''
Hi, {} Hi, {}<br>
<br>
提醒您您目前{} 伺服器上已經使用了 {} {} 提醒您您目前{}<br>
依照 CMLab 規定每人在每台 CML workstation 上至多使用 2 GPU詳細規定請參閱 https://www.cmlab.csie.ntu.edu.tw/wiki/doku.php?id=workstation:rule 依照 CMLab 規定每人在 CML workstation 上至多使用 2 GPU <br>
<br>
為了公平起見建議您降低 GPU 使用量 為了公平起見建議您降低 GPU 使用量<br>
雖然我們不會直接處理但若有人檢舉我們會停止您運行的程式 雖然我們不會直接處理但若有人檢舉我們會停止您運行的程式<br>
若有其他問題歡迎來信討論謝謝您的配合 若有其他特殊需求請來信說明謝謝您的配合<br>
<br>
* 信件由網管自動化工具寄出若為錯誤寄送請忽略此信件 * 信件由網管自動化工具寄出若為錯誤寄送請忽略此信件<br>
<br>
Best, Best,<br>
CMLab Unix Manager, Ting-Jun Wang CMLab Unix Manager, Ting-Jun Wang<br>
CMLab, National Taiwan University CMLab, National Taiwan University<br>
Email: unix_manager@cmlab.csie.ntu.edu.tw Email: unix_manager@cmlab.csie.ntu.edu.tw<br>
<pre>
__ __ _ __ __ _
/ / / /__ (_)_ __ / / / /__ (_)_ __
__ ___/ /_/ / _ \/ /\ \ / __ ___/ /_/ / _ \/ /\ \ /
@ -34,6 +36,7 @@ Email: unix_manager@cmlab.csie.ntu.edu.tw
/ /|_/ / _ `/ _ \/ _ `/ _ `/ -_) __/ / /|_/ / _ `/ _ \/ _ `/ _ `/ -_) __/
/_/ /_/\_,_/_//_/\_,_/\_, /\__/_/ /_/ /_/\_,_/_//_/\_,_/\_, /\__/_/
/___/ /___/
</pre>
''' '''
def get_server_gpu_status() -> list: def get_server_gpu_status() -> list:
@ -49,14 +52,14 @@ def get_server_gpu_status() -> list:
{ {
'name': 'cml5' 'name': 'cml5'
'users': [ 'users': [
'snsd0805', 'snsd0805', 'timmy' ['snsd0805', 'timmy'], ['timmy']
], ],
'gpu': 'V100' 'gpus': ['V100', 'V100']
}, },
... ...
] ]
means that 'cml5' server has 3 'V100', and the user 'snsd0805' is using 2 GPU and the user 'timmy' is usig 1 GPU now. means that 'cml5' server has 2 'V100', and the user 'snsd0805' is using 1 GPU and the user 'timmy' is usig 2 GPU now.
''' '''
servers = [] servers = []
@ -74,63 +77,79 @@ def get_server_gpu_status() -> list:
soup = BeautifulSoup(box, 'html.parser') soup = BeautifulSoup(box, 'html.parser')
# get server name # get server name
server_name = soup.find('span', class_='f7').text.replace(' ', '') server_name = soup.find('span', class_='f7').text.replace(' ', '')
gpu_names = []
# get GPU name
gpu_name = soup.find('span', class_='f4').text.replace(' ', '')
users = [] users = []
gpus = box.split('\n') gpus = box.split('\n')
for i in gpus[1:]: for i in gpus[:]:
if 'f6' in i: # if this line is for a single GPU informations
soup = BeautifulSoup(i, 'html.parser') soup = BeautifulSoup(i, 'html.parser')
# get all users who is using this server's GPU resources. # get all users who is using this server's GPU resources.
user_obj = soup.find('span', class_='f0') user_objs = soup.findAll('span', class_='f0')
user_in_this_gpu = set()
# get users on this GPU
for user_obj in user_objs:
if user_obj != None: if user_obj != None:
username = user_obj.text username = user_obj.text
if username != 'gdm': if username != 'gdm':
users.append(username) user_in_this_gpu.add(username)
# get gpu names
gpu_name = soup.find('span', class_='f4').text.replace(' ', '')
# log gpu name & users on this GPU
gpu_names.append(gpu_name)
users.append(list(user_in_this_gpu))
servers.append({ servers.append({
'name': server_name, 'name': server_name,
'users': users, 'users': users,
'gpu': gpu_name 'gpus': gpu_names
}) })
return servers return servers
def filter(server_status: dict, limit: int = 2) -> list: def filter(server_status: list, limit: int = 2) -> list:
''' '''
You can set some rule here. You can set some rule here.
it will return the a list which contains the username that exceed the limit. it will return the a list which contains the username that exceed the limit.
Rules now (2024/03/11): Rules now (2024/03/22):
- 每人每台僅限使用 2 張卡 - 每人僅限使用 2 張卡
I add this function beacause that we may need more filter rules. we can add some rules in this function easily I add this function beacause that we may need more filter rules. we can add some rules in this function easily
Input: Input:
server_status (dict): e.g. {'name': 'cml5', ...} server_status (list): from get_server_gpu_status()
limit (int): GPU limit (default: 2) limit (int): GPU limit (default: 2)
Output: Output:
violators (list): a violator list(set) e.g. [ {'user': 'snsd0805', 'gpu': 3}] means that the violator 'snsd0805' is using 3 GPU. violators (list): a violator list(set) e.g. [ {'user': 'snsd0805', 'usage': [{'server': 'cml5', 'gpu': 'NVDIA3090', 'number': 3}...] }] means that the violator 'snsd0805' is using 3 GPU.
''' '''
violators = [] usage = {}
counter = {} for server in server_status:
for user in server_status['users']: print(server['name'])
if user not in counter: for gpu_index, gpu_name in enumerate(server['gpus']):
counter[user] = 1 print(" ", gpu_index, gpu_name, server['users'][gpu_index])
for user in server['users'][gpu_index]:
if user not in usage:
usage[user] = [{'server': server['name'], 'gpu': gpu_name}]
else: else:
counter[user] += 1 usage[user].append({'server': server['name'], 'gpu': gpu_name})
print('-')
violators = []
for user, state in usage.items():
if len(state) > limit:
violators.append({'username': user, 'usage': state})
for k, v in counter.items():
if v > limit:
violators.append({
'user': k,
'gpu': v,
})
return violators return violators
def mail_notify(server_name: str, gpu_name: str, violators: list) -> None: def mail_notify(violators: list) -> None:
def check_send(log: dict, username: str) -> bool: def check_send(log: dict, username: str) -> bool:
if DEBUG_MODE:
return True
if username not in log: if username not in log:
return True return True
else: else:
@ -140,26 +159,45 @@ def mail_notify(server_name: str, gpu_name: str, violators: list) -> None:
else: else:
return False return False
def get_usage_msg(usage: list) -> str:
server_usage = {}
ans = ""
for gpu in usage:
if gpu['server'] not in server_usage:
server_usage[gpu['server']] = { gpu['gpu']: 1 }
else:
if gpu['gpu'] not in server_usage[gpu['server']]:
server_usage[gpu['server']][gpu['gpu']] = 1
else:
server_usage[gpu['server']][gpu['gpu']] += 1
for server, gpus in server_usage.items():
for gpu, count in gpus.items():
ans += f"{server} 上使用 {gpu} * {count}, "
return ans[:-2]
print(" ===== 寄送 ===== ")
# get last send time # get last send time
with open('send_log.json') as fp: with open('send_log.json') as fp:
send_log = json.load(fp) send_log = json.load(fp)
for violator in violators: for violator in violators:
username = violator['user'] username = violator['username']
print(f' {username}')
if check_send(send_log, username): if check_send(send_log, username):
# update log # update log
send_log[username] = time.time() send_log[username] = time.time()
with open('send_log.json', 'w') as fp: with open('send_log.json', 'w') as fp:
json.dump(send_log, fp) json.dump(send_log, fp)
usage = violator['gpu'] usage_msg = get_usage_msg(violator['usage'])
msg = MAIL_MESSAGE.format(username, server_name, usage, gpu_name) print(f' {username} {usage_msg}')
msg = MIMEText(msg, 'plain') # 郵件內文 msg = MAIL_MESSAGE.format(username, usage_msg)
msg['Subject'] = f'[網管通知] 提醒您已經超過 CMLab GPU 使用限制!({username})' msg = MIMEText(msg, 'html') # 郵件內文
msg['Subject'] = f'[網管通知] 提醒您已經超過 CMLab GPU 使用限制!({username})' + "(DEBUG MODE)" if DEBUG_MODE else ""
msg['From'] = 'unix_manager@cmlab.csie.ntu.edu.tw' msg['From'] = 'unix_manager@cmlab.csie.ntu.edu.tw'
msg['To'] = f'{username}@cmlab.csie.ntu.edu.tw' msg['To'] = f'{username}@cmlab.csie.ntu.edu.tw' if not DEBUG_MODE else 'snsd0805@cmlab.csie.ntu.edu.tw'
msg['Cc'] = 'unix_manager@cmlab.csie.ntu.edu.tw' msg['Cc'] = 'unix_manager@cmlab.csie.ntu.edu.tw' if not DEBUG_MODE else 'snsd0805@cmlab.csie.ntu.edu.tw'
smtp = smtplib.SMTP('smtp.gmail.com', 587) smtp = smtplib.SMTP('smtp.gmail.com', 587)
smtp.ehlo() smtp.ehlo()
@ -176,9 +214,15 @@ def mail_notify(server_name: str, gpu_name: str, violators: list) -> None:
if __name__ == '__main__': if __name__ == '__main__':
server_status = get_server_gpu_status() server_status = get_server_gpu_status()
violators = filter(server_status, 2)
mail_notify(violators)
'''
for server in server_status: for server in server_status:
print(server['name'], server['gpu']) print(server['name'], server['gpu'])
violators = filter(server, 2) violators = filter(server, 2)
print(violators) print(violators)
mail_notify(server['name'], server['gpu'], violators) mail_notify(server['name'], server['gpu'], violators)
print("=" * 20) print("=" * 20)
'''