Compare commits
4 Commits
f0e94303cc
...
a9c3e63b44
| Author | SHA1 | Date | |
|---|---|---|---|
| a9c3e63b44 | |||
| 66e4b9ec51 | |||
| e907d42043 | |||
| 834bed6eb0 |
172
main.py
172
main.py
@ -6,34 +6,37 @@ from bs4 import BeautifulSoup
|
|||||||
from email.mime.text import MIMEText
|
from email.mime.text import MIMEText
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
DEBUG_MODE = False
|
||||||
|
|
||||||
URL = 'https://www.cmlab.csie.ntu.edu.tw/status-gpu/'
|
URL = 'https://www.cmlab.csie.ntu.edu.tw/status-gpu/'
|
||||||
GPU_LIMIT = 2
|
GPU_LIMIT = 2
|
||||||
GOOGLE_CODE = os.environ['GOOGLE_CODE']
|
GOOGLE_CODE = os.environ['GOOGLE_CODE']
|
||||||
MAIL_CD_HOUR = 6
|
MAIL_CD_HOUR = 6
|
||||||
MAIL_MESSAGE = '''
|
MAIL_MESSAGE = '''
|
||||||
Hi, {}
|
Hi, {}<br>
|
||||||
|
<br>
|
||||||
提醒您,您目前在 {} 伺服器上已經使用了 {} 張 {},
|
提醒您,您目前{},<br>
|
||||||
依照 CMLab 規定,每人在每台 CML workstation 上至多使用 2 顆 GPU,詳細規定請參閱 https://www.cmlab.csie.ntu.edu.tw/wiki/doku.php?id=workstation:rule
|
依照 CMLab 規定,每人在 CML workstation 上至多使用 2 顆 GPU <br>
|
||||||
|
<br>
|
||||||
為了公平起見,建議您降低 GPU 使用量!
|
為了公平起見,建議您降低 GPU 使用量!<br>
|
||||||
雖然我們不會直接處理,但若有人檢舉,我們會停止您運行的程式!
|
雖然我們不會直接處理,但若有人檢舉,我們會停止您運行的程式!<br>
|
||||||
若有其他問題歡迎來信討論!謝謝您的配合!
|
若有其他特殊需求請來信說明!謝謝您的配合!<br>
|
||||||
|
<br>
|
||||||
* 信件由網管自動化工具寄出,若為錯誤寄送請忽略此信件。
|
* 信件由網管自動化工具寄出,若為錯誤寄送請忽略此信件。<br>
|
||||||
|
<br>
|
||||||
Best,
|
Best,<br>
|
||||||
CMLab Unix Manager, Ting-Jun Wang
|
CMLab Unix Manager, Ting-Jun Wang<br>
|
||||||
CMLab, National Taiwan University
|
CMLab, National Taiwan University<br>
|
||||||
Email: unix_manager@cmlab.csie.ntu.edu.tw
|
Email: unix_manager@cmlab.csie.ntu.edu.tw<br>
|
||||||
|
<pre>
|
||||||
__ __ _
|
__ __ _
|
||||||
/ / / /__ (_)_ __
|
/ / / /__ (_)_ __
|
||||||
__ ___/ /_/ / _ \/ /\ \ /
|
__ ___/ /_/ / _ \/ /\ \ /
|
||||||
/ |/ /\____/_//_/_//_\_\____ ____
|
/ |/ /\____/_//_/_//_\_\____ ____
|
||||||
/ /|_/ / _ `/ _ \/ _ `/ _ `/ -_) __/
|
/ /|_/ / _ `/ _ \/ _ `/ _ `/ -_) __/
|
||||||
/_/ /_/\_,_/_//_/\_,_/\_, /\__/_/
|
/_/ /_/\_,_/_//_/\_,_/\_, /\__/_/
|
||||||
/___/
|
/___/
|
||||||
|
</pre>
|
||||||
'''
|
'''
|
||||||
|
|
||||||
def get_server_gpu_status() -> list:
|
def get_server_gpu_status() -> list:
|
||||||
@ -49,14 +52,14 @@ def get_server_gpu_status() -> list:
|
|||||||
{
|
{
|
||||||
'name': 'cml5'
|
'name': 'cml5'
|
||||||
'users': [
|
'users': [
|
||||||
'snsd0805', 'snsd0805', 'timmy'
|
['snsd0805', 'timmy'], ['timmy']
|
||||||
],
|
],
|
||||||
'gpu': 'V100'
|
'gpus': ['V100', 'V100']
|
||||||
},
|
},
|
||||||
...
|
...
|
||||||
]
|
]
|
||||||
|
|
||||||
means that the 'cml5' server has 3 'V100' GPUs; the user 'snsd0805' is using 2 GPUs and the user 'timmy' is using 1 GPU now.
|
means that the 'cml5' server has 2 'V100' GPUs; the user 'snsd0805' is using 1 GPU and the user 'timmy' is using 2 GPUs now.
|
||||||
|
|
||||||
'''
|
'''
|
||||||
servers = []
|
servers = []
|
||||||
@ -74,63 +77,79 @@ def get_server_gpu_status() -> list:
|
|||||||
soup = BeautifulSoup(box, 'html.parser')
|
soup = BeautifulSoup(box, 'html.parser')
|
||||||
# get server name
|
# get server name
|
||||||
server_name = soup.find('span', class_='f7').text.replace(' ', '')
|
server_name = soup.find('span', class_='f7').text.replace(' ', '')
|
||||||
|
gpu_names = []
|
||||||
# get GPU name
|
|
||||||
gpu_name = soup.find('span', class_='f4').text.replace(' ', '')
|
|
||||||
|
|
||||||
users = []
|
users = []
|
||||||
gpus = box.split('\n')
|
gpus = box.split('\n')
|
||||||
for i in gpus[1:]:
|
for i in gpus[:]:
|
||||||
soup = BeautifulSoup(i, 'html.parser')
|
if 'f6' in i: # this line holds the information for a single GPU
|
||||||
|
soup = BeautifulSoup(i, 'html.parser')
|
||||||
|
|
||||||
# get all users who is using this server's GPU resources.
|
# get all users who is using this server's GPU resources.
|
||||||
user_obj = soup.find('span', class_='f0')
|
user_objs = soup.findAll('span', class_='f0')
|
||||||
if user_obj != None:
|
user_in_this_gpu = set()
|
||||||
username = user_obj.text
|
|
||||||
if username != 'gdm':
|
# get users on this GPU
|
||||||
users.append(username)
|
for user_obj in user_objs:
|
||||||
|
if user_obj != None:
|
||||||
|
username = user_obj.text
|
||||||
|
if username != 'gdm':
|
||||||
|
user_in_this_gpu.add(username)
|
||||||
|
|
||||||
|
# get gpu names
|
||||||
|
gpu_name = soup.find('span', class_='f4').text.replace(' ', '')
|
||||||
|
|
||||||
|
# log gpu name & users on this GPU
|
||||||
|
gpu_names.append(gpu_name)
|
||||||
|
users.append(list(user_in_this_gpu))
|
||||||
|
|
||||||
servers.append({
|
servers.append({
|
||||||
'name': server_name,
|
'name': server_name,
|
||||||
'users': users,
|
'users': users,
|
||||||
'gpu': gpu_name
|
'gpus': gpu_names
|
||||||
})
|
})
|
||||||
return servers
|
return servers
|
||||||
|
|
||||||
def filter(server_status: dict, limit: int = 2) -> list:
|
def filter(server_status: list, limit: int = 2) -> list:
|
||||||
'''
|
'''
|
||||||
You can set some rule here.
|
You can set some rule here.
|
||||||
It returns a list containing the usernames that exceed the limit.
|
It returns a list containing the usernames that exceed the limit.
|
||||||
Rules now (2024/03/11):
|
Rules now (2024/03/22):
|
||||||
- 每人每台僅限使用 2 張卡
|
- 每人僅限使用 2 張卡
|
||||||
|
|
||||||
This function exists because we may need more filter rules; new rules can easily be added here.
|
This function exists because we may need more filter rules; new rules can easily be added here.
|
||||||
|
|
||||||
Input:
|
Input:
|
||||||
server_status (dict): e.g. {'name': 'cml5', ...}
|
server_status (list): from get_server_gpu_status()
|
||||||
limit (int): GPU limit (default: 2)
|
limit (int): GPU limit (default: 2)
|
||||||
Output:
|
Output:
|
||||||
violators (list): a violator list(set) e.g. [ {'user': 'snsd0805', 'gpu': 3}] means that the violator 'snsd0805' is using 3 GPU.
|
violators (list): a list of violators, e.g. [ {'username': 'snsd0805', 'usage': [{'server': 'cml5', 'gpu': 'NVIDIA3090'}, ...] }]; 'usage' holds one entry per GPU the user occupies, so three entries mean that the violator 'snsd0805' is using 3 GPUs.
|
||||||
'''
|
'''
|
||||||
violators = []
|
usage = {}
|
||||||
|
|
||||||
counter = {}
|
for server in server_status:
|
||||||
for user in server_status['users']:
|
print(server['name'])
|
||||||
if user not in counter:
|
for gpu_index, gpu_name in enumerate(server['gpus']):
|
||||||
counter[user] = 1
|
print(" ", gpu_index, gpu_name, server['users'][gpu_index])
|
||||||
else:
|
for user in server['users'][gpu_index]:
|
||||||
counter[user] += 1
|
if user not in usage:
|
||||||
|
usage[user] = [{'server': server['name'], 'gpu': gpu_name}]
|
||||||
for k, v in counter.items():
|
else:
|
||||||
if v > limit:
|
usage[user].append({'server': server['name'], 'gpu': gpu_name})
|
||||||
violators.append({
|
|
||||||
'user': k,
|
print('-')
|
||||||
'gpu': v,
|
|
||||||
})
|
violators = []
|
||||||
|
for user, state in usage.items():
|
||||||
|
if len(state) > limit:
|
||||||
|
violators.append({'username': user, 'usage': state})
|
||||||
|
|
||||||
return violators
|
return violators
|
||||||
|
|
||||||
def mail_notify(server_name: str, gpu_name: str, violators: list) -> None:
|
def mail_notify(violators: list) -> None:
|
||||||
def check_send(log: dict, username: str) -> bool:
|
def check_send(log: dict, username: str) -> bool:
|
||||||
|
if DEBUG_MODE:
|
||||||
|
return True
|
||||||
if username not in log:
|
if username not in log:
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
@ -140,26 +159,45 @@ def mail_notify(server_name: str, gpu_name: str, violators: list) -> None:
|
|||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def get_usage_msg(usage: list) -> str:
|
||||||
|
server_usage = {}
|
||||||
|
ans = ""
|
||||||
|
for gpu in usage:
|
||||||
|
if gpu['server'] not in server_usage:
|
||||||
|
server_usage[gpu['server']] = { gpu['gpu']: 1 }
|
||||||
|
else:
|
||||||
|
if gpu['gpu'] not in server_usage[gpu['server']]:
|
||||||
|
server_usage[gpu['server']][gpu['gpu']] = 1
|
||||||
|
else:
|
||||||
|
server_usage[gpu['server']][gpu['gpu']] += 1
|
||||||
|
|
||||||
|
for server, gpus in server_usage.items():
|
||||||
|
for gpu, count in gpus.items():
|
||||||
|
ans += f"在 {server} 上使用 {gpu} * {count}, "
|
||||||
|
return ans[:-2]
|
||||||
|
|
||||||
|
print(" ===== 寄送 ===== ")
|
||||||
|
|
||||||
# get last send time
|
# get last send time
|
||||||
with open('send_log.json') as fp:
|
with open('send_log.json') as fp:
|
||||||
send_log = json.load(fp)
|
send_log = json.load(fp)
|
||||||
|
|
||||||
for violator in violators:
|
for violator in violators:
|
||||||
username = violator['user']
|
username = violator['username']
|
||||||
print(f' {username}')
|
|
||||||
|
|
||||||
if check_send(send_log, username):
|
if check_send(send_log, username):
|
||||||
# update log
|
# update log
|
||||||
send_log[username] = time.time()
|
send_log[username] = time.time()
|
||||||
with open('send_log.json', 'w') as fp:
|
with open('send_log.json', 'w') as fp:
|
||||||
json.dump(send_log, fp)
|
json.dump(send_log, fp)
|
||||||
usage = violator['gpu']
|
usage_msg = get_usage_msg(violator['usage'])
|
||||||
msg = MAIL_MESSAGE.format(username, server_name, usage, gpu_name)
|
print(f' {username} {usage_msg}')
|
||||||
msg = MIMEText(msg, 'plain') # 郵件內文
|
msg = MAIL_MESSAGE.format(username, usage_msg)
|
||||||
msg['Subject'] = f'[網管通知] 提醒您已經超過 CMLab GPU 使用限制!({username})'
|
msg = MIMEText(msg, 'html') # 郵件內文
|
||||||
|
msg['Subject'] = f'[網管通知] 提醒您已經超過 CMLab GPU 使用限制!({username})' + "(DEBUG MODE)" if DEBUG_MODE else ""
|
||||||
msg['From'] = 'unix_manager@cmlab.csie.ntu.edu.tw'
|
msg['From'] = 'unix_manager@cmlab.csie.ntu.edu.tw'
|
||||||
msg['To'] = f'{username}@cmlab.csie.ntu.edu.tw'
|
msg['To'] = f'{username}@cmlab.csie.ntu.edu.tw' if not DEBUG_MODE else 'snsd0805@cmlab.csie.ntu.edu.tw'
|
||||||
msg['Cc'] = 'unix_manager@cmlab.csie.ntu.edu.tw'
|
msg['Cc'] = 'unix_manager@cmlab.csie.ntu.edu.tw' if not DEBUG_MODE else 'snsd0805@cmlab.csie.ntu.edu.tw'
|
||||||
|
|
||||||
smtp = smtplib.SMTP('smtp.gmail.com', 587)
|
smtp = smtplib.SMTP('smtp.gmail.com', 587)
|
||||||
smtp.ehlo()
|
smtp.ehlo()
|
||||||
@ -176,9 +214,15 @@ def mail_notify(server_name: str, gpu_name: str, violators: list) -> None:
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
server_status = get_server_gpu_status()
|
server_status = get_server_gpu_status()
|
||||||
|
|
||||||
|
violators = filter(server_status, 2)
|
||||||
|
mail_notify(violators)
|
||||||
|
|
||||||
|
'''
|
||||||
for server in server_status:
|
for server in server_status:
|
||||||
print(server['name'], server['gpu'])
|
print(server['name'], server['gpu'])
|
||||||
violators = filter(server, 2)
|
violators = filter(server, 2)
|
||||||
print(violators)
|
print(violators)
|
||||||
mail_notify(server['name'], server['gpu'], violators)
|
mail_notify(server['name'], server['gpu'], violators)
|
||||||
print("=" * 20)
|
print("=" * 20)
|
||||||
|
'''
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user