Problem:
Sometimes multiple users run different programs on the same GPU
when there is enough VRAM.
However, the original program only reads the first username listed for
each GPU on the webpage, so some users may be missed.
Solution:
For each server, I now collect every GPU's name and all usernames that
appear on that GPU's line of the webpage.
The function therefore returns, for each server, a dict containing all
GPU names and the usernames found on every GPU.
211 lines
6.9 KiB
Python
211 lines
6.9 KiB
Python
import requests
|
||
import json
|
||
import smtplib
|
||
import time
|
||
from bs4 import BeautifulSoup
|
||
from email.mime.text import MIMEText
|
||
import os
|
||
|
||
# Debug switch. When True, mail_notify() skips the cooldown check, appends
# "(DEBUG MODE)" to the subject, and addresses To/Cc to
# snsd0805@cmlab.csie.ntu.edu.tw instead of the violator / unix_manager.
DEBUG_MODE = True
|
||
|
||
# Status page fetched by get_server_gpu_status().
URL = 'https://www.cmlab.csie.ntu.edu.tw/status-gpu/'
|
||
# NOTE(review): not referenced by any function visible in this file (the
# script passes a literal 2 to filter()); presumably the per-user, per-server
# GPU limit -- confirm.
GPU_LIMIT = 2
|
||
# Password passed to smtp.login() in mail_notify(); read from the environment,
# so importing this module raises KeyError when GOOGLE_CODE is unset.
GOOGLE_CODE = os.environ['GOOGLE_CODE']
|
||
# Minimum number of hours between two mails to the same user (see check_send()).
MAIL_CD_HOUR = 6
|
||
# HTML mail body. The four positional {} placeholders are filled by
# mail_notify() with: username, server name, number of GPUs in use, GPU name.
# NOTE(review): the lone `|` / `||` lines look like copy/paste artefacts, but
# they are lexically part of this string literal -- confirm against the real
# file before changing anything here.
MAIL_MESSAGE = '''
|
||
Hi, {}<br>
|
||
<br>
|
||
提醒您,您目前在 {} 伺服器上已經使用了 {} 張 {},<br>
|
||
依照 CMLab 規定,每人在每台 CML workstation 上至多使用 2 顆 GPU,詳細規定請參閱 https://www.cmlab.csie.ntu.edu.tw/wiki/doku.php?id=workstation:rule <br>
|
||
<br>
|
||
為了公平起見,建議您降低 GPU 使用量!<br>
|
||
雖然我們不會直接處理,但若有人檢舉,我們會停止您運行的程式!<br>
|
||
若有其他問題歡迎來信討論!謝謝您的配合!<br>
|
||
<br>
|
||
* 信件由網管自動化工具寄出,若為錯誤寄送請忽略此信件。<br>
|
||
<br>
|
||
Best,<br>
|
||
CMLab Unix Manager, Ting-Jun Wang<br>
|
||
CMLab, National Taiwan University<br>
|
||
Email: unix_manager@cmlab.csie.ntu.edu.tw<br>
|
||
<pre>
|
||
__ __ _
|
||
/ / / /__ (_)_ __
|
||
__ ___/ /_/ / _ \/ /\ \ /
|
||
/ |/ /\____/_//_/_//_\_\____ ____
|
||
/ /|_/ / _ `/ _ \/ _ `/ _ `/ -_) __/
|
||
/_/ /_/\_,_/_//_/\_,_/\_, /\__/_/
|
||
/___/
|
||
</pre>
|
||
'''
|
||
|
||
def get_server_gpu_status() -> list:
    '''
    Fetch the CMLab GPU status page and list, for every server, each GPU and
    the users found on that GPU's line.

    Input:
        None
    Output:
        servers (list): one dict per server, e.g.
            [
                {
                    'name': 'cml5',
                    'users': [['snsd0805'], ['snsd0805', 'timmy'], []],
                    'gpu': ['V100', 'V100', 'V100']
                },
                ...
            ]

        'gpu' and 'users' are parallel lists: users[i] holds the distinct
        usernames (sorted, 'gdm' excluded) found on the line of gpu[i].
        The example means that 'cml5' has 3 'V100'; 'snsd0805' is on 2 of
        them, 'timmy' shares the second one and the third one is idle.
    Raises:
        requests.RequestException: the request fails, times out or the server
            answers with an HTTP error status.
        RuntimeError: the page does not contain the <pre> status table.
    '''
    # get HTML file; never hang forever on an unresponsive web server
    response = requests.get(URL, timeout=30)
    response.raise_for_status()
    page = BeautifulSoup(response.text, 'html.parser')

    # find the table which shows the GPU status
    table = page.find('pre')
    if table is None:
        raise RuntimeError(f'no <pre> GPU status table found at {URL}')

    servers = []

    # one blank-line separated box means one server; the final chunk is
    # skipped exactly as before (presumably it is not a server box -- confirm)
    for box in table.prettify().split('\n\n')[:-1]:
        # get server name
        name_tag = BeautifulSoup(box, 'html.parser').find('span', class_='f7')
        if name_tag is None:
            # not a server box: nothing to report
            continue
        server_name = name_tag.text.replace(' ', '')

        gpu_names = []
        users = []
        for line in box.split('\n'):
            if 'f6' not in line:
                # only lines carrying an 'f6' span describe a single GPU
                continue
            gpu_line = BeautifulSoup(line, 'html.parser')

            # get gpu name
            gpu_tag = gpu_line.find('span', class_='f4')
            if gpu_tag is None:
                # 'f6' matched by accident; there is no GPU on this line
                continue

            # get all distinct users on this GPU ('gdm' is not a real user)
            users_on_gpu = {
                tag.text
                for tag in gpu_line.find_all('span', class_='f0')
                if tag.text != 'gdm'
            }

            # log gpu name & users on this GPU (sorted for a stable order)
            gpu_names.append(gpu_tag.text.replace(' ', ''))
            users.append(sorted(users_on_gpu))

        servers.append({
            'name': server_name,
            'users': users,
            'gpu': gpu_names
        })
    return servers


def filter(server_status: list, limit: int = 2) -> list:
    '''
    Apply the usage rules and return the users who exceed the limit.

    Rules now (2024/03/11):
        - Each user may use at most `limit` GPUs on each server.

    This is a separate function so that more rules can be added here easily.

    Input:
        server_status (list): from get_server_gpu_status(); every entry is
            {'name': str, 'users': [[username, ...], ...], 'gpu': [gpu_name, ...]}
            where users[i] are the users on gpu[i].
        limit (int): GPU limit per user per server (default: 2)
    Output:
        violators (list): one entry per violating user, e.g.
            [{'user': 'snsd0805',
              'usage': [{'server': 'cml5', 'gpu': 'NVDIA3090', 'number': 3}, ...]}]
            means that 'snsd0805' is using 3 GPUs on 'cml5'. 'gpu' is the
            comma-joined list of distinct GPU names the user occupies there.
            Only servers where the user exceeds `limit` are listed.
    '''
    usage_by_user = {}

    for server in server_status:
        # username -> names of the GPUs of this server the user is running on
        gpus_of_user = {}
        for gpu_name, gpu_users in zip(server['gpu'], server['users']):
            # set(): a user counts once per GPU even if listed twice on it
            for username in set(gpu_users):
                gpus_of_user.setdefault(username, []).append(gpu_name)

        for username, gpu_names in gpus_of_user.items():
            if len(gpu_names) > limit:
                usage_by_user.setdefault(username, []).append({
                    'server': server['name'],
                    # dict.fromkeys keeps first-seen order and drops duplicates
                    'gpu': ', '.join(dict.fromkeys(gpu_names)),
                    'number': len(gpu_names),
                })

    return [
        {'user': username, 'usage': usage}
        for username, usage in usage_by_user.items()
    ]


def mail_notify(server_name: str, gpu_name: str, violators: list) -> None:
    '''
    Mail a reminder to every violator of one server, at most once per
    MAIL_CD_HOUR hours per user.

    The time of the last mail per user is kept in 'send_log.json' (in the
    current working directory); a missing file means nobody was mailed yet.
    When DEBUG_MODE is on, the cooldown is ignored and the mail goes to the
    debug address instead of the violator.

    Input:
        server_name (str): server name inserted into the mail body
        gpu_name (str): GPU name inserted into the mail body
        violators (list): entries are read as
            {'user': username, 'gpu': number of GPUs in use}.
            NOTE(review): this differs from the 'usage' list described in
            filter()'s docstring -- confirm which shape callers pass.
    Output:
        None
    '''
    def check_send(log: dict, username: str) -> bool:
        # True when the user was never mailed or the cooldown has elapsed
        if DEBUG_MODE or username not in log:
            return True
        return time.time() - log[username] >= (MAIL_CD_HOUR*60*60)

    # get last send time
    try:
        with open('send_log.json') as fp:
            send_log = json.load(fp)
    except FileNotFoundError:
        # first run: no mail has been sent yet
        send_log = {}

    for violator in violators:
        username = violator['user']
        print(f' {username}')

        if not check_send(send_log, username):
            continue

        # update log
        # NOTE(review): the timestamp is stored before the mail is sent, so a
        # failed send still starts the cooldown -- confirm this is intended.
        send_log[username] = time.time()
        with open('send_log.json', 'w') as fp:
            json.dump(send_log, fp)

        usage = violator['gpu']
        body = MAIL_MESSAGE.format(username, server_name, usage, gpu_name)
        mail = MIMEText(body, 'html')  # mail body

        # Build the subject in two steps: `a + b if cond else c` parses as
        # `(a + b) if cond else c`, which left the subject empty whenever
        # DEBUG_MODE was off.
        subject = f'[網管通知] 提醒您已經超過 CMLab GPU 使用限制!({username})'
        if DEBUG_MODE:
            subject += "(DEBUG MODE)"
        mail['Subject'] = subject
        mail['From'] = 'unix_manager@cmlab.csie.ntu.edu.tw'
        if DEBUG_MODE:
            mail['To'] = 'snsd0805@cmlab.csie.ntu.edu.tw'
            mail['Cc'] = 'snsd0805@cmlab.csie.ntu.edu.tw'
        else:
            mail['To'] = f'{username}@cmlab.csie.ntu.edu.tw'
            mail['Cc'] = 'unix_manager@cmlab.csie.ntu.edu.tw'

        # the context manager issues QUIT even when login/send raises
        with smtplib.SMTP('smtp.gmail.com', 587) as smtp:
            smtp.ehlo()
            smtp.starttls()
            smtp.login('snsd0805@cmlab.csie.ntu.edu.tw', GOOGLE_CODE)
            # send_message() returns a dict of refused recipients
            status = smtp.send_message(mail)

        if not status:
            print(f' {username}, 郵件傳送成功!')
        else:
            print(f' {username}, 郵件傳送失敗...')


if __name__ == '__main__':
    # Scrape the status page and dump one server entry per line.
    server_status = get_server_gpu_status()
    for entry in server_status:
        print(entry)

    # The result is computed but not acted on yet: the notification loop
    # below is still disabled.
    violators = filter(server_status, 2)

    # Disabled per-server notification loop, kept for reference:
    # for server in server_status:
    #     print(server['name'], server['gpu'])
    #     violators = filter(server, 2)
    #     print(violators)
    #     mail_notify(server['name'], server['gpu'], violators)
    #     print("=" * 20)