CMLAB-GPU-Mailer/main.py

211 lines
6.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
import json
import smtplib
import time
from bs4 import BeautifulSoup
from email.mime.text import MIMEText
import os
# When True, mail_notify() ignores the cool-down, appends "(DEBUG MODE)" to
# the subject, and addresses To/Cc to a fixed debug mailbox instead of the user.
DEBUG_MODE = True
# Status page fetched and parsed by get_server_gpu_status().
URL = 'https://www.cmlab.csie.ntu.edu.tw/status-gpu/'
# Per-user, per-server GPU limit.
# NOTE(review): not referenced in the visible code; the __main__ block passes a literal 2 to filter().
GPU_LIMIT = 2
# Password passed to smtp.login() in mail_notify(). Read from the environment
# at import time, so a missing GOOGLE_CODE variable raises KeyError immediately.
GOOGLE_CODE = os.environ['GOOGLE_CODE']
# Minimum number of hours between two reminder mails to the same user.
MAIL_CD_HOUR = 6
# HTML body of the reminder mail. It has four positional `{}` fields which
# mail_notify() fills with: username, server name, GPU count, GPU name.
# Declared as a raw string because the ASCII-art signature contains
# backslashes that are not valid escape sequences; the resulting text is
# unchanged, but the literal no longer triggers an invalid-escape warning.
MAIL_MESSAGE = r'''
Hi, {}<br>
<br>
提醒您,您目前在 {} 伺服器上已經使用了 {}{}<br>
依照 CMLab 規定,每人在每台 CML workstation 上至多使用 2 顆 GPU詳細規定請參閱 https://www.cmlab.csie.ntu.edu.tw/wiki/doku.php?id=workstation:rule <br>
<br>
為了公平起見,建議您降低 GPU 使用量!<br>
雖然我們不會直接處理,但若有人檢舉,我們會停止您運行的程式!<br>
若有其他問題歡迎來信討論!謝謝您的配合!<br>
<br>
* 信件由網管自動化工具寄出,若為錯誤寄送請忽略此信件。<br>
<br>
Best,<br>
CMLab Unix Manager, Ting-Jun Wang<br>
CMLab, National Taiwan University<br>
Email: unix_manager@cmlab.csie.ntu.edu.tw<br>
<pre>
__ __ _
/ / / /__ (_)_ __
__ ___/ /_/ / _ \/ /\ \ /
/ |/ /\____/_//_/_//_\_\____ ____
/ /|_/ / _ `/ _ \/ _ `/ _ `/ -_) __/
/_/ /_/\_,_/_//_/\_,_/\_, /\__/_/
/___/
</pre>
'''
def get_server_gpu_status() -> list:
    """Scrape the CMLab GPU status page and report who is using each GPU.

    The page at ``URL`` is downloaded, its first ``<pre>`` element is split
    into blank-line-separated "boxes" (one box per server), and every line
    of a box that contains ``f6`` is treated as the row of a single GPU.

    Returns:
        list: one dict per server box, e.g.::

            {
                'name': 'cml5',
                'users': [['snsd0805', 'timmy'], ['timmy']],
                'gpu': ['V100', 'V100'],
            }

        ``users[i]`` is the sorted list of distinct user names found on the
        GPU whose name is ``gpu[i]``. In the example 'cml5' has two V100s;
        'snsd0805' is on one GPU and 'timmy' is on both.

    Raises:
        requests.RequestException: if the page cannot be fetched in time or
            the server answers with an HTTP error status.
        RuntimeError: if the page contains no ``<pre>`` element.
    """
    # A timeout keeps an unattended run from hanging forever on a dead server.
    response = requests.get(URL, timeout=10)
    response.raise_for_status()

    # The GPU status table is the first <pre> element of the page.
    table = BeautifulSoup(response.text, 'html.parser').find('pre')
    if table is None:
        raise RuntimeError(f'no <pre> status table found at {URL}')

    servers = []
    # One blank-line-separated box per server. The last chunk is skipped,
    # as in the original code (presumably trailing markup -- TODO confirm).
    for box in table.prettify().split('\n\n')[:-1]:
        box_soup = BeautifulSoup(box, 'html.parser')
        server_name = box_soup.find('span', class_='f7').text.replace(' ', '')

        gpu_names = []
        users = []
        for line in box.split('\n'):
            if 'f6' not in line:  # only lines containing 'f6' describe a GPU
                continue
            line_soup = BeautifulSoup(line, 'html.parser')

            # Distinct users on this GPU. 'gdm' is excluded (presumably the
            # display-manager account rather than a person).
            gpu_users = {
                span.text for span in line_soup.find_all('span', class_='f0')
            }
            gpu_users.discard('gdm')

            gpu_name = line_soup.find('span', class_='f4').text.replace(' ', '')

            gpu_names.append(gpu_name)
            # Sorted so the result is deterministic across runs.
            users.append(sorted(gpu_users))

        servers.append({
            'name': server_name,
            'users': users,
            'gpu': gpu_names
        })
    return servers
def filter(server_status: list, limit: int = 2) -> list:
    """Return the users who occupy more than ``limit`` GPUs on one server.

    Rule (2024/03/11): each person may use at most 2 GPUs per server.
    This function is the single place where further rules can be added.

    The name shadows the ``filter`` builtin; it is kept because callers in
    this file use it.

    Args:
        server_status: output of get_server_gpu_status(): a list of dicts
            with 'name' (server name), 'users' (one list of user names per
            GPU) and 'gpu' (one GPU name per GPU, same order as 'users').
        limit: maximum number of GPUs one user may hold on a single server
            (default: 2).

    Returns:
        list: one dict per violator, in order of first appearance, e.g.::

            [{'user': 'snsd0805',
              'usage': [{'server': 'cml5', 'gpu': 'V100', 'number': 3}]}]

        'usage' has one entry for every server on which that user exceeds
        the limit; 'number' is the GPU count there and 'gpu' lists the
        distinct GPU names involved, joined by ', '.
    """
    usage_by_user = {}  # username -> list of per-server usage entries

    for server in server_status:
        gpu_names = server.get('gpu', [])
        gpu_count = {}   # username -> number of GPUs held on this server
        gpu_models = {}  # username -> distinct GPU names, first-seen order

        for index, gpu_users in enumerate(server.get('users', [])):
            model = gpu_names[index] if index < len(gpu_names) else ''
            # dict.fromkeys de-duplicates while keeping order, so a user
            # listed twice on the same GPU is counted once.
            for username in dict.fromkeys(gpu_users):
                gpu_count[username] = gpu_count.get(username, 0) + 1
                models = gpu_models.setdefault(username, [])
                if model and model not in models:
                    models.append(model)

        for username, number in gpu_count.items():
            if number > limit:
                usage_by_user.setdefault(username, []).append({
                    'server': server['name'],
                    'gpu': ', '.join(gpu_models[username]),
                    'number': number,
                })

    return [
        {'user': username, 'usage': usage}
        for username, usage in usage_by_user.items()
    ]
def mail_notify(server_name: str, gpu_name: str, violators: list) -> None:
    """Send a reminder mail to every violator, honouring the cool-down.

    The time of the last mail per user is kept in ``send_log.json`` in the
    current working directory. A user is mailed again only when at least
    ``MAIL_CD_HOUR`` hours have passed, unless ``DEBUG_MODE`` is on, in
    which case every violator is mailed and the mail goes to the debug
    mailbox instead of the user.

    Args:
        server_name: server name inserted into the mail body.
        gpu_name: GPU name inserted into the mail body.
            NOTE(review): the disabled caller in ``__main__`` passes
            ``server['gpu']``, which is a list -- confirm the intended type.
        violators: list of dicts; each must have 'user' (account name) and
            'gpu' (the usage figure inserted into the mail body).

    Raises:
        smtplib.SMTPException / OSError: if connecting, logging in or
            sending fails. The send log is not updated for that user.
    """
    log_path = 'send_log.json'

    def check_send(log: dict, username: str) -> bool:
        """Return True when ``username`` may be mailed now."""
        if DEBUG_MODE:
            return True
        last_sent = log.get(username)
        if last_sent is None:
            return True
        # Remind at most once every MAIL_CD_HOUR hours.
        return time.time() - last_sent >= MAIL_CD_HOUR * 60 * 60

    # Load the last send times; a missing log simply means nobody was mailed yet.
    try:
        with open(log_path) as fp:
            send_log = json.load(fp)
    except FileNotFoundError:
        send_log = {}

    for violator in violators:
        username = violator['user']
        print(f' {username}')
        if not check_send(send_log, username):
            continue

        usage = violator['gpu']
        body = MAIL_MESSAGE.format(username, server_name, usage, gpu_name)
        msg = MIMEText(body, 'html')  # mail body

        # Build the subject in two steps: the one-line form
        # `a + b if DEBUG_MODE else ""` parses as `(a + b) if ... else ""`
        # and produced an empty subject outside debug mode.
        subject = f'[網管通知] 提醒您已經超過 CMLab GPU 使用限制!({username})'
        if DEBUG_MODE:
            subject += "(DEBUG MODE)"
        msg['Subject'] = subject
        msg['From'] = 'unix_manager@cmlab.csie.ntu.edu.tw'
        msg['To'] = f'{username}@cmlab.csie.ntu.edu.tw' if not DEBUG_MODE else 'snsd0805@cmlab.csie.ntu.edu.tw'
        msg['Cc'] = 'unix_manager@cmlab.csie.ntu.edu.tw' if not DEBUG_MODE else 'snsd0805@cmlab.csie.ntu.edu.tw'

        # The context manager closes the connection even if a step raises.
        with smtplib.SMTP('smtp.gmail.com', 587) as smtp:
            smtp.ehlo()
            smtp.starttls()
            smtp.login('snsd0805@cmlab.csie.ntu.edu.tw', GOOGLE_CODE)
            status = smtp.send_message(msg)

        # Record the send time only after the server accepted the message,
        # so a failed attempt does not silence the user for a whole cool-down.
        send_log[username] = time.time()
        with open(log_path, 'w') as fp:
            json.dump(send_log, fp)

        # send_message() returns a dict of refused recipients; empty means
        # every recipient was accepted.
        if status == {}:
            print(f' {username}, 郵件傳送成功!')
        else:
            print(f' {username}, 郵件傳送失敗...')
if __name__ == '__main__':
    # Fetch the current GPU usage of every server and dump it for inspection.
    server_status = get_server_gpu_status()
    for server_entry in server_status:
        print(server_entry)

    # Apply the usage rules; the result is not acted upon yet.
    violators = filter(server_status, 2)

    # Disabled per-server notification flow, kept for reference:
    #
    # for server in server_status:
    #     print(server['name'], server['gpu'])
    #     violators = filter(server, 2)
    #     print(violators)
    #     mail_notify(server['name'], server['gpu'], violators)
    #     print("=" * 20)