fix: get all users on the same GPU

Problem:
    Sometimes multiple users run different programs on the same GPU
    when there is enough VRAM.
    But the original program only gets the first username on the webpage,
    so we may miss some users.

Solution:
    For each server, I get every GPU's name and all usernames on that
    GPU's line on the webpage.

    So I return a dict that contains all GPU names and the usernames on
    every GPU.
This commit is contained in:
Ting-Jun Wang 2024-03-22 17:34:44 +08:00
parent 834bed6eb0
commit e907d42043
Signed by: snsd0805
GPG Key ID: D175E969960C4B16

41
main.py
View File

@ -77,30 +77,40 @@ def get_server_gpu_status() -> list:
soup = BeautifulSoup(box, 'html.parser') soup = BeautifulSoup(box, 'html.parser')
# get server name # get server name
server_name = soup.find('span', class_='f7').text.replace(' ', '') server_name = soup.find('span', class_='f7').text.replace(' ', '')
gpu_names = []
# get GPU name
gpu_name = soup.find('span', class_='f4').text.replace(' ', '')
users = [] users = []
gpus = box.split('\n') gpus = box.split('\n')
for i in gpus[1:]: for i in gpus[:]:
if 'f6' in i: # if this line is for a single GPU informations
soup = BeautifulSoup(i, 'html.parser') soup = BeautifulSoup(i, 'html.parser')
# get all users who is using this server's GPU resources. # get all users who is using this server's GPU resources.
user_obj = soup.find('span', class_='f0') user_objs = soup.findAll('span', class_='f0')
user_in_this_gpu = set()
# get users on this GPU
for user_obj in user_objs:
if user_obj != None: if user_obj != None:
username = user_obj.text username = user_obj.text
if username != 'gdm': if username != 'gdm':
users.append(username) user_in_this_gpu.add(username)
# get gpu names
gpu_name = soup.find('span', class_='f4').text.replace(' ', '')
# log gpu name & users on this GPU
gpu_names.append(gpu_name)
users.append(list(user_in_this_gpu))
servers.append({ servers.append({
'name': server_name, 'name': server_name,
'users': users, 'users': users,
'gpu': gpu_name 'gpu': gpu_names
}) })
return servers return servers
def filter(server_status: dict, limit: int = 2) -> list: def filter(server_status: list, limit: int = 2) -> list:
''' '''
You can set some rule here. You can set some rule here.
it will return the a list which contains the username that exceed the limit. it will return the a list which contains the username that exceed the limit.
@ -110,14 +120,18 @@ def filter(server_status: dict, limit: int = 2) -> list:
I add this function beacause that we may need more filter rules. we can add some rules in this function easily I add this function beacause that we may need more filter rules. we can add some rules in this function easily
Input: Input:
server_status (dict): e.g. {'name': 'cml5', ...} server_status (list): from get_server_gpu_status()
limit (int): GPU limit (default: 2) limit (int): GPU limit (default: 2)
Output: Output:
violators (list): a violator list(set) e.g. [ {'user': 'snsd0805', 'gpu': 3}] means that the violator 'snsd0805' is using 3 GPU. violators (list): a violator list(set) e.g. [ {'user': 'snsd0805', 'usage': [{'server': 'cml5', 'gpu': 'NVDIA3090', 'number': 3}...] }] means that the violator 'snsd0805' is using 3 GPU.
''' '''
violators = [] violators = []
counter = {} counter = {}
for server in server_status:
print(server)
'''
for user in server_status['users']: for user in server_status['users']:
if user not in counter: if user not in counter:
counter[user] = 1 counter[user] = 1
@ -130,6 +144,7 @@ def filter(server_status: dict, limit: int = 2) -> list:
'user': k, 'user': k,
'gpu': v, 'gpu': v,
}) })
'''
return violators return violators
def mail_notify(server_name: str, gpu_name: str, violators: list) -> None: def mail_notify(server_name: str, gpu_name: str, violators: list) -> None:
@ -181,9 +196,15 @@ def mail_notify(server_name: str, gpu_name: str, violators: list) -> None:
if __name__ == '__main__': if __name__ == '__main__':
server_status = get_server_gpu_status() server_status = get_server_gpu_status()
for i in server_status:
print(i)
violators = filter(server_status, 2)
'''
for server in server_status: for server in server_status:
print(server['name'], server['gpu']) print(server['name'], server['gpu'])
violators = filter(server, 2) violators = filter(server, 2)
print(violators) print(violators)
mail_notify(server['name'], server['gpu'], violators) mail_notify(server['name'], server['gpu'], violators)
print("=" * 20) print("=" * 20)
'''