From e907d420431505f3c04e8b3871b85dcaf5668231 Mon Sep 17 00:00:00 2001 From: Ting-Jun Wang Date: Fri, 22 Mar 2024 17:34:44 +0800 Subject: [PATCH] fix: get all users on the same GPU Problem: Sometimes multiple users run different programs on the same GPU when there is enough VRAM. But the original program only gets the first username on the webpage, so we may miss some users. Solution: For each server, I get every GPU's name and all usernames on that GPU's line on the webpage, then return a dict that contains all GPU names and the usernames on every GPU. --- main.py | 51 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 15 deletions(-) diff --git a/main.py b/main.py index ff8fc40..aecb514 100644 --- a/main.py +++ b/main.py @@ -77,30 +77,40 @@ def get_server_gpu_status() -> list: soup = BeautifulSoup(box, 'html.parser') # get server name server_name = soup.find('span', class_='f7').text.replace(' ', '') - - # get GPU name - gpu_name = soup.find('span', class_='f4').text.replace(' ', '') + gpu_names = [] users = [] gpus = box.split('\n') - for i in gpus[1:]: - soup = BeautifulSoup(i, 'html.parser') + for i in gpus[:]: + if 'f6' in i: # if this line is for a single GPU informations + soup = BeautifulSoup(i, 'html.parser') - # get all users who is using this server's GPU resources. - user_obj = soup.find('span', class_='f0') - if user_obj != None: - username = user_obj.text - if username != 'gdm': - users.append(username) + # get all users who is using this server's GPU resources. 
+ user_objs = soup.findAll('span', class_='f0') + user_in_this_gpu = set() + + # get users on this GPU + for user_obj in user_objs: + if user_obj != None: + username = user_obj.text + if username != 'gdm': + user_in_this_gpu.add(username) + + # get gpu names + gpu_name = soup.find('span', class_='f4').text.replace(' ', '') + + # log gpu name & users on this GPU + gpu_names.append(gpu_name) + users.append(list(user_in_this_gpu)) servers.append({ 'name': server_name, 'users': users, - 'gpu': gpu_name + 'gpu': gpu_names }) return servers -def filter(server_status: dict, limit: int = 2) -> list: +def filter(server_status: list, limit: int = 2) -> list: ''' You can set some rule here. it will return the a list which contains the username that exceed the limit. @@ -110,14 +120,18 @@ def filter(server_status: dict, limit: int = 2) -> list: I add this function beacause that we may need more filter rules. we can add some rules in this function easily Input: - server_status (dict): e.g. {'name': 'cml5', ...} + server_status (list): from get_server_gpu_status() limit (int): GPU limit (default: 2) Output: - violators (list): a violator list(set) e.g. [ {'user': 'snsd0805', 'gpu': 3}] means that the violator 'snsd0805' is using 3 GPU. + violators (list): a violator list(set) e.g. [ {'user': 'snsd0805', 'usage': [{'server': 'cml5', 'gpu': 'NVDIA3090', 'number': 3}...] }] means that the violator 'snsd0805' is using 3 GPU. 
''' violators = [] counter = {} + + for server in server_status: + print(server) + ''' for user in server_status['users']: if user not in counter: counter[user] = 1 @@ -130,6 +144,7 @@ def filter(server_status: dict, limit: int = 2) -> list: 'user': k, 'gpu': v, }) + ''' return violators def mail_notify(server_name: str, gpu_name: str, violators: list) -> None: @@ -181,9 +196,15 @@ def mail_notify(server_name: str, gpu_name: str, violators: list) -> None: if __name__ == '__main__': server_status = get_server_gpu_status() + for i in server_status: + print(i) + + violators = filter(server_status, 2) + ''' for server in server_status: print(server['name'], server['gpu']) violators = filter(server, 2) print(violators) mail_notify(server['name'], server['gpu'], violators) print("=" * 20) + '''