"""Scrape the NCNU course catalogue, department by department, into output.json.

Run as a script: it prompts for an academic year, downloads the per-department
statistics table for that year, follows every department link and appends each
course row it finds to ``output.json``.
"""
import json
import os

import requests
from bs4 import BeautifulSoup as bs

header = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0',
    # Placeholder value: replace it with the cookie obtained after logging in
    # to the NCNU academic affairs system.
    'Cookie': '輸入登入暨大教務系統後所得到的cookie'
}

mainURL = "https://ccweb.ncnu.edu.tw/student/"

# File that accumulates every scraped course (a JSON list of objects).
outputPath = 'output.json'

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block forever on a stalled connection.
requestTimeout = 30

# (output key, <td> index) pairs for one course row, in output order.
# Column 0 holds the detail link and column 10 is not exported.
_courseColumns = (
    ('year', 1),
    ('number', 2),
    ('class', 3),
    ('name', 4),
    ('department', 5),
    ('graduated', 6),
    ('grade', 7),
    ('teacher', 8),
    ('place', 9),
    ('time', 11),
)

# TODO: general-education courses are not handled by the department-based
# scraper yet. The previous page-based version fetched
# 'aspmaker_student_common_rank_courses_viewlist.php?pageno={}' and replaced a
# course's 'department' with the general-education category unless the
# department was "90, 體育室".


def _fetchSoup(url):
    """Download *url* with the shared headers and return the parsed HTML tree.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if no answer arrives within ``requestTimeout`` seconds.
    """
    response = requests.get(url, headers=header, timeout=requestTimeout)
    # Fail loudly instead of silently parsing an error page.
    response.raise_for_status()
    return bs(response.text, "html.parser")


def _cellText(td):
    """Return the ``.string`` of the first <span> inside *td*, or None if absent."""
    span = td.find('span')
    return span.string if span is not None else None


def curlDepartmentCourseTable(year):
    """Fetch the department list for *year* and scrape every department's courses.

    Args:
        year: academic year, interpolated verbatim into the ``x_year`` query
            parameter.

    Each department row's link is passed to ``extractDepartmentCourseTable``,
    which appends that department's courses to ``output.json``.
    """
    print("取得所有課程資料:")
    listURL = mainURL + "aspmaker_course_opened_semester_stat_viewlist.php?x_year={}&recperpage=ALL".format(year)
    root = _fetchSoup(listURL)

    departmentsTR = root.find_all('tr')[1:]  # drop the header row
    total = len(departmentsTR)
    for count, tr in enumerate(departmentsTR, start=1):
        tds = tr.find_all('td')
        anchor = tr.find('a')
        # Skip rows that do not describe a department (no name cell / no link)
        # instead of crashing on them.
        if len(tds) <= 4 or anchor is None or anchor.get('data-url') is None:
            continue

        name = tds[4].find('span').find('span').string  # department name
        # Strip the leftover 'amp;' from the link; presumably the attribute is
        # double-escaped by the server -- TODO confirm.
        link = mainURL + anchor.get('data-url').replace('amp;', '')
        print("擷取{}課程... ({}/{})...".format(name, count, total))
        extractDepartmentCourseTable(name, link)


def extractDepartmentCourseTable(departmentName, link):
    """Scrape one department's course table and append its rows to output.json.

    Args:
        departmentName: department label; currently unused, kept for callers.
        link: absolute URL of the department's course list page.

    Courses already stored in ``output.json`` are loaded first and the new rows
    are appended, so the file grows with every call.
    NOTE(review): running the script twice therefore duplicates every course;
    confirm whether accumulation across runs is intended.
    """
    # Start from what has been collected so far, if anything.
    if os.path.isfile(outputPath):
        with open(outputPath, 'r', encoding='utf-8') as fp:
            courses = json.load(fp)
    else:
        courses = []

    root = _fetchSoup(link)

    highestColumn = max(index for _, index in _courseColumns)
    for tr in root.find_all('tr')[1:]:  # drop the header row
        tds = tr.find_all('td')
        # Skip rows that are too short to be a course row.
        if len(tds) <= highestColumn:
            continue
        anchor = tds[0].find('a')
        if anchor is None or anchor.get('href') is None:
            continue

        courseObj = {'link': mainURL + anchor.get('href')}
        for key, index in _courseColumns:
            courseObj[key] = _cellText(tds[index])
        courses.append(courseObj)

    # Explicit UTF-8 plus ensure_ascii=False keeps the Chinese text readable in
    # the file and independent of the platform's default encoding.
    with open(outputPath, 'w', encoding='utf-8') as fp:
        json.dump(courses, fp, ensure_ascii=False)


if __name__ == "__main__":
    year = input("年份: ")
    curlDepartmentCourseTable(year)