104 lines
3.7 KiB
Python
104 lines
3.7 KiB
Python
import requests, json
|
||
from bs4 import BeautifulSoup as bs
|
||
from tqdm import tqdm, trange
|
||
|
||
allFinalPage = 86
|
||
generalFinalPage = 10
|
||
courseObjList = {}
|
||
|
||
header = {
|
||
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0',
|
||
'Cookie': '輸入登入暨大教務系統後所得到的cookie'
|
||
}
|
||
|
||
def curlAllCoursePage():
|
||
print("取得所有課程資料:")
|
||
progress = tqdm(total=allFinalPage)
|
||
for page in range(1, allFinalPage+1):
|
||
url = 'https://ccweb.ncnu.edu.tw/student/current_semester_opened_listlist.php?start=1&pageno={}'.format(page)
|
||
response = requests.get(url, headers=header)
|
||
data = response.text
|
||
with open('all/{}.html'.format(page), 'w') as fp:
|
||
fp.write(data)
|
||
progress.update(1)
|
||
|
||
def curlGeneralCoursePage():
|
||
print("取得通識課資料:")
|
||
progress = tqdm(total=generalFinalPage)
|
||
for page in range(1, generalFinalPage+1):
|
||
url = 'https://ccweb.ncnu.edu.tw/student/aspmaker_student_common_rank_courses_viewlist.php?pageno={}'.format(page)
|
||
response = requests.get(url, headers=header)
|
||
data = response.text
|
||
with open('general/{}.html'.format(page), 'w') as fp:
|
||
fp.write(data)
|
||
progress.update(1)
|
||
|
||
def extractAllCourse():
|
||
print("解析所有課程html:")
|
||
progress = tqdm(total=allFinalPage)
|
||
for pageNumber in range(1, allFinalPage+1):
|
||
html = ""
|
||
with open('all/{}.html'.format(pageNumber), 'r') as fp:
|
||
html = fp.read()
|
||
root = bs(html, "html.parser")
|
||
courses = root.find_all('tr')
|
||
courses = courses[1:]
|
||
for course in courses:
|
||
courseObj = {}
|
||
tds = course.find_all('td')
|
||
tds = tds[1:]
|
||
courseObj['year'] = tds[0].text.replace('\n', '')
|
||
courseObj['number'] = tds[1].text.replace('\n', '')
|
||
courseObj['name'] = tds[3].text.replace('\n', '')
|
||
courseObj['class'] = tds[2].text.replace('\n', '')
|
||
courseObj['department'] = tds[4].text.replace('\n', '')
|
||
courseObj['graduated'] = tds[5].text.replace('\n', '')
|
||
courseObj['grade'] = tds[6].text.replace('\n', '')
|
||
courseObj['teacher'] = tds[7].text.replace('\n', '')
|
||
courseObj['place'] = tds[8].text.replace('\n', '')
|
||
courseObj['time'] = tds[10].text.replace('\n', '')
|
||
|
||
courseObjList[
|
||
tds[1].text.replace('\n', '')
|
||
+
|
||
tds[2].text.replace('\n', '')
|
||
] = courseObj
|
||
progress.update(1)
|
||
|
||
def extractGeneralCourse():
|
||
print("解析通識課html:")
|
||
progress = tqdm(total=generalFinalPage)
|
||
for pageNumber in range(1, generalFinalPage+1):
|
||
html = ""
|
||
with open('general/{}.html'.format(pageNumber), 'r') as fp:
|
||
html = fp.read()
|
||
root = bs(html, "html.parser")
|
||
courses = root.find_all('tr')
|
||
courses = courses[1:]
|
||
for course in courses:
|
||
courseObj = {}
|
||
tds = course.find_all('td')
|
||
number = tds[3].text.replace('\n', '')
|
||
classNum = tds[4].text.replace('\n','')
|
||
major = tds[1].text.replace('\n', '')
|
||
name = tds[6].text.replace('\n', '')
|
||
old = courseObjList[number+classNum]['department']
|
||
if old != "90, 體育室":
|
||
courseObjList[number+classNum]['department'] = major
|
||
progress.update(1)
|
||
|
||
curlAllCoursePage()
|
||
extractAllCourse()
|
||
curlGeneralCoursePage()
|
||
extractGeneralCourse()
|
||
|
||
out = []
|
||
count = 0
|
||
for item in courseObjList:
|
||
count = count+1
|
||
out.append(courseObjList[item])
|
||
|
||
with open('output.json', 'w') as fp:
|
||
fp.write(json.dumps(out, ensure_ascii=False))
|
||
print(count)
|