refactor: 修正取得課程方法(尚未處理通識課程)
This commit is contained in:
parent
76dec69edd
commit
3cfec76d5c
210
getData.py
210
getData.py
@@ -1,103 +1,145 @@
|
|||||||
import requests, json
|
import requests
|
||||||
|
import json
|
||||||
|
import os
|
||||||
from bs4 import BeautifulSoup as bs
|
from bs4 import BeautifulSoup as bs
|
||||||
from tqdm import tqdm, trange
|
|
||||||
|
|
||||||
allFinalPage = 86
|
|
||||||
generalFinalPage = 10
|
|
||||||
courseObjList = {}
|
|
||||||
|
|
||||||
header = {
|
header = {
|
||||||
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0',
|
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0',
|
||||||
'Cookie': '輸入登入暨大教務系統後所得到的cookie'
|
'Cookie': '輸入登入暨大教務系統後所得到的cookie'
|
||||||
}
|
}
|
||||||
|
|
||||||
def curlAllCoursePage():
|
mainURL = "https://ccweb.ncnu.edu.tw/student/"
|
||||||
|
|
||||||
|
def curlDepartmentCourseTable(year):
|
||||||
print("取得所有課程資料:")
|
print("取得所有課程資料:")
|
||||||
progress = tqdm(total=allFinalPage)
|
|
||||||
for page in range(1, allFinalPage+1):
|
response = requests.get(mainURL+"aspmaker_course_opened_semester_stat_viewlist.php?x_year={}&recperpage=ALL".format(year), headers=header)
|
||||||
url = 'https://ccweb.ncnu.edu.tw/student/current_semester_opened_listlist.php?start=1&pageno={}'.format(page)
|
|
||||||
response = requests.get(url, headers=header)
|
|
||||||
data = response.text
|
data = response.text
|
||||||
with open('all/{}.html'.format(page), 'w') as fp:
|
root = bs(data, "html.parser")
|
||||||
fp.write(data)
|
|
||||||
progress.update(1)
|
|
||||||
|
|
||||||
def curlGeneralCoursePage():
|
count = 1
|
||||||
print("取得通識課資料:")
|
departmentsTR = root.findAll('tr')[1:] # 清除 thead
|
||||||
progress = tqdm(total=generalFinalPage)
|
for tr in departmentsTR:
|
||||||
for page in range(1, generalFinalPage+1):
|
name = tr.findAll('td')[4].find('span').find('span').string # 取得 科系名稱
|
||||||
url = 'https://ccweb.ncnu.edu.tw/student/aspmaker_student_common_rank_courses_viewlist.php?pageno={}'.format(page)
|
link = mainURL + tr.find('a').get('data-url').replace('amp;', '') # 清除不必要符號, 取得 連結
|
||||||
response = requests.get(url, headers=header)
|
print("擷取{}課程... ({}/{})...".format(name, count, len(departmentsTR)))
|
||||||
|
count += 1
|
||||||
|
extractDepartmentCourseTable(name, link) # 透過連結 開始擷取 各科系課程
|
||||||
|
|
||||||
|
# def curlGeneralCoursePage():
|
||||||
|
# print("取得通識課資料:")
|
||||||
|
# progress = tqdm(total=generalFinalPage)
|
||||||
|
# for page in range(1, generalFinalPage+1):
|
||||||
|
# url = 'https://ccweb.ncnu.edu.tw/student/aspmaker_student_common_rank_courses_viewlist.php?pageno={}'.format(page)
|
||||||
|
# response = requests.get(url, headers=header)
|
||||||
|
# data = response.text
|
||||||
|
# with open('general/{}.html'.format(page), 'w') as fp:
|
||||||
|
# fp.write(data)
|
||||||
|
# progress.update(1)
|
||||||
|
|
||||||
|
def extractDepartmentCourseTable(departmentName, link):
|
||||||
|
# 判斷是否目前還沒有資料
|
||||||
|
if(os.path.isfile('output.json')):
|
||||||
|
with open('output.json', 'r') as fp:
|
||||||
|
courses = json.load(fp)
|
||||||
|
else:
|
||||||
|
courses = []
|
||||||
|
|
||||||
|
response = requests.get(link, headers=header)
|
||||||
data = response.text
|
data = response.text
|
||||||
with open('general/{}.html'.format(page), 'w') as fp:
|
root = bs(data, "html.parser")
|
||||||
fp.write(data)
|
|
||||||
progress.update(1)
|
|
||||||
|
|
||||||
def extractAllCourse():
|
courseTR = root.findAll('tr')[1:] # 清除 thead
|
||||||
print("解析所有課程html:")
|
for tr in courseTR:
|
||||||
progress = tqdm(total=allFinalPage)
|
|
||||||
for pageNumber in range(1, allFinalPage+1):
|
|
||||||
html = ""
|
|
||||||
with open('all/{}.html'.format(pageNumber), 'r') as fp:
|
|
||||||
html = fp.read()
|
|
||||||
root = bs(html, "html.parser")
|
|
||||||
courses = root.find_all('tr')
|
|
||||||
courses = courses[1:]
|
|
||||||
for course in courses:
|
|
||||||
courseObj = {}
|
courseObj = {}
|
||||||
tds = course.find_all('td')
|
tds = tr.find_all('td')
|
||||||
tds = tds[1:]
|
|
||||||
courseObj['year'] = tds[0].text.replace('\n', '')
|
|
||||||
courseObj['number'] = tds[1].text.replace('\n', '')
|
|
||||||
courseObj['name'] = tds[3].text.replace('\n', '')
|
|
||||||
courseObj['class'] = tds[2].text.replace('\n', '')
|
|
||||||
courseObj['department'] = tds[4].text.replace('\n', '')
|
|
||||||
courseObj['graduated'] = tds[5].text.replace('\n', '')
|
|
||||||
courseObj['grade'] = tds[6].text.replace('\n', '')
|
|
||||||
courseObj['teacher'] = tds[7].text.replace('\n', '')
|
|
||||||
courseObj['place'] = tds[8].text.replace('\n', '')
|
|
||||||
courseObj['time'] = tds[10].text.replace('\n', '')
|
|
||||||
|
|
||||||
courseObjList[
|
courseObj['link'] = mainURL + tds[0].find('a').get('href')
|
||||||
tds[1].text.replace('\n', '')
|
courseObj['year'] = tds[1].find('span').string
|
||||||
+
|
courseObj['number'] = tds[2].find('span').string
|
||||||
tds[2].text.replace('\n', '')
|
courseObj['class'] = tds[3].find('span').string
|
||||||
] = courseObj
|
courseObj['name'] = tds[4].find('span').string
|
||||||
progress.update(1)
|
courseObj['department'] = tds[5].find('span').string
|
||||||
|
courseObj['graduated'] = tds[6].find('span').string
|
||||||
|
courseObj['grade'] = tds[7].find('span').string
|
||||||
|
courseObj['teacher'] = tds[8].find('span').string
|
||||||
|
courseObj['place'] = tds[9].find('span').string
|
||||||
|
courseObj['time'] = tds[11].find('span').string
|
||||||
|
courses.append(courseObj)
|
||||||
|
|
||||||
def extractGeneralCourse():
|
with open('output.json', 'w') as fp:
|
||||||
print("解析通識課html:")
|
json.dump(courses, fp)
|
||||||
progress = tqdm(total=generalFinalPage)
|
|
||||||
for pageNumber in range(1, generalFinalPage+1):
|
|
||||||
html = ""
|
|
||||||
with open('general/{}.html'.format(pageNumber), 'r') as fp:
|
|
||||||
html = fp.read()
|
|
||||||
root = bs(html, "html.parser")
|
|
||||||
courses = root.find_all('tr')
|
|
||||||
courses = courses[1:]
|
|
||||||
for course in courses:
|
|
||||||
courseObj = {}
|
|
||||||
tds = course.find_all('td')
|
|
||||||
number = tds[3].text.replace('\n', '')
|
|
||||||
classNum = tds[4].text.replace('\n','')
|
|
||||||
major = tds[1].text.replace('\n', '')
|
|
||||||
name = tds[6].text.replace('\n', '')
|
|
||||||
old = courseObjList[number+classNum]['department']
|
|
||||||
if old != "90, 體育室":
|
|
||||||
courseObjList[number+classNum]['department'] = major
|
|
||||||
progress.update(1)
|
|
||||||
|
|
||||||
curlAllCoursePage()
|
|
||||||
extractAllCourse()
|
|
||||||
curlGeneralCoursePage()
|
|
||||||
extractGeneralCourse()
|
|
||||||
|
|
||||||
out = []
|
|
||||||
count = 0
|
|
||||||
for item in courseObjList:
|
|
||||||
count = count+1
|
|
||||||
out.append(courseObjList[item])
|
|
||||||
|
|
||||||
with open('output.json', 'w') as fp:
|
# print("解析所有課程html:")
|
||||||
fp.write(json.dumps(out, ensure_ascii=False))
|
# progress = tqdm(total=allFinalPage)
|
||||||
print(count)
|
# for pageNumber in range(1, allFinalPage+1):
|
||||||
|
# html = ""
|
||||||
|
# with open('all/{}.html'.format(pageNumber), 'r') as fp:
|
||||||
|
# html = fp.read()
|
||||||
|
# root = bs(html, "html.parser")
|
||||||
|
# courses = root.find_all('tr')
|
||||||
|
# courses = courses[1:]
|
||||||
|
# for course in courses:
|
||||||
|
# courseObj = {}
|
||||||
|
# tds = course.find_all('td')
|
||||||
|
# tds = tds[1:]
|
||||||
|
# courseObj['year'] = tds[0].text.replace('\n', '')
|
||||||
|
# courseObj['number'] = tds[1].text.replace('\n', '')
|
||||||
|
# courseObj['name'] = tds[3].text.replace('\n', '')
|
||||||
|
# courseObj['class'] = tds[2].text.replace('\n', '')
|
||||||
|
# courseObj['department'] = tds[4].text.replace('\n', '')
|
||||||
|
# courseObj['graduated'] = tds[5].text.replace('\n', '')
|
||||||
|
# courseObj['grade'] = tds[6].text.replace('\n', '')
|
||||||
|
# courseObj['teacher'] = tds[7].text.replace('\n', '')
|
||||||
|
# courseObj['place'] = tds[8].text.replace('\n', '')
|
||||||
|
# courseObj['time'] = tds[10].text.replace('\n', '')
|
||||||
|
|
||||||
|
# courseObjList[
|
||||||
|
# tds[1].text.replace('\n', '')
|
||||||
|
# +
|
||||||
|
# tds[2].text.replace('\n', '')
|
||||||
|
# ] = courseObj
|
||||||
|
# progress.update(1)
|
||||||
|
|
||||||
|
# def extractGeneralCourse():
|
||||||
|
# print("解析通識課html:")
|
||||||
|
# progress = tqdm(total=generalFinalPage)
|
||||||
|
# for pageNumber in range(1, generalFinalPage+1):
|
||||||
|
# html = ""
|
||||||
|
# with open('general/{}.html'.format(pageNumber), 'r') as fp:
|
||||||
|
# html = fp.read()
|
||||||
|
# root = bs(html, "html.parser")
|
||||||
|
# courses = root.find_all('tr')
|
||||||
|
# courses = courses[1:]
|
||||||
|
# for course in courses:
|
||||||
|
# courseObj = {}
|
||||||
|
# tds = course.find_all('td')
|
||||||
|
# number = tds[3].text.replace('\n', '')
|
||||||
|
# classNum = tds[4].text.replace('\n','')
|
||||||
|
# major = tds[1].text.replace('\n', '')
|
||||||
|
# name = tds[6].text.replace('\n', '')
|
||||||
|
# old = courseObjList[number+classNum]['department']
|
||||||
|
# if old != "90, 體育室":
|
||||||
|
# courseObjList[number+classNum]['department'] = major
|
||||||
|
# progress.update(1)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
year = input("年份: ")
|
||||||
|
curlDepartmentCourseTable(year)
|
||||||
|
# extractAllCourse()
|
||||||
|
# curlGeneralCoursePage()
|
||||||
|
# extractGeneralCourse()
|
||||||
|
|
||||||
|
# out = []
|
||||||
|
# count = 0
|
||||||
|
# for item in courseObjList:
|
||||||
|
# count = count+1
|
||||||
|
# out.append(courseObjList[item])
|
||||||
|
|
||||||
|
# with open('output.json', 'w') as fp:
|
||||||
|
# fp.write(json.dumps(out, ensure_ascii=False))
|
||||||
|
# print(count)
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user