feat: 修改爬蟲方式

This commit is contained in:
Ting-Jun Wang 2021-07-16 21:10:58 +08:00
parent db020f765f
commit c61489ed9c
Signed by: snsd0805
GPG Key ID: 8DB0D22BC1217D33

View File

@ -4,33 +4,35 @@ import os
import csv import csv
from bs4 import BeautifulSoup as bs from bs4 import BeautifulSoup as bs
session = requests.Session()
header = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0',
'Cookie': '輸入登入暨大教務系統後所得到的cookie'
}
mainURL = "https://ccweb.ncnu.edu.tw/student/" mainURL = "https://ccweb.ncnu.edu.tw/student/"
courses = [] courses = []
generalCourse = [] generalCourse = []
def getGeneralCourseData(year): def login(username, password):
''' global session
透過年份取得 通識課程分類的csv檔 response = session.get('https://ccweb.ncnu.edu.tw/student/login.php')
供後續課程對應 root = bs(response.text, 'html.parser')
loginToken = root.find('input', {'name': 'token'}).get('value')
先儲存到 generalCourse list後續再用 courseID 對應通識分類
'''
# 教務系統有開放 年度的query # request login page
# 但實際操作後似乎僅開放當前學年度 response = session.post(
response = requests.get(mainURL+"aspmaker_student_common_rank_courses_viewlist.php?x_studentid=0&z_studentid=LIKE&x_year={}&z_year=%3D&cmd=search&export=csv".format(year), headers=header) "https://ccweb.ncnu.edu.tw/student/login.php",
data = response.text data={
'token': loginToken,
'modal': '0',
'username': username,
'password': password,
'type': 'a'
}
)
courses = data.split('\r\n')[1:-1] # 成功的話 return http 302, redirect
for course in courses: if len(response.history)!=0:
course = course.split(',') return True
generalCourse.append(course) else:
return False
def curlDepartmentCourseTable(year): def curlDepartmentCourseTable(year):
''' '''
@ -39,74 +41,63 @@ def curlDepartmentCourseTable(year):
''' '''
print("取得所有課程資料:") print("取得所有課程資料:")
response = requests.get(mainURL+"aspmaker_course_opened_semester_stat_viewlist.php?x_year={}&recperpage=ALL".format(year), headers=header) # 切換年度,應該是用 cookie 儲存當前閱覽的年份
data = response.text url = 'https://ccweb.ncnu.edu.tw/student/aspmaker_course_opened_detail_viewlist.php?cmd=search&t=aspmaker_course_opened_detail_view&z_year=%3D&x_year={}&z_courseid=%3D&x_courseid=&z_cname=LIKE&x_cname=&z_deptid=%3D&x_deptid=&z_division=LIKE&x_division=&z_grade=%3D&x_grade=&z_teachers=LIKE&x_teachers=&z_not_accessible=LIKE&x_not_accessible='
root = bs(data, "html.parser") response = session.get(url.format(year))
count = 1
departmentsTR = root.findAll('tr')[1:] # 清除 thead
for tr in departmentsTR:
name = tr.findAll('td')[4].find('span').find('span').string # 取得 科系名稱
link = mainURL + tr.find('a').get('data-url').replace('amp;', '') # 清除不必要符號, 取得 連結
print("擷取{}課程... ({}/{})...".format(name, count, len(departmentsTR)))
count += 1
extractDepartmentCourseTable(name, link) # 透過連結 開始擷取 各科系課程
def extractDepartmentCourseTable(departmentName, link): # 取得 所有課程的 csv
response = session.get('https://ccweb.ncnu.edu.tw/student/aspmaker_course_opened_detail_viewlist.php?export=csv')
with open("allCourses.csv", "wb") as fp:
fp.write(response.content)
def extractDepartmentCourseTable(year):
''' '''
透過各科系連結取得課程資訊 透過各科系連結取得課程資訊
若為通識類別還要跟csv檔資料做對應取得正確通識類別 若為通識類別還要跟csv檔資料做對應取得正確通識類別
對應後存取到 output.json 對應後存取到 output.json
''' '''
response = requests.get(link, headers=header) with open("allCourses.csv") as fp:
data = response.text csvData = fp.read()
root = bs(data, "html.parser")
courseTR = root.findAll('tr')[1:] # 清除 thead ans = []
for tr in courseTR: courses = csvData.split('"\n')[1:-1]
courseObj = {} for course in courses:
tds = tr.find_all('td') course = course.replace('\n', '.')
# print(course)
courseObj['link'] = mainURL + tds[0].find('a').get('href') data = course[1:].split('","')
courseObj['year'] = tds[1].find('span').string
courseObj['number'] = tds[2].find('span').string
courseObj['class'] = tds[3].find('span').string
courseObj['name'] = tds[4].find('span').string
courseObj['department'] = tds[5].find('span').string
courseObj['graduated'] = tds[6].find('span').string
courseObj['grade'] = tds[7].find('span').string
courseObj['teacher'] = tds[8].find('span').string
courseObj['place'] = tds[9].find('span').string
courseObj['time'] = tds[11].find('span').string
if courseObj['department']=="99, 通識" :
flag = False
for row in generalCourse:
if row[2] == '"{}"'.format(courseObj['number']):
courseObj['department'] = row[0].replace('"', '')
generalCourse.remove(row)
flag = True
break
if not flag:
print(" - 找不到對應的通識類別: {} ( {} )".format(courseObj['name'], courseObj['number']))
courses.append(courseObj) courseObj = {}
with open('output.json', 'w') as fp:
json.dump(courses, fp)
baseLink = "https://ccweb.ncnu.edu.tw/student/aspmaker_course_opened_detail_viewview.php?showdetail=&year={}&courseid={}&_class={}&modal=2"
courseObj['link'] = baseLink.format(year, data[1].zfill(6), data[2])
courseObj['year'] = data[0]
courseObj['number'] = data[1]
courseObj['class'] = data[2]
courseObj['name'] = data[3]
courseObj['department'] = data[4]
courseObj['graduated'] = data[6]
courseObj['grade'] = data[7]
courseObj['teacher'] = data[8]
courseObj['place'] = data[9]
courseObj['time'] = data[13]
courseObj['credit'] = data[14]
ans.append(courseObj)
with open("歷年課程資料/{}_output.json".format(year), 'w') as fp:
json.dump(ans, fp, ensure_ascii=False)
if __name__ == "__main__": if __name__ == "__main__":
year = input("年份: ") year = input("年份: ")
getGeneralCourseData(year) while True:
username = input("學號: ")
password = input("密碼: ")
if login(username, password):
break
else:
print("登入失敗!")
curlDepartmentCourseTable(year) curlDepartmentCourseTable(year)
extractDepartmentCourseTable("1101")
print("\n\n=====================")
print("未列入追蹤的通識課程")
print("=====================\n")
for notIn in generalCourse:
if "體育:" not in notIn[5]:
print(" - 未列入追蹤的新通識課程: {}".format(notIn))