NCNU_Course/getData.py

140 lines
5.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
import json
import os
from bs4 import BeautifulSoup as bs
# Base URL of the NCNU student course system; every page path and scraped
# relative link in this script is appended to it.
mainURL = "https://ccweb.ncnu.edu.tw/student/"

# HTTP headers sent with every request. The 'Cookie' value is a placeholder
# (its text asks for the cookie obtained after logging in to the academic
# affairs system) and has to be replaced with a real session cookie.
header = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0',
    'Cookie': '輸入登入暨大教務系統後所得到的cookie',
}

# Module-wide accumulator: one dict per scraped course, appended by
# extractDepartmentCourseTable and dumped to output.json.
courses = []
def curlDepartmentCourseTable(year, timeout=30):
    """Fetch the department index for ``year`` and scrape every department.

    Requests the semester statistics listing under ``mainURL`` (with
    ``recperpage=ALL``), then, for each table row after the first, reads the
    department name and the ``data-url`` of the row's first link and passes
    both to ``extractDepartmentCourseTable``.

    Args:
        year: Value substituted into the ``x_year`` query parameter.
        timeout: Seconds handed to ``requests.get`` so that a stalled server
            cannot block the run forever.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    print("取得所有課程資料:")
    response = requests.get(
        mainURL + "aspmaker_course_opened_semester_stat_viewlist.php?x_year={}&recperpage=ALL".format(year),
        headers=header,
        timeout=timeout,
    )
    # Fail loudly on 4xx/5xx instead of parsing an error page as course data.
    response.raise_for_status()
    root = bs(response.text, "html.parser")

    departmentsTR = root.find_all('tr')[1:]  # drop the first row (table header)
    total = len(departmentsTR)
    for count, tr in enumerate(departmentsTR, start=1):
        # Department name: nested <span> inside the fifth cell of the row.
        name = tr.find_all('td')[4].find('span').find('span').string
        # Strip the literal 'amp;' from data-url so a leftover '&amp;'
        # becomes a plain '&', then make the link absolute.
        link = mainURL + tr.find('a').get('data-url').replace('amp;', '')
        print("擷取{}課程... ({}/{})...".format(name, count, total))
        # Follow the link and collect this department's courses.
        extractDepartmentCourseTable(name, link)
# def curlGeneralCoursePage():
# print("取得通識課資料:")
# progress = tqdm(total=generalFinalPage)
# for page in range(1, generalFinalPage+1):
# url = 'https://ccweb.ncnu.edu.tw/student/aspmaker_student_common_rank_courses_viewlist.php?pageno={}'.format(page)
# response = requests.get(url, headers=header)
# data = response.text
# with open('general/{}.html'.format(page), 'w') as fp:
# fp.write(data)
# progress.update(1)
def extractDepartmentCourseTable(departmentName, link, timeout=30):
    """Scrape one department's course table and append its rows to ``courses``.

    Downloads ``link``, turns every table row after the first into a dict,
    appends each dict to the module-level ``courses`` list, and then rewrites
    ``output.json`` with everything collected so far.

    Args:
        departmentName: Department label supplied by the caller. It is not
            used by this function; kept so existing callers keep working.
        link: Absolute URL of the department's course listing.
        timeout: Seconds handed to ``requests.get`` so that a stalled server
            cannot block the run forever.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    # (output key, <td> index) for every field read from the cell's first
    # <span>. Index 10 is not read.
    spanColumns = (
        ('year', 1),
        ('number', 2),
        ('class', 3),
        ('name', 4),
        ('department', 5),
        ('graduated', 6),
        ('grade', 7),
        ('teacher', 8),
        ('place', 9),
        ('time', 11),
    )

    response = requests.get(link, headers=header, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as course data.
    response.raise_for_status()
    root = bs(response.text, "html.parser")

    for tr in root.find_all('tr')[1:]:  # drop the first row (table header)
        tds = tr.find_all('td')
        # The first cell holds a relative link to the course page.
        courseObj = {'link': mainURL + tds[0].find('a').get('href')}
        for key, index in spanColumns:
            courseObj[key] = tds[index].find('span').string
        courses.append(courseObj)

    # Rewritten after every department, so a later failure still leaves the
    # courses collected so far on disk.
    with open('output.json', 'w', encoding='utf-8') as fp:
        json.dump(courses, fp)
# print("解析所有課程html")
# progress = tqdm(total=allFinalPage)
# for pageNumber in range(1, allFinalPage+1):
# html = ""
# with open('all/{}.html'.format(pageNumber), 'r') as fp:
# html = fp.read()
# root = bs(html, "html.parser")
# courses = root.find_all('tr')
# courses = courses[1:]
# for course in courses:
# courseObj = {}
# tds = course.find_all('td')
# tds = tds[1:]
# courseObj['year'] = tds[0].text.replace('\n', '')
# courseObj['number'] = tds[1].text.replace('\n', '')
# courseObj['name'] = tds[3].text.replace('\n', '')
# courseObj['class'] = tds[2].text.replace('\n', '')
# courseObj['department'] = tds[4].text.replace('\n', '')
# courseObj['graduated'] = tds[5].text.replace('\n', '')
# courseObj['grade'] = tds[6].text.replace('\n', '')
# courseObj['teacher'] = tds[7].text.replace('\n', '')
# courseObj['place'] = tds[8].text.replace('\n', '')
# courseObj['time'] = tds[10].text.replace('\n', '')
# courseObjList[
# tds[1].text.replace('\n', '')
# +
# tds[2].text.replace('\n', '')
# ] = courseObj
# progress.update(1)
# def extractGeneralCourse():
# print("解析通識課html")
# progress = tqdm(total=generalFinalPage)
# for pageNumber in range(1, generalFinalPage+1):
# html = ""
# with open('general/{}.html'.format(pageNumber), 'r') as fp:
# html = fp.read()
# root = bs(html, "html.parser")
# courses = root.find_all('tr')
# courses = courses[1:]
# for course in courses:
# courseObj = {}
# tds = course.find_all('td')
# number = tds[3].text.replace('\n', '')
# classNum = tds[4].text.replace('\n','')
# major = tds[1].text.replace('\n', '')
# name = tds[6].text.replace('\n', '')
# old = courseObjList[number+classNum]['department']
# if old != "90, 體育室":
# courseObjList[number+classNum]['department'] = major
# progress.update(1)
# Script entry point: runs at module load. Prompts for the year on stdin and
# starts the full scrape with it.
year = input("年份: ")  # kept as the raw string typed by the user
curlDepartmentCourseTable(year)
# extractAllCourse()
# curlGeneralCoursePage()
# extractGeneralCourse()
# out = []
# count = 0
# for item in courseObjList:
# count = count+1
# out.append(courseObjList[item])
# with open('output.json', 'w') as fp:
# fp.write(json.dumps(out, ensure_ascii=False))
# print(count)