# NCNU_Course/getData.py

import requests
import json
import os
import csv
from bs4 import BeautifulSoup as bs


# Accumulators shared by the functions below.
# `courses` collects one dict per scraped course; `generalCourse` holds the
# rows of the general-education CSV until they are matched to a course.
courses = list()
generalCourse = list()

# Base URL that every request path and scraped relative link is appended to.
mainURL = "https://ccweb.ncnu.edu.tw/student/"

# HTTP headers sent with every request.
# The 'Cookie' value is a placeholder (it reads "enter the cookie obtained
# after logging in to the NCNU academic affairs system") and has to be
# replaced with a real session cookie before running the script.
header = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0',
    'Cookie': '輸入登入暨大教務系統後所得到的cookie',
}

def _splitCsvFields(line):
    '''
        Split one CSV line on the commas that lie outside double quotes.

        The raw text of every field is kept as-is, including its surrounding
        double quotes, so for a line without quoted commas the result is
        identical to line.split(','). A doubled quote ("") inside a quoted
        field toggles the state twice and therefore stays inside the field.
    '''
    fields = []
    current = []
    insideQuotes = False
    for ch in line:
        if ch == '"':
            insideQuotes = not insideQuotes
        if ch == ',' and not insideQuotes:
            fields.append(''.join(current))
            current = []
        else:
            current.append(ch)
    fields.append(''.join(current))
    return fields


def getGeneralCourseData(year):
    '''
        Download the general-education course classification CSV for `year`
        and append every data row to the module-level `generalCourse` list.

        Each appended row is a list of the raw CSV fields (surrounding double
        quotes are preserved). The rows are matched against course numbers
        later to find the general-education category of a course.

        year: value substituted into the `x_year` query parameter.
        Returns None; the result is the side effect on `generalCourse`.
    '''

    # The academic system exposes a year query parameter, but according to
    # the original author's note it only seems to serve the current year.
    response = requests.get(mainURL+"aspmaker_student_common_rank_courses_viewlist.php?x_studentid=0&z_studentid=LIKE&x_year={}&z_year=%3D&cmd=search&export=csv".format(year), headers=header)
    data = response.text

    # splitlines() copes with both CRLF and LF endings and does not depend on
    # a trailing line break, so the last course is never dropped.
    # The first line is the CSV header row and is skipped.
    lines = data.splitlines()[1:]
    for line in lines:
        if not line.strip():
            # Blank lines carry no course and would yield a one-field row.
            continue
        generalCourse.append(_splitCsvFields(line))

def curlDepartmentCourseTable(year):
    '''
        Fetch the per-department course listing page for `year`, collect the
        link of every department's course table and hand each link to
        extractDepartmentCourseTable() to scrape the course details.

        year: value substituted into the `x_year` query parameter.
        Returns None; progress is printed to stdout.
    '''
    print("取得所有課程資料：")

    response = requests.get(mainURL+"aspmaker_course_opened_semester_stat_viewlist.php?x_year={}&recperpage=ALL".format(year), headers=header)
    root = bs(response.text, "html.parser")

    # The first <tr> is the table header, so it is skipped.
    departmentsTR = root.find_all('tr')[1:]
    total = len(departmentsTR)

    for count, tr in enumerate(departmentsTR, start=1):
        # Department name sits in the nested <span> of the fifth cell.
        name = tr.find_all('td')[4].find('span').find('span').string
        # The relative link is stored in the anchor's data-url attribute;
        # leftover 'amp;' fragments are stripped before use.
        link = mainURL + tr.find('a').get('data-url').replace('amp;', '')
        print("擷取{}課程... ({}/{})...".format(name, count, total))
        # Scrape every course of this department through its link.
        extractDepartmentCourseTable(name, link)

def extractDepartmentCourseTable(departmentName, link):
    '''
        Scrape every course row of one department's course table and append
        the resulting dicts to the module-level `courses` list.

        Courses whose department reads "99, 通識" (general education) are
        looked up in `generalCourse` by course number; on a match the
        department is replaced by the row's first field (quotes removed) and
        the matched row is removed from `generalCourse`. Unmatched courses
        are reported on stdout and keep their original department.

        After the table is processed the whole `courses` list is written to
        output.json (the file is rewritten on every call).

        departmentName: not used inside this function.
        link: absolute URL of the department's course table.
        Returns None.
    '''
    # (output key, index of the <td> whose <span> holds the value).
    # Cell 10 is not read, exactly as in the original field list.
    spanColumns = (
        ('year', 1),
        ('number', 2),
        ('class', 3),
        ('name', 4),
        ('department', 5),
        ('graduated', 6),
        ('grade', 7),
        ('teacher', 8),
        ('place', 9),
        ('time', 11),
    )

    response = requests.get(link, headers=header)
    root = bs(response.text, "html.parser")

    # The first <tr> is the table header, so it is skipped.
    for tr in root.find_all('tr')[1:]:
        tds = tr.find_all('td')

        # The first cell carries the relative link to the course page.
        courseObj = {'link': mainURL + tds[0].find('a').get('href')}
        for key, index in spanColumns:
            courseObj[key] = tds[index].find('span').string

        if courseObj['department'] == "99, 通識":
            # CSV fields keep their double quotes, so compare the quoted form.
            quotedNumber = '"{}"'.format(courseObj['number'])
            for row in generalCourse:
                if row[2] == quotedNumber:
                    courseObj['department'] = row[0].replace('"', '')
                    # Consumed rows are removed so that the leftovers can be
                    # reported as untracked courses afterwards.
                    generalCourse.remove(row)
                    break
            else:
                # Loop finished without a break: no CSV row matched.
                print(" - 找不到對應的通識類別： {} ( {} )".format(courseObj['name'], courseObj['number']))

        courses.append(courseObj)

    with open('output.json', 'w') as fp:
        json.dump(courses, fp)


if __name__ == "__main__":
    # Script entry point: ask for the year, load the general-education CSV,
    # then crawl every department's course table.
    year = input("年份: ")

    getGeneralCourseData(year)
    curlDepartmentCourseTable(year)

    # Banner announcing the general-education rows that were never matched.
    for bannerLine in ("\n\n=====================", "未列入追蹤的通識課程", "=====================\n"):
        print(bannerLine)

    # Whatever is still in generalCourse was not matched to a scraped course.
    # Rows whose sixth field contains "體育:" are left out of the report.
    for leftover in generalCourse:
        if "體育:" in leftover[5]:
            continue
        print(" - 未列入追蹤的新通識課程： {}".format(leftover))