feat: PTT 爬蟲程式

2022-06-05 03:50:56 +08:00 · 2022-06-05 03:50:56 +08:00 · 5b89104cfa
commit 5b89104cfa
3 changed files with 134 additions and 0 deletions
--- a/Crawler/CommentJudge.py
+++ b/Crawler/CommentJudge.py
@ -0,0 +1,48 @@
 import jieba
 class CommentJudge():
    """Select one representative comment from a list of comment strings.
    Comments are tokenized with jieba, each token is weighted by how many
    times it is repeated across all comments, and the comment whose tokens
    carry the largest total weight wins.
    """
    def getBest(self, comments):
        """Return ``(best_comment, best_score)`` for ``comments``.
        All working state stored on the instance is reset first, so a single
        judge can be reused for many comment lists. When no comment scores
        above zero, the first comment is returned with its score. An empty
        ``comments`` list raises ``IndexError``.
        """
        self.comments = comments
        self.segment_comments = []
        self.wordDict = {}
        self.commentsScore = []
        # The three passes below communicate through the attributes above.
        self.segment()
        self.buildWordDict()
        self.score()
        bestIndex, bestScore = 0, 0
        for position, value in enumerate(self.commentsScore):
            # Strict comparison: on ties the earliest comment is kept.
            if value > bestScore:
                bestIndex, bestScore = position, value
        return self.comments[bestIndex], self.commentsScore[bestIndex]
    def segment(self):
        """Tokenize every comment with ``jieba.cut`` into ``segment_comments``.
        Whitespace and the punctuation tokens listed in ``ignored`` are dropped.
        """
        ignored = [' ', ',', '，', '。', '？', '?', '=']
        for text in self.comments:
            tokens = []
            for token in jieba.cut(text):
                if token not in ignored:
                    tokens.append(token)
            self.segment_comments.append(tokens)
    def buildWordDict(self):
        """Fill ``wordDict`` with a repeat count for every token.
        The stored value is "occurrences minus one": a token seen exactly once
        maps to 0, so only tokens repeated somewhere contribute to a score.
        """
        table = self.wordDict
        for tokens in self.segment_comments:
            for token in tokens:
                # Starting from -1 makes the first sighting 0, the second 1, ...
                table[token] = table.get(token, -1) + 1
    def score(self):
        """Append one score per comment to ``commentsScore``.
        A comment's score is the sum of the ``wordDict`` weights of its tokens.
        Comments whose raw text is 15 characters or longer always score 0
        (presumably to favour short replies -- the reason is not stated).
        """
        for position, tokens in enumerate(self.segment_comments):
            if len(self.comments[position]) < 15:
                total = sum(self.wordDict[token] for token in tokens)
            else:
                total = 0
            self.commentsScore.append(total)
--- a/Crawler/PTTCrawler.py
+++ b/Crawler/PTTCrawler.py
@ -0,0 +1,82 @@
 from tracemalloc import start
 from urllib import response
 from itsdangerous import exc
 import requests
 from bs4 import BeautifulSoup
 import time
 from CommentJudge import CommentJudge
 import jieba
 import json
 import os
 class PTTCrawler():
    """Crawl a PTT board and build question/answer pairs from its posts.
    Index pages are walked from ``startURL`` towards older pages. Every
    "[問卦]" post that is not a reply becomes a question, and the comment
    chosen by ``CommentJudge`` becomes its answer. Results are kept in
    ``train.json`` in the current working directory.
    """
    # Cookie sent with crawl requests (presumably answers PTT's over-18 prompt).
    HEADERS = {'cookie': 'over18=1;'}
    # File used both to resume a previous run and to store checkpoints.
    SAVE_PATH = 'train.json'
    # Number of matching posts between two checkpoint writes.
    SAVE_EVERY = 100
    def __init__(self, board, startURL='index.html'):
        """Remember the crawl target and verify that the board exists.
        Args:
            board: PTT board name, e.g. ``"Gossiping"``.
            startURL: file name of the index page to start from.
        Raises:
            Exception: the board's index page did not answer with HTTP 200.
        """
        self.board = board
        self.startURL = startURL
        self.judge = CommentJudge()
        if requests.get('https://www.ptt.cc/bbs/{}/index.html'.format(self.board)).status_code != 200:
            raise Exception("No board in PTT named {}".format(self.board))
    def getPosts(self, number=100000):
        """Collect question/answer pairs until ``number`` posts were visited.
        If ``train.json`` exists its content is loaded and extended, and the
        post counter starts at the number of stored pairs. Note that the
        counter counts every matching post, including posts that yield no
        usable comment and therefore no pair.
        Progress is written every ``SAVE_EVERY`` matching posts and once more
        when the method finishes or fails, so an interrupted crawl keeps what
        it has gathered.
        Args:
            number: upper bound for the post counter.
        Returns:
            A list of ``{"Q": title, "A": best_comment}`` dicts.
        Raises:
            Exception: an index page answered with a status other than 200.
        """
        url = 'https://www.ptt.cc/bbs/{}/{}'.format(self.board, self.startURL)
        if os.path.isfile(self.SAVE_PATH):
            with open(self.SAVE_PATH, encoding='utf-8') as fp:
                ans = json.load(fp)
            counter = len(ans)
        else:
            ans = []
            counter = 0
        try:
            while counter < number:
                print(url)
                response = requests.get(url, headers=self.HEADERS)
                if response.status_code != 200:
                    raise Exception("Response status code {}".format(response.status_code))
                # Read title and URL of every post, then fetch its comments.
                root = BeautifulSoup(response.text, 'html.parser')
                for post in root.find_all('div', class_='r-ent'):
                    if counter >= number:
                        break
                    link = post.find("a")
                    # A deleted post has no link at all.
                    if link is None or not self._isQuestionPost(link.text):
                        continue
                    counter += 1
                    comments = self.getComments(link.get('href'))
                    if comments:
                        bestComment, _ = self.judge.getBest(comments)
                        ans.append({
                            "Q": link.text.replace('[問卦] ', ''),
                            "A": bestComment
                        })
                        print(ans[-1], counter)
                        time.sleep(2.5)
                    # Checked for every counted post, so a checkpoint is not
                    # skipped when the post at the boundary has no comments.
                    if counter % self.SAVE_EVERY == 0:
                        self._saveCheckpoint(ans)
                # Move on to the previous (older) index page; without such a
                # link the same page would be fetched again forever.
                previousURL = self._previousPageURL(root)
                if previousURL is None:
                    break
                url = previousURL
                print(url)
                print()
        finally:
            self._saveCheckpoint(ans)
        return ans
    def getComments(self, url):
        """Return the comment texts of the post at site-relative ``url``.
        Comments containing ``http`` are skipped. Comment blocks with fewer
        than three ``span`` elements are printed and skipped; per the original
        note this happens on posts with very many comments.
        """
        url = 'https://www.ptt.cc{}'.format(url)
        ans = []
        response = requests.get(url, headers=self.HEADERS)
        root = BeautifulSoup(response.text, 'html.parser')
        for comment in root.find_all('div', class_='push'):
            spans = comment.find_all('span')
            if len(spans) < 3:
                print(comment)
                continue
            text = spans[2].text
            if 'http' not in text:
                ans.append(self._cleanPushText(text))
        return ans
    @staticmethod
    def _isQuestionPost(title):
        """Return True for a "[問卦]" title that is not a "Re:" reply."""
        return "[問卦] " in title and "Re:" not in title
    @staticmethod
    def _cleanPushText(text):
        """Drop the leading ``": "`` marker, keeping any ``": "`` further in."""
        if text.startswith(': '):
            return text[2:]
        return text
    @staticmethod
    def _previousPageURL(root):
        """Return the absolute URL behind the "上頁" button, or None."""
        for btn in root.find_all('a', class_='btn wide'):
            href = btn.get('href')
            if '上頁' in btn.text and href:
                return 'https://www.ptt.cc{}'.format(href)
        return None
    def _saveCheckpoint(self, ans):
        """Write ``ans`` to ``SAVE_PATH`` through a temporary file."""
        tmpPath = self.SAVE_PATH + '.tmp'
        with open(tmpPath, 'w', encoding='utf-8') as fp:
            json.dump(ans, fp)
        # Replacing in one step keeps the previous file intact if the write
        # above is interrupted.
        os.replace(tmpPath, self.SAVE_PATH)
--- a/Crawler/main.py
+++ b/Crawler/main.py
@ -0,0 +1,4 @@
 from PTTCrawler import PTTCrawler
 # Crawl the Gossiping board, starting at a fixed index page; getPosts()
 # uses its default post limit and returns the collected Q/A pairs.
 crawler = PTTCrawler(board="Gossiping", startURL='index29686.html')
 posts = crawler.getPosts()