feat: PTT crawler program

Ting-Jun Wang 2022-06-05 03:50:56 +08:00
commit 5b89104cfa
Signed by: snsd0805
GPG Key ID: 8DB0D22BC1217D33
3 changed files with 134 additions and 0 deletions

Crawler/CommentJudge.py (new file, 48 additions)

@@ -0,0 +1,48 @@
import jieba

class CommentJudge():
    def getBest(self, comments):
        # Return the highest-scoring comment together with its score.
        self.comments = comments
        self.segment_comments = []
        self.wordDict = {}
        self.commentsScore = []
        self.segment()
        self.buildWordDict()
        self.score()
        maxScore, maxIndex = 0, 0
        for index, score in enumerate(self.commentsScore):
            if score > maxScore:
                maxScore = score
                maxIndex = index
        return self.comments[maxIndex], self.commentsScore[maxIndex]

    def segment(self):
        # Cut each comment into words with jieba, dropping whitespace and
        # common half-/full-width punctuation.
        banned = [' ', ',', ',', '。', '?', '?', '=']
        for comment in self.comments:
            words = [word for word in jieba.cut(comment) if word not in banned]
            self.segment_comments.append(words)

    def buildWordDict(self):
        # Count how often each word appears across all comments.
        for comment in self.segment_comments:
            for word in comment:
                if word in self.wordDict:
                    self.wordDict[word] += 1
                else:
                    self.wordDict[word] = 1  # first occurrence counts as 1
        # print(self.wordDict)

    def score(self):
        # A comment's score is the sum of its words' overall frequencies;
        # comments of 15 or more characters are skipped (scored 0).
        for index, comment in enumerate(self.segment_comments):
            if len(self.comments[index]) >= 15:
                self.commentsScore.append(0)
                continue
            weight = 0
            for word in comment:
                weight += self.wordDict[word]
            self.commentsScore.append(weight)
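
A minimal usage sketch (illustrative, not part of this commit): getBest() takes a list of push strings and returns the one whose words are most frequent across the whole list, along with its score. The sample pushes below are made up.

from CommentJudge import CommentJudge

judge = CommentJudge()
comments = ['好文推一個', '認真回 這篇在講什麼', '推']  # made-up sample pushes
best, score = judge.getBest(comments)
print(best, score)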

Crawler/PTTCrawler.py (new file, 82 additions)

@@ -0,0 +1,82 @@
import requests
from bs4 import BeautifulSoup
import time
from CommentJudge import CommentJudge
import json
import os

class PTTCrawler():
    def __init__(self, board, startURL='index.html'):
        self.board = board
        self.startURL = startURL
        self.judge = CommentJudge()
        if requests.get('https://www.ptt.cc/bbs/{}/index.html'.format(self.board)).status_code != 200:
            raise Exception("No board in PTT named {}".format(self.board))

    def getPosts(self, number=100000):
        url = 'https://www.ptt.cc/bbs/{}/{}'.format(self.board, self.startURL)
        # Resume from an earlier run if train.json already exists.
        if os.path.isfile('train.json'):
            with open('train.json') as fp:
                ans = json.load(fp)
            counter = len(ans)
        else:
            ans = []
            counter = 0
        while counter < number:
            print(url)
            response = requests.get(url, headers={'cookie': 'over18=1;'})
            if response.status_code == 200:
                # Grab each post's title and URL, then call getComments() for its pushes.
                root = BeautifulSoup(response.text, 'html.parser')
                posts = root.find_all('div', class_='r-ent')
                for post in posts:
                    link = post.find("a")
                    if link:  # deleted posts have no <a> tag, so link is None
                        if "[問卦] " in link.text and "Re:" not in link.text:
                            counter += 1
                            comments = self.getComments(link.get('href'))
                            if len(comments) != 0:
                                bestComment, score = self.judge.getBest(comments)
                                ans.append({
                                    "Q": link.text.replace('[問卦] ', ''),
                                    "A": bestComment
                                })
                                print(ans[-1], counter)
                            time.sleep(2.5)
                            if counter % 100 == 0:
                                # Checkpoint every 100 posts so progress survives a crash.
                                with open('train.json', 'w') as fp:
                                    json.dump(ans, fp)
                # Find the URL of the previous (older) index page.
                btns = root.find_all('a', class_='btn wide')
                for btn in btns:
                    if '上頁' in btn.text:
                        url = 'https://www.ptt.cc{}'.format(btn.get('href'))
                        print(url)
                        print()
                # time.sleep(3)
            else:
                raise Exception("Response status code {}".format(response.status_code))
        return ans

    def getComments(self, url):
        url = 'https://www.ptt.cc{}'.format(url)
        ans = []
        response = requests.get(url, headers={'cookie': 'over18=1;'})
        root = BeautifulSoup(response.text, 'html.parser')
        comments = root.find_all('div', class_='push')
        for comment in comments:
            try:
                text = comment.find_all('span')[2].text
                if 'http' not in text:  # skip pushes that are just links
                    ans.append(text.replace(': ', ''))
            except IndexError:
                # Posts with very many pushes have truncated push lines
                # that lack the expected <span> structure.
                print(comment)
        return ans
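
For reference, the train.json checkpoint that getPosts() writes is a flat JSON list of {"Q", "A"} dicts. A short sketch (illustrative, not part of the commit) of inspecting it:

import json

with open('train.json') as fp:
    data = json.load(fp)
print(len(data), 'Q/A pairs so far')
print(data[0])  # e.g. {'Q': '...', 'A': '...'}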

Crawler/main.py (new file, 4 additions)

@@ -0,0 +1,4 @@
from PTTCrawler import PTTCrawler

# Crawl the Gossiping board, starting from a fixed index page.
crawler = PTTCrawler("Gossiping", 'index29686.html')
posts = crawler.getPosts()
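
For a quick smoke test (illustrative; number is the existing getPosts parameter), the crawl can be capped instead of running toward the 100000 default:

from PTTCrawler import PTTCrawler

crawler = PTTCrawler('Gossiping')     # defaults to the newest index page
posts = crawler.getPosts(number=10)   # stop after roughly 10 [問卦] posts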