commit 5b89104cfa4ecf98765625a0db0a8bb2dca695a6 Author: Ting-Jun Wang Date: Sun Jun 5 03:50:56 2022 +0800 feat: PTT 爬蟲程式 diff --git a/Crawler/CommentJudge.py b/Crawler/CommentJudge.py new file mode 100644 index 0000000..e1c7cdc --- /dev/null +++ b/Crawler/CommentJudge.py @@ -0,0 +1,48 @@ +import jieba + +class CommentJudge(): + def getBest(self, comments): + self.comments = comments + self.segment_comments = [] + self.wordDict = {} + self.commentsScore = [] + + self.segment() + self.buildWordDict() + self.score() + + maxScore, maxIndex = 0, 0 + for index in range(len(self.commentsScore)): + score = self.commentsScore[index] + if score > maxScore: + maxScore = score + maxIndex = index + return self.comments[maxIndex], self.commentsScore[maxIndex] + + def segment(self): + banned = [' ', ',', ',', '。', '?', '?', '='] + for comment in self.comments: + words = [ word for word in jieba.cut(comment) if word not in banned] + self.segment_comments.append(words) + + def buildWordDict(self): + for comment in self.segment_comments: + for word in comment: + if word in self.wordDict: + self.wordDict[word] += 1 + else: + self.wordDict[word] = 0 + # print(self.wordDict) + + def score(self): + for index in range(len(self.segment_comments)): + comment = self.segment_comments[index] + if len(self.comments[index]) >= 15: + self.commentsScore.append(0) + continue + + weight = 0 + for word in comment: + weight += self.wordDict[word] + + self.commentsScore.append(weight) diff --git a/Crawler/PTTCrawler.py b/Crawler/PTTCrawler.py new file mode 100644 index 0000000..d8c23da --- /dev/null +++ b/Crawler/PTTCrawler.py @@ -0,0 +1,82 @@ +from tracemalloc import start +from urllib import response +from itsdangerous import exc +import requests +from bs4 import BeautifulSoup +import time +from CommentJudge import CommentJudge +import jieba +import json +import os + +class PTTCrawler(): + def __init__(self, board, startURL='index.html'): + self.board = board + self.startURL = startURL + self.judge 
class PTTCrawler():
    """Crawls [問卦] question posts from a PTT board and pairs each post
    title with its best push comment (as chosen by CommentJudge).

    Results accumulate in train.json as a list of {"Q": title, "A": comment}
    dicts; an existing train.json is loaded and resumed from.
    """

    def __init__(self, board, startURL='index.html'):
        """board: PTT board name (e.g. "Gossiping");
        startURL: index page filename to start crawling from.

        Raises Exception when the board's index page is not reachable,
        so a bad board name fails fast instead of mid-crawl.
        """
        self.board = board
        self.startURL = startURL
        self.judge = CommentJudge()
        probe = requests.get('https://www.ptt.cc/bbs/{}/index.html'.format(self.board))
        if probe.status_code != 200:
            raise Exception("No board in PTT named {}".format(self.board))

    def getPosts(self, number=100000):
        """Walk index pages backwards, collecting up to `number` Q/A pairs.

        Resumes from train.json when present and checkpoints every 100
        accepted posts. Returns the accumulated list of Q/A dicts.
        Raises Exception on a non-200 index-page response.
        """
        url = 'https://www.ptt.cc/bbs/{}/{}'.format(self.board, self.startURL)

        # Resume from a previous run's checkpoint if one exists.
        if os.path.isfile('train.json'):
            with open('train.json') as fp:
                ans = json.load(fp)
            counter = len(ans)
        else:
            ans = []
            counter = 0

        while counter < number:
            print(url)
            response = requests.get(url, headers={'cookie': 'over18=1;'})
            if response.status_code != 200:
                raise Exception("Response status code {}".format(response.status_code))

            # Collect titles/URLs of the posts on this index page, then
            # fetch each post's push comments.
            root = BeautifulSoup(response.text, 'html.parser')
            for post in root.find_all('div', class_='r-ent'):
                link = post.find("a")
                if link is None:
                    continue  # deleted posts render without a link
                # Keep only original 問卦 question posts; skip replies.
                if "[問卦] " not in link.text or "Re:" in link.text:
                    continue
                counter += 1
                comments = self.getComments(link.get('href'))
                if comments:
                    bestComment, score = self.judge.getBest(comments)
                    ans.append({
                        "Q": link.text.replace('[問卦] ', ''),
                        "A": bestComment
                    })
                    print(ans[-1], counter)
                time.sleep(2.5)  # throttle: be polite to ptt.cc
                if counter % 100 == 0:
                    # Periodic checkpoint so progress survives a crash.
                    # ensure_ascii=False keeps the Chinese text readable.
                    with open('train.json', 'w') as fp:
                        json.dump(ans, fp, ensure_ascii=False)

            # Follow the 上頁 ("previous page") button to older posts.
            for btn in root.find_all('a', class_='btn wide'):
                if '上頁' in btn.text:
                    url = 'https://www.ptt.cc{}'.format(btn.get('href'))
                    print(url)
            print()
        return ans

    def getComments(self, url):
        """Fetch one post page and return its push-comment texts.

        Comments containing links are skipped; the leading ': ' push marker
        is stripped. Returns a (possibly empty) list of strings.
        """
        url = 'https://www.ptt.cc{}'.format(url)
        ans = []
        response = requests.get(url, headers={'cookie': 'over18=1;'})
        root = BeautifulSoup(response.text, 'html.parser')
        for comment in root.find_all('div', class_='push'):
            try:
                text = comment.find_all('span')[2].text
            except IndexError:
                # When a post has too many pushes, truncated entries lack
                # the text span — log and skip instead of swallowing
                # everything with a bare except.
                print(comment)
                continue
            if 'http' not in text:
                ans.append(text.replace(': ', ''))
        return ans
from PTTCrawler import PTTCrawler


def main():
    """Crawl the Gossiping board starting from a fixed index page and
    return the collected Q/A pairs."""
    crawler = PTTCrawler("Gossiping", 'index29686.html')
    posts = crawler.getPosts()
    return posts


# Guard the entry point so importing this module does not start a crawl.
if __name__ == "__main__":
    main()