feat: PTT crawler program

Ting-Jun Wang 2022-06-05 03:50:56 +08:00
commit 5b89104cfa
Signed by: snsd0805
GPG Key ID: 8DB0D22BC1217D33
3 changed files with 134 additions and 0 deletions

Crawler/CommentJudge.py (new file, 48 additions)

@@ -0,0 +1,48 @@
import jieba

class CommentJudge():
    def getBest(self, comments):
        # Return the highest-scoring comment together with its score.
        self.comments = comments
        self.segment_comments = []
        self.wordDict = {}
        self.commentsScore = []
        self.segment()
        self.buildWordDict()
        self.score()
        maxScore, maxIndex = 0, 0
        for index, score in enumerate(self.commentsScore):
            if score > maxScore:
                maxScore = score
                maxIndex = index
        return self.comments[maxIndex], self.commentsScore[maxIndex]

    def segment(self):
        # Cut each comment into words with jieba, dropping whitespace and
        # common half-/full-width punctuation.
        banned = [' ', ',', ',', '。', '?', '?', '=']
        for comment in self.comments:
            words = [word for word in jieba.cut(comment) if word not in banned]
            self.segment_comments.append(words)

    def buildWordDict(self):
        # Count how often each word appears across all comments.
        for comment in self.segment_comments:
            for word in comment:
                if word in self.wordDict:
                    self.wordDict[word] += 1
                else:
                    self.wordDict[word] = 1  # first occurrence counts as 1
        # print(self.wordDict)

    def score(self):
        # A comment's score is the sum of its words' overall frequencies;
        # comments of 15 or more characters are skipped (scored 0).
        for index, comment in enumerate(self.segment_comments):
            if len(self.comments[index]) >= 15:
                self.commentsScore.append(0)
                continue
            weight = 0
            for word in comment:
                weight += self.wordDict[word]
            self.commentsScore.append(weight)
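
A minimal usage sketch (illustrative, not part of this commit): getBest() takes a list of push strings and returns the one whose words are most frequent across the whole list, along with its score. The sample pushes below are made up.

from CommentJudge import CommentJudge

judge = CommentJudge()
comments = ['好文推一個', '認真回 這篇在講什麼', '推']  # made-up sample pushes
best, score = judge.getBest(comments)
print(best, score)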

Crawler/PTTCrawler.py (new file, 82 additions)

@@ -0,0 +1,82 @@
import requests
from bs4 import BeautifulSoup
import time
from CommentJudge import CommentJudge
import json
import os

class PTTCrawler():
    def __init__(self, board, startURL='index.html'):
        self.board = board
        self.startURL = startURL
        self.judge = CommentJudge()
        if requests.get('https://www.ptt.cc/bbs/{}/index.html'.format(self.board)).status_code != 200:
            raise Exception("No board in PTT named {}".format(self.board))

    def getPosts(self, number=100000):
        url = 'https://www.ptt.cc/bbs/{}/{}'.format(self.board, self.startURL)
        # Resume from an earlier run if train.json already exists.
        if os.path.isfile('train.json'):
            with open('train.json') as fp:
                ans = json.load(fp)
            counter = len(ans)
        else:
            ans = []
            counter = 0
        while counter < number:
            print(url)
            response = requests.get(url, headers={'cookie': 'over18=1;'})
            if response.status_code == 200:
                # Grab each post's title and URL, then call getComments() for its pushes.
                root = BeautifulSoup(response.text, 'html.parser')
                posts = root.find_all('div', class_='r-ent')
                for post in posts:
                    link = post.find("a")
                    if link:  # deleted posts have no <a> tag, so link is None
                        if "[問卦] " in link.text and "Re:" not in link.text:
                            counter += 1
                            comments = self.getComments(link.get('href'))
                            if len(comments) != 0:
                                bestComment, score = self.judge.getBest(comments)
                                ans.append({
                                    "Q": link.text.replace('[問卦] ', ''),
                                    "A": bestComment
                                })
                                print(ans[-1], counter)
                            time.sleep(2.5)
                            if counter % 100 == 0:
                                # Checkpoint every 100 posts so progress survives a crash.
                                with open('train.json', 'w') as fp:
                                    json.dump(ans, fp)
                # Find the URL of the previous (older) index page.
                btns = root.find_all('a', class_='btn wide')
                for btn in btns:
                    if '上頁' in btn.text:
                        url = 'https://www.ptt.cc{}'.format(btn.get('href'))
                        print(url)
                        print()
                # time.sleep(3)
            else:
                raise Exception("Response status code {}".format(response.status_code))
        return ans

    def getComments(self, url):
        url = 'https://www.ptt.cc{}'.format(url)
        ans = []
        response = requests.get(url, headers={'cookie': 'over18=1;'})
        root = BeautifulSoup(response.text, 'html.parser')
        comments = root.find_all('div', class_='push')
        for comment in comments:
            try:
                text = comment.find_all('span')[2].text
                if 'http' not in text:  # skip pushes that are just links
                    ans.append(text.replace(': ', ''))
            except IndexError:
                # Posts with very many pushes have truncated push lines
                # that lack the expected <span> structure.
                print(comment)
        return ans
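
For reference, the train.json checkpoint that getPosts() writes is a flat JSON list of {"Q", "A"} dicts. A short sketch (illustrative, not part of the commit) of inspecting it:

import json

with open('train.json') as fp:
    data = json.load(fp)
print(len(data), 'Q/A pairs so far')
print(data[0])  # e.g. {'Q': '...', 'A': '...'}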

Crawler/main.py (new file, 4 additions)

@@ -0,0 +1,4 @@
from PTTCrawler import PTTCrawler

# Crawl the Gossiping board, starting from a fixed index page.
crawler = PTTCrawler("Gossiping", 'index29686.html')
posts = crawler.getPosts()
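
For a quick smoke test (illustrative; number is the existing getPosts parameter), the crawl can be capped instead of running toward the 100000 default:

from PTTCrawler import PTTCrawler

crawler = PTTCrawler('Gossiping')     # defaults to the newest index page
posts = crawler.getPosts(number=10)   # stop after roughly 10 [問卦] posts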