feat: PTT 爬蟲程式
This commit is contained in:
commit
5b89104cfa
48
Crawler/CommentJudge.py
Normal file
48
Crawler/CommentJudge.py
Normal file
@ -0,0 +1,48 @@
|
||||
import jieba
|
||||
|
||||
class CommentJudge():
    """Rank PTT push comments by word frequency and pick the "best" one.

    A comment's score is the sum of the corpus-wide frequencies of its
    segmented words, so comments that share many common words with the
    other comments on the same post rank highest.
    """

    def getBest(self, comments):
        """Return ``(best_comment, best_score)`` for the given comments.

        comments: non-empty list of raw comment strings (Chinese text).
        Raises ValueError when ``comments`` is empty.
        All internal state is reset, so one instance can be reused.
        """
        if not comments:
            raise ValueError("comments must not be empty")

        self.comments = comments
        self.segment_comments = []
        self.wordDict = {}
        self.commentsScore = []

        self.segment()
        self.buildWordDict()
        self.score()

        # First maximal score wins on ties, matching the original
        # strict-greater-than scan.
        maxIndex = max(range(len(self.commentsScore)),
                       key=lambda i: self.commentsScore[i])
        return self.comments[maxIndex], self.commentsScore[maxIndex]

    def segment(self):
        """Segment every comment with jieba, dropping spaces and punctuation."""
        # Set membership is O(1) per word; the original used a list.
        banned = {' ', ',', ',', '。', '?', '?', '='}
        for comment in self.comments:
            self.segment_comments.append(
                [word for word in jieba.cut(comment) if word not in banned])

    def buildWordDict(self):
        """Count how often each word occurs across all segmented comments.

        Bug fix: the first occurrence of a word used to initialise its count
        to 0 instead of 1, undercounting every word by one and giving words
        that appear only once zero weight in score().
        """
        for comment in self.segment_comments:
            for word in comment:
                self.wordDict[word] = self.wordDict.get(word, 0) + 1
        # print(self.wordDict)

    def score(self):
        """Compute one frequency score per comment, in comment order.

        Comments of 15 or more characters are forced to score 0 — long
        replies are considered noise for a short Q/A answer.
        """
        for words, raw in zip(self.segment_comments, self.comments):
            if len(raw) >= 15:
                self.commentsScore.append(0)
            else:
                self.commentsScore.append(
                    sum(self.wordDict[word] for word in words))
|
||||
82
Crawler/PTTCrawler.py
Normal file
82
Crawler/PTTCrawler.py
Normal file
@ -0,0 +1,82 @@
|
||||
from tracemalloc import start
|
||||
from urllib import response
|
||||
from itsdangerous import exc
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import time
|
||||
from CommentJudge import CommentJudge
|
||||
import jieba
|
||||
import json
|
||||
import os
|
||||
|
||||
class PTTCrawler():
    """Crawl [問卦] ("asking") posts from a PTT board into Q/A pairs.

    For each qualifying post the push comments are ranked with CommentJudge
    and the best one becomes the answer. Progress is checkpointed to
    ``train.json`` every 100 posts and resumed on the next run.
    """

    def __init__(self, board, startURL='index.html'):
        """Remember the crawl target and verify the board exists.

        board: PTT board name, e.g. ``Gossiping``.
        startURL: index page file name to start from (newest page by default).
        Raises Exception when PTT does not answer 200 for the board's index.
        """
        self.board = board
        self.startURL = startURL
        self.judge = CommentJudge()
        # Send the over18 cookie here as well, so age-restricted boards are
        # not mistaken for missing ones.
        check = requests.get(
            'https://www.ptt.cc/bbs/{}/index.html'.format(self.board),
            headers={'cookie': 'over18=1;'})
        if check.status_code != 200:
            raise Exception("No board in PTT named {}".format(self.board))

    def getPosts(self, number=100000):
        """Crawl index pages (newest to oldest) until ``number`` posts are seen.

        Returns the accumulated list of ``{"Q": question, "A": best comment}``
        dicts. Raises Exception on a non-200 index-page response.
        """
        url = 'https://www.ptt.cc/bbs/{}/{}'.format(self.board, self.startURL)

        # Resume from a previous run's checkpoint when one exists.
        if os.path.isfile('train.json'):
            with open('train.json', encoding='utf-8') as fp:
                ans = json.load(fp)
            counter = len(ans)
        else:
            ans = []
            counter = 0

        while counter < number:
            print(url)
            response = requests.get(url, headers={'cookie': 'over18=1;'})
            if response.status_code != 200:
                raise Exception(
                    "Response status code {}".format(response.status_code))

            # Walk the post list; deleted posts have no <a> inside their row.
            root = BeautifulSoup(response.text, 'html.parser')
            for post in root.find_all('div', class_='r-ent'):
                link = post.find("a")
                if link and "[問卦] " in link.text and "Re:" not in link.text:
                    counter += 1

                    comments = self.getComments(link.get('href'))
                    if len(comments) != 0:
                        bestComment, score = self.judge.getBest(comments)
                        ans.append({
                            "Q": link.text.replace('[問卦] ', ''),
                            "A": bestComment
                        })
                        print(ans[-1], counter)
                    time.sleep(2.5)  # rate-limit: be polite to the server
                    if counter % 100 == 0:
                        # Periodic checkpoint so a crash loses little work.
                        with open('train.json', 'w', encoding='utf-8') as fp:
                            json.dump(ans, fp, ensure_ascii=False)

            # Follow the 上頁 (previous page) button to older posts.
            prevHref = None
            for btn in root.find_all('a', class_='btn wide'):
                if '上頁' in btn.text:
                    prevHref = btn.get('href')
                    break
            if prevHref is None:
                # Bug fix: without this, a missing 上頁 button left ``url``
                # unchanged and the loop re-crawled the same page forever.
                break
            url = 'https://www.ptt.cc{}'.format(prevHref)
            print(url)
            print()
            # time.sleep(3)
        return ans

    def getComments(self, url):
        """Fetch one post and return its push-comment texts.

        url: the post href relative to ptt.cc (from the index page).
        Comments containing 'http' (link spam) are dropped; the leading
        ': ' push prefix is stripped.
        """
        response = requests.get('https://www.ptt.cc{}'.format(url),
                                headers={'cookie': 'over18=1;'})
        root = BeautifulSoup(response.text, 'html.parser')
        ans = []
        for comment in root.find_all('div', class_='push'):
            spans = comment.find_all('span')
            # Bug fix: the original bare ``except:`` around spans[2] swallowed
            # every error (even KeyboardInterrupt). Truncated pushes — which
            # appear when a post has too many comments — simply lack the text
            # span, so test for that explicitly instead.
            if len(spans) < 3:
                print(comment)
                continue
            text = spans[2].text
            if 'http' not in text:
                ans.append(text.replace(': ', ''))
        return ans
|
||||
4
Crawler/main.py
Normal file
4
Crawler/main.py
Normal file
@ -0,0 +1,4 @@
|
||||
from PTTCrawler import PTTCrawler


def main():
    """Crawl the Gossiping board, starting from a fixed (older) index page."""
    crawler = PTTCrawler("Gossiping", 'index29686.html')
    crawler.getPosts()


# Guard the entry point so importing this module does not start a crawl.
if __name__ == '__main__':
    main()
|
||||
Loading…
Reference in New Issue
Block a user