PTT-Chatbot/Crawler/PTTCrawler.py

import requests
from bs4 import BeautifulSoup
import time
from CommentJudge import CommentJudge
import json
import os
class PTTCrawler():
    def __init__(self, board, startURL='index.html'):
        self.board = board
        self.startURL = startURL
        self.judge = CommentJudge()
        # Verify that the board actually exists before crawling.
        if requests.get('https://www.ptt.cc/bbs/{}/index.html'.format(self.board)).status_code != 200:
            raise Exception("No board in PTT named {}".format(self.board))
    def getPosts(self, number=100000):
        url = 'https://www.ptt.cc/bbs/{}/{}'.format(self.board, self.startURL)
        # Resume from an existing train.json checkpoint if one is present.
        if os.path.isfile('train.json'):
            with open('train.json') as fp:
                ans = json.load(fp)
            counter = len(ans)
        else:
            ans = []
            counter = 0
        while counter < number:
            print(url)
            response = requests.get(url, headers={'cookie': 'over18=1;'})
            if response.status_code == 200:
                # Extract each post's title and URL, then call getComments() to fetch its comments.
                root = BeautifulSoup(response.text, 'html.parser')
                posts = root.find_all('div', class_='r-ent')
                for post in posts:
                    link = post.find("a")
                    if link:  # None if the post has been deleted
                        if "[問卦] " in link.text and "Re:" not in link.text:
                            counter += 1
                            comments = self.getComments(link.get('href'))
                            if len(comments) != 0:
                                bestComment, score = self.judge.getBest(comments)
                                ans.append({
                                    "Q": link.text.replace('[問卦] ', ''),
                                    "A": bestComment
                                })
                                print(ans[-1], counter)
                            time.sleep(2.5)
                            # Periodically checkpoint the collected Q/A pairs.
                            if counter % 100 == 0:
                                with open('train.json', 'w') as fp:
                                    json.dump(ans, fp)
                # Get the URL of the previous (older) index page.
                btns = root.find_all('a', class_='btn wide')
                for btn in btns:
                    if '上頁' in btn.text:
                        url = 'https://www.ptt.cc{}'.format(btn.get('href'))
                        print(url)
                print()
                # time.sleep(3)
            else:
                raise Exception("Response status code {}".format(response.status_code))
        return ans
    def getComments(self, url):
        url = 'https://www.ptt.cc{}'.format(url)
        ans = []
        response = requests.get(url, headers={'cookie': 'over18=1;'})
        root = BeautifulSoup(response.text, 'html.parser')
        comments = root.find_all('div', class_='push')
        for comment in comments:
            try:
                text = comment.find_all('span')[2].text
                if 'http' not in text:
                    ans.append(text.replace(': ', ''))
            except Exception:
                print(comment)  # posts with too many pushes can contain malformed push entries
        return ans
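
# Usage sketch: a minimal, hypothetical driver for PTTCrawler. The board name
# 'Gossiping' and the post count of 1000 are assumptions for illustration; the
# original file does not show how the class is invoked.
if __name__ == '__main__':
    crawler = PTTCrawler('Gossiping')      # '[問卦]' posts are typically found on the Gossiping board
    pairs = crawler.getPosts(number=1000)  # checkpoints progress to train.json every 100 posts
    print('Collected {} Q/A pairs'.format(len(pairs)))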