
Commit 70bb7d8

Merge remote-tracking branch 'origin/master'

# Conflicts:
#	.idea/workspace.xml

2 parents: 6acac5d + d77120c

File tree

3 files changed (+308, -39 lines)


.idea/workspace.xml

Lines changed: 28 additions & 39 deletions
Some generated files are not rendered by default.

baidu_tieba/ID.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
1000090000

baidu_tieba/tieba.py

Lines changed: 279 additions & 0 deletions
@@ -0,0 +1,279 @@
# -*- coding: utf-8 -*-
# Crawl plain-text data from Tieba posts in ascending post-ID order; each post is
# saved as one ID_<post title>.txt file (in this version the results are written
# to MongoDB and the file output is commented out below).
# ./ID.txt stores the post ID to start crawling from; if the file does not exist,
# the ID defaults to 5000000000.

import re
import os
import threading
import time
import codecs
import multiprocessing
from multiprocessing.dummy import Pool as thpool
import urllib.request
import random
import logging
import pymongo
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

client = pymongo.MongoClient('localhost')
db = client['baidu']
col = db['tieba']

# Multiprocessing lock
m_lock = multiprocessing.Lock()
ua = UserAgent()


def get_title(html):
    try:
        soup = BeautifulSoup(html, 'lxml')
        raw_title = soup.find('h1')
        if not raw_title:
            raw_title = soup.find('h3')
        if not raw_title:
            raw_title = re.findall('很抱歉,该贴已被删除。', html)
            if raw_title:
                raw_title = raw_title[0]
        if not raw_title:
            raw_title = re.findall('该吧被合并您所访问的贴子无法显示', html)
            if raw_title:
                raw_title = raw_title[0]
        if not raw_title:
            raw_title = re.findall('抱歉,您访问的贴子被隐藏,暂时无法访问。', html)
            if raw_title:
                raw_title = raw_title[0]
        if not raw_title:
            return ''
        title = remove_html_tag(str(raw_title))
        return title
    except Exception as e:
        logging.warning('Get title: {}'.format(e))
        return ''


def get_posts_num(html):
    try:
        soup = BeautifulSoup(html, 'lxml')
        raw_posts_num = soup.find('ul', {'class': 'l_posts_num'})
        match = re.findall('pn=[0-9]+', str(raw_posts_num))
        if match:
            last_num_url = match.pop()
            last_num = re.findall('[0-9]+', str(last_num_url))
            return int(last_num[0])
        else:
            return 1
    except Exception as e:
        logging.warning('Get posts num: {}'.format(e))
        return 1


# Not needed for now
def get_floor(content):
    c_content = '<html><body>' + str(content) + '</body></html>'
    try:
        soup = BeautifulSoup(c_content, 'lxml')
        raw_floor = soup.findAll('span', {'class': 'tail-info'})
        f_floor = re.findall('[0-9]+楼', str(raw_floor))
        if f_floor:
            floor = remove_html_tag(str(f_floor[0]))
            return str(floor)
        else:
            return ''
    except Exception as e:
        logging.warning('Get floor: {}'.format(e))
        return ''


def get_whole_page_content(html):
    try:
        soup = BeautifulSoup(html, 'lxml')
        raw_posts_content = soup.findAll('div', {'class': ['d_post_content_main']})
        content = ''
        for post_content in raw_posts_content:
            each_content = get_content(post_content)
            if each_content:
                content = content + each_content + '\n\n'
        return content
    except Exception as e:
        logging.warning('Get whole page content: {}'.format(e))
        return ''


def get_content(text):
    c_text = '<html><body>' + str(text) + '</body></html>'
    try:
        soup = BeautifulSoup(c_text, 'lxml')
        raw_content = soup.find('div', {'class': 'd_post_content'})
        content = re.findall(r'\S.+', remove_html_tag(str(raw_content)))
        if content:
            return str(content[0])
        else:
            return ''
    except Exception as e:
        logging.warning('Get content: {}'.format(e))
        return ''


def save_content(path, content):
    try:
        with codecs.open(path, 'w', 'utf-8') as fw:
            fw.write(content)
    except Exception as e:
        logging.warning('Save content: {}'.format(e))


def remove_html_tag(html):
    html = html.strip()
    dr = re.compile(r'<[^>]+>', re.S)
    html = dr.sub('', html)
    return html


class Spider(object):
    def __init__(self):
        self.list_url_queue = multiprocessing.Manager().list()
        self.seed_url = 'https://tieba.baidu.com/'
        self.post_id_file = './ID.txt'
        self.output_dir = './output/'
        self.post_id = int()

        # Number of worker processes
        self.process_num = 100
        # 150: 2 min, 750
        # 200: 2 min, 880
        # 180: 820
        # How many post IDs to put on the queue in one batch
        self.queue_put_num = 1000

    def get_html(self, url):
        req = urllib.request.Request(url, headers={'User-Agent': ua.random})
        time.sleep(random.randint(3, 20))
        attempts = 0
        attempts_times = 15
        while attempts < attempts_times:
            try:
                website = urllib.request.urlopen(req, timeout=(25 + random.randint(3, 10)))
                html = website.read().decode('utf-8')
                return html
            except Exception as e:
                attempts = attempts + 1
                if attempts == attempts_times:
                    logging.warning('Get html: {0}: {1}'.format(e, url))
        return ''

    def load_post_id(self):
        with open(self.post_id_file, 'r') as fr:
            self.post_id = int(fr.read())

    def save_post_id(self, numb):
        with open(self.post_id_file, 'w') as fw:
            fw.write(str(numb))

    def init_post_id(self):
        with m_lock:
            if not os.path.exists(self.output_dir):
                os.mkdir(self.output_dir)
            if not os.path.exists(self.post_id_file):
                with open(self.post_id_file, 'w') as fw:
                    fw.write('5000000000')

            self.load_post_id()
            logging.info('导入 {} 到队列'.format(str(self.post_id)))
            for post_count in range(self.post_id, self.post_id + self.queue_put_num):
                self.list_url_queue.append(post_count)
            self.save_post_id(int(self.post_id + self.queue_put_num))

    def crawl_post_list(self):
        while True:
            try:
                if not self.list_url_queue:
                    self.init_post_id()

                post_id = self.list_url_queue.pop(0)
                post_id_prefix = re.findall('^[0-9]{6}', str(post_id))
                output_file_path = self.output_dir + str(post_id_prefix[0]) + '/'
                if not os.path.exists(output_file_path):
                    os.makedirs(output_file_path, exist_ok=True)
                post_url = self.seed_url + 'p/' + str(post_id)

            except Exception as e:
                logging.critical('取ID问题: {}'.format(e))
                continue

            try:
                post_html = self.get_html(post_url)
                if not post_html:
                    continue
                post_title = get_title(post_html)
                if post_title == '很抱歉,该贴已被删除。':
                    # logging.error('{}: ---贴子被删---'.format(post_url))
                    continue
                if post_title == '该吧被合并您所访问的贴子无法显示':
                    logging.error('{}: 贴吧被合并无法显示'.format(post_url))
                    continue
                if post_title == '抱歉,您访问的贴子被隐藏,暂时无法访问。':
                    # logging.error('{}: *****帖子被隐藏*****'.format(post_url))
                    continue
                if not post_title:
                    logging.error('{}: 找不到title'.format(post_url))
                    continue
                first_page_content = get_whole_page_content(post_html)
                if not first_page_content:
                    # logging.error('{}: ### 帖子无内容 ###'.format(post_url))
                    continue

                all_content = first_page_content.split('\n\n')
                page_num = get_posts_num(post_html)

                for i in range(page_num):
                    if i != 0:
                        page_url = post_url + '?pn=' + str(i + 1)
                        other_page = self.get_html(page_url)
                        other_content = get_whole_page_content(other_page)
                        all_content = all_content + other_content.split('\n\n')
                if not all_content[-1]:
                    all_content.pop()
                dr = re.compile(r'/|[\\]|[ ]|[|]|[:]|[*]|[<]|[>]|[?]|[\']|["]')
                post_title = dr.sub('_', post_title)
                doc = {
                    'id': post_id,
                    'title': post_title,
                    'content': all_content
                }
                col.insert_one(doc)
                print(doc)
                # output_file = output_file_path + str(post_id) + '_' + post_title + '.txt'
                # save_content(output_file, all_content)
                logging.info('{0} ---{2}--- {1}'.format(post_id, post_title, str(page_num)))

            except Exception as e:
                logging.critical('尚未预料到的错误: {0} | {1}'.format(e, post_url))
                continue

    def start(self):
        self.init_post_id()

        task = []
        for _ in range(1, self.process_num):
            t = threading.Thread(target=self.crawl_post_list)
            t.start()
            task.append(t)
        for t in task:
            t.join()

        # processes = []
        # for i in range(1, self.process_num):
        #     t = multiprocessing.Process(target=self.crawl_post_list, args=())
        #     t.start()
        #     processes.append(t)
        # for t in processes:
        #     t.join()


if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s|PID:%(process)d|%(levelname)s: %(message)s',
                        level=logging.INFO, filename='./log.txt')
    spider = Spider()
    spider.start()
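
For reference, a minimal sketch (not part of this commit) of how the stored records could be read back, assuming a MongoDB instance on localhost, the same 'baidu'/'tieba' database and collection the spider writes to, and pymongo installed:

# read_back.py -- hypothetical helper, not in the repository
import pymongo

client = pymongo.MongoClient('localhost')
col = client['baidu']['tieba']

# Each document holds the numeric post id, the sanitized title, and the post
# text as a list of paragraphs (produced by splitting the page content on '\n\n').
doc = col.find_one()
if doc:
    print(doc['id'], doc['title'])
    print('\n'.join(doc['content']))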
