# -*- coding: utf-8 -*-
# Crawl Tieba posts as plain text in ascending post-ID order; each post is saved as a separate <ID>_<post title>.txt file
# ./ID.txt stores the ID to start crawling from; if the file does not exist, the starting ID defaults to 5000000000

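# Environment assumptions: the pymongo, beautifulsoup4, lxml and fake-useragent packages are
# installed, and a MongoDB server is reachable on localhost (posts are written to baidu.tieba).
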
import re
import os
import threading
import time
import codecs
import multiprocessing
from multiprocessing.dummy import Pool as thpool
import urllib.request
import random
import logging
import pymongo
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

client = pymongo.MongoClient('localhost')
db = client['baidu']
col = db['tieba']

# Lock shared by the workers
m_lock = multiprocessing.Lock()
ua = UserAgent()


def get_title(html):
    try:
        soup = BeautifulSoup(html, 'lxml')
        raw_title = soup.find('h1')
        if not raw_title:
            raw_title = soup.find('h3')
        if not raw_title:
            # Fall back to the "post has been deleted" notice text
            raw_title = re.findall('很抱歉,该贴已被删除。', html)
            if raw_title:
                raw_title = raw_title[0]
        if not raw_title:
            # Fall back to the "forum merged, post cannot be shown" notice text
            raw_title = re.findall('该吧被合并您所访问的贴子无法显示', html)
            if raw_title:
                raw_title = raw_title[0]
        if not raw_title:
            # Fall back to the "post is hidden" notice text
            raw_title = re.findall('抱歉,您访问的贴子被隐藏,暂时无法访问。', html)
            if raw_title:
                raw_title = raw_title[0]
        if not raw_title:
            return ''
        title = remove_html_tag(str(raw_title))
        return title
    except Exception as e:
        logging.warning('Get title: {}'.format(e))
        return ''


def get_posts_num(html):
    # Work out how many pages the post spans from the pagination links
    try:
        soup = BeautifulSoup(html, 'lxml')
        raw_posts_num = soup.find('ul', {'class': 'l_posts_num'})
        match = re.findall('pn=[0-9]+', str(raw_posts_num))
        if match:
            last_num_url = match.pop()
            last_num = re.findall('[0-9]+', str(last_num_url))
            return int(last_num[0])
        else:
            return 1
    except Exception as e:
        logging.warning('Get posts num: {}'.format(e))
        return 1


# Not needed for now
def get_floor(content):
    c_content = '<html><body>' + str(content) + '</html></body>'
    try:
        soup = BeautifulSoup(c_content, 'lxml')
        raw_floor = soup.findAll('span', {'class': 'tail-info'})
        f_floor = re.findall('[0-9]+楼', str(raw_floor))
        if f_floor:
            floor = remove_html_tag(str(f_floor[0]))
            return str(floor)
        else:
            return ''
    except Exception as e:
        logging.warning('Get floor: {}'.format(e))
        return ''


def get_whole_page_content(html):
    try:
        soup = BeautifulSoup(html, 'lxml')
        raw_posts_content = soup.findAll('div', {'class': ['d_post_content_main']})
        content = ''
        for post_content in raw_posts_content:
            each_content = get_content(post_content)
            if each_content:
                content = content + each_content + '\n\n'
        return content
    except Exception as e:
        logging.warning('Get whole page content: {}'.format(e))
        return ''


def get_content(text):
    c_text = '<html><body>' + str(text) + '</html></body>'
    try:
        soup = BeautifulSoup(c_text, 'lxml')
        raw_content = soup.find('div', {'class': 'd_post_content'})
        content = re.findall(r'\S.+', remove_html_tag(str(raw_content)))
        if content:
            return str(content[0])
        else:
            return ''
    except Exception as e:
        logging.warning('Get content: {}'.format(e))
        return ''


def save_content(path, content):
    try:
        with codecs.open(path, 'w', 'utf-8') as fw:
            fw.write(content)
    except Exception as e:
        logging.warning('Save content: {}'.format(e))


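# Illustrative example: remove_html_tag('<h1>标题</h1>') returns '标题'. Leading/trailing
# whitespace is stripped first, then every HTML tag is removed in one regex pass.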
def remove_html_tag(html):
    html = html.strip()
    dr = re.compile(r'<[^>]+>', re.S)
    html = dr.sub('', html)
    return html


class Spider(object):
    def __init__(self):
        self.list_url_queue = multiprocessing.Manager().list()
        self.seed_url = 'https://tieba.baidu.com/'
        self.post_id_file = './ID.txt'
        self.output_dir = './output/'
        self.post_id = int()

        # Number of concurrent workers
        self.process_num = 100
        # Tuning notes: 150 -> 2min 750; 200 -> 2min 880; 180 -> 820
        # How many post IDs to push into the queue at a time
        self.queue_put_num = 1000

    def get_html(self, url):
        req = urllib.request.Request(url, headers={'User-Agent': ua.random})
        # Random delay to throttle requests
        time.sleep(random.randint(3, 20))
        attempts = 0
        attempts_times = 15
        while attempts < attempts_times:
            try:
                website = urllib.request.urlopen(req, timeout=(25 + random.randint(3, 10)))
                html = website.read().decode('utf-8')
                return html
            except Exception as e:
                attempts = attempts + 1
                if attempts == attempts_times:
                    logging.warning('Get html: {0}: {1}'.format(e, url))
                    return ''

    def load_post_id(self):
        with open(self.post_id_file, 'r') as fr:
            self.post_id = int(fr.read())

    def save_post_id(self, numb):
        with open(self.post_id_file, 'w') as fw:
            fw.write(str(numb))

    def init_post_id(self):
        with m_lock:
            if not os.path.exists(self.output_dir):
                os.mkdir(self.output_dir)
            if not os.path.exists(self.post_id_file):
                with open(self.post_id_file, 'w') as fw:
                    fw.write('5000000000')

            self.load_post_id()
            logging.info('Loading IDs starting from {} into the queue'.format(str(self.post_id)))
            for post_count in range(self.post_id, self.post_id + self.queue_put_num):
                self.list_url_queue.append(post_count)
            self.save_post_id(int(self.post_id + self.queue_put_num))

    def crawl_post_list(self):
        while True:
            try:
                if not self.list_url_queue:
                    self.init_post_id()

                post_id = self.list_url_queue.pop(0)
                # Bucket output into a sub-directory named after the first 6 digits of the post ID
                post_id_prefix = re.findall('^[0-9]{6}', str(post_id))
                output_file_path = self.output_dir + str(post_id_prefix[0]) + '/'
                if not os.path.exists(output_file_path):
                    os.makedirs(output_file_path, exist_ok=True)
                post_url = self.seed_url + 'p/' + str(post_id)

            except Exception as e:
                logging.critical('Problem fetching post ID: {}'.format(e))
                continue

            try:
                post_html = self.get_html(post_url)
                if not post_html:
                    continue
                post_title = get_title(post_html)
                if post_title == '很抱歉,该贴已被删除。':
                    # logging.error('{}: post was deleted'.format(post_url))
                    continue
                if post_title == '该吧被合并您所访问的贴子无法显示':
                    logging.error('{}: forum merged, post cannot be shown'.format(post_url))
                    continue
                if post_title == '抱歉,您访问的贴子被隐藏,暂时无法访问。':
                    # logging.error('{}: post is hidden'.format(post_url))
                    continue
                if not post_title:
                    logging.error('{}: title not found'.format(post_url))
                    continue
                first_page_content = get_whole_page_content(post_html)
                if not first_page_content:
                    # logging.error('{}: post has no content'.format(post_url))
                    continue

                all_content = first_page_content.split('\n\n')
                page_num = get_posts_num(post_html)

                # Page 1 was already fetched above; fetch the remaining pages
                for i in range(page_num):
                    if i != 0:
                        page_url = post_url + '?pn=' + str(i + 1)
                        other_page = self.get_html(page_url)
                        other_content = get_whole_page_content(other_page)
                        all_content = all_content + other_content.split('\n\n')
                if not all_content[-1]:
                    all_content.pop()
                # Replace characters that are illegal in file names with underscores
                dr = re.compile(r'/|[\\]|[ ]|[|]|[:]|[*]|[<]|[>]|[?]|[\']|["]')
                post_title = dr.sub('_', post_title)
                doc = {
                    'id': post_id,
                    'title': post_title,
                    'content': all_content
                }
                # insert_one replaces the deprecated Collection.insert
                col.insert_one(doc)
                print(doc)
                # output_file = output_file_path + str(post_id) + '_' + post_title + '.txt'
                # save_content(output_file, all_content)
                logging.info('{0} ---{2}--- {1}'.format(post_id, post_title, str(page_num)))

            except Exception as e:
                logging.critical('Unexpected error: {0} | {1}'.format(e, post_url))
                continue

    def start(self):
        self.init_post_id()

        # Spawn self.process_num worker threads
        task = []
        for _ in range(self.process_num):
            t = threading.Thread(target=self.crawl_post_list)
            t.start()
            task.append(t)
        for t in task:
            t.join()

        # processes = []
        # for i in range(1, self.process_num):
        #     t = multiprocessing.Process(target=self.crawl_post_list, args=())
        #     t.start()
        #     processes.append(t)
        # for t in processes:
        #     t.join()


if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s|PID:%(process)d|%(levelname)s: %(message)s',
                        level=logging.INFO, filename='./log.txt')
    spider = Spider()
    spider.start()
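
# Example run (the script file name here is hypothetical): python tieba_spider.py
# Progress is logged to ./log.txt and the next starting ID is persisted back to ./ID.txt.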