
Commit 95bc157

committed: Create the eventmonitor project (initial commit, 0 parents)

611 files changed (+13029, -0 lines)


__init__.py

Lines changed: 5 additions & 0 deletions
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author: chenhe<hee0624@163.com>
# time: 2017-11-30
# version: 1.0

event_monitor.py

Lines changed: 67 additions & 0 deletions
#!/usr/bin/env python3
# coding: utf-8
# File: word_monitor.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-5-28
import os
import urllib.parse
import urllib.request

from lxml import etree

import extract_news


class NewsMonitor:
    """Searches Baidu News for a keyword and saves the matching articles to disk."""

    def __init__(self):
        # Example search pages for the keyword "中兴" (ZTE) on Sogou and Baidu News.
        self.sogou_homepage = 'http://news.sogou.com/news?mode=0&manual=true&query=%D6%D0%D0%CB&sort=1&page=2'
        self.baidu_homepage = 'http://news.baidu.com/ns?word=title%3A%28%E4%B8%AD%E5%85%B4%29&pn=0&cl=2&ct=1&tn=newstitle&rn=20&ie=utf-8&bt=0&et=0'

    def get_html(self, url):
        """Fetch a page with a browser User-Agent and return the decoded HTML."""
        headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/600.5.17 (KHTML, like Gecko) Version/8.0.5 Safari/600.5.17"}
        req = urllib.request.Request(url, headers=headers)
        html = urllib.request.urlopen(req).read().decode('utf-8')
        return html

    def get_news(self, html):
        """Extract the article links from a Baidu News result page."""
        selector = etree.HTML(html)
        urls = selector.xpath('//h3[@class="c-title"]/a/@href')
        print(len(urls))
        return urls

    def search_news(self, word):
        """Page through the Baidu News title search for `word` and return the deduplicated links."""
        url_list = []
        word = urllib.parse.quote_plus(word)
        for page_num in range(20, 800, 20):
            news_url = 'http://news.baidu.com/ns?word=title%3A%28' + word + '%29&pn=' + str(page_num) + '&cl=2&ct=0&tn=newstitle&rn=20&ie=utf-8&bt=0&et=0'
            news_html = self.get_html(news_url)
            news_links = self.get_news(news_html)
            url_list += news_links
        return set(url_list)

    def collect_news(self, urls):
        """Parse each article with extract_news and write it to news/<pubtime>@<title>.txt."""
        os.makedirs('news', exist_ok=True)
        for url in urls:
            try:
                data = extract_news.online_parse(url)
                print(data)
                try:
                    file_name = data['news_pubtime']
                    news_title = data['news_title']
                    news_content = data['news_content']
                    with open('news/%s.txt' % (file_name + '@' + news_title), 'w+', encoding='utf-8') as f:
                        f.write(news_content)
                except Exception:
                    # Skip articles whose parse result lacks the expected fields
                    # or whose title cannot be used as a file name.
                    pass
            except Exception:
                # Skip pages that cannot be fetched or parsed.
                pass


def demo():
    """Search for news about 中兴 (ZTE), save each article, and record the link list."""
    news_monitor = NewsMonitor()
    word = '中兴'
    news_list = news_monitor.search_news(word)
    news_monitor.collect_news(news_list)
    with open('news_list.txt', 'w+', encoding='utf-8') as f:
        for news in news_list:
            f.write(news + "\n")


if __name__ == '__main__':
    demo()
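
Since demo() hard-codes the keyword and output paths, a minimal usage sketch may help when the monitor is reused from another script. It assumes event_monitor.py and the repo's extract_news module are on the import path and that the __main__ guard above is present; the query term and output file name are illustrative only, not part of the commit.

# Usage sketch (assumed import path; output file name is illustrative).
from event_monitor import NewsMonitor

monitor = NewsMonitor()
links = monitor.search_news('中兴')   # keyword is URL-encoded inside search_news()
monitor.collect_news(links)           # writes one .txt per article under news/
with open('zte_news_links.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(sorted(links)))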
