1+ #!/usr/bin/env python3
2+ # coding: utf-8
3+ # File: word_monitor.py
4+ # Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
5+ # Date: 18-5-28
import logging
import os
import urllib.parse
import urllib.request

from lxml import etree

import extract_news
10+
11+
class NewsMonitor:
    """Search news.baidu.com for a keyword and save the matching articles.

    Typical use is ``search_news(word)`` to gather article links, followed
    by ``collect_news(links)`` to download each article into ``news/``.
    """

    def __init__(self):
        """Record the example search pages of the two news sites.

        This method was previously spelled ``__int__``, so it never ran as
        the constructor and neither attribute was ever set.
        """
        # Attribute name (sic) is kept as originally spelled so that any
        # external reference to it keeps working.
        self.sogou_hompage = 'http://news.sogou.com/news?mode=0&manual=true&query=%D6%D0%D0%CB&sort=1&page=2'
        self.baidu_homepage = 'http://news.baidu.com/ns?word=title%3A%28%E4%B8%AD%E5%85%B4%29&pn=0&cl=2&ct=1&tn=newstitle&rn=20&ie=utf-8&bt=0&et=0'

    def get_html(self, url):
        """Download *url* and return the response body decoded as UTF-8.

        Raises:
            urllib.error.URLError: if the request fails.
            UnicodeDecodeError: if the body is not valid UTF-8.
        """
        headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/600.5.17 (KHTML, like Gecko) Version/8.0.5 Safari/600.5.17"}
        req = urllib.request.Request(url, headers=headers)
        # The context manager closes the connection even if read/decode fails.
        with urllib.request.urlopen(req) as response:
            return response.read().decode('utf-8')

    def get_news(self, html):
        """Return the ``href`` of every ``<h3 class="c-title"><a>`` in *html*.

        Also prints how many links were found on the page.
        """
        selector = etree.HTML(html)
        urls = selector.xpath('//h3[@class="c-title"]/a/@href')
        print(len(urls))
        return urls

    def search_news(self, word):
        """Return the set of article links found for *word*.

        One result page is requested per ``pn`` value in 20, 40, ..., 780.
        Any download error propagates to the caller.
        """
        quoted = urllib.parse.quote_plus(word)
        found = set()
        # NOTE(review): the range starts at 20, so the ``pn=0`` page is never
        # requested -- confirm that skipping it is intended.
        for page_num in range(20, 800, 20):
            news_url = 'http://news.baidu.com/ns?word=title%3A%28' + quoted + '%29&pn=' + str(page_num) + '&cl=2&ct=0&tn=newstitle&rn=20&ie=utf-8&bt=0&et=0'
            found.update(self.get_news(self.get_html(news_url)))
        return found

    @staticmethod
    def _news_path(pubtime, title):
        """Build the output path ``news/<pubtime>@<title>.txt``.

        ``/`` inside the name is replaced by ``_``; otherwise it would be
        read as a directory separator and the file could not be created.
        """
        name = (pubtime + '@' + title).replace('/', '_')
        return 'news/%s.txt' % name

    def collect_news(self, urls):
        """Parse every URL in *urls* and write its content under ``news/``.

        Collection is best-effort: a URL that cannot be parsed or saved is
        logged with its traceback and skipped, and the loop continues.
        """
        logger = logging.getLogger(__name__)
        try:
            # Without the directory every single write below would fail.
            os.makedirs('news', exist_ok=True)
        except OSError:
            logger.warning('cannot create output directory news/', exc_info=True)

        for url in urls:
            try:
                data = extract_news.online_parse(url)
                print(data)
                path = self._news_path(data['news_pubtime'], data['news_title'])
                # Read the content before opening the file so a missing key
                # does not leave an empty file behind.
                news_content = data['news_content']
                with open(path, 'w', encoding='utf-8') as f:
                    f.write(news_content)
            except Exception:
                # Deliberately broad: one bad page must not stop the crawl,
                # but unlike a bare ``except`` this lets KeyboardInterrupt
                # and SystemExit through and leaves a trace of the failure.
                logger.warning('skipping %s', url, exc_info=True)
55+
def demo():
    """Crawl the news for one sample keyword and record the links found.

    Searches for the keyword, saves every article via
    ``NewsMonitor.collect_news``, then writes the collected links to
    ``news_list.txt``.
    """
    news_monitor = NewsMonitor()
    word = '中兴'
    news_list = news_monitor.search_news(word)
    news_monitor.collect_news(news_list)
    # ``with`` guarantees the file is closed even if a write fails.
    with open('news_list.txt', 'w+') as f:
        for news in news_list:
            # NOTE(review): the separator is a newline followed by a space,
            # so every link after the first starts with a blank -- confirm
            # that this is intended.
            f.write(news + "\n ")
65+
66+
# Run the demo crawl only when this file is executed as a script, so that
# importing the module does not trigger network requests and file writes.
if __name__ == '__main__':
    demo()
0 commit comments