forked from yidao620c/python3-cookbook
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhtml_parser.py
More file actions
92 lines (75 loc) · 2.38 KB
/
html_parser.py
File metadata and controls
92 lines (75 loc) · 2.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
Topic: sample
Desc :
"""
from html.parser import HTMLParser
import re
from urllib.request import Request, urlopen
class Parselinks(HTMLParser):
    """Collect [href, link-text] pairs for anchors whose href looks like '/doc/<digits>'."""

    def __init__(self):
        self.data = []        # accumulated [href, text] pairs
        self.href = 0         # flag: currently inside a matching <a> element
        self.linkname = ''    # text gathered for the current link
        self.patt = re.compile(r'^/doc/\d+$')
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # Only anchor tags are interesting.
        if tag != 'a':
            return
        for attr_name, attr_value in attrs:
            if attr_name == 'href' and self.patt.match(attr_value):
                self.href = 1
                self.data.append([attr_value])

    def handle_data(self, data):
        # Accumulate character data only while inside a matching anchor.
        if self.href:
            self.linkname += data

    def handle_endtag(self, tag):
        if not (tag == 'a' and self.href):
            return
        # Collapse all whitespace inside the link text, then attach it
        # to the href recorded by handle_starttag.
        self.linkname = ''.join(self.linkname.split()).strip()
        self.data[-1].append(self.linkname)
        self.linkname = ''
        self.href = 0
class ParsePages(HTMLParser):
    """Gather the distinct pagination hrefs of the form '?p=<digits>'."""

    def __init__(self):
        self.data = set()     # unique pagination hrefs, e.g. '?p=3'
        self.href = 0         # flag: currently inside a matching <a> element
        self.patt = re.compile(r'^\?p=\d+$')
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        if tag != 'a':
            return
        for attr_name, attr_value in attrs:
            if attr_name == 'href' and self.patt.match(attr_value):
                self.href = 1
                self.data.add(attr_value)

    def handle_endtag(self, tag):
        # Reset the flag once the matching anchor closes.
        if tag == 'a' and self.href:
            self.href = 0
def fetch_data(pparser, url):
    """Download *url*, feed the decoded body into *pparser*, and return its data.

    :param pparser: an HTMLParser subclass instance exposing a ``data`` attribute
    :param url: the URL to fetch
    :return: ``pparser.data`` (whatever collection the parser accumulated)
    :raises urllib.error.URLError: on network failure
    """
    headers = {
        'User-Agent': '''Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)
Chrome/28.0.1500.72 Safari/537.36'''
    }
    req = Request(
        url=url,
        headers=headers
    )
    # BUG FIX: urlopen(...).read() returns bytes, but HTMLParser.feed()
    # requires str — decode with the charset the server declares (falling
    # back to UTF-8). Also close the response deterministically via `with`.
    with urlopen(req) as resp:
        charset = resp.headers.get_content_charset() or 'utf-8'
        pparser.feed(resp.read().decode(charset, errors='replace'))
    pparser.close()
    return pparser.data
def main():
    """Crawl the doc index: discover pagination links, then print every
    document link whose title matches the target pattern."""
    matched = []
    title_patt = re.compile(r'程序员编码诀窍')
    base_url = 'http://www.oschina.network/doc'
    # First pass: collect all '?p=N' pagination hrefs from the index page.
    page_hrefs = fetch_data(ParsePages(), base_url)
    for page_href in page_hrefs:
        print('**********')
        # Second pass: collect [href, title] pairs from each listing page.
        for link in fetch_data(Parselinks(), base_url + page_href):
            if title_patt.match(link[1]):
                matched.append(link)
    print("*" * 30)
    for entry in matched:
        print('%s -> %s' % tuple(entry))
# Script entry point: run the crawler only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()