forked from liuhuanyong/EventMonitor
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhandle_html.py
More file actions
74 lines (60 loc) · 1.77 KB
/
handle_html.py
File metadata and controls
74 lines (60 loc) · 1.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author: chenhe<hee0624@163.com>
# time: 2017-11-30
# version: 1.0
from html.parser import HTMLParser
from bs4 import BeautifulSoup
class StripParser(HTMLParser):
    """
    Re-serialise HTML while removing selected elements and their content.

    Every element whose tag is in ``drop_tags`` is discarded together with
    everything nested inside it (text and child tags).  All other tags are
    written back in simplified form: only ``<p>`` keeps its attributes,
    every other start tag is emitted bare (``<div>``, ``<a>``, ...).

    The collected fragments are stored in ``fed`` and joined with newlines
    by :meth:`get_html`.
    """

    def __init__(self):
        # Let the base class perform its own initialisation (it sets
        # convert_charrefs and calls reset()) instead of re-implementing it.
        super().__init__(convert_charrefs=True)
        # Legacy attribute from the old ``strict`` constructor argument;
        # kept only so code that inspects it keeps working.
        self.strict = False
        self.drop_tags = {'script', 'style', 'iframe', 'aside', 'nav', 'footer'}
        # Output fragments, in document order.
        self.fed = []
        # Stack of currently open dropped tags (innermost last).
        self.point_tags = []
        # True while output is enabled, i.e. while point_tags is empty.
        self.is_fed = True

    def handle_starttag(self, tag, attrs):
        """Record a start tag, or begin suppressing output for a dropped tag.

        :param tag: lower-cased tag name
        :param attrs: list of ``(name, value)`` pairs; ``value`` is ``None``
                      for attributes written without a value
        """
        if tag in self.drop_tags:
            self.is_fed = False
            self.point_tags.append(tag)
            return
        if not self.is_fed:
            # Inside a dropped element: its children must vanish as well,
            # otherwise empty tag shells such as "<a></a>" leak out.
            return
        if tag == 'p':
            # Valueless attributes (e.g. ``<p hidden>``) are emitted by name
            # only rather than as ``hidden="None"``.
            rendered = ' '.join(
                name if value is None else '{0}="{1}"'.format(name, value)
                for name, value in attrs
            )
            # The space after "p" is kept even without attributes ("<p >")
            # so the output format for paragraphs stays unchanged.
            self.fed.append('<p {}>'.format(rendered))
        else:
            self.fed.append('<{}>'.format(tag))

    def handle_data(self, data):
        """Keep text only while no dropped element is open."""
        if self.is_fed:
            self.fed.append(data)

    def handle_endtag(self, tag):
        """Record an end tag, or close the matching dropped element.

        :param tag: lower-cased tag name
        """
        if tag in self.drop_tags:
            # A stray closing tag with no matching opener is ignored
            # (previously this indexed an empty list and raised IndexError).
            if tag in self.point_tags:
                # Unwind to the matching opener; dropped tags left unclosed
                # inside it are closed implicitly so output can resume.
                while self.point_tags.pop() != tag:
                    pass
                if not self.point_tags:
                    self.is_fed = True
            return
        if self.is_fed:
            self.fed.append('</{}>'.format(tag))

    def get_html(self):
        """Return the collected fragments joined by newlines.

        :return: string
        """
        return '\n'.join(self.fed)
def pretty_html(html):
    """
    Parse *html* with BeautifulSoup's 'html.parser' backend and return the
    result of ``prettify()`` on the parsed document.

    :param html: string
    :return: string
    """
    return BeautifulSoup(html, 'html.parser').prettify()
def strip_tag(html):
    """
    Remove specific tags from an HTML string.

    The input is run through a fresh StripParser and the fragments it
    collected are returned as one string.

    :param html: string
    :return: string
    """
    s = StripParser()
    s.feed(html)
    # feed() may hold back trailing input it cannot classify yet (for
    # example text ending in a bare "&", which could be a split character
    # reference).  close() forces that buffered remainder to be processed
    # so it is not silently lost.
    s.close()
    return s.get_html()
def handle_html(html):
    """
    Normalise *html* with pretty_html, then pass the result through
    strip_tag and return what it produces.

    :param html: string
    :return: string
    """
    return strip_tag(pretty_html(html))