comparison roundup/dehtml.py @ 5997:1700542408f3

flake8 cleanups for dehtml.py. Note: you need to disable flake8's long-line check for this file, as there is a test example that requires really long lines of htmlized output.
author John Rouillard <rouilj@ieee.org>
date Wed, 25 Dec 2019 20:18:39 -0500
parents b74f0b50bef1
children af81e7a4302f
comparison
equal deleted inserted replaced
5996:69a35d164a69 5997:1700542408f3
1 1
2 from __future__ import print_function 2 from __future__ import print_function
3 from roundup.anypy.strings import u2s, uchr 3 from roundup.anypy.strings import u2s, uchr
4
5
4 class dehtml: 6 class dehtml:
5 def __init__(self, converter): 7 def __init__(self, converter):
6 if converter == "none": 8 if converter == "none":
7 self.html2text = None 9 self.html2text = None
8 return 10 return
9 11
10 try: 12 try:
11 if converter == "beautifulsoup": 13 if converter == "beautifulsoup":
12 # Not as well tested as dehtml. 14 # Not as well tested as dehtml.
13 from bs4 import BeautifulSoup 15 from bs4 import BeautifulSoup
16
14 def html2text(html): 17 def html2text(html):
15 soup = BeautifulSoup(html) 18 soup = BeautifulSoup(html)
16 19
17 # kill all script and style elements 20 # kill all script and style elements
18 for script in soup(["script", "style"]): 21 for script in soup(["script", "style"]):
20 23
21 return u2s(soup.get_text('\n', strip=True)) 24 return u2s(soup.get_text('\n', strip=True))
22 25
23 self.html2text = html2text 26 self.html2text = html2text
24 else: 27 else:
25 raise ImportError # use 28 raise ImportError
26 except ImportError: 29 except ImportError:
27 # use the fallback below if beautiful soup is not installed. 30 # use the fallback below if beautiful soup is not installed.
28 try: 31 try:
29 # Python 3+. 32 # Python 3+.
30 from html.parser import HTMLParser 33 from html.parser import HTMLParser
31 from html.entities import name2codepoint 34 from html.entities import name2codepoint
32 pyver=3 35 pyver = 3
33 except ImportError: 36 except ImportError:
34 # Python 2. 37 # Python 2.
35 from HTMLParser import HTMLParser 38 from HTMLParser import HTMLParser
36 from htmlentitydefs import name2codepoint 39 from htmlentitydefs import name2codepoint
37 pyver=2 40 pyver = 2
38 41
39 class DumbHTMLParser(HTMLParser): 42 class DumbHTMLParser(HTMLParser):
40 # class attribute 43 # class attribute
41 text="" 44 text = ""
42 45
43 # internal state variable 46 # internal state variable
44 _skip_data = False 47 _skip_data = False
45 _last_empty = False 48 _last_empty = False
46 49
47 def handle_data(self, data): 50 def handle_data(self, data):
48 if self._skip_data: # skip data if in script or style block 51 if self._skip_data: # skip data in script or style block
49 return 52 return
50 53
51 if ( data.strip() == ""): 54 if (data.strip() == ""):
52 # reduce multiple blank lines to 1 55 # reduce multiple blank lines to 1
53 if ( self._last_empty ): 56 if (self._last_empty):
54 return 57 return
55 else: 58 else:
56 self._last_empty = True 59 self._last_empty = True
57 else: 60 else:
58 self._last_empty = False 61 self._last_empty = False
59 62
60 self.text=self.text + data 63 self.text = self.text + data
61 64
62 def handle_starttag(self, tag, attrs): 65 def handle_starttag(self, tag, attrs):
63 if (tag == "p" ): 66 if (tag == "p"):
64 self.text= self.text + "\n" 67 self.text = self.text + "\n"
65 if (tag in ("style", "script")): 68 if (tag in ("style", "script")):
66 self._skip_data = True 69 self._skip_data = True
67 70
68 def handle_endtag(self, tag): 71 def handle_endtag(self, tag):
69 if (tag in ("style", "script")): 72 if (tag in ("style", "script")):
70 self._skip_data = False 73 self._skip_data = False
71 74
72 def handle_entityref(self, name): 75 def handle_entityref(self, name):
73 if self._skip_data: 76 if self._skip_data:
74 return 77 return
75 c = uchr(name2codepoint[name]) 78 c = uchr(name2codepoint[name])
76 try: 79 try:
77 self.text= self.text + c 80 self.text = self.text + c
78 except UnicodeEncodeError: 81 except UnicodeEncodeError:
79 # print a space as a placeholder 82 # print a space as a placeholder
80 self.text= self.text + ' ' 83 self.text = self.text + ' '
81 84
82 def html2text(html): 85 def html2text(html):
83 if pyver == 3: 86 if pyver == 3:
84 parser = DumbHTMLParser(convert_charrefs=True) 87 parser = DumbHTMLParser(convert_charrefs=True)
85 else: 88 else:
88 parser.close() 91 parser.close()
89 return parser.text 92 return parser.text
90 93
91 self.html2text = html2text 94 self.html2text = html2text
92 95
96
93 if "__main__" == __name__: 97 if "__main__" == __name__:
94 html=''' 98 html = '''
95 <body> 99 <body>
96 <script> 100 <script>
97 this must not be in output 101 this must not be in output
98 </script> 102 </script>
99 <style> 103 <style>
150 # trap error seen if N_TOKENS not defined when run. 154 # trap error seen if N_TOKENS not defined when run.
151 html2text = dehtml("beautifulsoup").html2text 155 html2text = dehtml("beautifulsoup").html2text
152 if html2text: 156 if html2text:
153 print(html2text(html)) 157 print(html2text(html))
154 except NameError as e: 158 except NameError as e:
155 print("captured error %s"%e) 159 print("captured error %s" % e)
156 160
157 html2text = dehtml("none").html2text 161 html2text = dehtml("none").html2text
158 if html2text: 162 if html2text:
159 print("FAIL: Error, dehtml(none) is returning a function") 163 print("FAIL: Error, dehtml(none) is returning a function")
160 else: 164 else:
161 print("PASS: dehtml(none) is returning None") 165 print("PASS: dehtml(none) is returning None")
162
163

Roundup Issue Tracker: http://roundup-tracker.org/