Mercurial > p > roundup > code
comparison roundup/dehtml.py @ 5997:1700542408f3
flake8 cleanups for dehtml.py
Note: you need to disable flake8's long-line check when linting this file, as there is a test example that
requires really long lines of htmlized output.
| author | John Rouillard <rouilj@ieee.org> |
|---|---|
| date | Wed, 25 Dec 2019 20:18:39 -0500 |
| parents | b74f0b50bef1 |
| children | af81e7a4302f |
comparison
equal
deleted
inserted
replaced
| 5996:69a35d164a69 | 5997:1700542408f3 |
|---|---|
| 1 | 1 |
| 2 from __future__ import print_function | 2 from __future__ import print_function |
| 3 from roundup.anypy.strings import u2s, uchr | 3 from roundup.anypy.strings import u2s, uchr |
| 4 | |
| 5 | |
| 4 class dehtml: | 6 class dehtml: |
| 5 def __init__(self, converter): | 7 def __init__(self, converter): |
| 6 if converter == "none": | 8 if converter == "none": |
| 7 self.html2text = None | 9 self.html2text = None |
| 8 return | 10 return |
| 9 | 11 |
| 10 try: | 12 try: |
| 11 if converter == "beautifulsoup": | 13 if converter == "beautifulsoup": |
| 12 # Not as well tested as dehtml. | 14 # Not as well tested as dehtml. |
| 13 from bs4 import BeautifulSoup | 15 from bs4 import BeautifulSoup |
| 16 | |
| 14 def html2text(html): | 17 def html2text(html): |
| 15 soup = BeautifulSoup(html) | 18 soup = BeautifulSoup(html) |
| 16 | 19 |
| 17 # kill all script and style elements | 20 # kill all script and style elements |
| 18 for script in soup(["script", "style"]): | 21 for script in soup(["script", "style"]): |
| 20 | 23 |
| 21 return u2s(soup.get_text('\n', strip=True)) | 24 return u2s(soup.get_text('\n', strip=True)) |
| 22 | 25 |
| 23 self.html2text = html2text | 26 self.html2text = html2text |
| 24 else: | 27 else: |
| 25 raise ImportError # use | 28 raise ImportError |
| 26 except ImportError: | 29 except ImportError: |
| 27 # use the fallback below if beautiful soup is not installed. | 30 # use the fallback below if beautiful soup is not installed. |
| 28 try: | 31 try: |
| 29 # Python 3+. | 32 # Python 3+. |
| 30 from html.parser import HTMLParser | 33 from html.parser import HTMLParser |
| 31 from html.entities import name2codepoint | 34 from html.entities import name2codepoint |
| 32 pyver=3 | 35 pyver = 3 |
| 33 except ImportError: | 36 except ImportError: |
| 34 # Python 2. | 37 # Python 2. |
| 35 from HTMLParser import HTMLParser | 38 from HTMLParser import HTMLParser |
| 36 from htmlentitydefs import name2codepoint | 39 from htmlentitydefs import name2codepoint |
| 37 pyver=2 | 40 pyver = 2 |
| 38 | 41 |
| 39 class DumbHTMLParser(HTMLParser): | 42 class DumbHTMLParser(HTMLParser): |
| 40 # class attribute | 43 # class attribute |
| 41 text="" | 44 text = "" |
| 42 | 45 |
| 43 # internal state variable | 46 # internal state variable |
| 44 _skip_data = False | 47 _skip_data = False |
| 45 _last_empty = False | 48 _last_empty = False |
| 46 | 49 |
| 47 def handle_data(self, data): | 50 def handle_data(self, data): |
| 48 if self._skip_data: # skip data if in script or style block | 51 if self._skip_data: # skip data in script or style block |
| 49 return | 52 return |
| 50 | 53 |
| 51 if ( data.strip() == ""): | 54 if (data.strip() == ""): |
| 52 # reduce multiple blank lines to 1 | 55 # reduce multiple blank lines to 1 |
| 53 if ( self._last_empty ): | 56 if (self._last_empty): |
| 54 return | 57 return |
| 55 else: | 58 else: |
| 56 self._last_empty = True | 59 self._last_empty = True |
| 57 else: | 60 else: |
| 58 self._last_empty = False | 61 self._last_empty = False |
| 59 | 62 |
| 60 self.text=self.text + data | 63 self.text = self.text + data |
| 61 | 64 |
| 62 def handle_starttag(self, tag, attrs): | 65 def handle_starttag(self, tag, attrs): |
| 63 if (tag == "p" ): | 66 if (tag == "p"): |
| 64 self.text= self.text + "\n" | 67 self.text = self.text + "\n" |
| 65 if (tag in ("style", "script")): | 68 if (tag in ("style", "script")): |
| 66 self._skip_data = True | 69 self._skip_data = True |
| 67 | 70 |
| 68 def handle_endtag(self, tag): | 71 def handle_endtag(self, tag): |
| 69 if (tag in ("style", "script")): | 72 if (tag in ("style", "script")): |
| 70 self._skip_data = False | 73 self._skip_data = False |
| 71 | 74 |
| 72 def handle_entityref(self, name): | 75 def handle_entityref(self, name): |
| 73 if self._skip_data: | 76 if self._skip_data: |
| 74 return | 77 return |
| 75 c = uchr(name2codepoint[name]) | 78 c = uchr(name2codepoint[name]) |
| 76 try: | 79 try: |
| 77 self.text= self.text + c | 80 self.text = self.text + c |
| 78 except UnicodeEncodeError: | 81 except UnicodeEncodeError: |
| 79 # print a space as a placeholder | 82 # print a space as a placeholder |
| 80 self.text= self.text + ' ' | 83 self.text = self.text + ' ' |
| 81 | 84 |
| 82 def html2text(html): | 85 def html2text(html): |
| 83 if pyver == 3: | 86 if pyver == 3: |
| 84 parser = DumbHTMLParser(convert_charrefs=True) | 87 parser = DumbHTMLParser(convert_charrefs=True) |
| 85 else: | 88 else: |
| 88 parser.close() | 91 parser.close() |
| 89 return parser.text | 92 return parser.text |
| 90 | 93 |
| 91 self.html2text = html2text | 94 self.html2text = html2text |
| 92 | 95 |
| 96 | |
| 93 if "__main__" == __name__: | 97 if "__main__" == __name__: |
| 94 html=''' | 98 html = ''' |
| 95 <body> | 99 <body> |
| 96 <script> | 100 <script> |
| 97 this must not be in output | 101 this must not be in output |
| 98 </script> | 102 </script> |
| 99 <style> | 103 <style> |
| 150 # trap error seen if N_TOKENS not defined when run. | 154 # trap error seen if N_TOKENS not defined when run. |
| 151 html2text = dehtml("beautifulsoup").html2text | 155 html2text = dehtml("beautifulsoup").html2text |
| 152 if html2text: | 156 if html2text: |
| 153 print(html2text(html)) | 157 print(html2text(html)) |
| 154 except NameError as e: | 158 except NameError as e: |
| 155 print("captured error %s"%e) | 159 print("captured error %s" % e) |
| 156 | 160 |
| 157 html2text = dehtml("none").html2text | 161 html2text = dehtml("none").html2text |
| 158 if html2text: | 162 if html2text: |
| 159 print("FAIL: Error, dehtml(none) is returning a function") | 163 print("FAIL: Error, dehtml(none) is returning a function") |
| 160 else: | 164 else: |
| 161 print("PASS: dehtml(none) is returning None") | 165 print("PASS: dehtml(none) is returning None") |
| 162 | |
| 163 |
