Mercurial > p > roundup > code
comparison roundup/dehtml.py @ 5305:e20f472fde7d
issue2550799: provide basic support for handling html only emails
Initial implementation and testing with the dehtml html converter
done.
The use of beautifulsoup 4 is not tested. My test system breaks when
running dehtml.py using beautiful soup. I don't get the failures when
running under the test harness, but the text output is significantly
different (different line breaks, number of newlines etc.)
The tests for dehtml need to be generated for beautiful soup and the
expected output changed. Since I have a wonky install of beautiful
soup, I don't trust my output as the standard to test against. Also
since beautiful soup is optional, the test harness needs to skip the
beautifulsoup tests if import bs4 fails. Again something outside of my
expertise. I deleted the work I had done to implement that. I could
not get it working, and I wanted to get this feature in, in some form.
| author | John Rouillard <rouilj@ieee.org> |
|---|---|
| date | Fri, 13 Oct 2017 21:46:59 -0400 |
| parents | |
| children | 64b05e24dbd8 |
comparison
equal
deleted
inserted
replaced
| 5304:ae32f082e623 | 5305:e20f472fde7d |
|---|---|
| 1 | |
class dehtml:
    """Pick an HTML-to-plain-text converter by name.

    After construction, ``self.html2text`` is either ``None`` or a
    callable that takes one HTML string and returns its text content
    with the contents of ``<script>`` and ``<style>`` elements removed.

    ``converter`` values:

    * ``"none"`` -- conversion disabled; ``html2text`` is ``None``.
    * ``"beautifulsoup"`` -- use Beautiful Soup 4 (``bs4``) when it can
      be imported, otherwise fall back to the built-in converter.
    * anything else (e.g. ``"dehtml"``) -- use the built-in converter
      based on the standard library ``HTMLParser``.
    """

    def __init__(self, converter):
        if converter == "none":
            self.html2text = None
            return

        try:
            if converter == "beautifulsoup":
                # Not as well tested as dehtml.
                from bs4 import BeautifulSoup

                def html2text(html):
                    # Name the parser explicitly. Without it bs4 picks
                    # whichever parser happens to be installed (and
                    # warns), so the text produced differs from one
                    # system to the next.
                    soup = BeautifulSoup(html, "html.parser")

                    # kill all script and style elements
                    for script in soup(["script", "style"]):
                        script.extract()

                    text = soup.get_text('\n', strip=True)
                    if str is bytes:
                        # Python 2: hand back a utf-8 byte string.
                        return text.encode('utf-8')
                    return text

                self.html2text = html2text
            else:
                # Any other converter name: jump to the fallback below.
                raise ImportError
        except ImportError:
            # Built-in fallback; also used when beautiful soup is
            # requested but not installed.
            try:
                # Python 3 module names.
                from html.parser import HTMLParser
                from html.entities import name2codepoint
            except ImportError:
                # Python 2 module names.
                from HTMLParser import HTMLParser
                from htmlentitydefs import name2codepoint

            try:
                to_char = unichr  # Python 2
            except NameError:
                to_char = chr     # Python 3

            class DumbHTMLParser(HTMLParser):
                # accumulated plain text output
                text = ""

                # internal state variables
                _skip_data = False   # True while inside script/style
                _last_empty = False  # True if last data seen was blank

                def _append_char(self, char):
                    # Add a decoded character reference to the output.
                    try:
                        self.text = self.text + char
                    except UnicodeError:
                        # Python 2 can fail when mixing byte and unicode
                        # strings; use a space as a placeholder.
                        self.text = self.text + " "

                def handle_data(self, data):
                    if self._skip_data:
                        # skip data if in script or style block
                        return

                    if data.strip() == "":
                        # reduce multiple blank lines to 1
                        if self._last_empty:
                            return
                        self._last_empty = True
                    else:
                        self._last_empty = False

                    self.text = self.text + data

                def handle_starttag(self, tag, attrs):
                    if tag == "p":
                        self.text = self.text + "\n"
                    if tag in ("style", "script"):
                        self._skip_data = True

                def handle_endtag(self, tag):
                    if tag in ("style", "script"):
                        self._skip_data = False

                def handle_entityref(self, name):
                    # Named reference such as &amp;.
                    if self._skip_data:
                        return
                    codepoint = name2codepoint.get(name)
                    if codepoint is None:
                        # Unknown entity: keep it verbatim rather than
                        # failing with KeyError.
                        self._append_char("&%s;" % name)
                    else:
                        self._append_char(to_char(codepoint))

                def handle_charref(self, name):
                    # Numeric reference such as &#8220; or &#x201c;.
                    if self._skip_data:
                        return
                    try:
                        if name[:1] in ("x", "X"):
                            codepoint = int(name[1:], 16)
                        else:
                            codepoint = int(name)
                        char = to_char(codepoint)
                    except (ValueError, OverflowError):
                        # Malformed or out of range: keep it verbatim.
                        char = "&#%s;" % name
                    self._append_char(char)

            def html2text(html):
                parser = DumbHTMLParser()
                parser.feed(html)
                parser.close()
                return parser.text

            self.html2text = html2text
| 79 | |
if __name__ == "__main__":
    # Manual smoke test: run each converter over a sample page and print
    # the result. Every print call takes a single argument so the same
    # source runs under both Python 2 and Python 3.
    #
    # The typographic quotes in the sample are written as numeric
    # character references so this file stays pure ASCII (it carries no
    # source encoding declaration).
    html = '''
<body>
<script>
this must not be in output
</script>
<style>
p {display:block}
</style>
<div class="header"><h1>Roundup</h1>
<div id="searchbox" style="display: none">
<form class="search" action="../search.html" method="get">
<input type="text" name="q" size="18" />
<input type="submit" value="Search" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
<script type="text/javascript">$('#searchbox').show(0);</script>
</div>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="../index.html">Home</a></li>
<li class="toctree-l1"><a class="reference external" href="http://pypi.python.org/pypi/roundup">Download</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="../docs.html">Docs</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="features.html">Roundup Features</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="">Installing Roundup</a></li>
<li class="toctree-l2"><a class="reference internal" href="upgrading.html">Upgrading to newer versions of Roundup</a></li>
<li class="toctree-l2"><a class="reference internal" href="FAQ.html">Roundup FAQ</a></li>
<li class="toctree-l2"><a class="reference internal" href="user_guide.html">User Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="customizing.html">Customising Roundup</a></li>
<li class="toctree-l2"><a class="reference internal" href="admin_guide.html">Administration Guide</a></li>
</ul>
<div class="section" id="prerequisites">
<h2><a class="toc-backref" href="#id5">Prerequisites</a></h2>
<p>Roundup requires Python 2.5 or newer (but not Python 3) with a functioning
anydbm module. Download the latest version from <a class="reference external" href="http://www.python.org/">http://www.python.org/</a>.
It is highly recommended that users install the latest patch version
of python as these contain many fixes to serious bugs.</p>
<p>Some variants of Linux will need an additional &#8220;python dev&#8221; package
installed for Roundup installation to work. Debian and derivatives, are
known to require this.</p>
<p>If you&#8217;re on windows, you will either need to be using the ActiveState python
distribution (at <a class="reference external" href="http://www.activestate.com/Products/ActivePython/">http://www.activestate.com/Products/ActivePython/</a>), or you&#8217;ll
have to install the win32all package separately (get it from
<a class="reference external" href="http://starship.python.net/crew/mhammond/win32/">http://starship.python.net/crew/mhammond/win32/</a>).</p>
</div>
</body>
'''

    html2text = dehtml("dehtml").html2text
    if html2text:
        print(html2text(html))

    try:
        # trap error seen if N_TOKENS not defined when run.
        html2text = dehtml("beautifulsoup").html2text
        if html2text:
            print(html2text(html))
    except NameError as e:
        print("captured error %s" % e)

    html2text = dehtml("none").html2text
    if html2text:
        print("FAIL: Error, dehtml(none) is returning a function")
    else:
        print("PASS: dehtml(none) is returning None")
| 146 | |
| 147 |
