comparison roundup/dehtml.py @ 5305:e20f472fde7d

issue2550799: provide basic support for handling HTML-only emails. Initial implementation and testing with the dehtml HTML converter are done. The use of Beautiful Soup 4 is not tested. My test system breaks when running dehtml.py using Beautiful Soup. I don't get the failures when running under the test harness, but the text output is significantly different (different line breaks, number of newlines, etc.). The tests for dehtml need to be generated for Beautiful Soup and the expected output changed. Since I have a wonky install of Beautiful Soup, I don't trust my output as the standard to test against. Also, since Beautiful Soup is optional, the test harness needs to skip the Beautiful Soup tests if "import bs4" fails. Again, that is something outside of my expertise. I deleted the work I had done to implement that: I could not get it working and wanted to get this feature committed in some form.
author John Rouillard <rouilj@ieee.org>
date Fri, 13 Oct 2017 21:46:59 -0400
parents
children 64b05e24dbd8
comparison
equal deleted inserted replaced
5304:ae32f082e623 5305:e20f472fde7d
1
class dehtml:
    """Select an HTML-to-plain-text conversion function.

    After construction ``self.html2text`` is either ``None`` (when
    *converter* is ``"none"``) or a callable that takes an HTML string
    and returns the text extracted from it.

    *converter* values:

    ``"none"``
        No conversion; ``html2text`` is ``None``.
    ``"beautifulsoup"``
        Use Beautiful Soup 4 when ``bs4`` can be imported; otherwise
        fall back to the built-in parser described below.
    anything else (e.g. ``"dehtml"``)
        Use the built-in parser based on the standard library's
        ``HTMLParser``.
    """

    def __init__(self, converter):
        if converter == "none":
            self.html2text = None
            return

        if converter == "beautifulsoup":
            # Not as well tested as the built-in converter.
            try:
                from bs4 import BeautifulSoup
            except ImportError:
                # Beautiful Soup is optional: use the fallback below.
                pass
            else:
                def html2text(html):
                    # Name the parser explicitly so the result does not
                    # depend on which optional parsers are installed
                    # (and to avoid bs4's "no parser specified" warning).
                    soup = BeautifulSoup(html, "html.parser")

                    # kill all script and style elements
                    for script in soup(["script", "style"]):
                        script.extract()

                    # NOTE: returns UTF-8 encoded bytes, unlike the
                    # fallback converter which returns parser text as-is.
                    return soup.get_text('\n', strip=True).encode('utf-8')

                self.html2text = html2text
                return

        # Built-in fallback converter.  Import under both the Python 3
        # and the Python 2 module names.
        try:
            from html.parser import HTMLParser
            from html.entities import name2codepoint
        except ImportError:
            from HTMLParser import HTMLParser
            from htmlentitydefs import name2codepoint

        try:
            uchr = unichr
        except NameError:
            # Python 3: chr() already produces unicode characters.
            uchr = chr

        class DumbHTMLParser(HTMLParser):
            """Collect the text of a document, skipping script/style."""

            def __init__(self):
                HTMLParser.__init__(self)
                # Python 3 decodes character references itself by
                # default and never calls handle_entityref/charref;
                # switch that off so both Python versions take the same
                # code path.  Harmless extra attribute on Python 2.
                self.convert_charrefs = False

                # accumulated plain text
                self.text = ""
                # True while inside a <script> or <style> element
                self._skip_data = False
                # True when the previous data chunk was whitespace only
                self._last_empty = False

            def _append_char(self, char):
                """Append one decoded character to the output text."""
                try:
                    self.text = self.text + char
                except UnicodeError:
                    # On Python 2, adding a unicode character to a byte
                    # string holding non-ASCII bytes raises
                    # UnicodeDecodeError.  Drop the character rather
                    # than abort the whole conversion.
                    pass

            def handle_data(self, data):
                if self._skip_data:  # skip data if in script or style block
                    return

                if data.strip() == "":
                    # reduce consecutive whitespace-only chunks to one
                    if self._last_empty:
                        return
                    self._last_empty = True
                else:
                    self._last_empty = False

                self.text = self.text + data

            def handle_starttag(self, tag, attrs):
                if tag == "p":
                    # start each paragraph on a new line
                    self.text = self.text + "\n"
                if tag in ("style", "script"):
                    self._skip_data = True

            def handle_endtag(self, tag):
                if tag in ("style", "script"):
                    self._skip_data = False

            def handle_entityref(self, name):
                """Handle a named reference such as ``&amp;``."""
                if self._skip_data:
                    return
                codepoint = name2codepoint.get(name)
                if codepoint is None:
                    # Unknown entity: keep the source text instead of
                    # raising KeyError.
                    self.text = self.text + "&%s;" % name
                    return
                self._append_char(uchr(codepoint))

            def handle_charref(self, name):
                """Handle a numeric reference such as ``&#8220;``."""
                if self._skip_data:
                    return
                try:
                    if name[:1] in ("x", "X"):
                        codepoint = int(name[1:], 16)
                    else:
                        codepoint = int(name)
                    char = uchr(codepoint)
                except (ValueError, OverflowError):
                    # malformed or out-of-range reference: ignore it
                    return
                self._append_char(char)

        def html2text(html):
            parser = DumbHTMLParser()
            parser.feed(html)
            parser.close()
            return parser.text

        self.html2text = html2text
79
if "__main__" == __name__:
    # Manual smoke test: run each converter over a sample page and print
    # the result.  print() is called with a single argument throughout so
    # the script prints the same text on Python 2 and also parses on
    # Python 3.
    html = '''
<body>
<script>
this must not be in output
</script>
<style>
p {display:block}
</style>
<div class="header"><h1>Roundup</h1>
<div id="searchbox" style="display: none">
<form class="search" action="../search.html" method="get">
<input type="text" name="q" size="18" />
<input type="submit" value="Search" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
<script type="text/javascript">$('#searchbox').show(0);</script>
</div>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="../index.html">Home</a></li>
<li class="toctree-l1"><a class="reference external" href="http://pypi.python.org/pypi/roundup">Download</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="../docs.html">Docs</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="features.html">Roundup Features</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="">Installing Roundup</a></li>
<li class="toctree-l2"><a class="reference internal" href="upgrading.html">Upgrading to newer versions of Roundup</a></li>
<li class="toctree-l2"><a class="reference internal" href="FAQ.html">Roundup FAQ</a></li>
<li class="toctree-l2"><a class="reference internal" href="user_guide.html">User Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="customizing.html">Customising Roundup</a></li>
<li class="toctree-l2"><a class="reference internal" href="admin_guide.html">Administration Guide</a></li>
</ul>
<div class="section" id="prerequisites">
<h2><a class="toc-backref" href="#id5">Prerequisites</a></h2>
<p>Roundup requires Python 2.5 or newer (but not Python 3) with a functioning
anydbm module. Download the latest version from <a class="reference external" href="http://www.python.org/">http://www.python.org/</a>.
It is highly recommended that users install the latest patch version
of python as these contain many fixes to serious bugs.</p>
<p>Some variants of Linux will need an additional &#8220;python dev&#8221; package
installed for Roundup installation to work. Debian and derivatives, are
known to require this.</p>
<p>If you&#8217;re on windows, you will either need to be using the ActiveState python
distribution (at <a class="reference external" href="http://www.activestate.com/Products/ActivePython/">http://www.activestate.com/Products/ActivePython/</a>), or you&#8217;ll
have to install the win32all package separately (get it from
<a class="reference external" href="http://starship.python.net/crew/mhammond/win32/">http://starship.python.net/crew/mhammond/win32/</a>).</p>
</div>
</body>
'''

    # Built-in converter.
    html2text = dehtml("dehtml").html2text
    if html2text:
        print(html2text(html))

    try:
        # trap error seen if N_TOKENS not defined when run.
        html2text = dehtml("beautifulsoup").html2text
        if html2text:
            print(html2text(html))
    except NameError as e:
        print("captured error %s" % e)

    # "none" must yield no converter at all.
    html2text = dehtml("none").html2text
    if html2text:
        print("FAIL: Error, dehtml(none) is returning a function")
    else:
        print("PASS: dehtml(none) is returning None")
146
147

Roundup Issue Tracker: http://roundup-tracker.org/