from __future__ import print_function from roundup.anypy.strings import u2s, uchr class dehtml: def __init__(self, converter): if converter == "none": self.html2text = None return try: if converter == "beautifulsoup": # Not as well tested as dehtml. from bs4 import BeautifulSoup def html2text(html): soup = BeautifulSoup(html) # kill all script and style elements for script in soup(["script", "style"]): script.extract() return u2s(soup.get_text('\n', strip=True)) self.html2text = html2text else: raise ImportError # use except ImportError: # use the fallback below if beautiful soup is not installed. try: # Python 3+. from html.parser import HTMLParser from html.entities import name2codepoint except ImportError: # Python 2. from HTMLParser import HTMLParser from htmlentitydefs import name2codepoint class DumbHTMLParser(HTMLParser): # class attribute text="" # internal state variable _skip_data = False _last_empty = False def handle_data(self, data): if self._skip_data: # skip data if in script or style block return if ( data.strip() == ""): # reduce multiple blank lines to 1 if ( self._last_empty ): return else: self._last_empty = True else: self._last_empty = False self.text=self.text + data def handle_starttag(self, tag, attrs): if (tag == "p" ): self.text= self.text + "\n" if (tag in ("style", "script")): self._skip_data = True def handle_endtag(self, tag): if (tag in ("style", "script")): self._skip_data = False def handle_entityref(self, name): if self._skip_data: return c = uchr(name2codepoint[name]) try: self.text= self.text + c except UnicodeEncodeError: # print a space as a placeholder pass def html2text(html): parser = DumbHTMLParser() parser.feed(html) parser.close() return parser.text self.html2text = html2text if "__main__" == __name__: html='''

Roundup