view roundup/dehtml.py @ 8180:d02ce1d14acd

feat: issue2551068 - Provide way to retrieve file/msg data via rest endpoint. Use Allow header to change format of /binary_content endpoint. If Allow header for endpoint is not application/json, it will be matched against the mime type for the file. */*, text/* are supported and will return the native mime type if present. Changes: move */* mime type from static dict of supported types. It was hardcoded to return json only. Now it can return a matching non-json mime type for the /binary_content endpoint. Edited some errors to explicitly add */* mime type. Cleanups to use ', ' separation in lists of valid mime types rather than just space separated. Remove ETag header when sending raw content. See issue 2551375 for background. Doc added to rest.txt. Small format fix up (add dash) in CHANGES.txt. Make passing an unset/None/False accept_mime_type to format_dispatch_output a 500 error. This used to be the fallback to produce a 406 error after all processing had happened. It should no longer be possible to take that code path as all 406 errors (with valid accept_mime_types) are generated before processing takes place. Make format_dispatch_output handle output other than json/xml so it can send back binary_content data. Removed a spurious client.response_code = 400 that seems to not be used. Tests added for all code paths. Database setup for tests msg and file entry. This required a file upload test to change so it doesn't look for file1 as the link returned by the upload. Download the link and verify the data rather than verifying the link. Multiple formatting changes to error messages to make all lists of valid mime types ', ' and not just space separated.
author John Rouillard <rouilj@ieee.org>
date Sun, 08 Dec 2024 17:22:33 -0500
parents b68a1d8fd5d9
children 520075b29474
line wrap: on
line source


from __future__ import print_function

import sys

from roundup.anypy.strings import u2s, uchr

_pyver = sys.version_info[0]


class dehtml:
    """Select an HTML to plain text conversion function.

    After construction, ``self.html2text`` is either ``None`` (when the
    converter is ``"none"``) or a callable that takes an HTML string
    and returns its text content.
    """

    def __init__(self, converter):
        """Set ``self.html2text`` according to ``converter``.

        converter -- ``"none"`` disables conversion (``html2text`` is
            set to ``None``).  ``"beautifulsoup"`` uses the bs4 package
            when it can be imported.  Any other value, or a failed bs4
            import, selects the built-in parser defined below.
        """
        if converter == "none":
            self.html2text = None
            return

        try:
            if converter == "beautifulsoup":
                # Not as well tested as dehtml.
                from bs4 import BeautifulSoup

                def html2text(html):
                    """Return the text of html as extracted by bs4."""
                    soup = BeautifulSoup(html, "html.parser")

                    # kill all script and style elements
                    for script in soup(["script", "style"]):
                        script.extract()

                    return u2s(soup.get_text("\n", strip=True))

                self.html2text = html2text
            else:
                # Reuse the ImportError handler below to install the
                # built-in parser for every other converter name.
                raise ImportError
        except ImportError:
            # use the fallback below if beautiful soup is not installed.
            try:
                # Python 3+.
                from html.entities import name2codepoint
                from html.parser import HTMLParser
            except ImportError:
                # Python 2.
                from htmlentitydefs import name2codepoint
                from HTMLParser import HTMLParser

            class DumbHTMLParser(HTMLParser):
                """Accumulate the text content of a document in text.

                Data inside script and style elements is dropped and
                each <p> start tag adds a newline.
                """

                # class attribute
                text = ""

                # internal state variable
                _skip_data = False
                _last_empty = False

                def handle_data(self, data):
                    if self._skip_data:  # skip data in script or style block
                        return

                    # Keep only the first of a run of consecutive
                    # whitespace-only chunks so that multiple blank
                    # lines are reduced to 1.
                    is_blank = data.strip() == ""
                    if is_blank and self._last_empty:
                        return
                    self._last_empty = is_blank

                    self.text = self.text + data

                def handle_starttag(self, tag, attrs):  # noqa: ARG002
                    if tag == "p":
                        self.text = self.text + "\n"
                    if tag in ("style", "script"):
                        self._skip_data = True

                def handle_endtag(self, tag):
                    if tag in ("style", "script"):
                        self._skip_data = False

                def handle_entityref(self, name):
                    # Only reached when the parser does not convert
                    # character references itself.
                    if self._skip_data:
                        return
                    try:
                        c = uchr(name2codepoint[name])
                    except KeyError:
                        # Unknown entity name: keep the reference text
                        # instead of aborting the whole conversion.
                        c = "&" + name + ";"
                    try:
                        self.text = self.text + c
                    except UnicodeEncodeError:
                        # print a space as a placeholder
                        self.text = self.text + " "

            def html2text(html):
                """Return the text of html using DumbHTMLParser."""
                try:
                    # Parsers that know convert_charrefs decode
                    # character references before handle_data() is
                    # called.
                    parser = DumbHTMLParser(convert_charrefs=True)
                except TypeError:
                    # Older parsers (e.g. Python 2) do not accept the
                    # keyword; references go to handle_entityref().
                    parser = DumbHTMLParser()
                parser.feed(html)
                parser.close()
                return parser.text

            self.html2text = html2text


if __name__ == "__main__":
    # Manual smoke test: run a sample document through each converter
    # and print the result.  The sample contains script and style
    # blocks, character references and paragraphs.
    sample = """
<body>
<script>
this must not be in output
</script>
<style>
p {display:block}
</style>
    <div class="header"><h1>Roundup</h1>
        <div id="searchbox" style="display: none">
          <form class="search" action="../search.html" method="get">
            <input type="text" name="q" size="18" />
            <input type="submit" value="Search" />
            <input type="hidden" name="check_keywords" value="yes" />
            <input type="hidden" name="area" value="default" />
          </form>
        </div>
        <script type="text/javascript">$('#searchbox').show(0);</script>
    </div>
       <ul class="current">
<li class="toctree-l1"><a class="reference internal" href="../index.html">Home</a></li>
<li class="toctree-l1"><a class="reference external" href="http://pypi.python.org/pypi/roundup">Download</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="../docs.html">Docs</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="features.html">Roundup Features</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="">Installing Roundup</a></li>
<li class="toctree-l2"><a class="reference internal" href="upgrading.html">Upgrading to newer versions of Roundup</a></li>
<li class="toctree-l2"><a class="reference internal" href="FAQ.html">Roundup FAQ</a></li>
<li class="toctree-l2"><a class="reference internal" href="user_guide.html">User Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="customizing.html">Customising Roundup</a></li>
<li class="toctree-l2"><a class="reference internal" href="admin_guide.html">Administration Guide</a></li>
</ul>
<div class="section" id="prerequisites">
<h2><a class="toc-backref" href="#id5">Prerequisites</a></h2>
<p>Roundup requires Python 2.5 or newer (but not Python 3) with a functioning
anydbm module. Download the latest version from <a class="reference external" href="http://www.python.org/">http://www.python.org/</a>.
It is highly recommended that users install the latest patch version
of python as these contain many fixes to serious bugs.</p>
<p>Some variants of Linux will need an additional &#8220;python dev&#8221; package
installed for Roundup installation to work. Debian and derivatives, are
known to require this.</p>
<p>If you&#8217;re on windows, you will either need to be using the ActiveState python
distribution (at <a class="reference external" href="http://www.activestate.com/Products/ActivePython/">http://www.activestate.com/Products/ActivePython/</a>), or you&#8217;ll
have to install the win32all package separately (get it from
<a class="reference external" href="http://starship.python.net/crew/mhammond/win32/">http://starship.python.net/crew/mhammond/win32/</a>).</p>
<script>
  &lt; HELP &GT;
</script>
</div>
</body>
"""

    # Built-in parser.
    builtin_convert = dehtml("dehtml").html2text
    if builtin_convert:
        print(builtin_convert(sample))

    # Beautiful Soup based converter.
    try:
        # trap error seen if N_TOKENS not defined when run.
        soup_convert = dehtml("beautifulsoup").html2text
        if soup_convert:
            print(soup_convert(sample))
    except NameError as err:
        print("captured error %s" % err)

    # The "none" converter must not provide a conversion function.
    disabled = dehtml("none").html2text
    if not disabled:
        print("PASS: dehtml(none) is returning None")
    else:
        print("FAIL: Error, dehtml(none) is returning a function")

Roundup Issue Tracker: http://roundup-tracker.org/