# NOTE(review): the change set this class was taken from also adds
# 'html2text>=3.200' to the requirement list in setup.py (install_weboob).
class CleanHTML(Filter):
    """
    Filter turning HTML content into text by delegating to ``html2text()``.

    :param selector: forwarded unchanged to the ``Filter`` constructor
    :param options: keyword arguments handed to ``html2text()`` on every
                    conversion performed by this filter (``None`` for none)
    :type options: dict or None
    :param default: forwarded unchanged to the ``Filter`` constructor
    """

    def __init__(self, selector=None, options=None, default=_NO_DEFAULT):
        super(CleanHTML, self).__init__(selector=selector, default=default)
        # Keep a private copy: the caller may reuse or mutate its dict after
        # building the filter, which must not alter this filter's behaviour.
        self.options = dict(options) if options is not None else None

    @debug()
    def filter(self, txt):
        """
        Convert ``txt`` to text using the options given at construction.

        A tuple or list is converted item by item and the results are joined
        with a single space; any other value is converted directly.
        """
        if isinstance(txt, (tuple, list)):
            return u' '.join([self.clean(item, self.options) for item in txt])
        return self.clean(txt, self.options)

    @classmethod
    def clean(cls, txt, options=None):
        """
        Convert one value to text.

        :param txt: a string of markup, or an object accepted by
                    ``html.tostring()`` (presumably an lxml element -- confirm)
        :param options: keyword arguments for ``html2text()``; ``None`` or an
                        empty mapping means no extra arguments
        :type options: dict or None
        :return: whatever ``html2text()`` returns for the markup
        """
        if not isinstance(txt, basestring):
            # Non-string input is serialised to a unicode string first.
            txt = html.tostring(txt, encoding=unicode)
        return html2text(txt, **(options or {}))
from html2text import HTML2Text

from weboob.tools.compat import unicode


__all__ = ['html2text']


def html2text(html, **options):
    """
    Convert HTML markup to text with a configured ``HTML2Text`` instance.

    The converter is set up with these defaults, each of which can be
    overridden through ``options``: ``unicode_snob=True``,
    ``skip_internal_links=True``, ``inline_links=False`` and
    ``links_each_paragraph=True``.

    :param html: markup to convert
    :param options: attribute values to set on the ``HTML2Text`` instance
                    before conversion; every name must be an attribute the
                    instance already has
    :return: the output of ``HTML2Text.handle()`` as unicode
    :rtype: unicode
    :raises TypeError: if an option name is not an ``HTML2Text`` attribute
    """
    converter = HTML2Text()

    # setattr() would happily create a brand new attribute for a misspelled
    # or unsupported name, and the option would then be silently ignored.
    # Fail loudly instead, like any unexpected keyword argument.
    unknown = sorted(name for name in options if not hasattr(converter, name))
    if unknown:
        raise TypeError(
            'html2text() got unexpected option(s): %s' % ', '.join(unknown)
        )

    settings = dict(
        unicode_snob=True,
        skip_internal_links=True,
        inline_links=False,
        links_each_paragraph=True,
    )
    settings.update(options)
    for name, value in settings.items():
        setattr(converter, name, value)
    return unicode(converter.handle(html))