Commit 93dddbca authored by Laurent Bachelier 🐧, committed by Florent Fourcot

html2text/CleanHTML: Allow changing options

Deprecate html2text<3.200 and add it to requirements.
This is a light dependency and it is used by a great number of modules.
parent 8ed802f4
Pipeline #730 failed
......@@ -132,6 +132,7 @@ def install_weboob():
'requests>=2.0.0',
'python-dateutil',
'PyYAML',
'html2text>=3.200',
'six',
]
try:
......
......@@ -88,17 +88,22 @@ class AbsoluteLink(Link):
class CleanHTML(Filter):
def __init__(self, selector=None, options=None, default=_NO_DEFAULT):
super(CleanHTML, self).__init__(selector=selector, default=default)
self.options = options
@debug()
def filter(self, txt):
if isinstance(txt, (tuple, list)):
return u' '.join([self.clean(item) for item in txt])
return self.clean(txt)
return u' '.join([self.clean(item, self.options) for item in txt])
return self.clean(txt, self.options)
@classmethod
def clean(cls, txt):
def clean(cls, txt, options=None):
if not isinstance(txt, basestring):
txt = html.tostring(txt, encoding=unicode)
return html2text(txt)
options = options or {}
return html2text(txt, **options)
class UnrecognizedElement(Exception):
......
......@@ -17,35 +17,23 @@
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import warnings
from weboob.tools.compat import unicode
__all__ = ['html2text']
try:
from html2text import HTML2Text
from html2text import HTML2Text
def html2text(html):
h = HTML2Text()
h.unicode_snob = True
h.skip_internal_links = True
h.inline_links = False
h.links_each_paragraph = True
return unicode(h.handle(html))
except ImportError:
# Older versions of html2text do not have a class, so we have
# to configure the module globally.
try:
import html2text as h2t
h2t.UNICODE_SNOB = 1
h2t.SKIP_INTERNAL_LINKS = True
h2t.INLINE_LINKS = False
h2t.LINKS_EACH_PARAGRAPH = True
html2text = h2t.html2text
except ImportError:
def html2text(html):
warnings.warn('python-html2text is not present. HTML pages are not converted into text.', stacklevel=2)
return html
def html2text(html, **options):
h = HTML2Text()
defaults = dict(
unicode_snob=True,
skip_internal_links=True,
inline_links=False,
links_each_paragraph=True,
)
defaults.update(options)
for k, v in defaults.items():
setattr(h, k, v)
return unicode(h.handle(html))
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment