Commit 4a901eb3 authored by Vincent Ardisson's avatar Vincent Ardisson Committed by Vincent A

weboob.browser.pages.HTMLPage: add ABSOLUTE_LINKS option to transform urls

When enabled, the document is pre-processed so that all links of the page are
made absolute using the <base> href (if present) or the page URL.

AbsoluteLink becomes pointless when this option is enabled. It's not enabled
by default as it would break some existing XPaths like:

    starts-with(@href, "/foo")
parent 5d8d514b
......@@ -580,6 +580,11 @@ class HTMLPage(Page):
Default xpath, which is also the most common; override it if needed
"""
ABSOLUTE_LINKS = False
"""
Make link URLs absolute, using the <base> href (if present) or the page URL.
"""
def __init__(self, *args, **kwargs):
import lxml.html as html
ns = html.etree.FunctionNamespace(None)
......@@ -687,7 +692,12 @@ class HTMLPage(Page):
encoding = encoding.replace(u'iso8859_', u'iso8859-')
import lxml.html as html
parser = html.HTMLParser(encoding=encoding)
return html.parse(BytesIO(content), parser)
doc = html.parse(BytesIO(content), parser, base_url=self.url)
if self.ABSOLUTE_LINKS:
doc.getroot().make_links_absolute(handle_failures='ignore')
return doc
def detect_encoding(self):
"""
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment