From 490bcb05080d153bb57307a04c3bc23e5874a3e7 Mon Sep 17 00:00:00 2001 From: Laurent Bachelier Date: Tue, 1 Oct 2019 17:27:41 +0200 Subject: [PATCH] browser: Normalize encodings Sometimes it is lowercase, uppercase, or even bytes instead of unicode strings! This removes a warning under Python 3. --- weboob/browser/pages.py | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/weboob/browser/pages.py b/weboob/browser/pages.py index 27590e46e2..91a7245010 100644 --- a/weboob/browser/pages.py +++ b/weboob/browser/pages.py @@ -151,7 +151,7 @@ def __init__(self, browser, response, params=None, encoding=None): self.params = params # Setup encoding and build document - self.forced_encoding = encoding or self.ENCODING + self.forced_encoding = self.normalize_encoding(encoding or self.ENCODING) if self.forced_encoding: self.response.encoding = self.forced_encoding self.doc = self.build_doc(self.data) @@ -169,7 +169,7 @@ def __init__(self, browser, response, params=None, encoding=None): @property def encoding(self): - return self.response.encoding + return self.normalize_encoding(self.response.encoding) @encoding.setter def encoding(self, value): @@ -223,6 +223,14 @@ def detect_encoding(self): """ return None + def normalize_encoding(self, encoding): + """ + Make sure we can easily compare encodings by formatting them the same way. + """ + if isinstance(encoding, bytes): + encoding = encoding.decode('utf-8') + return encoding.lower() if encoding else encoding + def absurl(self, url): """ Get an absolute URL from an a partial URL, relative to the Page URL @@ -518,7 +526,7 @@ def detect_encoding(self): import re m = re.search(b'<\?xml version="1.0" encoding="(.*)"\?>', self.data) if m: - return m.group(1) + return self.normalize_encoding(m.group(1)) def build_doc(self, content): import lxml.etree as etree @@ -664,10 +672,10 @@ def build_doc(self, content): Method to build the lxml document from response and given encoding. """ encoding = self.encoding - if encoding == 'latin-1': - encoding = 'latin1' + if encoding == u'latin-1': + encoding = u'latin1' if encoding: - encoding = encoding.replace('ISO8859_', 'ISO8859-') + encoding = encoding.replace(u'iso8859_', u'iso8859-') import lxml.html as html parser = html.HTMLParser(encoding=encoding) return html.parse(BytesIO(content), parser) @@ -681,18 +689,18 @@ def detect_encoding(self): # meta http-equiv=content-type content=... _, params = parse_header(content) if 'charset' in params: - encoding = params['charset'].strip("'\"") + encoding = self.normalize_encoding(params['charset'].strip("'\"")) for charset in self.doc.xpath('//head/meta[@charset]/@charset'): # meta charset=... - encoding = charset.lower() + encoding = self.normalize_encoding(charset) - if encoding == 'iso-8859-1' or not encoding: - encoding = 'windows-1252' + if encoding == u'iso-8859-1' or not encoding: + encoding = u'windows-1252' try: codecs.lookup(encoding) except LookupError: - encoding = 'windows-1252' + encoding = u'windows-1252' return encoding -- GitLab