From 90114c4813bbd96419723c9f13a449c177c58f32 Mon Sep 17 00:00:00 2001 From: Damien Date: Sun, 18 Apr 2021 18:12:39 +0200 Subject: [PATCH] browser.pages: Implement distinct-values xpath function On some websites, html content can be duplicated. This can be overcome but it can lead to complex/fragile xpaths. distinct-values allows solving these cases by unifying duplicated values. --- setup.cfg | 3 +- woob/browser/pages.py | 17 +++++---- woob/browser/tests/xpath_functions.py | 52 +++++++++++++++++++++++++++ 3 files changed, 64 insertions(+), 8 deletions(-) create mode 100644 woob/browser/tests/xpath_functions.py diff --git a/setup.cfg b/setup.cfg index 4179fe0bd0..5c92ae5aee 100644 --- a/setup.cfg +++ b/setup.cfg @@ -18,7 +18,8 @@ tests = woob.tools.capabilities.bank.iban, woob.browser.filters.standard, woob.browser.tests.form, woob.browser.tests.filters, - woob.browser.tests.url + woob.browser.tests.url, + woob.browser.tests.xpath_functions [isort] known_first_party = woob, weboob diff --git a/woob/browser/pages.py b/woob/browser/pages.py index fbb2075d99..7c7bfcf6f3 100644 --- a/woob/browser/pages.py +++ b/woob/browser/pages.py @@ -19,19 +19,18 @@ from __future__ import absolute_import -from collections import OrderedDict -from functools import wraps -import warnings -from io import BytesIO, StringIO import codecs -from cgi import parse_header -from functools import reduce import re import sys +import warnings +from cgi import parse_header +from collections import OrderedDict +from functools import reduce, wraps +from io import BytesIO, StringIO import requests -from woob.exceptions import ParseError, ModuleInstallError +from woob.exceptions import ModuleInstallError, ParseError from woob.tools.compat import basestring, unicode, urljoin from woob.tools.json import json, mini_jsonpath from woob.tools.log import getLogger @@ -675,11 +674,15 @@ def first_non_empty(context, *nodes_list): return nodes return [] + def distinct_values(context, text): + return
list(set(text)) + ns['has-class'] = has_class ns['starts-with'] = starts_with ns['ends-with'] = ends_with ns['matches'] = matches ns['first-non-empty'] = first_non_empty + ns['distinct-values'] = distinct_values def build_doc(self, content): """ diff --git a/woob/browser/tests/xpath_functions.py b/woob/browser/tests/xpath_functions.py new file mode 100644 index 0000000000..e989bf9836 --- /dev/null +++ b/woob/browser/tests/xpath_functions.py @@ -0,0 +1,52 @@ +from lxml.html import fromstring + +from woob.tools.test import TestCase + + +class DistinctValuesTest(TestCase): + def setUp(self): + self.identity = fromstring(''' + +
+ Isaac + Asimov + 02/01/1920 + Writer + M + 651 Essex Street + Brooklyn +
+
+ Isaac + Asimov + 02/01/1920 + Writer + M + 651 Essex Street + Brooklyn +
+
+ Foundation + Foundation + Foundation and Empire + Foundation and Empire + Second Foundation + Foundation's Edge +
+ + ''') + + def test_that_values_are_successfully_distinct(self): + self.assertEqual( + self.identity.xpath('distinct-values(//div[@id="identity"]//span[@id="lastname"]/text())'), 'Asimov' + ) + self.assertEqual(self.identity.xpath('distinct-values(//span[@id="firstname"]/text())'), 'Isaac') + self.assertEqual(self.identity.xpath('distinct-values(//a[@class="book-1"]/text())'), 'Foundation') + + def test_that_distinct_inexistent_values_return_empty_value(self): + self.assertEqual(self.identity.xpath('distinct-values(//a[@class="book-4"]/text())'), []) + + def test_that_different_values_are_successfully_returns_as_is(self): + self.assertEqual( + self.identity.xpath('distinct-values(//a[@class="book-3"]/text())'), ["Foundation's Edge", 'Second Foundation'] + ) -- GitLab