The new woob repository is here: https://gitlab.com/woob/woob. This gitlab will be removed soon.

Commit 90114c48 authored by Damien Ramelet's avatar Damien Ramelet Committed by Vincent A

browser.pages: Implement distinct-values xpath function

On some websites, html content can be duplicated.
This can be overcome, but it can lead to complex/fragile xpaths.
distinct-values solves these cases by unifying duplicated values.
parent afd26e29
Pipeline #3718 failed with stages
in 63 minutes and 26 seconds
......@@ -18,7 +18,8 @@ tests = woob.tools.capabilities.bank.iban,
woob.browser.filters.standard,
woob.browser.tests.form,
woob.browser.tests.filters,
woob.browser.tests.url
woob.browser.tests.url,
woob.browser.tests.xpath_functions
[isort]
known_first_party = woob, weboob
......
......@@ -19,19 +19,18 @@
from __future__ import absolute_import
from collections import OrderedDict
from functools import wraps
import warnings
from io import BytesIO, StringIO
import codecs
from cgi import parse_header
from functools import reduce
import re
import sys
import warnings
from cgi import parse_header
from collections import OrderedDict
from functools import reduce, wraps
from io import BytesIO, StringIO
import requests
from woob.exceptions import ParseError, ModuleInstallError
from woob.exceptions import ModuleInstallError, ParseError
from woob.tools.compat import basestring, unicode, urljoin
from woob.tools.json import json, mini_jsonpath
from woob.tools.log import getLogger
......@@ -675,11 +674,15 @@ def first_non_empty(context, *nodes_list):
return nodes
return []
def distinct_values(context, text):
    """
    XPath extension function: drop duplicated values from a sequence.

    The first occurrence of each value is kept and the original order of
    the sequence is preserved, so the result is deterministic from one run
    to the next (a plain ``set`` gives no ordering guarantee).

    :param context: evaluation context supplied by the XPath engine (unused)
    :param text: values to de-duplicate, normally a list of text nodes
    :return: list of the distinct values in first-seen order; if ``text``
             is not a list or tuple, it is returned unchanged
    """
    # A scalar argument (e.g. a single string) must not be iterated:
    # doing so would split a string into its individual characters.
    if not isinstance(text, (list, tuple)):
        return text

    seen = set()
    unique = []
    for value in text:
        if value not in seen:
            seen.add(value)
            unique.append(value)
    return unique
ns['has-class'] = has_class
ns['starts-with'] = starts_with
ns['ends-with'] = ends_with
ns['matches'] = matches
ns['first-non-empty'] = first_non_empty
ns['distinct-values'] = distinct_values
def build_doc(self, content):
"""
......
from lxml.html import fromstring
from woob.tools.test import TestCase
class DistinctValuesTest(TestCase):
    """Tests for the ``distinct-values`` XPath function on duplicated HTML content."""

    def setUp(self):
        # Fixture document: the "identity" div is present twice with the same
        # spans, and the bibliography contains duplicated links (book-1,
        # book-2) as well as two different links sharing one class (book-3).
        self.identity = fromstring('''
<body>
<div id="identity">
<span id="firstname">Isaac</span>
<span id="lastname">Asimov</span>
<span id="birthday">02/01/1920</span>
<span id="job">Writer</span>
<span id="gender">M</span>
<span id="adress">651 Essex Street</span>
<span id="city">Brooklyn</span>
</div>
<div id="identity">
<span id="firstname">Isaac</span>
<span id="lastname">Asimov</span>
<span id="birthday">02/01/1920</span>
<span id="job">Writer</span>
<span id="gender">M</span>
<span id="adress">651 Essex Street</span>
<span id="city">Brooklyn</span>
</div>
<div id="bibliography">
<a id="Foundation" class="book-1" href="#">Foundation</a>
<a id="Foundation" class="book-1" href="#">Foundation</a>
<a id="Foundation and Empire" class="book-2" href="#">Foundation and Empire</a>
<a id="Foundation and Empire" class="book-2" href="#">Foundation and Empire</a>
<a id="Second Foundation" class="book-3" href="#">Second Foundation</a>
<a id="Foundation’s Edge" class="book-3" href="#">Foundation's Edge</a>
</div>
</body>
''')

    def test_that_values_are_successfully_distinct(self):
        # Each of these selections matches the same text twice.
        self.assertEqual(
            self.identity.xpath('distinct-values(//div[@id="identity"]//span[@id="lastname"]/text())'), 'Asimov'
        )
        self.assertEqual(self.identity.xpath('distinct-values(//span[@id="firstname"]/text())'), 'Isaac')
        self.assertEqual(self.identity.xpath('distinct-values(//a[@class="book-1"]/text())'), 'Foundation')

    def test_that_distinct_inexistent_values_return_empty_value(self):
        # No element carries the class "book-4".
        self.assertEqual(self.identity.xpath('distinct-values(//a[@class="book-4"]/text())'), [])

    def test_that_different_values_are_successfully_returns_as_is(self):
        # Compare in sorted order so the assertion does not depend on the
        # order in which distinct-values hands back its results.
        self.assertEqual(
            sorted(self.identity.xpath('distinct-values(//a[@class="book-3"]/text())')),
            ["Foundation's Edge", 'Second Foundation']
        )
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment