diff --git a/woob/browser/filters/standard.py b/woob/browser/filters/standard.py index 4b9b9326cfb91588c04ecb5f02367574fd788c1d..430eeb9d425a7ea748d24d5384c7b77bea25cf6b 100644 --- a/woob/browser/filters/standard.py +++ b/woob/browser/filters/standard.py @@ -267,7 +267,10 @@ def filter(self, txt): elif isinstance(txt, int): txt = str(txt) elif isinstance(txt, (tuple, list)): - txt = u' '.join([self.clean(item, children=self.children) for item in txt]) + txt = u' '.join( + self.clean(item, newlines=self.newlines, children=self.children) + for item in txt + ) txt = self.clean(txt, self.children, self.newlines, self.normalize, self.transliterate) txt = self.remove(txt, self.symbols) @@ -279,15 +282,17 @@ def filter(self, txt): def clean(cls, txt, children=True, newlines=True, normalize='NFC', transliterate=False): if not isinstance(txt, basestring): if children: - txt = [t.strip() for t in txt.itertext()] + txt = list(txt.itertext()) else: - txt = [t.strip() for t in txt.xpath('./text()')] - txt = u' '.join(txt) # 'foo bar' + txt = list(txt.xpath('./text()')) + txt = u' '.join(txt) # 'foo bar ' + if newlines: - txt = re.compile(u'\s+', flags=re.UNICODE).sub(u' ', txt) # 'foo bar' + txt = re.compile(u'\s+', flags=re.UNICODE).sub(u' ', txt) # 'foo bar ' else: # normalize newlines and clean what is inside txt = '\n'.join([cls.clean(l) for l in txt.splitlines()]) + txt = txt.strip() # lxml under Python 2 returns str instead of unicode if it is pure ASCII txt = unicode(txt) diff --git a/woob/browser/tests/filters.py b/woob/browser/tests/filters.py index 01e1bda566a9267d8aa7c2344a0bd4fc43c77bf1..5f8fe29c38276ce112668f19a61f948af6fe0cea 100644 --- a/woob/browser/tests/filters.py +++ b/woob/browser/tests/filters.py @@ -24,7 +24,7 @@ from lxml.html import fromstring from woob.browser.filters.html import FormValue, Link -from woob.browser.filters.standard import RawText, DateTime +from woob.browser.filters.standard import RawText, DateTime, CleanText class RawTextTest(TestCase): @@ -56,6 +56,25 @@ def test_first_node_is_element_recursive(self): self.assertEqual("229,90 EUR", RawText('//p', default="foo", children=True)(e)) +class CleanTextNewlinesTest(TestCase): + def setUp(self): + self.e = fromstring(''' + +
+ foo + bar + baz +
+ + ''') + + def test_value(self): + self.assertEqual("foo bar baz", CleanText("//div")(self.e)) + self.assertEqual("foo baz", CleanText("//div", children=False)(self.e)) + self.assertEqual("foo\nbar\nbaz", CleanText("//div", newlines=False)(self.e)) + self.assertEqual("foo\n\nbaz", CleanText("//div", newlines=False, children=False)(self.e)) + + class FormValueTest(TestCase): def setUp(self): self.e = fromstring('''