The new woob repository is here: https://gitlab.com/woob/woob. This GitLab instance will be removed soon.

Commit e98a0cdd authored by Vincent A's avatar Vincent A

browser.filters: CleanText(newlines=False) shouldn't strip newlines

When calling filter() directly, newlines=False worked, but not when
processing lxml tags:
- CleanText.clean() did not receive the newlines setting when it
should have
- The spaces were stripped too early in CleanText.clean(); there is no need to
strip at that point, since they are stripped a few lines later
parent 924c326e
......@@ -267,7 +267,10 @@ def filter(self, txt):
elif isinstance(txt, int):
txt = str(txt)
elif isinstance(txt, (tuple, list)):
txt = u' '.join([self.clean(item, children=self.children) for item in txt])
txt = u' '.join(
self.clean(item, newlines=self.newlines, children=self.children)
for item in txt
)
txt = self.clean(txt, self.children, self.newlines, self.normalize, self.transliterate)
txt = self.remove(txt, self.symbols)
......@@ -279,15 +282,17 @@ def filter(self, txt):
def clean(cls, txt, children=True, newlines=True, normalize='NFC', transliterate=False):
if not isinstance(txt, basestring):
if children:
txt = [t.strip() for t in txt.itertext()]
txt = list(txt.itertext())
else:
txt = [t.strip() for t in txt.xpath('./text()')]
txt = u' '.join(txt) # 'foo bar'
txt = list(txt.xpath('./text()'))
txt = u' '.join(txt) # 'foo bar '
if newlines:
txt = re.compile(u'\s+', flags=re.UNICODE).sub(u' ', txt) # 'foo bar'
txt = re.compile(u'\s+', flags=re.UNICODE).sub(u' ', txt) # 'foo bar '
else:
# normalize newlines and clean what is inside
txt = '\n'.join([cls.clean(l) for l in txt.splitlines()])
txt = txt.strip()
# lxml under Python 2 returns str instead of unicode if it is pure ASCII
txt = unicode(txt)
......
......@@ -24,7 +24,7 @@
from lxml.html import fromstring
from woob.browser.filters.html import FormValue, Link
from woob.browser.filters.standard import RawText, DateTime
from woob.browser.filters.standard import RawText, DateTime, CleanText
class RawTextTest(TestCase):
......@@ -56,6 +56,25 @@ def test_first_node_is_element_recursive(self):
self.assertEqual("229,90 EUR", RawText('//p', default="foo", children=True)(e))
class CleanTextNewlinesTest(TestCase):
    """Check CleanText output for each combination of ``newlines`` and ``children``.

    The fixture is a <div> holding direct text ("foo", "baz") around a
    child <span> ("bar"), with each piece on its own line.
    """

    def setUp(self):
        """Parse the HTML fixture shared by the assertions in this class."""
        # NOTE(review): the lines of the literal are reproduced exactly as they
        # appear in this view; any indentation inside the literal is not visible
        # here -- confirm against the repository before applying.
        self.e = fromstring('''
<body>
<div>
foo
<span>bar</span>
baz
</div>
</body>
''')

    def test_value(self):
        """Assert the cleaned text of the <div> for the four option combinations."""
        # Default options: the three pieces are expected on one line,
        # separated by single spaces.
        self.assertEqual("foo bar baz", CleanText("//div")(self.e))
        # children=False: the <span> content ("bar") is expected to be absent.
        self.assertEqual("foo baz", CleanText("//div", children=False)(self.e))
        # newlines=False: the pieces are expected to stay separated by
        # newlines instead of being collapsed into spaces.
        self.assertEqual("foo\nbar\nbaz", CleanText("//div", newlines=False)(self.e))
        # Both options: "bar" is expected to be absent while an empty line
        # remains between "foo" and "baz".
        self.assertEqual("foo\n\nbaz", CleanText("//div", newlines=False, children=False)(self.e))
class FormValueTest(TestCase):
def setUp(self):
self.e = fromstring('''
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment