Move TableCell and ColumnNotFound from filters.standard to filters.html.

What this patch does:
  * html.py stops importing TableCell/ColumnNotFound from .standard and
    defines both itself; it now needs _Filter from .base.
  * standard.py drops both definitions and their __all__ entries.

Review notes:
  * The TableCell doctest uses Date, which html.py does not import (only
    CleanText comes from .standard), so the example now imports what it
    needs explicitly instead of relying on module globals.
  * The docstring sentence about colspan had lost its example tag; it is
    restored as <td colspan="2">.
  * Code that still does "from weboob.browser.filters.standard import
    TableCell" (or ColumnNotFound) will presumably break after this patch;
    confirm all callers were migrated before merging.

diff --git a/weboob/browser/filters/html.py b/weboob/browser/filters/html.py
index 71e08adacc20280988fbf07a92d5027037a2829b..a4faf4a3cce4cdc0103d7b135968d11c763f7f18 100644
--- a/weboob/browser/filters/html.py
+++ b/weboob/browser/filters/html.py
@@ -25,11 +25,11 @@
 from weboob.tools.compat import basestring, unicode, urljoin
 from weboob.tools.html import html2text
 
-from .base import _NO_DEFAULT, Filter, FilterError, _Selector, debug, ItemNotFound
-from .standard import (
-    TableCell, ColumnNotFound,  # TODO move class here when modules are migrated
-    CleanText,
+from .base import (
+    _NO_DEFAULT, Filter, FilterError, _Selector, debug, ItemNotFound,
+    _Filter,
 )
+from .standard import CleanText
 
 
 __all__ = ['CSS', 'XPath', 'XPathNotFound', 'AttributeNotFound', 'Attr', 'Link', 'AbsoluteLink',
@@ -47,6 +47,10 @@ class AttributeNotFound(ItemNotFound):
     pass
 
 
+class ColumnNotFound(FilterError):
+    pass
+
+
 class CSS(_Selector):
     """Select HTML elements with a CSS selector
 
@@ -243,3 +247,65 @@ def filter(self, data):
         h = HTMLParser()
         txt = super(ReplaceEntities, self).filter(data)
         return h.unescape(txt)
+
+
+class TableCell(_Filter):
+    """
+    Used with TableElement, gets the cell element from its name.
+
+    For example:
+
+    >>> from weboob.capabilities.bank import Transaction
+    >>> from weboob.browser.elements import TableElement, ItemElement
+    >>> from weboob.browser.filters.standard import CleanText, Date
+    >>> class table(TableElement):
+    ...     head_xpath = '//table/thead/th'
+    ...     item_xpath = '//table/tbody/tr'
+    ...     col_date = u'Date'
+    ...     col_label = [u'Name', u'Label']
+    ...     class item(ItemElement):
+    ...         klass = Transaction
+    ...         obj_date = Date(TableCell('date'))
+    ...         obj_label = CleanText(TableCell('label'))
+    ...
+
+    TableCell handles table tags that have
+    a "colspan" attribute that modify the width of the column:
+    for example <td colspan="2"> will occupy two columns instead of one,
+    creating a column shift for all the next columns that must be taken
+    in consideration when trying to match columns values with column heads.
+    """
+
+    def __init__(self, *names, **kwargs):
+        support_th = kwargs.pop('support_th', False)
+        kwargs.pop('colspan', True)
+        super(TableCell, self).__init__(**kwargs)
+        self.names = names
+
+        if support_th:
+            self.td = '(./th | ./td)[%s]'
+        else:
+            self.td = './td[%s]'
+
+    def __call__(self, item):
+        # New behavior, handling colspans > 1
+        for name in self.names:
+            col_idx = item.parent.get_colnum(name)
+            if col_idx is not None:
+                current_col = 0
+                for td_idx in range(col_idx + 1):
+                    ret = item.xpath(self.td % (td_idx + 1))
+                    if col_idx <= current_col:
+                        for el in ret:
+                            self.highlight_el(el, item)
+                        return ret
+
+                    if not ret:
+                        # There might be no TD at all
+                        # ColumnNotFound seems for case when corresponding header is not found
+                        # Thus for compat return empty
+                        return []
+
+                    current_col += int(ret[0].attrib.get('colspan', 1))
+
+        return self.default_or_raise(ColumnNotFound('Unable to find column %s' % ' or '.join(self.names)))
diff --git a/weboob/browser/filters/standard.py b/weboob/browser/filters/standard.py
index fd0ac4a199a18a4ce55efec3e4e0c1414463e773..6db5bc23aaffa26a18b3db6141c4acae83d8ce42 100644
--- a/weboob/browser/filters/standard.py
+++ b/weboob/browser/filters/standard.py
@@ -37,8 +37,8 @@
 from .base import _NO_DEFAULT, Filter, FilterError, ItemNotFound, _Filter, debug
 
 __all__ = [
-    'Filter', 'FilterError', 'ColumnNotFound', 'RegexpError', 'FormatError',
-    'AsyncLoad', 'Async', 'Base', 'Decode', 'Env', 'TableCell', 'RawText',
+    'Filter', 'FilterError', 'RegexpError', 'FormatError',
+    'AsyncLoad', 'Async', 'Base', 'Decode', 'Env', 'RawText',
     'CleanText', 'Lower', 'Upper', 'Title', 'Currency', 'NumberFormatError',
     'CleanDecimal', 'Slugify', 'Type', 'Field', 'Regexp', 'Map', 'MapIn',
     'DateTime', 'FromTimestamp', 'Date', 'DateGuesser', 'Time', 'Duration',
@@ -47,10 +47,6 @@
 ]
 
 
-class ColumnNotFound(FilterError):
-    pass
-
-
 class RegexpError(FilterError):
     pass
 
@@ -178,67 +174,6 @@ def __call__(self, item):
             return self.default_or_raise(ItemNotFound('Environment variable %s not found' % self.name))
 
 
-class TableCell(_Filter):
-    """
-    Used with TableElement, gets the cell element from its name.
-
-    For example:
-
-    >>> from weboob.capabilities.bank import Transaction
-    >>> from weboob.browser.elements import TableElement, ItemElement
-    >>> class table(TableElement):
-    ...     head_xpath = '//table/thead/th'
-    ...     item_xpath = '//table/tbody/tr'
-    ...     col_date = u'Date'
-    ...     col_label = [u'Name', u'Label']
-    ...     class item(ItemElement):
-    ...         klass = Transaction
-    ...         obj_date = Date(TableCell('date'))
-    ...         obj_label = CleanText(TableCell('label'))
-    ...
-
-    TableCell handles table tags that have
-    a "colspan" attribute that modify the width of the column:
-    for example <td colspan="2"> will occupy two columns instead of one,
-    creating a column shift for all the next columns that must be taken
-    in consideration when trying to match columns values with column heads.
-    """
-
-    def __init__(self, *names, **kwargs):
-        support_th = kwargs.pop('support_th', False)
-        kwargs.pop('colspan', True)
-        super(TableCell, self).__init__(**kwargs)
-        self.names = names
-
-        if support_th:
-            self.td = '(./th | ./td)[%s]'
-        else:
-            self.td = './td[%s]'
-
-    def __call__(self, item):
-        # New behavior, handling colspans > 1
-        for name in self.names:
-            col_idx = item.parent.get_colnum(name)
-            if col_idx is not None:
-                current_col = 0
-                for td_idx in range(col_idx + 1):
-                    ret = item.xpath(self.td % (td_idx + 1))
-                    if col_idx <= current_col:
-                        for el in ret:
-                            self.highlight_el(el, item)
-                        return ret
-
-                    if not ret:
-                        # There might no be no TD at all
-                        # ColumnNotFound seems for case when corresponding header is not found
-                        # Thus for compat return empty
-                        return []
-
-                    current_col += int(ret[0].attrib.get('colspan', 1))
-
-        return self.default_or_raise(ColumnNotFound('Unable to find column %s' % ' or '.join(self.names)))
-
-
 class RawText(Filter):
     """Get raw text from an element.
 