Skip to content
Commits on Source (3)
# -*- coding: utf-8 -*-
# Copyright(C) 2014 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import lxml.html as html
from six.moves.html_parser import HTMLParser
from weboob.tools.compat import basestring, unicode, urljoin
from weboob.tools.html import html2text
from weboob.browser.filters.base import _NO_DEFAULT, Filter, FilterError, _Selector, debug, ItemNotFound
from weboob.browser.filters.standard import (
TableCell, ColumnNotFound, # TODO move class here when modules are migrated
CleanText,
)
# Names exported by a star-import of this module.
__all__ = [
    'CSS', 'XPath', 'XPathNotFound', 'AttributeNotFound',
    'Attr', 'Link', 'AbsoluteLink',
    'CleanHTML', 'FormValue', 'HasElement',
    'TableCell', 'ColumnNotFound',
    'ReplaceEntities',
    # Raised by FormValue.filter(); exported so callers can catch it too.
    'UnrecognizedElement',
]
from weboob.browser.filters.html import XPathNotFound as _XPathNotFound
class XPathNotFound(_XPathNotFound):
    """Local subclass of the imported ``XPathNotFound``; adds no behaviour."""
from weboob.browser.filters.html import AttributeNotFound as _AttributeNotFound
class AttributeNotFound(_AttributeNotFound):
    """Local subclass of the imported ``AttributeNotFound``; adds no behaviour."""
from weboob.browser.filters.html import CSS as _CSS
class CSS(_CSS):
    """Select HTML elements with a CSS selector.

    For example::

        obj_foo = CleanText(CSS('div.main'))

    will take the text of all ``<div>`` having CSS class "main".
    """

    def select(self, selector, item):
        """Apply ``selector`` to ``item`` and highlight the matched elements.

        :param selector: CSS selector handed to ``item.cssselect``
        :param item: object exposing a ``cssselect`` method
        :return: whatever ``item.cssselect(selector)`` returned, unchanged
        """
        matches = item.cssselect(selector)

        # Only a list result is walked; anything else is returned untouched.
        if not isinstance(matches, list):
            return matches

        for node in matches:
            if isinstance(node, html.HtmlElement):
                self.highlight_el(node, item)
        return matches
from weboob.browser.filters.html import XPath as _XPath
class XPath(_XPath):
    """Select HTML elements with a XPath selector.

    Nothing is overridden here: all behaviour comes from the imported base
    class.
    """
from weboob.browser.filters.html import Attr as _Attr
class Attr(_Attr):
    """Get the text value of an HTML attribute.

    The value is read from attribute `attr` of the first HTML element matched
    by `selector`.

    For example::

        obj_foo = Attr('//img[@id="thumbnail"]', 'src')

    will take the "src" attribute of ``<img>`` whose "id" is "thumbnail".
    """

    def __init__(self, selector, attr, default=_NO_DEFAULT):
        """
        :param selector: selector targeting the element
        :param attr: name of the attribute to take
        :param default: forwarded to the parent filter as ``default``
        """
        super(Attr, self).__init__(selector, attr, default=default)
        self.attr = attr

    @debug()
    def filter(self, el):
        """Return the requested attribute of the first matched element as text.

        :param el: indexable collection of matched elements
        :raises: :class:`XPathNotFound` if no element is found
        :raises: :class:`AttributeNotFound` if the element doesn't have the requested attribute

        In both cases the error goes through ``default_or_raise``.
        """
        try:
            node = el[0]
        except IndexError:
            not_found = XPathNotFound('Unable to find element %s' % self.selector)
            return self.default_or_raise(not_found)

        try:
            value = node.attrib[self.attr]
        except KeyError:
            missing = AttributeNotFound('Element %s does not have attribute %s' % (node, self.attr))
            return self.default_or_raise(missing)

        return u'%s' % value
from weboob.browser.filters.html import Link as _Link
class Link(_Link):
    """
    Get the link URI of an element.

    NOTE(review): the previous documentation stated that `IndexError` is
    raised when the ``<a>`` tag is not found; the code that would raise lives
    in the parent class and is not visible here -- confirm.
    """

    def __init__(self, selector=None, default=_NO_DEFAULT):
        """
        :param selector: selector targeting the element (optional)
        :param default: forwarded to the parent filter as ``default``
        """
        # No attribute name is passed along: only the selector and the default.
        super(Link, self).__init__(selector, default=default)
from weboob.browser.filters.html import AbsoluteLink as _AbsoluteLink
class AbsoluteLink(_AbsoluteLink):
    """Get the absolute link URI of an element."""

    def __call__(self, item):
        """Resolve the parent filter's result against ``item.page.url``.

        A falsy result from the parent is returned as is, without joining.
        """
        link = super(AbsoluteLink, self).__call__(item)
        if not link:
            return link
        return urljoin(item.page.url, link)
from weboob.browser.filters.html import CleanHTML as _CleanHTML
class CleanHTML(_CleanHTML):
    """Convert HTML to text (Markdown) using html2text.

    .. seealso:: `html2text site <https://pypi.python.org/pypi/html2text>`_
    """

    def __init__(self, selector=None, options=None, default=_NO_DEFAULT):
        """
        :param selector: forwarded to the parent filter
        :param options: options suitable for html2text
        :type options: dict
        :param default: forwarded to the parent filter
        """
        super(CleanHTML, self).__init__(selector=selector, default=default)
        self.options = options

    @debug()
    def filter(self, txt):
        """Clean one value, or each value of a tuple/list joined by a space."""
        if not isinstance(txt, (tuple, list)):
            return self.clean(txt, self.options)
        return u' '.join(self.clean(part, self.options) for part in txt)

    @classmethod
    def clean(cls, txt, options=None):
        """Run html2text on `txt`, serializing it first if it is not a string.

        :param txt: a string, or an object accepted by ``lxml.html.tostring``
        :param options: keyword arguments for html2text; a falsy value means none
        """
        if not isinstance(txt, basestring):
            txt = html.tostring(txt, encoding=unicode)
        return html2text(txt, **(options or {}))
from weboob.browser.filters.html import UnrecognizedElement as _UnrecognizedElement
class UnrecognizedElement(_UnrecognizedElement):
    """Local subclass of the imported ``UnrecognizedElement``; adds no behaviour."""
from weboob.browser.filters.html import FormValue as _FormValue
class FormValue(_FormValue):
    """
    Extract a Python value from a form element.

    Checkboxes and radio return booleans, while the rest
    return text. For ``<select>`` tags, returns the user-visible text.
    """

    @debug()
    def filter(self, el):
        """Return the value carried by the first matched form element.

        :param el: indexable collection of matched elements
        :return: a boolean for radio/checkbox inputs, text otherwise
        :raises: :class:`XPathNotFound` (through ``default_or_raise``) if no
                 element is found
        :raises: :class:`AttributeNotFound` (through ``default_or_raise``) if
                 a text-like ``<input>`` has no ``value`` attribute
        :raises: :class:`UnrecognizedElement` for unsupported tags or input types
        """
        try:
            el = el[0]
        except IndexError:
            return self.default_or_raise(XPathNotFound('Unable to find element %s' % self.selector))

        if el.tag == 'input':
            input_type = el.attrib.get('type', '')
            # checkboxes or radios
            if input_type in ('radio', 'checkbox'):
                return 'checked' in el.attrib
            # regular text input (a missing "type" counts as text)
            if input_type in ('', 'text', 'email', 'search', 'tel', 'url'):
                try:
                    return unicode(el.attrib['value'])
                except KeyError:
                    return self.default_or_raise(AttributeNotFound('Element %s does not have attribute value' % el))
            # TODO handle html5 number, datetime, etc.
            raise UnrecognizedElement('Element %s is not recognized' % el)

        if el.tag == 'textarea':
            # An element without text has ``text`` set to None; return an
            # empty string rather than the literal u'None'.
            return unicode(el.text or u'')

        if el.tag == 'select':
            options = el.xpath('.//option[@selected]')
            # default is the first one
            if not options:
                options = el.xpath('.//option[1]')
            # Same None guard as for <textarea>.
            return u'\n'.join(unicode(o.text or u'') for o in options)

        raise UnrecognizedElement('Element %s is not recognized' % el)
from weboob.browser.filters.html import HasElement as _HasElement
class HasElement(_HasElement):
    """
    Returns `yesvalue` if the `selector` finds elements, `novalue` otherwise.
    """

    def __init__(self, selector, yesvalue=True, novalue=False):
        """
        :param selector: selector to evaluate
        :param yesvalue: value returned when the selection is truthy
        :param novalue: passed to the parent filter as its ``default``
        """
        super(HasElement, self).__init__(selector, default=novalue)
        self.yesvalue = yesvalue

    @debug()
    def filter(self, value):
        """Map a truthy selection to `yesvalue`; otherwise defer to ``default_or_raise``."""
        if not value:
            return self.default_or_raise(FilterError('No default value'))
        return self.yesvalue
class ReplaceEntities(CleanText):
    """
    Filter to replace HTML entities like "&eacute;" or "&#x42;" with their unicode counterpart.
    """

    def filter(self, data):
        """Clean `data` with the parent filter, then unescape HTML entities.

        :param data: value accepted by the parent ``filter``
        :return: the cleaned text with HTML entities replaced
        """
        # HTMLParser.unescape() was removed in Python 3.9. Prefer the stdlib
        # html.unescape() and fall back to the parser method only where that
        # function cannot be imported (Python 2).
        try:
            from html import unescape
        except ImportError:
            unescape = HTMLParser().unescape

        txt = super(ReplaceEntities, self).filter(data)
        return unescape(txt)
......@@ -24,7 +24,7 @@
from weboob.browser.pages import HTMLPage, PDFPage, LoggedPage
from weboob.browser.elements import TableElement, ListElement, ItemElement, method
from weboob.browser.filters.standard import CleanText, CleanDecimal, Regexp, Field, Date, Eval
from weboob.browser.filters.html import Attr, TableCell, ReplaceEntities
from .compat.weboob_browser_filters_html import Attr, TableCell, ReplaceEntities
from weboob.capabilities.bank import Account, Investment, Loan, NotAvailable
from weboob.tools.capabilities.bank.transactions import FrenchTransaction
from weboob.tools.capabilities.bank.iban import is_iban_valid
......
import weboob.capabilities.bank as OLD
# can't import *, __all__ is incomplete...
# Copy every attribute of OLD except dunder names (__name__, __file__,
# __doc__, __spec__, ...): overwriting those would make this module report
# OLD's identity instead of its own. The generator also keeps the loop
# variable out of the module namespace.
globals().update(
    (_name, getattr(OLD, _name))
    for _name in dir(OLD)
    if not (_name.startswith('__') and _name.endswith('__'))
)
# Copy the list so a later change to one module's __all__ does not affect the other.
__all__ = list(OLD.__all__)
class AccountOwnerType(object):
    """
    Constants describing how an account is used.

    Each value is a short unicode code.
    """

    PRIVATE = u'PRIV'
    """private personal account"""

    ORGANIZATION = u'ORGA'
    """professional account"""
......@@ -26,7 +26,7 @@
from weboob.browser.pages import HTMLPage, JsonPage, LoggedPage
from weboob.exceptions import BrowserUnavailable
from weboob.capabilities import NotAvailable
from weboob.capabilities.bank import (
from .compat.weboob_capabilities_bank import (
Account, AccountOwnerType,
)
......
......@@ -6,7 +6,7 @@
import sys
import re
from contextlib import contextmanager
from os import system, path, makedirs, getenv
from os import system, path, makedirs, getenv, mknod, unlink
from subprocess import check_output, STDOUT, CalledProcessError
from collections import defaultdict
import shutil
......@@ -43,6 +43,7 @@ def create_compat_dir(name):
MANUAL_PORTS = [
'weboob.capabilities.bank',
]
MANUAL_PORT_DIR = path.join(path.dirname(__file__), 'stable_backport_data')
......@@ -180,7 +181,9 @@ def main(self):
system('git add -u')
with log('Lookup modules errors'):
mknod('modules/__init__.py')
r = check_output("pylint modules -f parseable -E -d all -e no-name-in-module,import-error; exit 0", shell=True, stderr=STDOUT).decode('utf-8')
unlink('modules/__init__.py')
dirnames = defaultdict(list)
for line in r.split('\n'):
......@@ -204,6 +207,10 @@ def main(self):
error.fixup()
system('git add %s' % compat_dirname)
with log('Custom fixups'):
replace_all('super(Attr, self).__init__(selector, default=default)', 'super(Attr, self).__init__(selector, attr, default=default)')
replace_all("super(Link, self).__init__(selector, 'href', default=default)", "super(Link, self).__init__(selector, default=default)")
system('git add -u')
......
import weboob.capabilities.bank as OLD
# can't import *, __all__ is incomplete...
# Copy every attribute of OLD except dunder names (__name__, __file__,
# __doc__, __spec__, ...): overwriting those would make this module report
# OLD's identity instead of its own. The generator also keeps the loop
# variable out of the module namespace.
globals().update(
    (_name, getattr(OLD, _name))
    for _name in dir(OLD)
    if not (_name.startswith('__') and _name.endswith('__'))
)
# Copy the list so a later change to one module's __all__ does not affect the other.
__all__ = list(OLD.__all__)
class AccountOwnerType(object):
    """
    Constants describing how an account is used.

    Each value is a short unicode code.
    """

    PRIVATE = u'PRIV'
    """private personal account"""

    ORGANIZATION = u'ORGA'
    """professional account"""