# -*- coding: utf-8 -*-

# Copyright(C) 2014 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.

from __future__ import print_function

import os
import re
import sys
from collections import OrderedDict
from copy import deepcopy
import traceback

import lxml.html

from weboob.tools.log import getLogger, DEBUG_FILTERS
from weboob.tools.compat import basestring, unicode, with_metaclass
from weboob.browser.pages import NextPage
from weboob.capabilities.base import FetchError

from .filters.standard import _Filter, CleanText
from .filters.html import AttributeNotFound, XPathNotFound


# Names exported by a star-import of this module.
__all__ = [
    'DataError',
    'AbstractElement',
    'ListElement',
    'ItemElement',
    'TableElement',
    'SkipItem',
]


def generate_table_element(doc, head_xpath, cleaner=CleanText):
    """
    Print a code skeleton for a TableElement/TableCell based parser.

    Meant as a development helper (typically run from weboob-debug): the
    header cells matched by ``head_xpath`` are cleaned with ``cleaner`` and
    turned into ``col_*`` declarations and matching ``obj_*`` filters, all
    written to standard output.

    :param doc: lxml tree of the page (e.g. browser.page.doc)
    :param head_xpath: xpath of header columns (e.g. //table//th)
    :type head_xpath: str
    :param cleaner: cleaner class (Filter)
    :type cleaner: Filter
    """
    from unidecode import unidecode

    # One, two and three levels of 4-space indentation.
    pad1 = ' ' * 4
    pad2 = pad1 * 2
    pad3 = pad1 * 3

    # identifier -> original header text; a later header producing the same
    # identifier replaces the earlier one.
    cols = {}
    for header in doc.xpath(head_xpath):
        title = cleaner.clean(header)
        ident = re.sub('[^a-zA-Z]', '_', unidecode(title)).lower()
        cols[ident] = title

    print(pad1 + '@method')
    print(pad1 + 'class get_items(TableElement):')
    if cleaner is not CleanText:
        print(pad2 + 'cleaner = %s' % cleaner.__name__)
    print(pad2 + 'head_xpath = ' + repr(head_xpath))
    print(pad2 + 'item_xpath = ' + repr('...') + '\n')

    for ident, title in cols.items():
        print(pad2 + 'col_' + ident + ' = ' + repr(title))

    print('\n' + pad2 + 'class item(ItemElement):')
    print(pad3 + 'klass = BaseObject' + '\n')

    for ident in cols:
        print(pad3 + 'obj_' + ident + ' = ' + "TableCell('%s') & CleanText()" % ident)


class DataError(Exception):
    """
    Error raised when the data returned by pages are incoherent
    (in this module: two stored objects sharing the same ID).
    """


def method(klass):
    """
    Class-decorator to call it as a method.

    The decorated class is replaced by a plain function. Calling that
    function on an object instantiates ``klass`` with the object as its only
    argument, then calls the new instance with the call's own arguments.

    :param klass: class to wrap; ``klass(self)`` must return a callable
    :returns: function usable as a method of the class it is defined in
    """
    def inner(self, *args, **kwargs):
        element = klass(self)
        return element(*args, **kwargs)
    return inner


class AbstractElement(object):
    """
    Common base of the element classes of this module.

    An element is bound to a page, an optional parent element and a node
    (``el``), and carries an environment (``env``) copied from its parent
    or, failing that, from the page parameters.
    """

    # Class-wide counter giving each instance a distinct id (used by debug).
    _creation_counter = 0
    # Optional callable taking no argument; the __iter__ methods of
    # ListElement and ItemElement yield nothing when it returns a false value.
    condition = None

    def __init__(self, page, parent=None, el=None):
        """
        :param page: page this element belongs to; must provide ``doc`` and
                     ``params`` when no parent/el is given
        :param parent: parent element, or None for a root element
        :param el: node to work on; defaults to the parent's node, then to
                   ``page.doc``
        """
        self.page = page
        self.parent = parent

        if el is not None:
            self.el = el
        elif parent is not None:
            self.el = parent.el
        else:
            self.el = page.doc

        # Deep copy, so that changes made to this element's environment do
        # not affect the parent's (or the page's) one.
        source_env = parent.env if parent is not None else page.params
        self.env = deepcopy(source_env)

        # Used by debug
        self._random_id = AbstractElement._creation_counter
        AbstractElement._creation_counter += 1

        self.loaders = {}

    def use_selector(self, func, key=None):
        """
        Evaluate ``func`` in the context of this element and return the value.

        * a ``_Filter`` instance is bound to this element (``_obj``/``_key``)
          and called with it;
        * an ``ItemElement`` subclass is instantiated on the same page/node
          and called, giving a single object;
        * a ``ListElement`` subclass is instantiated likewise and its results
          are collected in a list;
        * any other callable is called without argument;
        * anything else is returned as a deep copy.

        :param func: filter, element class, callable or constant
        :param key: name of the attribute being computed, handed to filters
        """
        if isinstance(func, _Filter):
            func._obj = self
            func._key = key
            return func(self)

        if isinstance(func, type):
            if issubclass(func, ItemElement):
                return func(self.page, self, self.el)()
            if issubclass(func, ListElement):
                return list(func(self.page, self, self.el)())

        if callable(func):
            return func()

        return deepcopy(func)

    def parse(self, obj):
        """Hook called with the element node before items/attributes are handled; no-op here."""
        pass

    def cssselect(self, *args, **kwargs):
        """Shortcut to ``self.el.cssselect``."""
        return self.el.cssselect(*args, **kwargs)

    def xpath(self, *args, **kwargs):
        """Shortcut to ``self.el.xpath``."""
        return self.el.xpath(*args, **kwargs)

    def handle_loaders(self):
        """
        Evaluate every ``load_<name>`` attribute once and store the result
        in ``self.loaders[<name>]``. Names already present are left alone.
        """
        prefix = 'load_'
        for attrname in dir(self):
            if not attrname.startswith(prefix):
                continue
            name = attrname[len(prefix):]
            if name in self.loaders:
                continue
            loader = getattr(self, attrname)
            self.loaders[name] = self.use_selector(loader, key=attrname)


class ListElement(AbstractElement):
    """
    Element producing several objects.

    For each node returned by :meth:`find_elements`, every nested
    :class:`AbstractElement` subclass found as an attribute is instantiated
    on that node; the objects those sub-elements yield are stored and
    yielded in turn.
    """

    # XPath (relative to self.el) selecting the nodes to process; when None,
    # self.el itself is the only node.
    item_xpath = None
    # When true, objects are only yielded once all items have been processed.
    flush_at_end = False
    # When true, an object whose id was already seen is logged and dropped
    # instead of raising DataError.
    ignore_duplicate = False

    def __init__(self, *args, **kwargs):
        super(ListElement, self).__init__(*args, **kwargs)
        self.logger = getLogger(self.__class__.__name__.lower())
        # Objects with a non-empty id, keyed by id, in insertion order.
        self.objects = OrderedDict()

    def __call__(self, *args, **kwargs):
        """
        Store keyword arguments in the environment and return an iterator
        over the objects. Positional arguments are accepted but ignored.
        """
        for key, value in kwargs.items():
            self.env[key] = value

        return self.__iter__()

    def find_elements(self):
        """
        Get the nodes that will have to be processed.
        This method can be overridden if xpath filters are not
        sufficient.
        """
        if self.item_xpath is not None:
            for el in self.el.xpath(self.item_xpath):
                yield el
        else:
            yield self.el

    def __iter__(self):
        """
        Yield the objects built by the nested elements.

        :raises NextPage: after the last object, when ``next_page`` is
                          defined and evaluates to a value other than None
        """
        if self.condition is not None and not self.condition():
            return

        self.parse(self.el)

        # First pass: instantiate every sub-element (and run its loaders)
        # before any object is built.
        items = []
        for el in self.find_elements():
            for attrname in dir(self):
                attr = getattr(self, attrname)
                if not (isinstance(attr, type) and issubclass(attr, AbstractElement)):
                    continue
                if attr == type(self):
                    continue

                item = attr(self.page, self, el)
                if item.condition is not None and not item.condition():
                    continue

                item.handle_loaders()
                items.append(item)

        # Second pass: build, store and (unless flushing at end) yield objects.
        for item in items:
            for obj in item:
                obj = self.store(obj)
                if obj and not self.flush_at_end:
                    yield obj

        if self.flush_at_end:
            for obj in self.flush():
                yield obj

        self.check_next_page()

    def flush(self):
        """Yield every stored object, in the order they were stored."""
        for obj in self.objects.values():
            yield obj

    def check_next_page(self):
        """
        Evaluate the optional ``next_page`` attribute.

        Nothing happens when the attribute is missing, when evaluating it
        raises AttributeNotFound/XPathNotFound, or when its value is None.

        :raises NextPage: with the computed value otherwise
        """
        if not hasattr(self, 'next_page'):
            return

        try:
            value = self.use_selector(self.next_page)
        except (AttributeNotFound, XPathNotFound):
            return

        if value is None:
            return

        raise NextPage(value)

    def store(self, obj):
        """
        Remember ``obj`` under its id (when it has one) and return it.

        :returns: the object, or None when it is an ignored duplicate
        :raises DataError: on a duplicate id, unless ``ignore_duplicate``
        """
        if obj.id:
            if obj.id in self.objects:
                if self.ignore_duplicate:
                    self.logger.warning('There are two objects with the same ID! %s', obj.id)
                    return None
                raise DataError('There are two objects with the same ID! %s' % obj.id)
            self.objects[obj.id] = obj
        return obj


class SkipItem(Exception):
    """
    Exception to raise from an :class:`ItemElement` subclass in order to
    skip the item being built.
    """


class _ItemElementMeta(type):
    """
    Private meta-class used to keep order of obj_* attributes in :class:`ItemElement`.
    """
    def __new__(mcs, name, bases, attrs):
        _attrs = []
        for base in bases:
            if hasattr(base, '_attrs'):
                _attrs += base._attrs

        filters = [(re.sub('^obj_', '', attr_name), attrs[attr_name]) for attr_name, obj in attrs.items() if attr_name.startswith('obj_')]
        # constants first, then filters, then methods
264
        filters.sort(key=lambda x: x[1]._creation_counter if hasattr(x[1], '_creation_counter') else (sys.maxsize if callable(x[1]) else 0))
265

266
        attrs['_class_file'], attrs['_class_line'] = traceback.extract_stack()[-2][:2]
267 268 269 270 271
        new_class = super(_ItemElementMeta, mcs).__new__(mcs, name, bases, attrs)
        new_class._attrs = _attrs + [f[0] for f in filters]
        return new_class


class ItemElement(with_metaclass(_ItemElementMeta, AbstractElement)):
    """
    Element building a single object.

    The object is created by :meth:`build_object` (unless one is supplied
    when calling the element), then every ``obj_<name>`` attribute of the
    class is evaluated, in the order recorded by the meta-class, and the
    result is set as attribute ``<name>`` of the object.
    """

    # Ordered names of the obj_* attributes; filled in by the meta-class.
    _attrs = None
    # Not used by the code of this class.
    _loaders = None
    # Class of the object to build; None means no object is created here.
    klass = None
    # Optional callable taking the built object; the object is dropped when
    # it returns a false value.
    validate = None
    # When true, an error on a declared, non-mandatory field is stored as
    # FetchError on the object instead of being propagated.
    skip_optional_fields_errors = False

    class Index(object):
        pass

    def __init__(self, *args, **kwargs):
        super(ItemElement, self).__init__(*args, **kwargs)
        self.logger = getLogger(self.__class__.__name__.lower())
        self.obj = None
        self.saved_attrib = {}  # safer way would be to clone lxml tree

    def build_object(self):
        """Return a new ``klass`` instance, or None when ``klass`` is unset."""
        if self.klass is None:
            return
        return self.klass()

    def _restore_attrib(self):
        """Put back the node attributes saved before highlighting."""
        for el, attrib in self.saved_attrib.items():
            el.attrib.clear()
            el.attrib.update(attrib)
        self.saved_attrib = {}

    def should_highlight(self):
        """
        Tell whether the node should be highlighted and dumped: the browser
        must have both ``responses_dirname`` and ``highlight_el`` set and
        the node must belong to a tree. Any missing attribute means no.
        """
        try:
            browser = self.page.browser
            if not (browser.responses_dirname and browser.highlight_el):
                return False
            if not self.el.getroottree():
                return False
        except AttributeError:
            return False
        return True

    def _write_highlighted(self):
        """Dump the whole document to ``obj-<id>.html`` in the responses directory."""
        if not self.should_highlight():
            return

        responses_dirname = self.page.browser.responses_dirname
        html = lxml.html.tostring(self.el.getroottree().getroot())

        fn = os.path.join(responses_dirname, 'obj-%s.html' % self._random_id)
        # tostring() returns bytes by default: the file must be opened in
        # binary mode, text mode raises TypeError on Python 3.
        with open(fn, 'wb') as fd:
            fd.write(html)
        self.logger.debug('highlighted object to %s', fn)

    def __call__(self, obj=None):
        """
        Build and return the object.

        :param obj: existing object to fill instead of creating a new one
        :returns: the object, or None when the item was skipped
        """
        if obj is not None:
            self.obj = obj

        return next(iter(self), None)

    def __iter__(self):
        """
        Yield the built object, or nothing when the condition is false,
        SkipItem is raised or validation fails.
        """
        if self.condition is not None and not self.condition():
            return

        highlight = False
        try:
            if self.should_highlight():
                self.saved_attrib[self.el] = dict(self.el.attrib)
                self.el.attrib['style'] = 'color: white !important; background: orange !important;'

            try:
                if self.obj is None:
                    self.obj = self.build_object()
                self.parse(self.el)
                self.handle_loaders()
                for attr in self._attrs:
                    self.handle_attr(attr, getattr(self, 'obj_%s' % attr))
            except SkipItem:
                return

            if self.validate is not None and not self.validate(self.obj):
                return

            # Only objects that are actually yielded get dumped.
            highlight = True
        finally:
            if highlight:
                self._write_highlighted()
            self._restore_attrib()

        yield self.obj

    def handle_attr(self, key, func):
        """
        Compute attribute ``key`` with ``func`` and set it on the object.

        SkipItem is always propagated. Other exceptions are propagated too,
        unless ``skip_optional_fields_errors`` is set and ``key`` is a
        declared, non-mandatory field, in which case the value FetchError
        is stored.
        """
        try:
            value = self.use_selector(func, key=key)
        except SkipItem as e:
            # Help debugging as tracebacks do not give us the key
            self.logger.debug("Attribute %s raises a %r", key, e)
            raise
        except Exception as e:
            # If we are here, we have probably a real parsing issue
            self.logger.warning('Attribute %s (in %s:%s) raises %s', key, self._class_file, self._class_line, repr(e))
            if not self.skip_optional_fields_errors or key not in self.obj._fields or self.obj._fields[key].mandatory:
                raise
            value = FetchError

        logger = getLogger('b2filters')
        logger.log(DEBUG_FILTERS, "%s.%s = %r", self._random_id, key, value)
        setattr(self.obj, key, value)


class TableElement(ListElement):
    """
    ListElement for tables: header cells are matched against the ``col_*``
    attributes of the class to find out the index of each named column.

    A ``col_<name>`` attribute is a title, a compiled regular expression, or
    a list/tuple of those. String titles are compared case-insensitively
    with the cleaned header text; expressions are tried with ``match``.
    """

    # XPath (relative to self.el) selecting the header cells.
    head_xpath = None
    # Class whose ``clean`` is used to extract the text of a header cell.
    cleaner = CleanText

    def __init__(self, *args, **kwargs):
        super(TableElement, self).__init__(*args, **kwargs)

        # column name -> index of the first header cell that matched it
        self._cols = {}

        text_types = (str, unicode)
        pattern_type = type(re.compile(''))

        prefix = 'col_'
        columns = {}
        for attrname in dir(self):
            if not attrname.startswith(prefix):
                continue
            cols = getattr(self, attrname)
            if not isinstance(cols, (list, tuple)):
                cols = [cols]
            columns[attrname[len(prefix):]] = [s.lower() if isinstance(s, text_types) else s for s in cols]

        colnum = 0
        for el in self.el.xpath(self.head_xpath):
            title = self.cleaner.clean(el)
            for name, titles in columns.items():
                if name in self._cols:
                    # Only the first matching header counts.
                    continue
                if title.lower() in [s for s in titles if isinstance(s, text_types)] or \
                   any(s.match(title) for s in titles if isinstance(s, pattern_type)):
                    self._cols[name] = colnum
            # A cell spanning several columns shifts the following indexes.
            try:
                colnum += int(el.attrib.get('colspan', 1))
            except (ValueError, AttributeError):
                colnum += 1

    def get_colnum(self, name):
        """Return the index of column ``name``, or None if no header matched."""
        return self._cols.get(name)


class DictElement(ListElement):
    """
    ListElement working on nested dicts/lists instead of a node tree.

    ``item_xpath`` is a path made of keys, either a ``/``-separated string
    or a sequence. Each key is looked up in the current containers (as an
    integer index for lists); ``*`` expands to all values of a dict or all
    entries of a list.
    """

    def find_elements(self):
        """Yield the entries of every container reached by ``item_xpath``."""
        if self.item_xpath is None:
            selector = []
        elif isinstance(self.item_xpath, basestring):
            selector = self.item_xpath.split('/')
        else:
            selector = self.item_xpath

        bases = [self.el]
        for key in selector:
            if key == '*':
                # Flatten one level in a single pass.
                bases = [child
                         for el in bases
                         for child in (el if isinstance(el, list) else el.values())]
            else:
                bases = [el[int(key)] if isinstance(el, list) else el[key] for el in bases]

        for base in bases:
            for el in base:
                yield el


def magic_highlight(els, open_browser=True):
    """Open a web browser with the document open and the element highlighted

    The elements temporarily get a highlighting ``style`` attribute while
    the document is serialized to a temporary HTML file; their original
    style (or absence of style) is put back afterwards.

    :param els: a single lxml element, or a list/tuple of elements
    :param open_browser: whether to open the saved file in a web browser
    :raises Exception: if there is no element to highlight
    """
    # Do not test a single element for truth: lxml elements without
    # children are false, which would wrongly reject them.
    if els is None:
        raise Exception('no elements to highlight')

    if not isinstance(els, (list, tuple)):
        els = [els]

    if len(els) == 0:
        raise Exception('no elements to highlight')

    import lxml.html
    import webbrowser
    import tempfile

    # element -> original style, None meaning "no style attribute"
    saved = {}
    try:
        for el in els:
            # Keep the first value seen, in case an element is listed twice.
            if el not in saved:
                saved[el] = el.attrib.get('style')
            el.attrib['style'] = 'color: white !important; background: red !important;'

        html = lxml.html.tostring(els[-1].xpath('/*')[0])
    finally:
        for el, style in saved.items():
            if style is None:
                el.attrib.pop('style', None)
            else:
                el.attrib['style'] = style

    fd, fn = tempfile.mkstemp(dir='/tmp', prefix='weboob-highlight', suffix='.html')
    # Reuse the descriptor opened by mkstemp (otherwise it leaks), in binary
    # mode because tostring() returns bytes.
    with os.fdopen(fd, 'wb') as out:
        out.write(html)

    print('Saved to %r' % fn)
    if open_browser:
        webbrowser.open('file://%s' % fn)