Newer
Older
# -*- coding: utf-8 -*-
# Copyright(C) 2014 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
# You should have received a copy of the GNU Lesser General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from decimal import Decimal
from six.moves.html_parser import HTMLParser
from weboob.tools.compat import basestring, unicode, urljoin
from weboob.tools.html import html2text
from .base import _NO_DEFAULT, Filter, FilterError, _Selector, debug, ItemNotFound
from .standard import (
TableCell, ColumnNotFound, # TODO move class here when modules are migrated
CleanText,
)
__all__ = ['CSS', 'XPath', 'XPathNotFound', 'AttributeNotFound',
'Attr', 'Link', 'AbsoluteLink',
'CleanHTML', 'FormValue', 'HasElement',
class XPathNotFound(ItemNotFound):
class AttributeNotFound(ItemNotFound):
"""Select HTML elements with a CSS selector
For example::
obj_foo = CleanText(CSS('div.main'))
will take the text of all ``<div>`` having CSS class "main".
"""
def select(self, selector, item):
    """
    Run the CSS `selector` against `item` and return whatever it yields.

    :param selector: CSS selector string handed to ``item.cssselect``
    :param item: object exposing a ``cssselect`` method
    :return: the unmodified result of ``item.cssselect(selector)``
    """
    matches = item.cssselect(selector)

    # Only list results are inspected; anything else is handed back as-is.
    if not isinstance(matches, list):
        return matches

    for node in matches:
        # Non-element entries of the list are skipped, only HTML elements
        # are forwarded to ``highlight_el``.
        if isinstance(node, html.HtmlElement):
            self.highlight_el(node, item)
    return matches
"""Select HTML elements with a XPath selector
"""
"""Get the text value of an HTML attribute.
Get value from attribute `attr` of HTML element matched by `selector`.
For example::
obj_foo = Attr('//img[@id="thumbnail"]', 'src')
will take the "src" attribute of ``<img>`` whose "id" is "thumbnail".
"""
def __init__(self, selector, attr, default=_NO_DEFAULT):
    """
    Remember which attribute has to be read from the selected element.

    :param selector: selector targeting the element
    :param attr: name of the attribute to take
    :param default: value forwarded to the parent initializer as `default`
    """
    super(Attr, self).__init__(selector, default=default)
    # Stored on the instance so it can be looked up when the filter runs.
    self.attr = attr
"""
:raises: :class:`XPathNotFound` if no element is found
:raises: :class:`AttributeNotFound` if the element doesn't have the requested attribute
"""
try:
return u'%s' % el[0].attrib[self.attr]
except IndexError:
return self.default_or_raise(XPathNotFound('Unable to find element %s' % self.selector))
return self.default_or_raise(AttributeNotFound('Element %s does not have attribute %s' % (el[0], self.attr)))
class Link(Attr):
    """
    Get the link URI of an element.

    This is :class:`Attr` specialised on the ``href`` attribute. Missing
    elements or attributes are handled by the inherited :class:`Attr`
    behaviour (presumably the default value when one was given, otherwise the
    corresponding "not found" error -- verify against :class:`Attr`).
    """

    def __init__(self, selector=None, default=_NO_DEFAULT):
        """
        :param selector: selector targeting the element holding the link
        :param default: value forwarded to :class:`Attr` as `default`
        """
        super(Link, self).__init__(selector, attr='href', default=default)
class AbsoluteLink(Link):
    """
    Get the absolute link URI of an element.

    The value obtained by :class:`Link` is joined with the URL of the page
    the item belongs to (``item.page.url``).
    """

    def __call__(self, item):
        """
        :param item: object being filtered; must expose ``page.url``
        :return: the joined URL, or the falsy value produced by :class:`Link`
                 unchanged
        """
        link = super(AbsoluteLink, self).__call__(item)

        # Falsy results (empty string, None, falsy defaults) are not joined.
        if not link:
            return link
        return urljoin(item.page.url, link)
"""Convert HTML to text (Markdown) using html2text.
.. seealso:: `html2text site <https://pypi.python.org/pypi/html2text>`_
"""
def __init__(self, selector=None, options=None, default=_NO_DEFAULT):
    """
    :param selector: selector forwarded to the parent initializer
    :param options: options suitable for html2text
    :type options: dict
    :param default: value forwarded to the parent initializer as `default`
    """
    super(CleanHTML, self).__init__(selector=selector, default=default)
    # Kept as given (possibly None); read back when the text is cleaned.
    self.options = options
def filter(self, txt):
    """
    Clean `txt` with :meth:`clean`, using the options stored on the instance.

    :param txt: a single value, or a tuple/list of values
    :return: the cleaned value; for a tuple/list, the cleaned parts joined
             with a single space
    """
    # Single value: clean it directly.
    if not isinstance(txt, (tuple, list)):
        return self.clean(txt, self.options)

    # Sequence: clean every part on its own, then glue them together.
    return u' '.join(self.clean(part, self.options) for part in txt)
def clean(cls, txt, options=None):
    """
    Convert `txt` to text with ``html2text``.

    :param txt: a string, or an object that ``html.tostring`` can serialize
    :param options: keyword arguments passed to ``html2text``; a falsy value
                    means no extra arguments
    :type options: dict
    """
    # Anything that is not already a string is serialized to unicode markup first.
    markup = txt if isinstance(txt, basestring) else html.tostring(txt, encoding=unicode)
    return html2text(markup, **(options or {}))
class UnrecognizedElement(Exception):
    """Raised when a form element is of a kind that cannot be converted to a value."""
class FormValue(Filter):
    """
    Extract a Python value from a form element.

    * checkboxes and radios return a boolean (whether ``checked`` is set);
    * text-like inputs return the ``value`` attribute as text;
    * ``number``/``range`` inputs return an ``int``, or a ``Decimal`` when
      the ``step`` attribute contains a dot;
    * ``date``, ``time`` and ``datetime-local`` inputs return the matching
      ``datetime`` object;
    * ``<textarea>`` returns its text;
    * ``<select>`` returns the user-visible text of the selected options
      (one per line), or of the first option when none is selected.
    """

    def filter(self, el):
        """
        :param el: sequence of matched elements; only the first one is used
        :raises: :class:`XPathNotFound` (through ``default_or_raise``) if no
                 element is found
        :raises: :class:`AttributeNotFound` (through ``default_or_raise``) if
                 an ``<input>`` has no ``value`` attribute
        :raises: :class:`UnrecognizedElement` if the tag or the input type is
                 not handled
        """
        try:
            el = el[0]
        except IndexError:
            return self.default_or_raise(XPathNotFound('Unable to find element %s' % self.selector))

        if el.tag == 'input':
            return self._input_value(el)

        if el.tag == 'textarea':
            # An empty <textarea> has ``text`` set to None; report it as an
            # empty string instead of the literal text "None".
            return unicode(el.text or u'')

        if el.tag == 'select':
            options = el.xpath('.//option[@selected]')
            if not options:
                # Default is the first option of the whole <select>. The
                # parentheses matter: ``.//option[1]`` would match the first
                # option of every <optgroup>.
                options = el.xpath('(.//option)[1]')
            # Options without text have ``text`` set to None, see above.
            return u'\n'.join(unicode(o.text or u'') for o in options)

        raise UnrecognizedElement('Element %s is not recognized' % el)

    def _input_value(self, el):
        """Convert an ``<input>`` element to a Python value according to its type."""
        input_type = el.attrib.get('type', '')

        # Checkboxes and radios: the state is the presence of ``checked``,
        # the ``value`` attribute is not looked at.
        if input_type in ('radio', 'checkbox'):
            return 'checked' in el.attrib

        try:
            # Regular text input (a missing type means "text").
            if input_type in ('', 'text', 'email', 'search', 'tel', 'url', 'password', 'hidden', 'color'):
                return unicode(el.attrib['value'])

            # Numeric input: a fractional step means fractional values.
            if input_type in ('number', 'range'):
                if '.' in el.attrib.get('step', ''):
                    return Decimal(el.attrib['value'])
                return int(el.attrib['value'])

            # Date and time inputs.
            if input_type == 'date':
                return datetime.datetime.strptime(el.attrib['value'], '%Y-%m-%d').date()
            if input_type == 'time':
                return datetime.datetime.strptime(el.attrib['value'], '%H:%M').time()
            if input_type == 'datetime-local':
                return datetime.datetime.strptime(el.attrib['value'], '%Y-%m-%dT%H:%M')
        except KeyError:
            # Only the ``el.attrib['value']`` lookups can raise KeyError here.
            return self.default_or_raise(AttributeNotFound('Element %s does not have attribute value' % el))

        raise UnrecognizedElement('Element %s is not recognized' % el)
class HasElement(Filter):
    """
    Returns `yesvalue` if the `selector` finds elements, `novalue` otherwise.

    `novalue` is registered as the filter's default value, so the "nothing
    found" case goes through ``default_or_raise``.
    """

    def __init__(self, selector, yesvalue=True, novalue=False):
        """
        :param selector: selector whose result is tested
        :param yesvalue: value returned when the selection is truthy
        :param novalue: value passed to the parent initializer as `default`
        """
        super(HasElement, self).__init__(selector, default=novalue)
        self.yesvalue = yesvalue

    def filter(self, value):
        """
        :param value: result of the selection
        :return: `yesvalue` for a truthy `value`, otherwise whatever
                 ``default_or_raise`` gives back
        """
        if not value:
            return self.default_or_raise(FilterError('No default value'))
        return self.yesvalue
class ReplaceEntities(CleanText):
    """
    Filter to replace HTML entities like "&eacute;" or "&#x42;" with their
    unicode counterpart ("é" and "B" respectively).
    """

    def filter(self, data):
        """
        Clean `data` with :class:`CleanText`, then decode its HTML entities.

        :param data: value handed to ``CleanText.filter``
        :return: the cleaned text with character references replaced
        """
        txt = super(ReplaceEntities, self).filter(data)
        try:
            # ``HTMLParser.unescape`` was removed in Python 3.9; the standard
            # replacement is ``html.unescape``. The function is imported
            # directly, and locally, so that no module-level name called
            # ``html`` is rebound.
            from html import unescape
        except ImportError:
            # Interpreters without ``html.unescape`` (Python 2): keep using
            # the parser method.
            unescape = HTMLParser().unescape
        return unescape(txt)