Commit 0200c006 authored by Romain Bignon's avatar Romain Bignon

remove weboob.deprecated

parent c2e858bc
Pipeline #1908 passed with stages
in 15 minutes and 3 seconds
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Christophe Benz
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.deprecated.browser.browser import BrowserIncorrectPassword, BrowserBanned, \
BrowserUnavailable, BrowserRetry, \
BrowserHTTPNotFound, BrowserHTTPError, \
Page, Browser, BrokenPageError, \
StandardBrowser, BrowserPasswordExpired, \
BrowserForbidden, StateBrowser
__all__ = ['BrowserIncorrectPassword', 'BrowserPasswordExpired', 'BrowserBanned',
'BrowserUnavailable', 'BrowserRetry', 'BrowserHTTPNotFound', 'BrowserHTTPError',
'Page', 'Browser', 'BrokenPageError', 'StandardBrowser', 'BrowserForbidden',
'StateBrowser']
This diff is collapsed.
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Christophe Benz, Laurent Bachelier
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
__all__ = ['check_url', 'id2url']
import re
from functools import wraps
from urlparse import urlsplit
class check_url(object):
    """
    Decorator checking the first positional argument of the wrapped method
    against a regular expression (given as str, without the ^$ delimiters
    which are automatically added).

    If the argument does not match, the wrapped function is not called and
    None is returned instead.
    """
    def __init__(self, regexp):
        # Anchor the pattern so the whole argument must match.
        self.regexp = re.compile('^%s$' % regexp)

    def __call__(self, func):
        # functools.wraps keeps the decorated function's __name__/__doc__,
        # which the bare inner function previously discarded.
        @wraps(func)
        def wrapper(funcself, *args, **kwargs):
            if self.regexp.match(args[0]):
                return func(funcself, *args, **kwargs)
            return None
        return wrapper
def id2url(id2url):
    """
    Decorator converting the first argument of a method from an id to an URL.

    When the first argument is not already an URL, the given ``id2url``
    function converts it to one; if that conversion yields None (invalid
    id), the decorated method is not called and None is returned.

    When the method's class defines a non-empty DOMAIN attribute, URLs are
    additionally checked against it: a non-matching domain also makes the
    decorated method return None without being called.
    """
    def wrapper(func):
        def inner(self, *args, **kwargs):
            first = unicode(args[0])
            if first.startswith('http://') or first.startswith('https://'):
                netloc = urlsplit(first).netloc
                # Accept the exact DOMAIN or any of its subdomains.
                if self.DOMAIN and netloc != self.DOMAIN and not netloc.endswith('.' + self.DOMAIN):
                    return None
                url = first
            else:
                url = id2url(first)
                if url is None:
                    return None
            return func(self, url, *args[1:], **kwargs)
        return inner
    return wrapper
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from __future__ import print_function
try:
import sqlite3 as sqlite
except ImportError as e:
from pysqlite2 import dbapi2 as sqlite
from mechanize import CookieJar, Cookie
__all__ = ['FirefoxCookieJar']
class FirefoxCookieJar(CookieJar):
    """
    CookieJar backed by a Firefox ``cookies.sqlite`` database.

    Cookies whose host contains ``domain`` are loaded from and saved to the
    ``moz_cookies`` table of the given sqlite file.
    """
    def __init__(self, domain, sqlite_file=None, policy=None):
        CookieJar.__init__(self, policy)
        self.domain = domain
        self.sqlite_file = sqlite_file

    def __connect(self):
        # Returns an open sqlite connection, or None when the file cannot
        # be opened (the error is reported on stdout).
        try:
            db = sqlite.connect(database=self.sqlite_file, timeout=10.0)
        except sqlite.OperationalError as err:
            print('Unable to open %s database: %s' % (self.sqlite_file, err))
            return None
        return db

    def load(self):
        """Load the cookies whose host matches self.domain into the jar."""
        db = self.__connect()
        if not db:
            return
        try:
            # The LIKE pattern is passed as a bound parameter instead of
            # being interpolated into the SQL text, avoiding quoting and
            # injection problems with unusual domain strings.
            cookies = db.execute("""SELECT host, path, name, value, expiry, lastAccessed, isSecure
                                    FROM moz_cookies
                                    WHERE host LIKE ?""",
                                 ('%' + self.domain + '%',))
            for entry in cookies:
                domain = entry[0]
                initial_dot = domain.startswith(".")
                domain_specified = initial_dot
                path = entry[1]
                name = entry[2]
                value = entry[3]
                expires = entry[4]
                # entry[5] (lastAccessed) is intentionally unused.
                secure = entry[6]
                discard = False
                c = Cookie(0, name, value,
                           None, False,
                           domain, domain_specified, initial_dot,
                           path, False,
                           secure,
                           expires,
                           discard,
                           None,
                           None,
                           {})
                self.set_cookie(c)
        finally:
            # The connection was previously leaked; always release it.
            db.close()

    def save(self):
        """Replace the database rows matching self.domain with the jar's cookies."""
        db = self.__connect()
        if not db:
            return
        try:
            db.execute("DELETE FROM moz_cookies WHERE host LIKE ?",
                       ('%' + self.domain + '%',))
            for cookie in self:
                secure = 1 if cookie.secure else 0
                expires = cookie.expires if cookie.expires is not None else 0
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas cookielib regards it as a
                    # cookie with no value.
                    name = ""
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                # XXX ugly hack to keep this cookie
                if name == 'PHPSESSID':
                    expires = 1854242393
                db.execute("""INSERT INTO moz_cookies (host, path, name, value, expiry, isSecure)
                              VALUES (?, ?, ?, ?, ?, ?)""",
                           (cookie.domain, cookie.path, name, value, int(expires), int(secure)))
            db.commit()
        finally:
            db.close()
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Christophe Benz, Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import logging
__all__ = ['get_parser', 'NoParserFound']
class NoParserFound(Exception):
    """Raised by get_parser() when none of the requested parsers could be loaded."""
    pass
def load_lxml():
    """Return the lxml HTML parser class; import deferred until selection."""
    from .lxmlparser import LxmlHtmlParser
    return LxmlHtmlParser
def load_lxmlsoup():
    """Return the lxml soupparser-based class; import deferred until selection."""
    from .lxmlsoupparser import LxmlSoupParser
    return LxmlSoupParser
def load_xml():
    """Return the lxml XML parser class; import deferred until selection."""
    from .lxmlparser import LxmlXmlParser
    return LxmlXmlParser
def load_json():
    """Return the JSON parser class; import deferred until selection."""
    # This parser doesn't read HTML, don't include it in the
    # preference_order default value below.
    from .jsonparser import JsonParser
    return JsonParser
def load_csv():
    """Return the CSV parser class; import deferred until selection."""
    # This parser doesn't read HTML, don't include it in the
    # preference_order default value below.
    from .csvparser import CsvParser
    return CsvParser
def load_raw():
    """Return the raw (pass-through) parser class; import deferred until selection."""
    # This parser doesn't read HTML, don't include it in the
    # preference_order default value below.
    from .iparser import RawParser
    return RawParser
def get_parser(preference_order=('lxml', 'lxmlsoup')):
    """
    Get a parser from a preference order list.

    Return a parser implementing IParser.

    :param preference_order: parser names to try, most preferred first;
                             a single name (str) is also accepted
    :raises NoParserFound: if none of the listed parsers could be loaded
    """
    if not isinstance(preference_order, (tuple, list)):
        preference_order = [preference_order]
    for kind in preference_order:
        # Single globals() lookup instead of a membership test followed by
        # a second lookup.
        loader = globals().get('load_%s' % kind)
        if loader is None:
            # Unknown parser name: skip it silently.
            continue
        try:
            return loader()
        except ImportError:
            # Lazy %-style args: the message is only built if debug is on.
            logging.debug('%s is not installed.', kind)
    raise NoParserFound("No parser found (%s)" % ','.join(preference_order))
# -*- coding: utf-8 -*-
# Copyright(C) 2013 Laurent Bachelier
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import csv
from weboob.tools.log import getLogger
from weboob.tools.compat import basestring, unicode
from .iparser import IParser
class Csv(object):
    """
    Result of a CSV parse.

    ``header`` holds the first row when the parser treats it as a header,
    ``rows`` holds the raw rows, and ``drows`` holds the same rows as
    dictionaries keyed by header title.
    """
    def __init__(self):
        # No header until the parser decides the first row is one.
        self.header = None
        # Raw rows and their dict counterparts start out empty.
        self.rows = []
        self.drows = []
class CsvParser(IParser):
    """
    CSV Parser.

    Since CSV files are not normalized, this parser is intended to be derived.
    """
    DIALECT = 'excel'
    FMTPARAMS = {}

    # If True, the first line is considered a header: the rows are then
    # also available as dictionaries (Csv.drows) keyed by header title.
    # (This was previously a stray string literal in the class body, which
    # attached to nothing at runtime.)
    HEADER = False

    def parse(self, data, encoding=None):
        """
        Parse an iterable of CSV lines into a Csv result object.

        :param data: iterable of CSV lines (e.g. an open file)
        :param encoding: if set, cells are decoded to unicode with it
        :return: a Csv instance
        """
        reader = csv.reader(data, dialect=self.DIALECT, **self.FMTPARAMS)
        c = Csv()
        try:
            for row in reader:
                row = self.decode_row(row, encoding)
                if c.header is None and self.HEADER:
                    c.header = row
                else:
                    c.rows.append(row)
                    if c.header:
                        drow = {}
                        for i, cell in enumerate(row):
                            drow[c.header[i]] = cell
                        c.drows.append(drow)
        except csv.Error as error:
            # If there are errors in CSV, for example the file is truncated, do
            # not crash as there already are lines parsed.
            logger = getLogger('csv')
            logger.warning('Error during parse of CSV: %s', error)
        return c

    def decode_row(self, row, encoding):
        # Decode each cell with the given encoding; leave the row untouched
        # when no encoding is known.
        if encoding:
            return [unicode(cell, encoding) for cell in row]
        else:
            return row

    def tostring(self, element):
        # Coerce non-string cells so callers always get text back.
        if not isinstance(element, basestring):
            return unicode(element)
        return element
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import re
__all__ = ['IParser', 'RawParser']
class IParser(object):
    """
    Interface shared by all document parsers.

    Concrete parsers must implement parse() and tostring();
    tocleanstring() and strip() are generic helpers built on top of them.
    """
    def parse(self, data, encoding=None):
        """
        Parse a HTML document with a specific encoding to get a tree.

        @param data [str] HTML document
        @param encoding [str] encoding to use
        @return an object with the structured document
        """
        raise NotImplementedError()

    def tostring(self, elem):
        """
        Get HTML string from an element.
        """
        raise NotImplementedError()

    def tocleanstring(self, elem):
        """
        Get a clean string from an element.
        """
        return self.strip(self.tostring(elem))

    def strip(self, data):
        """
        Strip a HTML string.
        """
        # Every tag becomes a single space, then leading/trailing
        # whitespace is trimmed.
        return re.sub(r'<.*?>', ' ', data).strip()
class RawParser(IParser):
    """Parser doing no parsing at all: documents and elements are raw strings."""
    def parse(self, data, encoding=None):
        # `encoding` is accepted for interface compatibility but ignored.
        return data.read()
    def tostring(self, elem):
        # Elements are already strings; return them unchanged.
        return elem
# -*- coding: utf-8 -*-
# Copyright(C) 2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.json import json
from .iparser import IParser
__all__ = ['JsonParser']
class JsonParser(IParser):
    """
    Json parser.
    """
    def parse(self, data, encoding=None):
        # NOTE(review): json.load's `encoding` argument exists only on
        # Python 2 (it was removed in Python 3) — confirm this module is
        # used under Python 2 only.
        return json.load(data, encoding=encoding)
    def tostring(self, element):
        # Serialize the parsed structure back to a JSON string.
        return json.dumps(element)
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Christophe Benz
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import re
import lxml.html as html
import lxml.etree as etree
from .iparser import IParser
from ..browser import BrokenPageError
__all__ = ['LxmlHtmlParser', 'LxmlXmlParser']
class LxmlParser(IParser):
    """
    Base class for parsers using lxml; subclasses choose the lxml
    submodule (html or etree) via ``self.module``.

    Note that it is not available on every systems.
    """
    def get_parser(self, encoding=None):
        # Bug fix: `self` was missing from the signature, so calling this
        # stub on an instance raised TypeError. Subclasses override it to
        # build a concrete lxml parser; the base returns None, letting
        # self.module.parse() pick its default parser.
        pass

    def parse(self, data, encoding=None):
        if encoding is None:
            parser = None
        else:
            parser = self.get_parser(encoding=encoding)
        return self.module.parse(data, parser)

    def tostring(self, element):
        return self.module.tostring(element, encoding=unicode)

    def tocleanstring(self, element):
        # Gather the element's text fragments, then collapse runs of
        # whitespace into single spaces.
        txt = [txt.strip() for txt in element.itertext()]
        txt = u' '.join(txt)              # 'foo   bar'
        txt = re.sub(r'\s+', ' ', txt)    # 'foo bar' (raw string: '\s' was unescaped)
        return txt.strip()

    def strip(self, s):
        doc = self.module.fromstring(s)  # parse html/xml string
        return self.tocleanstring(doc)

    @classmethod
    def select(cls, element, selector, nb=None, method='cssselect', **kwargs):
        """
        Select one or many elements from an element, using lxml cssselect by default.

        Raises :class:`weboob.deprecated.browser.browser.BrokenPageError` if not found.

        :param element: element on which to apply selector
        :type element: object
        :param selector: CSS or XPath expression
        :type selector: str
        :param method: (cssselect|xpath)
        :type method: str
        :param nb: number of elements expected to be found. Use None for
                   undefined number, and 'many' for 1 to infinite
        :type nb: :class:`int` or :class:`str`
        :rtype: Element
        """
        if method == 'cssselect':
            results = element.cssselect(selector, **kwargs)
        elif method == 'xpath':
            results = element.xpath(selector, **kwargs)
        else:
            raise NotImplementedError('Only the cssselect and xpath methods are supported')
        if nb is None:
            return results
        elif isinstance(nb, basestring) and nb == 'many':
            if results is None or len(results) == 0:
                raise BrokenPageError('Element not found with selector "%s"' % selector)
            elif len(results) == 1:
                raise BrokenPageError('Only one element found with selector "%s"' % selector)
            else:
                return results
        elif isinstance(nb, int) and nb > 0:
            if results is None:
                raise BrokenPageError('Element not found with selector "%s"' % selector)
            elif len(results) < nb:
                raise BrokenPageError('Not enough elements found (%d expected) with selector "%s"' % (nb, selector))
            else:
                # With nb == 1 callers expect the element itself, not a list.
                return results[0] if nb == 1 else results
        else:
            raise Exception('Unhandled value for kwarg "nb": %s' % nb)
class LxmlHtmlParser(LxmlParser):
    """
    HTML parser using lxml.html.
    Note that it is not available on every systems.
    """
    def __init__(self, *args, **kwargs):
        # Selects lxml.html for the inherited parse/tostring helpers;
        # extra arguments are accepted for interface compatibility but unused.
        self.module = html
    def get_parser(self, encoding=None):
        # Concrete parser object handed to lxml.html.parse().
        return html.HTMLParser(encoding=encoding)
class LxmlXmlParser(LxmlParser):
    """
    XML parser using lxml.etree.
    Note that it is not available on every systems.
    """
    def __init__(self, *args, **kwargs):
        # Selects lxml.etree for the inherited parse/tostring helpers;
        # extra arguments are accepted for interface compatibility but unused.
        self.module = etree
    def get_parser(self, encoding=None):
        # strip_cdata=False keeps CDATA sections verbatim in the tree.
        return etree.XMLParser(encoding=encoding, strip_cdata=False)
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Christophe Benz
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import lxml.html
import lxml.html.soupparser
from .iparser import IParser
__all__ = ['LxmlSoupParser']
class LxmlSoupParser(IParser):
    """
    Parser using lxml elementsoup.
    Note that it is not available on every systems.
    """
    def parse(self, data, encoding=None):
        # NOTE(review): `encoding` is ignored here; soupparser appears to
        # handle detection itself — confirm before relying on it.
        return lxml.html.soupparser.parse(data)
    def tostring(self, element):
        return lxml.html.tostring(element, encoding=unicode)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment