browsers.py 38.1 KB
Newer Older
1 2
# -*- coding: utf-8 -*-

3
# Copyright(C) 2012-2014 Laurent Bachelier
4 5 6 7
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
8
# it under the terms of the GNU Lesser General Public License as published by
9 10 11 12 13 14
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
# GNU Lesser General Public License for more details.
16
#
17
# You should have received a copy of the GNU Lesser General Public License
18 19
# along with weboob. If not, see <http://www.gnu.org/licenses/>.

20
from __future__ import absolute_import, print_function
21

22
from collections import OrderedDict
23
from functools import wraps
24
import re
25 26
import pickle
import base64
27
from hashlib import sha256
28
import zlib
Vincent A's avatar
Vincent A committed
29
from functools import reduce
30 31
try:
    from requests.packages import urllib3
32 33
except ImportError:
    import urllib3
34 35
import os
import sys
36
from copy import deepcopy
37
import inspect
38 39
from datetime import datetime, timedelta
from dateutil import parser
40
from threading import Lock
41 42 43

try:
    import requests
44 45
    if int(requests.__version__.split('.')[0]) < 2:
        raise ImportError()
46 47 48
except ImportError:
    raise ImportError('Please install python-requests >= 2.0')

49
from weboob.exceptions import BrowserHTTPSDowngrade, ModuleInstallError, BrowserRedirect, BrowserIncorrectPassword
50

51
from weboob.tools.log import getLogger
52
from weboob.tools.compat import basestring, unicode, urlparse, urljoin, urlencode, parse_qsl
Romain Bignon's avatar
Romain Bignon committed
53
from weboob.tools.json import json
54

55
from .cookies import WeboobCookieJar
56
from .exceptions import HTTPNotFound, ClientError, ServerError
57
from .sessions import FuturesSession
58 59
from .profiles import Firefox
from .pages import NextPage
60
from .url import URL, normalize_url
61 62


63
class Browser(object):
    """
    Simple browser class.
    Act like a browser, and don't try to do too much.

    The behavior is tuned through the class attributes below; the session
    itself is built by :meth:`_setup_session` when the browser is created.
    """

    PROFILE = Firefox()
    """
    Default profile used by browser to navigate on websites.
    """

    TIMEOUT = 10.0
    """
    Default timeout during requests.
    """

    REFRESH_MAX = 0.0
    """
    When handling a Refresh header, the browsers considers it only if the sleep
    time in lesser than this value.
    """

    # NOTE: when VERIFY is a string, __init__ resolves it to a file path
    # through asset().
    VERIFY = True
    """
    Check SSL certificates.
    """

    # Proxies handed to the session; overwritten in __init__ with the `proxy`
    # constructor argument.
    PROXIES = None

    # Value passed as `max_retries` to the session and to its HTTP adapters.
    MAX_RETRIES = 2

    MAX_WORKERS = 10
    """
    Maximum of threads for asynchronous requests.
    """

    ALLOW_REFERRER = True
    """
    Controls the behavior of get_referrer.
    """

    COOKIE_POLICY = None
    """
    Default CookieJar policy.
    Example: weboob.browser.cookies.BlockAllCookies()
    """

110 111 112 113 114 115 116 117 118
    @classmethod
    def asset(cls, localfile):
        """
        Absolute file path for a module local file.
        """
        if os.path.isabs(localfile):
            return localfile
        return os.path.join(os.path.dirname(inspect.getfile(cls)), localfile)

119
    def __init__(self, logger=None, proxy=None, responses_dirname=None, weboob=None):
        """
        Create the browser and its underlying session.

        :param logger: parent logger handed to getLogger()
        :param proxy: proxies configuration, stored in `PROXIES`
        :param responses_dirname: directory where responses are dumped; when
                                  not None, every response is saved
        :type responses_dirname: str or None
        :param weboob: not used by this constructor
        """
        self.logger = getLogger('browser', logger)

        # State used by save_response() to number and place the dumped files.
        self.responses_count_lock = Lock()
        self.responses_count = 1
        self.responses_dirname = responses_dirname

        # A string VERIFY names a file relative to the module.
        verify = self.VERIFY
        if isinstance(verify, basestring):
            self.VERIFY = self.asset(verify)

        self.PROXIES = proxy
        self._setup_session(self.PROFILE)

        # Nothing has been visited yet.
        self.url = None
        self.response = None

133 134 135
    def deinit(self):
        """
        Close the session used by this browser.
        """
        session = self.session
        session.close()

136 137 138
    def set_normalized_url(self, response, **kwargs):
        """
        Response hook replacing `response.url` by its normalized form.

        :param kwargs: ignored, accepted for compatibility with the hook
                       calling convention
        """
        normalized = normalize_url(response.url)
        response.url = normalized

139
    def save_response(self, response, warning=False, **kwargs):
        """
        Dump a response and the request that produced it into
        `responses_dirname`.

        For each response, three files sharing the same base name are written:
        ``<name>-request.txt`` (request line, headers and body),
        ``<name>-response.txt`` (status line and headers) and ``<name>``
        (raw content). A line mapping the URL to the file name is appended
        to ``url_response_match.txt``.

        If `responses_dirname` is None, a temporary directory is created and
        announced on stderr.

        :param response: response to save
        :type response: :class:`requests.Response`

        :param warning: log the saved path at warning level instead of info
        :type warning: bool

        :param kwargs: ignored; accepted so that the method can be registered
                       as a session response hook
        """
        if self.responses_dirname is None:
            import tempfile
            self.responses_dirname = tempfile.mkdtemp(prefix='weboob_session_')
            print('Debug data will be saved in this directory: %s' % self.responses_dirname, file=sys.stderr)
        elif not os.path.isdir(self.responses_dirname):
            os.makedirs(self.responses_dirname)

        import mimetypes
        # get the content-type, remove optional charset part
        mimetype = response.headers.get('Content-Type', '').split(';')[0]
        # due to http://bugs.python.org/issue1043134
        if mimetype == 'text/plain':
            ext = '.txt'
        else:
            # try to get an extension (and avoid adding 'None')
            ext = mimetypes.guess_extension(mimetype, False) or ''

        # Take a unique sequence number; responses may be saved from several threads.
        with self.responses_count_lock:
            counter = self.responses_count
            self.responses_count += 1

        # Keep only letters, digits, '.', '_' and '-' in the file name. The
        # previous class ([^A-z0-9\.-_]) accidentally described the ranges
        # 'A'..'z' and '.'..'_', which let characters such as ':' '<' '>' '\'
        # through while replacing the hyphen.
        basename = urlparse(response.url).path.rpartition('/')[2]
        path = re.sub(r'[^A-Za-z0-9._-]+', '_', basename)[-10:]
        if path.endswith(ext):
            ext = ''
        filename = '%02d-%d%s%s%s' % \
            (counter, response.status_code, '-' if path else '', path, ext)

        response_filepath = os.path.join(self.responses_dirname, filename)

        request = response.request
        with open(response_filepath + '-request.txt', 'w') as f:
            f.write('%s %s\n\n\n' % (request.method, request.url))
            for key, value in request.headers.items():
                f.write('%s: %s\n' % (key, value))
            if request.body is not None:  # separate '' from None
                f.write('\n\n\n%s' % request.body)
        with open(response_filepath + '-response.txt', 'w') as f:
            if hasattr(response.elapsed, 'total_seconds'):
                f.write('Time: %3.3fs\n' % response.elapsed.total_seconds())
            f.write('%s %s\n\n\n' % (response.status_code, response.reason))
            for key, value in response.headers.items():
                f.write('%s: %s\n' % (key, value))

        with open(response_filepath, 'wb') as f:
            f.write(response.content)

        match_filepath = os.path.join(self.responses_dirname, 'url_response_match.txt')
        with open(match_filepath, 'a') as f:
            f.write('# %d %s %s\n' % (response.status_code, response.reason, response.headers.get('Content-Type', '')))
            f.write('%s\t%s\n' % (response.url, filename))

        msg = u'Response saved to %s' % response_filepath
        if warning:
            self.logger.warning(msg)
        else:
            self.logger.info(msg)

197 198 199
    def _create_session(self):
        """
        Instantiate the session object, sized from MAX_WORKERS and MAX_RETRIES.
        """
        options = {
            'max_workers': self.MAX_WORKERS,
            'max_retries': self.MAX_RETRIES,
        }
        return FuturesSession(**options)

200
    def _setup_session(self, profile):
        """
        Build a python-requests session configured for our usage and store it
        in `self.session`.

        :param profile: profile whose setup_session() is applied to the new
                        session
        """
        session = self._create_session()
        session.proxies = self.PROXIES

        session.verify = not self.logger.settings['ssl_insecure'] and self.VERIFY
        if not session.verify:
            try:
                urllib3.disable_warnings()
            except AttributeError:
                # urllib3 is too old, warnings won't be disabled
                pass

        # A max_retries value is mandatory in case a server does not handle
        # keep-alive correctly, like the burp proxy.
        adapter_kwargs = {'max_retries': self.MAX_RETRIES}
        # Grow the connection pool up to MAX_WORKERS when the default is too small.
        if self.MAX_WORKERS > requests.adapters.DEFAULT_POOLSIZE:
            adapter_kwargs['pool_connections'] = self.MAX_WORKERS
            adapter_kwargs['pool_maxsize'] = self.MAX_WORKERS
        for prefix in ('https://', 'http://'):
            session.mount(prefix, requests.adapters.HTTPAdapter(**adapter_kwargs))

        if self.TIMEOUT:
            session.timeout = self.TIMEOUT

        # weboob only can provide proxy and HTTP auth options
        session.trust_env = False

        profile.setup_session(session)

        response_hooks = session.hooks['response']
        response_hooks.append(self.set_normalized_url)
        if self.responses_dirname is not None:
            response_hooks.append(self.save_response)

        self.session = session

        session.cookies = WeboobCookieJar()
        if self.COOKIE_POLICY:
            session.cookies.set_policy(self.COOKIE_POLICY)
242

243 244 245
    def set_profile(self, profile):
        """
        Apply `profile` to the current session.
        """
        session = self.session
        profile.setup_session(session)

246
    def location(self, url, **kwargs):
        """
        Like :meth:`open` but also changes the current URL and response.
        This is the most common method to request web pages.

        Apart from updating `self.response` and `self.url`, it behaves
        exactly as open() does. Asynchronous requests are refused here.
        """
        assert not kwargs.get('is_async'), "Please use open() instead of location() to make asynchronous requests."
        self.response = self.open(url, **kwargs)
        self.url = self.response.url
        return self.response

259 260 261 262 263 264 265
    def open(self, url, referrer=None,
                   allow_redirects=True,
                   stream=None,
                   timeout=None,
                   verify=None,
                   cert=None,
                   proxies=None,
                   data_encoding=None,
                   is_async=False,
                   callback=lambda response: response,
                   **kwargs):
        """
        Make an HTTP request like a browser does:
         * follow redirects (unless disabled)
         * provide referrers (unless disabled)

        Unless a `method` is explicitly provided, the method is guessed by
        :meth:`build_request`: POST when the request carries a non-empty
        `data` or `json`, GET otherwise.

        It is a wrapper around session.request().
        All session.request() options are available.
        You should use location() or open() and not session.request(),
        since it has some interesting additions, which are easily
        individually disabled through the arguments.

        Call this instead of location() if you do not want to "visit" the URL
        (for instance, you are downloading a file).

        When `is_async` is True, open() returns a Future object (see
        concurrent.futures for more details), which can be evaluated with its
        result() method. If any exception is raised while processing request,
        it is caught and re-raised when calling result().

        For example:

        >>> Browser().open('http://google.com', is_async=True).result().text # doctest: +SKIP

        :param url: URL
        :type url: str

        :param data: POST data
        :type data: str or dict or None

        :param referrer: Force referrer. False to disable sending it, None for guessing
        :type referrer: str or False or None

        :param is_async: Process request in a non-blocking way
        :type is_async: bool

        :param callback: Callback to be called when request has finished,
                         with response as its first and only argument
        :type callback: function

        :rtype: :class:`requests.Response`
        """
        # Legacy spelling of the is_async argument.
        if 'async' in kwargs:
            import warnings
            warnings.warn('Please use is_async instead of async.', DeprecationWarning)
            is_async = kwargs.pop('async')

        if isinstance(url, basestring):
            url = normalize_url(url)
        elif isinstance(url, requests.Request):
            url.url = normalize_url(url.url)

        request = self.build_request(url, referrer, data_encoding=data_encoding, **kwargs)
        prepared = self.prepare_request(request)

        if hasattr(prepared, '_cookies'):
            # requests < 2.2 has no _cookies attribute; as those versions do
            # not call extract_cookies_to_jar() either, nothing is lost since
            # we keep our own cookiejar instance.
            prepared._cookies = WeboobCookieJar.from_cookiejar(prepared._cookies)
            if self.COOKIE_POLICY:
                prepared._cookies.set_policy(self.COOKIE_POLICY)

        # Fall back on the browser-wide settings for anything left unset.
        proxies = self.PROXIES if proxies is None else proxies
        if verify is None:
            verify = not self.logger.settings['ssl_insecure'] and self.VERIFY
        timeout = self.TIMEOUT if timeout is None else timeout

        # Post-processing shared by synchronous and asynchronous requests.
        def on_response(future, response):
            if allow_redirects:
                response = self.handle_refresh(response)

            self.raise_for_status(response)
            return callback(response)

        # call python-requests
        return self.session.send(prepared,
                                 allow_redirects=allow_redirects,
                                 stream=stream,
                                 timeout=timeout,
                                 verify=verify,
                                 cert=cert,
                                 proxies=proxies,
                                 callback=on_response,
                                 is_async=is_async)

367 368
    def async_open(self, url, **kwargs):
        """
        Shortcut to open(url, is_async=True).

        Any `async` or `is_async` keyword given by the caller is discarded.
        """
        for flag in ('async', 'is_async'):
            kwargs.pop(flag, None)
        return self.open(url, is_async=True, **kwargs)
376

377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394
    def raise_for_status(self, response):
        """
        Like Response.raise_for_status but will use other classes if needed.

        :raises HTTPNotFound: on a 404 status
        :raises ClientError: on any other 4xx status
        :raises ServerError: on a 5xx status
        """
        status = response.status_code

        if 400 <= status < 500:
            error_cls = HTTPNotFound if status == 404 else ClientError
            raise error_cls('%s Client Error: %s' % (status, response.reason), response=response)

        if 500 <= status < 600:
            raise ServerError('%s Server Error: %s' % (status, response.reason), response=response)

        # in case we did not catch something that should be
        response.raise_for_status()

397
    def build_request(self, url, referrer=None, data_encoding=None, **kwargs):
        """
        Does the same job as open(), but returns a Request without
        submitting it.
        This allows further customization to the Request.

        :param url: URL, or an already built Request (in which case `kwargs`
                    are not applied to it)
        :param referrer: forced referrer, False to send none, None to guess it
        :param data_encoding: encoding applied to unicode `data` values
        :rtype: :class:`requests.Request`
        """
        if isinstance(url, requests.Request):
            req, url = url, url.url
        else:
            req = requests.Request(url=url, **kwargs)

        # guess method
        if req.method is None:
            req.method = 'POST' if (req.data or req.json) else 'GET'

        # convert unicode strings to proper encoding
        if data_encoding:
            if isinstance(req.data, unicode):
                req.data = req.data.encode(data_encoding)
            if isinstance(req.data, dict):
                req.data = OrderedDict(
                    (key, value.encode(data_encoding) if isinstance(value, unicode) else value)
                    for key, value in req.data.items()
                )

        if referrer is None:
            referrer = self.get_referrer(self.url, url)
        if referrer:
            # Yes, it is a misspelling.
            req.headers.setdefault('Referer', referrer)

        return req
430

431 432 433 434 435 436 437 438
    def prepare_request(self, req):
        """
        Turn a Request object into a prepared request using the session.

        This method aims to be overloaded by children classes.
        """
        prepared = self.session.prepare_request(req)
        return prepared

439
    REFRESH_RE = re.compile(r"^(?P<sleep>[\d\.]+)(;\s*url=[\"']?(?P<url>.*?)[\"']?)?$", re.IGNORECASE)

    def handle_refresh(self, response):
        """
        Called by open, to handle Refresh HTTP header.

        The refresh URL is followed only when the announced delay does not
        exceed REFRESH_MAX; otherwise, or when the header cannot be parsed,
        the original response is returned.
        """
        if 'Refresh' not in response.headers:
            return response

        header = response.headers['Refresh']
        m = self.REFRESH_RE.match(header)
        if not m:
            self.logger.warning('Unable to handle refresh "%s"' % header)
            return response

        # XXX perhaps we should not redirect if the refresh url is equal to the current url.
        url = m.group('url') or response.request.url
        sleep = float(m.group('sleep'))

        if sleep <= self.REFRESH_MAX:
            self.logger.debug('Refresh to %s' % url)
            return self.open(url)

        self.logger.debug('Do not refresh to %s because %s > REFRESH_MAX(%s)' % (url, sleep, self.REFRESH_MAX))
        return response
467

468
    def get_referrer(self, oldurl, newurl):
        """
        Get the referrer to send when doing a request.
        If we should not send a referrer, it will return None.

        Reference: https://en.wikipedia.org/wiki/HTTP_referer

        The behavior can be controlled through the ALLOW_REFERRER attribute.
        True always allows the referers
        to be sent, False never, and None only if it is within
        the same domain.

        :param oldurl: Current absolute URL
        :type oldurl: str or None

        :param newurl: Target absolute URL
        :type newurl: str

        :rtype: str or None
        """
        if self.ALLOW_REFERRER is False or oldurl is None:
            return None

        old, new = urlparse(oldurl), urlparse(newurl)

        # Do not leak secure URLs to insecure URLs
        downgrade = old.scheme == 'https' and new.scheme != 'https'
        # Reloading the page. Usually no referrer.
        reloading = oldurl == newurl
        if downgrade or reloading:
            return None

        # Domain-based privacy
        if self.ALLOW_REFERRER is None and old.netloc != new.netloc:
            return None

        return oldurl

505 506 507 508 509 510 511 512 513 514 515 516 517 518 519
    def export_session(self):
        """
        Describe the current session as plain data.

        :returns: dict with the current `url` and the list of `cookies`, each
                  cookie being a dict of its name, value, domain, path,
                  secure flag, httpOnly flag and expirationDate
        :rtype: dict
        """
        def serialize(cookie):
            exported = {}
            for field in ['name', 'value', 'domain', 'path', 'secure']:
                exported[field] = getattr(cookie, field)
            lowered = [key.lower() for key in cookie._rest.keys()]
            exported['httpOnly'] = 'httponly' in lowered
            exported['expirationDate'] = getattr(cookie, 'expires', None)
            return exported

        session_data = {'url': self.url}
        session_data['cookies'] = [serialize(cookie) for cookie in self.session.cookies]
        return session_data

520

521
class UrlNotAllowed(Exception):
    """
    Raised by :class:`DomainBrowser` when `RESTRICT_URL` is set and the
    requested URL is rejected by :meth:`DomainBrowser.url_allowed`.

    The exception is built with the rejected absolute URL as its argument.
    """
526 527


528
class DomainBrowser(Browser):
    """
    A browser that handles relative URLs and can have a base URL (usually a domain).

    For instance self.location('/hello') will get http://weboob.org/hello
    if BASEURL is 'http://weboob.org/'.
    """

    BASEURL = None
    """
    Base URL, e.g. 'http://weboob.org/' or 'https://weboob.org/'
    See absurl().
    """

    RESTRICT_URL = False
    """
    URLs allowed to load.
    This can be used to force SSL (if the BASEURL is SSL) or any other leakage.
    Set to True to allow only URLs starting by the BASEURL.
    Set it to a list of allowed URLs if you have multiple allowed URLs.
    More complex behavior is possible by overloading url_allowed()
    """

551 552 553 554 555
    def __init__(self, baseurl=None, *args, **kwargs):
        """
        :param baseurl: when not None, overrides the class-level BASEURL for
                        this instance
        :type baseurl: str or None

        Remaining arguments are forwarded to :class:`Browser`.
        """
        super(DomainBrowser, self).__init__(*args, **kwargs)
        if baseurl is None:
            return
        self.BASEURL = baseurl

556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573
    def url_allowed(self, url):
        """
        Checks if we are allowed to visit an URL.
        See RESTRICT_URL.

        Everything is allowed when BASEURL is None or RESTRICT_URL is False.

        :param url: Absolute URL
        :type url: str
        :rtype: bool
        """
        unrestricted = self.BASEURL is None or self.RESTRICT_URL is False
        if unrestricted:
            return True
        if self.RESTRICT_URL is True:
            return url.startswith(self.BASEURL)
        # Otherwise RESTRICT_URL is an iterable of allowed URL prefixes.
        return any(url.startswith(prefix) for prefix in self.RESTRICT_URL)

574 575
    def absurl(self, uri, base=None):
        """
        Get the absolute URL, relative to a base URL.
        If base is None, it will try to use the current URL.
        If there is no current URL, it will try to use BASEURL.

        If base is False, it will always try to use the current URL.
        If base is True, it will always try to use BASEURL.

        :param uri: URI to make absolute. It can be already absolute.
        :type uri: str

        :param base: Base absolute URL.
        :type base: str or None or False or True

        :rtype: str
        """
        # Any falsy base means "use the current URL".
        reference = base or self.url
        # No current URL, or BASEURL explicitly requested.
        if reference is None or reference is True:
            reference = self.BASEURL
        return urljoin(reference, uri)
596

597
    def open(self, req, *args, **kwargs):
        """
        Like :meth:`Browser.open` but handles urls without domains, using
        the :attr:`BASEURL` attribute.

        :param req: URL (possibly relative) or Request object
        :raises UrlNotAllowed: when url_allowed() rejects the absolute URL
        """
        is_request = isinstance(req, requests.Request)

        url = self.absurl(req.url if is_request else req)
        if not self.url_allowed(url):
            raise UrlNotAllowed(url)

        # Hand the absolute URL over in the same form the caller used.
        if is_request:
            req.url = url
        else:
            req = url
        return super(DomainBrowser, self).open(req, *args, **kwargs)
613

614
    def go_home(self):
        """
        Go to the "home" page, usually the BASEURL.

        When BASEURL is not set, the root ('/') of the current URL is used.
        """
        home = self.BASEURL or self.absurl('/')
        return self.location(home)
619 620


621
class PagesBrowser(DomainBrowser):
    r"""
    A browser which works with pages and keeps the state of navigation.

    To use it, you have to derive it and to create URL objects as class
    attributes. When open() or location() are called, if the url matches
    one of URL objects, it returns a Page object. In case of location(), it
    stores it in self.page.

    Example:

    >>> from .pages import HTMLPage
    >>> class ListPage(HTMLPage):
    ...     def get_items(self):
    ...         return [el.attrib['id'] for el in self.doc.xpath('//div[@id="items"]/div')]
    ...
    >>> class ItemPage(HTMLPage):
    ...     pass
    ...
    >>> class MyBrowser(PagesBrowser):
    ...     BASEURL = 'http://example.org/'
    ...     list = URL('list-items', ListPage)
    ...     item = URL('item/view/(?P<id>\d+)', ItemPage)
    ...
    >>> MyBrowser().list.stay_or_go().get_items() # doctest: +SKIP
    >>> bool(MyBrowser().list.match('http://example.org/list-items'))
    True
    >>> bool(MyBrowser().list.match('http://example.org/'))
    False
    >>> str(MyBrowser().item.build(id=42))
    'http://example.org/item/view/42'

    You can then use URL instances to go on pages.
    """

    # Replaced in __init__ by an ordered mapping of attribute name to the
    # per-instance copy of each URL object.
    _urls = None

    def __init__(self, *args, **kwargs):
        """
        Collect the URL objects declared on the class and bind private copies
        of them to this instance.

        :param highlight_el: (keyword) stored as `self.highlight_el`,
                             False by default

        Other arguments are forwarded to :class:`DomainBrowser`.
        """
        self.highlight_el = kwargs.pop('highlight_el', False)
        super(PagesBrowser, self).__init__(*args, **kwargs)

        self.page = None

        # exclude properties because they can access other fields not yet defined
        def is_property(attr):
            class_value = getattr(type(self), attr, None)
            return hasattr(class_value, '__get__') or hasattr(class_value, '__set__')

        declared = []
        for attr in dir(self):
            if is_property(attr):
                continue
            value = getattr(self, attr)
            if isinstance(value, URL):
                declared.append((attr, value))

        # Keep the URLs in their declaration order.
        declared = sorted(declared, key=lambda item: item[1]._creation_counter)

        # Work on copies so that instances do not share URL objects.
        self._urls = OrderedDict(deepcopy(declared))
        for name, url in self._urls.items():
            setattr(self, name, url)
            url.browser = self

    def open(self, *args, **kwargs):
        """
        Same method than
        :meth:`weboob.browser.browsers.DomainBrowser.open`, but the
        response contains an attribute `page` if the url matches any
        :class:`URL` object.

        :param page: (keyword) page class used to build `response.page`
                     directly, without trying the URL objects
        :param callback: (keyword) function called with the response once
                         `response.page` has been set; its result is returned
        :raises BrowserHTTPSDowngrade: when no URL object handles the
            response, BASEURL is https and the response URL is not
        """

        callback = kwargs.pop('callback', lambda response: response)
        page_class = kwargs.pop('page', None)

        # Have to define a callback to seamlessly process synchronous and
        # asynchronous requests, see :meth:`Browser.open` and its `is_async`
        # and `callback` params.
        def internal_callback(response):
            # Try to handle the response page with an URL instance.
            response.page = None
            if page_class:
                response.page = page_class(self, response)
                return callback(response)

            for url in self._urls.values():
                response.page = url.handle(response)
                if response.page is not None:
                    self.logger.debug('Handle %s with %s', response.url, response.page.__class__.__name__)
                    break

            if response.page is None:
                regexp = r'^(?P<proto>\w+)://.*'

                proto_response = re.match(regexp, response.url)
                # BASEURL is None by default and may lack a scheme: in both
                # cases there is no base protocol to compare with, so the
                # downgrade check is skipped instead of crashing.
                proto_base = re.match(regexp, self.BASEURL) if self.BASEURL else None
                if proto_response and proto_base:
                    if proto_base.group('proto') == 'https' and proto_response.group('proto') != 'https':
                        raise BrowserHTTPSDowngrade()

                self.logger.debug('Unable to handle %s', response.url)

            return callback(response)

        return super(PagesBrowser, self).open(callback=internal_callback, *args, **kwargs)

    def location(self, *args, **kwargs):
        """
        Same method than
        :meth:`weboob.browser.browsers.Browser.location`, but if the
        url matches any :class:`URL` object, an attribute `page` is added to
        response, and the attribute :attr:`PagesBrowser.page` is set.

        The `on_leave` hook of the previous page is called before the request
        and the `on_load` hook of the new page after it.
        """
        previous_page = self.page
        if previous_page is not None:
            # Call leave hook.
            previous_page.on_leave()

        new_response = self.open(*args, **kwargs)

        self.response = new_response
        self.page = new_response.page
        self.url = new_response.url

        if self.page is not None:
            # Call load hook.
            self.page.on_load()

        # Returns self.response in case on_load recalls location()
        return self.response

    def pagination(self, func, *args, **kwargs):
        r"""
        This helper function can be used to handle pagination pages easily.

        When the called function raises an exception :class:`NextPage`, it goes
        on the wanted page and recall the function.

        :class:`NextPage` constructor can take an url or a Request object.

        >>> from .pages import HTMLPage
        >>> class Page(HTMLPage):
        ...     def iter_values(self):
        ...         for el in self.doc.xpath('//li'):
        ...             yield el.text
        ...         for next in self.doc.xpath('//a'):
        ...             raise NextPage(next.attrib['href'])
        ...
        >>> class Browser(PagesBrowser):
        ...     BASEURL = 'https://people.symlink.me'
        ...     list = URL('/~rom1/projects/weboob/list-(?P<pagenum>\d+).html', Page)
        ...
        >>> b = Browser()
        >>> b.list.go(pagenum=1) # doctest: +ELLIPSIS
        <weboob.browser.browsers.Page object at 0x...>
        >>> list(b.pagination(lambda: b.page.iter_values()))
        ['One', 'Two', 'Three', 'Four']
        """
        finished = False
        while not finished:
            try:
                for item in func(*args, **kwargs):
                    yield item
                # func ran to completion: there is no further page.
                finished = True
            except NextPage as next_page:
                # Move to the requested page, then call func again.
                self.location(next_page.request)


def need_login(func):
    """
    Decorator used to require to be logged to access to this function.

    This decorator can be used on any method whose first argument is a
    browser (typically a :class:`LoginBrowser`). The login is skipped when
    the browser has a true `logged` attribute, or when its current page has
    a true `logged` attribute (e.g., when the page inherits
    :class:`LoggedPage`).

    In all other cases (when the browser isn't on any defined page or
    when the page's `logged` attribute is ``False``), the
    :meth:`LoginBrowser.do_login` method of the browser is called before
    calling `func`.
    """

    @wraps(func)
    def inner(browser, *args, **kwargs):
        if not getattr(browser, 'logged', False):
            page = getattr(browser, 'page', None)
            if page is None or not page.logged:
                browser.do_login()
                if browser.logger.settings.get('export_session'):
                    browser.logger.debug('logged in with session: %s', json.dumps(browser.export_session()))
        return func(browser, *args, **kwargs)

    return inner


class LoginBrowser(PagesBrowser):
    """
    A browser which supports login.

    The credentials given to the constructor are stored as the
    `username` and `password` attributes, for use by :meth:`do_login`.
    """

    def __init__(self, username, password, *args, **kwargs):
        """
        :param username: login identifier, stored as ``self.username``
        :param password: login secret, stored as ``self.password``

        Remaining arguments are passed to the parent constructor.
        """
        super(LoginBrowser, self).__init__(*args, **kwargs)
        self.username = username
        self.password = password

    def do_login(self):
        """
        Abstract method to implement to login on website.

        It is called when a login is needed.

        :raises: :class:`NotImplementedError` unless overridden
        """
        raise NotImplementedError()

    def do_logout(self):
        """
        Logout from website.

        By default, simply clears the cookies.
        """
        self.session.cookies.clear()


class StatesMixin(object):
    """
    Mixin to store states of browser.

    The state is a plain dict holding the current page URL, the session
    cookies (pickled, compressed and base64-encoded) and every attribute
    listed in `__states__`. The class this mixin is combined with must
    provide the `session`, `logger` and `location()` members used below.
    """

    __states__ = []
    """
    Saved state variables.
    """

    STATE_DURATION = None
    """
    In minutes, used to set an expiration datetime object of the state.
    """

    def locate_browser(self, state):
        """
        Go back to the URL stored in `state`.

        HTTP errors and redirection loops are deliberately ignored, so a
        stale URL does not prevent the rest of the state from being used.

        :param state: state dict containing an ``'url'`` key
        :type state: :class:`dict`
        """
        try:
            self.location(state['url'])
        except (requests.exceptions.HTTPError, requests.exceptions.TooManyRedirects):
            pass

    def load_state(self, state):
        """
        Restore cookies, saved attributes and location from `state`.

        Nothing is restored when the state carries an ``'expire'`` date
        which is in the past.

        :param state: dict previously returned by :meth:`dump_state`
        :type state: :class:`dict`
        """
        if 'expire' in state and parser.parse(state['expire']) < datetime.now():
            self.logger.info('State expired, not reloading it from storage')
            return

        if 'cookies' in state:
            try:
                # SECURITY: unpickling can execute arbitrary code, the
                # state must only ever come from a trusted storage.
                cookies = pickle.loads(zlib.decompress(base64.b64decode(state['cookies'])))
            except (TypeError, ValueError, EOFError, zlib.error, pickle.UnpicklingError,
                    AttributeError, ImportError, IndexError):
                # Any of these may be raised by a truncated or corrupted
                # blob; keep the current cookies in that case.
                self.logger.error('Unable to reload cookies from storage')
            else:
                self.session.cookies = cookies
                self.logger.info('Reloaded cookies from storage')

        for attrname in self.__states__:
            if attrname in state:
                setattr(self, attrname, state[attrname])

        if 'url' in state:
            self.locate_browser(state)

    def dump_state(self):
        """
        Build a dict describing the current state of the browser.

        :returns: dict with the ``'cookies'`` key, the ``'url'`` key when
                  the browser is on a page, every attribute of
                  `__states__` which is set, and the ``'expire'`` key
                  when `STATE_DURATION` is defined
        :rtype: :class:`dict`
        """
        state = {}
        if hasattr(self, 'page') and self.page:
            state['url'] = self.page.url

        state['cookies'] = base64.b64encode(zlib.compress(pickle.dumps(self.session.cookies, -1))).decode('ascii')

        for attrname in self.__states__:
            try:
                state[attrname] = getattr(self, attrname)
            except AttributeError:
                # Attribute not set yet: simply leave it out of the state.
                pass

        if self.STATE_DURATION is not None:
            state['expire'] = unicode((datetime.now() + timedelta(minutes=self.STATE_DURATION)).replace(microsecond=0))

        self.logger.info('Stored cookies into storage')
        return state


class APIBrowser(DomainBrowser):
    """
    A browser for API websites.
    """

    def build_request(self, *args, **kwargs):
        """
        Build a request whose body and content type are JSON.

        :param data: if specified, it is serialized with ``json.dumps``
        :param headers: if specified, these headers are sent along with
                        the forced "Content-Type: application/json"
        :type headers: :class:`dict`
        """
        if 'data' in kwargs:
            kwargs['data'] = json.dumps(kwargs['data'])

        # Work on a copy: the caller's dict must not be modified, and an
        # explicit ``headers=None`` is treated like no headers at all.
        headers = kwargs.get('headers')
        headers = headers.copy() if headers else {}
        headers['Content-Type'] = 'application/json'
        kwargs['headers'] = headers

        return super(APIBrowser, self).build_request(*args, **kwargs)

    def open(self, *args, **kwargs):
        """
        Do a JSON request.

        The "Content-Type" header is always set to "application/json".

        :param data: if specified, format as JSON and send as request body
        :type data: :class:`dict`
        :param headers: if specified, add these headers to the request
        :type headers: :class:`dict`
        """
        return super(APIBrowser, self).open(*args, **kwargs)

    def request(self, *args, **kwargs):
        """
        Do a JSON request and parse the response.

        :returns: a dict containing the parsed JSON server response
        :rtype: :class:`dict`
        """
        return self.open(*args, **kwargs).json()


class AbstractBrowserMissingParentError(Exception):
    """
    Raised by :class:`AbstractBrowser` when PARENT is not defined or when
    the parent browser class can not be loaded.
    """


class AbstractBrowser(Browser):
    """ AbstractBrowser allows inheritance of a browser defined in another module.

    Websites can share many pages and code base. This class allows to load a browser
    provided by another module and to build our own browser on top of it (like standard
    python inheritance). Weboob will install and download the PARENT module for you.

    PARENT is a mandatory attribute, it's the name of the module providing the parent Browser

    PARENT_ATTR is an optional attribute used when the parent module does not have only one
    browser defined as BROWSER class attribute: you can customize the path of the object to load.

    Note that you must pass a valid weboob instance as the ``weboob`` keyword argument of
    the constructor.
    """
    PARENT = None
    PARENT_ATTR = None

    def __new__(cls, *args, **kwargs):
        """
        Load the parent browser class and make it the base class of `cls`.

        :param weboob: (keyword argument, mandatory) object whose
                       ``load_or_install_module`` method loads PARENT
        :raises: :class:`KeyError` if the ``weboob`` keyword is missing
        :raises: :class:`AbstractBrowserMissingParentError` if PARENT is
                 not defined or the parent class resolves to ``None``
        :raises: :class:`ModuleInstallError` if the PARENT module can not
                 be installed
        """
        weboob = kwargs['weboob']

        if cls.PARENT is None:
            raise AbstractBrowserMissingParentError("PARENT is not defined for browser %s" % cls)

        try:
            module = weboob.load_or_install_module(cls.PARENT)
        except ModuleInstallError as err:
            raise ModuleInstallError("This module depends on %s module but %s's installation failed with: %s" % (cls.PARENT, cls.PARENT, err))

        if cls.PARENT_ATTR is None:
            parent = module.klass.BROWSER
        else:
            # PARENT_ATTR is a dotted path resolved attribute by attribute
            # starting from the loaded module object.
            parent = reduce(getattr, cls.PARENT_ATTR.split('.'), module)

        if parent is None:
            raise AbstractBrowserMissingParentError("Failed to load parent class")

        # Rewrites the bases of the class itself (not of the instance), so
        # the parent's attributes and methods are found by normal lookup.
        cls.__bases__ = (parent,)
        return object.__new__(cls)


class OAuth2Mixin(StatesMixin):
    """
    Mixin implementing an OAuth2 authorization-code login.

    The token, its type, its expiration date and the refresh token are
    added to the saved states so they survive between sessions.
    """

    AUTHORIZATION_URI = None
    ACCESS_TOKEN_URI = None
    SCOPE = ''

    client_id = None
    client_secret = None
    redirect_uri = None
    access_token = None
    access_token_expire = None
    auth_uri = None
    token_type = None
    refresh_token = None

    def __init__(self, *args, **kwargs):
        super(OAuth2Mixin, self).__init__(*args, **kwargs)
        # Build a new sequence of the same type instead of using ``+=``:
        # when __states__ is a list defined at class level, ``+=`` would
        # extend that shared list in place on every instantiation.
        states = self.__states__
        self.__states__ = states + type(states)(
            ('access_token', 'access_token_expire', 'refresh_token', 'token_type'))

    def build_request(self, *args, **kwargs):
        """
        Add the "Authorization" header when an access token is known.
        """
        # Copy so the dict given by the caller is left untouched.
        headers = kwargs.get('headers')
        headers = headers.copy() if headers else {}
        if self.access_token:
            headers['Authorization'] = '{} {}'.format(self.token_type, self.access_token)
        kwargs['headers'] = headers
        return super(OAuth2Mixin, self).build_request(*args, **kwargs)

    def dump_state(self):
        """
        Dump the state with the token expiration date converted to text.

        Only the dumped value is converted: ``self.access_token_expire``
        is left as it is, so `logged` can still compare it afterwards.
        """
        state = super(OAuth2Mixin, self).dump_state()
        if 'access_token_expire' in state:
            expire = state['access_token_expire']
            state['access_token_expire'] = unicode(expire) if expire else None
        return state

    def load_state(self, state):
        """
        Load the state and parse the token expiration date back.
        """
        super(OAuth2Mixin, self).load_state(state)
        expire = self.access_token_expire
        if not expire:
            expire = None
        elif not isinstance(expire, datetime):
            # Text coming from dump_state(); a datetime is kept as is
            # (e.g. when the state was expired and nothing was loaded).
            expire = parser.parse(expire)
        self.access_token_expire = expire

    @property
    def logged(self):
        """
        ``True`` when an access token is set and is not expired yet.

        An unknown expiration date is treated as not logged.
        """
        return (self.access_token is not None
                and self.access_token_expire is not None
                and self.access_token_expire > datetime.now())

    def do_login(self):
        """
        Obtain an access token.

        Uses the refresh token if there is one, else exchanges the code
        found in `auth_uri` if it is set, else requests an authorization.
        """
        if self.refresh_token:
            self.use_refresh_token()
        elif self.auth_uri:
            self.request_access_token(self.auth_uri)
        else:
            self.request_authorization()

    def build_authorization_parameters(self):
        """
        :returns: query parameters added to AUTHORIZATION_URI
        :rtype: :class:`dict`
        """
        return {'redirect_uri':    self.redirect_uri,
                'scope':           self.SCOPE,
                'client_id':       self.client_id,
                'response_type':   'code',
               }

    def build_authorization_uri(self):
        """
        :returns: AUTHORIZATION_URI with its query string updated with
                  the values of :meth:`build_authorization_parameters`
        """
        p = urlparse(self.AUTHORIZATION_URI)
        q = dict(parse_qsl(p.query))
        q.update(self.build_authorization_parameters())
        return p._replace(query=urlencode(q)).geturl()

    def request_authorization(self):
        """
        :raises: :class:`BrowserRedirect` with the authorization URI
        """
        self.logger.info('request authorization')
        raise BrowserRedirect(self.build_authorization_uri())

    def build_access_token_parameters(self, values):
        """
        :param values: parameters received on the redirect URI; the
                       ``'code'`` key is mandatory
        :type values: :class:`dict`
        :returns: parameters sent to ACCESS_TOKEN_URI
        :rtype: :class:`dict`
        """
        return {'code':             values['code'],
                'grant_type':       'authorization_code',
                'redirect_uri':     self.redirect_uri,
                'client_id':        self.client_id,
                'client_secret':    self.client_secret,
                }

    def do_token_request(self, data):
        """
        Send `data` to ACCESS_TOKEN_URI and return the response.
        """
        return self.open(self.ACCESS_TOKEN_URI, data=data)

    def request_access_token(self, auth_uri):
        """
        Exchange an authorization code against an access token.

        :param auth_uri: either a dict of parameters, or an URI whose
                         query string contains them
        :raises: :class:`BrowserIncorrectPassword` when the token request
                 fails with a :class:`ClientError`
        """
        self.logger.info('requesting access token')

        if isinstance(auth_uri, dict):
            values = auth_uri
        else:
            values = dict(parse_qsl(urlparse(auth_uri).query))
        data = self.build_access_token_parameters(values)
        try:
            auth_response = self.do_token_request(data).json()
        except ClientError:
            raise BrowserIncorrectPassword()

        self.update_token(auth_response)

    def use_refresh_token(self):
        """
        Request a new access token with the stored refresh token.

        :raises: :class:`BrowserIncorrectPassword` when the token request
                 fails with a :class:`ClientError`
        """
        self.logger.info('refreshing token')

        data = {'grant_type':       'refresh_token',
                'refresh_token':    self.refresh_token,
               }
        try:
            auth_response = self.do_token_request(data).json()
        except ClientError:
            raise BrowserIncorrectPassword()

        self.update_token(auth_response)

    def update_token(self, auth_response):
        """
        Store the token values found in a token endpoint response.

        :param auth_response: parsed JSON response; ``'token_type'``,
                              ``'access_token'`` and ``'expires_in'`` are
                              mandatory, ``'refresh_token'`` is optional
        :type auth_response: :class:`dict`
        """
        self.token_type = auth_response['token_type'].capitalize() # don't know yet if this is a good idea, but required by bnpstet
        if 'refresh_token' in auth_response:
            self.refresh_token = auth_response['refresh_token']
        self.access_token = auth_response['access_token']
        self.access_token_expire = datetime.now() + timedelta(seconds=int(auth_response['expires_in']))


1081 1082 1083 1084 1085 1086 1087 1088 1089
class OAuth2PKCEMixin(OAuth2Mixin):
    """
    OAuth2 mixin using PKCE (Proof Key for Code Exchange).

    A verifier and its S256 challenge are generated for each instance and
    added to the saved states.
    """

    def __init__(self, *args, **kwargs):
        super(OAuth2PKCEMixin, self).__init__(*args, **kwargs)
        # New sequence of the same type rather than ``+=``, which would
        # extend a list shared with other instances in place.
        states = self.__states__
        self.__states__ = states + type(states)(('pkce_verifier', 'pkce_challenge'))
        self.pkce_verifier = self.code_verifier()
        self.pkce_challenge = self.code_challenge(self.pkce_verifier)

    # PKCE (Proof Key for Code Exchange) standard protocol methods:
    def code_verifier(self, bytes_number=64):
        """
        :param bytes_number: number of random bytes to encode
        :returns: URL-safe base64 text, without padding, of random bytes
        """
        return base64.urlsafe_b64encode(os.urandom(bytes_number)).rstrip(b'=').decode('ascii')

    def code_challenge(self, verifier):
        """
        :param verifier: value returned by :meth:`code_verifier`
        :returns: URL-safe base64 text, without padding, of the SHA-256
                  digest of `verifier`
        """
        # hashlib only accepts bytes, while code_verifier() returns text.
        if not isinstance(verifier, bytes):
            verifier = verifier.encode('ascii')
        digest = sha256(verifier).digest()
        return base64.urlsafe_b64encode(digest).rstrip(b'=').decode('ascii')

    def build_authorization_parameters(self):
        """
        :returns: query parameters added to AUTHORIZATION_URI
        :rtype: :class:`dict`
        """
        return {'redirect_uri':    self.redirect_uri,
                'code_challenge_method': 'S256',
                'code_challenge':  self.pkce_challenge,
                'client_id':       self.client_id
               }

    def build_access_token_parameters(self, values):
        """
        :param values: parameters received on the redirect URI; the
                       ``'code'`` key is mandatory
        :type values: :class:`dict`
        :returns: parameters sent to ACCESS_TOKEN_URI, including the
                  PKCE verifier
        :rtype: :class:`dict`
        """
        return {'code':             values['code'],
                'grant_type':       'authorization_code',
                'code_verifier':    self.pkce_verifier,
                'redirect_uri':     self.redirect_uri,
                'client_id':        self.client_id,
                'client_secret':    self.client_secret,
                }