# pages.py

# -*- coding: utf-8 -*-

# Copyright(C) 2011  Romain Bignon
#
# This file is part of a woob module.
#
# This woob module is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This woob module is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this woob module. If not, see <http://www.gnu.org/licenses/>.

from woob.browser.elements import ItemElement, ListElement, method
from woob.browser.pages import HTMLPage, pagination
from woob.browser.filters.standard import CleanText, Regexp, Env, Duration, DateTime
from woob.browser.filters.html import Link

from woob.capabilities.base import NotAvailable
from woob.capabilities.video import BaseVideo
from woob.capabilities.image import Thumbnail

from woob.exceptions import ParseError
from woob.tools.json import json

from datetime import timedelta
import re


def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension of a media URL.

    The query string (everything from the first ``?``) is ignored and the
    text following the last dot of the remaining part is used as the
    candidate extension.

    :param url: media URL, or None
    :param default_ext: value returned when no extension can be determined
    :return: the guessed extension without its leading dot, or *default_ext*
    """
    if url is None:
        return default_ext

    path = url.partition('?')[0]
    # Without a dot, rpartition() would hand back the whole string, which
    # must not be mistaken for an extension.
    if '.' not in path:
        return default_ext

    guess = path.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess

    # Cope with a trailing slash after the extension, as in
    # http://example.com/foo/bar.mp4/?download
    trimmed = guess.rstrip('/')
    if trimmed in ('mp4', 'm3u8'):
        return trimmed
    return default_ext


class IndexPage(HTMLPage):
    """Listing page from which video entries are extracted."""

    @pagination
    @method
    class iter_videos(ListElement):
        """Build one BaseVideo per ``div`` carrying a ``data-video-id`` attribute."""

        # One item is built for every element matched by this expression.
        item_xpath = '//div[@data-video-id]'
        # Anchor titled "suivant" (French for "next").
        next_page = Link('//a[@title="suivant"]')

        class item(ItemElement):
            """Single video entry of the listing."""

            klass = BaseVideo

            def validate(self, obj):
                # The id itself is returned, so an empty id is a falsy result.
                return obj.id

            # All paths below are relative to the matched item element.
            obj_id = CleanText('./div/@data-playable')
            obj_title = CleanText('./div[@class="media-block"]/h3')
            obj_author = CleanText('./div[@class="media-block"]/div/span/a')
            obj_duration = Duration(
                CleanText('./div/a/div[has-class("badge--duration")]'),
                default=NotAvailable,
            )

            def obj_thumbnail(self):
                """Build the Thumbnail from the image's ``data-src`` attribute."""
                image_src = CleanText('./div/a/img/@data-src')(self)
                thumb = Thumbnail(image_src)
                thumb.url = image_src
                return thumb


class VideoPage(HTMLPage):
    """Page of a single video, read from its <meta> tags and player config."""

    @method
    class get_video(ItemElement):
        """Fill a BaseVideo from the page."""

        klass = BaseVideo

        # Read from the element environment under the '_id' key.
        obj_id = Env('_id')
        obj_title = CleanText('//title')
        obj_author = CleanText('//meta[@name="author"]/@content')
        obj_description = CleanText('//meta[@name="description"]/@content')

        def obj_duration(self):
            """Return the ``video:duration`` meta value as a timedelta.

            The value is interpreted as a number of seconds; 0 is used when
            the meta tag is missing.
            """
            seconds = int(CleanText('//meta[@property="video:duration"]/@content', default=0)(self))
            return timedelta(seconds=seconds)

        def obj_thumbnail(self):
            """Build the Thumbnail from the ``og:image`` meta tag."""
            url = CleanText('//meta[@property="og:image"]/@content')(self)
            thumbnail = Thumbnail(url)
            thumbnail.url = url
            return thumbnail

        obj_date = DateTime(CleanText('//meta[@property="video:release_date"]/@content'))

        def obj__formats(self):
            """Collect the media URLs advertised by the embedded player config.

            :return: a dict mapping each extension (as guessed by
                     ``determine_ext``) to a ``{quality: url}`` dict, or None
                     when the player configuration is not found in the page
            :raises ParseError: when the player configuration carries an
                                ``error`` entry
            """
            # Raw string: the pattern contains regex escapes (\s, \(, \)).
            player = Regexp(
                CleanText('//script'),
                r'.*var config = ({"context".*}}});\s*buildPlayer\(config\);.*',
                default=None,
            )(self)
            if not player:
                return None

            info = json.loads(player)
            if info.get('error') is not None:
                raise ParseError(info['error']['title'])
            metadata = info.get('metadata')

            formats = {}
            for quality, media_list in metadata['qualities'].items():
                for media in media_list:
                    media_url = media.get('url')
                    if not media_url:
                        continue
                    if media.get('type') == 'application/vnd.lumberjack.manifest':
                        continue
                    ext = determine_ext(media_url)
                    # Add to the qualities already known for this extension
                    # instead of replacing them, so none of them is lost.
                    formats.setdefault(ext, {})[quality] = media_url

            return formats