Skip to content
pages.py 4.45 KiB
Newer Older
Romain Bignon's avatar
Romain Bignon committed
# -*- coding: utf-8 -*-

# Copyright(C) 2011  Romain Bignon
#
Roger Philibert's avatar
Roger Philibert committed
# This file is part of a woob module.
Roger Philibert's avatar
Roger Philibert committed
# This woob module is free software: you can redistribute it and/or modify
Romain Bignon's avatar
Romain Bignon committed
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
Roger Philibert's avatar
Roger Philibert committed
# This woob module is distributed in the hope that it will be useful,
Romain Bignon's avatar
Romain Bignon committed
# but WITHOUT ANY WARRANTY; without even the implied warranty of
Romain Bignon's avatar
Romain Bignon committed
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
Romain Bignon's avatar
Romain Bignon committed
# You should have received a copy of the GNU Affero General Public License
Roger Philibert's avatar
Roger Philibert committed
# along with this woob module. If not, see <http://www.gnu.org/licenses/>.
from woob.browser.elements import ItemElement, ListElement, method
from woob.browser.pages import HTMLPage, pagination
from woob.browser.filters.standard import CleanText, Regexp, Env, Duration, DateTime
from woob.browser.filters.html import Link
from woob.capabilities.base import NotAvailable
from woob.capabilities.video import BaseVideo
from woob.capabilities.image import Thumbnail
from woob.exceptions import ParseError
from woob.tools.json import json

from datetime import timedelta
Romain Bignon's avatar
Romain Bignon committed
import re


def determine_ext(url, default_ext='unknown_video'):
    if url is None:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    elif guess.rstrip('/') in ('mp4', 'm3u8'):
        return guess.rstrip('/')
    else:
        return default_ext


class IndexPage(HTMLPage):
    @pagination
    @method
    class iter_videos(ListElement):
        item_xpath = '//div[@data-video-id]'
        next_page = Link('//a[@title="suivant"]')

        class item(ItemElement):
            klass = BaseVideo

            def validate(self, obj):
                return obj.id

            obj_id = CleanText('./div/@data-playable')
            obj_title = CleanText('./div[@class="media-block"]/h3')
            obj_author = CleanText('./div[@class="media-block"]/div/span/a')
            obj_duration = Duration(CleanText('./div/a/div[has-class("badge--duration")]'), default=NotAvailable)

            def obj_thumbnail(self):
                url = CleanText('./div/a/img/@data-src')(self)
                thumbnail = Thumbnail(url)
                thumbnail.url = url
                return thumbnail


class VideoPage(HTMLPage):

    @method
    class get_video(ItemElement):
        klass = BaseVideo

        obj_id = Env('_id')
        obj_title = CleanText('//title')
        obj_author = CleanText('//meta[@name="author"]/@content')
        obj_description = CleanText('//meta[@name="description"]/@content')

        def obj_duration(self):
            seconds = int(CleanText('//meta[@property="video:duration"]/@content', default=0)(self))
            return timedelta(seconds=seconds)

        def obj_thumbnail(self):
            url = CleanText('//meta[@property="og:image"]/@content')(self)
            thumbnail = Thumbnail(url)
            thumbnail.url = url
            return thumbnail

        obj_date = DateTime(CleanText('//meta[@property="video:release_date"]/@content'))

        def obj__formats(self):
            player = Regexp(CleanText('//script'), '.*var config = ({"context".*}}});\s*buildPlayer\(config\);.*', default=None)(self)
            if player:
                info = json.loads(player)
                if info.get('error') is not None:
                    raise ParseError(info['error']['title'])
                metadata = info.get('metadata')

                formats = {}
                for quality, media_list in metadata['qualities'].items():
                    for media in media_list:
                        media_url = media.get('url')
                        if not media_url:
                            continue
                        type_ = media.get('type')
                        if type_ == 'application/vnd.lumberjack.manifest':
                            continue
                        ext = determine_ext(media_url)
                        if ext in formats:
                            if quality in formats.get(ext):
                                formats[ext][quality] = media_url
                            else:
                                formats[ext] = {quality: media_url}
                        else:
                            formats[ext] = {quality: media_url}

                return formats
            return None