From e10690265e2031e57f0610899c76de8c4e4f2d06 Mon Sep 17 00:00:00 2001
From: Bezleputh
Date: Wed, 7 Nov 2018 12:34:44 +0100
Subject: [PATCH] [francetelevisions] fix modules: website got updated

Closes #173
---
 modules/francetelevisions/browser.py |  54 ++++-------
 modules/francetelevisions/module.py  |  63 ++++++++++---
 modules/francetelevisions/pages.py   | 133 ++++++++-------------------
 modules/francetelevisions/test.py    |   2 +-
 4 files changed, 106 insertions(+), 146 deletions(-)

diff --git a/modules/francetelevisions/browser.py b/modules/francetelevisions/browser.py
index 76e4d6caae..7b0a3f099f 100644
--- a/modules/francetelevisions/browser.py
+++ b/modules/francetelevisions/browser.py
@@ -20,9 +20,10 @@
 from __future__ import unicode_literals
 
 from weboob.browser import PagesBrowser, URL
-from weboob.exceptions import BrowserHTTPNotFound
+from weboob.tools.json import json
+from .pages import SearchPage, HomePage
 
-from .pages import SearchPage, VideoWebPage, VideoJsonPage, HomePage
+import time
 
 
 __all__ = ['PluzzBrowser']
@@ -31,44 +32,27 @@ class PluzzBrowser(PagesBrowser):
     BASEURL = 'https://www.france.tv'
     PROGRAMS = None
 
-    search_page = URL(r'/recherche/', SearchPage)
-    video = URL(r'/.+/(?P<id>\d+)-[^/]+.html$', VideoWebPage)
-    video_json = URL(r'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/\?idDiffusion=(?P<number>.+)$', VideoJsonPage)
+    search_page = URL(r'https://vwdlashufe-dsn.algolia.net/1/indexes/\*/queries\?(?P<p>.*)', SearchPage)
     home = URL(r'/(?P<cat>.*)', HomePage)
+    base = URL(r'/', HomePage)
 
     def search_videos(self, s):
-        self.location(self.search_page.build(), params={'q': s})
-        return self.page.iter_videos()
+        self.go_home()
+        algolia_app_id, algolia_api_key = self.page.get_params()
 
-    def get_video(self, id):
-        self.location(id)
-        number = self.page.get_number()
+        params = "x-algolia-agent=Algolia for vanilla JavaScript (lite) 3.27.0;instantsearch.js 2.10.2;JS Helper 2.26.0&x-algolia-application-id="+algolia_app_id+"&x-algolia-api-key="+algolia_api_key
 
-        try:
-            self.video_json.go(number=number)
-        except BrowserHTTPNotFound:
-            self.logger.warning('video info not found, probably needs payment')
-            return
-        video = self.page.get_video()
-        if not video:
-            self.logger.debug('video info not found, maybe not available?')
-            return
-        video.id = id
+        data = {}
+        data['requests'] = [0]
+        data['requests'][0] = {}
+        data['requests'][0]['indexName'] = "yatta_prod_contents"
+        ts = int(time.time())
+        data['requests'][0]['params'] = 'query={}&hitsPerPage=20&page=0&filters=class:video AND ranges.replay.web.begin_date < {} AND ranges.replay.web.end_date > {}&facetFilters=["class:video"]&facets=[]&tagFilters='.format(s, ts, ts)
+        return self.search_page.go(p=params, data=json.dumps(data)).iter_videos()
 
-        return video
-
-    def get_categories(self):
-        return self.home.go(cat="").iter_categories()
-
-    def iter_subcategories(self, cat):
-        for cat in self.home.go(cat="/".join(cat)).iter_subcategories(cat=cat):
+    def get_categories(self, cat=""):
+        for cat in self.home.go(cat=cat).iter_categories():
             yield cat
 
-        self.page.item_xpath = r"//li[@class='card card-li ']|//li[@class='card card-small ']"
-        for vid in self.page.iter_videos():
-            yield vid
-
-    def iter_videos(self, cat):
-        self.page = self.home.go(cat="")
-        self.page.item_xpath = r'//h1[contains(text(), "%s")]/following-sibling::ul/li' % cat
-        return self.page.iter_videos()
+    def iter_videos(self, cat=""):
+        return self.home.go(cat=cat).iter_videos()
diff --git a/modules/francetelevisions/module.py b/modules/francetelevisions/module.py
index 8c275eea17..3facf0e7c9 100644
--- a/modules/francetelevisions/module.py
+++ b/modules/francetelevisions/module.py
@@ -17,10 +17,11 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with weboob. If not, see <http://www.gnu.org/licenses/>.
 
-from weboob.capabilities.base import find_object
+from weboob.capabilities.base import empty
 from weboob.capabilities.video import CapVideo, BaseVideo
-from weboob.capabilities.collection import CapCollection, CollectionNotFound
+from weboob.capabilities.collection import CapCollection, CollectionNotFound, Collection
 from weboob.tools.backend import Module
+from weboob.tools.capabilities.video.ytdl import video_info
 
 from .browser import PluzzBrowser
 
@@ -37,32 +38,68 @@ class PluzzModule(Module, CapVideo, CapCollection):
     LICENSE = 'AGPLv3+'
     BROWSER = PluzzBrowser
 
-    def get_video(self, _id):
-        return self.browser.get_video(_id)
+    def get_video(self, _id, video=None):
+        if not video:
+            video = BaseVideo(_id)
+
+        new_video = video_info(_id)
+
+        if not new_video:
+            return
+
+        video.ext = u'm3u8'
+
+        for k, v in new_video.iter_fields():
+            if not empty(v) and empty(getattr(video, k)):
+                setattr(video, k, v)
+
+        return video
 
     def search_videos(self, pattern, sortby=CapVideo.SEARCH_RELEVANCE, nsfw=False):
         return self.browser.search_videos(pattern)
 
     def fill_video(self, video, fields):
         if 'url' in fields:
-            video = self.browser.get_video(video.id)
-        if 'thumbnail' in fields and video.thumbnail:
+            video = self.get_video(video.id, video)
+        if video and 'thumbnail' in fields and video.thumbnail:
             video.thumbnail.data = self.browser.open(video.thumbnail.url).content
 
         return video
 
     def iter_resources(self, objs, split_path):
         if BaseVideo in objs:
             collection = self.get_collection(objs, split_path)
+
             if collection.path_level == 0:
+                yield Collection([u'videos'], u'Vidéos')
+
                 for category in self.browser.get_categories():
-                    yield category
-            elif collection.path_level == 1 and collection.split_path[0].startswith('vid_'):
-                cat = find_object(self.browser.get_categories(), id=collection.split_path[0], error=None)
-                for video in self.browser.iter_videos(cat.title):
-                    yield video
+                    if category.path_level == 1:
+                        yield category
+
             else:
-                for cat in self.browser.iter_subcategories(collection.split_path):
-                    yield cat
+
+                if split_path[-1] == u'videos':
+                    for v in self.browser.iter_videos("/".join(collection.split_path[:-1])):
+                        yield v
+                elif split_path[-1].endswith('-video'):
+                    v = BaseVideo(
+                        "{}/{}".format(self.browser.BASEURL,
+                                       "/".join(collection.split_path).replace('-video', '.html')))
+                    v.title = split_path[-1].replace('-video', '')
+                    yield v
+                else:
+                    iter = 0
+                    for category in self.browser.get_categories("/".join(collection.split_path)):
+                        if category.path_level == collection.path_level + 1 and \
+                           category.split_path[0] == collection.split_path[0]:
+                            iter = iter + 1
+                            yield category
+
+                    if iter > 0:
+                        yield Collection(split_path + [u'videos'], u'Vidéos')
+                    else:
+                        for v in self.browser.iter_videos("/".join(collection.split_path).replace('-videos', '.html')):
+                            yield v
 
     def validate_collection(self, objs, collection):
         if collection.path_level <= 2:
diff --git a/modules/francetelevisions/pages.py b/modules/francetelevisions/pages.py
index ce0c68a16a..9abbd8777e 100644
--- a/modules/francetelevisions/pages.py
+++ b/modules/francetelevisions/pages.py
@@ -19,21 +19,16 @@
 
 from __future__ import unicode_literals
 
-import re
-import hashlib
-
 from datetime import datetime, timedelta
 
-from weboob.capabilities.base import NotAvailable
-from weboob.capabilities.file import LICENSES
 from weboob.capabilities.image import Thumbnail
 from weboob.capabilities.video import BaseVideo
 from weboob.capabilities.collection import Collection
 
 from weboob.browser.pages import HTMLPage, JsonPage
-from weboob.browser.elements import ItemElement, ListElement, method
-from weboob.browser.filters.standard import CleanText, Regexp, Format, DateTime, Duration, Date, Eval, Env, Field
-from weboob.browser.filters.html import Attr, AbsoluteLink, CleanHTML
+from weboob.browser.elements import ItemElement, ListElement, method, DictElement
+from weboob.browser.filters.standard import CleanText, Regexp, Format, Field, Eval
+from weboob.browser.filters.html import CleanHTML
 from weboob.browser.filters.json import Dict
 
 
@@ -41,126 +36,70 @@ def parse_duration(text):
     return timedelta(seconds=int(text) * 60)
 
 
-class SearchPage(HTMLPage):
+class SearchPage(JsonPage):
     @method
-    class iter_videos(ListElement):
-        item_xpath = '//section[h1[ends-with(text(), "vidéos")]]/ul/li'
+    class iter_videos(DictElement):
+        item_xpath = 'results/0/hits'
 
         class item(ItemElement):
            klass = BaseVideo
 
-            def parse(self, el):
-                self.env['infos'] = CleanText('.//h3/following-sibling::p[contains(text()," min")]')(self)
-
-                basetitle = CleanText('.//h3/a')(self)
-                sub = CleanText('.//h3/following-sibling::p[1]')(self)
-                if re.search(r'\d min', sub):
-                    self.env['title'] = basetitle
-                else:
-                    self.env['title'] = '%s - %s' % (basetitle, sub)
-
-            obj_id = AbsoluteLink('.//a')
-            # obj__number = Attr('./div[@class="card-content"]//a', 'data-video-content')
-
-            obj_title = Env('title')
-            obj_thumbnail = Eval(Thumbnail, Format('https:%s', Attr('./a//img', 'data-src')))
-
-            obj_date = Date(Regexp(Env('infos'), r'\| (\d+\.\d+\.\d+) \|',
-                                   default=NotAvailable),
-                            dayfirst=True, default=NotAvailable)
-            obj_duration = Eval(parse_duration, Regexp(Env('infos'), r'(\d+) min'))
-
-
-class VideoWebPage(HTMLPage):
-    def get_number(self):
-        return Attr('//div[@id="player"]', 'data-main-video')(self.doc)
-
-    @method
-    class get_video(ItemElement):
-        obj_title = CleanText('//article[@id="description"]//h1')
-        obj_description = CleanText('//article[@id="description"]//section/following-sibling::div')
-
-        obj_date = DateTime(Regexp(
-            CleanText('//article[@id="description"]//span[contains(text(),"diffusé le")]'),
-            r'(\d{2})\.(\d{2})\.(\d{2}) à (\d{2})h(\d{2})', r'20\3/\2/\1 \4:\5'))
-        obj_duration = Eval(parse_duration, Regexp(CleanText('//div[span[text()="|"]]'), r'| (\d+)min'))
-
-        obj_thumbnail = Eval(Thumbnail, Format('https:%s', Attr('//div[@id="playerPlaceholder"]//img', 'data-src')))
-        obj__number = Attr('//div[@id="player"]', 'data-main-video')
-        obj_license = LICENSES.COPYRIGHT
-
-
-class VideoJsonPage(JsonPage):
-    @method
-    class get_video(ItemElement):
-        klass = BaseVideo
+            obj_id = Format(r"https://www.france.tv/%s/%s-%s.html", Dict('path'), Dict('id'), Dict('url_page'))
 
-        obj_title = Format(u'%s - %s', Dict['titre'], Dict['sous_titre'])
-        obj_date = Eval(datetime.fromtimestamp, Dict('diffusion/timestamp'))
-        obj_duration = Dict['duree'] & Duration
-        obj_description = Dict['synopsis']
-        obj_ext = u'm3u8'
+            obj_title = CleanText(Dict('title'))
+            obj_thumbnail = Eval(Thumbnail,
+                                 Format(r'https://www.france.tv%s',
+                                        Dict('image/formats/vignette_16x9/urls/w:1024')))
 
-        obj__uuid = Dict['id']
-        obj_license = LICENSES.COPYRIGHT
+            def obj_date(self):
+                return datetime.fromtimestamp(Dict('dates/first_publication_date')(self))
 
-        def obj_url(self):
-            return next((v['url_secure'] for v in self.page.doc['videos'] if v['format'] == 'm3u8-download'), None)
+            def obj_duration(self):
+                return timedelta(seconds=Dict('duration')(self))
 
-        obj_thumbnail = Eval(Thumbnail, Dict['image_secure'])
 
-        def validate(self, obj):
-            return obj.url
+class HomePage(HTMLPage):
+    def get_params(self):
+        a = Regexp(CleanText('//script'),
+                   '"algolia_app_id":"(.*)","algolia_api_key":"(.*)","algolia_api_index_taxonomy".*',
+                   '\\1|\\2')(self.doc)
+        return a.split('|')
 
-
-class HomePage(HTMLPage):
     @method
     class iter_categories(ListElement):
-        item_xpath = '//h1'
-
-        class item(ItemElement):
-            klass = Collection
-
-            def obj_id(self):
-                id = Regexp(CleanText('./a/@href'), '//www.france.tv/(.*)/', default=None)(self)
-                if not id:
-                    id = CleanText('.')(self)
-                    id = id.encode('ascii', 'ignore')
-                    id = hashlib.md5(id).hexdigest()
-                id = u'vid_%s' % id
-                return id
+        ignore_duplicate = True
 
-            obj_title = CleanText('.')
-
-            def obj_split_path(self):
-                return [Field('id')(self)]
-
-    @method
-    class iter_subcategories(ListElement):
-        item_xpath = '//h2[has-class("title-wall")]'
+        item_xpath = '//li[has-class("nav-item")]/a'
 
         class item(ItemElement):
             klass = Collection
 
-            obj_id = Regexp(CleanText('./a/@href'), '//www.france.tv/.*/(.*)/', default=None)
+            def condition(self):
+                return Regexp(CleanText('./@href'), '//www.france.tv/(.*)', default=False)(self)
+
+            def obj_id(self):
+                id = Regexp(CleanText('./@href',
+                                      replace=[('.html', '-video/')]),
+                            '//www.france.tv/(.*)', "\\1",
+                            default=None)(self)
+                return id[:-1]
 
             obj_title = CleanText('.')
 
             def obj_split_path(self):
-                cat = Env('cat')(self)
-                cat.append(Field('id')(self))
-                return cat
+                return Field('id')(self).split('/')
 
     @method
     class iter_videos(ListElement):
         def parse(self, el):
-            self.item_xpath = self.page.item_xpath
+            self.item_xpath = u'//a[@data-video]'
 
         class item(ItemElement):
             klass = BaseVideo
 
-            obj_id = Format('https:%s', CleanText('./a/@href'))
-            obj_title = CleanText(CleanHTML('./a/div[@class="card-content"]|./div[has-class("card-content")]'))
+            obj_id = Format('https:%s', CleanText('./@href'))
+            obj_title = CleanText(CleanHTML('./div[@class="card-content"]|./div[has-class("card-content")]'))
 
             def condition(self):
                 return Field('title')(self)
diff --git a/modules/francetelevisions/test.py b/modules/francetelevisions/test.py
index 5e0da86501..a4b941a198 100644
--- a/modules/francetelevisions/test.py
+++ b/modules/francetelevisions/test.py
@@ -36,7 +36,7 @@ def test_categories(self):
         cat = list(self.backend.iter_resources([BaseVideo], []))
         self.assertTrue(len(cat) > 0)
         for c in cat:
-            if c.split_path[-1].startswith('vid_'):
+            if c.split_path[-1] == u'videos':
                videos = list(self.backend.iter_resources([BaseVideo], c.split_path))
                self.assertTrue(len(videos) > 0)
                v = videos[0]
-- 
GitLab
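
The Algolia request that the patched PluzzBrowser.search_videos builds can be exercised outside weboob to check that the endpoint still answers. The snippet below is a minimal sketch using the requests library: it assumes france.tv still embeds "algolia_app_id" and "algolia_api_key" in an inline script (the same assumption the new HomePage.get_params makes), and the query term 'thalassa' is only an example.

    import json
    import re
    import time

    import requests

    # Scrape the Algolia credentials from the homepage, as HomePage.get_params does.
    home = requests.get('https://www.france.tv/').text
    app_id, api_key = re.search(
        r'"algolia_app_id":"(.*?)","algolia_api_key":"(.*?)"', home).groups()

    # Same query-string parameters as PluzzBrowser.search_videos.
    params = (
        'x-algolia-agent=Algolia for vanilla JavaScript (lite) 3.27.0;'
        'instantsearch.js 2.10.2;JS Helper 2.26.0'
        '&x-algolia-application-id=' + app_id +
        '&x-algolia-api-key=' + api_key
    )

    # Restrict to videos currently replayable on the web, as in the patched filter string.
    ts = int(time.time())
    payload = {'requests': [{
        'indexName': 'yatta_prod_contents',
        'params': 'query={}&hitsPerPage=20&page=0'
                  '&filters=class:video AND ranges.replay.web.begin_date < {}'
                  ' AND ranges.replay.web.end_date > {}'
                  '&facetFilters=["class:video"]&facets=[]&tagFilters='.format('thalassa', ts, ts),
    }]}

    resp = requests.post(
        'https://vwdlashufe-dsn.algolia.net/1/indexes/*/queries?' + params,
        data=json.dumps(payload),
    )

    # Rebuild the watch-page URLs the same way SearchPage.iter_videos does.
    for hit in resp.json()['results'][0]['hits']:
        print('https://www.france.tv/{}/{}-{}.html'.format(
            hit['path'], hit['id'], hit['url_page']))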