Commit e1069026 authored by Bezleputh, committed by ntome

[francetelevisions] fix modules: website got updated

Closes #173
parent 1499695a
Pipeline #1776 failed with stages
in 2 minutes and 8 seconds
......@@ -20,9 +20,10 @@
from __future__ import unicode_literals
from weboob.browser import PagesBrowser, URL
from weboob.exceptions import BrowserHTTPNotFound
from import json
from .pages import SearchPage, HomePage
from .pages import SearchPage, VideoWebPage, VideoJsonPage, HomePage
import time
__all__ = ['PluzzBrowser']
......@@ -31,44 +32,27 @@ class PluzzBrowser(PagesBrowser):
search_page = URL(r'/recherche/', SearchPage)
video = URL(r'/.+/(?P<number>\d+)-[^/]+.html$', VideoWebPage)
video_json = URL(r'\?idDiffusion=(?P<number>.+)$', VideoJsonPage)
search_page = URL(r'\*/queries\?(?P<p>.*)', SearchPage)
home = URL(r'/(?P<cat>.*)', HomePage)
base = URL(r'/', HomePage)
def search_videos(self, s):
self.location(, params={'q': s})
algolia_app_id, algolia_api_key =
def get_video(self, id):
number =
params = "x-algolia-agent=Algolia for vanilla JavaScript (lite) 3.27.0;instantsearch.js 2.10.2;JS Helper 2.26.0&x-algolia-application-id="+algolia_app_id+"&x-algolia-api-key="+algolia_api_key
except BrowserHTTPNotFound:
self.logger.warning('video info not found, probably needs payment')
video =
if not video:
self.logger.debug('video info not found, maybe not available?')
return = id
data = {}
data['requests'] = [0]
data['requests'][0] = {}
data['requests'][0]['indexName'] = "yatta_prod_contents"
ts = int(time.time())
data['requests'][0]['params'] = 'query={}&hitsPerPage=20&page=0&filters=class:video AND ranges.replay.web.begin_date < {} AND ranges.replay.web.end_date > {}&facetFilters=["class:video"]&facets=[]&tagFilters='.format(s, ts, ts)
return self.search_page.go(p=params, data=json.dumps(data)).iter_videos()
return video
def get_categories(self):
return self.home.go(cat="").iter_categories()
def iter_subcategories(self, cat):
for cat in self.home.go(cat="/".join(cat)).iter_subcategories(cat=cat):
def get_categories(self, cat=""):
for cat in self.home.go(cat=cat).iter_categories():
yield cat = r"//li[@class='card card-li ']|//li[@class='card card-small ']"
for vid in
yield vid
def iter_videos(self, cat): = self.home.go(cat="") = r'//h1[contains(text(), "%s")]/following-sibling::ul/li' % cat
def iter_videos(self, cat=""):
return self.home.go(cat=cat).iter_videos()
......@@ -17,10 +17,11 @@
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.capabilities.base import find_object
from weboob.capabilities.base import empty
from import CapVideo, BaseVideo
from weboob.capabilities.collection import CapCollection, CollectionNotFound
from weboob.capabilities.collection import CapCollection, CollectionNotFound, Collection
from import Module
from import video_info
from .browser import PluzzBrowser
......@@ -37,32 +38,68 @@ class PluzzModule(Module, CapVideo, CapCollection):
BROWSER = PluzzBrowser
def get_video(self, _id):
return self.browser.get_video(_id)
def get_video(self, _id, video=None):
if not video:
video = BaseVideo(_id)
new_video = video_info(_id)
if not new_video:
video.ext = u'm3u8'
for k, v in new_video.iter_fields():
if not empty(v) and empty(getattr(video, k)):
setattr(video, k, v)
return video
def search_videos(self, pattern, sortby=CapVideo.SEARCH_RELEVANCE, nsfw=False):
return self.browser.search_videos(pattern)
def fill_video(self, video, fields):
if 'url' in fields:
video = self.browser.get_video(
if 'thumbnail' in fields and video.thumbnail:
video = self.get_video(, video)
if video and 'thumbnail' in fields and video.thumbnail: =
return video
def iter_resources(self, objs, split_path):
if BaseVideo in objs:
collection = self.get_collection(objs, split_path)
if collection.path_level == 0:
yield Collection([u'videos'], u'Vidéos')
for category in self.browser.get_categories():
yield category
elif collection.path_level == 1 and collection.split_path[0].startswith('vid_'):
cat = find_object(self.browser.get_categories(), id=collection.split_path[0], error=None)
for video in self.browser.iter_videos(cat.title):
yield video
if category.path_level == 1:
yield category
for cat in self.browser.iter_subcategories(collection.split_path):
yield cat
if split_path[-1] == u'videos':
for v in self.browser.iter_videos("/".join(collection.split_path[:-1])):
yield v
elif split_path[-1].endswith('-video'):
v = BaseVideo(
"/".join(collection.split_path).replace('-video', '.html')))
v.title = split_path[-1].replace('-video', '')
yield v
iter = 0
for category in self.browser.get_categories("/".join(collection.split_path)):
if category.path_level == collection.path_level + 1 and \
category.split_path[0] == collection.split_path[0]:
iter = iter + 1
yield category
if iter > 0:
yield Collection(split_path + [u'videos'], u'Vidéos')
for v in self.browser.iter_videos("/".join(collection.split_path).replace('-videos', '.html')):
yield v
def validate_collection(self, objs, collection):
if collection.path_level <= 2:
......@@ -19,21 +19,16 @@
from __future__ import unicode_literals
import re
import hashlib
from datetime import datetime, timedelta
from weboob.capabilities.base import NotAvailable
from weboob.capabilities.file import LICENSES
from weboob.capabilities.image import Thumbnail
from import BaseVideo
from weboob.capabilities.collection import Collection
from weboob.browser.pages import HTMLPage, JsonPage
from weboob.browser.elements import ItemElement, ListElement, method
from weboob.browser.filters.standard import CleanText, Regexp, Format, DateTime, Duration, Date, Eval, Env, Field
from weboob.browser.filters.html import Attr, AbsoluteLink, CleanHTML
from weboob.browser.elements import ItemElement, ListElement, method, DictElement
from weboob.browser.filters.standard import CleanText, Regexp, Format, Field, Eval
from weboob.browser.filters.html import CleanHTML
from weboob.browser.filters.json import Dict
......@@ -41,126 +36,70 @@ def parse_duration(text):
return timedelta(seconds=int(text) * 60)
class SearchPage(HTMLPage):
class SearchPage(JsonPage):
class iter_videos(ListElement):
item_xpath = '//section[h1[ends-with(text(), "vidéos")]]/ul/li'
class iter_videos(DictElement):
item_xpath = 'results/0/hits'
class item(ItemElement):
klass = BaseVideo
def parse(self, el):
self.env['infos'] = CleanText('.//h3/following-sibling::p[contains(text()," min")]')(self)
basetitle = CleanText('.//h3/a')(self)
sub = CleanText('.//h3/following-sibling::p[1]')(self)
if'\d min', sub):
self.env['title'] = basetitle
self.env['title'] = '%s - %s' % (basetitle, sub)
obj_id = AbsoluteLink('.//a')
# obj__number = Attr('./div[@class="card-content"]//a', 'data-video-content')
obj_title = Env('title')
obj_thumbnail = Eval(Thumbnail, Format('https:%s', Attr('./a//img', 'data-src')))
obj_date = Date(Regexp(Env('infos'), r'\| (\d+\.\d+\.\d+) \|',
dayfirst=True, default=NotAvailable)
obj_duration = Eval(parse_duration, Regexp(Env('infos'), r'(\d+) min'))
class VideoWebPage(HTMLPage):
def get_number(self):
return Attr('//div[@id="player"]', 'data-main-video')(self.doc)
class get_video(ItemElement):
obj_title = CleanText('//article[@id="description"]//h1')
obj_description = CleanText('//article[@id="description"]//section/following-sibling::div')
obj_date = DateTime(Regexp(
CleanText('//article[@id="description"]//span[contains(text(),"diffusé le")]'),
r'(\d{2})\.(\d{2})\.(\d{2}) à (\d{2})h(\d{2})', r'20\3/\2/\1 \4:\5'))
obj_duration = Eval(parse_duration, Regexp(CleanText('//div[span[text()="|"]]'), r'| (\d+)min'))
obj_thumbnail = Eval(Thumbnail, Format('https:%s', Attr('//div[@id="playerPlaceholder"]//img', 'data-src')))
obj__number = Attr('//div[@id="player"]', 'data-main-video')
class VideoJsonPage(JsonPage):
class get_video(ItemElement):
klass = BaseVideo
obj_id = Format(r"", Dict('path'), Dict('id'), Dict('url_page'))
obj_title = Format(u'%s - %s', Dict['titre'], Dict['sous_titre'])
obj_date = Eval(datetime.fromtimestamp, Dict('diffusion/timestamp'))
obj_duration = Dict['duree'] & Duration
obj_description = Dict['synopsis']
obj_ext = u'm3u8'
obj_title = CleanText(Dict('title'))
obj_thumbnail = Eval(Thumbnail,
obj__uuid = Dict['id']
def obj_date(self):
return datetime.fromtimestamp(Dict('dates/first_publication_date')(self))
def obj_url(self):
return next((v['url_secure'] for v in['videos'] if v['format'] == 'm3u8-download'), None)
def obj_duration(self):
return timedelta(seconds=Dict('duration')(self))
obj_thumbnail = Eval(Thumbnail, Dict['image_secure'])
def validate(self, obj):
return obj.url
class HomePage(HTMLPage):
def get_params(self):
a = Regexp(CleanText('//script'),
return a.split('|')
class HomePage(HTMLPage):
class iter_categories(ListElement):
item_xpath = '//h1'
class item(ItemElement):
klass = Collection
def obj_id(self):
id = Regexp(CleanText('./a/@href'), '//*)/', default=None)(self)
if not id:
id = CleanText('.')(self)
id = id.encode('ascii', 'ignore')
id = hashlib.md5(id).hexdigest()
id = u'vid_%s' % id
return id
ignore_duplicate = True
obj_title = CleanText('.')
def obj_split_path(self):
return [Field('id')(self)]
class iter_subcategories(ListElement):
item_xpath = '//h2[has-class("title-wall")]'
item_xpath = '//li[has-class("nav-item")]/a'
class item(ItemElement):
klass = Collection
obj_id = Regexp(CleanText('./a/@href'), '//*/(.*)/', default=None)
def condition(self):
return Regexp(CleanText('./@href'), '//*)', default=False)(self)
def obj_id(self):
id = Regexp(CleanText('./@href',
replace=[('.html', '-video/')]),
'//*)', "\\1",
return id[:-1]
obj_title = CleanText('.')
def obj_split_path(self):
cat = Env('cat')(self)
return cat
return Field('id')(self).split('/')
class iter_videos(ListElement):
def parse(self, el):
self.item_xpath =
self.item_xpath = u'//a[@data-video]'
class item(ItemElement):
klass = BaseVideo
obj_id = Format('https:%s', CleanText('./a/@href'))
obj_title = CleanText(CleanHTML('./a/div[@class="card-content"]|./div[has-class("card-content")]'))
obj_id = Format('https:%s', CleanText('./@href'))
obj_title = CleanText(CleanHTML('./div[@class="card-content"]|./div[has-class("card-content")]'))
def condition(self):
return Field('title')(self)
......@@ -36,7 +36,7 @@ class PluzzTest(BackendTest):
cat = list(self.backend.iter_resources([BaseVideo], []))
self.assertTrue(len(cat) > 0)
for c in cat:
if c.split_path[-1].startswith('vid_'):
if c.split_path[-1] == u'videos':
videos = list(self.backend.iter_resources([BaseVideo], c.split_path))
self.assertTrue(len(videos) > 0)
v = videos[0]
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment