Bezleputh
Committed by ntome
Builds for 1 pipeline failed in 2 minutes 8 seconds

[francetelevisions] fix modules: website got updated

Closes #173
......@@ -20,9 +20,10 @@
from __future__ import unicode_literals
from weboob.browser import PagesBrowser, URL
from weboob.exceptions import BrowserHTTPNotFound
from weboob.tools.json import json
from .pages import SearchPage, HomePage
from .pages import SearchPage, VideoWebPage, VideoJsonPage, HomePage
import time
__all__ = ['PluzzBrowser']
......@@ -31,44 +32,27 @@ class PluzzBrowser(PagesBrowser):
BASEURL = 'https://www.france.tv'
PROGRAMS = None
search_page = URL(r'/recherche/', SearchPage)
video = URL(r'/.+/(?P<number>\d+)-[^/]+.html$', VideoWebPage)
video_json = URL(r'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/\?idDiffusion=(?P<number>.+)$', VideoJsonPage)
search_page = URL(r'https://vwdlashufe-dsn.algolia.net/1/indexes/\*/queries\?(?P<p>.*)', SearchPage)
home = URL(r'/(?P<cat>.*)', HomePage)
base = URL(r'/', HomePage)
def search_videos(self, s):
self.location(self.search_page.build(), params={'q': s})
return self.page.iter_videos()
self.go_home()
algolia_app_id, algolia_api_key = self.page.get_params()
def get_video(self, id):
self.location(id)
number = self.page.get_number()
params = "x-algolia-agent=Algolia for vanilla JavaScript (lite) 3.27.0;instantsearch.js 2.10.2;JS Helper 2.26.0&x-algolia-application-id="+algolia_app_id+"&x-algolia-api-key="+algolia_api_key
try:
self.video_json.go(number=number)
except BrowserHTTPNotFound:
self.logger.warning('video info not found, probably needs payment')
return
video = self.page.get_video()
if not video:
self.logger.debug('video info not found, maybe not available?')
return
video.id = id
data = {}
data['requests'] = [0]
data['requests'][0] = {}
data['requests'][0]['indexName'] = "yatta_prod_contents"
ts = int(time.time())
data['requests'][0]['params'] = 'query={}&hitsPerPage=20&page=0&filters=class:video AND ranges.replay.web.begin_date < {} AND ranges.replay.web.end_date > {}&facetFilters=["class:video"]&facets=[]&tagFilters='.format(s, ts, ts)
return self.search_page.go(p=params, data=json.dumps(data)).iter_videos()
return video
# Return the category iterator of the page fetched through the `home` URL with an empty "cat" value.
def get_categories(self):
return self.home.go(cat="").iter_categories()
def iter_subcategories(self, cat):
for cat in self.home.go(cat="/".join(cat)).iter_subcategories(cat=cat):
def get_categories(self, cat=""):
for cat in self.home.go(cat=cat).iter_categories():
yield cat
self.page.item_xpath = r"//li[@class='card card-li ']|//li[@class='card card-small ']"
for vid in self.page.iter_videos():
yield vid
# Return the videos listed under the heading whose text contains `cat`.
def iter_videos(self, cat):
# The `home` URL is always loaded with an empty "cat"; the category is selected by the XPath below.
self.page = self.home.go(cat="")
# Items are the <li> children of the <ul> siblings that follow the <h1> whose text contains `cat`.
self.page.item_xpath = r'//h1[contains(text(), "%s")]/following-sibling::ul/li' % cat
return self.page.iter_videos()
# Return the video iterator of the page fetched through the `home` URL for `cat` (empty string by default).
def iter_videos(self, cat=""):
return self.home.go(cat=cat).iter_videos()
......
......@@ -17,10 +17,11 @@
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.capabilities.base import find_object
from weboob.capabilities.base import empty
from weboob.capabilities.video import CapVideo, BaseVideo
from weboob.capabilities.collection import CapCollection, CollectionNotFound
from weboob.capabilities.collection import CapCollection, CollectionNotFound, Collection
from weboob.tools.backend import Module
from weboob.tools.capabilities.video.ytdl import video_info
from .browser import PluzzBrowser
......@@ -37,32 +38,68 @@ class PluzzModule(Module, CapVideo, CapCollection):
LICENSE = 'AGPLv3+'
BROWSER = PluzzBrowser
def get_video(self, _id):
return self.browser.get_video(_id)
def get_video(self, _id, video=None):
    """Build or complete a video object from what ``video_info`` returns.

    :param _id: value handed to ``video_info`` and, when no ``video`` is
        supplied, to the ``BaseVideo`` constructor.
    :param video: optional existing object to complete; a new ``BaseVideo``
        is created when it is falsy.
    :return: the completed video, or ``None`` when ``video_info`` returns a
        falsy value.
    """
    target = video if video else BaseVideo(_id)

    fetched = video_info(_id)
    if not fetched:
        return None

    target.ext = u'm3u8'
    # Copy only the fields that carry a value in the fetched object and are
    # still empty on the target, so already-known values are never replaced.
    for name, value in fetched.iter_fields():
        if empty(value):
            continue
        if empty(getattr(target, name)):
            setattr(target, name, value)
    return target
# Delegate the search to the browser.
# Only `pattern` is forwarded; `sortby` and `nsfw` are accepted but not used here.
def search_videos(self, pattern, sortby=CapVideo.SEARCH_RELEVANCE, nsfw=False):
return self.browser.search_videos(pattern)
def fill_video(self, video, fields):
if 'url' in fields:
video = self.browser.get_video(video.id)
if 'thumbnail' in fields and video.thumbnail:
video = self.get_video(video.id, video)
if video and 'thumbnail' in fields and video.thumbnail:
video.thumbnail.data = self.browser.open(video.thumbnail.url).content
return video
def iter_resources(self, objs, split_path):
if BaseVideo in objs:
collection = self.get_collection(objs, split_path)
if collection.path_level == 0:
yield Collection([u'videos'], u'Vidéos')
for category in self.browser.get_categories():
yield category
elif collection.path_level == 1 and collection.split_path[0].startswith('vid_'):
cat = find_object(self.browser.get_categories(), id=collection.split_path[0], error=None)
for video in self.browser.iter_videos(cat.title):
yield video
if category.path_level == 1:
yield category
else:
for cat in self.browser.iter_subcategories(collection.split_path):
yield cat
if split_path[-1] == u'videos':
for v in self.browser.iter_videos("/".join(collection.split_path[:-1])):
yield v
elif split_path[-1].endswith('-video'):
v = BaseVideo(
"{}/{}".format(self.browser.BASEURL,
"/".join(collection.split_path).replace('-video', '.html')))
v.title = split_path[-1].replace('-video', '')
yield v
else:
iter = 0
for category in self.browser.get_categories("/".join(collection.split_path)):
if category.path_level == collection.path_level + 1 and \
category.split_path[0] == collection.split_path[0]:
iter = iter + 1
yield category
if iter > 0:
yield Collection(split_path + [u'videos'], u'Vidéos')
else:
for v in self.browser.iter_videos("/".join(collection.split_path).replace('-videos', '.html')):
yield v
def validate_collection(self, objs, collection):
if collection.path_level <= 2:
......
......@@ -19,21 +19,16 @@
from __future__ import unicode_literals
import re
import hashlib
from datetime import datetime, timedelta
from weboob.capabilities.base import NotAvailable
from weboob.capabilities.file import LICENSES
from weboob.capabilities.image import Thumbnail
from weboob.capabilities.video import BaseVideo
from weboob.capabilities.collection import Collection
from weboob.browser.pages import HTMLPage, JsonPage
from weboob.browser.elements import ItemElement, ListElement, method
from weboob.browser.filters.standard import CleanText, Regexp, Format, DateTime, Duration, Date, Eval, Env, Field
from weboob.browser.filters.html import Attr, AbsoluteLink, CleanHTML
from weboob.browser.elements import ItemElement, ListElement, method, DictElement
from weboob.browser.filters.standard import CleanText, Regexp, Format, Field, Eval
from weboob.browser.filters.html import CleanHTML
from weboob.browser.filters.json import Dict
......@@ -41,126 +36,70 @@ def parse_duration(text):
return timedelta(seconds=int(text) * 60)
class SearchPage(HTMLPage):
class SearchPage(JsonPage):
@method
class iter_videos(ListElement):
item_xpath = '//section[h1[ends-with(text(), "vidéos")]]/ul/li'
class iter_videos(DictElement):
item_xpath = 'results/0/hits'
class item(ItemElement):
klass = BaseVideo
def parse(self, el):
    """Fill the ``infos`` and ``title`` environment entries for this item."""
    self.env['infos'] = CleanText('.//h3/following-sibling::p[contains(text()," min")]')(self)

    heading = CleanText('.//h3/a')(self)
    first_paragraph = CleanText('.//h3/following-sibling::p[1]')(self)

    # When the first paragraph after the heading matches "<digit> min", the
    # title is the heading link text alone; otherwise both are joined.
    if re.search(r'\d min', first_paragraph) is None:
        self.env['title'] = '%s - %s' % (heading, first_paragraph)
    else:
        self.env['title'] = heading
obj_id = AbsoluteLink('.//a')
# obj__number = Attr('./div[@class="card-content"]//a', 'data-video-content')
obj_title = Env('title')
obj_thumbnail = Eval(Thumbnail, Format('https:%s', Attr('./a//img', 'data-src')))
obj_date = Date(Regexp(Env('infos'), r'\| (\d+\.\d+\.\d+) \|',
default=NotAvailable),
dayfirst=True, default=NotAvailable)
obj_duration = Eval(parse_duration, Regexp(Env('infos'), r'(\d+) min'))
class VideoWebPage(HTMLPage):
def get_number(self):
    """Return the ``data-main-video`` attribute of the ``div`` whose id is ``player``."""
    main_video = Attr('//div[@id="player"]', 'data-main-video')
    return main_video(self.doc)
@method
class get_video(ItemElement):
obj_title = CleanText('//article[@id="description"]//h1')
obj_description = CleanText('//article[@id="description"]//section/following-sibling::div')
obj_date = DateTime(Regexp(
CleanText('//article[@id="description"]//span[contains(text(),"diffusé le")]'),
r'(\d{2})\.(\d{2})\.(\d{2}) à (\d{2})h(\d{2})', r'20\3/\2/\1 \4:\5'))
obj_duration = Eval(parse_duration, Regexp(CleanText('//div[span[text()="|"]]'), r'| (\d+)min'))
obj_thumbnail = Eval(Thumbnail, Format('https:%s', Attr('//div[@id="playerPlaceholder"]//img', 'data-src')))
obj__number = Attr('//div[@id="player"]', 'data-main-video')
obj_license = LICENSES.COPYRIGHT
class VideoJsonPage(JsonPage):
@method
class get_video(ItemElement):
klass = BaseVideo
obj_id = Format(r"https://www.france.tv/%s/%s-%s.html", Dict('path'), Dict('id'), Dict('url_page'))
obj_title = Format(u'%s - %s', Dict['titre'], Dict['sous_titre'])
obj_date = Eval(datetime.fromtimestamp, Dict('diffusion/timestamp'))
obj_duration = Dict['duree'] & Duration
obj_description = Dict['synopsis']
obj_ext = u'm3u8'
obj_title = CleanText(Dict('title'))
obj_thumbnail = Eval(Thumbnail,
Format(r'https://www.france.tv%s',
Dict('image/formats/vignette_16x9/urls/w:1024')))
obj__uuid = Dict['id']
obj_license = LICENSES.COPYRIGHT
def obj_date(self):
    """Convert the ``dates/first_publication_date`` value with ``datetime.fromtimestamp``."""
    published_at = Dict('dates/first_publication_date')(self)
    return datetime.fromtimestamp(published_at)
def obj_url(self):
    """Return ``url_secure`` of the first ``m3u8-download`` entry in the page document.

    :return: the URL, or ``None`` when no entry of ``doc['videos']`` has
        that format.
    """
    for entry in self.page.doc['videos']:
        if entry['format'] == 'm3u8-download':
            return entry['url_secure']
    return None
def obj_duration(self):
    """Return the ``duration`` value as a ``timedelta`` expressed in seconds."""
    seconds = Dict('duration')(self)
    return timedelta(seconds=seconds)
obj_thumbnail = Eval(Thumbnail, Dict['image_secure'])
# Return the object's `url` attribute as the validation result.
# NOTE(review): presumably a falsy value makes the caller discard the object - confirm against the element base class.
def validate(self, obj):
return obj.url
class HomePage(HTMLPage):
def get_params(self):
    """Extract the Algolia application id and API key from the page scripts.

    The regexp captures both values, the template joins them with ``|``,
    and the joined string is split back on ``|``.

    :return: the list produced by that split.
    """
    pattern = '"algolia_app_id":"(.*)","algolia_api_key":"(.*)","algolia_api_index_taxonomy".*'
    template = '\\1|\\2'
    extractor = Regexp(CleanText('//script'), pattern, template)
    return extractor(self.doc).split('|')
class HomePage(HTMLPage):
@method
class iter_categories(ListElement):
item_xpath = '//h1'
class item(ItemElement):
klass = Collection
# Compute the id of a category item, from its link when possible and from its text otherwise.
# NOTE(review): indentation was lost in this view; the statements after the `if` line
# presumably all belong to the fallback branch - confirm against the real file.
def obj_id(self):
# Capture the part of the link href located after "//www.france.tv/" and before the last "/".
id = Regexp(CleanText('./a/@href'), '//www.france.tv/(.*)/', default=None)(self)
if not id:
# No usable href: start from the element's own text instead.
id = CleanText('.')(self)
# Encode to ASCII ignoring errors, hash with MD5, then prefix with "vid_".
id = id.encode('ascii', 'ignore')
id = hashlib.md5(id).hexdigest()
id = u'vid_%s' % id
return id
ignore_duplicate = True
obj_title = CleanText('.')
# The split path is a one-element list holding this item's `id` field.
def obj_split_path(self):
return [Field('id')(self)]
@method
class iter_subcategories(ListElement):
item_xpath = '//h2[has-class("title-wall")]'
item_xpath = '//li[has-class("nav-item")]/a'
class item(ItemElement):
klass = Collection
obj_id = Regexp(CleanText('./a/@href'), '//www.france.tv/.*/(.*)/', default=None)
# Return the result of matching the element's href against "//www.france.tv/(.*)";
# `default=False` is supplied for the case where the pattern does not match.
def condition(self):
return Regexp(CleanText('./@href'), '//www.france.tv/(.*)', default=False)(self)
def obj_id(self):
    """Derive the item id from the element's href.

    ``.html`` is replaced by ``-video/`` in the href, the part following
    ``//www.france.tv/`` is captured, and its last character is removed.
    """
    href = CleanText('./@href', replace=[('.html', '-video/')])
    captured = Regexp(href, '//www.france.tv/(.*)', "\\1", default=None)(self)
    # Drop the final character of the captured path.
    return captured[:-1]
obj_title = CleanText('.')
def obj_split_path(self):
cat = Env('cat')(self)
cat.append(Field('id')(self))
return cat
return Field('id')(self).split('/')
@method
class iter_videos(ListElement):
def parse(self, el):
self.item_xpath = self.page.item_xpath
self.item_xpath = u'//a[@data-video]'
class item(ItemElement):
klass = BaseVideo
obj_id = Format('https:%s', CleanText('./a/@href'))
obj_title = CleanText(CleanHTML('./a/div[@class="card-content"]|./div[has-class("card-content")]'))
obj_id = Format('https:%s', CleanText('./@href'))
obj_title = CleanText(CleanHTML('./div[@class="card-content"]|./div[has-class("card-content")]'))
# Return the item's `title` field as the condition value.
# NOTE(review): presumably items with an empty title are skipped by the caller - confirm against the element base class.
def condition(self):
return Field('title')(self)
......
......@@ -36,7 +36,7 @@ class PluzzTest(BackendTest):
cat = list(self.backend.iter_resources([BaseVideo], []))
self.assertTrue(len(cat) > 0)
for c in cat:
if c.split_path[-1].startswith('vid_'):
if c.split_path[-1] == u'videos':
videos = list(self.backend.iter_resources([BaseVideo], c.split_path))
self.assertTrue(len(videos) > 0)
v = videos[0]
......