Commit cf45e256 authored by ntome's avatar ntome Committed by Romain Bignon

[francetelevisions] site changed

Some features, such as emission search, have not been implemented yet, though.
parent e2799435
......@@ -17,67 +17,41 @@
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from __future__ import unicode_literals
from weboob.browser import PagesBrowser, URL
from .pages import IndexPage, VideoPage, Programs, VideoListPage, LatestPage, FrancetvinfoPage
from weboob.exceptions import BrowserHTTPNotFound
from .pages import SearchPage, VideoWebPage, VideoJsonPage
__all__ = ['PluzzBrowser']
class PluzzBrowser(PagesBrowser):
ENCODING = 'utf-8'
BASEURL = 'http://pluzz.francetv.fr'
BASEURL = 'https://www.france.tv'
PROGRAMS = None
francetvinfo = URL(r'http://www.francetvinfo.fr/(?P<url>.*)', FrancetvinfoPage)
latest = URL(r'http://pluzz.webservices.francetelevisions.fr/pluzz/liste/type/replay', LatestPage)
programs_page = URL(r'http://pluzz.webservices.francetelevisions.fr/pluzz/programme', Programs)
index_page = URL(r'recherche\?recherche=(?P<pattern>.*)', IndexPage)
video_page = URL(r'http://webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/\?idDiffusion=(?P<id>.*)&catalogue=Pluzz',
VideoPage)
videos_list_page = URL(r'(?P<program>videos/.*)', VideoListPage)
def get_video_id_from_francetvinfo(self, url):
    """Load the francetvinfo page for ``url`` and return the video id it exposes."""
    info_page = self.francetvinfo.go(url=url)
    return info_page.get_video_id_from_francetvinfo()
def get_video_from_url(self, url):
    """Return the full video for the last video listed on program page ``url``.

    Returns None when the list page yields no (truthy) last video.
    """
    list_page = self.videos_list_page.go(program=url)
    last_video = list_page.get_last_video()
    if not last_video:
        return None
    return self.get_video(last_video.id, last_video)
def search_videos(self, pattern):
if not self.PROGRAMS:
self.PROGRAMS = list(self.get_program_list())
search_page = URL(r'/recherche/', SearchPage)
video = URL(r'/.+/(?P<number>\d+)-[^/]+.html$', VideoWebPage)
video_json = URL(r'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/\?idDiffusion=(?P<number>.+)$', VideoJsonPage)
videos = []
for program in self.PROGRAMS:
if pattern.upper() in program._title.upper():
video = self.videos_list_page.go(program=program.id).get_last_video()
if video:
videos.append(video)
videos += list(self.page.iter_videos())
def search_videos(self, s):
    """Query the search page with ``s`` and return the resulting page's videos.

    :param s: search string, sent as the ``q`` query parameter
    """
    search_url = self.search_page.build()
    query = {'q': s}
    self.location(search_url, params=query)
    return self.page.iter_videos()
return videos if len(videos) > 0 else self.index_page.go(pattern=pattern).iter_videos()
def get_video(self, id):
self.location(id)
number = self.page.get_number()
def get_program_list(self):
    """Load the programs page and return its programs as a list."""
    page = self.programs_page.go()
    programs = page.iter_programs()
    return list(programs)
@video_page.id2url
def get_video(self, url, video=None):
self.location(url)
video = self.page.get_video(obj=video)
try:
self.video_json.go(number=number)
except BrowserHTTPNotFound:
self.logger.warning('video info not found, probably needs payment')
return
video = self.page.get_video()
if not video:
self.logger.debug('video info not found, maybe not available?')
return
video.id = id
for item in self.read_url(video.url):
video.url = u'%s' % item
return video
def read_url(self, url):
    """Open ``url`` in streaming mode and return the response's ``iter_lines()``."""
    response = self.open(url, stream=True)
    return response.iter_lines()
def latest_videos(self):
    """Load the ``latest`` URL and return the videos iterated from that page."""
    latest_page = self.latest.go()
    return latest_page.iter_videos()
......@@ -24,7 +24,6 @@ from weboob.tools.backend import Module
from .browser import PluzzBrowser
import re
__all__ = ['PluzzModule']
......@@ -39,28 +38,16 @@ class PluzzModule(Module, CapVideo, CapCollection):
BROWSER = PluzzBrowser
def get_video(self, _id):
m = re.match('http://pluzz.francetv.fr/(videos/.*)', _id)
if m:
return self.browser.get_video_from_url(m.group(1))
m2 = re.match('http://www.francetvinfo.fr/(.*)', _id)
if m2:
_id = self.browser.get_video_id_from_francetvinfo(m2.group(1))
if not _id:
return
return self.browser.get_video(_id)
def search_videos(self, pattern, sortby=CapVideo.SEARCH_RELEVANCE, nsfw=False):
    """Delegate a video search for ``pattern`` to the browser.

    ``sortby`` and ``nsfw`` are accepted but not forwarded to the browser.
    """
    browser = self.browser
    return browser.search_videos(pattern)
def fill_video(self, video, fields):
if fields != ['thumbnail']:
# if we don't want only the thumbnail, we probably want also every fields
video = self.browser.get_video(video.id, video)
if 'url' in fields:
video = self.browser.get_video(video.id)
if 'thumbnail' in fields and video.thumbnail:
video.thumbnail.data = self.browser.open(video.thumbnail.url).content
return video
def iter_resources(self, objs, split_path):
......
......@@ -17,143 +17,81 @@
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from __future__ import unicode_literals
from weboob.capabilities.file import LICENSES
from weboob.capabilities.image import Thumbnail
from weboob.capabilities.video import BaseVideo
from weboob.capabilities.base import BaseObject
from datetime import timedelta
from datetime import datetime
from weboob.browser.pages import HTMLPage, JsonPage
from weboob.browser.elements import ItemElement, ListElement, DictElement, method
from weboob.browser.filters.standard import Filter, CleanText, Regexp, Format, DateTime, Env, Duration
from weboob.browser.filters.html import Link, Attr
from weboob.browser.elements import ItemElement, ListElement, method
from weboob.browser.filters.standard import CleanText, Regexp, Format, DateTime, Duration, Date, Eval
from weboob.browser.filters.html import Attr, AbsoluteLink
from weboob.browser.filters.json import Dict
class DurationPluzz(Filter):
    """Filter converting the duration fragment of a result into a timedelta.

    The text following the last ``|`` of the first selected node is read
    either as ``N'`` (a number of minutes followed by a quote) or as a
    colon-separated ``hours:minutes`` value.
    """

    def filter(self, el):
        """Return a ``timedelta`` built from the hours/minutes found in ``el[0]``."""
        # The pattern is greedy, so only what follows the last '|' is kept.
        duration = Regexp(CleanText('.'), r'.+\|(.+)')(el[0])
        if duration[-1:] == "'":
            # Minutes-only form: no hours component.
            t = [0, int(duration[:-1])]
        else:
            # Build a real list: on Python 3 ``map`` returns an iterator,
            # which cannot be indexed with t[0] / t[1] below.
            t = [int(part) for part in duration.split(':')]
        return timedelta(hours=t[0], minutes=t[1])
class FrancetvinfoPage(HTMLPage):
    """HTML page from which a video id is read out of the "catchup" link."""

    def get_video_id_from_francetvinfo(self):
        """Return the id captured from the ``a#catchup`` href.

        The ``Regexp`` filter is given ``default=None`` for the no-match case.
        """
        catchup_href = CleanText('//a[@id="catchup"]/@href')
        extract_id = Regexp(catchup_href,
                            'http://info.francetelevisions.fr/\?id-video=(.*)@Info-web',
                            default=None)
        return extract_id(self.doc)
def parse_duration(text):
    """Interpret ``text`` as an integer and return that value multiplied by 60."""
    minutes = int(text)
    return minutes * 60
class VideoListPage(HTMLPage):
@method
class get_last_video(ItemElement):
klass = BaseVideo
obj_id = CleanText('//div[@id="diffusion-info"]/@data-diffusion')
obj_title = CleanText('//div[@id="diffusion-info"]/h1/div[@id="diffusion-titre"]')
obj_date = DateTime(Regexp(CleanText('//div[@id="diffusion-info"]/h1|//div[@id="diffusion-info"]/div/div/*[1]',
replace=[(u'à', u''), (u' ', u' ')]),
'.+(\d{2}-\d{2}-\d{2}.+\d{1,2}h\d{1,2}).+'),
dayfirst=True)
class SearchPage(HTMLPage):
@method
class iter_videos(ListElement):
item_xpath = '//div[@id="player-memeProgramme"]/a'
item_xpath = '//section[h1[ends-with(text(), "vidéos")]]/ul/li'
class item(ItemElement):
klass = BaseVideo
def condition(self):
return CleanText('div[@class="autre-emission-c3"]')(self) == "En replay"
obj_id = AbsoluteLink('.//a')
#~ obj__number = Attr('./div[@class="card-content"]//a', 'data-video-content')
obj_id = Regexp(Link('.'), '^/videos/.+,(.+).html$')
obj_title = CleanText('//meta[@name="programme_titre"]/@content')
obj_date = DateTime(Regexp(CleanText('./div[@class="autre-emission-c2"]|./div[@class="autre-emission-c4"]',
replace=[(u'à', u''), (u' ', u' ')]),
'(\d{2}-\d{2}.+\d{1,2}:\d{1,2})'),
dayfirst=True)
obj_title = Format('%s - %s', CleanText('.//h3/a'), CleanText('.//h3/following-sibling::p[1]'))
obj_thumbnail = Eval(Thumbnail, Format('https:%s', Attr('./a//img', 'data-src')))
_infos = CleanText('.//h3/following-sibling::p[2]')
obj_date = Date(Regexp(_infos, r'\| (\d+\.\d+\.\d+) \|'), dayfirst=True)
obj_duration = Eval(parse_duration, Regexp(_infos, r' \| (\d+) min'))
class IndexPage(HTMLPage):
@method
class iter_videos(ListElement):
item_xpath = '//div[@class="panel-resultat panel-separateur"]'
ignore_duplicate = True
class item(ItemElement):
klass = BaseVideo
class VideoWebPage(HTMLPage):
def get_number(self):
    """Return the ``data-main-video`` attribute of the ``div`` with id ``player``."""
    main_video = Attr('//div[@id="player"]', 'data-main-video')
    return main_video(self.doc)
obj_title = Format('%s du %s',
CleanText('div/div[@class="resultat-titre-diff"]/a'),
Regexp(CleanText('div/div[@class="resultat-soustitre-diff"]'),
'.+(\d{2}-\d{2}-\d{2}).+'))
obj_id = Regexp(Link('div/div[@class="resultat-titre-diff"]/a'),
'^/videos/.+,(.+).html$')
obj_date = DateTime(Regexp(CleanText('div/div[@class="resultat-soustitre-diff"]',
replace=[(u'à', u''), (u' ', u' ')]),
'.+(\d{2}-\d{2}-\d{2}.+\d{1,2}h\d{1,2}).+'))
obj_duration = DurationPluzz('div/div[3]')
def obj_thumbnail(self):
url = Attr('a/img[@class="resultat-vignette"]', 'data-src')(self)
thumbnail = Thumbnail(url)
thumbnail.url = thumbnail.id
return thumbnail
class VideoPage(JsonPage):
@method
class get_video(ItemElement):
klass = BaseVideo
obj_title = CleanText('//article[@id="description"]//h1')
obj_description = CleanText('//article[@id="description"]//section/following-sibling::div')
def validate(self, obj):
return obj.url
obj_date = DateTime(Regexp(
CleanText('//article[@id="description"]//span[contains(text(),"diffusé le")]'),
r'(\d{2})\.(\d{2})\.(\d{2}) à (\d{2})h(\d{2})', r'20\3/\2/\1 \4:\5'))
obj_duration = Eval(parse_duration, Regexp(CleanText('//div[span[text()="|"]]'), r'| (\d+)min'))
def parse(self, el):
for video in el['videos']:
if video['format'] != 'm3u8-download':
continue
obj_thumbnail = Eval(Thumbnail, Format('https:%s', Attr('//div[@id="playerPlaceholder"]//img', 'data-src')))
obj__number = Attr('//div[@id="player"]', 'data-main-video')
obj_license = LICENSES.COPYRIGHT
self.env['url'] = video['url']
obj_id = Env('id')
class VideoJsonPage(JsonPage):
@method
class get_video(ItemElement):
klass = BaseVideo
obj_title = Format(u'%s - %s', Dict['titre'], Dict['sous_titre'])
obj_url = Env('url')
obj_date = Dict['diffusion']['date_debut'] & DateTime
obj_date = Eval(datetime.fromtimestamp, Dict('diffusion/timestamp'))
obj_duration = Dict['duree'] & Duration
obj_description = Dict['synopsis']
obj_ext = u'm3u8'
def obj_thumbnail(self):
url = Format('http://www.francetv.fr%s', Dict['image'])(self)
thumbnail = Thumbnail(url)
thumbnail.url = thumbnail.id
return thumbnail
class Programs(JsonPage):
@method
class iter_programs(DictElement):
item_xpath = 'reponse/programme'
class item(ItemElement):
klass = BaseObject
obj_id = CleanText(Dict('url'))
obj__title = CleanText(Dict('titre_programme'))
obj__uuid = Dict['id']
obj_license = LICENSES.COPYRIGHT
def obj_url(self):
return next((v['url_secure'] for v in self.page.doc['videos'] if v['format'] == 'm3u8-download'), None)
class LatestPage(JsonPage):
@method
class iter_videos(DictElement):
item_xpath = 'reponse/emissions'
obj_thumbnail = Eval(Thumbnail, Dict['image_secure'])
class Item(ItemElement):
klass = BaseVideo
obj_id = Dict('id_diffusion')
obj_title = Dict('titre_programme')
obj_date = DateTime(Dict('date_diffusion'))
def validate(self, obj):
return obj.url
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment