Commit 4ed35a0b, authored by Antoine BOSSY and committed by Vincent A

[podnapisi] Fix module due to new website and migrate to browser2

parent aa9ede1b
......@@ -18,34 +18,26 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.deprecated.browser import Browser, BrowserHTTPNotFound
from .pages import SearchPage, SubtitlePage, LANGUAGE_NUMBERS
from weboob.browser import PagesBrowser, URL
from .pages import SearchPage, SubtitlePage
__all__ = ['PodnapisiBrowser']
class PodnapisiBrowser(Browser):
DOMAIN = 'www.podnapisi.net'
PROTOCOL = 'http'
ENCODING = 'utf-8'
USER_AGENT = Browser.USER_AGENTS['wget']
PAGES = {
'http://www.podnapisi.net/fr/ppodnapisi/search\?sJ=[0-9]*&sK=.*&sS=downloads&sO=desc': SearchPage,
'http://www.podnapisi.net/fr/ppodnapisi/podnapis/i/[0-9]*': SubtitlePage
}
class PodnapisiBrowser(PagesBrowser):
BASEURL = 'https://www.podnapisi.net'
search = URL('/subtitles/search/advanced\?keywords=(?P<keywords>.*)&language=(?P<language>.*)',
'/en/subtitles/search/advanced\?keywords=(?P<keywords>.*)&language=(?P<language>.*)',
SearchPage)
file = URL('/subtitles/(?P<id>-*\w*)/download')
subtitle = URL('/subtitles/(?P<id>.*)', SubtitlePage)
def iter_subtitles(self, language, pattern):
nlang = LANGUAGE_NUMBERS[language]
self.location('http://www.podnapisi.net/fr/ppodnapisi/search?sJ=%s&sK=%s&sS=downloads&sO=desc' % (nlang, pattern.encode('utf-8')))
assert self.is_on_page(SearchPage)
return self.page.iter_subtitles(unicode(language))
return self.search.go(language=language, keywords=pattern).iter_subtitles()
def get_file(self, id):
return self.file.go(id=id).content
def get_subtitle(self, id):
try:
self.location('http://www.podnapisi.net/fr/ppodnapisi/podnapis/i/%s' % id)
except BrowserHTTPNotFound:
return
if self.is_on_page(SubtitlePage):
return self.page.get_subtitle(id)
return self.subtitle.go(id=id).get_subtitle()
......@@ -17,7 +17,7 @@
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.capabilities.subtitle import CapSubtitle, LanguageNotSupported, Subtitle
from weboob.capabilities.subtitle import CapSubtitle, LanguageNotSupported
from weboob.applications.suboob.suboob import LANGUAGE_CONV
from weboob.tools.backend import Module
from weboob.tools.compat import quote_plus
......@@ -37,29 +37,13 @@ class PodnapisiModule(Module, CapSubtitle):
LICENSE = 'AGPLv3+'
BROWSER = PodnapisiBrowser
def get_subtitle(self, id):
return self.browser.get_subtitle(id)
def get_subtitle_file(self, id):
subtitle = self.browser.get_subtitle(id)
if not subtitle:
return None
return self.browser.get_file(id)
return self.browser.openurl(subtitle.url.encode('utf-8')).read()
def get_subtitle(self, id):
return self.browser.get_subtitle(id)
def iter_subtitles(self, language, pattern):
if language not in LANGUAGE_CONV.keys():
if language not in list(LANGUAGE_CONV.keys()):
raise LanguageNotSupported()
return self.browser.iter_subtitles(language, quote_plus(pattern.encode('utf-8')))
def fill_subtitle(self, subtitle, fields):
if 'description' in fields or 'url' in fields:
sub = self.get_subtitle(subtitle.id)
subtitle.description = sub.description
subtitle.url = sub.url
return subtitle
OBJECTS = {
Subtitle: fill_subtitle,
}
......@@ -16,121 +16,54 @@
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from __future__ import unicode_literals
from weboob.browser.elements import TableElement, ItemElement, method
from weboob.browser.pages import HTMLPage, pagination
from weboob.browser.filters.html import TableCell, AbsoluteLink, Attr
from weboob.browser.filters.standard import CleanText, Field, Type, Regexp
from weboob.capabilities.subtitle import Subtitle
from weboob.capabilities.base import NotAvailable, NotLoaded
from weboob.deprecated.browser import Page
from weboob.tools.compat import urljoin
LANGUAGE_NUMBERS = {
'sq': '29',
'de': '5',
'en': '2',
'ar': '12',
'bn': '59',
'be': '50',
'bg': '33',
'ca': '53',
'zh': '17',
'ko': '4',
'hr': '38',
'da': '24',
'es': '28',
'et': '20',
'fi': '31',
'fr': '8',
'gr': '16',
'hi': '42',
'nl': '23',
'hu': '15',
'iw': '22',
'id': '54',
'ga': '49',
'is': '6',
'it': '9',
'ja': '11',
'lv': '21',
'lt': '19',
'mk': '35',
'ms': '55',
'no': '3',
'pl': '26',
'pt': '32',
'ro': '13',
'ru': '27',
'sr': '36',
'sk': '37',
'sl': '1',
'sv': '25',
'cz': '7',
'th': '44',
'tr': '30',
'uk': '46',
'vi': '51'
}
class SearchPage(Page):
class SearchPage(HTMLPage):
""" Page which contains results as a list of movies
"""
@pagination
@method
class iter_subtitles(TableElement):
head_xpath = '//div[has-class("table-responsive")]/table/thead/tr/th'
item_xpath = '//tr[has-class("subtitle-entry")]'
def iter_subtitles(self, language):
linksresults = self.parser.select(self.document.getroot(), 'a.subtitle_page_link')
for link in linksresults:
id = unicode(link.attrib.get('href', '').split('-p')[-1])
name = unicode(link.text_content())
tr = link.getparent().getparent().getparent()
cdtd = self.parser.select(tr, 'td')[4]
nb_cd = int(cdtd.text)
description = NotLoaded
subtitle = Subtitle(id, name)
subtitle.nb_cd = nb_cd
subtitle.language = language
subtitle.description = description
yield subtitle
col_cd = u'# CDs'
col_language = u'Language'
next_page = AbsoluteLink('//ul[has-class("pagination")]/li[has-class("next")]/a', default=None)
class SubtitlePage(Page):
""" Page which contains a single subtitle for a movie
"""
class item(ItemElement):
klass = Subtitle
def get_subtitle(self, id):
language = NotAvailable
url = NotAvailable
nb_cd = NotAvailable
links_info = self.parser.select(self.document.getroot(), 'fieldset.information a')
for link in links_info:
href = link.attrib.get('href', '')
if '/fr/ppodnapisi/kategorija/jezik/' in href:
nlang = href.split('/')[-1]
for lang, langnum in LANGUAGE_NUMBERS.items():
if str(langnum) == str(nlang):
language = unicode(lang)
break
obj_name = CleanText('.//td/a[@alt="Subtitles\' page"]')
obj_nb_cd = Type(CleanText(TableCell('cd')), type=int)
obj_language = CleanText(TableCell('language'))
obj_url = AbsoluteLink('.//td/div[has-class("pull-left")]/a[@alt="Download subtitles."]')
obj_id = Regexp(Field('url'), r'/(-*\w*)/download$', r'\1')
desc = u''
infos = self.parser.select(self.document.getroot(), 'fieldset.information')
for info in infos:
for p in self.parser.select(info, 'p'):
desc += '%s\n' % (u' '.join(p.text_content().strip().split()))
spans = self.parser.select(info, 'span')
for span in spans:
if span.text is not None and 'CD' in span.text:
nb_cd = int(self.parser.select(span.getparent(), 'span')[1].text)
title = unicode(self.parser.select(self.document.getroot(), 'head title', 1).text)
name = title.split(' - ')[0]
class SubtitlePage(HTMLPage):
@method
class get_subtitle(ItemElement):
klass = Subtitle
dllinks = self.parser.select(self.document.getroot(), 'div.footer > a.download')
for link in dllinks:
href = link.attrib.get('href', '')
if id in href:
url = u'http://www.podnapisi.net%s' % href
obj_id = CleanText('//div[has-class("col-md-3")]/table[has-class("table-condensed")]/tr[1]/td')
obj_language = Regexp(
CleanText(
Attr('//div[has-class("col-md-3")]/table[has-class("table-condensed")]/tr/td/a/span', 'class')
),
r'-(\w*)$', r'\1'
)
obj_name = CleanText('//div[has-class("clearfix")]/table[has-class("table-condensed")]/tr[1]/td/a')
subtitle = Subtitle(id, name)
subtitle.url = url
subtitle.language = language
subtitle.nb_cd = nb_cd
subtitle.description = desc
return subtitle
def obj_url(self):
return urljoin(self.page.browser.BASEURL,
CleanText(Attr('//form[has-class("download-form")]', 'action'))(self))
......@@ -28,14 +28,13 @@ class PodnapisiTest(BackendTest):
def test_subtitle(self):
lsub = []
subtitles = self.backend.iter_subtitles('fr', 'spiderman')
for i in range(5):
subtitle = subtitles.next()
for subtitle in subtitles:
lsub.append(subtitle)
assert (len(lsub) > 0)
# get the file of a random sub
if len(lsub):
subtitle = choice(lsub)
self.backend.get_subtitle_file(subtitle.id)
assert(not self.backend.get_subtitle_file(subtitle.id).startswith(b'<'))
ss = self.backend.get_subtitle(subtitle.id)
assert ss.url.startswith('http')
assert ss.url.startswith('https')
......@@ -94,6 +94,7 @@ parolesmania
pastebin
pastealacon
pixtoilelibre
podnapisi
popolemploi
pornhub
ratp
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment