Commit db00bea2 authored by Vincent A

[imdb] fix, port to browser2 and python3

parent fd49216c
......@@ -17,13 +17,20 @@
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import re
from __future__ import unicode_literals
from HTMLParser import HTMLParser
from weboob.deprecated.browser import Browser, BrowserHTTPNotFound
import re
try:
from HTMLParser import HTMLParser
except ImportError:
from html.parser import HTMLParser
from weboob.browser import PagesBrowser, URL
from weboob.browser.profiles import Wget
from weboob.exceptions import BrowserHTTPNotFound
from weboob.capabilities.base import NotAvailable, NotLoaded
from weboob.capabilities.cinema import Movie, Person
from weboob.tools.json import json
from weboob.tools.compat import unicode
from .pages import PersonPage, MovieCrewPage, BiographyPage, ReleasePage
......@@ -32,21 +39,18 @@ from datetime import datetime
__all__ = ['ImdbBrowser']
class ImdbBrowser(Browser):
DOMAIN = 'www.imdb.com'
PROTOCOL = 'http'
ENCODING = 'utf-8'
USER_AGENT = Browser.USER_AGENTS['wget']
PAGES = {
'http://www.imdb.com/title/tt[0-9]*/fullcredits.*': MovieCrewPage,
'http://www.imdb.com/title/tt[0-9]*/releaseinfo.*': ReleasePage,
'http://www.imdb.com/name/nm[0-9]*/*': PersonPage,
'http://www.imdb.com/name/nm[0-9]*/bio.*': BiographyPage,
}
class ImdbBrowser(PagesBrowser):
BASEURL = 'http://www.imdb.com'
PROFILE = Wget()
movie_crew = URL(r'/title/tt[0-9]*/fullcredits.*', MovieCrewPage)
release = URL(r'/title/tt[0-9]*/releaseinfo.*', ReleasePage)
bio = URL(r'/name/nm[0-9]*/bio.*', BiographyPage)
person = URL(r'/name/nm[0-9]*/*', PersonPage)
def iter_movies(self, pattern):
res = self.readurl('http://www.imdb.com/xml/find?json=1&nr=1&tt=on&q=%s' % pattern.encode('utf-8'))
jres = json.loads(res)
res = self.open('http://www.imdb.com/xml/find?json=1&nr=1&tt=on', params={'q': pattern})
jres = res.json()
htmlparser = HTMLParser()
for cat in ['title_popular', 'title_exact', 'title_approx']:
if cat in jres:
......@@ -71,8 +75,8 @@ class ImdbBrowser(Browser):
yield movie
def iter_persons(self, pattern):
res = self.readurl('http://www.imdb.com/xml/find?json=1&nr=1&nm=on&q=%s' % pattern.encode('utf-8'))
jres = json.loads(res)
res = self.open('http://www.imdb.com/xml/find?json=1&nr=1&nm=on', params={'q': pattern})
jres = res.json()
htmlparser = HTMLParser()
for cat in ['name_popular', 'name_exact', 'name_approx']:
if cat in jres:
......@@ -91,9 +95,9 @@ class ImdbBrowser(Browser):
yield person
def get_movie(self, id):
res = self.readurl('http://www.omdbapi.com/?i=%s&plot=full' % id)
res = self.open('http://www.omdbapi.com/?apikey=b7c56eb5&i=%s&plot=full' % id)
if res is not None:
jres = json.loads(res)
jres = res.json()
else:
return None
htmlparser = HTMLParser()
......@@ -178,38 +182,38 @@ class ImdbBrowser(Browser):
self.location('http://www.imdb.com/name/%s' % id)
except BrowserHTTPNotFound:
return
assert self.is_on_page(PersonPage)
assert self.person.is_here()
return self.page.get_person(id)
def get_person_biography(self, id):
    """Fetch and return the biography text of person `id` (an IMDb nm… identifier)."""
    self.location('http://www.imdb.com/name/%s/bio' % id)
    # browser2 port: URL objects carry the page match, replacing is_on_page()
    assert self.bio.is_here()
    return self.page.get_biography()
def iter_movie_persons(self, movie_id, role):
    """Yield Person objects credited on movie `movie_id`, filtered by `role` (None = all)."""
    self.location('http://www.imdb.com/title/%s/fullcredits' % movie_id)
    # browser2 port: URL objects carry the page match, replacing is_on_page()
    assert self.movie_crew.is_here()
    for p in self.page.iter_persons(role):
        yield p
def iter_person_movies(self, person_id, role):
    """Return an iterator of Movie objects person `person_id` is credited on, filtered by `role`."""
    self.location('http://www.imdb.com/name/%s' % person_id)
    # browser2 port: URL objects carry the page match, replacing is_on_page()
    assert self.person.is_here()
    return self.page.iter_movies(role)
def iter_person_movies_ids(self, person_id):
    """Yield the IMDb movie ids (tt…) person `person_id` is credited on."""
    self.location('http://www.imdb.com/name/%s' % person_id)
    # browser2 port: URL objects carry the page match, replacing is_on_page()
    assert self.person.is_here()
    for movie in self.page.iter_movies_ids():
        yield movie
def iter_movie_persons_ids(self, movie_id):
    """Yield the IMDb person ids (nm…) credited on movie `movie_id`."""
    self.location('http://www.imdb.com/title/%s/fullcredits' % movie_id)
    # browser2 port: URL objects carry the page match, replacing is_on_page()
    assert self.movie_crew.is_here()
    for person in self.page.iter_persons_ids():
        yield person
def get_movie_releases(self, id, country):
    """Return the release dates of movie `id`, restricted to `country` if given (None = all)."""
    self.location('http://www.imdb.com/title/%s/releaseinfo' % id)
    # browser2 port: URL objects carry the page match, replacing is_on_page()
    assert self.release.is_here()
    return self.page.get_movie_releases(country)
......@@ -17,9 +17,10 @@
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from __future__ import unicode_literals
from weboob.capabilities.cinema import CapCinema, Person, Movie
from weboob.tools.backend import Module
from weboob.tools.compat import quote_plus
from .browser import ImdbBrowser
......@@ -43,10 +44,10 @@ class ImdbModule(Module, CapCinema):
return self.browser.get_person(id)
def iter_movies(self, pattern):
    """Search movies matching `pattern`.

    The pattern is passed through unencoded: since the browser2 port the
    browser sends it as a request parameter (requests handles quoting), so
    the old quote_plus(pattern.encode('utf-8')) pre-encoding must not be applied.
    """
    return self.browser.iter_movies(pattern)
def iter_persons(self, pattern):
    """Search persons matching `pattern`.

    The pattern is passed through unencoded: since the browser2 port the
    browser sends it as a request parameter (requests handles quoting), so
    the old quote_plus(pattern.encode('utf-8')) pre-encoding must not be applied.
    """
    return self.browser.iter_persons(pattern)
def iter_movie_persons(self, id, role=None):
    """Delegate to the browser: iterate persons credited on movie `id`, filtered by `role`."""
    return self.browser.iter_movie_persons(id, role)
......
......@@ -17,22 +17,24 @@
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from __future__ import unicode_literals
from weboob.capabilities.cinema import Person, Movie
from weboob.capabilities.base import NotAvailable, NotLoaded
from weboob.deprecated.browser import Page
from weboob.tools.html import html2text
from datetime import datetime
import re
from weboob.capabilities.cinema import Person, Movie
from weboob.capabilities.base import NotAvailable, NotLoaded
from weboob.browser.pages import HTMLPage
from weboob.browser.filters.html import CleanHTML
class ReleasePage(Page):
class ReleasePage(HTMLPage):
''' Page containing releases of a movie
'''
def get_movie_releases(self, country_filter):
result = unicode()
links = self.parser.select(self.document.getroot(), 'table#release_dates a')
result = ''
links = self.doc.xpath('//table[@id="release_dates"]//a')
for a in links:
href = a.attrib.get('href', '')
......@@ -40,13 +42,13 @@ class ReleasePage(Page):
if href.strip('/').split('/')[0] == 'calendar' and\
(country_filter is None or re.search('region=([a-zA-Z]+)&', href).group(1).lower() == country_filter):
country = a.text
td_date = self.parser.select(a.getparent().getparent().getparent(), 'td')[1]
date_links = self.parser.select(td_date, 'a')
td_date = a.xpath('./../../..//td')[1]
date_links = td_date.xpath('.//a')
if len(date_links) > 1:
date = date_links[1].attrib.get('href', '').strip('/').split('/')[-1]
date += '-'+date_links[0].attrib.get('href', '').strip('/').split('/')[-1]
else:
date = unicode(self.parser.select(a.getparent().getparent().getparent(), 'td')[1].text_content())
date = a.xpath('./../../..//td')[1].text_content()
result += '%s : %s\n' % (country, date)
if result == u'':
result = NotAvailable
......@@ -55,39 +57,39 @@ class ReleasePage(Page):
return result
class BiographyPage(Page):
class BiographyPage(HTMLPage):
''' Page containing biography of a person
'''
def get_biography(self):
    """Extract the 'mini bio' text of the person.

    Walks the direct children of div#bio_content and, starting from the
    element anchored name="mini_bio", concatenates their cleaned-up HTML
    text (browser2 CleanHTML filter replaces the old html2text call).
    """
    bio = ''
    start = False
    tn = self.doc.xpath('//div[@id="bio_content"]')[0]
    for el in tn.xpath('./*'):
        if el.attrib.get('name') == 'mini_bio':
            start = True
        if start:
            bio += CleanHTML('.')(el)
    return bio
class MovieCrewPage(Page):
class MovieCrewPage(HTMLPage):
''' Page listing all the persons related to a movie
'''
def iter_persons(self, role_filter=None):
if (role_filter is None or (role_filter is not None and role_filter == 'actor')):
tables = self.parser.select(self.document.getroot(), 'table.cast_list')
tables = self.doc.xpath('//table[has-class("cast_list")]')
if len(tables) > 0:
table = tables[0]
tds = self.parser.select(table, 'td.itemprop')
tds = table.xpath('.//td[has-class("itemprop")]')
for td in tds:
id = td.find('a').attrib.get('href', '').strip('/').split('/')[1]
name = unicode(td.find('a').text)
char_name = unicode(self.parser.select(td.getparent(), 'td.character', 1).text_content())
name = td.find('a').text
char_name = td.xpath('..//td[has-class("character")]')[0].text_content()
person = Person(id, name)
person.short_description = char_name
person.real_name = NotLoaded
......@@ -101,35 +103,35 @@ class MovieCrewPage(Page):
person.thumbnail_url = NotLoaded
yield person
for gloss_link in self.parser.select(self.document.getroot(), 'table[cellspacing="1"] h5 a'):
for gloss_link in self.doc.xpath('//table[@cellspacing="1"]//h5//a'):
role = gloss_link.attrib.get('name', '').rstrip('s')
if (role_filter is None or (role_filter is not None and role == role_filter)):
tbody = gloss_link.getparent().getparent().getparent().getparent()
for line in self.parser.select(tbody, 'tr')[1:]:
for a in self.parser.select(line, 'a'):
for line in tbody.xpath('.//tr')[1:]:
for a in line.xpath('.//a'):
role_detail = NotAvailable
href = a.attrib.get('href', '')
if '/name/nm' in href:
id = href.strip('/').split('/')[-1]
name = unicode(a.text)
name = a.text
if 'glossary' in href:
role_detail = unicode(a.text)
role_detail = a.text
person = Person(id, name)
person.short_description = role_detail
yield person
# yield self.browser.get_person(id)
def iter_persons_ids(self):
    """Yield the IMDb person ids (nm…) listed in the cast table.

    Ids are the second path segment of each actor link, e.g.
    /name/nm0000123/ -> 'nm0000123'.
    """
    # has-class() is an lxml XPath extension equivalent to a CSS class match
    tables = self.doc.xpath('//table[has-class("cast_list")]')
    if len(tables) > 0:
        table = tables[0]
        tds = table.xpath('.//td[has-class("itemprop")]')
        for td in tds:
            id = td.find('a').attrib.get('href', '').strip('/').split('/')[1]
            yield id
class PersonPage(Page):
class PersonPage(HTMLPage):
''' Page informing about a person
It is used to build a Person instance and to get the movie list related to a person
'''
......@@ -146,35 +148,35 @@ class PersonPage(Page):
thumbnail_url = NotAvailable
roles = {}
nationality = NotAvailable
td_overview = self.parser.select(self.document.getroot(), 'td#overview-top', 1)
descs = self.parser.select(td_overview, 'span[itemprop=description]')
td_overview = self.doc.xpath('//td[@id="overview-top"]')[0]
descs = td_overview.xpath('.//span[@itemprop="description"]')
if len(descs) > 0:
short_biography = unicode(descs[0].text)
rname_block = self.parser.select(td_overview, 'div.txt-block h4.inline')
short_biography = descs[0].text
rname_block = td_overview.xpath('.//div[has-class("txt-block")]//h4[has-class("inline")]')
if len(rname_block) > 0 and "born" in rname_block[0].text.lower():
links = self.parser.select(rname_block[0].getparent(), 'a')
links = rname_block[0].xpath('..//a')
for a in links:
href = a.attrib.get('href', '').strip()
if href == 'bio':
real_name = unicode(a.text.strip())
real_name = a.text.strip()
elif 'birth_place' in href:
birth_place = unicode(a.text.lower().strip())
names = self.parser.select(td_overview, 'h1 span[itemprop=name]')
birth_place = a.text.lower().strip()
names = td_overview.xpath('.//h1//span[@itemprop="name"]')
if len(names) > 0:
name = unicode(names[0].text.strip())
times = self.parser.select(td_overview, 'time[itemprop=birthDate]')
name = names[0].text.strip()
times = td_overview.xpath('.//time[@itemprop="birthDate"]')
if len(times) > 0:
time = times[0].attrib.get('datetime', '').split('-')
if len(time) == 3 and int(time[0]) >= 1900:
birth_date = datetime(int(time[0]), int(time[1]), int(time[2]))
dtimes = self.parser.select(td_overview, 'time[itemprop=deathDate]')
dtimes = td_overview.xpath('.//time[@itemprop="deathDate"]')
if len(dtimes) > 0:
dtime = dtimes[0].attrib.get('datetime', '').split('-')
if len(dtime) == 3 and int(dtime[0]) >= 1900:
death_date = datetime(int(dtime[0]), int(dtime[1]), int(dtime[2]))
img_thumbnail = self.parser.select(self.document.getroot(), 'td#img_primary img')
img_thumbnail = self.doc.xpath('//td[@id="img_primary img"]')
if len(img_thumbnail) > 0:
thumbnail_url = unicode(img_thumbnail[0].attrib.get('src', ''))
thumbnail_url = img_thumbnail[0].attrib.get('src', '')
roles = self.get_roles()
......@@ -192,26 +194,26 @@ class PersonPage(Page):
return person
def iter_movies_ids(self):
    """Yield the IMDb movie ids (tt…) found in the person's filmography links."""
    for role_div in self.doc.xpath('//div[@id="filmography"]//div[has-class("filmo-category-section")]/div'):
        for a in role_div.xpath('.//a'):
            # raw string: '\?' would be an invalid escape in a plain literal
            m = re.search(r'/title/(tt.*)/\?.*', a.attrib.get('href'))
            if m:
                yield m.group(1)
def get_roles(self):
    """Build a dict mapping each filmography role heading to its movie entries.

    Returns {role_name: [('N/A', movie_description), ...]}; the 'N/A'
    placeholder stands for an unavailable rating. Each heading div carries a
    data-category attribute that links it to the matching section ids below.
    """
    roles = {}
    for role_div in self.doc.xpath('//div[@id="filmography"]/div[has-class("head")]'):
        role = role_div.xpath('.//a')[-1].text
        roles[role] = []
        category = role_div.attrib.get('data-category')
        for infos in self.doc.xpath('//div[@id="filmography"]/div[has-class("filmo-category-section")]/div'):
            if category in infos.attrib.get('id'):
                roles[role].append(('N/A', infos.text_content().replace('\n', ' ').strip()))
    return roles
def iter_movies(self, role_filter=None):
    """Yield Movie objects built from the person's filmography links.

    NOTE(review): role_filter is accepted for interface compatibility but is
    not applied in this body — confirm whether filtering happens upstream.
    """
    for role_div in self.doc.xpath('//div[@id="filmography"]/div[has-class("filmo-category-section")]/div'):
        for a in role_div.xpath('.//a'):
            # raw string: '\?' would be an invalid escape in a plain literal
            m = re.search(r'/title/(tt.*)/\?.*', a.attrib.get('href'))
            if m:
                yield Movie(m.group(1), a.text)
......@@ -39,6 +39,7 @@ funmooc/
groupamaes/
hsbc/
hybride/
imdb/
indeed/
infomaniak/
ing/
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment