From ad3de2eb3ce89a460329b8b08fa624a3b7eeca9c Mon Sep 17 00:00:00 2001 From: blckshrk Date: Sun, 3 Nov 2013 11:35:19 +0100 Subject: [PATCH] Bug fixes and improved test coverage. All fixes are basically CSS selector corrections needed after a few changes in the HTML structure of the pages. --- modules/imdb/pages.py | 26 +++++++++++++++----------- modules/imdb/test.py | 23 +++++++++++++++++++++-- 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/modules/imdb/pages.py b/modules/imdb/pages.py index 43f5171582..af91b43697 100644 --- a/modules/imdb/pages.py +++ b/modules/imdb/pages.py @@ -23,6 +23,7 @@ from weboob.tools.browser import BasePage from datetime import datetime +import re __all__ = ['PersonPage', 'MovieCrewPage', 'BiographyPage', 'FilmographyPage', 'ReleasePage'] @@ -33,11 +34,13 @@ class ReleasePage(BasePage): ''' def get_movie_releases(self, country_filter): result = unicode() - links = self.parser.select(self.document.getroot(), 'b a') + links = self.parser.select(self.document.getroot(), 'table#release_dates a') for a in links: href = a.attrib.get('href', '') + + # XXX: search() could raise an exception if href.strip('/').split('/')[0] == 'calendar' and\ - (country_filter is None or href.split('region=')[-1].lower() == country_filter): + (country_filter is None or re.search('region=([a-zA-Z]+)&', href).group(1).lower() == country_filter): country = a.text td_date = self.parser.select(a.getparent().getparent().getparent(), 'td')[1] date_links = self.parser.select(td_date, 'a') @@ -74,14 +77,15 @@ class MovieCrewPage(BasePage): ''' def iter_persons(self, role_filter=None): if (role_filter is None or (role_filter is not None and role_filter == 'actor')): - tables = self.parser.select(self.document.getroot(), 'table.cast') + tables = self.parser.select(self.document.getroot(), 'table.cast_list') if len(tables) > 0: table = tables[0] - tds = self.parser.select(table, 'td.nm') + tds = self.parser.select(table, 'td.itemprop') + for td in tds: - id = 
td.find('a').attrib.get('href', '').strip('/').split('/')[-1] + id = td.find('a').attrib.get('href', '').strip('/').split('/')[1] name = unicode(td.find('a').text) - char_name = unicode(self.parser.select(td.getparent(), 'td.char', 1).text_content()) + char_name = unicode(self.parser.select(td.getparent(), 'td.character', 1).text_content()) person = Person(id, name) person.short_description = char_name person.real_name = NotLoaded @@ -95,7 +99,7 @@ def iter_persons(self, role_filter=None): person.thumbnail_url = NotLoaded yield person - for gloss_link in self.parser.select(self.document.getroot(), 'table[cellspacing=1] h5 a'): + for gloss_link in self.parser.select(self.document.getroot(), 'table[cellspacing="1"] h5 a'): role = gloss_link.attrib.get('name', '').rstrip('s') if (role_filter is None or (role_filter is not None and role == role_filter)): tbody = gloss_link.getparent().getparent().getparent().getparent() @@ -114,12 +118,12 @@ def iter_persons(self, role_filter=None): # yield self.browser.get_person(id) def iter_persons_ids(self): - tables = self.parser.select(self.document.getroot(), 'table.cast') + tables = self.parser.select(self.document.getroot(), 'table.cast_list') if len(tables) > 0: table = tables[0] - tds = self.parser.select(table, 'td.nm') + tds = self.parser.select(table, 'td.itemprop') for td in tds: - id = td.find('a').attrib.get('href', '').strip('/').split('/')[-1] + id = td.find('a').attrib.get('href', '').strip('/').split('/')[1] yield id @@ -152,7 +156,7 @@ def get_person(self, id): real_name = unicode(a.text.strip()) elif 'birth_place' in href: birth_place = unicode(a.text.lower().strip()) - names = self.parser.select(td_overview, 'h1[itemprop=name]') + names = self.parser.select(td_overview, 'h1 span[itemprop=name]') if len(names) > 0: name = unicode(names[0].text.strip()) times = self.parser.select(td_overview, 'time[itemprop=birthDate]') diff --git a/modules/imdb/test.py b/modules/imdb/test.py index aae7ba7168..81e6685413 100644 
--- a/modules/imdb/test.py +++ b/modules/imdb/test.py @@ -19,39 +19,45 @@ from weboob.tools.test import BackendTest - class ImdbTest(BackendTest): BACKEND = 'imdb' def test_search_movie(self): movies = list(self.backend.iter_movies('spiderman')) + assert len(movies) > 0 for movie in movies: assert movie.id def test_get_movie(self): movie = self.backend.get_movie('tt0079980') + assert movie assert movie.id assert movie.original_title def test_search_person(self): persons = list(self.backend.iter_persons('dewaere')) + assert len(persons) > 0 for person in persons: assert person.id def test_get_person(self): person = self.backend.get_person('nm0223033') + assert person assert person.id assert person.name assert person.birth_date def test_movie_persons(self): persons = list(self.backend.iter_movie_persons('tt0079980')) + assert len(persons) > 0 for person in persons: assert person.id assert person.name + assert person.short_description def test_person_movies(self): movies = list(self.backend.iter_person_movies('nm0223033')) + assert len(movies) > 0 for movie in movies: assert movie.id assert movie.original_title @@ -62,6 +68,19 @@ def test_get_person_biography(self): assert bio is not None def test_get_movie_releases(self): - rel = self.backend.get_movie_releases('tt0079980') + rel = self.backend.get_movie_releases('tt0079980', 'fr') assert rel != '' assert rel is not None + assert rel == 'France : 25 April 1979' + + def test_iter_person_movies_ids(self): + movies_ids = list(self.backend.iter_person_movies_ids('nm0223033')) + assert len(movies_ids) > 0 + for movie_id in movies_ids: + assert movie_id + + def test_iter_movie_persons_ids(self): + persons_ids = list(self.backend.iter_movie_persons_ids('tt0079980')) + assert len(persons_ids) > 0 + for person_id in persons_ids: + assert person_id -- GitLab