From 77dd6f84e2be0f1bb8296792c9aacd40ac9ca805 Mon Sep 17 00:00:00 2001 From: Laurent Bachelier Date: Tue, 2 Apr 2013 21:30:19 +0200 Subject: [PATCH] imdb: HTMLParser can replace latin2unicode fully --- modules/imdb/browser.py | 42 ++++++++--------------------------------- 1 file changed, 8 insertions(+), 34 deletions(-) diff --git a/modules/imdb/browser.py b/modules/imdb/browser.py index e4059f2fd0..5da86c15a1 100644 --- a/modules/imdb/browser.py +++ b/modules/imdb/browser.py @@ -18,7 +18,7 @@ # along with weboob. If not, see . -import HTMLParser +from HTMLParser import HTMLParser from weboob.tools.browser import BaseBrowser, BrowserHTTPNotFound from weboob.capabilities.base import NotAvailable, NotLoaded from weboob.capabilities.cinema import Movie, Person @@ -47,6 +47,7 @@ class ImdbBrowser(BaseBrowser): def iter_movies(self, pattern): res = self.readurl('http://www.imdb.com/xml/find?json=1&nr=1&tt=on&q=%s' % pattern.encode('utf-8')) jres = json.loads(res) + htmlparser = HTMLParser() for cat in ['title_popular', 'title_exact', 'title_approx']: if cat in jres: for m in jres[cat]: @@ -56,11 +57,11 @@ def iter_movies(self, pattern): 0].strip(', '), tdesc.split('>')[1].split('<')[0]) else: short_description = tdesc.strip(', ') - movie = Movie(m['id'], latin2unicode(m['title'])) + movie = Movie(m['id'], htmlparser.unescape(m['title'])) movie.other_titles = NotLoaded movie.release_date = NotLoaded movie.duration = NotLoaded - movie.short_description = latin2unicode(short_description) + movie.short_description = htmlparser.unescape(short_description) movie.pitch = NotLoaded movie.country = NotLoaded movie.note = NotLoaded @@ -72,10 +73,11 @@ def iter_movies(self, pattern): def iter_persons(self, pattern): res = self.readurl('http://www.imdb.com/xml/find?json=1&nr=1&nm=on&q=%s' % pattern.encode('utf-8')) jres = json.loads(res) + htmlparser = HTMLParser() for cat in ['name_popular', 'name_exact', 'name_approx']: if cat in jres: for p in jres[cat]: - person = Person(p['id'], latin2unicode(unicode(p['name']))) + person = Person(p['id'], htmlparser.unescape(unicode(p['name']))) person.real_name = NotLoaded person.birth_place = NotLoaded person.birth_date = NotLoaded @@ -83,7 +85,7 @@ def iter_persons(self, pattern): person.gender = NotLoaded person.nationality = NotLoaded person.short_biography = NotLoaded - person.short_description = latin2unicode(p['description']) + person.short_description = htmlparser.unescape(p['description']) person.roles = NotLoaded person.thumbnail_url = NotLoaded yield person @@ -95,7 +97,7 @@ def get_movie(self, id): jres = json.loads(res) else: return None - htmlparser = HTMLParser.HTMLParser() + htmlparser = HTMLParser() title = NotAvailable duration = NotAvailable @@ -208,31 +210,3 @@ def get_movie_releases(self, id, country): self.location('http://www.imdb.com/title/%s/releaseinfo' % id) assert self.is_on_page(ReleasePage) return self.page.get_movie_releases(country) - - -dict_hex = {'á': u'á', - 'é': u'é', - 'è': u'è', - 'í': u'í', - 'ñ': u'ñ', - 'ó': u'ó', - 'ú': u'ú', - 'ü': u'ü', - '&': u'&', - ''': u"'", - 'à': u'à', - 'À': u'À', - 'â': u'â', - 'É': u'É', - 'ë': u'ë', - 'ô': u'ô', - 'ö': u'ö', - 'ä': u'ä', - 'ç': u'ç' - } - - -def latin2unicode(word): - for key in dict_hex.keys(): - word = word.replace(key, dict_hex[key]) - return unicode(word) -- GitLab