diff --git a/modules/allrecipes/browser.py b/modules/allrecipes/browser.py index d45e643353d639facd7c77fc40241d4e265afb7b..0281b800538cbf887b653cda1e10f50843e5dad8 100644 --- a/modules/allrecipes/browser.py +++ b/modules/allrecipes/browser.py @@ -16,36 +16,24 @@ # # You should have received a copy of the GNU Affero General Public License # along with weboob. If not, see <http://www.gnu.org/licenses/>. - - -from weboob.deprecated.browser import Browser, BrowserHTTPNotFound - -from .pages import RecipePage, ResultsPage, FourOFourPage - +from weboob.browser import PagesBrowser, URL +from .pages import ResultsPage, RecipePage __all__ = ['AllrecipesBrowser'] -class AllrecipesBrowser(Browser): - DOMAIN = 'allrecipes.com' - PROTOCOL = 'http' - ENCODING = 'utf-8' - USER_AGENT = Browser.USER_AGENTS['wget'] - PAGES = { - 'http://allrecipes.com/search/default.aspx\?qt=k&wt=.*&rt=r&origin=.*': ResultsPage, - 'http://allrecipes.com/Recipe/.*/Detail.aspx': RecipePage, - 'http://allrecipes.com/404.aspx.*': FourOFourPage - } +class AllrecipesBrowser(PagesBrowser): + BASEURL = 'http://allrecipes.com' + results = URL('search/results/\?wt=(?P<pattern>.*)\&sort=re', + 'recipes/.*', ResultsPage) + recipe = URL('recipe/(?P<_id>.*)/', RecipePage) def iter_recipes(self, pattern): - self.location('http://allrecipes.com/search/default.aspx?qt=k&wt=%s&rt=r&origin=Home%%20Page' % (pattern)) - assert self.is_on_page(ResultsPage) - return self.page.iter_recipes() - - def get_recipe(self, id): - try: - self.location('http://allrecipes.com/Recipe/%s/Detail.aspx' % id) - except BrowserHTTPNotFound: - return - if self.is_on_page(RecipePage): - return self.page.get_recipe(id) + return self.results.go(pattern=pattern).iter_recipes() + + def get_recipe(self, _id, obj=None): + recipe = self.recipe.go(_id=_id).get_recipe(obj=obj) + comments = list(self.page.get_comments()) + if comments: + recipe.comments = comments + return recipe diff --git a/modules/allrecipes/module.py b/modules/allrecipes/module.py index 
f629b90fa3350cbdd6316b88b348078633c708d1..f4623ad2549981f898ca94c42dec48d042df6266 100644 --- a/modules/allrecipes/module.py +++ b/modules/allrecipes/module.py @@ -43,19 +43,8 @@ def iter_recipes(self, pattern): return self.browser.iter_recipes(quote_plus(pattern.encode('utf-8'))) def fill_recipe(self, recipe, fields): - if 'nb_person' in fields or 'instructions' in fields: - rec = self.get_recipe(recipe.id) - recipe.picture_url = rec.picture_url - recipe.instructions = rec.instructions - recipe.ingredients = rec.ingredients - recipe.comments = rec.comments - recipe.author = rec.author - recipe.nb_person = rec.nb_person - recipe.cooking_time = rec.cooking_time - recipe.preparation_time = rec.preparation_time - + if 'nb_person' in fields or 'instructions' in fields or 'thumbnail_url' in fields: + recipe = self.browser.get_recipe(recipe.id, recipe) return recipe - OBJECTS = { - Recipe: fill_recipe, - } + OBJECTS = {Recipe: fill_recipe} diff --git a/modules/allrecipes/pages.py b/modules/allrecipes/pages.py index 8fc91e35f4788bc2e269a47e86a26cfc8c419387..9ab519ae72db773a35b6b6b50cbb0df663da4650 100644 --- a/modules/allrecipes/pages.py +++ b/modules/allrecipes/pages.py @@ -18,109 +18,80 @@ # along with weboob. If not, see <http://www.gnu.org/licenses/>. 
-from weboob.capabilities.recipe import Recipe -from weboob.capabilities.base import NotAvailable, NotLoaded -from weboob.deprecated.browser import Page - - -class FourOFourPage(Page): - pass - - -class ResultsPage(Page): - """ Page which contains results as a list of recipies - """ - - def iter_recipes(self): - for div in self.parser.select(self.document.getroot(), 'div.recipe-info'): - thumbnail_url = NotAvailable - short_description = NotAvailable - imgs = self.parser.select(div.getparent(), 'img') - if len(imgs) > 0: - url = unicode(imgs[0].attrib.get('src', '')) - if url.startswith('http://'): - thumbnail_url = url - - link = self.parser.select(div, 'a.title', 1) - title = unicode(link.text) - id = unicode(link.attrib.get('href', '').split('/')[2]) - - recipe = Recipe(id, title) - recipe.thumbnail_url = thumbnail_url - recipe.short_description = short_description - recipe.instructions = NotLoaded - recipe.ingredients = NotLoaded - recipe.nb_person = NotLoaded - recipe.cooking_time = NotLoaded - recipe.preparation_time = NotLoaded - recipe.author = NotLoaded - yield recipe - - -class RecipePage(Page): - """ Page which contains a recipe - """ - - def get_recipe(self, id): - title = NotAvailable - preparation_time = NotAvailable - cooking_time = NotAvailable - author = NotAvailable - nb_person = NotAvailable - ingredients = NotAvailable - picture_url = NotAvailable - instructions = NotAvailable - comments = NotAvailable - - title = unicode(self.parser.select(self.document.getroot(), 'h1#itemTitle', 1).text) - imgillu = self.parser.select(self.document.getroot(), 'img#imgPhoto') - if len(imgillu) > 0: - picture_url = unicode(imgillu[0].attrib.get('src', '')) - - ingredients = [] - l_ing = self.parser.select(self.document.getroot(), 'li#liIngredient') - for ing in l_ing: - ingtxt = unicode(ing.text_content().strip()) - if ingtxt != '': - ingredients.append(ingtxt) - - instructions = u'' - l_divinst = self.parser.select(self.document.getroot(), 'div.directLeft li') 
- num_instr = 1 - for inst in l_divinst: - instructions += '%s: %s\n' % (num_instr, inst.text_content()) - num_instr += 1 - - prepmin = 0 - emprep = self.parser.select(self.document.getroot(), 'span#prepHoursSpan em') - if len(emprep) > 0: - prepmin += int(emprep[0].text) * 60 - emprep = self.parser.select(self.document.getroot(), 'span#prepMinsSpan em') - if len(emprep) > 0: - prepmin += int(emprep[0].text) - if prepmin != 0: - preparation_time = prepmin - cookmin = 0 - emcooktime = self.parser.select(self.document.getroot(), 'span#cookHoursSpan em') - if len(emcooktime) > 0: - cookmin += int(emcooktime[0].text) * 60 - emcooktime = self.parser.select(self.document.getroot(), 'span#cookMinsSpan em') - if len(emcooktime) > 0: - cookmin += int(emcooktime[0].text) - if cookmin != 0: - cooking_time = cookmin - l_nbpers = self.parser.select(self.document.getroot(), 'span#lblYield[itemprop=recipeYield]') - if len(l_nbpers) > 0 and 'servings' in l_nbpers[0].text: - nb_person = [int(l_nbpers[0].text.split()[0])] - - recipe = Recipe(id, title) - recipe.preparation_time = preparation_time - recipe.cooking_time = cooking_time - recipe.nb_person = nb_person - recipe.ingredients = ingredients - recipe.instructions = instructions - recipe.picture_url = picture_url - recipe.comments = comments - recipe.author = author - recipe.thumbnail_url = NotLoaded - return recipe +from weboob.browser.pages import HTMLPage, pagination +from weboob.browser.elements import ItemElement, ListElement, method +from weboob.capabilities.recipe import Recipe, Comment +from weboob.capabilities.base import NotAvailable +from weboob.browser.filters.standard import Regexp, CleanText, Env, Duration +from weboob.browser.filters.html import CleanHTML + +import re + + +class CookingDuration(Duration): + _regexp = re.compile(r'PT((?P<hh>\d+)H)?((?P<mm>\d+)M)?((?P<ss>\d+)S)?') + + +class ResultsPage(HTMLPage): + @pagination + @method + class iter_recipes(ListElement): + item_xpath = 
'//article[@class="grid-col--fixed-tiles"]' + + def next_page(self): + return CleanText('//button[@id="btnMoreResults"]/@href')(self) + + class item(ItemElement): + klass = Recipe + + obj_id = Regexp(CleanText('./a[1]/@href'), + '/recipe/(.*)/') + obj_title = CleanText('./a/h3') + obj_short_description = CleanText('./a/div/div[@class="rec-card__description"]') + + +class RecipePage(HTMLPage): + @method + class get_recipe(ItemElement): + klass = Recipe + + obj_id = Env('_id') + obj_title = CleanText('//h1[@itemprop="name"]') + + def obj_preparation_time(self): + dt = CookingDuration(CleanText('//time[@itemprop="prepTime"]/@datetime'))(self) + return int(dt.total_seconds() / 60) + + def obj_cooking_time(self): + dt = CookingDuration(CleanText('//time[@itemprop="cookTime"]/@datetime'))(self) + return int(dt.total_seconds() / 60) + + def obj_nb_person(self): + nb_pers = CleanText('//meta[@id="metaRecipeServings"]/@content')(self) + return [nb_pers] if nb_pers else NotAvailable + + def obj_ingredients(self): + ingredients = [] + for el in self.el.xpath('//ul[has-class("checklist")]/li/label/span[@itemprop="ingredients"]'): + ing = CleanText('.')(el) + if ing: + ingredients.append(ing) + return ingredients + + obj_instructions = CleanHTML('//ol[@itemprop="recipeInstructions"]') + obj_thumbnail_url = CleanText('//section[has-class("hero-photo")]/span/a/img/@src') + + obj_picture_url = CleanText('//section[has-class("hero-photo")]/span/a/img/@src') + + @method + class get_comments(ListElement): + item_xpath = '//div[@itemprop="review"]' + ignore_duplicate = True + + class item(ItemElement): + klass = Comment + + obj_author = CleanText('./article/a/div/a/ul/li/h4[@itemprop="author"]') + obj_rate = CleanText('./article/div/div[@class="rating-stars"]/@data-ratingstars') + obj_text = CleanText('./p[@itemprop="reviewBody"]') + obj_id = CleanText('./article/a/@href') diff --git a/modules/allrecipes/test.py b/modules/allrecipes/test.py index 
799098769d44ed7142a7291d5d75c4815cb40389..fc04c144ec28b5b138bfbea3f3752fbcb55542b2 100644 --- a/modules/allrecipes/test.py +++ b/modules/allrecipes/test.py @@ -19,14 +19,16 @@ from weboob.tools.test import BackendTest +import itertools + class AllrecipesTest(BackendTest): MODULE = 'allrecipes' def test_recipe(self): - recipes = self.backend.iter_recipes('french fries') - for recipe in recipes: - full_recipe = self.backend.get_recipe(recipe.id) - assert full_recipe.instructions - assert full_recipe.ingredients - assert full_recipe.title + recipes = list(itertools.islice(self.backend.iter_recipes('french fries'), 0, 20)) + assert len(recipes) + full_recipe = self.backend.get_recipe(recipes[0].id) + assert full_recipe.instructions + assert full_recipe.ingredients + assert full_recipe.title