From f1079c783892360dedace28571143b271d23dc62 Mon Sep 17 00:00:00 2001 From: Bezleputh Date: Wed, 7 Oct 2020 20:32:48 +0200 Subject: [PATCH] [pagesjaunes] fix broken backend --- modules/pagesjaunes/browser.py | 9 +++--- modules/pagesjaunes/module.py | 1 - modules/pagesjaunes/pages.py | 58 +++++++++++++++++++++++----------- 3 files changed, 44 insertions(+), 24 deletions(-) diff --git a/modules/pagesjaunes/browser.py b/modules/pagesjaunes/browser.py index 19fac9ce25..4a4955dc54 100644 --- a/modules/pagesjaunes/browser.py +++ b/modules/pagesjaunes/browser.py @@ -30,8 +30,10 @@ class PagesjaunesBrowser(PagesBrowser): BASEURL = 'https://www.pagesjaunes.fr' - search = URL('/recherche/(?P<city>[a-z0-9-]+)/(?P<pattern>[a-z0-9-]+)', ResultsPage) - company = URL('/pros/\d+', PlacePage) + search = URL( + r'/annuaire/chercherlespros\?quoiqui=(?P<pattern>[a-z0-9-]+)&ou=(?P<city>[a-z0-9-]+)&page=(?P<page>\d+)', + ResultsPage) + company = URL(r'/pros/\d+', PlacePage) def simplify(self, name): return re.sub(r'[^a-z0-9-]+', '-', name.lower()) @@ -40,11 +42,10 @@ def search_contacts(self, query): assert query.name assert query.city - self.search.go(city=self.simplify(query.city), pattern=self.simplify(query.name)) + self.search.go(city=self.simplify(query.city), pattern=self.simplify(query.name), page=1) return self.page.iter_contacts() def fill_hours(self, contact): self.location(contact.url) contact.opening = OpeningHours() contact.opening.rules = list(self.page.iter_hours()) - diff --git a/modules/pagesjaunes/module.py b/modules/pagesjaunes/module.py index 5c2003f033..620ed4fc95 100644 --- a/modules/pagesjaunes/module.py +++ b/modules/pagesjaunes/module.py @@ -49,4 +49,3 @@ def fill_contact(self, obj, fields): OBJECTS = { Place: fill_contact, } - diff --git a/modules/pagesjaunes/pages.py b/modules/pagesjaunes/pages.py index 1e129e7541..f6188b050b 100644 --- a/modules/pagesjaunes/pages.py +++ b/modules/pagesjaunes/pages.py @@ -24,52 +24,72 @@ from dateutil import rrule from weboob.browser.elements import method, 
ListElement, ItemElement -from weboob.browser.filters.standard import CleanText, Regexp -from weboob.browser.filters.html import AbsoluteLink, HasElement -from weboob.browser.pages import HTMLPage +from weboob.browser.filters.standard import CleanText, Regexp, Field, Env, BrowserURL +from weboob.browser.filters.html import AbsoluteLink, HasElement, XPath +from weboob.browser.pages import HTMLPage, pagination from weboob.capabilities.base import NotLoaded, NotAvailable from weboob.capabilities.contact import Place, OpeningRule class ResultsPage(HTMLPage): + @pagination @method class iter_contacts(ListElement): - item_xpath = '//section[@id="listResults"]/article' + item_xpath = '//section[@id="listResults"]/ul/li' + + def next_page(self): + if XPath('//div/@class="pagination"', default=False)(self): + next_page = int(Env('page')(self)) + 1 + return BrowserURL('search', + city=Env('city'), + pattern=Env('pattern'), + page=next_page)(self) class item(ItemElement): klass = Place obj_name = CleanText('.//a[has-class("denomination-links")]') obj_address = CleanText('.//a[has-class("adresse")]') - obj_phone = Regexp( - CleanText( - './/div[has-class("tel-zone")][span[contains(text(),"Tél")]]//strong[@class="num"]', - replace=[(' ', '')]), r'^0(\d{9})$', r'+33\1') - obj_url = AbsoluteLink('.//a[has-class("denomination-links")]') + + def obj_phone(self): + tel = [] + for _ in XPath( + './/div[has-class("tel-zone")][span[contains(text(),"Tél")]]//strong[@class="num"]')(self): + tel.append(Regexp(CleanText('.', replace=[(' ', '')]), r'^0(\d{9})$', r'+33\1')(_)) + + return " / ".join(tel) + + def obj_url(self): + if CleanText('.//a[has-class("denomination-links")]/@href', replace=[('#', '')])(self): + return AbsoluteLink('.//a[has-class("denomination-links")]')(self) + return NotAvailable + obj_opening = HasElement('.//span[text()="Horaires"]', NotLoaded, NotAvailable) class PlacePage(HTMLPage): @method class iter_hours(ListElement): - item_xpath = 
'//ul[@class="liste-horaires-principaux"]/li[@class="horaire-ouvert"]' + item_xpath = '//div[@id="infos-horaires"]/ul/li[@class="horaire-ouvert"]' class item(ItemElement): klass = OpeningRule def obj_dates(self): - wday = CleanText('./span')(self) - wday = ['lundi', 'mardi', 'mercredi', 'jeudi', 'vendredi', 'samedi', 'dimanche'].index(wday) + wday = CleanText('./p')(self) + wday = ['lundi', 'mardi', 'mercredi', 'jeudi', 'vendredi', 'samedi', 'dimanche'].index(wday.lower()) assert wday >= 0 - return rrule.rrule(rrule.DAILY, byweekday=wday) + return rrule.rrule(rrule.DAILY, byweekday=wday, count=1) def obj_times(self): times = [] - for sub in self.el.xpath('.//li[@itemprop]'): - t = CleanText('./@content')(sub) - m = re.match(r'\w{2} (\d{2}):(\d{2})-(\d{2}):(\d{2})$', t) - m = [int(x) for x in m.groups()] - times.append((time(m[0], m[1]), time(m[2], m[3]))) + for sub in XPath('.//li')(self): + t = CleanText('.')(sub) + m = re.match(r'(\d{2})h(\d{2}) - (\d{2})h(\d{2})$', t) + if m: + m = [int(x) for x in m.groups()] + times.append((time(m[0], m[1]), time(m[2], m[3]))) return times - obj_is_open = True + def obj_is_open(self): + return len(Field('times')(self)) > 0 -- GitLab