From a240f4f1b0a16412e55b86adc7d8c3e81b62400c Mon Sep 17 00:00:00 2001 From: Christophe Francois Date: Thu, 13 Aug 2020 11:28:27 +0200 Subject: [PATCH] [s2e] Manage documents with the same ID Some documents have the same label and the same date. In this case their IDs are identical and the website offers no usable information to tell them apart, so we disambiguate them ourselves by appending a numeric suffix. --- modules/s2e/browser.py | 18 +++++++++++++++++- modules/s2e/pages.py | 3 ++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/modules/s2e/browser.py b/modules/s2e/browser.py index b6e7f653cf..37f84281a5 100644 --- a/modules/s2e/browser.py +++ b/modules/s2e/browser.py @@ -329,7 +329,23 @@ def iter_documents(self): # we might land on the documents page, but sometimes we land on user info "tab" self.page.select_documents_tab() self.page.show_more() - return self.page.iter_documents() + + # Sometimes two documents have the same ID (same date and same type) + existing_id = set() + for document in self.page.iter_documents(): + if document._url_id in existing_id: + id_suffix = 1 + while '%s-%s' % (document._url_id, id_suffix) in existing_id: + id_suffix += 1 + if id_suffix > 5: + # Avoid infinite loops in case of an issue + # There shouldn't be that many documents with the same id, we let it raise an exception + break + document.id = '%s-%s' % (document._url_id, id_suffix) + else: + document.id = document._url_id + existing_id.add(document.id) + yield document class EsaliaBrowser(S2eBrowser): diff --git a/modules/s2e/pages.py b/modules/s2e/pages.py index 26541f1308..bed1679c05 100644 --- a/modules/s2e/pages.py +++ b/modules/s2e/pages.py @@ -1161,5 +1161,6 @@ class item(ItemElement): # Note: the id is constructed from the file name, which gives us some interesting information: # - Document date # Ex: RDCdirect_28112018link - obj_id = CleanText(QueryValue(obj_url, 'titrePDF'), symbols='/ ') + # Using _url_id instead of id because of duplicate IDs which are managed in the browser + obj__url_id = 
CleanText(QueryValue(obj_url, 'titrePDF'), symbols='/ ') obj_type = MapIn(Field('label'), DOCUMENT_TYPE_LABEL, default=DocumentTypes.OTHER) -- GitLab