diff --git a/modules/s2e/browser.py b/modules/s2e/browser.py
index b6e7f653cf4ccfcebab7aaeffb983a05ba85dbed..37f84281a58fada2b435704d77e35d52750b3918 100644
--- a/modules/s2e/browser.py
+++ b/modules/s2e/browser.py
@@ -329,7 +329,23 @@ def iter_documents(self):
         # we might land on the documents page, but sometimes we land on user info "tab"
         self.page.select_documents_tab()
         self.page.show_more()
-        return self.page.iter_documents()
+
+        # Two documents can share the same _url_id (same date and same type); later ones get a "-N" suffix
+        existing_id = set()  # ids already assigned to documents yielded so far
+        for document in self.page.iter_documents():
+            if document._url_id in existing_id:
+                id_suffix = 1  # first candidate id is "<_url_id>-1"
+                while '%s-%s' % (document._url_id, id_suffix) in existing_id:
+                    id_suffix += 1
+                    if id_suffix > 5:
+                        # Stop searching once the suffix goes past 5 so the loop always terminates.
+                        # NOTE(review): the id assigned below may then already be in existing_id; nothing here raises - confirm the caller rejects duplicates
+                        break
+                document.id = '%s-%s' % (document._url_id, id_suffix)
+            else:
+                document.id = document._url_id  # first occurrence keeps the bare _url_id
+            existing_id.add(document.id)
+            yield document
 
 
 class EsaliaBrowser(S2eBrowser):
diff --git a/modules/s2e/pages.py b/modules/s2e/pages.py
index 26541f13083a6607d175b0d44943a188a4f286af..bed1679c05e2d77c4bb9747ce3c6586916502a01 100644
--- a/modules/s2e/pages.py
+++ b/modules/s2e/pages.py
@@ -1161,5 +1161,6 @@ class item(ItemElement):
             # Note: the id is constructed from the file name, which gives us some interesting information:
             # - Document date
             # Ex: RDCdirect_28112018link
-            obj_id = CleanText(QueryValue(obj_url, 'titrePDF'), symbols='/ ')
+            # Stored as _url_id rather than id: the value can repeat across documents, and the browser assigns the final id
+            obj__url_id = CleanText(QueryValue(obj_url, 'titrePDF'), symbols='/ ')
             obj_type = MapIn(Field('label'), DOCUMENT_TYPE_LABEL, default=DocumentTypes.OTHER)