Commit a240f4f1, authored by Christophe Francois, committed by Vincent A

[s2e] Manage documents with the same ID

Some documents have the same label and the same date. In this case their
IDs are identical, and the website offers no usable information to tell them
apart, so we disambiguate them manually by appending a numeric suffix.
parent 92970a13
......@@ -329,7 +329,23 @@ class S2eBrowser(LoginBrowser, StatesMixin):
# we might land on the documents page, but sometimes we land on user info "tab"
self.page.select_documents_tab()
self.page.show_more()
return self.page.iter_documents()
# Sometimes two documents have the same ID (same date and same type)
existing_id = set()
for document in self.page.iter_documents():
if document._url_id in existing_id:
id_suffix = 1
while '%s-%s' % (document._url_id, id_suffix) in existing_id:
id_suffix += 1
if id_suffix > 5:
# Avoid infinite loops in case of an issue
# There shouldn't be that many documents with the same id, we let it raise an exception
break
document.id = '%s-%s' % (document._url_id, id_suffix)
else:
document.id = document._url_id
existing_id.add(document.id)
yield document
class EsaliaBrowser(S2eBrowser):
......
......@@ -1161,5 +1161,6 @@ class EServicePage(LoggedPage, HTMLPage):
# Note: the id is constructed from the file name, which gives us some interesting information:
# - Document date
# Ex: RDCdirect_28112018link
obj_id = CleanText(QueryValue(obj_url, 'titrePDF'), symbols='/ ')
# Using _url_id instead of id because of duplicate IDs which are managed in the browser
obj__url_id = CleanText(QueryValue(obj_url, 'titrePDF'), symbols='/ ')
obj_type = MapIn(Field('label'), DOCUMENT_TYPE_LABEL, default=DocumentTypes.OTHER)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment