Coverage for src/ptf/display/resolver.py: 36%
343 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-05 09:56 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-05 09:56 +0000
1import os
2import shutil
3import time
5from django.conf import settings
7from ptf.cmds.xml import xml_utils
# Current year, captured once at import time; used by embargo() below.
NOW = time.gmtime()[0]
# URL templates for external identifiers; the id value is substituted
# for {0} via str.format in resolve_id().
extids_hrefs = {
    "doi": "https://doi.org/{0}",
    "zbl-item-id": "https://zbmath.org/?q=an:{0}",
    "jfm-item-id": "https://zbmath.org/?q=an:{0}",
    "mr-item-id": "https://mathscinet.ams.org/mathscinet-getitem?mr={0}",
    "nmid": "/item/{0}",
    "numdam-id": "/item/{0}",
    "mathdoc-id": "/item/{0}",
    "eudml-item-id": "https://eudml.org/doc/{0}",
    "sps-id": "http://sites.mathdoc.fr/cgi-bin/spitem?id={0}",
    "arxiv": "https://arxiv.org/abs/{0}",
    "hal": "https://hal.archives-ouvertes.fr/{0}",
    "tel": "https://tel.archives-ouvertes.fr/{0}",
    "theses.fr": "https://theses.fr/{0}",
    "orcid": "https://orcid.org/{0}",
    "idref": "https://www.idref.fr/{0}",
    "semantic-scholar": "https://www.semanticscholar.org/paper/{0}",
    "pmid": "https://pubmed.ncbi.nlm.nih.gov/{0}",
    "ark": "http://ark.bnf.fr/{0}",
}


def resolve_id(id_type: str, id_value: str, force_numdam=False):
    """Build the href for an external identifier.

    Returns the URL obtained from the extids_hrefs template for *id_type*,
    "" for unknown types, or a numdam.org URL when *force_numdam* is set
    for an "nmid" identifier.
    """
    if id_type == "nmid" and force_numdam:
        return f"http://www.numdam.org/item/{id_value}"
    if id_type == "mr-item-id" and "#" in id_value:
        id_value = id_value.replace(" #", ":")
    elif id_type == "eudml-item-id":
        # Keep only the last ':'-separated component (e.g. "urn:eudml:doc:123").
        id_value = id_value.split(":")[-1]

    template = extids_hrefs.get(id_type)
    return template.format(id_value) if template is not None else ""
def find_id_type(id):
    """Guess the external-id type ("doi", "hal" or "arxiv") from a raw id string.

    Returns None when the prefix matches none of the known schemes.
    """
    if id.startswith("10."):
        return "doi"
    if id.startswith("hal-"):
        return "hal"
    if id.lower().startswith("arxiv:"):
        return "arxiv"
    # Bare arXiv ids of the form YYMM.NNNN are not detected (a heuristic
    # for them was sketched in an earlier revision but never enabled).
    return None
def get_mimetype(filename):
    """Return the MIME type of *filename* based on its extension ("" if unknown)."""
    mimetype_by_extension = {
        "pdf": "application/pdf",
        "djvu": "image/x.djvu",
        "tex": "application/x-tex",
        "png": "image/png",
        "jpg": "image/jpeg",
    }
    # Extension is taken from the lower-cased basename, without the dot.
    extension = os.path.splitext(os.path.basename(filename).lower())[1][1:]
    return mimetype_by_extension.get(extension, "")
def get_article_base_url():
    """Return settings.ARTICLE_BASE_URL, the base URL for article pages."""
    return settings.ARTICLE_BASE_URL
def get_issue_base_url():
    """Return settings.ISSUE_BASE_URL, the base URL for issue pages."""
    return settings.ISSUE_BASE_URL
def get_icon_base_url():
    """Return settings.ICON_BASE_URL, the base URL for icon files."""
    return settings.ICON_BASE_URL
def get_icon_url(id_, filename):
    """Return the public URL of an icon: ICON_BASE_URL followed by *filename*.

    *id_* is currently unused; it is kept for interface compatibility with
    callers.
    """
    return get_icon_base_url() + filename
def get_doi_url(doi):
    """Return the resolver URL of *doi* (DOI_BASE_URL + doi)."""
    return settings.DOI_BASE_URL + doi
def get_relative_folder(collection_id, container_id=None, article_id=None):
    """Return the relative path "colid[/container_id][/article_id]".

    Falsy container/article ids are skipped independently of each other.
    """
    parts = [collection_id]
    if container_id:
        parts.append(container_id)
    if article_id:
        parts.append(article_id)
    return "/".join(parts)
def embargo(wall, year):
    """Tell whether a publication *year* falls inside a moving wall.

    @param wall: embargo duration in years; falsy means no embargo
    @param year: publication date string, e.g. "2008" or "2008-03-01"
    @return: True if the item's year is within the last *wall* years
    """
    if not wall:
        return False
    # Default to the current year: an unparsable date is treated as embargoed.
    y = NOW
    try:
        # Keep only the leading year of dates such as "2008-03-01".
        y = int(year.split("-")[0])
    except Exception:
        # Was `except BaseException: pass`, which also swallowed
        # KeyboardInterrupt/SystemExit; narrowed to Exception.
        pass
    return NOW - y <= wall
133# Iterate a folder with a collection
134# The folder must look like @COL/@ISSUE/@ISSUE.XML
def iterate_collection_folder(folder, pid, first_issue=""):
    """Yield (issue_id, xml_path) pairs found under *folder*/*pid*.

    The expected layout is @COL/@ISSUE/@ISSUE.xml and/or
    @COL/@ISSUE/@ISSUE-cdrxml.xml.  When *first_issue* is non-empty,
    entries are skipped until that issue id is reached.
    """
    root_folder = os.path.join(folder, pid)
    started = len(first_issue) == 0
    for entry in sorted(os.listdir(root_folder)):
        if not started and entry == first_issue:
            started = True
        if not started:
            continue
        issue_dir = os.path.join(root_folder, entry)
        if not os.path.isdir(issue_dir):
            continue
        # Both XML flavours may exist; yield each one that is present.
        for suffix in (".xml", "-cdrxml.xml"):
            xml_file = os.path.join(issue_dir, entry + suffix)
            if os.path.isfile(xml_file):
                yield entry, xml_file
def create_folder(folder):
    """Create *folder* (including parents) if it does not already exist.

    @raise RuntimeError: when the directory cannot be created, e.g. the
        path already exists as a regular file.
    """
    try:
        os.makedirs(folder, exist_ok=True)
    except OSError:
        # Was `except BaseException: pass` (also caught KeyboardInterrupt).
        # Deliberate best-effort: failure is reported below only if the
        # directory is still missing.
        pass
    if not os.path.isdir(folder):
        raise RuntimeError("Unable to create " + folder)
def copy_folder(from_dir, to_dir):
    """Recursively copy the contents of *from_dir* into *to_dir*.

    Does nothing when *from_dir* is not a directory.  *to_dir* is created
    if needed.
    """
    if not os.path.isdir(from_dir):
        return
    create_folder(to_dir)
    for entry in os.listdir(from_dir):
        src = os.path.join(from_dir, entry)
        if os.path.isfile(src):
            copy_file(src, to_dir)
        elif os.path.isdir(src):
            copy_folder(src, os.path.join(to_dir, entry))
def copy_file(from_path, to_path):
    """Copy *from_path* to *to_path*, preserving timestamps.

    Silently does nothing when *from_path* is not a regular file.  When
    *to_path* is a directory, the source basename is appended to it.
    """
    if not os.path.isfile(from_path):
        return
    if os.path.isdir(to_path):
        to_path = os.path.join(to_path, os.path.basename(from_path))
    if to_path.startswith(settings.MATHDOC_ARCHIVE_FOLDER):
        # copy2 attempts to preserve all file metadata.
        # On the archive we don't want to preserve the mode, just the dates.
        shutil.copyfile(from_path, to_path)
        shutil.copystat(from_path, to_path)
    else:
        shutil.copy2(from_path, to_path)
def copy_html_images_in_folder(colid, issue, article_to_copy, from_folder, to_folder, folder_name):
    """Replace one image folder of an article with its counterpart from *from_folder*.

    The destination <to_folder>/<col/issue/article>/<folder_name> is removed
    first (when present), then the matching source folder is copied over
    when it exists.

    @raise RuntimeError: when the existing destination cannot be removed.
    """
    relative = get_relative_folder(colid, issue.pid, article_to_copy.pid)
    dest_folder = os.path.join(to_folder, relative, folder_name)
    if os.path.isdir(dest_folder):
        try:
            shutil.rmtree(dest_folder)
        except OSError:
            message = "Unable to remove " + dest_folder
            raise RuntimeError(message)
    src_folder = os.path.join(from_folder, relative, folder_name)
    if os.path.isdir(src_folder):
        copy_folder(src_folder, dest_folder)
def copy_html_images(resource, to_folder, from_folder):
    """
    Copy the figures associated with the HTML body of an article.

    if from_archive:
        Images are in settings.MATHDOC/@colid/@issue_id/@a_id/src/tex/figures/
    if from_cedram:
        Images are in settings.CEDRAM_TEX_FOLDER/@colid/@issue_id/@tex_aid/Fulltext/figures/

    @param resource: the object whose images are copied; ignored unless its
        classname is "Article"
    @param to_folder: destination root folder
    @param from_folder: source root folder; CEDRAM_XML_FOLDER triggers the
        TeX-based copy, anything else the archive-based copy
    @return: nothing
    """

    if resource.classname != "Article":
        return

    article_to_copy = resource
    issue = article_to_copy.my_container
    colid = article_to_copy.get_collection().pid

    if from_folder == settings.CEDRAM_XML_FOLDER:
        tex_src_folder = get_cedram_issue_tex_folder(colid, issue.pid)
        tex_folders, _ = get_cedram_tex_folders(colid, issue.pid)

        if len(tex_folders) > 0:
            # NOTE(review): i walks tex_folders in lockstep with
            # issue.article_set.all(); this assumes both sequences have the
            # same length and order — confirm against the issue XML import.
            i = 0
            for article in issue.article_set.all():
                if article_to_copy.pid == article.pid:
                    # The order in which articles are stored in the database
                    # matters: the TeX order is SUPPOSED to match the issue XML.
                    dest_folder = os.path.join(
                        to_folder,
                        get_relative_folder(colid, issue.pid, article.pid),
                        "src/tex/figures",
                    )

                    # Start from a clean destination folder.
                    if os.path.isdir(dest_folder):
                        try:
                            shutil.rmtree(dest_folder)
                        except OSError:
                            message = "Unable to remove " + dest_folder
                            raise RuntimeError(message)

                    src_folder = os.path.join(
                        tex_src_folder, tex_folders[i], "FullText", "figures"
                    )
                    # Only create the destination if the article references
                    # at least one html-image related object.
                    qs = article.relatedobject_set.filter(rel="html-image")
                    if qs.count() > 0:
                        create_folder(dest_folder)

                    for related_obj in qs:
                        img_file = os.path.join(src_folder, os.path.basename(related_obj.location))
                        copy_file(img_file, dest_folder)

                i += 1
    else:
        # Copy from the archive: transfer the whole image folders directly.
        copy_html_images_in_folder(
            colid, issue, article_to_copy, from_folder, to_folder, "src/tex/figures"
        )
        copy_html_images_in_folder(
            colid, issue, article_to_copy, from_folder, to_folder, "src/media"
        )
def copy_file_obj_to_article_folder(
    file_obj, colid, issue_pid, article_pid, is_image=False, article_container_pid=None, path=""
):
    """Save an uploaded file object into the article's folder on disk.

    Non-images are stored as <article_pid><ext>; images keep their original
    basename and go under src/media (or *path* when given).
    Returns the file path relative to RESOURCES_ROOT.

    @param file_obj: uploaded file (presumably a Django UploadedFile — it
        must provide .name and .chunks())
    @param article_container_pid: currently unused; kept for compatibility
    """
    if not is_image:
        name, extension = os.path.splitext(file_obj.name)
        relative_folder = get_relative_folder(colid, issue_pid, article_pid)
        folder = os.path.join(settings.RESOURCES_ROOT, relative_folder)
        create_folder(folder)
        full_file_name = os.path.join(folder, article_pid + extension)
        relative_file_name = os.path.join(relative_folder, article_pid + extension)

        with open(full_file_name, "wb+") as destination:
            for chunk in file_obj.chunks():
                destination.write(chunk)

    else:
        name, extension = os.path.splitext(file_obj.name)
        relative_folder = get_relative_folder(colid, issue_pid, article_pid)
        if path:
            folder = os.path.join(settings.RESOURCES_ROOT, relative_folder, path)
        else:
            folder = os.path.join(settings.RESOURCES_ROOT, relative_folder + "/src/media")
        create_folder(folder)
        # Images keep their original basename.
        full_file_name = os.path.join(folder, name + extension)
        with open(full_file_name, "wb+") as destination:
            for chunk in file_obj.chunks():
                destination.write(chunk)

        # NOTE(review): the file is written as <name><ext> above, but the
        # returned relative name uses <article_pid><ext> and omits the
        # src/media (or *path*) segment — looks inconsistent; confirm what
        # callers expect before changing.
        relative_file_name = os.path.join(relative_folder, article_pid + extension)

    return relative_file_name
def copy_binary_files(resource, from_folder, to_folder, binary_files=None):
    """Copy a resource's binary files from *from_folder* to *to_folder*.

    Nothing happens when *to_folder* is None or equals *from_folder*.
    When *binary_files* is None, the HTML images are copied first and the
    file list is taken from resource.get_binary_files_location().
    Entries containing "http" (external links) get their destination folder
    created but are not copied.
    """
    if to_folder is None or from_folder == to_folder:
        return
    if binary_files is None:
        copy_html_images(resource, to_folder, from_folder)
        binary_files = resource.get_binary_files_location()

    for location in binary_files:
        to_path = os.path.join(to_folder, location)
        os.makedirs(os.path.dirname(to_path), exist_ok=True)
        if "http" in location:
            # External URL: nothing to copy from disk.
            continue
        from_path = os.path.join(from_folder, location)
        if os.path.isfile(from_path):
            copy_file(from_path, to_path)
def delete_object_folder(object_folder, to_folder):
    """Remove the directory *to_folder*/*object_folder* and everything in it.

    No check is made on the folder content: mirroring the database CASCADE
    mode, the directory is deleted regardless of what it still contains.
    A few guards refuse to delete top-level production/archive data roots.

    @raise Exception: when the resolved folder is one of the protected roots.
    """
    folder = os.path.normpath(os.path.join(to_folder, object_folder))

    # Safety guards: never wipe an entire data root.
    protected_roots = [
        "/mersenne_prod_data",
        "/mersenne_test_data",
        "/mathdoc_archive",
    ]
    if folder in protected_roots or folder.startswith("/cedram_dev"):
        raise Exception("Attention, pb avec la suppression de " + folder)

    if os.path.isdir(folder):
        shutil.rmtree(folder)
def delete_file(path):
    """Remove *path* when it exists as a regular file; otherwise do nothing."""
    if not os.path.isfile(path):
        return
    os.remove(path)
def get_disk_location(
    root_folder, collection_id, ext, container_id=None, article_id=None, do_create_folder=False
):
    """Return the on-disk filename of a resource.

    Layout: root/colid[/container_id[/article_id]]/<last_id>.<ext> where
    last_id is the deepest id provided.  With do_create_folder=True the
    intermediate directories are created as well.
    """
    if do_create_folder:
        create_folder(os.path.join(root_folder, collection_id))
        if container_id:
            create_folder(os.path.join(root_folder, collection_id, container_id))
        if article_id:
            create_folder(os.path.join(root_folder, collection_id, container_id, article_id))

    parts = [root_folder, collection_id]
    last_id = collection_id
    if container_id:
        parts.append(container_id)
        last_id = container_id
    if article_id:
        parts.append(article_id)
        last_id = article_id
    parts.append(last_id + "." + ext)
    return os.path.join(*parts)
def get_body(filename):
    """Read *filename* as UTF-8 text and return its entire content."""
    with open(filename, encoding="utf-8") as stream:
        return stream.read()
def get_archive_filename(root_folder, colid, pid, ext, do_create_folder=False, article_pid=None):
    """Return the path of an archived file.

    :param root_folder: root folder of the archive, e.g. /mathdoc_archive
    :param colid: collection id
    :param pid: issue id (falsy means the collection-level file)
    :param ext: filename extension ("xml" or "json")
    :param do_create_folder: recursively create the intermediate folders
    :param article_pid: optional article id (adds one more level)
    :return: the full filename
    """
    # TODO: call get_disk_location(root_folder, colid, ext, pid, None, do_create_folder)
    if do_create_folder:
        folder = os.path.join(root_folder, colid)
        create_folder(folder)
        if pid:
            folder = os.path.join(root_folder, colid, pid)
            create_folder(folder)
        if article_pid:
            folder = os.path.join(folder, article_pid)
            create_folder(folder)

    if pid and article_pid:
        return os.path.join(root_folder, colid, pid, article_pid, article_pid + "." + ext)
    if pid:
        return os.path.join(root_folder, colid, pid, pid + "." + ext)
    return os.path.join(root_folder, colid, colid + "." + ext)
439# Read the XML of an issue/collection within an archive folder
440# The folder must look like @COL/@ISSUE/@ISSUE.XML
441# @COL/@COL.XML
def get_archive_body(root_folder, colid, pid):
    """Read the archived XML of an issue (or of the collection when *pid* is falsy)."""
    return get_body(get_archive_filename(root_folder, colid, pid, "xml"))
def is_tex_comment(text, i):
    """Tell whether position *i* in *text* sits after a TeX comment marker.

    Spaces before *i* are skipped; a '%' (or a '~' directly preceded by
    '%') right before means the rest of the line is commented out.
    """
    while i > 0 and text[i] == " ":
        i -= 1
    if i >= 0 and text[i] == "%":
        return True
    if i > 0 and text[i] == "~" and text[i - 1] == "%":
        return True
    return False
def is_tex_def(text, i):
    """Tell whether the TeX command whose name starts at *i* is preceded by ``\\def``."""
    return text[i - 5 : i - 1] == "\\def"
def is_tex_newcommand(text, i):
    """Tell whether the TeX command whose name starts at *i* is preceded by ``\\newcommand``."""
    return text[i - 12 : i - 1] == "\\newcommand"
def get_cedram_issue_tex_folder(colid, issue_id):
    """Return the folder of an issue's TeX sources under settings.CEDRAM_TEX_FOLDER."""
    return os.path.join(settings.CEDRAM_TEX_FOLDER, colid, issue_id)
def get_cedram_tex_folders(colid, issue_id):
    """
    Return article filenames in the cedram TeX issue folder and the
    corresponding DOIs (None when absent), extracted from the issue TeX file.

    The issue file <issue_id>.tex is scanned for \\includearticle,
    \\includeprearticle and \\includepreface commands; the {…} argument is
    the article filename and an optional doi=… option is captured.

    @param colid: collection id
    @param issue_id: issue id
    @return: list of filenames, list of DOIs (parallel lists)
    """
    filenames = []
    dois = []

    body = ""
    issue_filename = os.path.join(get_cedram_issue_tex_folder(colid, issue_id), issue_id + ".tex")
    if os.path.isfile(issue_filename):
        try:
            with open(issue_filename, encoding="utf-8") as f:
                body = f.read()
        except UnicodeDecodeError:
            # Some legacy files are latin-1 encoded.
            with open(issue_filename, encoding="iso-8859-1") as f:
                body = f.read()

        # Lower-cased copy used only to find \includepreface case-insensitively.
        lower_body = body.lower()

        # Position of the first include command of any of the three kinds.
        li = []
        j = body.find("includearticle")
        if j >= 0:
            li.append(j)
        j = body.find("includeprearticle")
        if j >= 0:
            li.append(j)
        j = lower_body.find("includepreface")
        if j >= 0:
            li.append(j)
        i = min(li) if len(li) > 0 else -1

        while i >= 0:
            # Skip occurrences that are commented out or that are part of a
            # \def / \newcommand definition rather than a real inclusion.
            if (
                i > 1
                and not is_tex_comment(body, i - 2)
                and not is_tex_def(body, i)
                and not is_tex_newcommand(body, i)
            ):
                doi = None
                # Scan the optional [...] options up to the opening brace,
                # capturing a doi=... value if present.
                while body[i] != "{":
                    if len(body) > i + 4 and body[i : i + 4] == "doi=":
                        j = i + 4
                        while body[i] != "," and body[i] != "]":
                            i += 1
                        doi = xml_utils.normalize_space(body[j:i])
                        i += 1
                    i += 1
                # Capture the {filename} argument.
                filename = ""
                while body[i] != "}":
                    filename += body[i]
                    i += 1
                if len(filename) > 0:
                    filenames.append(filename)
                    dois.append(doi)
            else:
                i += 1

            # Find the next include command after the current position.
            li = []
            j = body.find("includearticle", i)
            if j >= 0:
                li.append(j)
            j = body.find("includeprearticle", i)
            if j >= 0:
                li.append(j)
            j = lower_body.find("includepreface", i)
            if j >= 0:
                li.append(j)
            i = min(li) if len(li) > 0 else -1

    return filenames, dois
def get_bibtex_from_tex(tex_filename):
    """Extract the \\bibliography{...} argument(s) from a TeX file.

    Reads the file (UTF-8, falling back to iso-8859-1) and returns the
    bibtex base name referenced by non-commented \\bibliography commands
    (concatenated if several), or "" when none is found.
    """
    bibtex_filename = ""

    body = ""
    if os.path.isfile(tex_filename):
        try:
            with open(tex_filename, encoding="utf-8") as f:
                body = f.read()
        except UnicodeDecodeError:
            # Some legacy files are latin-1 encoded.
            with open(tex_filename, encoding="iso-8859-1") as f:
                body = f.read()

        i = body.find("\\bibliography")
        while i >= 0:
            after = i + len("\\bibliography")
            if after < len(body) and body[after].isalpha():
                # Longer command such as \bibliographystyle: the previous
                # code matched it too and wrongly captured its argument.
                i = after
            elif i > 1 and not is_tex_comment(body, i - 2):
                # Skip to the opening brace, then capture the argument.
                while body[i] != "{":
                    i += 1
                i += 1
                while body[i] != "}":
                    bibtex_filename += body[i]
                    i += 1
            else:
                i += 1

            i = body.find("\\bibliography", i)

    return bibtex_filename
# PCJ (Peer Community Journal) section codes -> human-readable section names.
PCJ_SECTIONS = {
    "animsci": "Animal Science",
    "archaeo": "Archaeology",
    "ecology": "Ecology",
    "ecotoxenvchem": "Ecotoxicology & Environmental Chemistry",
    "evolbiol": "Evolutionary Biology",
    "forestwoodsci": "Forest & Wood Sciences",
    "genomics": "Genomics",
    "healthmovsci": "Health & Movement Sciences",
    "infections": "Infections",
    "mcb": "Mathematical & Computational Biology",
    "microbiol": "Microbiology",
    "networksci": "Network Science",
    "neuro": "Neuroscience",
    "orgstudies": "Organization Studies",
    "paleo": "Paleontology",
    "rr": "Registered Reports",
    "zool": "Zoology",
}

PCJ_UGA_SECTION = ["healthmovsci", "rr", "orgstudies"]
PCJ_CONFERENCES = ["Euring 2023"]
# Sections for which a topic is mandatory, with the associated topic name.
PCJ_MANDATORY_TOPICS = {
    "ecology": "Ecology",
    "evolbiol": "Evolution",
    "genomics": "Genetics/genomics",
    "paleo": "Paleontology",
    "archaeo": "Archaeology",
    "microbiol": "Microbiology",
    "neuro": "Neuroscience",
}


def get_pci(value):
    """Return the PCJ section name for a section code, or "" when unknown."""
    return PCJ_SECTIONS.get(value, "")
# Article type codes -> French display labels.
ARTICLE_TYPES = {
    "biographical-note": "Notice biographique",
    "book-review": "Recension d’ouvrage",
    "clarification": "Mise au point",
    "congress": "Intervention en colloque",
    "corrigendum": "Corrigendum",
    "editorial": "Éditorial",
    "erratum": "Erratum",
    "expression-of-concern": "Avertissement des éditeurs",
    "foreword": "Avant-propos",
    "guest-editors": "Rédacteurs invités",
    "historical-commentary": "Commentaire historique",
    "history-of-sciences": "Histoire des sciences et des idées",
    "letter": "Commentaire et réponse",
    "news": "C'est apparu dans la presse",
    "opinion": "Opinion / Perspective",
    "preliminary-communication": "Communication préliminaire",
    "research-article": "Article de recherche",
    "retraction": "Rétractation",
    "review": "Article de synthèse",
    "software-tool": "Outil logiciel",
}
# External-id type -> short label displayed in templates.
# From : apps/ptf/templates/common/externalid_detail.html
extids_formats: dict[str, str] = {
    "arxiv": "arXiv",
    "tel": "TEL",
    "hal": "HAL",
    "theses.fr": "theses.fr",
    "doi": "DOI",
    "numdam-id": "Numdam",
    "sps-id": "SPS",
    "mr-item-id": "MR",
    "zbl-item-id": "Zbl",
    "jfm-item-id": "JFM",
    "eudml-item-id": "EuDML",
    "pmid": "PMID",
    "ark": "Gallica",
}