Coverage for src/ptf/cmds/ptf_cmds/archive_ptf_cmds.py: 7%
322 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-05 09:56 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-05 09:56 +0000
1import datetime
2import os
3import shutil
4import subprocess
6import lxml.etree as etree
7import requests
9from django.conf import settings
11from ptf import model_helpers
12from ptf.cmds.database_cmds import baseCmd
13from ptf.display import resolver
15from .base_ptf_cmds import exportExtraDataPtfCmd
16from .base_ptf_cmds import exportPtfCmd
def create_toc_xml(colid, issues):
    """
    Create the toc.xml file for the collection.

    The file is written in <MATHDOC_ARCHIVE_FOLDER>/<colid>/toc.xml. It holds a
    <journal> root with one <journal-meta> child per issue (journal-id, year,
    optional series/volume/number, and folder = pid of the issue).
    Nothing is written when the collection folder is not readable.

    NOTE(review): the indentation of the original was lost in extraction; this
    assumes the whole body was guarded by the folder access check - confirm.

    :param colid: pid of the collection (ex: JEP), also the name of its archive folder
    :param issues: iterable of objects exposing year, vseries, volume, number and pid
    """
    collection_folder = os.path.join(settings.MATHDOC_ARCHIVE_FOLDER, colid)
    if not os.access(collection_folder, os.R_OK):
        return

    journal = etree.Element("journal")
    for issue in issues:
        journal_meta = etree.SubElement(journal, "journal-meta")
        etree.SubElement(journal_meta, "journal-id").text = colid  # Ex: JEP
        etree.SubElement(journal_meta, "year").text = issue.year  # Ex: 2019

        # series and number may be empty: only non-empty values get an element
        optional_fields = (
            ("series", issue.vseries),
            ("volume", issue.volume),
            ("number", issue.number),
        )
        for tag, value in optional_fields:
            if value:
                etree.SubElement(journal_meta, tag).text = value

        etree.SubElement(journal_meta, "folder").text = issue.pid

    # Use an absolute path rather than os.chdir(): changing the working directory
    # would affect the whole process
    toc_path = os.path.join(collection_folder, "toc.xml")
    if os.access(toc_path, os.R_OK):
        os.remove(toc_path)

    node_str = etree.tostring(journal, pretty_print=True, encoding="unicode")
    # No XML declaration is written, so the file has to be UTF-8
    with open(toc_path, "w", encoding="utf-8") as toc_file:
        toc_file.write(node_str)
class archiveCollectionPtfCmd(baseCmd):
    """
    Archive the collection on disk: export the collection in the archive folder,
    then write its toc.xml.
    """

    def __init__(self, params=None):
        """
        :param params: dict with "colid" (pid of the collection, required) and
            "issues" (issues to list in toc.xml). Both keys are read as soon as
            params is given.
        """
        super().__init__(params)

        # Always define the attributes read by internal_do, even without params
        self.pid = None
        self.issues = []
        self.mathdoc_archive = settings.MATHDOC_ARCHIVE_FOLDER
        self.binary_files_folder = settings.MERSENNE_PROD_DATA_FOLDER

        if params is not None:
            self.pid = params["colid"]
            self.issues = params["issues"]

        self.required_params.extend(["colid"])

    def internal_do(self):
        """
        Export the collection (with its binary files, without internal data) in
        self.mathdoc_archive, then create the toc.xml of the collection.
        """
        export_params = {
            "pid": self.pid,  # The collection is exported
            "export_folder": self.mathdoc_archive,
            "for_archive": True,
            "with_internal_data": False,  # Internal data are not exported
            "with_binary_files": True,  # Binary files (ex: img) are exported
            "binary_files_folder": self.binary_files_folder,
        }
        exportPtfCmd(export_params).do()

        create_toc_xml(self.pid, self.issues)
class archiveIssuePtfCmd(baseCmd):
    """
    Archive the issue on disk.

    The issue XML is exported in self.export_folder, then (unless xml_only is set)
    the binary files, the LaTeX sources, the digitisation sources and the PDF/A
    of the articles are archived.
    Call set_article() to archive only 1 article of the issue.
    """

    def __init__(self, params=None):
        """
        :param params: optional dict of parameters; "issue" is read here if present
        """
        self.pid = None  # container pid
        self.article = None  # Allow archiving of only 1 article
        self.skip_pdfa = False
        self.xml_only = False  # Geodesic needs to archive only the XML
        # params defaults to None: do not crash on the lookup
        self.issue = params.get("issue") if params else None

        # A derived class may set other default values before calling this constructor
        if not hasattr(self, "export_folder"):
            self.export_folder = settings.MATHDOC_ARCHIVE_FOLDER
        if not hasattr(self, "binary_files_folder"):
            self.binary_files_folder = settings.MERSENNE_PROD_DATA_FOLDER

        super().__init__(params)

        self.required_params.extend(["pid"])

    # Pass an article to archive only 1 article
    def set_article(self, article):
        self.article = article

    @staticmethod
    def _append_log(message, log_name="archive.log"):
        """Append message (plus a newline) to the log_name file of settings.LOG_DIR."""
        with open(os.path.join(settings.LOG_DIR, log_name), "a", encoding="utf-8") as file_:
            file_.write(f"{message}\n")

    @staticmethod
    def _copy_bib_file(src_folder, dest_folder, base_file):
        """Copy <base_file>.bib from src_folder to dest_folder if the file exists."""
        full_src_file = os.path.join(src_folder, base_file + ".bib")
        if os.path.isfile(full_src_file):
            full_dest_file = os.path.join(dest_folder, base_file + ".bib")
            resolver.copy_file(full_src_file, full_dest_file)

    def archive_tex_src(self, article, colid, tex_src_folder, tex_article_folder):
        """
        Archive the files needed to re-compile the LaTeX source of an article.

        Files are copied from <tex_src_folder>/<tex_article_folder> to the src/tex/
        folder of the article in self.export_folder.

        :param article: article to archive (gives the relative destination folder)
        :param colid: pid of the collection (unused here)
        :param tex_src_folder: folder with the tex sources of the issue
        :param tex_article_folder: name of the article folder, also the base name
            of its .tex/.aux files
        :raises RuntimeError: if a grep command fails (exit code other than 0 or 1)
        """
        src_folder = os.path.join(tex_src_folder, tex_article_folder)
        dest_folder = os.path.join(self.export_folder, article.get_relative_folder(), "src/tex/")

        resolver.create_folder(dest_folder)
        self._append_log(f"Create {dest_folder}")

        # 1. Copy tex file
        tex_file = os.path.join(src_folder, tex_article_folder + ".tex")
        resolver.copy_file(tex_file, dest_folder)

        # 2. sty files
        for name in os.listdir(src_folder):
            sty_file = os.path.join(src_folder, name)
            if os.path.isfile(sty_file) and name.endswith(".sty"):
                resolver.copy_file(sty_file, dest_folder)

        # 3. bib file
        aux_file = os.path.join(src_folder, tex_article_folder + ".aux")
        if os.path.isfile(aux_file):
            cmd_str = "cd " + src_folder + "; grep bibdata " + aux_file
            try:
                result = subprocess.check_output(cmd_str, shell=True).decode(encoding="utf-8")
            except subprocess.CalledProcessError as e:
                if e.returncode != 1:  # grep returns 1 if nothing was found
                    message = f'Error {e.returncode} with "{cmd_str}": {e.output}'
                    raise RuntimeError(message) from e
            else:
                for line in result.split("\n"):
                    # Keep the text between the first "{" and the following "}"
                    after_brace = line.split("{")
                    if len(after_brace) < 2:
                        continue
                    inside = after_brace[1].split("}")
                    if len(inside) < 2:
                        continue
                    # Several databases may be listed, separated by commas
                    for base_file in inside[0].split(","):
                        self._copy_bib_file(src_folder, dest_folder, base_file.strip())
        else:
            base_file = resolver.get_bibtex_from_tex(tex_file)
            if base_file:
                self._copy_bib_file(src_folder, dest_folder, base_file)

        # 4. cdrdoidates, figures/ folder,...
        # The *.fls files are filtered: texlive files and *.out files are ignored,
        # only lines with a "./" path are kept
        cmd_str = (
            "cd "
            + src_folder
            + r"; grep -v /usr/local/texlive/ *.fls | grep '\./' | grep -v '\.out$' | sort -u"
        )

        try:
            result = subprocess.check_output(cmd_str, shell=True).decode(encoding="utf-8")
        except subprocess.CalledProcessError as e:
            if e.returncode != 1:  # grep returns 1 if nothing was found
                message = f'Error {e.returncode} with "{cmd_str}": {e.output}'
                raise RuntimeError(message) from e
            return

        for line in result.split("\n"):
            words = line.split(" ")
            if len(words) < 2:
                continue

            # The second word of the line is the path relative to src_folder
            file = words[1]
            full_src_file = os.path.join(src_folder, file)
            if os.path.isfile(full_src_file):
                # Keep the sub-folder structure (ex: figures/) in the archive
                resolver.create_folder(os.path.join(dest_folder, os.path.dirname(file)))
                resolver.copy_file(full_src_file, os.path.join(dest_folder, file))

    def create_pdfa(self, colid, article_pid):
        """
        Create the PDF/A of the PDF of an article with Ghostscript (gs).

        <article_pid>.pdf is read from the article folder of self.export_folder
        and <article_pid>_PDFA.pdf is written next to it.

        :raises RuntimeError: if the conversion fails; the article pid is also
            appended to archive_error.log
        """
        article_folder = os.path.join(self.export_folder, colid, self.pid, article_pid)
        in_file = os.path.join(article_folder, article_pid + ".pdf")
        out_file = os.path.join(article_folder, article_pid + "_PDFA.pdf")

        # Arguments are passed as a list (no shell): paths with spaces are handled
        gs_cmd = [
            "gs",
            "-dSAFER",
            "-dBATCH",
            "-DNOPAUSE",
            "-sPAPERSIZE=halfletter",
            "-dPDFFitPage",
            "-dFIXEDMEDIA",
            "-dEmbedAllFonts=true",
            "-sDEVICE=pdfwrite",
            "-dCompatibilityLevel=1.4",
            "-dPrinted=false",
            "-dPDFSETTINGS=/printer",
            "-q",
            "-o",
            out_file,
            in_file,
        ]
        try:
            converted = subprocess.run(gs_cmd, check=False).returncode == 0
        except OSError:
            # gs could not be started: handled like a failed conversion
            converted = False

        if not converted:
            self._append_log(article_pid + " , PDF/A", log_name="archive_error.log")
            raise RuntimeError(f"Le fichier {in_file} n'a pas pu être converti en PDFA")

        self._append_log(f"Write {out_file}")

    def backup_pdfa_if_necessary(self, colid, article_pids):
        """
        Since it is time consuming to create a PDF/A, check if they have to be re-created
        before deleting the /mathdoc_archive issue folder.

        If not, backup the PDF/A in a temp folder. They will be put back in /mathdoc_archive
        in create_or_restore_pdfa()

        To check, we compare the date (day) of the article PDF between
        - /mathdoc_archive and
        - self.binary_files_folder (/mersenne_prod_data or /numdam_data)

        :return: dict {article_pid: True if the PDF/A has to be (re-)created}
        """

        def modification_day(path):
            return datetime.datetime.fromtimestamp(os.stat(path).st_mtime).strftime("%Y-%m-%d")

        pdfas = {}
        tmp_folder = os.path.join(settings.LOG_DIR, "tmp/archive", colid, self.pid)
        resolver.create_folder(tmp_folder)
        self._append_log(f"Create {tmp_folder}")

        # Before deleting the issue folder, check if we have to recreate the PDF/A
        # If the PDF has been updated in the binary files folder, we recreate the PDF/A
        for article_pid in article_pids:
            pdf_in_archive = resolver.get_disk_location(
                self.export_folder, colid, "pdf", self.pid, article_pid
            )
            pdf_in_prod = resolver.get_disk_location(
                self.binary_files_folder, colid, "pdf", self.pid, article_pid
            )
            pdfa_in_archive = os.path.join(
                self.export_folder, colid, self.pid, article_pid, article_pid + "_PDFA.pdf"
            )

            # By default (a PDF is missing), the PDF/A is created
            do_create_pdfa = True
            if os.path.isfile(pdf_in_prod) and os.path.isfile(pdf_in_archive):
                do_create_pdfa = not os.path.isfile(pdfa_in_archive) or modification_day(
                    pdf_in_prod
                ) != modification_day(pdf_in_archive)

            pdfas[article_pid] = do_create_pdfa
            if not do_create_pdfa:
                # Copy the PDF/A in the temp folder
                dest_pdfa = os.path.join(tmp_folder, article_pid + "_PDFA.pdf")
                resolver.copy_file(pdfa_in_archive, dest_pdfa)
                self._append_log(f"Backup {dest_pdfa}")

        return pdfas

    def create_or_restore_pdfa(self, colid, article_pids, pdfas):
        """
        Create the PDF/A of each article, or restore the one saved in the temp
        folder by backup_pdfa_if_necessary(). The temp folder is then deleted.

        :param pdfas: dict {article_pid: True if the PDF/A has to be created}
        """
        tmp_folder = os.path.join(settings.LOG_DIR, "tmp/archive", colid, self.pid)

        for article_pid in article_pids:
            if pdfas[article_pid]:
                self.create_pdfa(colid, article_pid)
                continue

            src_pdfa = os.path.join(tmp_folder, article_pid + "_PDFA.pdf")
            dest_pdfa = os.path.join(
                self.export_folder, colid, self.pid, article_pid, article_pid + "_PDFA.pdf"
            )
            resolver.copy_file(src_pdfa, dest_pdfa)
            self._append_log(f"Restore {dest_pdfa}")

        if os.path.isdir(tmp_folder):
            shutil.rmtree(tmp_folder)
            self._append_log(f"Delete {tmp_folder}")

    def archive_files(self, colid, container, articles, article_pids, pdfas):
        """
        Archive everything but the issue XML: binary files, extra data, LaTeX
        sources, digitisation sources and PDF/A.

        NOTE(review): the indentation of the original was lost in extraction; this
        assumes the extra data export runs once, after the loop on articles - confirm.
        """
        # II. Copy binary files (PDF...)
        for a in articles:
            article_folder = a.get_relative_folder()
            self._append_log(f"Delete {article_folder}")

            resolver.delete_object_folder(article_folder, to_folder=self.export_folder)
            resolver.copy_binary_files(a, self.binary_files_folder, self.export_folder)

        params = {"pid": self.pid, "export_folder": self.export_folder, "export_all": False}
        exportExtraDataPtfCmd(params).do()

        tex_src_folder = resolver.get_cedram_issue_tex_folder(colid, self.pid)
        tex_folders, _ = resolver.get_cedram_tex_folders(colid, self.pid)

        # III. Articles written in LaTeX. We need to archive files needed to re-compile
        # the LaTeX source code
        if tex_folders:
            # tex_folders is indexed by the position of the article among the
            # published articles of the container
            published_articles = container.article_set.exclude(do_not_publish=True)
            for index, article in enumerate(published_articles):
                if self.article is None or self.article.pid == article.pid:
                    self.archive_tex_src(article, colid, tex_src_folder, tex_folders[index])

        # IV. Digitized papers (Numdam). We basically need to archive the images (*.tif)
        copy_numdam_src_files(colid, self.pid, article_pids, self.export_folder)

        # V. Create PDF/A
        if not self.skip_pdfa:
            self.create_or_restore_pdfa(colid, article_pids, pdfas)

    def internal_do(self):
        """
        Archive the issue, or only self.article if set_article() was called.
        """
        super().internal_do()

        if self.article is None:
            container = model_helpers.get_container(self.pid, prefetch=False)
            articles = container.article_set.all()
            article_pids = list(articles.values_list("pid", flat=True))
        else:
            container = self.article.my_container
            article_pids = [self.article.pid]
            articles = [self.article]

        colid = container.get_top_collection().pid
        self.pid = container.pid

        pdfas = {}
        if not self.xml_only and not self.skip_pdfa:
            # Backup PDF/A before deleting the issue folder
            # (it is time consuming to create a PDF/A, we will check if they have to be re-created)
            pdfas = self.backup_pdfa_if_necessary(colid, article_pids)

        # Delete the issue folder if we archive an issue
        if self.article is None:
            issue_folder = container.get_relative_folder()
            self._append_log(f"Delete {self.export_folder}/{issue_folder}")
            resolver.delete_object_folder(issue_folder, to_folder=self.export_folder)

        # I. Always archive the issue XML, even if we archive only 1 article
        exportPtfCmd(
            {
                "pid": self.pid,
                "export_folder": self.export_folder,
                # binary files for 1 article are copied in archive_files
                "with_binary_files": self.article is None,
                "for_archive": True,
                "binary_files_folder": self.binary_files_folder,
            }
        ).do()

        if not self.xml_only:
            self.archive_files(colid, container, articles, article_pids, pdfas)
def archive_numdam_xml(colid, pid, export_folder):
    """
    Fetch the XML of an issue (or of the collection when pid is None) from the
    Numdam API and write it in the archive. An empty response is not written.
    """
    item_id = colid if pid is None else pid
    response = requests.get(settings.NUMDAM_URL + "/api-item-xml/" + item_id)
    response.raise_for_status()

    xml_body = response.content.decode("utf-8")
    if not xml_body:
        return

    xml_path = resolver.get_archive_filename(export_folder, colid, pid, "xml", True)
    with open(xml_path, "w", encoding="utf-8") as xml_file:
        xml_file.write(xml_body)
def get_numdam_issues_list(colid):
    """
    Return the "issues" entry of the JSON sent by the Numdam API for a collection.
    An HTTP error status raises an exception.
    """
    issues_url = f"{settings.NUMDAM_URL}/api-issues/{colid}"
    response = requests.get(issues_url)
    response.raise_for_status()

    payload = response.json()
    return payload["issues"]
def get_numdam_file_list(colid, pid):
    """
    Return the list of files to archive for an issue (or for the collection
    when pid is None), as sent in JSON by the Numdam API.
    The files are those visible by the user, like PDF or DjVus
    """
    item_id = colid if pid is None else pid
    response = requests.get(settings.NUMDAM_URL + "/api-item-file-list/" + item_id)
    response.raise_for_status()

    return response.json()
def _copy_digitisation_files(src_folder, dest_folder, xml_name, log_file=None):
    """
    Create dest_folder, then copy xml_name and the *.tif / *.jpg files of
    src_folder into it. Files are only copied if they exist; a missing
    src_folder is not an error.
    """

    def log(text):
        if log_file:
            log_file.write(text)

    log("Create " + dest_folder + "...")
    resolver.create_folder(dest_folder)
    log("done\n")

    # a. XML file
    full_src_file = os.path.join(src_folder, xml_name)
    if os.path.isfile(full_src_file):
        log(f"Copy {os.path.basename(full_src_file)} ...")
        resolver.copy_file(full_src_file, os.path.join(dest_folder, xml_name))
        log("done\n")

    # b. tif, jpg files
    if not os.path.isdir(src_folder):
        # os.listdir would raise on a missing folder
        return

    for name in os.listdir(src_folder):
        img_file = os.path.join(src_folder, name)
        if name.endswith((".tif", ".jpg")) and os.path.isfile(img_file):
            log(f"Copy {name} ...")
            resolver.copy_file(img_file, dest_folder)
            log("done\n")


def copy_numdam_src_files(colid, pid, article_pids, export_folder, log_file=None):
    """
    Copy the digitisation source files (XML, tif, jpg) of an issue and of its
    articles into the src/digitisation/ folders of export_folder.

    Nothing is done if the issue has no folder in settings.NUMDAM_ISSUE_SRC_FOLDER.

    :param colid: pid of the collection
    :param pid: pid of the issue
    :param article_pids: pids of the articles of the issue
    :param export_folder: root folder of the archive
    :param log_file: optional object with a write() method, to trace the copies
    """
    src_folder = os.path.join(settings.NUMDAM_ISSUE_SRC_FOLDER, colid, pid)
    if not os.path.isdir(src_folder):
        return

    # 1. Files related to the issue: issue.xml + images
    dest_folder = os.path.join(
        export_folder, resolver.get_relative_folder(colid, pid), "src/digitisation/"
    )
    _copy_digitisation_files(src_folder, dest_folder, pid + ".xml", log_file)

    # 2. Files related to articles: article.xml (Full Text) + images
    for article_pid in article_pids:
        src_folder = os.path.join(settings.NUMDAM_ARTICLE_SRC_FOLDER, colid, pid, article_pid)
        dest_folder = os.path.join(
            export_folder,
            resolver.get_relative_folder(colid, pid, article_pid),
            "src/digitisation/",
        )
        _copy_digitisation_files(src_folder, dest_folder, article_pid + ".xml", log_file)
556 # PDF/DJVU without headers
557 # Olivier 09/05/2019: these files are almost identicals to the final PDFs
558 # There is no need to archive them. Just use the final PDF and remove the first page
559 # if needed
561 # # 2c. pdf files (without header)
562 # full_src_file = os.path.join(src_folder, article_pid + '.pdf')
563 # if os.path.isfile(full_src_file):
564 # full_dest_file = os.path.join(dest_folder, article_pid + '.pdf')
565 # if file:
566 # file.write("Copy {} ...".format(os.path.basename(full_src_file)))
567 # copy_file(full_src_file, full_dest_file)
568 # if file:
569 # file.write("done\n")
570 #
573# def copy_numdam_djvu(colid, pid, article_pids, export_folder, log_file=None):
574# """
575# Djvu might not be visible/listed in centre Mersenne articles, but might exist in Numdam
576# """
577#
578# if hasattr(settings, "NUMDAM_DATA_ROOT"):
579# for article_pid in article_pids:
580# article_folder = resolver.get_relative_folder(colid, pid, article_pid)
581# full_src_file = os.path.join(
582# settings.NUMDAM_DATA_ROOT, article_folder, article_pid + ".djvu"
583# )
584# if os.path.isfile(full_src_file):
585# full_dest_file = os.path.join(export_folder, article_folder, article_pid + ".djvu")
586# if log_file:
587# log_file.write(f"Copy {os.path.basename(full_src_file)} ...")
588# resolver.copy_file(full_src_file, full_dest_file)
589# if log_file:
590# log_file.write("done\n")
class archiveNumdamResourcePtfCmd(archiveIssuePtfCmd):
    """
    Archive a resource stored in Numdam: a Container, or a Collection
    (just the collection level)
    """

    def __init__(self, params=None):
        # self.pid from the base class is the id of the container
        self.colid = None
        # Folders are set before the base constructor runs: it only applies its
        # own defaults to attributes that do not exist yet
        self.export_folder = settings.MATHDOC_ARCHIVE_FOLDER
        self.binary_files_folder = settings.NUMDAM_DATA_ROOT

        super().__init__(params)

        self.required_params.append("colid")
        # self.pid is optional when you want to archive a Collection
        self.required_params = [name for name in self.required_params if name != "pid"]

    def internal_do(self):
        """
        Archive files of Numdam.
        - Send http requests to numdam.org to get the list of user files (PDF/DjVu/XML) to preserve
        - Copy these files to self.export_folder (/mathdoc_archive)
        - Copy src files (mainly digitized TIF/JPG files) from /numdam_dev

        Warning: this class does not operate with Resource objects stored in the database
        (Collection, Container...) since the data comes from numdam.org
        Information is only based on pids (collection, issue, article)
        """

        # 1. Ask numdam.org for the files to archive (files visible by the user, like PDF or DjVus)
        data = get_numdam_file_list(self.colid, self.pid)
        article_pids = []
        pdfas = {}

        # 2. For an Issue: backup the PDF/A, then delete the previous folder
        if self.pid is not None:
            article_pids = [article["pid"] for article in data.get("articles", [])]

            # Creating a PDF/A is time consuming: those that do not have to be
            # re-created are saved before the issue folder is deleted
            pdfas = self.backup_pdfa_if_necessary(self.colid, article_pids)

            issue_folder = resolver.get_relative_folder(self.colid, self.pid)

            log_path = os.path.join(settings.LOG_DIR, "archive.log")
            with open(log_path, "a", encoding="utf-8") as log_file:
                log_file.write(f"Delete {self.export_folder}/{issue_folder}\n")

            resolver.delete_object_folder(issue_folder, self.export_folder)

        # 3. Archive the JATS XML of the pid
        # TODO: The XML coming from Numdam does not list PDF/A in the <self-uri> of the articles,
        #       only PDF/DjVu. create_or_restore_pdfa is going to add the PDF/A on disk
        #       We should modify the xml to add the PDF/A
        archive_numdam_xml(self.colid, self.pid, self.export_folder)

        # 4. Archive the user files (list gotten in 1.)
        src_root = self.binary_files_folder
        dest_root = self.export_folder

        if "files" in data:
            # Files of a Collection or a Container
            resolver.copy_binary_files(None, src_root, dest_root, data["files"])

        # In case of a Container, files of each article
        for article_data in data.get("articles", []):
            resolver.copy_binary_files(None, src_root, dest_root, article_data["files"])

        if self.pid is None:
            return

        # 5. Archive the src files (tiff, pdf/djvu without headers,...)
        copy_numdam_src_files(self.colid, self.pid, article_pids, self.export_folder)

        # 6. Create PDF/A
        self.create_or_restore_pdfa(self.colid, article_pids, pdfas)