Coverage for src/ptf/cmds/ptf_cmds/archive_ptf_cmds.py: 7%

322 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-11-05 09:56 +0000

1import datetime 

2import os 

3import shutil 

4import subprocess 

5 

6import lxml.etree as etree 

7import requests 

8 

9from django.conf import settings 

10 

11from ptf import model_helpers 

12from ptf.cmds.database_cmds import baseCmd 

13from ptf.display import resolver 

14 

15from .base_ptf_cmds import exportExtraDataPtfCmd 

16from .base_ptf_cmds import exportPtfCmd 

17 

18 

def create_toc_xml(colid, issues):
    """
    Create the toc.xml file for the collection.

    The file is written in the collection folder
    (``settings.MATHDOC_ARCHIVE_FOLDER/<colid>``). It contains a ``<journal>`` root
    with one ``<journal-meta>`` child per issue, holding ``<journal-id>``, ``<year>``,
    the optional ``<series>``, ``<volume>`` and ``<number>``, and ``<folder>``.

    Nothing is written when the collection folder is not readable.

    :param colid: pid of the collection (used as folder name and as ``<journal-id>``)
    :param issues: iterable of objects exposing ``year``, ``vseries``, ``volume``,
        ``number`` and ``pid`` as text values
    """
    collection_folder = os.path.join(settings.MATHDOC_ARCHIVE_FOLDER, colid)
    if not os.access(collection_folder, os.R_OK):
        return

    # Use an explicit path instead of os.chdir(): changing the working directory
    # affects the whole process and breaks later relative paths.
    toc_path = os.path.join(collection_folder, "toc.xml")
    if os.access(toc_path, os.R_OK):
        os.remove(toc_path)

    journal = etree.Element("journal")

    for issue in issues:
        meta = etree.SubElement(journal, "journal-meta")

        etree.SubElement(meta, "journal-id").text = colid  # For example: JEP
        etree.SubElement(meta, "year").text = issue.year  # For example: 2019

        # series, volume and number may be empty (or None): only emit them when set
        optional_fields = (
            ("series", issue.vseries),
            ("volume", issue.volume),
            ("number", issue.number),
        )
        for tag, value in optional_fields:
            if value:
                etree.SubElement(meta, tag).text = value

        etree.SubElement(meta, "folder").text = issue.pid

    node_str = etree.tostring(journal, pretty_print=True, encoding="unicode")

    # The context manager closes the file even if the write fails; the explicit
    # encoding keeps non-ASCII text independent of the locale.
    with open(toc_path, "w", encoding="utf-8") as toc_file:
        toc_file.write(node_str)

65 

66 

class archiveCollectionPtfCmd(baseCmd):
    """
    Archive the collection on disk

    Exports the collection to ``settings.MATHDOC_ARCHIVE_FOLDER`` with exportPtfCmd,
    then writes the toc.xml of the collection with create_toc_xml().

    Expected params:
    - "colid": pid of the collection (required)
    - "issues": the issues listed in toc.xml
    """

    def __init__(self, params=None):
        super().__init__(params)

        # Always define the attributes read by internal_do(), so that a command
        # built without params does not fail later with an AttributeError.
        self.pid = None
        self.issues = []
        self.mathdoc_archive = settings.MATHDOC_ARCHIVE_FOLDER
        self.binary_files_folder = settings.MERSENNE_PROD_DATA_FOLDER

        if params is not None:
            self.pid = params["colid"]
            self.issues = params["issues"]

        self.required_params.extend(["colid"])

    def internal_do(self):
        """Export the collection to the archive folder, then (re)create its toc.xml."""
        exportPtfCmd(
            {
                "pid": self.pid,  # Export the collection
                "export_folder": self.mathdoc_archive,  # Export in the archive folder
                "for_archive": True,  # Export for the archive
                "with_internal_data": False,  # Do not export the internal data
                "with_binary_files": True,  # Export the binary files (ex: img)
                "binary_files_folder": self.binary_files_folder,
            }
        ).do()

        create_toc_xml(self.pid, self.issues)

96 

97 

class archiveIssuePtfCmd(baseCmd):
    """
    Archive the issue on disk

    The issue XML is always exported to ``export_folder``. Unless ``xml_only`` is set,
    the binary files, the extra data, the LaTeX sources and the Numdam digitisation
    sources are archived as well, and (unless ``skip_pdfa`` is set) the ``*_PDFA.pdf``
    files are created or restored.
    """

    def __init__(self, params=None):
        self.pid = None  # container pid
        self.article = None  # Allow archiving of only 1 article
        self.skip_pdfa = False
        self.xml_only = False  # Geodesic needs to archive only the XML
        # params defaults to None: guard the lookup of the optional "issue" entry
        self.issue = params.get("issue") if params else None

        # The derived archiveNumdamIssuePtfCmd class has other default values
        if not hasattr(self, "export_folder"):
            self.export_folder = settings.MATHDOC_ARCHIVE_FOLDER
        if not hasattr(self, "binary_files_folder"):
            self.binary_files_folder = settings.MERSENNE_PROD_DATA_FOLDER

        super().__init__(params)

        self.required_params.extend(["pid"])

    @staticmethod
    def _log(message, filename="archive.log"):
        """Append ``message`` to ``filename`` inside ``settings.LOG_DIR``."""
        with open(os.path.join(settings.LOG_DIR, filename), "a", encoding="utf-8") as file_:
            file_.write(message)

    @staticmethod
    def _grep_output(cmd, **kwargs):
        """
        Run a grep-like command and return its decoded standard output.

        An exit status of 1 (grep found nothing) yields an empty string; any other
        non-zero status raises a RuntimeError.
        """
        try:
            return subprocess.check_output(cmd, **kwargs).decode(encoding="utf-8")
        except subprocess.CalledProcessError as e:
            if e.returncode != 1:  # grep returns 1 if nothing was found
                cmd_str = cmd if isinstance(cmd, str) else " ".join(cmd)
                message = f'Error {e.returncode} with "{cmd_str}": {e.output}'
                raise RuntimeError(message) from e
            return ""

    @staticmethod
    def _copy_bib_file(src_folder, dest_folder, base_file):
        """Copy ``<base_file>.bib`` from src_folder to dest_folder if it exists."""
        full_src_file = os.path.join(src_folder, base_file + ".bib")
        if os.path.isfile(full_src_file):
            full_dest_file = os.path.join(dest_folder, base_file + ".bib")
            resolver.copy_file(full_src_file, full_dest_file)

    # Pass an article to archive only 1 article
    def set_article(self, article):
        self.article = article

    def archive_tex_src(self, article, colid, tex_src_folder, tex_article_folder):
        """
        Archive the files needed to re-compile the LaTeX source of an article.

        Copies the .tex file, the .sty files, the .bib file(s) and the local files
        listed in the *.fls recorder files into ``<article folder>/src/tex/``.

        :param article: object exposing get_relative_folder()
        :param colid: pid of the collection (not used by this method)
        :param tex_src_folder: folder holding the LaTeX sources of the issue
        :param tex_article_folder: name of the article sub-folder; also the base name
            of its .tex / .aux files
        :raises RuntimeError: if a grep command fails with a status other than 1
        """
        src_folder = os.path.join(tex_src_folder, tex_article_folder)
        dest_folder = os.path.join(self.export_folder, article.get_relative_folder(), "src/tex/")

        resolver.create_folder(dest_folder)
        self._log(f"Create {dest_folder}\n")

        # 1. Copy tex file
        tex_file = os.path.join(src_folder, tex_article_folder + ".tex")
        resolver.copy_file(tex_file, dest_folder)

        # 2. sty files
        sty_files = [
            os.path.join(src_folder, f)
            for f in os.listdir(src_folder)
            if os.path.isfile(os.path.join(src_folder, f)) and f.endswith(".sty")
        ]
        for sty_file in sty_files:
            resolver.copy_file(sty_file, dest_folder)

        # 3. bib file
        aux_file = os.path.join(src_folder, tex_article_folder + ".aux")
        if os.path.isfile(aux_file):
            # Argument list (no shell): paths with spaces or shell metacharacters are safe
            result = self._grep_output(["grep", "bibdata", aux_file])
            for line in result.split("\n"):
                # Extract the text between the first "{" and the following "}"
                words = line.split("{")
                if len(words) > 1:
                    words = words[1].split("}")
                    if len(words) > 1:
                        # \bibdata{a,b} may list several comma-separated bib files
                        for base_file in words[0].split(","):
                            self._copy_bib_file(src_folder, dest_folder, base_file.strip())
        else:
            base_file = resolver.get_bibtex_from_tex(tex_file)
            if base_file:
                self._copy_bib_file(src_folder, dest_folder, base_file)

        # 4. cdrdoidates, figures/ folder,...
        # The pipeline needs a shell (glob and pipes) but is a constant string:
        # the folder is passed with cwd= instead of being concatenated into it.
        cmd_str = r"grep -v /usr/local/texlive/ *.fls | grep '\./' | grep -v '\.out$' | sort -u"
        result = self._grep_output(cmd_str, shell=True, cwd=src_folder)
        for line in result.split("\n"):
            # The file path is the second space-separated word of the line
            words = line.split(" ")
            if len(words) > 1:
                file = words[1]
                file_folder = os.path.dirname(file)

                full_src_file = os.path.join(src_folder, file)
                if os.path.isfile(full_src_file):
                    file_dest_folder = os.path.join(dest_folder, file_folder)
                    resolver.create_folder(file_dest_folder)

                    full_dest_file = os.path.join(dest_folder, file)
                    resolver.copy_file(full_src_file, full_dest_file)

    def create_pdfa(self, colid, article_pid):
        """
        Create the ``<article_pid>_PDFA.pdf`` file of an article from its PDF

        Runs Ghostscript (``gs``, pdfwrite device) on the PDF located in the article
        folder of ``export_folder``.

        :raises RuntimeError: if Ghostscript fails or cannot be started (the article
            pid is also appended to archive_error.log)
        """

        article_folder = os.path.join(self.export_folder, colid, self.pid, article_pid)
        in_file = os.path.join(article_folder, article_pid + ".pdf")
        out_file = os.path.join(article_folder, article_pid + "_PDFA.pdf")

        # Argument list (no shell): file names are passed verbatim to gs
        cmd = [
            "gs",
            "-dSAFER",
            "-dBATCH",
            "-DNOPAUSE",
            "-sPAPERSIZE=halfletter",
            "-dPDFFitPage",
            "-dFIXEDMEDIA",
            "-dEmbedAllFonts=true",
            "-sDEVICE=pdfwrite",
            "-dCompatibilityLevel=1.4",
            "-dPrinted=false",
            "-dPDFSETTINGS=/printer",
            "-q",
            "-o",
            out_file,
            in_file,
        ]
        try:
            code_return = subprocess.run(cmd, check=False).returncode
        except OSError:
            # gs could not be started: report it like any other conversion failure
            code_return = -1

        if code_return != 0:  # A non-zero return code means the conversion failed
            self._log(article_pid + " , PDF/A\n", "archive_error.log")

            raise RuntimeError(f"Le fichier {in_file} n'a pas pu être converti en PDFA")

        self._log(f"Write {out_file}\n")

    def backup_pdfa_if_necessary(self, colid, article_pids):
        """
        Since it is time consuming to create a PDF/A, check if they have to be re-created
        before deleting the /mathdoc_archive issue folder.

        If not, backup the PDF/A in a temp folder. They will be put back in /mathdoc_archive
        in create_or_restore_pdfa()

        To check, we compare the date of the article PDF between
        - /mathdoc_archive and
        - self.binary_files_folder (/mersenne_prod_data or /numdam_data)

        :return: dict mapping each article pid to True if its PDF/A has to be
            (re)created, False if it was backed up
        """

        pdfas = {}
        tmp_folder = os.path.join(settings.LOG_DIR, "tmp/archive", colid, self.pid)
        resolver.create_folder(tmp_folder)
        self._log(f"Create {tmp_folder}\n")

        # Before deleting the issue folder, check if we have to recreate the PDF/A (save them to a tmp folder)
        # If the PDF has been updated in /mersenne_prod_data, we recreate the PDF/A
        for article_pid in article_pids:
            pdf_in_archive = resolver.get_disk_location(
                self.export_folder, colid, "pdf", self.pid, article_pid
            )
            pdf_in_prod = resolver.get_disk_location(
                self.binary_files_folder, colid, "pdf", self.pid, article_pid
            )
            pdfa_in_archive = os.path.join(
                self.export_folder, colid, self.pid, article_pid, article_pid + "_PDFA.pdf"
            )

            do_create_pdfa = True

            if os.path.isfile(pdf_in_prod) and os.path.isfile(pdf_in_archive):
                # NOTE(review): the comparison has a 1-day granularity; a PDF updated
                # twice the same day is not detected - confirm this is intended.
                date_pdf_in_prod = datetime.datetime.fromtimestamp(
                    os.stat(pdf_in_prod).st_mtime
                ).strftime("%Y-%m-%d")
                date_pdf_in_archive = datetime.datetime.fromtimestamp(
                    os.stat(pdf_in_archive).st_mtime
                ).strftime("%Y-%m-%d")

                do_create_pdfa = (
                    not os.path.isfile(pdfa_in_archive) or date_pdf_in_prod != date_pdf_in_archive
                )

            pdfas[article_pid] = do_create_pdfa
            if not do_create_pdfa:
                # Copy the PDF/A in the temp folder
                dest_pdfa = os.path.join(tmp_folder, article_pid + "_PDFA.pdf")
                resolver.copy_file(pdfa_in_archive, dest_pdfa)
                self._log(f"Backup {dest_pdfa}\n")

        return pdfas

    def create_or_restore_pdfa(self, colid, article_pids, pdfas):
        """
        Create the PDF/A of each article, or restore the one saved by
        backup_pdfa_if_necessary(), then delete the temp backup folder.

        :param pdfas: dict returned by backup_pdfa_if_necessary(); an article without
            an entry gets its PDF/A created
        """
        tmp_folder = os.path.join(settings.LOG_DIR, "tmp/archive", colid, self.pid)

        for article_pid in article_pids:
            if pdfas.get(article_pid, True):
                self.create_pdfa(colid, article_pid)
            else:
                src_pdfa = os.path.join(tmp_folder, article_pid + "_PDFA.pdf")
                dest_pdfa = os.path.join(
                    self.export_folder, colid, self.pid, article_pid, article_pid + "_PDFA.pdf"
                )
                resolver.copy_file(src_pdfa, dest_pdfa)
                self._log(f"Restore {dest_pdfa}\n")

        if os.path.isdir(tmp_folder):
            shutil.rmtree(tmp_folder)
            self._log(f"Delete {tmp_folder}\n")

    def archive_files(self, colid, container, articles, article_pids, pdfas):
        """
        Archive everything but the issue XML: binary files, extra data, LaTeX
        sources, Numdam digitisation sources and PDF/A files.
        """
        # II. Copy binary files (PDF...)
        for a in articles:
            article_folder = a.get_relative_folder()
            self._log(f"Delete {article_folder}\n")

            resolver.delete_object_folder(article_folder, to_folder=self.export_folder)
            resolver.copy_binary_files(a, self.binary_files_folder, self.export_folder)

        params = {"pid": self.pid, "export_folder": self.export_folder, "export_all": False}
        exportExtraDataPtfCmd(params).do()

        tex_src_folder = resolver.get_cedram_issue_tex_folder(colid, self.pid)
        tex_folders, _ = resolver.get_cedram_tex_folders(colid, self.pid)

        # III. Articles written in LaTeX. We need to archive files needed to re-compile the LaTex source code
        if len(tex_folders) > 0:
            # tex_folders is indexed by the position of the article among the published
            # articles, so the index advances even for articles that are skipped
            published_articles = container.article_set.exclude(do_not_publish=True)
            for i, article in enumerate(published_articles):
                if self.article is None or self.article.pid == article.pid:
                    self.archive_tex_src(article, colid, tex_src_folder, tex_folders[i])

        # IV. Digitized papers (Numdam). We basically need to archive the images (*.tif)
        copy_numdam_src_files(colid, self.pid, article_pids, self.export_folder)

        # V. Create PDF/A
        if not self.skip_pdfa:
            self.create_or_restore_pdfa(colid, article_pids, pdfas)

    def internal_do(self):
        """Archive the whole issue, or only ``self.article`` when it has been set."""
        super().internal_do()

        if self.article is None:
            container = model_helpers.get_container(self.pid, prefetch=False)
            qs = container.article_set.all()
            article_pids = list(qs.values_list("pid", flat=True))
            articles = qs
        else:
            container = self.article.my_container
            article_pids = [self.article.pid]
            articles = [self.article]

        colid = container.get_top_collection().pid
        self.pid = container.pid

        pdfas = {}
        if not self.xml_only and not self.skip_pdfa:
            # Backup PDF/A before deleting the issue folder
            # (it is time consuming to create a PDF/A, we will check if they have to be re-created)
            pdfas = self.backup_pdfa_if_necessary(colid, article_pids)

        # Delete the issue folder if we archive an issue
        if self.article is None:
            issue_folder = container.get_relative_folder()
            self._log(f"Delete {self.export_folder}/{issue_folder}\n")

            resolver.delete_object_folder(issue_folder, to_folder=self.export_folder)

        # I. Always archive the issue XML, even if we archive only 1 article
        exportPtfCmd(
            {
                "pid": self.pid,
                "export_folder": self.export_folder,
                # binary files for 1 article are copied below
                "with_binary_files": self.article is None,
                "for_archive": True,
                "binary_files_folder": self.binary_files_folder,
            }
        ).do()

        if not self.xml_only:
            self.archive_files(colid, container, articles, article_pids, pdfas)

405 

406 

def archive_numdam_xml(colid, pid, export_folder):
    """
    Fetch the XML of a collection or an issue from Numdam and store it in the archive.

    The XML is requested from ``settings.NUMDAM_URL`` for the issue ``pid``, or for
    the collection ``colid`` when ``pid`` is None. An empty response body is not
    written to disk.

    :raises requests.HTTPError: if Numdam answers with an error status
    """
    item_id = colid if pid is None else pid

    reply = requests.get(settings.NUMDAM_URL + "/api-item-xml/" + item_id)
    reply.raise_for_status()

    xml_body = reply.content.decode("utf-8")
    if not xml_body:
        return

    xml_path = resolver.get_archive_filename(export_folder, colid, pid, "xml", True)
    with open(xml_path, "w", encoding="utf-8") as xml_file:
        xml_file.write(xml_body)

426 

427 

def get_numdam_issues_list(colid):
    """
    Ask numdam.org for the issues of a collection.

    :return: the value of the "issues" key of the JSON response
    :raises requests.HTTPError: if Numdam answers with an error status
    """
    issues_url = f"{settings.NUMDAM_URL}/api-issues/{colid}"

    reply = requests.get(issues_url)
    reply.raise_for_status()

    payload = reply.json()
    return payload["issues"]

436 

437 

def get_numdam_file_list(colid, pid):
    """
    Ask numdam.org for the files to archive (of a collection or an issue)
    The files are those visible by the user, like PDF or DjVus

    The issue ``pid`` is queried, or the collection ``colid`` when ``pid`` is None.

    :return: the decoded JSON response
    :raises requests.HTTPError: if Numdam answers with an error status
    """
    item_id = pid if pid is not None else colid

    reply = requests.get(settings.NUMDAM_URL + "/api-item-file-list/" + item_id)
    reply.raise_for_status()

    return reply.json()

453 

454 

def _copy_numdam_src_folder(src_folder, dest_folder, xml_basename, log_file=None):
    """Copy ``<xml_basename>.xml`` (if present) and the tif/jpg files of src_folder to dest_folder."""
    if log_file:
        log_file.write("Create " + dest_folder + "...")

    resolver.create_folder(dest_folder)

    if log_file:
        log_file.write("done\n")

    # a. XML file
    full_src_file = os.path.join(src_folder, xml_basename + ".xml")
    if os.path.isfile(full_src_file):
        full_dest_file = os.path.join(dest_folder, xml_basename + ".xml")

        if log_file:
            log_file.write(f"Copy {os.path.basename(full_src_file)} ...")

        resolver.copy_file(full_src_file, full_dest_file)

        if log_file:
            log_file.write("done\n")

    # b. tif, jpg files
    img_files = [
        os.path.join(src_folder, f)
        for f in os.listdir(src_folder)
        if os.path.isfile(os.path.join(src_folder, f)) and f.endswith((".tif", ".jpg"))
    ]

    for img_file in img_files:
        if log_file:
            log_file.write(f"Copy {os.path.basename(img_file)} ...")

        resolver.copy_file(img_file, dest_folder)

        if log_file:
            log_file.write("done\n")


def copy_numdam_src_files(colid, pid, article_pids, export_folder, log_file=None):
    """
    Archive the digitisation source files (XML, tif, jpg) of an issue and its articles.

    Files are copied to the ``src/digitisation/`` sub-folder of the issue (resp.
    article) folder inside ``export_folder``. Nothing is done when the issue has no
    source folder; an article without a source folder is skipped.

    :param colid: pid of the collection
    :param pid: pid of the issue
    :param article_pids: pids of the articles to handle
    :param export_folder: root folder of the archive
    :param log_file: optional writable file object receiving progress messages
    """
    src_folder = os.path.join(settings.NUMDAM_ISSUE_SRC_FOLDER, colid, pid)

    if not os.path.isdir(src_folder):
        return

    # 1. Files related to the issue (issue.xml, tif, jpg)
    dest_folder = os.path.join(
        export_folder, resolver.get_relative_folder(colid, pid), "src/digitisation/"
    )
    _copy_numdam_src_folder(src_folder, dest_folder, pid, log_file)

    # 2. Files related to articles (article.xml with the full text, tif, jpg)
    for article_pid in article_pids:
        src_folder = os.path.join(settings.NUMDAM_ARTICLE_SRC_FOLDER, colid, pid, article_pid)

        # Same guard as for the issue: listing a missing folder would raise
        if not os.path.isdir(src_folder):
            continue

        dest_folder = os.path.join(
            export_folder,
            resolver.get_relative_folder(colid, pid, article_pid),
            "src/digitisation/",
        )
        _copy_numdam_src_folder(src_folder, dest_folder, article_pid, log_file)

555 

556 # PDF/DJVU without headers 

557 # Olivier 09/05/2019: these files are almost identicals to the final PDFs 

558 # There is no need to archive them. Just use the final PDF and remove the first page 

559 # if needed 

560 

561 # # 2c. pdf files (without header) 

562 # full_src_file = os.path.join(src_folder, article_pid + '.pdf') 

563 # if os.path.isfile(full_src_file): 

564 # full_dest_file = os.path.join(dest_folder, article_pid + '.pdf') 

565 # if file: 

566 # file.write("Copy {} ...".format(os.path.basename(full_src_file))) 

567 # copy_file(full_src_file, full_dest_file) 

568 # if file: 

569 # file.write("done\n") 

570 # 

571 

572 

573# def copy_numdam_djvu(colid, pid, article_pids, export_folder, log_file=None): 

574# """ 

575# Djvu might not be visible/listed in centre Mersenne articles, but might exist in Numdam 

576# """ 

577# 

578# if hasattr(settings, "NUMDAM_DATA_ROOT"): 

579# for article_pid in article_pids: 

580# article_folder = resolver.get_relative_folder(colid, pid, article_pid) 

581# full_src_file = os.path.join( 

582# settings.NUMDAM_DATA_ROOT, article_folder, article_pid + ".djvu" 

583# ) 

584# if os.path.isfile(full_src_file): 

585# full_dest_file = os.path.join(export_folder, article_folder, article_pid + ".djvu") 

586# if log_file: 

587# log_file.write(f"Copy {os.path.basename(full_src_file)} ...") 

588# resolver.copy_file(full_src_file, full_dest_file) 

589# if log_file: 

590# log_file.write("done\n") 

591 

592 

class archiveNumdamResourcePtfCmd(archiveIssuePtfCmd):
    """
    Archive a Container or a Collection (just the collection level) stored in Numdam
    """

    def __init__(self, params=None):
        # Defined before calling the base constructor, which only sets its own
        # export_folder / binary_files_folder when they are not already defined
        self.colid = None  # self.pid from the base class is the id of the container
        self.export_folder = settings.MATHDOC_ARCHIVE_FOLDER
        self.binary_files_folder = settings.NUMDAM_DATA_ROOT

        super().__init__(params)

        self.required_params.append("colid")
        # self.pid is optional when you want to archive a Collection
        self.required_params = [name for name in self.required_params if name != "pid"]

    def internal_do(self):
        """
        Archive files of Numdam.
        - Send http requests to numdam.org to get the list of user files (PDF/DjVu/XML) to preserve
        - Copy these files to self.export_folder (/mathdoc_archive)
        - Copy src files (mainly digitized TIF/JPG files) from /numdam_dev

        Warning: this class does not operate with Resource objects stored in the database (Collection, Container...)
                 since the data comes from numdam.org
                 Information is only based on pids (collection, issue, article)
        """

        # 1. Get the list of user files to archive (files visible by the user, like PDF or DjVus) from numdam.org
        data = get_numdam_file_list(self.colid, self.pid)
        pdfas = {}
        article_pids = []

        # 2. Prepare the backup of an Issue (backup, delete previous folder)
        if self.pid is not None:  # Archive an issue
            article_pids = [entry["pid"] for entry in data.get("articles", [])]

            # Backup PDF/A before deleting the issue folder
            # (it is time consuming to create a PDF/A, we will check if they have to be re-created)
            pdfas = self.backup_pdfa_if_necessary(self.colid, article_pids)

            issue_folder = resolver.get_relative_folder(self.colid, self.pid)

            log_path = os.path.join(settings.LOG_DIR, "archive.log")
            with open(log_path, "a", encoding="utf-8") as log:
                log.write(f"Delete {self.export_folder}/{issue_folder}\n")

            # Delete the issue folder
            resolver.delete_object_folder(issue_folder, self.export_folder)

        # 3. Archive the JATS XML of the pid
        # TODO: The XML coming from Numdam does not list PDF/A in the <self-uri> of the articles, only PDF/DjVu
        #       create_or_restore_pdfa is going to add the PDF/A on disk
        #       We should modify the xml to add the PDF/A
        archive_numdam_xml(self.colid, self.pid, self.export_folder)

        # 4. Archive the user files (list gotten in 1.)
        src_root = self.binary_files_folder
        dest_root = self.export_folder

        if "files" in data:
            # Files of a Collection or a Container
            resolver.copy_binary_files(None, src_root, dest_root, data["files"])

        if "articles" in data:
            # In case of a Container, files of each article
            for article_data in data["articles"]:
                resolver.copy_binary_files(None, src_root, dest_root, article_data["files"])

        if self.pid is None:
            return

        # 5. Archive the src files (tiff, pdf/djvu without headers,...)
        copy_numdam_src_files(self.colid, self.pid, article_pids, self.export_folder)

        # 6. Create PDF/A
        self.create_or_restore_pdfa(self.colid, article_pids, pdfas)