Coverage for src/ptf/cmds/xml_cmds.py: 51%

1259 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-11-05 09:56 +0000

1import copy 

2import datetime 

3import os.path 

4import subprocess 

5import sys 

6import traceback 

7 

8from lxml import ElementInclude 

9from lxml import etree 

10 

11from django.conf import settings 

12from django.db import transaction 

13from django.db.models import Prefetch 

14from django.utils import timezone 

15 

16from ptf import exceptions 

17from ptf import model_data 

18from ptf import model_data_comparator 

19from ptf import model_data_converter 

20from ptf import model_helpers 

21from ptf import tex 

22from ptf import utils 

23from ptf.cmds import ptf_cmds 

24from ptf.cmds import solr_cmds 

25from ptf.cmds.base_cmds import baseCmd 

26from ptf.cmds.xml import xml_utils 

27from ptf.cmds.xml.cedrics import cedrics_parser 

28 

29# KEEP THIS UNUSED IMPORT THEY ARE USED 

30from ptf.cmds.xml.jats import jats_parser 

31from ptf.cmds.xml.jats import xmldata as xmldata_jats 

32from ptf.cmds.xml.xml_utils import normalize 

33from ptf.display import resolver 

34 

35# from ptf.models import Resource 

36from ptf.models import Article 

37from ptf.models import Collection 

38from ptf.models import Container 

39from ptf.models import Person 

40from ptf.models import PtfSite 

41from ptf.models import backup_obj_not_in_metadata 

42from ptf.models import backup_translation 

43from ptf.models import restore_obj_not_in_metadata 

44from ptf.models import restore_translation 

45 

46 

def find_file(name):
    """Search settings.MANAGER_XSLT_DIRS recursively for *name*.

    Returns the full path of the first match, or None when the file
    is not found under any of the configured directories.
    """
    for base_dir in settings.MANAGER_XSLT_DIRS:
        for dirpath, _, filenames in os.walk(base_dir):
            if name in filenames:
                return os.path.join(dirpath, name)
    return None

54 

55 

def get_transform(name):
    """Locate "<name>.xsl" in the manager XSLT dirs and return it as an lxml XSLT transform."""
    xsl_path = find_file(f"{name}.xsl")
    return etree.XSLT(etree.parse(xsl_path))

60 

61 

class addXmlCmd(baseCmd):
    """
    addXmlCmd: base class for commands that take an XML as input
    The XML is passed with the body param

    from_folder / to_folder: location of binary files to copy

    Example with a file:
        f = open('journal.xml')
        body = f.read()
        f.close()
        cmd = add...XmlCmd( { "body":body } )

    Exception raised:
        - ValueError if the init params are empty
    """

    use_body = True  # when True, "body" (the XML text) becomes a required param
    body: str | None = None
    tree = None  # lxml root element, built from self.body in pre_do()
    solr_commit_at_the_end = True
    xml_filename_in_log = None
    remove_blank_text = False
    xml_file_folder = None  # base folder used to resolve XIncludes

    def __init__(self, params=None):
        super().__init__(params)

        if self.use_body:
            self.required_params.extend(["body"])

    def get_logname(self):
        """Return a unique, unused log file name "<today>-<Class>-<i>.xml" in settings.LOG_DIR.

        Returns "" when LOG_DIR is not configured.
        """
        filename = ""

        if hasattr(settings, "LOG_DIR"):
            i = 0
            today = datetime.date.today()
            basename = str(today) + "-" + self.__class__.__name__ + "-"
            filename = os.path.join(settings.LOG_DIR, basename + str(i) + ".xml")

            # Bump the index until we find a name not already used on disk
            while os.path.isfile(filename):
                i += 1
                filename = os.path.join(settings.LOG_DIR, basename + str(i) + ".xml")

        return filename

    def pre_do(self):
        """Parse self.body into self.tree, then archive the XML body in the log folder.

        Raises ValueError if the parse yields no tree.
        """
        super().pre_do()

        if self.use_body:
            # The Cedrics -> JATS XSLT transform manually adds space=preserve around
            # the nodes with mixed-content, but leaves the text unchanged.
            # As such, parsing the Cedrics XML cannot be done with remove_blank_text=True
            # Or the spaces will be removed whereas the JATS XML will keep them.
            # We still need the remove_blank_text=True for JATS XML for all the other nodes
            parser = etree.XMLParser(
                huge_tree=True,
                recover=True,
                remove_blank_text=self.remove_blank_text,
                remove_comments=True,
                resolve_entities=True,
            )
            if self.xml_file_folder is not None:
                if self.xml_file_folder[-1] != "/":
                    self.xml_file_folder += "/"
                # Strip the xlink namespace so ElementInclude can find the href
                self.body = self.body.replace(
                    'xmlns:xlink="http://www.w3.org/1999/xlink"', ""
                ).replace("xlink:href", "href")
            tree = etree.fromstring(self.body.encode("utf-8"), parser=parser)

            if self.xml_file_folder is not None:
                ElementInclude.include(tree, base_url=self.xml_file_folder)
            self.tree = tree

            if self.tree is None:
                raise ValueError("tree est vide")

        # Write the xml body on disk
        if hasattr(settings, "LOG_DIR") and self.body and self.use_body:
            self.xml_filename_in_log = self.get_logname()

            with open(self.xml_filename_in_log, "w", encoding="utf-8") as file_:
                file_.write(self.body)

    @transaction.atomic
    def do(self, parent=None):
        """Run the command inside a DB transaction.

        On failure: rollback Solr, drop the sub-commands (so undo is a no-op),
        log the failure with its traceback in LOG_DIR/cmds.log, and re-raise.
        On success: commit Solr (unless solr_commit_at_the_end is False).
        """
        try:
            obj = super().do(parent)
        except Exception:
            ptf_cmds.do_solr_rollback()

            # Empty sub_cmds to ignore undo
            self.cmds = []

            # Write the xml body on disk
            if hasattr(settings, "LOG_DIR") and self.body and self.use_body:
                with open(
                    os.path.join(settings.LOG_DIR, "cmds.log"), "a", encoding="utf-8"
                ) as file_:
                    file_.write("----------------------\n")

                    if self.xml_filename_in_log is None:
                        self.xml_filename_in_log = self.get_logname()

                    file_.write(self.xml_filename_in_log + " : FAILED\n")
                    exc_type, exc_value, exc_traceback = sys.exc_info()
                    lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
                    for line in lines:
                        file_.write(line + "\n")
                    file_.write("----------------------\n")

            # Bare raise keeps the original traceback intact
            raise

        if self.solr_commit_at_the_end:
            ptf_cmds.do_solr_commit()

        return obj

    def post_undo(self):
        super().post_undo()

        # Remove the Person objects no longer referenced
        Person.objects.clean()

    def post_do(self, resource=None):
        """Log the pids of the created resource(s) and archive the XML next to them."""
        super().post_do(resource)

        # Remove the Person objects no longer referenced
        Person.objects.clean()

        if hasattr(settings, "LOG_DIR") and resource and self.use_body:
            today = datetime.date.today()
            basename = str(today) + "-" + self.__class__.__name__

            # resource may be a single object or a list: build "pid1, pid2, ..."
            pids = ""
            first = True
            if isinstance(resource, list):
                for resource_item in resource:
                    if first:
                        first = False
                    else:
                        pids += ", "

                    pids += resource_item.pid
            else:
                pids = resource.pid

            with open(os.path.join(settings.LOG_DIR, "cmds.log"), "a", encoding="utf-8") as file_:
                file_.write(basename + " : " + pids + "\n")

            if hasattr(resource, "my_collection") and resource.my_collection:
                # Keep a copy of the XML under LOG_DIR/<top collection pid>/<pid>/
                folder = os.path.join(
                    settings.LOG_DIR, resource.get_top_collection().pid, resource.pid
                )
                filename = os.path.join(folder, resource.pid + ".xml")
                resolver.create_folder(folder)
                with open(filename, "w", encoding="utf-8") as file_:
                    file_.write(self.body)

    def undo(self):
        super().undo()

        if self.solr_commit_at_the_end:
            ptf_cmds.do_solr_commit()

    def add_objects_with_location(self, xobjs, resource, cmd_type):
        """Attach ExtLink/RelatedObject/SupplementaryMaterial/DataStream objects to *resource*.

        xobjs are dicts with base/rel/location/mimetype (+ optional metadata/text/caption).
        XML files (mimetype "application/xml") are skipped, and do not consume a seq number.
        May raise ResourceExists if the same ExtLink/RelatedObject is added twice.
        """
        seq = 1

        for xobj in xobjs:
            base = None

            if xobj["base"]:
                base_name = xobj["base"]
                base = model_helpers.get_xmlbase(base_name)
                if base is None:
                    cmd = ptf_cmds.addXmlBasePtfCmd({"base": xobj["base"], "solr_commit": False})
                    base = cmd.do(self)

            rel = xobj["rel"]
            location = xobj["location"]

            params = {
                "rel": rel,
                "mimetype": xobj.get("mimetype", ""),
                "location": location,
                "seq": seq,
                "solr_commit": False,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
            }

            # Ignore XML file
            if params["mimetype"] != "application/xml":
                if "metadata" in xobj:
                    params["metadata"] = xobj["metadata"]

                if "text" in xobj:
                    params["text"] = xobj["text"]

                # TODO: cmd factory ?
                cmd = None
                if cmd_type == "ExtLink":
                    cmd = ptf_cmds.addExtLinkPtfCmd(params)
                elif cmd_type == "RelatedObject":
                    cmd = ptf_cmds.addRelatedObjectPtfCmd(params)
                elif cmd_type == "SupplementaryMaterial":
                    params["caption"] = xobj.get("caption", "")
                    params["supplementary_material"] = True
                    cmd = ptf_cmds.addSupplementaryMaterialPtfCmd(params)
                elif cmd_type == "DataStream":
                    cmd = ptf_cmds.addDataStreamPtfCmd(params)

                # Always try to add an ExtLink or a RelatedObject
                # May raise ResourceExists if the ExtLink/RelatedObject is added twice

                if cmd is not None:
                    cmd.set_base(base)
                    cmd.set_resource(resource)

                    cmd.do(self)

                seq += 1

    @staticmethod
    def remove_publisher(publisher):
        """Delete a publisher by undoing an addPublisherPtfCmd bound to it."""
        cmd = ptf_cmds.addPublisherPtfCmd()
        cmd.set_object_to_be_deleted(publisher)
        cmd.undo()

    # Update the published years of a collection (journal/acta/book-series...)
    @staticmethod
    def update_collection_years(pid, container, save=True):
        """Widen the collection's [fyear, lyear] range to include the container's year."""
        collection = Collection.objects.get(pid=pid)
        if container.year:
            year = container.year
            fyear, lyear = model_helpers.get_first_last_years(year)
            fyear = int(fyear)
            lyear = int(lyear)

            # Check for an unset fyear/lyear BEFORE comparing:
            # "fyear < collection.fyear" raises TypeError when collection.fyear is None
            if not collection.fyear or fyear < collection.fyear:
                collection.fyear = fyear

            if not collection.lyear or lyear > collection.lyear:
                collection.lyear = lyear

            if save:
                collection.save()

330 

class addCollectionsXmlCmd(addXmlCmd):
    """
    addCollectionsXmlCmd: adds/remove a collection

    TODO: merge Collection and Journal ?

    Exception raised:
        - exceptions.ResourceExists during do
            if the Collection already exists
            if the collection defines the same extlink/relatedobject multiple times
        - exceptions.ResourceDoesNotExist
            during undo if the Collection does not exist
            during do of the provider does not exist
              <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError during undo if resources are still published
    """

    provider = None  # fallback provider, used when the XML does not name one
    xml_format = None

    def set_provider(self, provider):
        self.provider = provider

    def add_collection(self, xcol, update=False):
        """Create (or, with update=True, update) a Collection from the parsed *xcol*.

        Returns the Collection, or None when xcol is empty.
        Raises exceptions.ResourceExists if the collection exists and update is False.
        """
        if not xcol:
            return None

        # Prefer the provider declared in the XML over the command-level one
        if xcol.provider:
            provider = model_helpers.get_provider_by_name(xcol.provider)
        else:
            provider = self.provider

        col_id = xcol.pid
        collection = model_helpers.get_collection(col_id)

        existing = False

        if collection is not None:
            existing = True
            if not update:
                raise exceptions.ResourceExists(f"Collection {collection.pid} already exists")

        # Create a collection
        params = {
            "xobj": xcol,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        cls = ptf_cmds.addCollectionPtfCmd
        if update and existing:
            cls = ptf_cmds.updateCollectionPtfCmd

        cmd = cls(params)
        cmd.set_provider(provider)
        collection = cmd.do(self)

        # Attach the collection's external links (website, logo...)
        self.add_objects_with_location(xcol.ext_links, collection, "ExtLink")

        # if publisher:
        #     model_helpers.publish_resource(publisher, journal)

        return collection

    def internal_do(self):
        """Parse self.tree and create one Collection per <publication-meta> child node.

        Only <publication-meta> is supported: <collection-meta> and <journal-meta>
        raise ValueError (journals are handled while parsing a <journal-issue>).
        Returns the list of created Collections.
        """
        super().internal_do()

        collections = []

        if self.tree.tag == "journal-meta":
            raise ValueError(
                "Creation of a journal on the fly from an article is not yet supported"
            )
            # # Code used when a journal is created on the fly while parsing an article (GDML - OAI)
            # # TODO 1 : Refactor all the JATS parsers (eudml/bdim/dmlcz/....)
            # #          to be compatible with jats_parser.py
            # # TODO 2 : Prevent the creation of the collection on the fly ?
            # #          Shouldn't the collection be monitored/controlled ?
            # xmldata = globals()[self.xml_format]
            # xcol = xmldata.Journal(self.tree)
            # collection = self.add_collection(xcol, update=True)
            # collections.append(collection)
        else:
            for node in self.tree:
                xcol = None
                if node.tag == "collection-meta":
                    raise ValueError("Collection can only be created from <publication-meta>")
                    # xcol = jats_parser.BitsCollection(tree=node)
                elif node.tag == "journal-meta":
                    raise ValueError(
                        "Collection can only be created from <publication-meta>, <journal-meta> are handled while parsing a <journal-issue>"
                    )
                    # xcol = jats_parser.JatsJournal(tree=node)
                elif node.tag == "publication-meta":
                    xcol = jats_parser.MathdocPublication(tree=node)

                collection = self.add_collection(xcol)
                collections.append(collection)

        return collections

432 

433 

class addIssueXmlCmd(addXmlCmd):
    """
    addIssueXmlCmd: adds/remove an issue

    from_folder / to_folder (declared in addXmlCmd): location of binary files to copy

    extra_folder: folder where extra data (extid false_positive...) are stored in a json
    It is used
        - when you call addIssueXmlCmd directly to import from an archive,
        - when you call addOrUpdateIssueXmlCmd and we need to restore extra data after the import

    Exception raised:
        - exceptions.ResourceExists during do if the issue already exists
        - exceptions.ResourceDoesNotExist
            during undo if the Issue does not exist
            during do if the serial/provider does not exist
              <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError during undo if resources are still published
    """

    assign_doi = False
    full_text_folder = ""
    extra_folder = None
    prod_deployed_date_iso_8601_date_str = None
    xissue = None  # pre-parsed issue; when None, self.tree is parsed in internal_do
    count = 0
    no_bib = False  # Ignore the references during the import (used in Geodesic)
    embargo = False  # Import only the open articles (used in Geodesic)

    def create_child_collection(self, xjournal, journal):
        """Create a child collection of *journal* for this xjournal, pid "<pid>-<issn>"."""
        issn = xjournal.issn if xjournal.issn else xjournal.e_issn

        new_xjournal = copy.deepcopy(xjournal)
        new_xjournal.wall = 0
        new_xjournal.pid = f"{xjournal.pid}-{issn}"
        new_xjournal.coltype = journal.coltype

        params = {"xobj": new_xjournal}
        provider = model_helpers.get_provider_by_name("mathdoc")

        cmd = ptf_cmds.addCollectionPtfCmd(params)
        cmd.set_parent(journal)
        cmd.set_provider(provider)

        collection = cmd.do()
        # collection.parent = journal
        # journal = collection
        return collection

    def get_historic_collection(self, xjournal, journal):
        """Return the (child) collection matching xjournal's issn, creating it if needed.

        When settings.USE_META_COLLECTIONS is off, simply returns *journal*.
        """
        use_meta_collections = (
            settings.USE_META_COLLECTIONS if hasattr(settings, "USE_META_COLLECTIONS") else False
        )

        if not use_meta_collections:
            return journal

        # meta-collections are used : journal may be the top collection or one of its children

        value = id_type = None

        # Take care of special case of STNB :
        # For that, we ignore the issn of STNB 2nd series
        if xjournal.pid == "JTNB" and xjournal.issn == "0989-5558":
            xjournal.issn = None
            xjournal.e_issn = None
            xjournal.ids = []
        else:
            if xjournal.issn:
                value = xjournal.issn
                id_type = "issn"
            elif xjournal.e_issn:
                value = xjournal.e_issn
                id_type = "e-issn"

        if value:
            # collection has at least one issn
            qs = Collection.objects.filter(resourceid__id_value=value, resourceid__id_type=id_type)
            if qs.exists():
                journal = qs.first()
            else:
                # xjournal does not exist yet.
                journal = self.create_child_collection(xjournal, journal)
        else:
            # collection has no issn
            possible_pids = [xjournal.pid, f"{xjournal.pid}-{value}"]
            qs = Collection.objects.exclude(resourceid__id_value__isnull=False).filter(
                pid__in=possible_pids
            )
            if qs.exists():
                journal = qs.first()
            else:
                journal = self.create_child_collection(xjournal, journal)

        return journal

    def internal_do(self):
        """Create the issue (Container) and its articles from the parsed XML.

        Returns the created issue, or None when the issue is skipped because
        of the embargo. Raises ResourceExists / ResourceDoesNotExist (see class doc).
        """
        super().internal_do()

        #######################################################################
        # get xissue

        if self.xissue:
            xissue = self.xissue
        else:
            xissue = jats_parser.JatsIssue(tree=self.tree, no_bib=self.no_bib)
            self.warnings.extend(xissue.warnings)

        #######################################################################
        # Check if there is an existing issue / journal

        issue_id = xissue.pid
        issue = model_helpers.get_container(issue_id)

        if issue is not None:
            raise exceptions.ResourceExists(f"Issue {issue_id} already exists")

        xjournal = xissue.journal
        journal_id = xjournal.pid
        journal = model_helpers.get_collection(journal_id)

        # Note: Why use <issue-meta><custom-meta-group><custom-meta> to find the provider and then the journal
        # as there is a <journal-meta> with an id ?
        # The ptf_resource table (Resource objects) are created with only 1 id.
        # When you add a journal, the journal id is the one of its
        # <custom-meta-group><custom-meta> provider.
        # If you want to find the journal of an issue based on the <journal-meta> information, you might
        # have to search among the other ids (ptf_resourceid table, ResourceId objects) : sql JOIN select
        # To avoid the join select, it's better to use <issue-meta><custom-meta-group><custom-meta> to make sure
        # we use the correct provider. A simple select in the ptf_resource table is then needed.
        if journal is None:
            raise exceptions.ResourceDoesNotExist(f"Journal {journal_id} does not exist")

        # Journal is the top collection (ex: AFST)
        # We want to get (or create) the journal that corresponds to the issue
        journal = self.get_historic_collection(xjournal, journal)

        if self.embargo and journal.wall > 0:
            # Geodesic is for open access articles.
            # We do not want to import the issues under embargo
            if resolver.embargo(journal.wall, xissue.year):
                print(f"Embargo, ignore {xissue.pid}")
                return None

        #######################################################################
        # Get provider/publisher

        provider_name = xissue.provider if xissue.provider else "mathdoc"
        provider = model_helpers.get_provider_by_name(provider_name)

        #######################################################################
        # Add the issue

        params = {
            "xobj": xissue,
            "pid": xissue.pid,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        cmd = ptf_cmds.addContainerPtfCmd(params)
        cmd.add_collection(journal)
        cmd.set_provider(provider)
        issue = cmd.do(self)

        self.add_objects_with_location(xissue.ext_links, issue, "ExtLink")
        self.add_objects_with_location(xissue.related_objects, issue, "RelatedObject")
        self.add_objects_with_location(xissue.streams, issue, "DataStream")

        #######################################################################
        # Add the issue's articles

        # JatsIssue is an iterator (has the __iter__ function)
        # you simply iterate the xissue to get its articles
        if xissue.ctype == "issue":
            for seq, xarticle in enumerate(xissue, start=1):
                params = {
                    "xarticle": xarticle,
                    "journal": journal,
                    "issue": issue,
                    "seq": seq,
                    "provider": provider,
                    "assign_doi": self.assign_doi,
                    "full_text_folder": self.full_text_folder,
                    "use_body": False,
                    "from_folder": self.from_folder,
                    "to_folder": self.to_folder,
                    "solr_commit_at_the_end": False,
                }
                cmd = addArticleXmlCmd(params)
                cmd.do(self)
        elif xissue.ctype == "issue_special":
            site = PtfSite.objects.get(id=settings.SITE_ID)
            issue.deploy(site)
            for seq, xresource in enumerate(xissue.articles, start=1):
                # NOTE: we could call the PtfCmd directly
                # and remove the xml cmd

                params = {
                    "use_body": False,
                    "xcontainer": issue,
                    "seq": seq,
                    # we only want to pass the resource_doi field to the function
                    "xresource": xresource,
                    "resource_doi": xresource.doi,
                }
                cmd = addResourceInSpecialIssueXmlCmd(params)
                cmd.do(self)

        # Update the top journal first year and last year
        self.update_collection_years(journal_id, issue)

        # The collection maybe updated with update_collection_years and the assign_doi param (col.last_doi)
        # Update issue before returning the object.
        # Note that refresh_from_db does not update ForeignKey fields, we can't simply call issue.refresh_from_db()
        issue.my_collection.refresh_from_db()

        # Used in post_do
        self._prod_deployed_date_iso_8601_date_str = xissue.prod_deployed_date_iso_8601_date_str

        return issue

    def post_do(self, resource=None):
        super().post_do(resource)

        # If the issue XML has a last-modified date, keep it; otherwise create one.
        if resource.last_modified is None:
            resource.last_modified = timezone.now()
            resource.save()

        # On ptf-tools, if the issue XML has a prod_deployed_date,
        # propagate it to the Articles/Issue.
        # A possible data restoration (with importExtraDataPtfCmd) may overwrite prod_deployed_date
        if self._prod_deployed_date_iso_8601_date_str and settings.SITE_NAME == "ptf_tools":
            prod_deployed_date = model_helpers.parse_date_str(
                self._prod_deployed_date_iso_8601_date_str
            )
            journal_site = model_helpers.get_site_mersenne(resource.my_collection.pid)
            if journal_site:
                model_helpers.update_deployed_date(resource, journal_site, prod_deployed_date)

        if self.extra_folder:
            # Restore the extra data (extid false_positive...) saved in a json
            ptf_cmds.importExtraDataPtfCmd(
                {"pid": resource.pid, "import_folder": self.extra_folder}
            ).do()

680 

681 

class addResourceInSpecialIssueXmlCmd(addXmlCmd):
    """
    addResourceXmlCmd: adds/remove resource from special issue
    """

    xcontainer = None  # the special issue (required param)
    resource_doi = ""  # DOI of the resource to attach to the special issue
    xresource = None
    seq = 0  # position of the resource inside the special issue
    citation = ""
    provider = None

    def __init__(self, params=None):
        super().__init__(params)
        self.required_params.extend(["xcontainer"])

    def internal_do(self):
        """Attach the resource identified by self.resource_doi to the special issue.

        Returns the created ResourceInSpecialIssue.
        Raises ValueError if the resource is already present (temporary behaviour).
        """
        super().internal_do()
        # for later, check the type of the resource first
        resource_in_special_issue = model_helpers.get_resource_in_special_issue_by_doi(
            self.resource_doi
        )
        resource_doi = self.resource_doi

        # if self.xcontainer:
        container = model_helpers.get_container(self.xcontainer.pid)

        seq = self.seq
        # needs_to_restore_resource = False

        if resource_in_special_issue is not None:
            # temporary
            raise ValueError(
                "First step of developpement require to manually delete all resources in special issue"
            )
        # self.provider = self.xresource.provider
        # 2 is the id of ptf_tools. If we are not in ptf tools we are dealing with jats article which has no citation
        if settings.SITE_ID == 2:
            citation = self.xresource["citation"]
        else:
            citation = ""
        params = {
            # "xobj": self.xresource,
            "obj_doi": resource_doi,
            "container": container,
            "seq": seq,
            "citation": citation,
            # "provider": self.provider,
        }

        cmd = ptf_cmds.addResourceInSpecialIssuePtfCmd(params)
        resource_in_special_issue = cmd.do(self)

        return resource_in_special_issue

736 

737 

738class addArticleXmlCmd(addXmlCmd): 

739 """ 

740 addArticleXmlCmd: adds/remove an issue 

741 

742 Exception raised: 

743 - exceptions.ResourceExists during do if the article already exists 

744 - exceptions.ResourceDoesNotExist 

745 during undo if the Article does not exist 

746 during do if the serial/issue/provider does not exist 

747 <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value> 

748 """ 

749 

750 xarticle = None 

751 journal = None 

752 issue = None 

753 provider = None 

754 provider_col = None 

755 assign_doi = False 

756 full_text_folder = "" 

757 xml_format = "xmldata_jats" 

758 # restricted_mode is used by maxiDML. We do not try to import all the metadata, but only a subset 

759 restricted_mode = False 

760 # standalone is used to import isolated article, without issues 

761 standalone = False 

762 seq = ( 

763 0 # seq is used by the breadcrumbs. Generate it if it's not specified in the XML (ex: PCJ) 

764 ) 

765 keep_translations = False 

766 

    def set_collection(self, collection):
        # Import the article into an existing collection;
        # the provider is inherited from the collection.
        self.journal = collection
        self.provider = collection.provider

770 

    def set_xml_format(self, xml_format):
        # Name of the xmldata module to use (default "xmldata_jats")
        self.xml_format = xml_format

773 

    def set_provider(self, provider):
        # Provider used for the article itself
        self.provider = provider

776 

    def set_provider_col(self, provider_col):
        # Optional distinct provider for the collection (falls back to self.provider)
        self.provider_col = provider_col

779 

    def set_article_single_mode(self):
        """Parse self.tree as a standalone JATS article and collect the parser warnings."""
        self.xarticle = jats_parser.JatsArticle(tree=self.tree)
        self.warnings.extend(self.xarticle.warnings)

        # TODO: MaxiDML: allow the creation of an issue on the fly
        # if not self.provider:
        #     self.provider = model_helpers.get_provider_by_name(self.xarticle.provider)
        #
        # xmldata_jats.set_pid_type(self.provider.pid_type)
        #
        # bdy = etree.tostring(self.xarticle.journal.tree).decode("utf-8")
        # cmd = addCollectionsXmlCmd({'body': bdy,
        #                             'xml_format': self.xml_format,
        #                             'coltype': "journal"})
        # cmd.set_provider(self.provider_col if self.provider_col else self.provider)
        # self.journal = cmd.do()[0]
        #
        # self.issue = model_helpers.get_container(self.xarticle.issue_id)
        # if self.issue is None:
        #     # need to create the issue
        #     date = datetime.datetime.strptime(self.xarticle.date_published_iso_8601_date_str,
        #                                       '%Y-%m-%d')
        #     pid = "{name}_{year}".format(name=self.journal.pid, year=date.year)
        #     self.issue = model_helpers.get_container(pid)
        #     if self.issue is None:
        #         params = {'ctype': 'issue', 'year': date.year, 'pid': pid,
        #                   'last_modified_iso_8601_date_str': datetime.datetime.now().strftime(
        #                       "%Y-%m-%d %H:%M:%S"), 'volume': self.xarticle.volume,
        #                   # if copy binary, need from_folder / to_folder
        #                   }
        #
        #         cmd = ptf_cmds.addContainerPtfCmd(params)
        #         cmd.add_collection(self.journal)
        #         cmd.set_provider(self.provider)
        #         self.issue = cmd.do()

    def get_oai_identifier(self):
        # OAI identifier of the article, as extracted by the JATS parser
        return self.xarticle.oai_identifier

818 

    def update_xobj_with_body(self):
        """Fill self.xarticle.body (and body_xml) when the full text lives in a separate file.

        Two cases:
          - CEDRICS import: the full text comes from a separate file (PDF extraction
            for the CEDRAM TeX folder, otherwise a "<pid>.xml" next to full_text_folder)
          - articles with a "pii" attribute: read the body from the Numdam acquisition tree
        """
        # CEDRICS import: the full text comes from a separate file
        if self.full_text_folder and not self.xarticle.body:
            if self.full_text_folder == settings.CEDRAM_TEX_FOLDER:
                # Extract the text from the article's first PDF stream
                text = ""
                locs = [
                    stream["location"]
                    for stream in self.xarticle.streams
                    if stream["mimetype"] == "application/pdf"
                ]
                if locs:
                    full_pdf_location = os.path.join(self.full_text_folder, locs[0])
                    text = utils.pdf_to_text(full_pdf_location)
                self.xarticle.body = text
            else:
                # NOTE(review): plain string concatenation — full_text_folder is
                # presumably expected to end with "/"; confirm with callers
                full_text_file = self.full_text_folder + self.xarticle.pid + ".xml"

                with open(full_text_file, mode="rb") as file_:
                    body = file_.read()

                parser = etree.XMLParser(huge_tree=True, recover=True)
                tree = etree.fromstring(body, parser=parser)
                node = tree.find("body")
                self.xarticle.body = xml_utils.get_text_from_node(node)
                self.xarticle.body_xml = xml_utils.get_xml_from_text("body", self.xarticle.body)
        elif not self.xarticle.body_xml and hasattr(self.xarticle, "pii"):
            # HACK: hard-coded acquisition path (Numdam-specific)
            full_text_file = os.path.join(
                "/numdam_dev/acquisition/donnees_traitees",
                self.journal.pid,
                self.issue.pid,
                self.xarticle.pid,
                self.xarticle.pid + ".xml",
            )
            if os.path.isfile(full_text_file):
                with open(full_text_file, mode="rb") as file_:
                    body = file_.read()

                parser = etree.XMLParser(huge_tree=True, recover=True)
                tree = etree.fromstring(body, parser=parser)
                node = tree.find("body")
                self.xarticle.body = xml_utils.get_text_from_node(node)
                self.xarticle.body_xml = xml_utils.get_xml_from_text("body", self.xarticle.body)

861 

    def internal_do(self):
        """Import one article: delete any existing version, then re-create it.

        Returns the new Article. Raises exceptions.ResourceExists if the
        article already exists and neither update nor standalone mode is on.
        """
        super().internal_do()

        # No parsed article but a journal: fall back to single-article mode
        if self.xarticle is None and self.journal is not None:
            # self.restricted_mode = True
            self.set_article_single_mode()
            self.update = True
        else:
            self.update = False

        # Derive a pid from the DOI when the XML does not provide one
        if self.xarticle.pid is None:
            self.xarticle.pid = (
                self.xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_")
            )

        # Translations whose full text is a separate HTML file: read the file
        # from from_folder and store both the HTML and its plain text.
        for xtranslated_article in self.xarticle.translations:
            for xtream in xtranslated_article.streams:
                if xtream["mimetype"] == "text/html":
                    if self.from_folder is None:
                        raise ValueError(
                            "The article has its full text in a separate HTML file. You need to set from_folder"
                        )

                    location = os.path.join(self.from_folder, xtream["location"])
                    body_html = resolver.get_body(location)
                    body = xml_utils.get_text_from_xml_with_mathml(body_html)
                    xtranslated_article.body_html = body_html
                    xtranslated_article.body = body

        # Same treatment for the article's own HTML full-text stream
        for stream in self.xarticle.streams:
            if stream["mimetype"] == "text/html":
                location = os.path.join(self.from_folder, stream["location"])
                body_html = resolver.get_body(location)
                body = xml_utils.get_text_from_xml_with_mathml(body_html)
                self.xarticle.body_html = body_html
                self.xarticle.body = body

        # Look up an existing article, preferably by DOI
        if self.xarticle.doi:
            article = model_helpers.get_article_by_doi(self.xarticle.doi)
        else:
            article = model_helpers.get_article(self.xarticle.pid)
        needs_to_restore_article = False

        if article is not None:
            if self.update or self.standalone:
                if self.standalone:
                    self.provider = article.provider

                # Save the data that is not part of the XML metadata
                # (and optionally the translations) before deleting the article
                needs_to_restore_article = True
                backup_obj_not_in_metadata(article)

                if self.keep_translations:
                    backup_translation(article)

                cmd = ptf_cmds.addArticlePtfCmd(
                    {
                        "pid": article.pid,
                        "to_folder": self.to_folder,  # delete the files to be safe
                    }
                )
                cmd.set_object_to_be_deleted(article)
                cmd.undo()
            else:
                raise exceptions.ResourceExists(f"Article {self.xarticle.pid} already exists")

        # Override seq
        if self.standalone and article is not None:
            self.xarticle.seq = article.seq
        elif (
            not self.standalone and self.issue and int(self.xarticle.seq) == 0 and self.seq != 0
        ) or (hasattr(self, "pii") and self.seq != 0):
            self.xarticle.seq = self.seq

        # Get the article's text (body) for SolR if it is empty from the PDF
        self.update_xobj_with_body()

        params = {
            "xobj": self.xarticle,
            "pid": self.xarticle.pid,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            # Only assign a DOI when asked to and when the XML has none
            "assign_doi": self.assign_doi and not self.xarticle.doi,
            "solr_commit": False,
        }

        cmd = ptf_cmds.addArticlePtfCmd(params)
        if self.issue or not self.standalone:
            cmd.set_container(self.issue)
        cmd.add_collection(self.journal)
        article = cmd.do(self)

        # Attach the binary/linked resources declared in the XML
        self.add_objects_with_location(self.xarticle.ext_links, article, "ExtLink")
        self.add_objects_with_location(self.xarticle.streams, article, "DataStream")
        if not self.restricted_mode:
            self.add_objects_with_location(
                self.xarticle.supplementary_materials, article, "SupplementaryMaterial"
            )

        # Figures are only stored when the site displays bodies (or in ptf_tools)
        if (
            hasattr(settings, "SHOW_BODY") and settings.SHOW_BODY
        ) or settings.SITE_NAME == "ptf_tools":
            self.add_objects_with_location(self.xarticle.figures, article, "RelatedObject")

        # Attach the streams of each translated article created by the sub-command
        for xtrans_article, trans_article in zip(
            self.xarticle.translations, cmd.cmd.translated_articles
        ):
            self.add_objects_with_location(xtrans_article.streams, trans_article, "DataStream")

        # Restore the data backed up before the deletion above
        if needs_to_restore_article:
            restore_obj_not_in_metadata(article)

            if self.keep_translations:
                restore_translation(article)

        return article

977 

978 

class addTranslatedArticleXmlCmd(addXmlCmd):
    """
    addTranslatedArticleXmlCmd: adds/remove translations.
    The original article is not changed
    The current translations are first removed

    Attributes:
        lang: language code of the translation to add/replace
        html_file_name: HTML full text file (already copied on disk by upload/views)
        pdf_file_name: PDF file to declare as a datastream
        date_published_str: ISO-8601 date; when set, the PDF is assumed to exist already
    """

    lang = ""
    html_file_name = ""
    pdf_file_name = ""
    date_published_str = ""

    def internal_do(self):
        """Replace the translation in self.lang of the article found in the XML tree.

        Raises exceptions.ResourceDoesNotExist if the article is not in the database.
        """
        super().internal_do()

        xarticle = jats_parser.JatsArticle(tree=self.tree)
        article = model_helpers.get_article(xarticle.pid)

        if article is None:
            # Bug fix: report the parsed article's pid. self.xarticle is not set
            # by this command, so the previous f-string raised an AttributeError
            # instead of the intended exception.
            raise exceptions.ResourceDoesNotExist(f"Article {xarticle.pid} does not exist")

        # Merge existing article with new translation:
        # keep every translation except the one in self.lang, which is replaced below
        data_article = model_data_converter.db_to_article_data(article)
        new_translations = [
            translation
            for translation in data_article.translations
            if translation.lang != self.lang
        ]

        for xtrans_article in xarticle.translations:
            if xtrans_article.lang == self.lang:
                # Upload/views has copied the HTML file on disk
                # Add a DataStream.
                # TODO: check if the datastream is not already present
                if self.html_file_name:
                    data = model_data.create_datastream()
                    data["rel"] = "full-text"
                    data["mimetype"] = "text/html"
                    data["location"] = self.html_file_name
                    xtrans_article.streams.append(data)

                if self.pdf_file_name:
                    # Create a pdf file
                    # pdf-translate needs the article/sub-article XML
                    # Simply add a datastream for now
                    # The new Article created in Django will be complete
                    # But generate the PDF file at the end
                    data = model_data.create_datastream()
                    data["rel"] = "full-text"
                    data["mimetype"] = "application/pdf"
                    data["location"] = self.pdf_file_name
                    xtrans_article.streams.append(data)

                if self.date_published_str:
                    xtrans_article.date_published_iso_8601_date_str = self.date_published_str

                new_translations.append(xtrans_article)

        data_article.translations = new_translations

        # Re-import the merged article (standalone: keep the container untouched)
        cmd = addArticleXmlCmd(
            {
                "xarticle": data_article,
                "use_body": False,
                "issue": article.my_container,
                "standalone": True,
                "from_folder": self.from_folder,
            }
        )
        cmd.set_collection(article.get_collection())
        article = cmd.do()

        # pdf-translate needs the article/sub-article XML
        xml = ptf_cmds.exportPtfCmd(
            {
                "pid": article.pid,
                "with_body": False,
                "with_djvu": False,
                "article_standalone": True,
                "collection_pid": settings.COLLECTION_PID,
            }
        ).do()

        tex.create_translated_pdf(
            article,
            xml,
            self.lang,
            os.path.join(self.from_folder, self.pdf_file_name),
            os.path.join(self.from_folder, self.html_file_name),
            # If the date_published is specified, we assume that the PDF already exists
            skip_compilation=self.date_published_str != "",
        )

        return article

1073 

1074 

class addPCJArticleXmlCmd(addXmlCmd):
    """
    addPCJArticleXmlCmd:
    """

    html_file_name = ""

    def internal_do(self):
        """Parse the article XML and import it standalone into self.issue/self.collection."""
        super().internal_do()

        xarticle = jats_parser.JatsArticle(tree=self.tree)

        # Declare the HTML full text as a datastream when a file name was given
        if self.html_file_name:
            stream = model_data.create_datastream()
            stream["rel"] = "full-text"
            stream["mimetype"] = "text/html"
            stream["location"] = self.html_file_name
            xarticle.streams.append(stream)

        add_cmd = addArticleXmlCmd(
            {
                "xarticle": xarticle,
                "use_body": False,
                "issue": self.issue,
                "standalone": True,
                "from_folder": self.from_folder,
            }
        )
        add_cmd.set_collection(self.collection)
        return add_cmd.do()

1107 

1108 

class addBookXmlCmd(addXmlCmd):
    """
    addBookXmlCmd: adds/remove a book

    Exception raised:
        - exceptions.ResourceExists during do if the book already exists
        - exceptions.ResourceDoesNotExist
            during undo if the Book does not exist
            during do if the serial/provider does not exist
            <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError during undo if resources are still published
    """

    provider = None
    import_oai_mode = False
    journal = None
    xml_format = "xmldata_jats"
    xbook = None
    _collection = None
    no_bib = False  # Ignore the references during the import (used in Geodesic)

    def set_provider(self, provider):
        """Inject the provider instead of resolving it from the XML metadata."""
        self.provider = provider

    def add_parts(self, xparts, pseq):
        """Add a list of book parts, numbering them sequentially from 1."""
        if xparts:
            seq = 1
            for xpart in xparts:
                self.add_part(xpart, seq, pseq)
                seq += 1

    def add_part(self, xpart, seq, pseq):
        """Add one book part (stored as an Article) and recurse into its sub-parts.

        Raises exceptions.ResourceExists if the part is already in the database.
        """
        if xpart is None:
            return

        # An Article is used to store a book part in the database
        article = model_helpers.get_article(xpart.pid)

        if article is not None:
            raise exceptions.ResourceExists(f"BookPart {xpart.pid} already exists")

        params = {
            "xobj": xpart,
            "pid": xpart.pid,
            "seq": seq,
            "pseq": pseq,
            # "deployed": deployed,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        cmd = ptf_cmds.addBookPartPtfCmd(params)
        cmd.set_container(self.book)
        cmd.add_collection(self._collection)
        article = cmd.do(self)

        self.add_objects_with_location(xpart.ext_links, article, "ExtLink")
        self.add_objects_with_location(xpart.streams, article, "DataStream")

        self.add_parts(xpart.parts, seq)

    def set_import_oai_mode(self):
        """In OAI mode, an existing book is replaced instead of raising ResourceExists."""
        self.import_oai_mode = True

    def internal_do(self):
        """Parse (or reuse) the book metadata, delete any existing book, and import it."""
        super().internal_do()

        #######################################################################
        # Get xbook

        if self.import_oai_mode:
            xmldata = globals()[self.xml_format]
            xbook = xmldata.Book(self.tree)
            self.journal = model_helpers.get_collection("GDML_Books")

        else:
            if self.xbook:
                xbook = self.xbook
            else:
                xbook = jats_parser.BitsBook(tree=self.tree, no_bib=self.no_bib)
                self.warnings.extend(xbook.warnings)

        #######################################################################
        # Get existing book if any

        if not self.provider:
            provider = model_helpers.get_provider_by_name(xbook.provider)
            self.provider = provider

        book_id = xbook.pid
        book = model_helpers.get_container(book_id)

        #######################################################################
        # Delete any existing book

        if book is not None:
            if self.import_oai_mode:
                publisher = book.my_publisher

                # Note: the existing collection is not removed even if it no longer has a resource
                # TODO: urls/commands to add/update/delete a collection

                # Removes the book
                cmd = ptf_cmds.addContainerPtfCmd()
                cmd.set_object_to_be_deleted(book)
                cmd.undo()

                if publisher and publisher.publishes.count() == 0:
                    self.remove_publisher(publisher)
            else:
                raise exceptions.ResourceExists("Book %s already exists" % book_id)

        #######################################################################
        # Add new book

        if xbook.incollection:
            colid = xbook.incollection[0].pid
            self._collection = model_helpers.get_collection(colid)
            if self._collection is None:
                raise exceptions.ResourceDoesNotExist(f"The collection {colid} does not exist")
        elif self.import_oai_mode:
            self._collection = self.journal

        params = {
            "xobj": xbook,
            "pid": xbook.pid,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        cmd = ptf_cmds.addContainerPtfCmd(params)
        cmd.add_collection(self._collection)
        # Bug fix: use self.provider. The local `provider` is only bound when the
        # provider was not injected via set_provider(); using it unconditionally
        # raised a NameError in that case. self.provider is always valid here.
        cmd.set_provider(self.provider)

        book = cmd.do(self)
        self.book = book

        self.add_objects_with_location(xbook.ext_links, book, "ExtLink")
        self.add_objects_with_location(xbook.related_objects, book, "RelatedObject")
        self.add_objects_with_location(xbook.streams, book, "DataStream")

        # self.add_metadata_parts(xbook, book) TODO support Metadataparts ?

        #######################################################################
        # Add Book parts

        # JatsIssue is an iterator (has the __iter__ function)
        # TODO make JatsBook an iterator as well ?
        self.add_parts(xbook.parts, 0)

        # Update the collection first year and last year
        for incol in xbook.incollection:
            self.update_collection_years(incol.pid, book)

        return book

1266 

1267 

1268###################################################################################### 

1269###################################################################################### 

1270# 

1271# Update Commands 

1272# 

1273###################################################################################### 

1274###################################################################################### 

1275 

1276 

class updateCollectionsXmlCmd(addXmlCmd):
    """
    updateSerialsXmlCmd: updates one or more journals

    Exception raised:
        - exceptions.ResourceDoesNotExist during do if the Collection does not exist
        - RuntimeError if undo is called
    """

    @staticmethod
    def _xcol_from_node(node):
        """Parse a collection-like XML node into its data object, or None for unknown tags.

        The same dispatch was previously duplicated in both loops of internal_do.
        """
        if node.tag == "collection-meta":
            return jats_parser.BitsCollection(tree=node)
        if node.tag == "journal-meta":
            return jats_parser.JatsJournal(tree=node)
        if node.tag == "publication-meta":
            return jats_parser.MathdocPublication(tree=node)
        return None

    def update_collection(self, xcol, do_update=True):
        """Update one collection from its parsed metadata.

        When do_update is False, only checks that the collection exists.
        Raises exceptions.ResourceDoesNotExist for an unknown collection.
        Returns the (possibly updated) Collection, or None if xcol is falsy.
        """
        if not xcol:
            return None

        provider = model_helpers.get_provider_by_name(xcol.provider)

        col_id = xcol.pid
        col = model_helpers.get_collection(col_id)

        if col is None:
            raise exceptions.ResourceDoesNotExist("Collection %s does not exist" % xcol.pid)

        if do_update:
            params = {
                "xobj": xcol,
                "solr_commit": False,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
            }

            # The existing other_ids, abstracts are removed in updateCollectionDatabaseCmd::internal_do
            # and the new ones are added in the post_do (addResourceDatabaseCmd)

            cmd = ptf_cmds.updateCollectionPtfCmd(params)
            cmd.set_provider(provider)
            # cmd.set_publisher(publisher)
            col = cmd.do()

            # The existing extlinks are removed in updateCollectionDatabaseCmd::internal_do
            self.add_objects_with_location(xcol.ext_links, col, "ExtLink")
            resolver.copy_binary_files(col, self.from_folder, self.to_folder)

            # if publisher:
            #     model_helpers.publish_resource(publisher, col)

        return col

    def internal_do(self):
        """Validate every collection of the tree, then update them all."""
        super().internal_do()

        collections = []

        # First pass: check that all the collections exist (no update performed),
        # so a missing one aborts the command before any data is touched.
        for node in self.tree:
            xcol = self._xcol_from_node(node)
            self.update_collection(xcol, False)

        # Second pass: perform the actual updates
        for node in self.tree:
            xcol = self._xcol_from_node(node)
            if xcol is None:
                # Unknown tag: skip instead of crashing on xcol.warnings
                continue
            self.warnings.extend(xcol.warnings)
            xcol = self.update_collection(xcol)
            collections.append(xcol)

        return collections

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")

1355 

1356 

1357##################################################################### 

1358# 

1359# replaceIssueXmlCmd: updates an issue 

1360# 

1361# Exception raised: 

1362# - exceptions.ResourceDoesNotExist during do if the Collection/Issue/Provider does not exist 

1363# <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value> 

1364# - RuntimeError if undo is called 

1365# 

1366###################################################################### 

class replaceIssueXmlCmd(addXmlCmd):
    """
    replaceIssueXmlCmd: updates an issue

    Exception raised:
        - exceptions.ResourceDoesNotExist during do if the Collection/Issue/Provider does not exist
          <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError if undo is called
    """

    def internal_do(self):
        """Delete the existing issue and re-import it from the parsed XML."""
        super().internal_do()

        xissue = jats_parser.JatsIssue(tree=self.tree)
        self.warnings.extend(xissue.warnings)

        xjournal = xissue.journal
        journal_id = xjournal.pid
        journal = model_helpers.get_collection(journal_id)

        if journal is None:
            raise exceptions.ResourceDoesNotExist("Journal %s does not exist" % xjournal.pid)

        issue_id = xissue.pid
        issue = model_helpers.get_container(issue_id)

        if issue is None:
            raise exceptions.ResourceDoesNotExist("Issue %s does not exist" % issue_id)

        publisher = issue.my_publisher

        cmd = ptf_cmds.addContainerPtfCmd()
        cmd.set_object_to_be_deleted(issue)
        cmd.undo()

        # Bug fix: my_publisher can be None; guard before dereferencing,
        # consistent with addBookXmlCmd.internal_do
        if publisher and publisher.publishes.count() == 0:
            self.remove_publisher(publisher)

        # update the journal first and last year
        for the_issue in journal.content.all():
            self.update_collection_years(journal_id, the_issue, False)

        journal.save()

        cmd = addIssueXmlCmd(
            {
                "xissue": xissue,
                "use_body": False,
                "solr_commit": False,
                "extra_folder": self.from_folder,
                "to_folder": self.to_folder,
            }
        )
        issue = cmd.do()

        return issue

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")

1421 

1422 

class updateBookXmlCmd(addXmlCmd):
    """
    updateBookXmlCmd: updates a book

    Exception raised:
        - exceptions.ResourceDoesNotExist during do if the Book does not exist
        - RuntimeError if undo is called
    """

    no_bib = False  # Ignore the references during the import (used in Geodesic)

    def internal_do(self):
        """Replace an existing book: delete it, then re-import from the parsed XML."""
        super().internal_do()

        # Parse the incoming BITS XML
        xbook = jats_parser.BitsBook(tree=self.tree, no_bib=self.no_bib)
        self.warnings.extend(xbook.warnings)

        existing_book = model_helpers.get_container(xbook.pid)
        if existing_book is None:
            raise exceptions.ResourceDoesNotExist("Book %s does not exist" % xbook.pid)

        # unpublish and delete the existing publisher if necessary
        # self.update_publisher(xbook, book)

        # Note: the existing collection is not removed even if it no longer has a resource
        # TODO: urls/commands to add/update/delete a collection

        # Remove the current version of the book...
        delete_cmd = ptf_cmds.addContainerPtfCmd()
        delete_cmd.set_object_to_be_deleted(existing_book)
        delete_cmd.undo()

        # ...then import it again from the already-parsed metadata
        add_cmd = addBookXmlCmd(
            {
                "xbook": xbook,
                "use_body": False,
                "solr_commit": False,
                "from_folder": self.from_folder,
                "no_bib": self.no_bib,
                "to_folder": self.to_folder,
            }
        )
        return add_cmd.do()

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")

1473 

1474 

class addOrUpdateContainerXmlCmd(addXmlCmd):
    """
    addOrUpdateContainerXmlCmd: detects Container type from xml and adds or updates an issue or a book

    just detect Container type (do not check params etc.)
    """

    keep_metadata = False
    keep_translations = False
    backup_folder = None
    full_text_folder = ""
    fake = False  # Parse the XML but do not import
    no_bib = False  # Ignore the references during the import (used in Geodesic)
    embargo = False  # Import only the open articles (used in Geodesic)

    def check_params(self):
        super().check_params()

    def internal_do(self):
        """Dispatch to the issue or book command based on the XML root tag."""
        super().internal_do()

        root_tag = normalize(self.tree.tag)

        # Build the delegate command matching the container type
        if root_tag == "journal-issue":
            delegate = addOrUpdateIssueXmlCmd(
                {
                    "body": self.body,
                    "keep_metadata": self.keep_metadata,
                    "keep_translations": self.keep_translations,
                    "backup_folder": self.backup_folder,
                    "to_folder": self.to_folder,
                    "from_folder": self.from_folder,
                    "xml_file_folder": self.xml_file_folder,
                    "fake": self.fake,
                    "no_bib": self.no_bib,
                    "embargo": self.embargo,
                }
            )
        elif root_tag == "book":
            delegate = addOrUpdateBookXmlCmd(
                {
                    "body": self.body,
                    "from_folder": self.from_folder,
                    "to_folder": self.to_folder,
                    "no_bib": self.no_bib,
                    "embargo": self.embargo,
                }
            )
        else:
            raise RuntimeError("addOrupdateContainer command can't detect container type")

        result = delegate.do()
        self.warnings.extend(delegate.warnings)
        return result

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")

1534 

1535 

1536class addOrUpdateIssueXmlCmd(addXmlCmd): 

1537 """ 

1538 addOrUpdateIssueXmlCmd: adds or updates an issue 

1539 

1540 Adds an issue if it is not in the system or updates the issue if it is already there. 

1541 By default, no DOI is assigned for the articles. Set assign_doi to True. 

1542 

1543 from_folder / to_folder (declared in addXmlCmd): location of binary files to copy 

1544 backup_folder: folder where extra data (extid false_positive...) are (to be) stored in a json 

1545 

1546 keep_metadata: 

1547 True if you want to back up extra data (icon, dates, matching ids, ...) in the backup_folder 

1548 Default: False 

1549 Note: backup_obj_not_in_metadata / restore_obj_not_in_metadata is always called 

1550 We always want to preserve GraphicalAbstracts (they are not in the issue XML) 

1551 

1552 keep_translations: 

1553 True if you want back up/restore translations. 

1554 Default: False 

1555 Note: When you post an article to a journal (test) website, the translation is declared in the XML 

1556 But if you import a Cedrics article in Trammel, the XML does not list translations 

1557 

1558 Exception raised: 

1559 - exceptions.ResourceDoesNotExist during do if the Collection/Issue/Provider does not exist 

1560 <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value> 

1561 - RuntimeError if undo is called 

1562 """ 

1563 

1564 keep_metadata = False 

1565 keep_translations = False 

1566 backup_folder = None 

1567 assign_doi = False 

1568 full_text_folder = "" 

1569 

1570 xissue = None 

1571 fake = False # Parse the XML but do not import 

1572 no_bib = False # Ignore the references during the import (used in Geodesic) 

1573 embargo = False # Import only the open articles (used in Geodesic) 

1574 

1575 def check_params(self): 

1576 super().check_params() 

1577 

1578 if self.keep_metadata and self.assign_doi: 1578 ↛ 1579line 1578 didn't jump to line 1579 because the condition on line 1578 was never true

1579 raise ValueError("keep_metadata and assign_doi cannot both be true.") 

1580 

1581 if self.keep_metadata and self.backup_folder is None: 1581 ↛ 1582line 1581 didn't jump to line 1582 because the condition on line 1581 was never true

1582 raise ValueError("backup_folder needs to be set when keep_metadata is true.") 

1583 

1584 def internal_do(self): 

1585 super().internal_do() 

1586 

1587 if not self.xissue: 1587 ↛ 1605line 1587 didn't jump to line 1605 because the condition on line 1587 was always true

1588 self.xissue = xissue = jats_parser.JatsIssue( 

1589 tree=self.tree, 

1590 from_folder=self.from_folder, 

1591 no_bib=self.no_bib, 

1592 ) 

1593 if len(xissue.warnings) > 0 and self.xml_file_folder: 1593 ↛ 1594line 1593 didn't jump to line 1594 because the condition on line 1593 was never true

1594 warnings = [] 

1595 warning_keys = [] 

1596 for warning in xissue.warnings: 

1597 for key, value in warning.items(): 

1598 if value not in warning_keys: 

1599 warning_keys.append(value) 

1600 warnings.append({key: value}) 

1601 for warning in warnings: 

1602 print(warning) 

1603 self.warnings.extend(xissue.warnings) 

1604 else: 

1605 xissue = self.xissue 

1606 

1607 if self.fake: 1607 ↛ 1608line 1607 didn't jump to line 1608 because the condition on line 1607 was never true

1608 return 

1609 

1610 xjournal = xissue.journal 

1611 journal_id = xjournal.pid 

1612 journal = model_helpers.get_collection(journal_id) 

1613 

1614 if journal is None: 1614 ↛ 1615line 1614 didn't jump to line 1615 because the condition on line 1614 was never true

1615 raise exceptions.ResourceDoesNotExist("Journal %s does not exist" % xjournal.pid) 

1616 

1617 existing_issue = model_helpers.get_container(xissue.pid) 

1618 

1619 if existing_issue: 

1620 if self.embargo and existing_issue.embargo(): 1620 ↛ 1623line 1620 didn't jump to line 1623 because the condition on line 1620 was never true

1621 # Geodesic is for open access articles. 

1622 # We do not want to import the issues under embargo 

1623 print(f"Embargo, ignore {xissue.pid}") 

1624 return None 

1625 

1626 if self.keep_metadata: 

1627 # On commence par faire un backup de l'existant en cas de bug. 

1628 ptf_cmds.exportPtfCmd( 

1629 { 

1630 "pid": existing_issue.pid, 

1631 "with_internal_data": True, 

1632 "with_binary_files": False, 

1633 "for_archive": False, 

1634 "export_folder": os.path.join(settings.MERSENNE_TMP_FOLDER, "backup"), 

1635 } 

1636 ).do() 

1637 

1638 # On sauvegarde les données additionnelles (extid, deployed_date,...) 

1639 # dans un json qui sera ré-importé avec l'import du nouvel issue 

1640 params = { 

1641 "pid": existing_issue.pid, 

1642 "export_folder": self.backup_folder, 

1643 "export_all": True, 

1644 "with_binary_files": True, 

1645 } 

1646 ptf_cmds.exportExtraDataPtfCmd(params).do() 

1647 

1648 for article in existing_issue.article_set.all(): 

1649 backup_obj_not_in_metadata(article) 

1650 if self.keep_translations: 1650 ↛ 1651line 1650 didn't jump to line 1651 because the condition on line 1650 was never true

1651 backup_translation(article) 

1652 # changer nom de variable resource 

1653 for resource_in_special_issue in existing_issue.resources_in_special_issue.all(): 1653 ↛ 1656line 1653 didn't jump to line 1656 because the loop on line 1653 never started

1654 # External article can be part of special issue and backup can bug if so 

1655 

1656 if resource_in_special_issue.resource: 

1657 backup_obj_not_in_metadata(resource_in_special_issue.resource) 

1658 

1659 # On efface l'issue existant, sinon l'import va se plaindre d'articles existants 

1660 

1661 cmd = ptf_cmds.addContainerPtfCmd() 

1662 cmd.set_object_to_be_deleted(existing_issue) 

1663 cmd.undo() 

1664 

1665 # update the journal first and last year 

1666 for the_issue in journal.content.all(): 

1667 self.update_collection_years(journal_id, the_issue, False) 

1668 

1669 journal.save() 

1670 else: 

1671 issue_to_appear = model_helpers.get_issue_to_appear(journal_id) 

1672 

1673 # Dans le cas des AIF, les articles du volume à paraitre sont déplacés 

1674 # dans un nouveau volume avant publication (de AIF_0__0_ vers AIF_2018... par ex) 

1675 # La 1ère fois, AIF_2018_ n'est pas encore dans PTF et existing_issue vaut None. 

1676 # Exemple : AIF_0_0 contient doi1, doi2 et doi3, AIF_2018 contient doi1 et doi2. 

1677 # L'import va échouer car on ne peut avoir 2 fois le même article. 

1678 # La solution d'effacer AIF_0_0 n'est pas bonne car on perd doi3. 

1679 # Il faut supprimer les articles en commun (de _0__0 et 2018_) avant l'import 

1680 # du nouveau volume sinon il va y avoir des conflits. 

1681 

1682 if issue_to_appear and xissue.pid != issue_to_appear.pid: 

1683 # On sauvegarde les données additionnelles (extid, deployed_date,...) 

1684 # dans un json qui sera ré-importé avec l'import du nouvel issue 

1685 # ainsi que image associée via ptf-tools 

1686 if self.keep_metadata: 1686 ↛ 1696line 1686 didn't jump to line 1696 because the condition on line 1686 was always true

1687 params = { 

1688 "pid": issue_to_appear.pid, 

1689 "force_pid": xissue.pid, 

1690 "export_folder": self.backup_folder, 

1691 "export_all": True, 

1692 "with_binary_files": True, 

1693 } 

1694 ptf_cmds.exportExtraDataPtfCmd(params).do() 

1695 

1696 for xarticle in xissue.articles: 

1697 if isinstance(xarticle, dict): 1697 ↛ 1698line 1697 didn't jump to line 1698 because the condition on line 1697 was never true

1698 xdoi = xarticle["doi"] 

1699 else: 

1700 xdoi = getattr(xarticle, "doi") 

1701 article = issue_to_appear.article_set.filter(doi=xdoi).first() 

1702 if article: 1702 ↛ 1696line 1702 didn't jump to line 1696 because the condition on line 1702 was always true

1703 backup_obj_not_in_metadata(article) 

1704 if self.keep_translations: 1704 ↛ 1705line 1704 didn't jump to line 1705 because the condition on line 1704 was never true

1705 backup_translation(article) 

1706 

1707 params = {"to_folder": self.to_folder} # pour suppression des binaires 

1708 cmd = ptf_cmds.addArticlePtfCmd(params) 

1709 cmd.set_object_to_be_deleted(article) 

1710 cmd.undo() 

1711 

1712 # si backup_folder est différent de None, alors addIssueXmlCmd.post_do() utilise importExtraDataPtfCmd 

1713 cmd = addIssueXmlCmd( 

1714 { 

1715 "xissue": xissue, 

1716 "use_body": False, 

1717 # "body": self.body, 

1718 "assign_doi": self.assign_doi, 

1719 "full_text_folder": self.full_text_folder, # Cedrics: the full text for SolR is in a separate file 

1720 "extra_folder": self.backup_folder, 

1721 "from_folder": self.from_folder, 

1722 "to_folder": self.to_folder, 

1723 "no_bib": self.no_bib, 

1724 "embargo": self.embargo, 

1725 "solr_commit": False, 

1726 } 

1727 ) 

1728 new_issue = cmd.do() 

1729 

1730 if new_issue: 1730 ↛ 1749line 1730 didn't jump to line 1749 because the condition on line 1730 was always true

1731 new_articles = new_issue.article_set.all() 

1732 

1733 # Avec l'option self.assign_doi, on vérifie que les doi ont bien été assignés 

1734 for article in new_articles: 

1735 if self.assign_doi and article.doi is None: 1735 ↛ 1736line 1735 didn't jump to line 1736 because the condition on line 1735 was never true

1736 raise exceptions.ResourceHasNoDoi("The article %s has no DOI" % article.pid) 

1737 

1738 # TODO garbage collector on articles no longer in the issue 

1739 restore_obj_not_in_metadata(article) 

1740 if self.keep_translations: 1740 ↛ 1741line 1740 didn't jump to line 1741 because the condition on line 1740 was never true

1741 restore_translation(article) 

1742 if new_issue.ctype == "issue_special": 1742 ↛ 1743line 1742 didn't jump to line 1743 because the condition on line 1742 was never true

1743 resources_in_special_issue = new_issue.resources_in_special_issue.all() 

1744 for resource_in_special_issue in resources_in_special_issue: 

1745 # External article can be part of special issue and restore can bug if so 

1746 if resource_in_special_issue.resource: 

1747 restore_obj_not_in_metadata(resource_in_special_issue.resource) 

1748 

1749 return new_issue 

1750 

    def internal_undo(self):
        """Undo is not supported for update commands.

        Raises:
            RuntimeError: always.
        """
        raise RuntimeError("update commands do not support the undo")

1753 

1754 

class addOrUpdateBookXmlCmd(addXmlCmd):
    """Remove an existing book (if present), then re-import it from its BITS XML.

    The XML is parsed with jats_parser.BitsBook unless a pre-parsed ``xbook``
    is supplied. When an existing container with the same pid is found, it is
    deleted and the collection first/last years are recomputed before the
    fresh import is delegated to addBookXmlCmd.
    """

    xbook = None
    no_bib = False  # Ignore the references during the import (used in Geodesic)

    def internal_do(self):
        super().internal_do()

        if self.xbook:
            xbook = self.xbook
        else:
            # Parse the XML tree prepared by addXmlCmd
            xbook = jats_parser.BitsBook(tree=self.tree, no_bib=self.no_bib)
            self.warnings.extend(xbook.warnings)

        existing_book = model_helpers.get_container(xbook.pid)

        if existing_book:
            # Fetch the collection before the container is removed,
            # then delete the existing book via the add-command's undo
            delete_cmd = ptf_cmds.addContainerPtfCmd()
            delete_cmd.set_object_to_be_deleted(existing_book)
            delete_cmd.undo()

            collection = existing_book.get_collection()

            # Recompute the collection first and last year without the old book
            for container in collection.content.all():
                self.update_collection_years(collection.pid, container, False)

            collection.save()

        add_cmd = addBookXmlCmd(
            {
                "xbook": xbook,
                "use_body": False,
                # "body": self.body,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
                "no_bib": self.no_bib,
                "solr_commit": False,
            }
        )
        return add_cmd.do()

1797 

1798 

class updateBibitemCitationXmlCmd(baseCmd):
    """Rebuild the citation fields (xml/html/tex) of a bibitem.

    The external ids currently attached to the bibitem are collected and
    passed to jats_parser.update_bibitem_xml, and the regenerated citation
    strings are stored back on the bibitem.
    """

    def __init__(self, params=None):
        self.bibitem = None

        super().__init__(params)

        self.required_params.extend(["bibitem"])

    def set_bibitem(self, bibitem):
        self.bibitem = bibitem

    def internal_do(self):
        super().internal_do()

        # Snapshot the bibitem's external ids, keyed by id type
        new_ids = {
            bid.id_type: {
                "id_type": bid.id_type,
                "id_value": bid.id_value,
                "checked": bid.checked,
                "false_positive": bid.false_positive,
            }
            for bid in self.bibitem.bibitemid_set.all()
        }

        xbibitem = jats_parser.update_bibitem_xml(self.bibitem, new_ids)
        self.warnings.extend(xbibitem.warnings)

        # Persist the regenerated citation representations
        self.bibitem.citation_xml = xbibitem.citation_xml
        self.bibitem.citation_html = xbibitem.citation_html
        self.bibitem.citation_tex = xbibitem.citation_tex
        self.bibitem.save()

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")

1834 

1835 

1836###################################################################################### 

1837###################################################################################### 

1838# 

1839# Import Commands 

1840# 

1841###################################################################################### 

1842###################################################################################### 

1843 

1844 

class collectEntireCollectionXmlCmd(baseCmd):
    """
    Collect the PIDs of all the XML of a collection
    (collection.xml, issues.xml) found in a given folder.

    results: the list of pids
    """

    def __init__(self, params=None):
        self.pid = None
        self.folder = None

        super().__init__(params)

        self.required_params.extend(["pid", "folder"])

    def internal_do(self):
        super().internal_do()
        # Each entry is a (pid, file) pair; only the pid is kept
        entries = resolver.iterate_collection_folder(self.folder, self.pid)
        return [issue_pid for issue_pid, _file in entries]

1864 

1865 

class importEntireCollectionXmlCmd(baseCmd):
    """
    Import all the XML of a collection (collection.xml, issues.xml) of a given folder

    Required params: pid (collection pid), from_folder.
    Optional flags control the source format (Cedrics or JATS), what existing
    data to preserve, and progress reporting via callback(job, i).
    """

    def __init__(self, params=None):
        self.pid = None
        self.from_folder = None
        self.to_folder = None
        self.backup_folder = None
        self.keep_metadata = False  # Backup/Restore existing data not in the XML
        self.keep_translations = False  # Backup/Restore existing translations

        self.with_cedrics = True
        self.from_cedrics = False  # The entire collection is in Cedrics format
        self.date_for_pii = False  # Fetch publication_date for Elsevier articles
        self.first_issue = ""  # Skip issues before this one (passed to the folder iterator)
        self.fake = False  # Parse the XML but do not import

        self.no_bib = False  # Ignore the references during the import (used in Geodesic)
        self.embargo = False  # Import only the open articles (used in Geodesic)

        # Progress reporting: when callback is set, callback(job, i) is invoked
        # after each imported issue; otherwise pids are printed to stdout
        self.caller = None
        self.callback = None
        self.job = None

        super().__init__(params)

        self.required_params.extend(["pid", "from_folder"])

    def internal_do(self):
        """Import the collection record, then every issue of the folder.

        Workflow:
        1. Create the Collection if it does not exist yet (from collection.xml).
        2. If with_cedrics, wipe the collection's Solr documents first.
        3. Import each issue found by resolver.iterate_collection_folder,
           either directly from Cedrics XML or from JATS XML.
        4. If with_cedrics, also import the issues listed in the Cedrics
           metadata folder (CEDRAM_XML_FOLDER/<pid>/metadata).
        """
        super().internal_do()

        pid = self.pid
        resource = model_helpers.get_resource(pid)
        if not resource and not self.fake:
            # The collection is not in the database yet: create it from its XML
            body = resolver.get_archive_body(self.from_folder, pid, None)
            journals = addCollectionsXmlCmd(
                {"body": body, "from_folder": self.from_folder, "to_folder": self.to_folder}
            ).do()
            if not journals:
                raise ValueError(self.from_folder + " does not contain a collection")
            resource = journals[0]

        obj = resource.cast()

        if obj.classname != "Collection":
            raise ValueError(pid + " does not contain a collection")

        if self.with_cedrics:
            # with_cedrics means that you want to import everything from scratch
            # Delete solr documents (01/28/2020: Solr can have multiple docs with the same PID)
            cmd = solr_cmds.solrDeleteCmd({"q": "pid:" + self.pid + "*"})
            cmd.do()

        i = 0
        for pid, file_ in resolver.iterate_collection_folder(
            self.from_folder, self.pid, self.first_issue
        ):
            if self.callback is None:
                print(pid)

            if self.from_cedrics:
                cmd = importCedricsIssueDirectlyXmlCmd(
                    {
                        "colid": self.pid,
                        "input_file": file_,
                        "remove_email": False,
                        "remove_date_prod": True,
                        "copy_files": True,
                        "force_dois": False,
                    }
                )
            else:
                body = resolver.get_body(file_)
                xml_file_folder = os.path.dirname(file_)
                cmd = addOrUpdateContainerXmlCmd(
                    {
                        "body": body,
                        "from_folder": self.from_folder,
                        "to_folder": self.to_folder,
                        "backup_folder": self.backup_folder,  # Read extra data (if any) stored in a json file
                        "xml_file_folder": xml_file_folder,  # when article.XML are in separate files
                        "keep_metadata": self.keep_metadata,  # Backup/Restore existing data not in the XML
                        "keep_translations": self.keep_translations,  # Backup/Restore existing translations
                        "no_bib": self.no_bib,
                        "embargo": self.embargo,
                        # Needed in Trammel
                        "fake": self.fake,
                    }
                )
            cmd.do()

            i += 1
            if self.callback:
                self.callback(self.job, i)

        if self.with_cedrics:
            src_folder = os.path.join(settings.CEDRAM_XML_FOLDER, self.pid, "metadata")

            # One XML file per issue in the Cedrics metadata folder
            xml_files = [
                os.path.join(src_folder, f)
                for f in os.listdir(src_folder)
                if os.path.isfile(os.path.join(src_folder, f)) and f.endswith(".xml")
            ]
            for xml_file in xml_files:
                if self.callback is None:
                    print(xml_file)

                cmd = importCedricsIssueXmlCmd(
                    {
                        "colid": self.pid,
                        "input_file": xml_file,
                        "from_folder": self.from_folder,
                        "to_folder": self.to_folder,
                    }
                )
                cmd.do()

1986 

1987 

class importCedricsIssueXmlCmd(baseCmd):
    """Import a Cedrics issue.

    The Cedrics XML (input_file) is converted to JATS by an external
    cedram2ptf.py script, parsed into an xissue, then either imported in the
    database or only compared to it (diff_only).
    Required param: colid (collection pid).
    """

    def __init__(self, params=None):
        self.colid = None
        self.input_file = None
        self.remove_email = True  # passed to cedram2ptf.py ("-e" keeps emails)
        self.remove_date_prod = True  # passed to cedram2ptf.py ("-t")
        self.diff_only = False  # compare with the database instead of importing
        self.body = None  # raw XML text of the converted issue
        self.xissue = None  # parsed issue (jats_parser.JatsIssue)
        self.copy_files = True

        super().__init__(params)

        self.required_params.extend(["colid"])

    def import_full_text(self, issue):
        """
        Some journals want to display the full text in HTML (CRCHIM/CRGEOS/CEBIOL)
        Read the XML file and convert the body in HTML
        """
        tex_src_folder = resolver.get_cedram_issue_tex_folder(self.colid, issue.pid)
        tex_folders, _ = resolver.get_cedram_tex_folders(self.colid, issue.pid)

        if len(tex_folders) > 0:
            i = 0
            # NOTE(review): pairs each article with tex_folders[i] by position —
            # assumes issue.article_set order matches the tex folder order; confirm.
            for article in issue.article_set.all():
                article_folder = tex_folders[i]
                xml_file = os.path.join(
                    tex_src_folder, article_folder, "FullText", article_folder + ".xml"
                )

                # Record the article folder name as the article's "ojs-id"
                cmd = ptf_cmds.updateResourceIdPtfCmd(
                    {"id_type": "ojs-id", "id_value": article_folder}
                )
                cmd.set_resource(article)
                cmd.do()

                if os.path.isfile(xml_file):
                    with open(xml_file, encoding="utf-8") as f:
                        body = f.read()

                    cmd = addBodyInHtmlXmlCmd(
                        {
                            "body": body,
                            "from_folder": settings.CEDRAM_XML_FOLDER,
                            # needed to copy binary files such as images
                            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER,  # idem
                        }
                    )
                    cmd.set_article(article)
                    cmd.do()

                i += 1

    def import_in_db(self):
        """
        Import Cedrics issue from /cedram_dev/exploitation/cedram
        This workflow is no longer used.

        Returns the imported issue.
        """

        # Cedrics: the full text for SolR is in a separate file
        full_text_folder = os.path.dirname(os.path.dirname(self.input_file)) + "/plaintext/"

        params = {
            "assign_doi": False,
            "full_text_folder": full_text_folder,
            "keep_metadata": True,
            "keep_translations": True,
            "use_body": False,
            "xissue": self.xissue,
            "backup_folder": settings.MERSENNE_TMP_FOLDER,
            "from_folder": settings.CEDRAM_XML_FOLDER,
            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER if self.copy_files else None,
        }

        # params['body'] = self.body

        cmd = addOrUpdateIssueXmlCmd(params)
        issue = cmd.do()
        self.warnings.extend(cmd.get_warnings())

        # resolver.copy_binary_files(
        #     issue,
        #     settings.CEDRAM_XML_FOLDER,
        #     settings.MERSENNE_TEST_DATA_FOLDER)

        self.import_full_text(issue)

        return issue

    def compare_issue(self):
        """Compare the parsed issue (self.xissue) with the one in the database.

        Returns a tuple (result, issues_diff, xissue). result is the
        comparator verdict; it stays True when the issue is not yet in the
        database. issues_diff holds the differences found.
        """
        xissue = self.xissue
        issues_diff = {}
        result = True

        time1 = timezone.now()

        new_dois = [article.doi for article in xissue.articles]

        # Prefetch everything db_to_issue_data will need, to avoid N+1 queries
        article_qs = Article.objects.filter(doi__in=new_dois).prefetch_related(
            "abstract_set",
            "kwd_set",
            "subj_set",
            "datastream_set",
            "relatedobject_set",
            "resourcecount_set",
            "contributions",
            "contributions__contribaddress_set",
            "bibitem_set__bibitemid_set",
            "bibitem_set__contributions",
            "bibitem_set__contributions__contribaddress_set",
        )

        issue = None
        try:
            issue = (
                Container.objects.select_related("my_collection", "my_publisher")
                .prefetch_related(
                    Prefetch("article_set", queryset=article_qs, to_attr="articles_from_doi")
                )
                .get(sites__id=settings.SITE_ID, pid=xissue.pid)
            )
        except Container.DoesNotExist:
            pass

        if issue:
            data_issue = model_data_converter.db_to_issue_data(issue, issue.articles_from_doi)

            time2 = timezone.now()
            delta = time2 - time1

            # NOTE(review): the value of this expression is discarded —
            # probably meant to be printed/logged as elapsed seconds
            delta.seconds + delta.microseconds / 1e6
            print(delta)

            # Handle xml cmds side effects (ex: "numdam" changed into "mathdoc", ...)
            model_data_comparator.prepare_issue_for_comparison(xissue)

            issue_comparator = model_data_comparator.IssueDataComparator()

            result = issue_comparator.compare(data_issue, xissue, issues_diff)

        return (result, issues_diff, xissue)

    def delete_previous_file(self, output_folder):
        """Remove the previous converted file (if any) and make sure the
        destination folders exist. Returns the output file path."""
        basename = os.path.basename(self.input_file)

        output_file = os.path.join(output_folder, self.colid, basename)
        if os.path.isfile(output_file):
            os.remove(output_file)

        os.makedirs(output_folder, exist_ok=True)
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

        return output_file

    def import_cedrics_issue(self):
        """
        Import Cedrics issue from /cedram_dev/exploitation/cedram
        This workflow is no longer used.
        Cedrics issues are imported from /cedram_dev/production_tex/CEDRAM
        (see importCedricsIssueDirectlyXmlCmd below)

        Converts the Cedrics XML to JATS with the external cedram2ptf.py
        script, then parses the result into self.body / self.xissue.
        Raises RuntimeError if the conversion fails.
        """

        output_folder = settings.MERSENNE_TMP_FOLDER
        ptf_xsl_folder = settings.PTF_XSL_FOLDER
        log_file = os.path.join(output_folder, settings.MERSENNE_LOG_FILE)

        # 1. Delete the previous file
        output_file = self.delete_previous_file(output_folder)

        # 2. Transform the cedrics XML into JATS
        cmd_folder = os.path.join(ptf_xsl_folder, "cedram")

        cmd_str = 'cd {}; {} cedram2ptf.py -v -x {} -p {} -o {} -b "" -l {} {} {} > {} 2>&1'.format(
            cmd_folder,
            os.path.join(settings.VIRTUALENV_DIR, "bin/python"),
            "-s" if self.colid in settings.MERSENNE_SEMINARS else "",
            self.input_file,
            output_folder,
            log_file + "1",
            # option -e for cedram2ptf.py for not removing email
            "-e" if not self.remove_email else "",
            "-t" if self.remove_date_prod else "",
            log_file,
        )

        log_file2 = log_file + "2"
        with open(log_file2, "w", encoding="ascii") as file_:
            file_.write(cmd_str + "\n")

            sys.path.append(ptf_xsl_folder + "/lib")

            try:
                result = subprocess.check_output(cmd_str, shell=True)
            except Exception as e:
                # On failure, append the converter's log to the message
                with open(log_file) as logfile_:
                    logfile_body = logfile_.read()
                message = str(e) + "\n" + logfile_body + "\n"
                file_.write(message)
                file_.close()
                raise RuntimeError(message)

            file_.write(str(result) + "\n")

        # Check if the output_file has been created
        if not os.path.isfile(output_file):
            raise RuntimeError("The file was not converted in JATS")

        with open(output_file, encoding="utf-8") as f:
            self.body = f.read()

        parser = etree.XMLParser(
            huge_tree=True, recover=True, remove_blank_text=True, remove_comments=True
        )
        tree = etree.fromstring(self.body.encode("utf-8"), parser=parser)
        self.xissue = jats_parser.JatsIssue(tree=tree)
        self.warnings.extend(self.xissue.warnings)

    def internal_do(self):
        super().internal_do()

        # Parse the Cedrics file (fills self.xissue) unless one was supplied
        if not self.xissue:
            self.import_cedrics_issue()

        result = None

        if self.diff_only:
            result = self.compare_issue()
        else:
            result = self.import_in_db()

        return result

2220 

2221 

# import from /cedram_dev/production_tex/CEDRAM
class importCedricsIssueDirectlyXmlCmd(importCedricsIssueXmlCmd):
    """Import a Cedrics issue by parsing the Cedrics XML directly,
    without the Cedrics -> JATS conversion of the parent class."""

    def __init__(self, params=None):
        self.is_seminar = False
        self.article_folders = None  # list of article tex folder names, set in import_cedrics_issue
        self.force_dois = True  # raise if an article has no DOI
        super().__init__(params)

    def read_file(self, filename, skip_lines=2):
        """Read a file and return its lines, dropping the first lines.

        Lines with 0-based index <= skip_lines are dropped (i.e. the first
        skip_lines + 1 lines — presumably the XML declaration/doctype of each
        article file, so the files can be concatenated; confirm).
        Falls back to iso-8859-1 when the file is not valid UTF-8.
        """
        i = 0
        lines = []
        try:
            with open(filename, encoding="utf-8") as fr:
                for line in fr:
                    if i > skip_lines:
                        lines.append(line)
                    i += 1
        except UnicodeDecodeError:
            # Not UTF-8: restart the read with the legacy encoding
            i = 0
            lines = []
            with open(filename, encoding="iso-8859-1") as fr:
                for line in fr:
                    if i > skip_lines:
                        lines.append(line)
                    i += 1

        return lines

    def import_cedrics_issue(self):
        """
        Parse the Cedrics XML directly, without Cedrics -> JATS transformation
        The deplace_fasc script is no longer needed, but the Cedrics issue XML has to be created
        Workflow
        1. Get the list of articles from /cedram_dev/production_tex/CEDRAM
        2. Cat the article XML files into one issue.XML
        3. Read the Cedrics issue.XML

        :return:
        """

        output_folder = settings.MERSENNE_TMP_FOLDER
        output_file = self.delete_previous_file(output_folder)

        # Derive the issue pid from the input file name
        basename = os.path.basename(self.input_file)
        if "-cdrxml" in basename:
            pid = basename.split("-cdrxml.")[0]
        else:
            pid = basename.split(".xml")[0]

        # 1. Get the list of articles
        tex_src_folder = resolver.get_cedram_issue_tex_folder(self.colid, pid)
        self.article_folders, self.dois = resolver.get_cedram_tex_folders(self.colid, pid)

        # 2. Create the issue XML file
        with open(output_file, "w", encoding="utf-8") as fw:
            # 2.a. Start the issue.xml based on @pid-cdrxml.xml
            fw.write('<?xml version="1.0" encoding="utf-8" standalone="no"?>\n')
            fw.write('<!DOCTYPE cedram SYSTEM "/home/cedram/XML/dtd/cedram.dtd">\n')
            fw.write("<cedram>\n")

            lines = self.read_file(self.input_file)
            for line in lines:
                fw.write(line)

            # 2.b. Cat the article XML files
            for basename in self.article_folders:
                src_file = os.path.join(tex_src_folder, basename, basename + "-cdrxml.xml")

                lines = self.read_file(src_file)
                for line in lines:
                    fw.write(line)

            fw.write("</cedram>\n")

        # 3. Read the Cedrics issue.XML
        with open(output_file, encoding="utf-8") as f:
            self.body = f.read()

        parser = etree.XMLParser(
            huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
        )
        tree = etree.fromstring(self.body.encode("utf-8"), parser=parser)
        self.xissue = cedrics_parser.CedricsIssue(
            tree=tree,
            is_seminar=self.is_seminar,
            ignore_date_published=self.remove_date_prod,
            article_folders=self.article_folders,
            dois=self.dois,
        )
        if self.force_dois:
            # Every article must have a DOI when force_dois is set
            for xarticle in self.xissue.articles:
                if xarticle.doi is None:
                    raise ValueError(xarticle.pid, "n'a pas de doi")

        self.warnings.extend(self.xissue.warnings)

    def import_in_db(self):
        """Add or update the parsed issue in the database, then import the
        HTML full text of its articles. Returns the issue."""
        params = {
            "assign_doi": False,
            "full_text_folder": settings.CEDRAM_TEX_FOLDER,  # the full text for SolR is in a separate file
            "keep_metadata": True,
            "keep_translations": True,  # The cedrics XML does not have the translations. backup/restore them.
            "use_body": False,
            "xissue": self.xissue,
            "backup_folder": settings.MERSENNE_TMP_FOLDER,  # temp folder used to backup/restore info during the import
            "from_folder": settings.CEDRAM_TEX_FOLDER,
            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER if self.copy_files else None,
        }

        cmd = addOrUpdateIssueXmlCmd(params)
        issue = cmd.do()
        self.warnings.extend(cmd.get_warnings())

        self.import_full_text(issue)

        return issue

2338 

2339 

class addCedricsIssueXmlCmd(addXmlCmd):
    """Parse a Cedrics issue XML tree and return the resulting xissue."""

    assign_doi = False
    full_text_folder = ""
    import_folder = None
    prod_deployed_date_iso_8601_date_str = None
    xissue = None
    remove_blank_text = False
    is_seminar = False

    def internal_do(self):
        super().internal_do()

        # Parse the tree prepared by addXmlCmd and keep the result
        parsed_issue = cedrics_parser.CedricsIssue(tree=self.tree, is_seminar=self.is_seminar)
        self.xissue = parsed_issue

        return parsed_issue

2355 

2356 

class addorUpdateCedricsArticleXmlCmd(baseCmd):
    """Add or update a single Cedrics article inside an existing issue.

    The article XML is read from CEDRAM_TEX_FOLDER/<colid>/<issue>/<folder>,
    parsed, and imported in Django/Solr; existing extra data (extid,
    deployed_date, ...) and translations are backed up and restored.
    """

    def __init__(self, params=None):
        self.container_pid = None
        self.article_folder_name = None

        super().__init__(params)

        self.required_params.extend(["container_pid", "article_folder_name"])

    def internal_do(self):
        super().internal_do()

        issue = model_helpers.get_container(self.container_pid)
        if not issue:
            raise exceptions.ResourceDoesNotExist(f"Issue {self.container_pid} does not exist")

        colid = issue.my_collection.pid
        article_folder = os.path.join(
            settings.CEDRAM_TEX_FOLDER, colid, self.container_pid, self.article_folder_name
        )

        # 1. Read the Cedrics article.XML
        input_file = os.path.join(article_folder, f"{self.article_folder_name}-cdrxml.xml")
        with open(input_file, encoding="utf-8") as f:
            body = f.read()

        # 2. Parse the file and create an xarticle
        is_seminar = colid in settings.MERSENNE_SEMINARS
        parser = etree.XMLParser(
            huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
        )
        tree = etree.fromstring(body.encode("utf-8"), parser=parser)
        xarticle = cedrics_parser.CedricsArticle(
            tree=tree,
            colid=colid,
            issue_id=self.container_pid,
            is_seminar=is_seminar,
            ignore_date_published=True,
            article_folder=self.article_folder_name,
        )
        if xarticle.doi is None:
            raise ValueError(xarticle.pid, "n'a pas de doi")

        # Get the article position in its issue (seq) to preserve its order
        article_folders, dois = resolver.get_cedram_tex_folders(colid, self.container_pid)
        i = 1
        for folder in article_folders:
            if folder == self.article_folder_name:
                xarticle.seq = i
            i += 1

        existing_article = model_helpers.get_article(xarticle.pid)
        temp_folder = settings.MERSENNE_TMP_FOLDER

        # 3. Backup/delete the existing article
        if existing_article:
            # Start with a backup of the existing issue, in case of a bug
            ptf_cmds.exportPtfCmd(
                {
                    "pid": self.container_pid,
                    "with_internal_data": True,
                    "with_binary_files": False,
                    "for_archive": False,
                    "export_folder": os.path.join(temp_folder, "backup"),
                }
            ).do()

            # Save the additional data (extid, deployed_date, ...) in a JSON file
            params = {
                "pid": existing_article.pid,
                "export_folder": temp_folder,
                "export_all": True,
                "with_binary_files": True,
            }
            ptf_cmds.exportExtraDataPtfCmd(params).do()

            backup_obj_not_in_metadata(existing_article)
            backup_translation(existing_article)

            # No need to delete the existing article: addArticleXmlCmd does it in standalone mode

        # 4. Add the article in Django/SolR
        params = {
            "xarticle": xarticle,
            "issue": issue,
            "standalone": True,
            "use_body": False,  # No self.body with the content of the XML file; xarticle is passed directly
            "full_text_folder": settings.CEDRAM_TEX_FOLDER,  # the full text for SolR is in a separate file
            # temp folder used to backup/restore info during the import
            "from_folder": settings.CEDRAM_TEX_FOLDER,
            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER,
            "keep_translations": True,
        }

        cmd = addArticleXmlCmd(params)
        cmd.set_collection(issue.my_collection)
        article = cmd.do()

        # 5. Read the full text in HTML
        xml_file = os.path.join(article_folder, "FullText", self.article_folder_name + ".xml")
        if os.path.isfile(xml_file):
            with open(xml_file, encoding="utf-8") as f:
                body = f.read()

            cmd = addBodyInHtmlXmlCmd(
                {
                    "body": body,
                    "from_folder": settings.CEDRAM_XML_FOLDER,
                    # needed to copy binary files such as images
                    "to_folder": settings.MERSENNE_TEST_DATA_FOLDER,  # idem
                    "remove_blank_text": False,
                }
            )
            cmd.set_article(article)
            cmd.do()

        # 6. Add the ojs-id for ptf-tools
        cmd = ptf_cmds.updateResourceIdPtfCmd(
            {"id_type": "ojs-id", "id_value": self.article_folder_name}
        )
        cmd.set_resource(article)
        cmd.do()

        # 7. Restore the additional data (extid, deployed_date, ...)
        if existing_article:
            ptf_cmds.importExtraDataPtfCmd(
                {"pid": existing_article.pid, "import_folder": temp_folder}
            ).do()

            restore_obj_not_in_metadata(article)
            restore_translation(article)

        return article

2490 

2491 

class transformBodyInHtmlXmlCmd(addXmlCmd):
    """
    Transform the JATS body of an article into HTML text.

    TODO: handle images,...
    """

    use_body = False

    def internal_do(self):
        super().internal_do()

        # Compile the HTML stylesheet and apply it to the parsed tree
        transform = etree.XSLT(etree.parse(settings.PTF_HTML_XSL))
        html_root = transform(self.tree).getroot()

        # Keep only the inner XML of the <main> element of the result
        main_node = html_root.find("body/article/main")
        return xmldata_jats.innerxml(main_node).decode("utf-8")

2515 

2516 

class addBodyInHtmlXmlCmd(addXmlCmd):
    """
    addBodyInHtmlXmlCmd: read the JATS body of an article
    and create the corresponding HTML

    The article is identified either by pid or by a pre-fetched Article
    (set_article); its html-image related objects are replaced and its
    body_html/body_tex/body_xml fields updated.

    TODO: handle images,... manage warnings for unused tag ?
    """

    def __init__(self, params=None):
        self.article = None
        self.pid = None

        super().__init__(params)

    def set_article(self, article):
        self.article = article

    def pre_do(self):
        """Resolve article/pid from one another; one of them is required."""
        super().pre_do()

        if self.pid is None and self.article is None:
            raise ValueError("pid et article sont vides")

        if self.article is None:
            self.article = model_helpers.get_article(self.pid)

        if self.pid is None:
            self.pid = self.article.pid

    def internal_do(self):
        super().internal_do()

        xarticle = jats_parser.JatsArticle(tree=self.tree, pid=self.pid)
        # should we collect the warnings of the HTML parsing?
        # self.warnings.extend(xarticle.warnings)

        # Replace the article's html-image related objects with the parsed figures
        self.article.relatedobject_set.filter(rel="html-image").delete()
        self.add_objects_with_location(xarticle.figures, self.article, "RelatedObject")

        params = {
            "body_html": xarticle.body_html,
            "body_tex": xarticle.body_tex,
            "body_xml": xarticle.body_xml,
            "use_page_count": False,
        }

        cmd = ptf_cmds.updateArticlePtfCmd(params)
        cmd.set_article(self.article)
        cmd.do()

        # copy_binary_files will call resolver.copy_html_images
        # to copy the article images
        # because updateArticlePtfCmd is not from addPtfCmd, need to copy files here

        resolver.copy_html_images(
            self.article, settings.MERSENNE_TEST_DATA_FOLDER, settings.CEDRAM_XML_FOLDER
        )

2574 

2575 

2576class updateCacheXmlCmd(baseCmd): 

2577 """ 

2578 recreate the citation_html field of the bibitems 

2579 

2580 Params: colid: pid of the collection to process 

2581 """ 

2582 

    def __init__(self, params=None):
        # colid: pid of the collection to process (required)
        self.colid = None
        # start_id: optional container pid; earlier containers are skipped
        self.start_id = None

        super().__init__(params)

        self.required_params.extend(["colid"])

2590 

    def update_article(self, xarticle):
        """Refresh the cached html/tex fields of one article from its parsed XML.

        Updates titles, abstracts and bibitem citation fields, and — when
        settings.SHOW_BODY is set — the article body.

        Raises:
            exceptions.ResourceDoesNotExist: if the article is not in the database.
        """
        article = model_helpers.get_article(xarticle.pid)
        if article is None:
            raise exceptions.ResourceDoesNotExist(f"Article {xarticle.pid} does not exist")

        article.title_html = xarticle.title_html
        article.title_tex = xarticle.title_tex
        article.trans_title_html = xarticle.trans_title_html
        article.trans_title_tex = xarticle.trans_title_tex
        article.save()

        # NOTE(review): the zip pairs parsed and stored items by position —
        # assumes both sequences are in the same order; confirm.
        for xabstract, abstract in zip(xarticle.abstracts, article.abstract_set.all()):
            abstract.value_html = xabstract["value_html"]
            abstract.value_tex = xabstract["value_tex"]
            abstract.save()

        # for xkwd_group, kwd_group in zip(xarticle.kwd_groups, article.kwdgroup_set.all()):
        #     kwd_group.value_html = xkwd_group['value_html']
        #     kwd_group.value_tex = xkwd_group['value_tex']
        #     kwd_group.save()

        for xbib, bib in zip(xarticle.bibitems, article.bibitem_set.all()):
            bib.citation_html = xbib.citation_html
            bib.citation_tex = xbib.citation_tex
            bib.article_title_tex = xbib.article_title_tex
            bib.chapter_title_tex = xbib.chapter_title_tex
            bib.source_tex = xbib.source_tex
            bib.volume = xbib.volume
            bib.save()

        if hasattr(settings, "SHOW_BODY") and settings.SHOW_BODY:
            params = {
                "body_html": xarticle.body_html,
                "body_tex": xarticle.body_tex,
                "body_xml": xarticle.body_xml,
                "use_page_count": False,
            }

            cmd = ptf_cmds.updateArticlePtfCmd(params)
            cmd.set_article(article)
            cmd.do()

2632 

2633 def internal_do(self): 

2634 super().internal_do() 

2635 

2636 collection = model_helpers.get_collection(self.colid) 

2637 if collection is None: 

2638 raise exceptions.ResourceDoesNotExist(f"Collection {self.colid} does not exist") 

2639 

2640 qs = collection.content.all().order_by("pid") 

2641 start = self.start_id is None 

2642 for container in qs: 

2643 if not start and container.pid == self.start_id: 

2644 start = True 

2645 

2646 if start: 

2647 print(container.pid) 

2648 with_body = hasattr(settings, "SHOW_BODY") and settings.SHOW_BODY 

2649 xml_body = ptf_cmds.exportPtfCmd( 

2650 {"pid": container.pid, "with_body": with_body} 

2651 ).do() 

2652 

2653 parser = etree.XMLParser( 

2654 huge_tree=True, 

2655 recover=True, 

2656 remove_blank_text=False, 

2657 remove_comments=True, 

2658 resolve_entities=True, 

2659 ) 

2660 tree = etree.fromstring(xml_body.encode("utf-8"), parser=parser) 

2661 xissue = jats_parser.JatsIssue(tree=tree) 

2662 

2663 for xarticle in xissue: 

2664 self.update_article(xarticle)