Coverage for src/ptf/cmds/xml_cmds.py: 51%

1259 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-11-05 09:56 +0000

1import copy 

2import datetime 

3import os.path 

4import subprocess 

5import sys 

6import traceback 

7 

8from lxml import ElementInclude 

9from lxml import etree 

10 

11from django.conf import settings 

12from django.db import transaction 

13from django.db.models import Prefetch 

14from django.utils import timezone 

15 

16from ptf import exceptions 

17from ptf import model_data 

18from ptf import model_data_comparator 

19from ptf import model_data_converter 

20from ptf import model_helpers 

21from ptf import tex 

22from ptf import utils 

23from ptf.cmds import ptf_cmds 

24from ptf.cmds import solr_cmds 

25from ptf.cmds.base_cmds import baseCmd 

26from ptf.cmds.xml import xml_utils 

27from ptf.cmds.xml.cedrics import cedrics_parser 

28 

29# KEEP THIS UNUSED IMPORT THEY ARE USED 

30from ptf.cmds.xml.jats import jats_parser 

31from ptf.cmds.xml.jats import xmldata as xmldata_jats 

32from ptf.cmds.xml.xml_utils import normalize 

33from ptf.display import resolver 

34 

35# from ptf.models import Resource 

36from ptf.models import Article 

37from ptf.models import Collection 

38from ptf.models import Container 

39from ptf.models import Person 

40from ptf.models import PtfSite 

41from ptf.models import backup_obj_not_in_metadata 

42from ptf.models import backup_translation 

43from ptf.models import restore_obj_not_in_metadata 

44from ptf.models import restore_translation 

45 

46 

def find_file(name):
    """Search settings.MANAGER_XSLT_DIRS recursively for *name*.

    Returns the full path of the first match, or None when the file
    is not found under any of the configured directories.
    """
    for base_dir in settings.MANAGER_XSLT_DIRS:
        for dirpath, _, filenames in os.walk(base_dir):
            if name in filenames:
                return os.path.join(dirpath, name)
    return None

54 

55 

def get_transform(name):
    """Locate "<name>.xsl" in the manager XSLT dirs and return it as an lxml XSLT transform."""
    xsl_path = find_file(f"{name}.xsl")
    return etree.XSLT(etree.parse(xsl_path))

60 

61 

class addXmlCmd(baseCmd):
    """
    addXmlCmd: base class for commands that take an XML as input
    The XML is passed with the body param

    from_folder / to_folder: location of binary files to copy

    Example with a file:
        f = open('journal.xml')
        body = f.read()
        f.close()
        cmd = add...XmlCmd( { "body":body } )

    Exception raised:
        - ValueError if the init params are empty
    """

    use_body = True  # when True, "body" (the XML text) becomes a required param
    body: str | None = None
    tree = None  # lxml root element, built from self.body in pre_do()
    solr_commit_at_the_end = True
    xml_filename_in_log = None
    remove_blank_text = False
    xml_file_folder = None  # base folder used to resolve XIncludes

    def __init__(self, params=None):
        super().__init__(params)

        if self.use_body:
            self.required_params.extend(["body"])

    def get_logname(self):
        """Return a unique, unused log file name "<today>-<Class>-<i>.xml" in settings.LOG_DIR.

        Returns "" when LOG_DIR is not configured.
        """
        filename = ""

        if hasattr(settings, "LOG_DIR"):
            i = 0
            today = datetime.date.today()
            basename = str(today) + "-" + self.__class__.__name__ + "-"
            filename = os.path.join(settings.LOG_DIR, basename + str(i) + ".xml")

            # Bump the index until we find a name not already used on disk
            while os.path.isfile(filename):
                i += 1
                filename = os.path.join(settings.LOG_DIR, basename + str(i) + ".xml")

        return filename

    def pre_do(self):
        """Parse self.body into self.tree, then archive the XML body in the log folder.

        Raises ValueError if the parse yields no tree.
        """
        super().pre_do()

        if self.use_body:
            # The Cedrics -> JATS XSLT transform manually adds space=preserve around
            # the nodes with mixed-content, but leaves the text unchanged.
            # As such, parsing the Cedrics XML cannot be done with remove_blank_text=True
            # Or the spaces will be removed whereas the JATS XML will keep them.
            # We still need the remove_blank_text=True for JATS XML for all the other nodes
            parser = etree.XMLParser(
                huge_tree=True,
                recover=True,
                remove_blank_text=self.remove_blank_text,
                remove_comments=True,
                resolve_entities=True,
            )
            if self.xml_file_folder is not None:
                if self.xml_file_folder[-1] != "/":
                    self.xml_file_folder += "/"
                # Strip the xlink namespace so ElementInclude can find the href
                self.body = self.body.replace(
                    'xmlns:xlink="http://www.w3.org/1999/xlink"', ""
                ).replace("xlink:href", "href")
            tree = etree.fromstring(self.body.encode("utf-8"), parser=parser)

            if self.xml_file_folder is not None:
                ElementInclude.include(tree, base_url=self.xml_file_folder)
            self.tree = tree

            if self.tree is None:
                raise ValueError("tree est vide")

        # Write the xml body on disk
        if hasattr(settings, "LOG_DIR") and self.body and self.use_body:
            self.xml_filename_in_log = self.get_logname()

            with open(self.xml_filename_in_log, "w", encoding="utf-8") as file_:
                file_.write(self.body)

    @transaction.atomic
    def do(self, parent=None):
        """Run the command inside a DB transaction.

        On failure: rollback Solr, drop the sub-commands (so undo is a no-op),
        log the failure with its traceback in LOG_DIR/cmds.log, and re-raise.
        On success: commit Solr (unless solr_commit_at_the_end is False).
        """
        try:
            obj = super().do(parent)
        except Exception:
            ptf_cmds.do_solr_rollback()

            # Empty sub_cmds to ignore undo
            self.cmds = []

            # Write the xml body on disk
            if hasattr(settings, "LOG_DIR") and self.body and self.use_body:
                with open(
                    os.path.join(settings.LOG_DIR, "cmds.log"), "a", encoding="utf-8"
                ) as file_:
                    file_.write("----------------------\n")

                    if self.xml_filename_in_log is None:
                        self.xml_filename_in_log = self.get_logname()

                    file_.write(self.xml_filename_in_log + " : FAILED\n")
                    exc_type, exc_value, exc_traceback = sys.exc_info()
                    lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
                    for line in lines:
                        file_.write(line + "\n")
                    file_.write("----------------------\n")

            # Bare raise keeps the original traceback intact
            raise

        if self.solr_commit_at_the_end:
            ptf_cmds.do_solr_commit()

        return obj

    def post_undo(self):
        super().post_undo()

        # Remove the Person objects no longer referenced
        Person.objects.clean()

    def post_do(self, resource=None):
        """Log the pids of the created resource(s) and archive the XML next to them."""
        super().post_do(resource)

        # Remove the Person objects no longer referenced
        Person.objects.clean()

        if hasattr(settings, "LOG_DIR") and resource and self.use_body:
            today = datetime.date.today()
            basename = str(today) + "-" + self.__class__.__name__

            # resource may be a single object or a list: build "pid1, pid2, ..."
            pids = ""
            first = True
            if isinstance(resource, list):
                for resource_item in resource:
                    if first:
                        first = False
                    else:
                        pids += ", "

                    pids += resource_item.pid
            else:
                pids = resource.pid

            with open(os.path.join(settings.LOG_DIR, "cmds.log"), "a", encoding="utf-8") as file_:
                file_.write(basename + " : " + pids + "\n")

            if hasattr(resource, "my_collection") and resource.my_collection:
                # Keep a copy of the XML under LOG_DIR/<top collection pid>/<pid>/
                folder = os.path.join(
                    settings.LOG_DIR, resource.get_top_collection().pid, resource.pid
                )
                filename = os.path.join(folder, resource.pid + ".xml")
                resolver.create_folder(folder)
                with open(filename, "w", encoding="utf-8") as file_:
                    file_.write(self.body)

    def undo(self):
        super().undo()

        if self.solr_commit_at_the_end:
            ptf_cmds.do_solr_commit()

    def add_objects_with_location(self, xobjs, resource, cmd_type):
        """Attach ExtLink/RelatedObject/SupplementaryMaterial/DataStream objects to *resource*.

        xobjs are dicts with base/rel/location/mimetype (+ optional metadata/text/caption).
        XML files (mimetype "application/xml") are skipped, and do not consume a seq number.
        May raise ResourceExists if the same ExtLink/RelatedObject is added twice.
        """
        seq = 1

        for xobj in xobjs:
            base = None

            if xobj["base"]:
                base_name = xobj["base"]
                base = model_helpers.get_xmlbase(base_name)
                if base is None:
                    cmd = ptf_cmds.addXmlBasePtfCmd({"base": xobj["base"], "solr_commit": False})
                    base = cmd.do(self)

            rel = xobj["rel"]
            location = xobj["location"]

            params = {
                "rel": rel,
                "mimetype": xobj.get("mimetype", ""),
                "location": location,
                "seq": seq,
                "solr_commit": False,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
            }

            # Ignore XML file
            if params["mimetype"] != "application/xml":
                if "metadata" in xobj:
                    params["metadata"] = xobj["metadata"]

                if "text" in xobj:
                    params["text"] = xobj["text"]

                # TODO: cmd factory ?
                cmd = None
                if cmd_type == "ExtLink":
                    cmd = ptf_cmds.addExtLinkPtfCmd(params)
                elif cmd_type == "RelatedObject":
                    cmd = ptf_cmds.addRelatedObjectPtfCmd(params)
                elif cmd_type == "SupplementaryMaterial":
                    params["caption"] = xobj.get("caption", "")
                    params["supplementary_material"] = True
                    cmd = ptf_cmds.addSupplementaryMaterialPtfCmd(params)
                elif cmd_type == "DataStream":
                    cmd = ptf_cmds.addDataStreamPtfCmd(params)

                # Always try to add an ExtLink or a RelatedObject
                # May raise ResourceExists if the ExtLink/RelatedObject is added twice

                if cmd is not None:
                    cmd.set_base(base)
                    cmd.set_resource(resource)

                    cmd.do(self)

                seq += 1

    @staticmethod
    def remove_publisher(publisher):
        """Delete a publisher by undoing an addPublisherPtfCmd bound to it."""
        cmd = ptf_cmds.addPublisherPtfCmd()
        cmd.set_object_to_be_deleted(publisher)
        cmd.undo()

    # Update the published years of a collection (journal/acta/book-series...)
    @staticmethod
    def update_collection_years(pid, container, save=True):
        """Widen the collection's [fyear, lyear] range to include the container's year."""
        collection = Collection.objects.get(pid=pid)
        if container.year:
            year = container.year
            fyear, lyear = model_helpers.get_first_last_years(year)
            fyear = int(fyear)
            lyear = int(lyear)

            # Check for an unset fyear/lyear BEFORE comparing:
            # "fyear < collection.fyear" raises TypeError when collection.fyear is None
            if not collection.fyear or fyear < collection.fyear:
                collection.fyear = fyear

            if not collection.lyear or lyear > collection.lyear:
                collection.lyear = lyear

            if save:
                collection.save()

330 

class addCollectionsXmlCmd(addXmlCmd):
    """
    addCollectionsXmlCmd: adds/remove a collection

    TODO: merge Collection and Journal ?

    Exception raised:
        - exceptions.ResourceExists during do
            if the Collection already exists
            if the collection defines the same extlink/relatedobject multiple times
        - exceptions.ResourceDoesNotExist
            during undo if the Collection does not exist
            during do of the provider does not exist
              <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError during undo if resources are still published
    """

    provider = None  # fallback provider, used when the XML does not name one
    xml_format = None

    def set_provider(self, provider):
        self.provider = provider

    def add_collection(self, xcol, update=False):
        """Create (or, with update=True, update) a Collection from the parsed *xcol*.

        Returns the Collection, or None when xcol is empty.
        Raises exceptions.ResourceExists if the collection exists and update is False.
        """
        if not xcol:
            return None

        # Prefer the provider declared in the XML over the command-level one
        if xcol.provider:
            provider = model_helpers.get_provider_by_name(xcol.provider)
        else:
            provider = self.provider

        col_id = xcol.pid
        collection = model_helpers.get_collection(col_id)

        existing = False

        if collection is not None:
            existing = True
            if not update:
                raise exceptions.ResourceExists(f"Collection {collection.pid} already exists")

        # Create a collection
        params = {
            "xobj": xcol,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        cls = ptf_cmds.addCollectionPtfCmd
        if update and existing:
            cls = ptf_cmds.updateCollectionPtfCmd

        cmd = cls(params)
        cmd.set_provider(provider)
        collection = cmd.do(self)

        # Attach the collection's external links (website, logo...)
        self.add_objects_with_location(xcol.ext_links, collection, "ExtLink")

        # if publisher:
        #     model_helpers.publish_resource(publisher, journal)

        return collection

    def internal_do(self):
        """Parse self.tree and create one Collection per <publication-meta> child node.

        Only <publication-meta> is supported: <collection-meta> and <journal-meta>
        raise ValueError (journals are handled while parsing a <journal-issue>).
        Returns the list of created Collections.
        """
        super().internal_do()

        collections = []

        if self.tree.tag == "journal-meta":
            raise ValueError(
                "Creation of a journal on the fly from an article is not yet supported"
            )
            # # Code used when a journal is created on the fly while parsing an article (GDML - OAI)
            # # TODO 1 : Refactor all the JATS parsers (eudml/bdim/dmlcz/....)
            # #          to be compatible with jats_parser.py
            # # TODO 2 : Prevent the creation of the collection on the fly ?
            # #          Shouldn't the collection be monitored/controlled ?
            # xmldata = globals()[self.xml_format]
            # xcol = xmldata.Journal(self.tree)
            # collection = self.add_collection(xcol, update=True)
            # collections.append(collection)
        else:
            for node in self.tree:
                xcol = None
                if node.tag == "collection-meta":
                    raise ValueError("Collection can only be created from <publication-meta>")
                    # xcol = jats_parser.BitsCollection(tree=node)
                elif node.tag == "journal-meta":
                    raise ValueError(
                        "Collection can only be created from <publication-meta>, <journal-meta> are handled while parsing a <journal-issue>"
                    )
                    # xcol = jats_parser.JatsJournal(tree=node)
                elif node.tag == "publication-meta":
                    xcol = jats_parser.MathdocPublication(tree=node)

                collection = self.add_collection(xcol)
                collections.append(collection)

        return collections

432 

433 

class addIssueXmlCmd(addXmlCmd):
    """
    addIssueXmlCmd: adds/remove an issue

    from_folder / to_folder (declared in addXmlCmd): location of binary files to copy

    extra_folder: folder where extra data (extid false_positive...) are stored in a json
    It is used
        - when you call addIssueXmlCmd directly to import from an archive,
        - when you call addOrUpdateIssueXmlCmd and we need to restore extra data after the import

    Exception raised:
        - exceptions.ResourceExists during do if the issue already exists
        - exceptions.ResourceDoesNotExist
            during undo if the Issue does not exist
            during do if the serial/provider does not exist
              <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError during undo if resources are still published
    """

    assign_doi = False
    full_text_folder = ""
    extra_folder = None
    prod_deployed_date_iso_8601_date_str = None
    xissue = None  # pre-parsed issue; when None, self.tree is parsed in internal_do
    count = 0
    no_bib = False  # Ignore the references during the import (used in Geodesic)
    embargo = False  # Import only the open articles (used in Geodesic)

    def create_child_collection(self, xjournal, journal):
        """Create a child collection of *journal* for this xjournal, pid "<pid>-<issn>"."""
        issn = xjournal.issn if xjournal.issn else xjournal.e_issn

        new_xjournal = copy.deepcopy(xjournal)
        new_xjournal.wall = 0
        new_xjournal.pid = f"{xjournal.pid}-{issn}"
        new_xjournal.coltype = journal.coltype

        params = {"xobj": new_xjournal}
        provider = model_helpers.get_provider_by_name("mathdoc")

        cmd = ptf_cmds.addCollectionPtfCmd(params)
        cmd.set_parent(journal)
        cmd.set_provider(provider)

        collection = cmd.do()
        # collection.parent = journal
        # journal = collection
        return collection

    def get_historic_collection(self, xjournal, journal):
        """Return the (child) collection matching xjournal's issn, creating it if needed.

        When settings.USE_META_COLLECTIONS is off, simply returns *journal*.
        """
        use_meta_collections = (
            settings.USE_META_COLLECTIONS if hasattr(settings, "USE_META_COLLECTIONS") else False
        )

        if not use_meta_collections:
            return journal

        # meta-collections are used : journal may be the top collection or one of its children

        value = id_type = None

        # Take care of special case of STNB :
        # For that, we ignore the issn of STNB 2nd series
        if xjournal.pid == "JTNB" and xjournal.issn == "0989-5558":
            xjournal.issn = None
            xjournal.e_issn = None
            xjournal.ids = []
        else:
            if xjournal.issn:
                value = xjournal.issn
                id_type = "issn"
            elif xjournal.e_issn:
                value = xjournal.e_issn
                id_type = "e-issn"

        if value:
            # collection has at least one issn
            qs = Collection.objects.filter(resourceid__id_value=value, resourceid__id_type=id_type)
            if qs.exists():
                journal = qs.first()
            else:
                # xjournal does not exist yet.
                journal = self.create_child_collection(xjournal, journal)
        else:
            # collection has no issn
            possible_pids = [xjournal.pid, f"{xjournal.pid}-{value}"]
            qs = Collection.objects.exclude(resourceid__id_value__isnull=False).filter(
                pid__in=possible_pids
            )
            if qs.exists():
                journal = qs.first()
            else:
                journal = self.create_child_collection(xjournal, journal)

        return journal

    def internal_do(self):
        """Create the issue (Container) and its articles from the parsed XML.

        Returns the created issue, or None when the issue is skipped because
        of the embargo. Raises ResourceExists / ResourceDoesNotExist (see class doc).
        """
        super().internal_do()

        #######################################################################
        # get xissue

        if self.xissue:
            xissue = self.xissue
        else:
            xissue = jats_parser.JatsIssue(tree=self.tree, no_bib=self.no_bib)
            self.warnings.extend(xissue.warnings)

        #######################################################################
        # Check if there is an existing issue / journal

        issue_id = xissue.pid
        issue = model_helpers.get_container(issue_id)

        if issue is not None:
            raise exceptions.ResourceExists(f"Issue {issue_id} already exists")

        xjournal = xissue.journal
        journal_id = xjournal.pid
        journal = model_helpers.get_collection(journal_id)

        # Note: Why use <issue-meta><custom-meta-group><custom-meta> to find the provider and then the journal
        # as there is a <journal-meta> with an id ?
        # The ptf_resource table (Resource objects) are created with only 1 id.
        # When you add a journal, the journal id is the one of its
        # <custom-meta-group><custom-meta> provider.
        # If you want to find the journal of an issue based on the <journal-meta> information, you might
        # have to search among the other ids (ptf_resourceid table, ResourceId objects) : sql JOIN select
        # To avoid the join select, it's better to use <issue-meta><custom-meta-group><custom-meta> to make sure
        # we use the correct provider. A simple select in the ptf_resource table is then needed.
        if journal is None:
            raise exceptions.ResourceDoesNotExist(f"Journal {journal_id} does not exist")

        # Journal is the top collection (ex: AFST)
        # We want to get (or create) the journal that corresponds to the issue
        journal = self.get_historic_collection(xjournal, journal)

        if self.embargo and journal.wall > 0:
            # Geodesic is for open access articles.
            # We do not want to import the issues under embargo
            if resolver.embargo(journal.wall, xissue.year):
                print(f"Embargo, ignore {xissue.pid}")
                return None

        #######################################################################
        # Get provider/publisher

        provider_name = xissue.provider if xissue.provider else "mathdoc"
        provider = model_helpers.get_provider_by_name(provider_name)

        #######################################################################
        # Add the issue

        params = {
            "xobj": xissue,
            "pid": xissue.pid,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        cmd = ptf_cmds.addContainerPtfCmd(params)
        cmd.add_collection(journal)
        cmd.set_provider(provider)
        issue = cmd.do(self)

        self.add_objects_with_location(xissue.ext_links, issue, "ExtLink")
        self.add_objects_with_location(xissue.related_objects, issue, "RelatedObject")
        self.add_objects_with_location(xissue.streams, issue, "DataStream")

        #######################################################################
        # Add the issue's articles

        # JatsIssue is an iterator (has the __iter__ function)
        # you simply iterate the xissue to get its articles
        if xissue.ctype == "issue":
            for seq, xarticle in enumerate(xissue, start=1):
                params = {
                    "xarticle": xarticle,
                    "journal": journal,
                    "issue": issue,
                    "seq": seq,
                    "provider": provider,
                    "assign_doi": self.assign_doi,
                    "full_text_folder": self.full_text_folder,
                    "use_body": False,
                    "from_folder": self.from_folder,
                    "to_folder": self.to_folder,
                    "solr_commit_at_the_end": False,
                }
                cmd = addArticleXmlCmd(params)
                cmd.do(self)
        elif xissue.ctype == "issue_special":
            site = PtfSite.objects.get(id=settings.SITE_ID)
            issue.deploy(site)
            for seq, xresource in enumerate(xissue.articles, start=1):
                # NOTE: we could call the PtfCmd directly
                # and remove the xml cmd

                params = {
                    "use_body": False,
                    "xcontainer": issue,
                    "seq": seq,
                    # we only want to pass the resource_doi field to the function
                    "xresource": xresource,
                    "resource_doi": xresource.doi,
                }
                cmd = addResourceInSpecialIssueXmlCmd(params)
                cmd.do(self)

        # Update the top journal first year and last year
        self.update_collection_years(journal_id, issue)

        # The collection maybe updated with update_collection_years and the assign_doi param (col.last_doi)
        # Update issue before returning the object.
        # Note that refresh_from_db does not update ForeignKey fields, we can't simply call issue.refresh_from_db()
        issue.my_collection.refresh_from_db()

        # Used in post_do
        self._prod_deployed_date_iso_8601_date_str = xissue.prod_deployed_date_iso_8601_date_str

        return issue

    def post_do(self, resource=None):
        super().post_do(resource)

        # If the issue XML has a last-modified date, keep it; otherwise create one.
        if resource.last_modified is None:
            resource.last_modified = timezone.now()
            resource.save()

        # On ptf-tools, if the issue XML has a prod_deployed_date,
        # propagate it to the Articles/Issue.
        # A possible data restoration (with importExtraDataPtfCmd) may overwrite prod_deployed_date
        if self._prod_deployed_date_iso_8601_date_str and settings.SITE_NAME == "ptf_tools":
            prod_deployed_date = model_helpers.parse_date_str(
                self._prod_deployed_date_iso_8601_date_str
            )
            journal_site = model_helpers.get_site_mersenne(resource.my_collection.pid)
            if journal_site:
                model_helpers.update_deployed_date(resource, journal_site, prod_deployed_date)

        if self.extra_folder:
            # Restore the extra data (extid false_positive...) saved in a json
            ptf_cmds.importExtraDataPtfCmd(
                {"pid": resource.pid, "import_folder": self.extra_folder}
            ).do()

680 

681 

class addResourceInSpecialIssueXmlCmd(addXmlCmd):
    """
    addResourceXmlCmd: adds/remove resource from special issue
    """

    xcontainer = None  # the special issue (required param)
    resource_doi = ""  # DOI of the resource to attach to the special issue
    xresource = None
    seq = 0  # position of the resource inside the special issue
    citation = ""
    provider = None

    def __init__(self, params=None):
        super().__init__(params)
        self.required_params.extend(["xcontainer"])

    def internal_do(self):
        """Attach the resource identified by self.resource_doi to the special issue.

        Returns the created ResourceInSpecialIssue.
        Raises ValueError if the resource is already present (temporary behaviour).
        """
        super().internal_do()
        # for later, check the type of the resource first
        resource_in_special_issue = model_helpers.get_resource_in_special_issue_by_doi(
            self.resource_doi
        )
        resource_doi = self.resource_doi

        # if self.xcontainer:
        container = model_helpers.get_container(self.xcontainer.pid)

        seq = self.seq
        # needs_to_restore_resource = False

        if resource_in_special_issue is not None:
            # temporary
            raise ValueError(
                "First step of developpement require to manually delete all resources in special issue"
            )
        # self.provider = self.xresource.provider
        # 2 is the id of ptf_tools. If we are not in ptf tools we are dealing with jats article which has no citation
        if settings.SITE_ID == 2:
            citation = self.xresource["citation"]
        else:
            citation = ""
        params = {
            # "xobj": self.xresource,
            "obj_doi": resource_doi,
            "container": container,
            "seq": seq,
            "citation": citation,
            # "provider": self.provider,
        }

        cmd = ptf_cmds.addResourceInSpecialIssuePtfCmd(params)
        resource_in_special_issue = cmd.do(self)

        return resource_in_special_issue

736 

737 

738class addArticleXmlCmd(addXmlCmd): 

739 """ 

740 addArticleXmlCmd: adds/remove an issue 

741 

742 Exception raised: 

743 - exceptions.ResourceExists during do if the article already exists 

744 - exceptions.ResourceDoesNotExist 

745 during undo if the Article does not exist 

746 during do if the serial/issue/provider does not exist 

747 <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value> 

748 """ 

749 

750 xarticle = None 

751 journal = None 

752 issue = None 

753 provider = None 

754 provider_col = None 

755 assign_doi = False 

756 full_text_folder = "" 

757 xml_format = "xmldata_jats" 

758 # restricted_mode is used by maxiDML. We do not try to import all the metadata, but only a subset 

759 restricted_mode = False 

760 # standalone is used to import isolated article, without issues 

761 standalone = False 

762 seq = ( 

763 0 # seq is used by the breadcrumbs. Generate it if it's not specified in the XML (ex: PCJ) 

764 ) 

765 keep_translations = False 

766 

    def set_collection(self, collection):
        # Import the article into an existing collection;
        # the provider is inherited from the collection.
        self.journal = collection
        self.provider = collection.provider

770 

    def set_xml_format(self, xml_format):
        # Name of the xmldata module to use (default "xmldata_jats")
        self.xml_format = xml_format

773 

    def set_provider(self, provider):
        # Provider used for the article itself
        self.provider = provider

776 

    def set_provider_col(self, provider_col):
        # Optional distinct provider for the collection (falls back to self.provider)
        self.provider_col = provider_col

779 

    def set_article_single_mode(self):
        """Parse self.tree as a standalone JATS article and collect the parser warnings."""
        self.xarticle = jats_parser.JatsArticle(tree=self.tree)
        self.warnings.extend(self.xarticle.warnings)

        # TODO: MaxiDML: allow the creation of an issue on the fly
        # if not self.provider:
        #     self.provider = model_helpers.get_provider_by_name(self.xarticle.provider)
        #
        # xmldata_jats.set_pid_type(self.provider.pid_type)
        #
        # bdy = etree.tostring(self.xarticle.journal.tree).decode("utf-8")
        # cmd = addCollectionsXmlCmd({'body': bdy,
        #                             'xml_format': self.xml_format,
        #                             'coltype': "journal"})
        # cmd.set_provider(self.provider_col if self.provider_col else self.provider)
        # self.journal = cmd.do()[0]
        #
        # self.issue = model_helpers.get_container(self.xarticle.issue_id)
        # if self.issue is None:
        #     # need to create the issue
        #     date = datetime.datetime.strptime(self.xarticle.date_published_iso_8601_date_str,
        #                                       '%Y-%m-%d')
        #     pid = "{name}_{year}".format(name=self.journal.pid, year=date.year)
        #     self.issue = model_helpers.get_container(pid)
        #     if self.issue is None:
        #         params = {'ctype': 'issue', 'year': date.year, 'pid': pid,
        #                   'last_modified_iso_8601_date_str': datetime.datetime.now().strftime(
        #                       "%Y-%m-%d %H:%M:%S"), 'volume': self.xarticle.volume,
        #                   # if copy binary, need from_folder / to_folder
        #                   }
        #
        #         cmd = ptf_cmds.addContainerPtfCmd(params)
        #         cmd.add_collection(self.journal)
        #         cmd.set_provider(self.provider)
        #         self.issue = cmd.do()

    def get_oai_identifier(self):
        # OAI identifier of the article, as extracted by the JATS parser
        return self.xarticle.oai_identifier

818 

    def update_xobj_with_body(self):
        """Fill self.xarticle.body (and body_xml) when the full text lives in a separate file.

        Two cases:
          - CEDRICS import: the full text comes from a separate file (PDF extraction
            for the CEDRAM TeX folder, otherwise a "<pid>.xml" next to full_text_folder)
          - articles with a "pii" attribute: read the body from the Numdam acquisition tree
        """
        # CEDRICS import: the full text comes from a separate file
        if self.full_text_folder and not self.xarticle.body:
            if self.full_text_folder == settings.CEDRAM_TEX_FOLDER:
                # Extract the text from the article's first PDF stream
                text = ""
                locs = [
                    stream["location"]
                    for stream in self.xarticle.streams
                    if stream["mimetype"] == "application/pdf"
                ]
                if locs:
                    full_pdf_location = os.path.join(self.full_text_folder, locs[0])
                    text = utils.pdf_to_text(full_pdf_location)
                self.xarticle.body = text
            else:
                # NOTE(review): plain string concatenation — full_text_folder is
                # presumably expected to end with "/"; confirm with callers
                full_text_file = self.full_text_folder + self.xarticle.pid + ".xml"

                with open(full_text_file, mode="rb") as file_:
                    body = file_.read()

                parser = etree.XMLParser(huge_tree=True, recover=True)
                tree = etree.fromstring(body, parser=parser)
                node = tree.find("body")
                self.xarticle.body = xml_utils.get_text_from_node(node)
                self.xarticle.body_xml = xml_utils.get_xml_from_text("body", self.xarticle.body)
        elif not self.xarticle.body_xml and hasattr(self.xarticle, "pii"):
            # HACK: hard-coded acquisition path (Numdam-specific)
            full_text_file = os.path.join(
                "/numdam_dev/acquisition/donnees_traitees",
                self.journal.pid,
                self.issue.pid,
                self.xarticle.pid,
                self.xarticle.pid + ".xml",
            )
            if os.path.isfile(full_text_file):
                with open(full_text_file, mode="rb") as file_:
                    body = file_.read()

                parser = etree.XMLParser(huge_tree=True, recover=True)
                tree = etree.fromstring(body, parser=parser)
                node = tree.find("body")
                self.xarticle.body = xml_utils.get_text_from_node(node)
                self.xarticle.body_xml = xml_utils.get_xml_from_text("body", self.xarticle.body)

861 

    def internal_do(self):
        """Import one article: delete any existing version, then re-create it.

        Returns the new Article. Raises exceptions.ResourceExists if the
        article already exists and neither update nor standalone mode is on.
        """
        super().internal_do()

        # No parsed article but a journal: fall back to single-article mode
        if self.xarticle is None and self.journal is not None:
            # self.restricted_mode = True
            self.set_article_single_mode()
            self.update = True
        else:
            self.update = False

        # Derive a pid from the DOI when the XML does not provide one
        if self.xarticle.pid is None:
            self.xarticle.pid = (
                self.xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_")
            )

        # Translations whose full text is a separate HTML file: read the file
        # from from_folder and store both the HTML and its plain text.
        for xtranslated_article in self.xarticle.translations:
            for xtream in xtranslated_article.streams:
                if xtream["mimetype"] == "text/html":
                    if self.from_folder is None:
                        raise ValueError(
                            "The article has its full text in a separate HTML file. You need to set from_folder"
                        )

                    location = os.path.join(self.from_folder, xtream["location"])
                    body_html = resolver.get_body(location)
                    body = xml_utils.get_text_from_xml_with_mathml(body_html)
                    xtranslated_article.body_html = body_html
                    xtranslated_article.body = body

        # Same treatment for the article's own HTML full-text stream
        for stream in self.xarticle.streams:
            if stream["mimetype"] == "text/html":
                location = os.path.join(self.from_folder, stream["location"])
                body_html = resolver.get_body(location)
                body = xml_utils.get_text_from_xml_with_mathml(body_html)
                self.xarticle.body_html = body_html
                self.xarticle.body = body

        # Look up an existing article, preferably by DOI
        if self.xarticle.doi:
            article = model_helpers.get_article_by_doi(self.xarticle.doi)
        else:
            article = model_helpers.get_article(self.xarticle.pid)
        needs_to_restore_article = False

        if article is not None:
            if self.update or self.standalone:
                if self.standalone:
                    self.provider = article.provider

                # Save the data that is not part of the XML metadata
                # (and optionally the translations) before deleting the article
                needs_to_restore_article = True
                backup_obj_not_in_metadata(article)

                if self.keep_translations:
                    backup_translation(article)

                cmd = ptf_cmds.addArticlePtfCmd(
                    {
                        "pid": article.pid,
                        "to_folder": self.to_folder,  # delete the files to be safe
                    }
                )
                cmd.set_object_to_be_deleted(article)
                cmd.undo()
            else:
                raise exceptions.ResourceExists(f"Article {self.xarticle.pid} already exists")

        # Override seq
        if self.standalone and article is not None:
            self.xarticle.seq = article.seq
        elif (
            not self.standalone and self.issue and int(self.xarticle.seq) == 0 and self.seq != 0
        ) or (hasattr(self, "pii") and self.seq != 0):
            self.xarticle.seq = self.seq

        # Get the article's text (body) for SolR if it is empty from the PDF
        self.update_xobj_with_body()

        params = {
            "xobj": self.xarticle,
            "pid": self.xarticle.pid,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            # Only assign a DOI when asked to and when the XML has none
            "assign_doi": self.assign_doi and not self.xarticle.doi,
            "solr_commit": False,
        }

        cmd = ptf_cmds.addArticlePtfCmd(params)
        if self.issue or not self.standalone:
            cmd.set_container(self.issue)
        cmd.add_collection(self.journal)
        article = cmd.do(self)

        # Attach the binary/linked resources declared in the XML
        self.add_objects_with_location(self.xarticle.ext_links, article, "ExtLink")
        self.add_objects_with_location(self.xarticle.streams, article, "DataStream")
        if not self.restricted_mode:
            self.add_objects_with_location(
                self.xarticle.supplementary_materials, article, "SupplementaryMaterial"
            )

        # Figures are only stored when the site displays bodies (or in ptf_tools)
        if (
            hasattr(settings, "SHOW_BODY") and settings.SHOW_BODY
        ) or settings.SITE_NAME == "ptf_tools":
            self.add_objects_with_location(self.xarticle.figures, article, "RelatedObject")

        # Attach the streams of each translated article created by the sub-command
        for xtrans_article, trans_article in zip(
            self.xarticle.translations, cmd.cmd.translated_articles
        ):
            self.add_objects_with_location(xtrans_article.streams, trans_article, "DataStream")

        # Restore the data backed up before the deletion above
        if needs_to_restore_article:
            restore_obj_not_in_metadata(article)

            if self.keep_translations:
                restore_translation(article)

        return article

977 

978 

class addTranslatedArticleXmlCmd(addXmlCmd):
    """
    addTranslatedArticleXmlCmd: adds/remove translations.
    The original article is not changed
    The current translations are first removed

    Attributes:
        lang: language code of the translation to add/replace
        html_file_name: HTML full text file (already copied on disk by upload/views)
        pdf_file_name: PDF file to declare as a datastream
        date_published_str: ISO-8601 date; when set, the PDF is assumed to exist already
    """

    lang = ""
    html_file_name = ""
    pdf_file_name = ""
    date_published_str = ""

    def internal_do(self):
        """Replace the translation in self.lang of the article found in the XML tree.

        Raises exceptions.ResourceDoesNotExist if the article is not in the database.
        """
        super().internal_do()

        xarticle = jats_parser.JatsArticle(tree=self.tree)
        article = model_helpers.get_article(xarticle.pid)

        if article is None:
            # Bug fix: report the parsed article's pid. self.xarticle is not set
            # by this command, so the previous f-string raised an AttributeError
            # instead of the intended exception.
            raise exceptions.ResourceDoesNotExist(f"Article {xarticle.pid} does not exist")

        # Merge existing article with new translation:
        # keep every translation except the one in self.lang, which is replaced below
        data_article = model_data_converter.db_to_article_data(article)
        new_translations = [
            translation
            for translation in data_article.translations
            if translation.lang != self.lang
        ]

        for xtrans_article in xarticle.translations:
            if xtrans_article.lang == self.lang:
                # Upload/views has copied the HTML file on disk
                # Add a DataStream.
                # TODO: check if the datastream is not already present
                if self.html_file_name:
                    data = model_data.create_datastream()
                    data["rel"] = "full-text"
                    data["mimetype"] = "text/html"
                    data["location"] = self.html_file_name
                    xtrans_article.streams.append(data)

                if self.pdf_file_name:
                    # Create a pdf file
                    # pdf-translate needs the article/sub-article XML
                    # Simply add a datastream for now
                    # The new Article created in Django will be complete
                    # But generate the PDF file at the end
                    data = model_data.create_datastream()
                    data["rel"] = "full-text"
                    data["mimetype"] = "application/pdf"
                    data["location"] = self.pdf_file_name
                    xtrans_article.streams.append(data)

                if self.date_published_str:
                    xtrans_article.date_published_iso_8601_date_str = self.date_published_str

                new_translations.append(xtrans_article)

        data_article.translations = new_translations

        # Re-import the merged article (standalone: keep the container untouched)
        cmd = addArticleXmlCmd(
            {
                "xarticle": data_article,
                "use_body": False,
                "issue": article.my_container,
                "standalone": True,
                "from_folder": self.from_folder,
            }
        )
        cmd.set_collection(article.get_collection())
        article = cmd.do()

        # pdf-translate needs the article/sub-article XML
        xml = ptf_cmds.exportPtfCmd(
            {
                "pid": article.pid,
                "with_body": False,
                "with_djvu": False,
                "article_standalone": True,
                "collection_pid": settings.COLLECTION_PID,
            }
        ).do()

        tex.create_translated_pdf(
            article,
            xml,
            self.lang,
            os.path.join(self.from_folder, self.pdf_file_name),
            os.path.join(self.from_folder, self.html_file_name),
            # If the date_published is specified, we assume that the PDF already exists
            skip_compilation=self.date_published_str != "",
        )

        return article

1073 

1074 

class addPCJArticleXmlCmd(addXmlCmd):
    """
    addPCJArticleXmlCmd:
    """

    html_file_name = ""

    def internal_do(self):
        """Parse the article XML and import it standalone into self.issue/self.collection."""
        super().internal_do()

        xarticle = jats_parser.JatsArticle(tree=self.tree)

        # Declare the HTML full text as a datastream when a file name was given
        if self.html_file_name:
            stream = model_data.create_datastream()
            stream["rel"] = "full-text"
            stream["mimetype"] = "text/html"
            stream["location"] = self.html_file_name
            xarticle.streams.append(stream)

        add_cmd = addArticleXmlCmd(
            {
                "xarticle": xarticle,
                "use_body": False,
                "issue": self.issue,
                "standalone": True,
                "from_folder": self.from_folder,
            }
        )
        add_cmd.set_collection(self.collection)
        return add_cmd.do()

1107 

1108 

class addBookXmlCmd(addXmlCmd):
    """
    addBookXmlCmd: adds/remove a book

    Exception raised:
        - exceptions.ResourceExists during do if the book already exists
        - exceptions.ResourceDoesNotExist
            during undo if the Book does not exist
            during do if the serial/provider does not exist
            <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError during undo if resources are still published
    """

    provider = None
    import_oai_mode = False
    journal = None
    xml_format = "xmldata_jats"
    xbook = None
    _collection = None
    no_bib = False  # Ignore the references during the import (used in Geodesic)

    def set_provider(self, provider):
        """Inject the provider instead of resolving it from the XML metadata."""
        self.provider = provider

    def add_parts(self, xparts, pseq):
        """Add a list of book parts, numbering them sequentially from 1."""
        if xparts:
            seq = 1
            for xpart in xparts:
                self.add_part(xpart, seq, pseq)
                seq += 1

    def add_part(self, xpart, seq, pseq):
        """Add one book part (stored as an Article) and recurse into its sub-parts.

        Raises exceptions.ResourceExists if the part is already in the database.
        """
        if xpart is None:
            return

        # An Article is used to store a book part in the database
        article = model_helpers.get_article(xpart.pid)

        if article is not None:
            raise exceptions.ResourceExists(f"BookPart {xpart.pid} already exists")

        params = {
            "xobj": xpart,
            "pid": xpart.pid,
            "seq": seq,
            "pseq": pseq,
            # "deployed": deployed,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        cmd = ptf_cmds.addBookPartPtfCmd(params)
        cmd.set_container(self.book)
        cmd.add_collection(self._collection)
        article = cmd.do(self)

        self.add_objects_with_location(xpart.ext_links, article, "ExtLink")
        self.add_objects_with_location(xpart.streams, article, "DataStream")

        self.add_parts(xpart.parts, seq)

    def set_import_oai_mode(self):
        """In OAI mode, an existing book is replaced instead of raising ResourceExists."""
        self.import_oai_mode = True

    def internal_do(self):
        """Parse (or reuse) the book metadata, delete any existing book, and import it."""
        super().internal_do()

        #######################################################################
        # Get xbook

        if self.import_oai_mode:
            xmldata = globals()[self.xml_format]
            xbook = xmldata.Book(self.tree)
            self.journal = model_helpers.get_collection("GDML_Books")

        else:
            if self.xbook:
                xbook = self.xbook
            else:
                xbook = jats_parser.BitsBook(tree=self.tree, no_bib=self.no_bib)
                self.warnings.extend(xbook.warnings)

        #######################################################################
        # Get existing book if any

        if not self.provider:
            provider = model_helpers.get_provider_by_name(xbook.provider)
            self.provider = provider

        book_id = xbook.pid
        book = model_helpers.get_container(book_id)

        #######################################################################
        # Delete any existing book

        if book is not None:
            if self.import_oai_mode:
                publisher = book.my_publisher

                # Note: the existing collection is not removed even if it no longer has a resource
                # TODO: urls/commands to add/update/delete a collection

                # Removes the book
                cmd = ptf_cmds.addContainerPtfCmd()
                cmd.set_object_to_be_deleted(book)
                cmd.undo()

                if publisher and publisher.publishes.count() == 0:
                    self.remove_publisher(publisher)
            else:
                raise exceptions.ResourceExists("Book %s already exists" % book_id)

        #######################################################################
        # Add new book

        if xbook.incollection:
            colid = xbook.incollection[0].pid
            self._collection = model_helpers.get_collection(colid)
            if self._collection is None:
                raise exceptions.ResourceDoesNotExist(f"The collection {colid} does not exist")
        elif self.import_oai_mode:
            self._collection = self.journal

        params = {
            "xobj": xbook,
            "pid": xbook.pid,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        cmd = ptf_cmds.addContainerPtfCmd(params)
        cmd.add_collection(self._collection)
        # Bug fix: use self.provider. The local `provider` is only bound when the
        # provider was not injected via set_provider(); using it unconditionally
        # raised a NameError in that case. self.provider is always valid here.
        cmd.set_provider(self.provider)

        book = cmd.do(self)
        self.book = book

        self.add_objects_with_location(xbook.ext_links, book, "ExtLink")
        self.add_objects_with_location(xbook.related_objects, book, "RelatedObject")
        self.add_objects_with_location(xbook.streams, book, "DataStream")

        # self.add_metadata_parts(xbook, book) TODO support Metadataparts ?

        #######################################################################
        # Add Book parts

        # JatsIssue is an iterator (has the __iter__ function)
        # TODO make JatsBook an iterator as well ?
        self.add_parts(xbook.parts, 0)

        # Update the collection first year and last year
        for incol in xbook.incollection:
            self.update_collection_years(incol.pid, book)

        return book

1266 

1267 

1268###################################################################################### 

1269###################################################################################### 

1270# 

1271# Update Commands 

1272# 

1273###################################################################################### 

1274###################################################################################### 

1275 

1276 

class updateCollectionsXmlCmd(addXmlCmd):
    """
    updateSerialsXmlCmd: updates one or more journals

    Exception raised:
        - exceptions.ResourceDoesNotExist during do if the Collection does not exist
        - RuntimeError if undo is called
    """

    @staticmethod
    def _xcol_from_node(node):
        """Parse a collection-like XML node into its data object, or None for unknown tags.

        The same dispatch was previously duplicated in both loops of internal_do.
        """
        if node.tag == "collection-meta":
            return jats_parser.BitsCollection(tree=node)
        if node.tag == "journal-meta":
            return jats_parser.JatsJournal(tree=node)
        if node.tag == "publication-meta":
            return jats_parser.MathdocPublication(tree=node)
        return None

    def update_collection(self, xcol, do_update=True):
        """Update one collection from its parsed metadata.

        When do_update is False, only checks that the collection exists.
        Raises exceptions.ResourceDoesNotExist for an unknown collection.
        Returns the (possibly updated) Collection, or None if xcol is falsy.
        """
        if not xcol:
            return None

        provider = model_helpers.get_provider_by_name(xcol.provider)

        col_id = xcol.pid
        col = model_helpers.get_collection(col_id)

        if col is None:
            raise exceptions.ResourceDoesNotExist("Collection %s does not exist" % xcol.pid)

        if do_update:
            params = {
                "xobj": xcol,
                "solr_commit": False,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
            }

            # The existing other_ids, abstracts are removed in updateCollectionDatabaseCmd::internal_do
            # and the new ones are added in the post_do (addResourceDatabaseCmd)

            cmd = ptf_cmds.updateCollectionPtfCmd(params)
            cmd.set_provider(provider)
            # cmd.set_publisher(publisher)
            col = cmd.do()

            # The existing extlinks are removed in updateCollectionDatabaseCmd::internal_do
            self.add_objects_with_location(xcol.ext_links, col, "ExtLink")
            resolver.copy_binary_files(col, self.from_folder, self.to_folder)

            # if publisher:
            #     model_helpers.publish_resource(publisher, col)

        return col

    def internal_do(self):
        """Validate every collection of the tree, then update them all."""
        super().internal_do()

        collections = []

        # First pass: check that all the collections exist (no update performed),
        # so a missing one aborts the command before any data is touched.
        for node in self.tree:
            xcol = self._xcol_from_node(node)
            self.update_collection(xcol, False)

        # Second pass: perform the actual updates
        for node in self.tree:
            xcol = self._xcol_from_node(node)
            if xcol is None:
                # Unknown tag: skip instead of crashing on xcol.warnings
                continue
            self.warnings.extend(xcol.warnings)
            xcol = self.update_collection(xcol)
            collections.append(xcol)

        return collections

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")

1355 

1356 

1357##################################################################### 

1358# 

1359# replaceIssueXmlCmd: updates an issue 

1360# 

1361# Exception raised: 

1362# - exceptions.ResourceDoesNotExist during do if the Collection/Issue/Provider does not exist 

1363# <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value> 

1364# - RuntimeError if undo is called 

1365# 

1366###################################################################### 

class replaceIssueXmlCmd(addXmlCmd):
    """
    replaceIssueXmlCmd: updates an issue

    Exception raised:
        - exceptions.ResourceDoesNotExist during do if the Collection/Issue/Provider does not exist
          <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError if undo is called
    """

    def internal_do(self):
        """Delete the existing issue and re-import it from the parsed XML."""
        super().internal_do()

        xissue = jats_parser.JatsIssue(tree=self.tree)
        self.warnings.extend(xissue.warnings)

        xjournal = xissue.journal
        journal_id = xjournal.pid
        journal = model_helpers.get_collection(journal_id)

        if journal is None:
            raise exceptions.ResourceDoesNotExist("Journal %s does not exist" % xjournal.pid)

        issue_id = xissue.pid
        issue = model_helpers.get_container(issue_id)

        if issue is None:
            raise exceptions.ResourceDoesNotExist("Issue %s does not exist" % issue_id)

        publisher = issue.my_publisher

        cmd = ptf_cmds.addContainerPtfCmd()
        cmd.set_object_to_be_deleted(issue)
        cmd.undo()

        # Bug fix: my_publisher can be None; guard before dereferencing,
        # consistent with addBookXmlCmd.internal_do
        if publisher and publisher.publishes.count() == 0:
            self.remove_publisher(publisher)

        # update the journal first and last year
        for the_issue in journal.content.all():
            self.update_collection_years(journal_id, the_issue, False)

        journal.save()

        cmd = addIssueXmlCmd(
            {
                "xissue": xissue,
                "use_body": False,
                "solr_commit": False,
                "extra_folder": self.from_folder,
                "to_folder": self.to_folder,
            }
        )
        issue = cmd.do()

        return issue

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")

1421 

1422 

class updateBookXmlCmd(addXmlCmd):
    """
    updateBookXmlCmd: updates a book

    Exception raised:
        - exceptions.ResourceDoesNotExist during do if the Book does not exist
        - RuntimeError if undo is called
    """

    no_bib = False  # Ignore the references during the import (used in Geodesic)

    def internal_do(self):
        """Replace an existing book: delete it, then re-import from the parsed XML."""
        super().internal_do()

        # Parse the incoming BITS XML
        xbook = jats_parser.BitsBook(tree=self.tree, no_bib=self.no_bib)
        self.warnings.extend(xbook.warnings)

        existing_book = model_helpers.get_container(xbook.pid)
        if existing_book is None:
            raise exceptions.ResourceDoesNotExist("Book %s does not exist" % xbook.pid)

        # unpublish and delete the existing publisher if necessary
        # self.update_publisher(xbook, book)

        # Note: the existing collection is not removed even if it no longer has a resource
        # TODO: urls/commands to add/update/delete a collection

        # Remove the current version of the book...
        delete_cmd = ptf_cmds.addContainerPtfCmd()
        delete_cmd.set_object_to_be_deleted(existing_book)
        delete_cmd.undo()

        # ...then import it again from the already-parsed metadata
        add_cmd = addBookXmlCmd(
            {
                "xbook": xbook,
                "use_body": False,
                "solr_commit": False,
                "from_folder": self.from_folder,
                "no_bib": self.no_bib,
                "to_folder": self.to_folder,
            }
        )
        return add_cmd.do()

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")

1473 

1474 

class addOrUpdateContainerXmlCmd(addXmlCmd):
    """
    addOrUpdateContainerXmlCmd: detects Container type from xml and adds or updates an issue or a book

    just detect Container type (do not check params etc.)
    """

    keep_metadata = False
    keep_translations = False
    backup_folder = None
    full_text_folder = ""
    fake = False  # Parse the XML but do not import
    no_bib = False  # Ignore the references during the import (used in Geodesic)
    embargo = False  # Import only the open articles (used in Geodesic)

    def check_params(self):
        super().check_params()

    def internal_do(self):
        """Dispatch to the issue or book command based on the XML root tag."""
        super().internal_do()

        root_tag = normalize(self.tree.tag)

        # Build the delegate command matching the container type
        if root_tag == "journal-issue":
            delegate = addOrUpdateIssueXmlCmd(
                {
                    "body": self.body,
                    "keep_metadata": self.keep_metadata,
                    "keep_translations": self.keep_translations,
                    "backup_folder": self.backup_folder,
                    "to_folder": self.to_folder,
                    "from_folder": self.from_folder,
                    "xml_file_folder": self.xml_file_folder,
                    "fake": self.fake,
                    "no_bib": self.no_bib,
                    "embargo": self.embargo,
                }
            )
        elif root_tag == "book":
            delegate = addOrUpdateBookXmlCmd(
                {
                    "body": self.body,
                    "from_folder": self.from_folder,
                    "to_folder": self.to_folder,
                    "no_bib": self.no_bib,
                    "embargo": self.embargo,
                }
            )
        else:
            raise RuntimeError("addOrupdateContainer command can't detect container type")

        result = delegate.do()
        self.warnings.extend(delegate.warnings)
        return result

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")

1534 

1535 

1536class addOrUpdateIssueXmlCmd(addXmlCmd): 

1537 """ 

1538 addOrUpdateIssueXmlCmd: adds or updates an issue 

1539 

1540 Adds an issue if it is not in the system or updates the issue if it is already there. 

1541 By default, no DOI is assigned for the articles. Set assign_doi to True. 

1542 

1543 from_folder / to_folder (declared in addXmlCmd): location of binary files to copy 

1544 backup_folder: folder where extra data (extid false_positive...) are (to be) stored in a json 

1545 

1546 keep_metadata: 

1547 True if you want to back up extra data (icon, dates, matching ids, ...) in the backup_folder 

1548 Default: False 

1549 Note: backup_obj_not_in_metadata / restore_obj_not_in_metadata is always called 

1550 We always want to preserve GraphicalAbstracts (they are not in the issue XML) 

1551 

1552 keep_translations: 

1553 True if you want back up/restore translations. 

1554 Default: False 

1555 Note: When you post an article to a journal (test) website, the translation is declared in the XML 

1556 But if you import a Cedrics article in Trammel, the XML does not list translations 

1557 

1558 Exception raised: 

1559 - exceptions.ResourceDoesNotExist during do if the Collection/Issue/Provider does not exist 

1560 <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value> 

1561 - RuntimeError if undo is called 

1562 """ 

1563 

1564 keep_metadata = False 

1565 keep_translations = False 

1566 backup_folder = None 

1567 assign_doi = False 

1568 full_text_folder = "" 

1569 

1570 xissue = None 

1571 fake = False # Parse the XML but do not import 

1572 no_bib = False # Ignore the references during the import (used in Geodesic) 

1573 embargo = False # Import only the open articles (used in Geodesic) 

1574 

1575 def check_params(self): 

1576 super().check_params() 

1577 

1578 if self.keep_metadata and self.assign_doi: 1578 ↛ 1579line 1578 didn't jump to line 1579 because the condition on line 1578 was never true

1579 raise ValueError("keep_metadata and assign_doi cannot both be true.") 

1580 

1581 if self.keep_metadata and self.backup_folder is None: 1581 ↛ 1582line 1581 didn't jump to line 1582 because the condition on line 1581 was never true

1582 raise ValueError("backup_folder needs to be set when keep_metadata is true.") 

1583 

1584 def internal_do(self): 

1585 super().internal_do() 

1586 

1587 if not self.xissue: 1587 ↛ 1605line 1587 didn't jump to line 1605 because the condition on line 1587 was always true

1588 self.xissue = xissue = jats_parser.JatsIssue( 

1589 tree=self.tree, 

1590 from_folder=self.from_folder, 

1591 no_bib=self.no_bib, 

1592 ) 

1593 if len(xissue.warnings) > 0 and self.xml_file_folder: 1593 ↛ 1594line 1593 didn't jump to line 1594 because the condition on line 1593 was never true

1594 warnings = [] 

1595 warning_keys = [] 

1596 for warning in xissue.warnings: 

1597 for key, value in warning.items(): 

1598 if value not in warning_keys: 

1599 warning_keys.append(value) 

1600 warnings.append({key: value}) 

1601 for warning in warnings: 

1602 print(warning) 

1603 self.warnings.extend(xissue.warnings) 

1604 else: 

1605 xissue = self.xissue 

1606 

1607 if self.fake: 1607 ↛ 1608line 1607 didn't jump to line 1608 because the condition on line 1607 was never true

1608 return 

1609 

1610 xjournal = xissue.journal 

1611 journal_id = xjournal.pid 

1612 journal = model_helpers.get_collection(journal_id) 

1613 

1614 if journal is None: 1614 ↛ 1615line 1614 didn't jump to line 1615 because the condition on line 1614 was never true

1615 raise exceptions.ResourceDoesNotExist("Journal %s does not exist" % xjournal.pid) 

1616 

1617 existing_issue = model_helpers.get_container(xissue.pid) 

1618 

1619 if existing_issue: 

1620 if self.embargo and existing_issue.embargo(): 1620 ↛ 1623line 1620 didn't jump to line 1623 because the condition on line 1620 was never true

1621 # Geodesic is for open access articles. 

1622 # We do not want to import the issues under embargo 

1623 print(f"Embargo, ignore {xissue.pid}") 

1624 return None 

1625 

1626 if self.keep_metadata: 

1627 # On commence par faire un backup de l'existant en cas de bug. 

1628 ptf_cmds.exportPtfCmd( 

1629 { 

1630 "pid": existing_issue.pid, 

1631 "with_internal_data": True, 

1632 "with_binary_files": False, 

1633 "for_archive": False, 

1634 "export_folder": os.path.join(settings.MERSENNE_TMP_FOLDER, "backup"), 

1635 } 

1636 ).do() 

1637 

1638 # On sauvegarde les données additionnelles (extid, deployed_date,...) 

1639 # dans un json qui sera ré-importé avec l'import du nouvel issue 

1640 params = { 

1641 "pid": existing_issue.pid, 

1642 "export_folder": self.backup_folder, 

1643 "export_all": True, 

1644 "with_binary_files": True, 

1645 } 

1646 ptf_cmds.exportExtraDataPtfCmd(params).do() 

1647 

1648 for article in existing_issue.article_set.all(): 

1649 backup_obj_not_in_metadata(article) 

1650 if self.keep_translations: 1650 ↛ 1651line 1650 didn't jump to line 1651 because the condition on line 1650 was never true

1651 backup_translation(article) 

1652 # changer nom de variable resource 

1653 for resource_in_special_issue in existing_issue.resources_in_special_issue.all(): 1653 ↛ 1656line 1653 didn't jump to line 1656 because the loop on line 1653 never started

1654 # External article can be part of special issue and backup can bug if so 

1655 

1656 if resource_in_special_issue.resource: 

1657 backup_obj_not_in_metadata(resource_in_special_issue.resource) 

1658 

1659 # On efface l'issue existant, sinon l'import va se plaindre d'articles existants 

1660 

1661 cmd = ptf_cmds.addContainerPtfCmd() 

1662 cmd.set_object_to_be_deleted(existing_issue) 

1663 cmd.undo() 

1664 

1665 # update the journal first and last year 

1666 for the_issue in journal.content.all(): 

1667 self.update_collection_years(journal_id, the_issue, False) 

1668 

1669 journal.save() 

1670 else: 

1671 issue_to_appear = model_helpers.get_issue_to_appear(journal_id) 

1672 

1673 # Dans le cas des AIF, les articles du volume à paraitre sont déplacés 

1674 # dans un nouveau volume avant publication (de AIF_0__0_ vers AIF_2018... par ex) 

1675 # La 1ère fois, AIF_2018_ n'est pas encore dans PTF et existing_issue vaut None. 

1676 # Exemple : AIF_0_0 contient doi1, doi2 et doi3, AIF_2018 contient doi1 et doi2. 

1677 # L'import va échouer car on ne peut avoir 2 fois le même article. 

1678 # La solution d'effacer AIF_0_0 n'est pas bonne car on perd doi3. 

1679 # Il faut supprimer les articles en commun (de _0__0 et 2018_) avant l'import 

1680 # du nouveau volume sinon il va y avoir des conflits. 

1681 

1682 if issue_to_appear and xissue.pid != issue_to_appear.pid: 

1683 # On sauvegarde les données additionnelles (extid, deployed_date,...) 

1684 # dans un json qui sera ré-importé avec l'import du nouvel issue 

1685 # ainsi que image associée via ptf-tools 

1686 if self.keep_metadata: 1686 ↛ 1696line 1686 didn't jump to line 1696 because the condition on line 1686 was always true

1687 params = { 

1688 "pid": issue_to_appear.pid, 

1689 "force_pid": xissue.pid, 

1690 "export_folder": self.backup_folder, 

1691 "export_all": True, 

1692 "with_binary_files": True, 

1693 } 

1694 ptf_cmds.exportExtraDataPtfCmd(params).do() 

1695 

1696 for xarticle in xissue.articles: 

1697 if isinstance(xarticle, dict): 1697 ↛ 1698line 1697 didn't jump to line 1698 because the condition on line 1697 was never true

1698 xdoi = xarticle["doi"] 

1699 else: 

1700 xdoi = getattr(xarticle, "doi") 

1701 article = issue_to_appear.article_set.filter(doi=xdoi).first() 

1702 if article: 1702 ↛ 1696line 1702 didn't jump to line 1696 because the condition on line 1702 was always true

1703 backup_obj_not_in_metadata(article) 

1704 if self.keep_translations: 1704 ↛ 1705line 1704 didn't jump to line 1705 because the condition on line 1704 was never true

1705 backup_translation(article) 

1706 

1707 params = {"to_folder": self.to_folder} # pour suppression des binaires 

1708 cmd = ptf_cmds.addArticlePtfCmd(params) 

1709 cmd.set_object_to_be_deleted(article) 

1710 cmd.undo() 

1711 

1712 # si backup_folder est différent de None, alors addIssueXmlCmd.post_do() utilise importExtraDataPtfCmd 

1713 cmd = addIssueXmlCmd( 

1714 { 

1715 "xissue": xissue, 

1716 "use_body": False, 

1717 # "body": self.body, 

1718 "assign_doi": self.assign_doi, 

1719 "full_text_folder": self.full_text_folder, # Cedrics: the full text for SolR is in a separate file 

1720 "extra_folder": self.backup_folder, 

1721 "from_folder": self.from_folder, 

1722 "to_folder": self.to_folder, 

1723 "no_bib": self.no_bib, 

1724 "embargo": self.embargo, 

1725 "solr_commit": False, 

1726 } 

1727 ) 

1728 new_issue = cmd.do() 

1729 

1730 if new_issue: 1730 ↛ 1749line 1730 didn't jump to line 1749 because the condition on line 1730 was always true

1731 new_articles = new_issue.article_set.all() 

1732 

1733 # Avec l'option self.assign_doi, on vérifie que les doi ont bien été assignés 

1734 for article in new_articles: 

1735 if self.assign_doi and article.doi is None: 1735 ↛ 1736line 1735 didn't jump to line 1736 because the condition on line 1735 was never true

1736 raise exceptions.ResourceHasNoDoi("The article %s has no DOI" % article.pid) 

1737 

1738 # TODO garbage collector on articles no longer in the issue 

1739 restore_obj_not_in_metadata(article) 

1740 if self.keep_translations: 1740 ↛ 1741line 1740 didn't jump to line 1741 because the condition on line 1740 was never true

1741 restore_translation(article) 

1742 if new_issue.ctype == "issue_special": 1742 ↛ 1743line 1742 didn't jump to line 1743 because the condition on line 1742 was never true

1743 resources_in_special_issue = new_issue.resources_in_special_issue.all() 

1744 for resource_in_special_issue in resources_in_special_issue: 

1745 # External article can be part of special issue and restore can bug if so 

1746 if resource_in_special_issue.resource: 

1747 restore_obj_not_in_metadata(resource_in_special_issue.resource) 

1748 

1749 return new_issue 

1750 

    def internal_undo(self):
        """Undo is not supported for update commands.

        Raises:
            RuntimeError: always.
        """
        raise RuntimeError("update commands do not support the undo")

1753 

1754 

class addOrUpdateBookXmlCmd(addXmlCmd):
    """Remove an existing book (if present), then re-import it from its BITS XML.

    The XML is parsed with jats_parser.BitsBook unless a pre-parsed ``xbook``
    is supplied. When an existing container with the same pid is found, it is
    deleted and the collection first/last years are recomputed before the
    fresh import is delegated to addBookXmlCmd.
    """

    xbook = None
    no_bib = False  # Ignore the references during the import (used in Geodesic)

    def internal_do(self):
        super().internal_do()

        if self.xbook:
            xbook = self.xbook
        else:
            # Parse the XML tree prepared by addXmlCmd
            xbook = jats_parser.BitsBook(tree=self.tree, no_bib=self.no_bib)
            self.warnings.extend(xbook.warnings)

        existing_book = model_helpers.get_container(xbook.pid)

        if existing_book:
            # Fetch the collection before the container is removed,
            # then delete the existing book via the add-command's undo
            delete_cmd = ptf_cmds.addContainerPtfCmd()
            delete_cmd.set_object_to_be_deleted(existing_book)
            delete_cmd.undo()

            collection = existing_book.get_collection()

            # Recompute the collection first and last year without the old book
            for container in collection.content.all():
                self.update_collection_years(collection.pid, container, False)

            collection.save()

        add_cmd = addBookXmlCmd(
            {
                "xbook": xbook,
                "use_body": False,
                # "body": self.body,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
                "no_bib": self.no_bib,
                "solr_commit": False,
            }
        )
        return add_cmd.do()

1797 

1798 

class updateBibitemCitationXmlCmd(baseCmd):
    """Rebuild the citation fields (xml/html/tex) of a bibitem.

    The external ids currently attached to the bibitem are collected and
    passed to jats_parser.update_bibitem_xml, and the regenerated citation
    strings are stored back on the bibitem.
    """

    def __init__(self, params=None):
        self.bibitem = None

        super().__init__(params)

        self.required_params.extend(["bibitem"])

    def set_bibitem(self, bibitem):
        self.bibitem = bibitem

    def internal_do(self):
        super().internal_do()

        # Snapshot the bibitem's external ids, keyed by id type
        new_ids = {
            bid.id_type: {
                "id_type": bid.id_type,
                "id_value": bid.id_value,
                "checked": bid.checked,
                "false_positive": bid.false_positive,
            }
            for bid in self.bibitem.bibitemid_set.all()
        }

        xbibitem = jats_parser.update_bibitem_xml(self.bibitem, new_ids)
        self.warnings.extend(xbibitem.warnings)

        # Persist the regenerated citation representations
        self.bibitem.citation_xml = xbibitem.citation_xml
        self.bibitem.citation_html = xbibitem.citation_html
        self.bibitem.citation_tex = xbibitem.citation_tex
        self.bibitem.save()

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")

1834 

1835 

1836###################################################################################### 

1837###################################################################################### 

1838# 

1839# Import Commands 

1840# 

1841###################################################################################### 

1842###################################################################################### 

1843 

1844 

class collectEntireCollectionXmlCmd(baseCmd):
    """
    Collect the PIDs of all the XML of a collection
    (collection.xml, issues.xml) found in a given folder.

    results: the list of pids
    """

    def __init__(self, params=None):
        self.pid = None
        self.folder = None

        super().__init__(params)

        self.required_params.extend(["pid", "folder"])

    def internal_do(self):
        super().internal_do()
        # Each entry is a (pid, file) pair; only the pid is kept
        entries = resolver.iterate_collection_folder(self.folder, self.pid)
        return [issue_pid for issue_pid, _file in entries]

1864 

1865 

class importEntireCollectionXmlCmd(baseCmd):
    """
    Import all the XML of a collection (collection.xml, issues.xml) of a given folder

    Required params: pid (collection pid), from_folder.
    Optional flags control the source format (Cedrics or JATS), what existing
    data to preserve, and progress reporting via callback(job, i).
    """

    def __init__(self, params=None):
        self.pid = None
        self.from_folder = None
        self.to_folder = None
        self.backup_folder = None
        self.keep_metadata = False  # Backup/Restore existing data not in the XML
        self.keep_translations = False  # Backup/Restore existing translations

        self.with_cedrics = True
        self.from_cedrics = False  # The entire collection is in Cedrics format
        self.date_for_pii = False  # Fetch publication_date for Elsevier articles
        self.first_issue = ""  # Skip issues before this one (passed to the folder iterator)
        self.fake = False  # Parse the XML but do not import

        self.no_bib = False  # Ignore the references during the import (used in Geodesic)
        self.embargo = False  # Import only the open articles (used in Geodesic)

        # Progress reporting: when callback is set, callback(job, i) is invoked
        # after each imported issue; otherwise pids are printed to stdout
        self.caller = None
        self.callback = None
        self.job = None

        super().__init__(params)

        self.required_params.extend(["pid", "from_folder"])

    def internal_do(self):
        """Import the collection record, then every issue of the folder.

        Workflow:
        1. Create the Collection if it does not exist yet (from collection.xml).
        2. If with_cedrics, wipe the collection's Solr documents first.
        3. Import each issue found by resolver.iterate_collection_folder,
           either directly from Cedrics XML or from JATS XML.
        4. If with_cedrics, also import the issues listed in the Cedrics
           metadata folder (CEDRAM_XML_FOLDER/<pid>/metadata).
        """
        super().internal_do()

        pid = self.pid
        resource = model_helpers.get_resource(pid)
        if not resource and not self.fake:
            # The collection is not in the database yet: create it from its XML
            body = resolver.get_archive_body(self.from_folder, pid, None)
            journals = addCollectionsXmlCmd(
                {"body": body, "from_folder": self.from_folder, "to_folder": self.to_folder}
            ).do()
            if not journals:
                raise ValueError(self.from_folder + " does not contain a collection")
            resource = journals[0]

        obj = resource.cast()

        if obj.classname != "Collection":
            raise ValueError(pid + " does not contain a collection")

        if self.with_cedrics:
            # with_cedrics means that you want to import everything from scratch
            # Delete solr documents (01/28/2020: Solr can have multiple docs with the same PID)
            cmd = solr_cmds.solrDeleteCmd({"q": "pid:" + self.pid + "*"})
            cmd.do()

        i = 0
        for pid, file_ in resolver.iterate_collection_folder(
            self.from_folder, self.pid, self.first_issue
        ):
            if self.callback is None:
                print(pid)

            if self.from_cedrics:
                cmd = importCedricsIssueDirectlyXmlCmd(
                    {
                        "colid": self.pid,
                        "input_file": file_,
                        "remove_email": False,
                        "remove_date_prod": True,
                        "copy_files": True,
                        "force_dois": False,
                    }
                )
            else:
                body = resolver.get_body(file_)
                xml_file_folder = os.path.dirname(file_)
                cmd = addOrUpdateContainerXmlCmd(
                    {
                        "body": body,
                        "from_folder": self.from_folder,
                        "to_folder": self.to_folder,
                        "backup_folder": self.backup_folder,  # Read extra data (if any) stored in a json file
                        "xml_file_folder": xml_file_folder,  # when article.XML are in separate files
                        "keep_metadata": self.keep_metadata,  # Backup/Restore existing data not in the XML
                        "keep_translations": self.keep_translations,  # Backup/Restore existing translations
                        "no_bib": self.no_bib,
                        "embargo": self.embargo,
                        # Needed in Trammel
                        "fake": self.fake,
                    }
                )
            cmd.do()

            i += 1
            if self.callback:
                self.callback(self.job, i)

        if self.with_cedrics:
            src_folder = os.path.join(settings.CEDRAM_XML_FOLDER, self.pid, "metadata")

            # One XML file per issue in the Cedrics metadata folder
            xml_files = [
                os.path.join(src_folder, f)
                for f in os.listdir(src_folder)
                if os.path.isfile(os.path.join(src_folder, f)) and f.endswith(".xml")
            ]
            for xml_file in xml_files:
                if self.callback is None:
                    print(xml_file)

                cmd = importCedricsIssueXmlCmd(
                    {
                        "colid": self.pid,
                        "input_file": xml_file,
                        "from_folder": self.from_folder,
                        "to_folder": self.to_folder,
                    }
                )
                cmd.do()

1986 

1987 

class importCedricsIssueXmlCmd(baseCmd):
    """Import a Cedrics issue.

    The Cedrics XML (input_file) is converted to JATS by an external
    cedram2ptf.py script, parsed into an xissue, then either imported in the
    database or only compared to it (diff_only).
    Required param: colid (collection pid).
    """

    def __init__(self, params=None):
        self.colid = None
        self.input_file = None
        self.remove_email = True  # passed to cedram2ptf.py ("-e" keeps emails)
        self.remove_date_prod = True  # passed to cedram2ptf.py ("-t")
        self.diff_only = False  # compare with the database instead of importing
        self.body = None  # raw XML text of the converted issue
        self.xissue = None  # parsed issue (jats_parser.JatsIssue)
        self.copy_files = True

        super().__init__(params)

        self.required_params.extend(["colid"])

    def import_full_text(self, issue):
        """
        Some journals want to display the full text in HTML (CRCHIM/CRGEOS/CEBIOL)
        Read the XML file and convert the body in HTML
        """
        tex_src_folder = resolver.get_cedram_issue_tex_folder(self.colid, issue.pid)
        tex_folders, _ = resolver.get_cedram_tex_folders(self.colid, issue.pid)

        if len(tex_folders) > 0:
            i = 0
            # NOTE(review): pairs each article with tex_folders[i] by position —
            # assumes issue.article_set order matches the tex folder order; confirm.
            for article in issue.article_set.all():
                article_folder = tex_folders[i]
                xml_file = os.path.join(
                    tex_src_folder, article_folder, "FullText", article_folder + ".xml"
                )

                # Record the article folder name as the article's "ojs-id"
                cmd = ptf_cmds.updateResourceIdPtfCmd(
                    {"id_type": "ojs-id", "id_value": article_folder}
                )
                cmd.set_resource(article)
                cmd.do()

                if os.path.isfile(xml_file):
                    with open(xml_file, encoding="utf-8") as f:
                        body = f.read()

                    cmd = addBodyInHtmlXmlCmd(
                        {
                            "body": body,
                            "from_folder": settings.CEDRAM_XML_FOLDER,
                            # needed to copy binary files such as images
                            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER,  # idem
                        }
                    )
                    cmd.set_article(article)
                    cmd.do()

                i += 1

    def import_in_db(self):
        """
        Import Cedrics issue from /cedram_dev/exploitation/cedram
        This workflow is no longer used.

        Returns the imported issue.
        """

        # Cedrics: the full text for SolR is in a separate file
        full_text_folder = os.path.dirname(os.path.dirname(self.input_file)) + "/plaintext/"

        params = {
            "assign_doi": False,
            "full_text_folder": full_text_folder,
            "keep_metadata": True,
            "keep_translations": True,
            "use_body": False,
            "xissue": self.xissue,
            "backup_folder": settings.MERSENNE_TMP_FOLDER,
            "from_folder": settings.CEDRAM_XML_FOLDER,
            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER if self.copy_files else None,
        }

        # params['body'] = self.body

        cmd = addOrUpdateIssueXmlCmd(params)
        issue = cmd.do()
        self.warnings.extend(cmd.get_warnings())

        # resolver.copy_binary_files(
        #     issue,
        #     settings.CEDRAM_XML_FOLDER,
        #     settings.MERSENNE_TEST_DATA_FOLDER)

        self.import_full_text(issue)

        return issue

    def compare_issue(self):
        """Compare the parsed issue (self.xissue) with the one in the database.

        Returns a tuple (result, issues_diff, xissue). result is the
        comparator verdict; it stays True when the issue is not yet in the
        database. issues_diff holds the differences found.
        """
        xissue = self.xissue
        issues_diff = {}
        result = True

        time1 = timezone.now()

        new_dois = [article.doi for article in xissue.articles]

        # Prefetch everything db_to_issue_data will need, to avoid N+1 queries
        article_qs = Article.objects.filter(doi__in=new_dois).prefetch_related(
            "abstract_set",
            "kwd_set",
            "subj_set",
            "datastream_set",
            "relatedobject_set",
            "resourcecount_set",
            "contributions",
            "contributions__contribaddress_set",
            "bibitem_set__bibitemid_set",
            "bibitem_set__contributions",
            "bibitem_set__contributions__contribaddress_set",
        )

        issue = None
        try:
            issue = (
                Container.objects.select_related("my_collection", "my_publisher")
                .prefetch_related(
                    Prefetch("article_set", queryset=article_qs, to_attr="articles_from_doi")
                )
                .get(sites__id=settings.SITE_ID, pid=xissue.pid)
            )
        except Container.DoesNotExist:
            pass

        if issue:
            data_issue = model_data_converter.db_to_issue_data(issue, issue.articles_from_doi)

            time2 = timezone.now()
            delta = time2 - time1

            # NOTE(review): the value of this expression is discarded —
            # probably meant to be printed/logged as elapsed seconds
            delta.seconds + delta.microseconds / 1e6
            print(delta)

            # Handle xml cmds side effects (ex: "numdam" changed into "mathdoc", ...)
            model_data_comparator.prepare_issue_for_comparison(xissue)

            issue_comparator = model_data_comparator.IssueDataComparator()

            result = issue_comparator.compare(data_issue, xissue, issues_diff)

        return (result, issues_diff, xissue)

    def delete_previous_file(self, output_folder):
        """Remove the previous converted file (if any) and make sure the
        destination folders exist. Returns the output file path."""
        basename = os.path.basename(self.input_file)

        output_file = os.path.join(output_folder, self.colid, basename)
        if os.path.isfile(output_file):
            os.remove(output_file)

        os.makedirs(output_folder, exist_ok=True)
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

        return output_file

    def import_cedrics_issue(self):
        """
        Import Cedrics issue from /cedram_dev/exploitation/cedram
        This workflow is no longer used.
        Cedrics issues are imported from /cedram_dev/production_tex/CEDRAM
        (see importCedricsIssueDirectlyXmlCmd below)

        Converts the Cedrics XML to JATS with the external cedram2ptf.py
        script, then parses the result into self.body / self.xissue.
        Raises RuntimeError if the conversion fails.
        """

        output_folder = settings.MERSENNE_TMP_FOLDER
        ptf_xsl_folder = settings.PTF_XSL_FOLDER
        log_file = os.path.join(output_folder, settings.MERSENNE_LOG_FILE)

        # 1. Delete the previous file
        output_file = self.delete_previous_file(output_folder)

        # 2. Transform the cedrics XML into JATS
        cmd_folder = os.path.join(ptf_xsl_folder, "cedram")

        cmd_str = 'cd {}; {} cedram2ptf.py -v -x {} -p {} -o {} -b "" -l {} {} {} > {} 2>&1'.format(
            cmd_folder,
            os.path.join(settings.VIRTUALENV_DIR, "bin/python"),
            "-s" if self.colid in settings.MERSENNE_SEMINARS else "",
            self.input_file,
            output_folder,
            log_file + "1",
            # option -e for cedram2ptf.py for not removing email
            "-e" if not self.remove_email else "",
            "-t" if self.remove_date_prod else "",
            log_file,
        )

        log_file2 = log_file + "2"
        with open(log_file2, "w", encoding="ascii") as file_:
            file_.write(cmd_str + "\n")

            sys.path.append(ptf_xsl_folder + "/lib")

            try:
                result = subprocess.check_output(cmd_str, shell=True)
            except Exception as e:
                # On failure, append the converter's log to the message
                with open(log_file) as logfile_:
                    logfile_body = logfile_.read()
                message = str(e) + "\n" + logfile_body + "\n"
                file_.write(message)
                file_.close()
                raise RuntimeError(message)

            file_.write(str(result) + "\n")

        # Check if the output_file has been created
        if not os.path.isfile(output_file):
            raise RuntimeError("The file was not converted in JATS")

        with open(output_file, encoding="utf-8") as f:
            self.body = f.read()

        parser = etree.XMLParser(
            huge_tree=True, recover=True, remove_blank_text=True, remove_comments=True
        )
        tree = etree.fromstring(self.body.encode("utf-8"), parser=parser)
        self.xissue = jats_parser.JatsIssue(tree=tree)
        self.warnings.extend(self.xissue.warnings)

    def internal_do(self):
        super().internal_do()

        # Parse the Cedrics file (fills self.xissue) unless one was supplied
        if not self.xissue:
            self.import_cedrics_issue()

        result = None

        if self.diff_only:
            result = self.compare_issue()
        else:
            result = self.import_in_db()

        return result

2220 

2221 

# import from /cedram_dev/production_tex/CEDRAM
class importCedricsIssueDirectlyXmlCmd(importCedricsIssueXmlCmd):
    """Import a Cedrics issue by parsing the Cedrics XML directly,
    without the Cedrics -> JATS conversion of the parent class."""

    def __init__(self, params=None):
        self.is_seminar = False
        self.article_folders = None  # list of article tex folder names, set in import_cedrics_issue
        self.force_dois = True  # raise if an article has no DOI
        super().__init__(params)

    def read_file(self, filename, skip_lines=2):
        """Read a file and return its lines, dropping the first lines.

        Lines with 0-based index <= skip_lines are dropped (i.e. the first
        skip_lines + 1 lines — presumably the XML declaration/doctype of each
        article file, so the files can be concatenated; confirm).
        Falls back to iso-8859-1 when the file is not valid UTF-8.
        """
        i = 0
        lines = []
        try:
            with open(filename, encoding="utf-8") as fr:
                for line in fr:
                    if i > skip_lines:
                        lines.append(line)
                    i += 1
        except UnicodeDecodeError:
            # Not UTF-8: restart the read with the legacy encoding
            i = 0
            lines = []
            with open(filename, encoding="iso-8859-1") as fr:
                for line in fr:
                    if i > skip_lines:
                        lines.append(line)
                    i += 1

        return lines

    def import_cedrics_issue(self):
        """
        Parse the Cedrics XML directly, without Cedrics -> JATS transformation
        The deplace_fasc script is no longer needed, but the Cedrics issue XML has to be created
        Workflow
        1. Get the list of articles from /cedram_dev/production_tex/CEDRAM
        2. Cat the article XML files into one issue.XML
        3. Read the Cedrics issue.XML

        :return:
        """

        output_folder = settings.MERSENNE_TMP_FOLDER
        output_file = self.delete_previous_file(output_folder)

        # Derive the issue pid from the input file name
        basename = os.path.basename(self.input_file)
        if "-cdrxml" in basename:
            pid = basename.split("-cdrxml.")[0]
        else:
            pid = basename.split(".xml")[0]

        # 1. Get the list of articles
        tex_src_folder = resolver.get_cedram_issue_tex_folder(self.colid, pid)
        self.article_folders, self.dois = resolver.get_cedram_tex_folders(self.colid, pid)

        # 2. Create the issue XML file
        with open(output_file, "w", encoding="utf-8") as fw:
            # 2.a. Start the issue.xml based on @pid-cdrxml.xml
            fw.write('<?xml version="1.0" encoding="utf-8" standalone="no"?>\n')
            fw.write('<!DOCTYPE cedram SYSTEM "/home/cedram/XML/dtd/cedram.dtd">\n')
            fw.write("<cedram>\n")

            lines = self.read_file(self.input_file)
            for line in lines:
                fw.write(line)

            # 2.b. Cat the article XML files
            for basename in self.article_folders:
                src_file = os.path.join(tex_src_folder, basename, basename + "-cdrxml.xml")

                lines = self.read_file(src_file)
                for line in lines:
                    fw.write(line)

            fw.write("</cedram>\n")

        # 3. Read the Cedrics issue.XML
        with open(output_file, encoding="utf-8") as f:
            self.body = f.read()

        parser = etree.XMLParser(
            huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
        )
        tree = etree.fromstring(self.body.encode("utf-8"), parser=parser)
        self.xissue = cedrics_parser.CedricsIssue(
            tree=tree,
            is_seminar=self.is_seminar,
            ignore_date_published=self.remove_date_prod,
            article_folders=self.article_folders,
            dois=self.dois,
        )
        if self.force_dois:
            # Every article must have a DOI when force_dois is set
            for xarticle in self.xissue.articles:
                if xarticle.doi is None:
                    raise ValueError(xarticle.pid, "n'a pas de doi")

        self.warnings.extend(self.xissue.warnings)

    def import_in_db(self):
        """Add or update the parsed issue in the database, then import the
        HTML full text of its articles. Returns the issue."""
        params = {
            "assign_doi": False,
            "full_text_folder": settings.CEDRAM_TEX_FOLDER,  # the full text for SolR is in a separate file
            "keep_metadata": True,
            "keep_translations": True,  # The cedrics XML does not have the translations. backup/restore them.
            "use_body": False,
            "xissue": self.xissue,
            "backup_folder": settings.MERSENNE_TMP_FOLDER,  # temp folder used to backup/restore info during the import
            "from_folder": settings.CEDRAM_TEX_FOLDER,
            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER if self.copy_files else None,
        }

        cmd = addOrUpdateIssueXmlCmd(params)
        issue = cmd.do()
        self.warnings.extend(cmd.get_warnings())

        self.import_full_text(issue)

        return issue

2338 

2339 

class addCedricsIssueXmlCmd(addXmlCmd):
    """Parse a Cedrics issue XML tree and return the resulting xissue."""

    assign_doi = False
    full_text_folder = ""
    import_folder = None
    prod_deployed_date_iso_8601_date_str = None
    xissue = None
    remove_blank_text = False
    is_seminar = False

    def internal_do(self):
        super().internal_do()

        # Parse the tree prepared by addXmlCmd and keep the result
        parsed_issue = cedrics_parser.CedricsIssue(tree=self.tree, is_seminar=self.is_seminar)
        self.xissue = parsed_issue

        return parsed_issue

2355 

2356 

class addorUpdateCedricsArticleXmlCmd(baseCmd):
    """Add or update a single Cedrics article inside an existing issue.

    The article XML is read from CEDRAM_TEX_FOLDER/<colid>/<issue>/<folder>,
    parsed, and imported in Django/Solr; existing extra data (extid,
    deployed_date, ...) and translations are backed up and restored.
    """

    def __init__(self, params=None):
        self.container_pid = None
        self.article_folder_name = None

        super().__init__(params)

        self.required_params.extend(["container_pid", "article_folder_name"])

    def internal_do(self):
        super().internal_do()

        issue = model_helpers.get_container(self.container_pid)
        if not issue:
            raise exceptions.ResourceDoesNotExist(f"Issue {self.container_pid} does not exist")

        colid = issue.my_collection.pid
        article_folder = os.path.join(
            settings.CEDRAM_TEX_FOLDER, colid, self.container_pid, self.article_folder_name
        )

        # 1. Read the Cedrics article.XML
        input_file = os.path.join(article_folder, f"{self.article_folder_name}-cdrxml.xml")
        with open(input_file, encoding="utf-8") as f:
            body = f.read()

        # 2. Parse the file and create an xarticle
        is_seminar = colid in settings.MERSENNE_SEMINARS
        parser = etree.XMLParser(
            huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
        )
        tree = etree.fromstring(body.encode("utf-8"), parser=parser)
        xarticle = cedrics_parser.CedricsArticle(
            tree=tree,
            colid=colid,
            issue_id=self.container_pid,
            is_seminar=is_seminar,
            ignore_date_published=True,
            article_folder=self.article_folder_name,
        )
        if xarticle.doi is None:
            raise ValueError(xarticle.pid, "n'a pas de doi")

        # Get the article position in its issue (seq) to preserve its order
        article_folders, dois = resolver.get_cedram_tex_folders(colid, self.container_pid)
        i = 1
        for folder in article_folders:
            if folder == self.article_folder_name:
                xarticle.seq = i
            i += 1

        existing_article = model_helpers.get_article(xarticle.pid)
        temp_folder = settings.MERSENNE_TMP_FOLDER

        # 3. Backup/delete the existing article
        if existing_article:
            # Start with a backup of the existing issue, in case of a bug
            ptf_cmds.exportPtfCmd(
                {
                    "pid": self.container_pid,
                    "with_internal_data": True,
                    "with_binary_files": False,
                    "for_archive": False,
                    "export_folder": os.path.join(temp_folder, "backup"),
                }
            ).do()

            # Save the additional data (extid, deployed_date, ...) in a JSON file
            params = {
                "pid": existing_article.pid,
                "export_folder": temp_folder,
                "export_all": True,
                "with_binary_files": True,
            }
            ptf_cmds.exportExtraDataPtfCmd(params).do()

            backup_obj_not_in_metadata(existing_article)
            backup_translation(existing_article)

            # No need to delete the existing article: addArticleXmlCmd does it in standalone mode

        # 4. Add the article in Django/SolR
        params = {
            "xarticle": xarticle,
            "issue": issue,
            "standalone": True,
            "use_body": False,  # No self.body with the content of the XML file; xarticle is passed directly
            "full_text_folder": settings.CEDRAM_TEX_FOLDER,  # the full text for SolR is in a separate file
            # temp folder used to backup/restore info during the import
            "from_folder": settings.CEDRAM_TEX_FOLDER,
            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER,
            "keep_translations": True,
        }

        cmd = addArticleXmlCmd(params)
        cmd.set_collection(issue.my_collection)
        article = cmd.do()

        # 5. Read the full text in HTML
        xml_file = os.path.join(article_folder, "FullText", self.article_folder_name + ".xml")
        if os.path.isfile(xml_file):
            with open(xml_file, encoding="utf-8") as f:
                body = f.read()

            cmd = addBodyInHtmlXmlCmd(
                {
                    "body": body,
                    "from_folder": settings.CEDRAM_XML_FOLDER,
                    # needed to copy binary files such as images
                    "to_folder": settings.MERSENNE_TEST_DATA_FOLDER,  # idem
                    "remove_blank_text": False,
                }
            )
            cmd.set_article(article)
            cmd.do()

        # 6. Add the ojs-id for ptf-tools
        cmd = ptf_cmds.updateResourceIdPtfCmd(
            {"id_type": "ojs-id", "id_value": self.article_folder_name}
        )
        cmd.set_resource(article)
        cmd.do()

        # 7. Restore the additional data (extid, deployed_date, ...)
        if existing_article:
            ptf_cmds.importExtraDataPtfCmd(
                {"pid": existing_article.pid, "import_folder": temp_folder}
            ).do()

            restore_obj_not_in_metadata(article)
            restore_translation(article)

        return article

2490 

2491 

class transformBodyInHtmlXmlCmd(addXmlCmd):
    """
    Transform the JATS body of an article into HTML text.

    TODO: handle images,...
    """

    use_body = False

    def internal_do(self):
        super().internal_do()

        # Compile the HTML stylesheet and apply it to the parsed tree
        transform = etree.XSLT(etree.parse(settings.PTF_HTML_XSL))
        html_root = transform(self.tree).getroot()

        # Keep only the inner XML of the <main> element of the result
        main_node = html_root.find("body/article/main")
        return xmldata_jats.innerxml(main_node).decode("utf-8")

2515 

2516 

class addBodyInHtmlXmlCmd(addXmlCmd):
    """
    addBodyInHtmlXmlCmd: read the JATS body of an article
    and create the corresponding HTML

    The article is identified either by pid or by a pre-fetched Article
    (set_article); its html-image related objects are replaced and its
    body_html/body_tex/body_xml fields updated.

    TODO: handle images,... manage warnings for unused tag ?
    """

    def __init__(self, params=None):
        self.article = None
        self.pid = None

        super().__init__(params)

    def set_article(self, article):
        self.article = article

    def pre_do(self):
        """Resolve article/pid from one another; one of them is required."""
        super().pre_do()

        if self.pid is None and self.article is None:
            raise ValueError("pid et article sont vides")

        if self.article is None:
            self.article = model_helpers.get_article(self.pid)

        if self.pid is None:
            self.pid = self.article.pid

    def internal_do(self):
        super().internal_do()

        xarticle = jats_parser.JatsArticle(tree=self.tree, pid=self.pid)
        # should we collect the warnings of the HTML parsing?
        # self.warnings.extend(xarticle.warnings)

        # Replace the article's html-image related objects with the parsed figures
        self.article.relatedobject_set.filter(rel="html-image").delete()
        self.add_objects_with_location(xarticle.figures, self.article, "RelatedObject")

        params = {
            "body_html": xarticle.body_html,
            "body_tex": xarticle.body_tex,
            "body_xml": xarticle.body_xml,
            "use_page_count": False,
        }

        cmd = ptf_cmds.updateArticlePtfCmd(params)
        cmd.set_article(self.article)
        cmd.do()

        # copy_binary_files will call resolver.copy_html_images
        # to copy the article images
        # because updateArticlePtfCmd is not from addPtfCmd, need to copy files here

        resolver.copy_html_images(
            self.article, settings.MERSENNE_TEST_DATA_FOLDER, settings.CEDRAM_XML_FOLDER
        )

2574 

2575 

2576class updateCacheXmlCmd(baseCmd): 

2577 """ 

2578 recreate the citation_html field of the bibitems 

2579 

2580 Params: colid: pid of the collection to process 

2581 """ 

2582 

    def __init__(self, params=None):
        # colid: pid of the collection to process (required)
        self.colid = None
        # start_id: optional container pid; earlier containers are skipped
        self.start_id = None

        super().__init__(params)

        self.required_params.extend(["colid"])

2590 

    def update_article(self, xarticle):
        """Refresh the cached html/tex fields of one article from its parsed XML.

        Updates titles, abstracts and bibitem citation fields, and — when
        settings.SHOW_BODY is set — the article body.

        Raises:
            exceptions.ResourceDoesNotExist: if the article is not in the database.
        """
        article = model_helpers.get_article(xarticle.pid)
        if article is None:
            raise exceptions.ResourceDoesNotExist(f"Article {xarticle.pid} does not exist")

        article.title_html = xarticle.title_html
        article.title_tex = xarticle.title_tex
        article.trans_title_html = xarticle.trans_title_html
        article.trans_title_tex = xarticle.trans_title_tex
        article.save()

        # NOTE(review): the zip pairs parsed and stored items by position —
        # assumes both sequences are in the same order; confirm.
        for xabstract, abstract in zip(xarticle.abstracts, article.abstract_set.all()):
            abstract.value_html = xabstract["value_html"]
            abstract.value_tex = xabstract["value_tex"]
            abstract.save()

        # for xkwd_group, kwd_group in zip(xarticle.kwd_groups, article.kwdgroup_set.all()):
        #     kwd_group.value_html = xkwd_group['value_html']
        #     kwd_group.value_tex = xkwd_group['value_tex']
        #     kwd_group.save()

        for xbib, bib in zip(xarticle.bibitems, article.bibitem_set.all()):
            bib.citation_html = xbib.citation_html
            bib.citation_tex = xbib.citation_tex
            bib.article_title_tex = xbib.article_title_tex
            bib.chapter_title_tex = xbib.chapter_title_tex
            bib.source_tex = xbib.source_tex
            bib.volume = xbib.volume
            bib.save()

        if hasattr(settings, "SHOW_BODY") and settings.SHOW_BODY:
            params = {
                "body_html": xarticle.body_html,
                "body_tex": xarticle.body_tex,
                "body_xml": xarticle.body_xml,
                "use_page_count": False,
            }

            cmd = ptf_cmds.updateArticlePtfCmd(params)
            cmd.set_article(article)
            cmd.do()

2632 

2633 def internal_do(self): 

2634 super().internal_do() 

2635 

2636 collection = model_helpers.get_collection(self.colid) 

2637 if collection is None: 

2638 raise exceptions.ResourceDoesNotExist(f"Collection {self.colid} does not exist") 

2639 

2640 qs = collection.content.all().order_by("pid") 

2641 start = self.start_id is None 

2642 for container in qs: 

2643 if not start and container.pid == self.start_id: 

2644 start = True 

2645 

2646 if start: 

2647 print(container.pid) 

2648 with_body = hasattr(settings, "SHOW_BODY") and settings.SHOW_BODY 

2649 xml_body = ptf_cmds.exportPtfCmd( 

2650 {"pid": container.pid, "with_body": with_body} 

2651 ).do() 

2652 

2653 parser = etree.XMLParser( 

2654 huge_tree=True, 

2655 recover=True, 

2656 remove_blank_text=False, 

2657 remove_comments=True, 

2658 resolve_entities=True, 

2659 ) 

2660 tree = etree.fromstring(xml_body.encode("utf-8"), parser=parser) 

2661 xissue = jats_parser.JatsIssue(tree=tree) 

2662 

2663 for xarticle in xissue: 

2664 self.update_article(xarticle)