Coverage for src/ptf/cmds/xml_cmds.py: 51%
1259 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-05 09:56 +0000
1import copy
2import datetime
3import os.path
4import subprocess
5import sys
6import traceback
8from lxml import ElementInclude
9from lxml import etree
11from django.conf import settings
12from django.db import transaction
13from django.db.models import Prefetch
14from django.utils import timezone
16from ptf import exceptions
17from ptf import model_data
18from ptf import model_data_comparator
19from ptf import model_data_converter
20from ptf import model_helpers
21from ptf import tex
22from ptf import utils
23from ptf.cmds import ptf_cmds
24from ptf.cmds import solr_cmds
25from ptf.cmds.base_cmds import baseCmd
26from ptf.cmds.xml import xml_utils
27from ptf.cmds.xml.cedrics import cedrics_parser
29# KEEP THIS UNUSED IMPORT THEY ARE USED
30from ptf.cmds.xml.jats import jats_parser
31from ptf.cmds.xml.jats import xmldata as xmldata_jats
32from ptf.cmds.xml.xml_utils import normalize
33from ptf.display import resolver
35# from ptf.models import Resource
36from ptf.models import Article
37from ptf.models import Collection
38from ptf.models import Container
39from ptf.models import Person
40from ptf.models import PtfSite
41from ptf.models import backup_obj_not_in_metadata
42from ptf.models import backup_translation
43from ptf.models import restore_obj_not_in_metadata
44from ptf.models import restore_translation
def find_file(name):
    """Search the MANAGER_XSLT_DIRS trees for a file called *name*.

    Returns the full path of the first match, or None when the file
    is not found in any of the configured directories.
    """
    for search_dir in settings.MANAGER_XSLT_DIRS:
        for dirpath, _dirnames, filenames in os.walk(search_dir):
            if name in filenames:
                return os.path.join(dirpath, name)
    return None
def get_transform(name):
    """Build an lxml XSLT transform from the stylesheet ``<name>.xsl``.

    The stylesheet is looked up with find_file() in settings.MANAGER_XSLT_DIRS.

    Raises:
        FileNotFoundError: if the stylesheet cannot be located.
        (etree.parse(None) would otherwise fail with an obscure error.)
    """
    file_path = find_file(f"{name}.xsl")
    if file_path is None:
        raise FileNotFoundError(f"XSLT stylesheet '{name}.xsl' not found in MANAGER_XSLT_DIRS")
    xslt_doc = etree.parse(file_path)
    return etree.XSLT(xslt_doc)
class addXmlCmd(baseCmd):
    """
    addXmlCmd: base class for commands that take an XML as input
    The XML is passed with the body param

    from_folder / to_folder: location of binary files to copy

    Example with a file:
        f = open('journal.xml')
        body = f.read()
        f.close()
        cmd = add...XmlCmd( { "body":body } )

    Exception raised:
        - ValueError if the init params are empty
    """

    use_body = True  # False when a subclass supplies already-parsed data instead of raw XML
    body: str | None = None  # raw XML text to parse
    tree = None  # lxml element built from body in pre_do
    solr_commit_at_the_end = True
    xml_filename_in_log = None  # where the XML body was logged (see get_logname)
    remove_blank_text = False
    xml_file_folder = None  # base folder used to resolve XInclude hrefs

    def __init__(self, params=None):
        super().__init__(params)

        if self.use_body:
            self.required_params.extend(["body"])

    def get_logname(self):
        """Return a new, unused file name under settings.LOG_DIR to log the XML body.

        Returns "" when LOG_DIR is not configured.
        """
        filename = ""

        if hasattr(settings, "LOG_DIR"):
            i = 0
            today = datetime.date.today()
            basename = str(today) + "-" + self.__class__.__name__ + "-"
            filename = os.path.join(settings.LOG_DIR, basename + str(i) + ".xml")

            # Increment the suffix until we find a file name that does not exist yet
            while os.path.isfile(filename):
                i += 1
                filename = os.path.join(settings.LOG_DIR, basename + str(i) + ".xml")

        return filename

    def pre_do(self):
        """Parse self.body into self.tree (lxml) and log the XML body on disk."""
        super().pre_do()

        if self.use_body:
            # The Cedrics -> JATS XSLT transform manually adds space=preserve around
            # the nodes with mixed-content, but leaves the text unchanged.
            # As such, parsing the Cedrics XML cannot be done with remove_blank_text=True
            # Or the spaces will be removed whereas the JATS XML will keep them.
            # We still need the remove_blank_text=True for JATS XML for all the other nodes
            parser = etree.XMLParser(
                huge_tree=True,
                recover=True,
                remove_blank_text=self.remove_blank_text,
                remove_comments=True,
                resolve_entities=True,
            )

            if self.xml_file_folder is not None:
                if self.xml_file_folder[-1] != "/":
                    self.xml_file_folder += "/"
                # For ElementInclude to find the href
                self.body = self.body.replace(
                    'xmlns:xlink="http://www.w3.org/1999/xlink"', ""
                ).replace("xlink:href", "href")

            tree = etree.fromstring(self.body.encode("utf-8"), parser=parser)

            if self.xml_file_folder is not None:
                ElementInclude.include(tree, base_url=self.xml_file_folder)

            self.tree = tree

            if self.tree is None:
                raise ValueError("tree est vide")

        # Write the xml body on disk
        if hasattr(settings, "LOG_DIR") and self.body and self.use_body:
            self.xml_filename_in_log = self.get_logname()

            with open(self.xml_filename_in_log, "w", encoding="utf-8") as file_:
                file_.write(self.body)

    @transaction.atomic
    def do(self, parent=None):
        """Execute the command inside a DB transaction.

        On failure: rollback Solr, log the traceback in LOG_DIR/cmds.log and re-raise.
        On success: commit Solr (if solr_commit_at_the_end) and return the object(s).
        """
        try:
            obj = super().do(parent)
        except Exception:
            ptf_cmds.do_solr_rollback()

            # Empty sub_cmds to ignore undo
            self.cmds = []

            # Log the failure, with the full traceback, in LOG_DIR/cmds.log
            if hasattr(settings, "LOG_DIR") and self.body and self.use_body:
                with open(
                    os.path.join(settings.LOG_DIR, "cmds.log"), "a", encoding="utf-8"
                ) as file_:
                    file_.write("----------------------\n")

                    if self.xml_filename_in_log is None:
                        self.xml_filename_in_log = self.get_logname()

                    file_.write(self.xml_filename_in_log + " : FAILED\n")
                    exc_type, exc_value, exc_traceback = sys.exc_info()
                    lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
                    for line in lines:
                        file_.write(line + "\n")
                    file_.write("----------------------\n")

            # Bare raise keeps the original traceback intact ("raise e" would rewrite it)
            raise

        if self.solr_commit_at_the_end:
            ptf_cmds.do_solr_commit()

        return obj

    def post_undo(self):
        super().post_undo()

        # Remove Person objects that are no longer referenced
        Person.objects.clean()

    def post_do(self, resource=None):
        """Log the imported pid(s) and a copy of the XML body under LOG_DIR."""
        super().post_do(resource)

        # Remove Person objects that are no longer referenced
        Person.objects.clean()

        if hasattr(settings, "LOG_DIR") and resource and self.use_body:
            today = datetime.date.today()
            basename = str(today) + "-" + self.__class__.__name__

            # resource may be a single object or a list: build a comma-separated pid string
            pids = ""
            first = True
            if isinstance(resource, list):
                for resource_item in resource:
                    if first:
                        first = False
                    else:
                        pids += ", "

                    pids += resource_item.pid
            else:
                pids = resource.pid

            with open(os.path.join(settings.LOG_DIR, "cmds.log"), "a", encoding="utf-8") as file_:
                file_.write(basename + " : " + pids + "\n")

            if hasattr(resource, "my_collection") and resource.my_collection:
                folder = os.path.join(
                    settings.LOG_DIR, resource.get_top_collection().pid, resource.pid
                )
                filename = os.path.join(folder, resource.pid + ".xml")
                resolver.create_folder(folder)
                with open(filename, "w", encoding="utf-8") as file_:
                    file_.write(self.body)

    def undo(self):
        super().undo()

        if self.solr_commit_at_the_end:
            ptf_cmds.do_solr_commit()

    def add_objects_with_location(self, xobjs, resource, cmd_type):
        """Attach located objects (ExtLink/RelatedObject/DataStream/SupplementaryMaterial)
        parsed from the XML to *resource*.

        May raise ResourceExists if the same ExtLink/RelatedObject is added twice.
        XML files themselves are skipped; seq numbers only the kept objects.
        """
        seq = 1

        for xobj in xobjs:
            base = None

            if xobj["base"]:
                base_name = xobj["base"]
                base = model_helpers.get_xmlbase(base_name)
                if base is None:
                    cmd = ptf_cmds.addXmlBasePtfCmd({"base": xobj["base"], "solr_commit": False})
                    base = cmd.do(self)

            rel = xobj["rel"]
            location = xobj["location"]

            params = {
                "rel": rel,
                "mimetype": xobj.get("mimetype", ""),
                "location": location,
                "seq": seq,
                "solr_commit": False,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
            }

            # Ignore XML file
            if params["mimetype"] != "application/xml":
                if "metadata" in xobj:
                    params["metadata"] = xobj["metadata"]

                if "text" in xobj:
                    params["text"] = xobj["text"]

                # TODO: cmd factory ?
                cmd = None
                if cmd_type == "ExtLink":
                    cmd = ptf_cmds.addExtLinkPtfCmd(params)
                elif cmd_type == "RelatedObject":
                    cmd = ptf_cmds.addRelatedObjectPtfCmd(params)
                elif cmd_type == "SupplementaryMaterial":
                    params["caption"] = xobj.get("caption", "")
                    params["supplementary_material"] = True
                    cmd = ptf_cmds.addSupplementaryMaterialPtfCmd(params)
                elif cmd_type == "DataStream":
                    cmd = ptf_cmds.addDataStreamPtfCmd(params)

                # Always try to add an ExtLink or a RelatedObject
                # May raise ResourceExists if the ExtLink/RelatedObject is added twice
                if cmd is not None:
                    cmd.set_base(base)
                    cmd.set_resource(resource)
                    cmd.do(self)

                seq += 1

    @staticmethod
    def remove_publisher(publisher):
        """Delete *publisher* by undoing an addPublisherPtfCmd."""
        cmd = ptf_cmds.addPublisherPtfCmd()
        cmd.set_object_to_be_deleted(publisher)
        cmd.undo()

    @staticmethod
    def update_collection_years(pid, container, save=True):
        """Update the published years (fyear/lyear) of a collection
        (journal/acta/book-series...) so they span *container*'s year.
        """
        collection = Collection.objects.get(pid=pid)
        if container.year:
            year = container.year
            fyear, lyear = model_helpers.get_first_last_years(year)
            fyear = int(fyear)
            lyear = int(lyear)

            # Check for an unset year FIRST: the original order
            # (fyear < collection.fyear or not collection.fyear) raises a
            # TypeError when collection.fyear is None (int < None).
            if not collection.fyear or fyear < collection.fyear:
                collection.fyear = fyear

            if not collection.lyear or lyear > collection.lyear:
                collection.lyear = lyear

            if save:
                collection.save()
class addCollectionsXmlCmd(addXmlCmd):
    """
    addCollectionsXmlCmd: adds/remove a collection

    TODO: merge Collection and Journal ?

    Exception raised:
        - exceptions.ResourceExists during do
            if the Collection already exists
            if the collection defines the same extlink/relatedobject multiple times
        - exceptions.ResourceDoesNotExist
            during undo if the Collection does not exist
            during do of the provider does not exist
            <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError during undo if resources are still published
    """

    provider = None
    xml_format = None

    def set_provider(self, provider):
        self.provider = provider

    def add_collection(self, xcol, update=False):
        """Create (or update, when update=True) one Collection from the parsed XML data."""
        if not xcol:
            return None

        # Prefer the provider declared in the XML, fall back to the command's provider
        provider = (
            model_helpers.get_provider_by_name(xcol.provider) if xcol.provider else self.provider
        )

        collection = model_helpers.get_collection(xcol.pid)
        existing = collection is not None

        if existing and not update:
            raise exceptions.ResourceExists(f"Collection {collection.pid} already exists")

        # Create a collection
        params = {
            "xobj": xcol,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        cmd_class = (
            ptf_cmds.updateCollectionPtfCmd if update and existing else ptf_cmds.addCollectionPtfCmd
        )
        cmd = cmd_class(params)
        cmd.set_provider(provider)
        collection = cmd.do(self)

        self.add_objects_with_location(xcol.ext_links, collection, "ExtLink")

        return collection

    def internal_do(self):
        """Walk the XML tree and create one Collection per <publication-meta> node."""
        super().internal_do()

        collections = []

        if self.tree.tag == "journal-meta":
            raise ValueError(
                "Creation of a journal on the fly from an article is not yet supported"
            )

        for node in self.tree:
            if node.tag == "collection-meta":
                raise ValueError("Collection can only be created from <publication-meta>")
            if node.tag == "journal-meta":
                raise ValueError(
                    "Collection can only be created from <publication-meta>, <journal-meta> are handled while parsing a <journal-issue>"
                )

            xcol = None
            if node.tag == "publication-meta":
                xcol = jats_parser.MathdocPublication(tree=node)

            collections.append(self.add_collection(xcol))

        return collections
class addIssueXmlCmd(addXmlCmd):
    """
    addIssueXmlCmd: adds/remove an issue

    from_folder / to_folder (declared in addXmlCmd): location of binary files to copy

    extra_folder: folder where extra data (extid false_positive...) are stored in a json
    It is used
        - when you call addIssueXmlCmd directly to import from an archive,
        - when you call addOrUpdateIssueXmlCmd and we need to restore extra data after the import

    Exception raised:
        - exceptions.ResourceExists during do if the issue already exists
        - exceptions.ResourceDoesNotExist
            during undo if the Issue does not exist
            during do if the serial/provider does not exist
            <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError during undo if resources are still published
    """

    assign_doi = False
    full_text_folder = ""
    extra_folder = None
    prod_deployed_date_iso_8601_date_str = None
    xissue = None
    count = 0
    no_bib = False  # Ignore the references during the import (used in Geodesic)
    embargo = False  # Import only the open articles (used in Geodesic)
    # Set by internal_do and read back in post_do.
    # Default it here: internal_do can return early (embargo) without setting it,
    # and post_do would otherwise crash with an AttributeError.
    _prod_deployed_date_iso_8601_date_str = None

    def create_child_collection(self, xjournal, journal):
        """Create a sub-collection of *journal* whose pid embeds the (e-)issn."""
        issn = xjournal.issn if xjournal.issn else xjournal.e_issn

        new_xjournal = copy.deepcopy(xjournal)
        new_xjournal.wall = 0
        new_xjournal.pid = f"{xjournal.pid}-{issn}"
        new_xjournal.coltype = journal.coltype

        params = {"xobj": new_xjournal}
        provider = model_helpers.get_provider_by_name("mathdoc")

        cmd = ptf_cmds.addCollectionPtfCmd(params)
        cmd.set_parent(journal)
        cmd.set_provider(provider)

        collection = cmd.do()
        return collection

    def get_historic_collection(self, xjournal, journal):
        """Return the collection matching *xjournal*, creating a child collection if needed.

        With USE_META_COLLECTIONS, *journal* is the top collection and the returned
        collection may be one of its children (matched by issn or pid).
        """
        use_meta_collections = (
            settings.USE_META_COLLECTIONS if hasattr(settings, "USE_META_COLLECTIONS") else False
        )

        if not use_meta_collections:
            return journal

        # meta-collections are used : journal may be the top collection or one of its children

        value = id_type = None

        # Take care of special case of STNB :
        # For that, we ignore the issn of STNB 2nd series
        if xjournal.pid == "JTNB" and xjournal.issn == "0989-5558":
            xjournal.issn = None
            xjournal.e_issn = None
            xjournal.ids = []
        else:
            if xjournal.issn:
                value = xjournal.issn
                id_type = "issn"
            elif xjournal.e_issn:
                value = xjournal.e_issn
                id_type = "e-issn"

        if value:
            # collection has at least one issn
            qs = Collection.objects.filter(resourceid__id_value=value, resourceid__id_type=id_type)
            if qs.exists():
                journal = qs.first()
            else:
                # xjournal does not exist yet.
                journal = self.create_child_collection(xjournal, journal)
        else:
            # collection has no issn
            # NOTE(review): value is None here, so the second pid is "<pid>-None" — confirm intended
            possible_pids = [xjournal.pid, f"{xjournal.pid}-{value}"]
            qs = Collection.objects.exclude(resourceid__id_value__isnull=False).filter(
                pid__in=possible_pids
            )
            if qs.exists():
                journal = qs.first()
            else:
                journal = self.create_child_collection(xjournal, journal)

        return journal

    def internal_do(self):
        """Import one issue and its articles. Returns the Container, or None when
        the issue is skipped because of the embargo."""
        super().internal_do()

        #######################################################################
        # get xissue

        if self.xissue:
            xissue = self.xissue
        else:
            xissue = jats_parser.JatsIssue(tree=self.tree, no_bib=self.no_bib)
            self.warnings.extend(xissue.warnings)

        #######################################################################
        # Check if there is an existing issue / journal

        issue_id = xissue.pid
        issue = model_helpers.get_container(issue_id)

        if issue is not None:
            raise exceptions.ResourceExists(f"Issue {issue_id} already exists")

        xjournal = xissue.journal
        journal_id = xjournal.pid
        journal = model_helpers.get_collection(journal_id)

        # Note: Why use <issue-meta><custom-meta-group><custom-meta> to find the provider and then the journal
        # as there is a <journal-meta> with an id ?
        # The ptf_resource table (Resource objects) are created with only 1 id.
        # When you add a journal, the journal id is the one of its
        # <custom-meta-group><custom-meta> provider.
        # If you want to find the journal of an issue based on the <journal-meta> information, you might
        # have to search among the other ids (ptf_resourceid table, ResourceId objects) : sql JOIN select
        # To avoid the join select, it's better to use <issue-meta><custom-meta-group><custom-meta> to make sure
        # we use the correct provider. A simple select in the ptf_resource table is then needed.
        if journal is None:
            raise exceptions.ResourceDoesNotExist(f"Journal {journal_id} does not exist")

        # Journal is the top collection (ex: AFST)
        # We want to get (or create) the journal that corresponds to the issue
        journal = self.get_historic_collection(xjournal, journal)

        if self.embargo and journal.wall > 0:
            # Geodesic is for open access articles.
            # We do not want to import the issues under embargo
            if resolver.embargo(journal.wall, xissue.year):
                print(f"Embargo, ignore {xissue.pid}")
                return None

        #######################################################################
        # Get provider/publisher

        provider_name = xissue.provider if xissue.provider else "mathdoc"
        provider = model_helpers.get_provider_by_name(provider_name)

        #######################################################################
        # Add the issue

        params = {
            "xobj": xissue,
            "pid": xissue.pid,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        cmd = ptf_cmds.addContainerPtfCmd(params)
        cmd.add_collection(journal)
        cmd.set_provider(provider)
        issue = cmd.do(self)

        self.add_objects_with_location(xissue.ext_links, issue, "ExtLink")
        self.add_objects_with_location(xissue.related_objects, issue, "RelatedObject")
        self.add_objects_with_location(xissue.streams, issue, "DataStream")

        #######################################################################
        # Add the issue's articles

        # JatsIssue is an iterator (has the __iter__ function)
        # you simply iterate the xissue to get its articles
        if xissue.ctype == "issue":
            for seq, xarticle in enumerate(xissue, start=1):
                params = {
                    "xarticle": xarticle,
                    "journal": journal,
                    "issue": issue,
                    "seq": seq,
                    "provider": provider,
                    "assign_doi": self.assign_doi,
                    "full_text_folder": self.full_text_folder,
                    "use_body": False,
                    "from_folder": self.from_folder,
                    "to_folder": self.to_folder,
                    "solr_commit_at_the_end": False,
                }
                cmd = addArticleXmlCmd(params)
                cmd.do(self)
        elif xissue.ctype == "issue_special":
            site = PtfSite.objects.get(id=settings.SITE_ID)
            issue.deploy(site)
            for seq, xresource in enumerate(xissue.articles, start=1):
                params = {
                    "use_body": False,
                    "xcontainer": issue,
                    "seq": seq,
                    "xresource": xresource,
                    "resource_doi": xresource.doi,
                }
                cmd = addResourceInSpecialIssueXmlCmd(params)
                cmd.do(self)

        # Update the top journal first year and last year
        self.update_collection_years(journal_id, issue)

        # The collection maybe updated with update_collection_years and the assign_doi param (col.last_doi)
        # Update issue before returning the object.
        # Note that refresh_from_db does not update ForeignKey fields, we can't simply call issue.refresh_from_db()
        issue.my_collection.refresh_from_db()

        # Used in post_do
        self._prod_deployed_date_iso_8601_date_str = xissue.prod_deployed_date_iso_8601_date_str

        return issue

    def post_do(self, resource=None):
        """Set last_modified, propagate prod_deployed_date (ptf-tools) and restore extra data."""
        super().post_do(resource)

        # internal_do returns None when the issue is skipped (embargo): nothing to post-process
        if resource is None:
            return

        # Si le XML de l'issue a une last-modified, on la garde, sinon on en créé une.
        if resource.last_modified is None:
            resource.last_modified = timezone.now()
            resource.save()

        # Sur ptf-tools, si le XML de l'issue a une prod_deployed_date,
        # On la propage aux Articles/Issue.
        # La restoration éventuelle des données (avec importExtraDataPtfCmd) peut écraser prod_deployed_date
        if self._prod_deployed_date_iso_8601_date_str and settings.SITE_NAME == "ptf_tools":
            prod_deployed_date = model_helpers.parse_date_str(
                self._prod_deployed_date_iso_8601_date_str
            )
            journal_site = model_helpers.get_site_mersenne(resource.my_collection.pid)
            if journal_site:
                model_helpers.update_deployed_date(resource, journal_site, prod_deployed_date)

        if self.extra_folder:
            ptf_cmds.importExtraDataPtfCmd(
                {"pid": resource.pid, "import_folder": self.extra_folder}
            ).do()
class addResourceInSpecialIssueXmlCmd(addXmlCmd):
    """
    addResourceXmlCmd: adds/remove resource from special issue
    """

    xcontainer = None
    resource_doi = ""
    xresource = None
    seq = 0
    citation = ""
    provider = None

    def __init__(self, params=None):
        super().__init__(params)
        self.required_params.extend(["xcontainer"])

    def internal_do(self):
        """Link one resource (found by DOI) to a special issue container."""
        super().internal_do()

        # for later, check the type of the resource first
        existing = model_helpers.get_resource_in_special_issue_by_doi(self.resource_doi)
        if existing is not None:
            # temporary
            raise ValueError(
                "First step of developpement require to manually delete all resources in special issue"
            )

        container = model_helpers.get_container(self.xcontainer.pid)

        # 2 is the id of ptf_tools. If we are not in ptf tools we are dealing with jats article which has no citation
        citation = self.xresource["citation"] if settings.SITE_ID == 2 else ""

        params = {
            "obj_doi": self.resource_doi,
            "container": container,
            "seq": self.seq,
            "citation": citation,
        }
        cmd = ptf_cmds.addResourceInSpecialIssuePtfCmd(params)
        return cmd.do(self)
class addArticleXmlCmd(addXmlCmd):
    """
    addArticleXmlCmd: adds/removes an article

    Exception raised:
        - exceptions.ResourceExists during do if the article already exists
        - exceptions.ResourceDoesNotExist
            during undo if the Article does not exist
            during do if the serial/issue/provider does not exist
            <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
    """

    xarticle = None  # parsed article data (jats_parser.JatsArticle)
    journal = None  # Collection the article belongs to
    issue = None  # Container the article belongs to
    provider = None
    provider_col = None
    assign_doi = False
    full_text_folder = ""  # folder used to fetch the full text when the XML has no <body>
    xml_format = "xmldata_jats"
    # restricted_mode is used by maxiDML. We do not try to import all the metadata, but only a subset
    restricted_mode = False
    # standalone is used to import isolated article, without issues
    standalone = False
    seq = (
        0  # seq is used by the breadcrumbs. Generate it if it's not specified in the XML (ex: PCJ)
    )
    keep_translations = False
    def set_collection(self, collection):
        """Attach the target journal and reuse its provider."""
        self.journal = collection
        self.provider = collection.provider
    def set_xml_format(self, xml_format):
        """Set the name of the xmldata module used to parse the XML."""
        self.xml_format = xml_format
    def set_provider(self, provider):
        """Set the provider of the article."""
        self.provider = provider
    def set_provider_col(self, provider_col):
        """Set the provider used for the collection (may differ from the article's)."""
        self.provider_col = provider_col
    def set_article_single_mode(self):
        """Parse self.tree as a standalone JATS <article> and collect the parser warnings."""
        self.xarticle = jats_parser.JatsArticle(tree=self.tree)
        self.warnings.extend(self.xarticle.warnings)

        # TODO: MaxiDML: allow the creation of an issue on the fly
        # if not self.provider:
        #     self.provider = model_helpers.get_provider_by_name(self.xarticle.provider)
        #
        # xmldata_jats.set_pid_type(self.provider.pid_type)
        #
        # bdy = etree.tostring(self.xarticle.journal.tree).decode("utf-8")
        # cmd = addCollectionsXmlCmd({'body': bdy,
        #                             'xml_format': self.xml_format,
        #                             'coltype': "journal"})
        # cmd.set_provider(self.provider_col if self.provider_col else self.provider)
        # self.journal = cmd.do()[0]
        #
        # self.issue = model_helpers.get_container(self.xarticle.issue_id)
        # if self.issue is None:
        #     # need to create the issue
        #     date = datetime.datetime.strptime(self.xarticle.date_published_iso_8601_date_str,
        #                                       '%Y-%m-%d')
        #     pid = "{name}_{year}".format(name=self.journal.pid, year=date.year)
        #     self.issue = model_helpers.get_container(pid)
        #     if self.issue is None:
        #         params = {'ctype': 'issue', 'year': date.year, 'pid': pid,
        #                   'last_modified_iso_8601_date_str': datetime.datetime.now().strftime(
        #                       "%Y-%m-%d %H:%M:%S"), 'volume': self.xarticle.volume,
        #                   # if copy binary, need from_folder / to_folder
        #                   }
        #
        #         cmd = ptf_cmds.addContainerPtfCmd(params)
        #         cmd.add_collection(self.journal)
        #         cmd.set_provider(self.provider)
        #         self.issue = cmd.do()
    def get_oai_identifier(self):
        """Return the OAI identifier of the parsed article."""
        return self.xarticle.oai_identifier
819 def update_xobj_with_body(self):
820 # Import CEDRICS, le plein texte provient d'un fichier séparé
821 if self.full_text_folder and not self.xarticle.body: 821 ↛ 822line 821 didn't jump to line 822 because the condition on line 821 was never true
822 if self.full_text_folder == settings.CEDRAM_TEX_FOLDER:
823 text = ""
824 locs = [
825 stream["location"]
826 for stream in self.xarticle.streams
827 if stream["mimetype"] == "application/pdf"
828 ]
829 if locs:
830 full_pdf_location = os.path.join(self.full_text_folder, locs[0])
831 text = utils.pdf_to_text(full_pdf_location)
832 self.xarticle.body = text
833 else:
834 full_text_file = self.full_text_folder + self.xarticle.pid + ".xml"
836 with open(full_text_file, mode="rb") as file_:
837 body = file_.read()
839 parser = etree.XMLParser(huge_tree=True, recover=True)
840 tree = etree.fromstring(body, parser=parser)
841 node = tree.find("body")
842 self.xarticle.body = xml_utils.get_text_from_node(node)
843 self.xarticle.body_xml = xml_utils.get_xml_from_text("body", self.xarticle.body)
844 elif not self.xarticle.body_xml and hasattr(self.xarticle, "pii"): 844 ↛ 845line 844 didn't jump to line 845 because the condition on line 844 was never true
845 full_text_file = os.path.join(
846 "/numdam_dev/acquisition/donnees_traitees",
847 self.journal.pid,
848 self.issue.pid,
849 self.xarticle.pid,
850 self.xarticle.pid + ".xml",
851 )
852 if os.path.isfile(full_text_file):
853 with open(full_text_file, mode="rb") as file_:
854 body = file_.read()
856 parser = etree.XMLParser(huge_tree=True, recover=True)
857 tree = etree.fromstring(body, parser=parser)
858 node = tree.find("body")
859 self.xarticle.body = xml_utils.get_text_from_node(node)
860 self.xarticle.body_xml = xml_utils.get_xml_from_text("body", self.xarticle.body)
862 def internal_do(self):
863 super().internal_do()
865 if self.xarticle is None and self.journal is not None: 865 ↛ 867line 865 didn't jump to line 867 because the condition on line 865 was never true
866 # self.restricted_mode = True
867 self.set_article_single_mode()
868 self.update = True
869 else:
870 self.update = False
872 if self.xarticle.pid is None:
873 self.xarticle.pid = (
874 self.xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_")
875 )
877 for xtranslated_article in self.xarticle.translations: 877 ↛ 878line 877 didn't jump to line 878 because the loop on line 877 never started
878 for xtream in xtranslated_article.streams:
879 if xtream["mimetype"] == "text/html":
880 if self.from_folder is None:
881 raise ValueError(
882 "The article has its full text in a separate HTML file. You need to set from_folder"
883 )
885 location = os.path.join(self.from_folder, xtream["location"])
886 body_html = resolver.get_body(location)
887 body = xml_utils.get_text_from_xml_with_mathml(body_html)
888 xtranslated_article.body_html = body_html
889 xtranslated_article.body = body
891 for stream in self.xarticle.streams:
892 if stream["mimetype"] == "text/html":
893 location = os.path.join(self.from_folder, stream["location"])
894 body_html = resolver.get_body(location)
895 body = xml_utils.get_text_from_xml_with_mathml(body_html)
896 self.xarticle.body_html = body_html
897 self.xarticle.body = body
899 if self.xarticle.doi:
900 article = model_helpers.get_article_by_doi(self.xarticle.doi)
901 else:
902 article = model_helpers.get_article(self.xarticle.pid)
903 needs_to_restore_article = False
905 if article is not None: 905 ↛ 906line 905 didn't jump to line 906 because the condition on line 905 was never true
906 if self.update or self.standalone:
907 if self.standalone:
908 self.provider = article.provider
910 needs_to_restore_article = True
911 backup_obj_not_in_metadata(article)
913 if self.keep_translations:
914 backup_translation(article)
916 cmd = ptf_cmds.addArticlePtfCmd(
917 {
918 "pid": article.pid,
919 "to_folder": self.to_folder, # on supprime les fichiers pour être sûr
920 }
921 )
922 cmd.set_object_to_be_deleted(article)
923 cmd.undo()
924 else:
925 raise exceptions.ResourceExists(f"Article {self.xarticle.pid} already exists")
927 # Override seq
928 if self.standalone and article is not None: 928 ↛ 929line 928 didn't jump to line 929 because the condition on line 928 was never true
929 self.xarticle.seq = article.seq
930 elif (
931 not self.standalone and self.issue and int(self.xarticle.seq) == 0 and self.seq != 0
932 ) or (hasattr(self, "pii") and self.seq != 0):
933 self.xarticle.seq = self.seq
935 # Get the article's text (body) for SolR if it is empty from the PDF
936 self.update_xobj_with_body()
938 params = {
939 "xobj": self.xarticle,
940 "pid": self.xarticle.pid,
941 "from_folder": self.from_folder,
942 "to_folder": self.to_folder,
943 "assign_doi": self.assign_doi and not self.xarticle.doi,
944 "solr_commit": False,
945 }
947 cmd = ptf_cmds.addArticlePtfCmd(params)
948 if self.issue or not self.standalone: 948 ↛ 950line 948 didn't jump to line 950 because the condition on line 948 was always true
949 cmd.set_container(self.issue)
950 cmd.add_collection(self.journal)
951 article = cmd.do(self)
953 self.add_objects_with_location(self.xarticle.ext_links, article, "ExtLink")
954 self.add_objects_with_location(self.xarticle.streams, article, "DataStream")
955 if not self.restricted_mode: 955 ↛ 960line 955 didn't jump to line 960 because the condition on line 955 was always true
956 self.add_objects_with_location(
957 self.xarticle.supplementary_materials, article, "SupplementaryMaterial"
958 )
960 if (
961 hasattr(settings, "SHOW_BODY") and settings.SHOW_BODY
962 ) or settings.SITE_NAME == "ptf_tools":
963 self.add_objects_with_location(self.xarticle.figures, article, "RelatedObject")
965 for xtrans_article, trans_article in zip( 965 ↛ 968line 965 didn't jump to line 968 because the loop on line 965 never started
966 self.xarticle.translations, cmd.cmd.translated_articles
967 ):
968 self.add_objects_with_location(xtrans_article.streams, trans_article, "DataStream")
970 if needs_to_restore_article: 970 ↛ 971line 970 didn't jump to line 971 because the condition on line 970 was never true
971 restore_obj_not_in_metadata(article)
973 if self.keep_translations:
974 restore_translation(article)
976 return article
class addTranslatedArticleXmlCmd(addXmlCmd):
    """
    addTranslatedArticleXmlCmd: adds/removes translations of an existing article.

    The original article is not changed.
    The current translations for ``self.lang`` are first removed, then the
    translation found in the posted XML is appended (with optional HTML/PDF
    datastreams), and the article is re-imported via addArticleXmlCmd.

    Raises:
        exceptions.ResourceDoesNotExist: if the article of the XML is unknown.
    """

    lang = ""  # language code of the translation being added/replaced
    html_file_name = ""  # relative path of the HTML full text (already on disk)
    pdf_file_name = ""  # relative path of the PDF to generate/attach
    date_published_str = ""  # if set, the PDF is assumed to already exist

    def internal_do(self):
        super().internal_do()

        xarticle = jats_parser.JatsArticle(tree=self.tree)
        article = model_helpers.get_article(xarticle.pid)

        if article is None:
            # FIX: was `self.xarticle.pid` — this command has no `xarticle`
            # attribute (the parsed article is the local variable), so the
            # raise itself crashed with an AttributeError.
            raise exceptions.ResourceDoesNotExist(f"Article {xarticle.pid} does not exist")

        # Merge the existing article with the new translation:
        # keep all translations except the one in self.lang (it is replaced).
        data_article = model_data_converter.db_to_article_data(article)
        new_translations = [
            translation
            for translation in data_article.translations
            if translation.lang != self.lang
        ]

        for xtrans_article in xarticle.translations:
            if xtrans_article.lang == self.lang:
                # Upload/views has copied the HTML file on disk.
                # Add a DataStream pointing at it.
                # TODO: check if the datastream is not already present
                if self.html_file_name:
                    data = model_data.create_datastream()
                    data["rel"] = "full-text"
                    data["mimetype"] = "text/html"
                    data["location"] = self.html_file_name
                    xtrans_article.streams.append(data)

                if self.pdf_file_name:
                    # pdf-translate needs the article/sub-article XML.
                    # Simply add a datastream for now; the new Article created
                    # in Django will be complete and the PDF file is generated
                    # at the end of this command.
                    data = model_data.create_datastream()
                    data["rel"] = "full-text"
                    data["mimetype"] = "application/pdf"
                    data["location"] = self.pdf_file_name
                    xtrans_article.streams.append(data)

                if self.date_published_str:
                    xtrans_article.date_published_iso_8601_date_str = self.date_published_str

                new_translations.append(xtrans_article)

        data_article.translations = new_translations

        # Re-import the merged article (standalone: the container is untouched).
        cmd = addArticleXmlCmd(
            {
                "xarticle": data_article,
                "use_body": False,
                "issue": article.my_container,
                "standalone": True,
                "from_folder": self.from_folder,
            }
        )
        cmd.set_collection(article.get_collection())
        article = cmd.do()

        # pdf-translate needs the article/sub-article XML
        xml = ptf_cmds.exportPtfCmd(
            {
                "pid": article.pid,
                "with_body": False,
                "with_djvu": False,
                "article_standalone": True,
                "collection_pid": settings.COLLECTION_PID,
            }
        ).do()

        tex.create_translated_pdf(
            article,
            xml,
            self.lang,
            os.path.join(self.from_folder, self.pdf_file_name),
            os.path.join(self.from_folder, self.html_file_name),
            # If the date_published is specified, we assume that the PDF already exists
            skip_compilation=self.date_published_str != "",
        )

        return article
class addPCJArticleXmlCmd(addXmlCmd):
    """
    addPCJArticleXmlCmd: imports a PCJ article, optionally attaching its
    HTML full text as a datastream, then delegates to addArticleXmlCmd.
    """

    html_file_name = ""  # relative path of the HTML full text, if any

    def internal_do(self):
        super().internal_do()

        parsed_article = jats_parser.JatsArticle(tree=self.tree)

        # Attach the HTML full text as a datastream when a file name was given.
        if self.html_file_name:
            stream = model_data.create_datastream()
            stream.update(
                {
                    "rel": "full-text",
                    "mimetype": "text/html",
                    "location": self.html_file_name,
                }
            )
            parsed_article.streams.append(stream)

        add_cmd = addArticleXmlCmd(
            {
                "xarticle": parsed_article,
                "use_body": False,
                "issue": self.issue,
                "standalone": True,
                "from_folder": self.from_folder,
            }
        )
        add_cmd.set_collection(self.collection)
        return add_cmd.do()
class addBookXmlCmd(addXmlCmd):
    """
    addBookXmlCmd: adds/removes a book.

    Exception raised:
        - exceptions.ResourceExists during do if the book already exists
        - exceptions.ResourceDoesNotExist
             during undo if the Book does not exist
             during do if the serial/provider does not exist
             <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError during undo if resources are still published
    """

    provider = None  # may be preset with set_provider(); otherwise looked up from the XML
    import_oai_mode = False  # when True, an existing book is replaced instead of raising
    journal = None  # collection used in OAI mode ("GDML_Books")
    xml_format = "xmldata_jats"  # name of the xmldata module used in OAI mode
    xbook = None  # optional pre-parsed book (set by addOrUpdateBookXmlCmd)
    _collection = None
    no_bib = False  # Ignore the references during the import (used in Geodesic)

    def set_provider(self, provider):
        self.provider = provider

    def add_parts(self, xparts, pseq):
        """Add the book parts sequentially; seq starts at 1 within a parent (pseq)."""
        if xparts:
            seq = 1
            for xpart in xparts:
                self.add_part(xpart, seq, pseq)
                seq += 1

    def add_part(self, xpart, seq, pseq):
        """Add one book part (stored as an Article) and recurse into its sub-parts."""
        if xpart is None:
            return

        # An Article is used to store a book part in the database
        article = model_helpers.get_article(xpart.pid)

        if article is not None:
            raise exceptions.ResourceExists(f"BookPart {xpart.pid} already exists")

        params = {
            "xobj": xpart,
            "pid": xpart.pid,
            "seq": seq,
            "pseq": pseq,
            # "deployed": deployed,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        cmd = ptf_cmds.addBookPartPtfCmd(params)
        cmd.set_container(self.book)
        cmd.add_collection(self._collection)
        article = cmd.do(self)

        self.add_objects_with_location(xpart.ext_links, article, "ExtLink")
        self.add_objects_with_location(xpart.streams, article, "DataStream")

        self.add_parts(xpart.parts, seq)

    def set_import_oai_mode(self):
        self.import_oai_mode = True

    def internal_do(self):
        super().internal_do()

        #######################################################################
        # Get xbook

        if self.import_oai_mode:
            xmldata = globals()[self.xml_format]
            xbook = xmldata.Book(self.tree)
            self.journal = model_helpers.get_collection("GDML_Books")
        else:
            if self.xbook:
                xbook = self.xbook
            else:
                xbook = jats_parser.BitsBook(tree=self.tree, no_bib=self.no_bib)
                self.warnings.extend(xbook.warnings)

        #######################################################################
        # Get existing book if any

        if not self.provider:
            provider = model_helpers.get_provider_by_name(xbook.provider)
            self.provider = provider

        book_id = xbook.pid
        book = model_helpers.get_container(book_id)

        #######################################################################
        # Delete any existing book

        if book is not None:
            if self.import_oai_mode:
                publisher = book.my_publisher

                # Note: the existing collection is not removed even if it no longer has a resource
                # TODO: urls/commands to add/update/delete a collection

                # Removes the book
                cmd = ptf_cmds.addContainerPtfCmd()
                cmd.set_object_to_be_deleted(book)
                cmd.undo()

                if publisher and publisher.publishes.count() == 0:
                    self.remove_publisher(publisher)
            else:
                raise exceptions.ResourceExists("Book %s already exists" % book_id)

        #######################################################################
        # Add new book

        if xbook.incollection:
            colid = xbook.incollection[0].pid
            self._collection = model_helpers.get_collection(colid)
            if self._collection is None:
                raise exceptions.ResourceDoesNotExist(f"The collection {colid} does not exist")
        elif self.import_oai_mode:
            self._collection = self.journal

        params = {
            "xobj": xbook,
            "pid": xbook.pid,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        cmd = ptf_cmds.addContainerPtfCmd(params)
        cmd.add_collection(self._collection)
        # FIX: was `cmd.set_provider(provider)` — the local `provider` is only
        # bound inside the `if not self.provider:` branch above, so a provider
        # preset via set_provider() caused a NameError here.
        cmd.set_provider(self.provider)

        book = cmd.do(self)
        self.book = book

        self.add_objects_with_location(xbook.ext_links, book, "ExtLink")
        self.add_objects_with_location(xbook.related_objects, book, "RelatedObject")
        self.add_objects_with_location(xbook.streams, book, "DataStream")

        # self.add_metadata_parts(xbook, book) TODO support Metadataparts ?

        #######################################################################
        # Add Book parts

        # JatsIssue is an iterator (has the __iter__ function)
        # TODO make JatsBook an iterator as well ?
        self.add_parts(xbook.parts, 0)

        # Update the collection first year and last year
        for incol in xbook.incollection:
            self.update_collection_years(incol.pid, book)

        return book
1268######################################################################################
1269######################################################################################
1270#
1271# Update Commands
1272#
1273######################################################################################
1274######################################################################################
class updateCollectionsXmlCmd(addXmlCmd):
    """
    updateSerialsXmlCmd: updates one or more journals

    Exception raised:
        - exceptions.ResourceDoesNotExist during do if the Collection does not exist
        - RuntimeError if undo is called
    """

    def _parse_meta_node(self, node):
        """Build the xcol data object matching the node tag (None for unknown tags)."""
        if node.tag == "collection-meta":
            return jats_parser.BitsCollection(tree=node)
        if node.tag == "journal-meta":
            return jats_parser.JatsJournal(tree=node)
        if node.tag == "publication-meta":
            return jats_parser.MathdocPublication(tree=node)
        return None

    def update_collection(self, xcol, do_update=True):
        """Check that the collection exists; update it in the database when do_update is True."""
        if not xcol:
            return None

        provider = model_helpers.get_provider_by_name(xcol.provider)
        col = model_helpers.get_collection(xcol.pid)

        if col is None:
            raise exceptions.ResourceDoesNotExist("Collection %s does not exist" % xcol.pid)

        if do_update:
            # The existing other_ids, abstracts are removed in updateCollectionDatabaseCmd::internal_do
            # and the new ones are added in the post_do (addResourceDatabaseCmd)
            update_cmd = ptf_cmds.updateCollectionPtfCmd(
                {
                    "xobj": xcol,
                    "solr_commit": False,
                    "from_folder": self.from_folder,
                    "to_folder": self.to_folder,
                }
            )
            update_cmd.set_provider(provider)
            # cmd.set_publisher(publisher)
            col = update_cmd.do()

            # The existing extlinks are removed in updateCollectionDatabaseCmd::internal_do
            self.add_objects_with_location(xcol.ext_links, col, "ExtLink")
            resolver.copy_binary_files(col, self.from_folder, self.to_folder)

            # if publisher:
            #     model_helpers.publish_resource(publisher, col)

        return col

    def internal_do(self):
        super().internal_do()

        # First pass: check that all journals exist before updating anything
        for node in self.tree:
            self.update_collection(self._parse_meta_node(node), False)

        # Second pass: perform the actual updates
        collections = []
        for node in self.tree:
            xcol = self._parse_meta_node(node)
            self.warnings.extend(xcol.warnings)
            collections.append(self.update_collection(xcol))

        return collections

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")
1357#####################################################################
1358#
1359# replaceIssueXmlCmd: updates an issue
1360#
1361# Exception raised:
1362# - exceptions.ResourceDoesNotExist during do if the Collection/Issue/Provider does not exist
1363# <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
1364# - RuntimeError if undo is called
1365#
1366######################################################################
class replaceIssueXmlCmd(addXmlCmd):
    """
    replaceIssueXmlCmd: replaces an existing issue (delete then re-import).

    Exception raised:
        - exceptions.ResourceDoesNotExist during do if the Collection/Issue/Provider does not exist
          <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError if undo is called
    """

    def internal_do(self):
        super().internal_do()

        xissue = jats_parser.JatsIssue(tree=self.tree)
        self.warnings.extend(xissue.warnings)

        xjournal = xissue.journal
        journal_id = xjournal.pid
        journal = model_helpers.get_collection(journal_id)

        if journal is None:
            raise exceptions.ResourceDoesNotExist("Journal %s does not exist" % xjournal.pid)

        issue_id = xissue.pid
        issue = model_helpers.get_container(issue_id)

        if issue is None:
            raise exceptions.ResourceDoesNotExist("Issue %s does not exist" % issue_id)

        publisher = issue.my_publisher

        # Remove the existing issue before re-importing it
        cmd = ptf_cmds.addContainerPtfCmd()
        cmd.set_object_to_be_deleted(issue)
        cmd.undo()

        # FIX: guard against a None publisher (consistent with addBookXmlCmd);
        # an issue without a publisher crashed here with an AttributeError.
        if publisher and publisher.publishes.count() == 0:
            self.remove_publisher(publisher)

        # update the journal first and last year
        for the_issue in journal.content.all():
            self.update_collection_years(journal_id, the_issue, False)

        journal.save()

        cmd = addIssueXmlCmd(
            {
                "xissue": xissue,
                "use_body": False,
                "solr_commit": False,
                "extra_folder": self.from_folder,
                "to_folder": self.to_folder,
            }
        )
        issue = cmd.do()

        return issue

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")
class updateBookXmlCmd(addXmlCmd):
    """
    updateBookXmlCmd: updates a book

    Exception raised:
        - exceptions.ResourceDoesNotExist during do if the Book does not exist
        - RuntimeError if undo is called
    """

    no_bib = False  # Ignore the references during the import (used in Geodesic)

    def internal_do(self):
        super().internal_do()

        parsed_book = jats_parser.BitsBook(tree=self.tree, no_bib=self.no_bib)
        self.warnings.extend(parsed_book.warnings)

        existing = model_helpers.get_container(parsed_book.pid)
        if existing is None:
            raise exceptions.ResourceDoesNotExist("Book %s does not exist" % parsed_book.pid)

        # unpublish and delete the existing publisher if necessary
        # self.update_publisher(xbook, book)

        # Note: the existing collection is not removed even if it no longer has a resource
        # TODO: urls/commands to add/update/delete a collection

        # Remove the existing book before re-importing it
        delete_cmd = ptf_cmds.addContainerPtfCmd()
        delete_cmd.set_object_to_be_deleted(existing)
        delete_cmd.undo()

        return addBookXmlCmd(
            {
                "xbook": parsed_book,
                "use_body": False,
                "solr_commit": False,
                "from_folder": self.from_folder,
                "no_bib": self.no_bib,
                "to_folder": self.to_folder,
            }
        ).do()

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")
class addOrUpdateContainerXmlCmd(addXmlCmd):
    """
    addOrUpdateContainerXmlCmd: detects Container type from xml and adds or updates an issue or a book

    just detect Container type (do not check params etc.)
    """

    keep_metadata = False
    keep_translations = False
    backup_folder = None
    full_text_folder = ""
    fake = False  # Parse the XML but do not import
    no_bib = False  # Ignore the references during the import (used in Geodesic)
    embargo = False  # Import only the open articles (used in Geodesic)

    def check_params(self):
        super().check_params()

    def internal_do(self):
        super().internal_do()

        root_tag = normalize(self.tree.tag)

        # Build the delegate command matching the root element of the XML.
        if root_tag == "journal-issue":
            delegate = addOrUpdateIssueXmlCmd(
                {
                    "body": self.body,
                    "keep_metadata": self.keep_metadata,
                    "keep_translations": self.keep_translations,
                    "backup_folder": self.backup_folder,
                    "to_folder": self.to_folder,
                    "from_folder": self.from_folder,
                    "xml_file_folder": self.xml_file_folder,
                    "fake": self.fake,
                    "no_bib": self.no_bib,
                    "embargo": self.embargo,
                }
            )
        elif root_tag == "book":
            delegate = addOrUpdateBookXmlCmd(
                {
                    "body": self.body,
                    "from_folder": self.from_folder,
                    "to_folder": self.to_folder,
                    "no_bib": self.no_bib,
                    "embargo": self.embargo,
                }
            )
        else:
            raise RuntimeError("addOrupdateContainer command can't detect container type")

        obj = delegate.do()
        self.warnings.extend(delegate.warnings)
        return obj

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")
class addOrUpdateIssueXmlCmd(addXmlCmd):
    """
    addOrUpdateIssueXmlCmd: adds or updates an issue

    Adds an issue if it is not in the system or updates the issue if it is already there.
    By default, no DOI is assigned for the articles. Set assign_doi to True.

    from_folder / to_folder (declared in addXmlCmd): location of binary files to copy
    backup_folder: folder where extra data (extid false_positive...) are (to be) stored in a json

    keep_metadata:
        True if you want to back up extra data (icon, dates, matching ids, ...) in the backup_folder
        Default: False
        Note: backup_obj_not_in_metadata / restore_obj_not_in_metadata is always called
        We always want to preserve GraphicalAbstracts (they are not in the issue XML)

    keep_translations:
        True if you want back up/restore translations.
        Default: False
        Note: When you post an article to a journal (test) website, the translation is declared in the XML
        But if you import a Cedrics article in Trammel, the XML does not list translations

    Exception raised:
        - exceptions.ResourceDoesNotExist during do if the Collection/Issue/Provider does not exist
          <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError if undo is called
    """

    keep_metadata = False
    keep_translations = False
    backup_folder = None
    assign_doi = False
    full_text_folder = ""

    xissue = None  # optional pre-parsed issue; when None the XML tree is parsed
    fake = False  # Parse the XML but do not import
    no_bib = False  # Ignore the references during the import (used in Geodesic)
    embargo = False  # Import only the open articles (used in Geodesic)

    def check_params(self):
        super().check_params()

        # keep_metadata restores old ids while assign_doi creates new ones:
        # the two options are mutually exclusive.
        if self.keep_metadata and self.assign_doi:
            raise ValueError("keep_metadata and assign_doi cannot both be true.")

        if self.keep_metadata and self.backup_folder is None:
            raise ValueError("backup_folder needs to be set when keep_metadata is true.")

    def internal_do(self):
        super().internal_do()

        # Parse the issue XML unless a pre-parsed issue was given.
        if not self.xissue:
            self.xissue = xissue = jats_parser.JatsIssue(
                tree=self.tree,
                from_folder=self.from_folder,
                no_bib=self.no_bib,
            )
            if len(xissue.warnings) > 0 and self.xml_file_folder:
                # Print the parser warnings, de-duplicated by value.
                warnings = []
                warning_keys = []
                for warning in xissue.warnings:
                    for key, value in warning.items():
                        if value not in warning_keys:
                            warning_keys.append(value)
                            warnings.append({key: value})
                for warning in warnings:
                    print(warning)
            self.warnings.extend(xissue.warnings)
        else:
            xissue = self.xissue

        if self.fake:
            return

        xjournal = xissue.journal
        journal_id = xjournal.pid
        journal = model_helpers.get_collection(journal_id)

        if journal is None:
            raise exceptions.ResourceDoesNotExist("Journal %s does not exist" % xjournal.pid)

        existing_issue = model_helpers.get_container(xissue.pid)

        if existing_issue:
            if self.embargo and existing_issue.embargo():
                # Geodesic is for open access articles.
                # We do not want to import the issues under embargo
                print(f"Embargo, ignore {xissue.pid}")
                return None

            if self.keep_metadata:
                # Start with a backup of the existing issue, in case of a bug.
                ptf_cmds.exportPtfCmd(
                    {
                        "pid": existing_issue.pid,
                        "with_internal_data": True,
                        "with_binary_files": False,
                        "for_archive": False,
                        "export_folder": os.path.join(settings.MERSENNE_TMP_FOLDER, "backup"),
                    }
                ).do()

                # Save the additional data (extid, deployed_date, ...) in a
                # json file that will be re-imported with the new issue.
                params = {
                    "pid": existing_issue.pid,
                    "export_folder": self.backup_folder,
                    "export_all": True,
                    "with_binary_files": True,
                }
                ptf_cmds.exportExtraDataPtfCmd(params).do()

            # Always backup objects that are not in the metadata (see docstring).
            for article in existing_issue.article_set.all():
                backup_obj_not_in_metadata(article)
                if self.keep_translations:
                    backup_translation(article)
            # TODO: rename the variable `resource`
            for resource_in_special_issue in existing_issue.resources_in_special_issue.all():
                # External article can be part of special issue and backup can bug if so
                if resource_in_special_issue.resource:
                    backup_obj_not_in_metadata(resource_in_special_issue.resource)

            # Delete the existing issue, otherwise the import would complain
            # about already-existing articles.
            cmd = ptf_cmds.addContainerPtfCmd()
            cmd.set_object_to_be_deleted(existing_issue)
            cmd.undo()

            # update the journal first and last year
            for the_issue in journal.content.all():
                self.update_collection_years(journal_id, the_issue, False)

            journal.save()
        else:
            issue_to_appear = model_helpers.get_issue_to_appear(journal_id)

            # For AIF, the articles of the "to appear" volume are moved into a
            # new volume before publication (from AIF_0__0_ to AIF_2018... e.g.).
            # The first time, AIF_2018_ is not yet in PTF and existing_issue is None.
            # Example: AIF_0_0 contains doi1, doi2 and doi3, AIF_2018 contains doi1 and doi2.
            # The import would fail because the same article cannot exist twice.
            # Deleting AIF_0_0 is not an option because doi3 would be lost.
            # The articles common to _0__0 and 2018_ must be removed before
            # importing the new volume, otherwise there will be conflicts.

            if issue_to_appear and xissue.pid != issue_to_appear.pid:
                # Save the additional data (extid, deployed_date, ...) in a json
                # that will be re-imported with the new issue, as well as the
                # image associated via ptf-tools.
                if self.keep_metadata:
                    params = {
                        "pid": issue_to_appear.pid,
                        "force_pid": xissue.pid,
                        "export_folder": self.backup_folder,
                        "export_all": True,
                        "with_binary_files": True,
                    }
                    ptf_cmds.exportExtraDataPtfCmd(params).do()

                # Remove from the "to appear" issue the articles that are also
                # in the new issue (matched by DOI).
                for xarticle in xissue.articles:
                    if isinstance(xarticle, dict):
                        xdoi = xarticle["doi"]
                    else:
                        xdoi = getattr(xarticle, "doi")
                    article = issue_to_appear.article_set.filter(doi=xdoi).first()
                    if article:
                        backup_obj_not_in_metadata(article)
                        if self.keep_translations:
                            backup_translation(article)

                        params = {"to_folder": self.to_folder}  # to delete the binary files
                        cmd = ptf_cmds.addArticlePtfCmd(params)
                        cmd.set_object_to_be_deleted(article)
                        cmd.undo()

        # If backup_folder is not None, addIssueXmlCmd.post_do() uses importExtraDataPtfCmd
        cmd = addIssueXmlCmd(
            {
                "xissue": xissue,
                "use_body": False,
                # "body": self.body,
                "assign_doi": self.assign_doi,
                "full_text_folder": self.full_text_folder,  # Cedrics: the full text for SolR is in a separate file
                "extra_folder": self.backup_folder,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
                "no_bib": self.no_bib,
                "embargo": self.embargo,
                "solr_commit": False,
            }
        )
        new_issue = cmd.do()

        if new_issue:
            new_articles = new_issue.article_set.all()

            # With the self.assign_doi option, check that the DOIs were assigned.
            for article in new_articles:
                if self.assign_doi and article.doi is None:
                    raise exceptions.ResourceHasNoDoi("The article %s has no DOI" % article.pid)

                # TODO garbage collector on articles no longer in the issue
                restore_obj_not_in_metadata(article)
                if self.keep_translations:
                    restore_translation(article)
            if new_issue.ctype == "issue_special":
                resources_in_special_issue = new_issue.resources_in_special_issue.all()
                for resource_in_special_issue in resources_in_special_issue:
                    # External article can be part of special issue and restore can bug if so
                    if resource_in_special_issue.resource:
                        restore_obj_not_in_metadata(resource_in_special_issue.resource)

        return new_issue

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")
class addOrUpdateBookXmlCmd(addXmlCmd):
    """
    addOrUpdateBookXmlCmd: adds a book, deleting any existing version first.
    """

    xbook = None  # optional pre-parsed book; when None the XML tree is parsed
    no_bib = False  # Ignore the references during the import (used in Geodesic)

    def internal_do(self):
        super().internal_do()

        # Reuse a pre-parsed book when provided, otherwise parse the XML tree.
        if self.xbook:
            parsed_book = self.xbook
        else:
            parsed_book = jats_parser.BitsBook(tree=self.tree, no_bib=self.no_bib)
            self.warnings.extend(parsed_book.warnings)

        existing = model_helpers.get_container(parsed_book.pid)

        if existing:
            # Remove the current container before re-importing.
            delete_cmd = ptf_cmds.addContainerPtfCmd()
            delete_cmd.set_object_to_be_deleted(existing)
            delete_cmd.undo()

            collection = existing.get_collection()

            # Recompute the collection first/last year without the deleted book.
            for container in collection.content.all():
                self.update_collection_years(collection.pid, container, False)

            collection.save()

        return addBookXmlCmd(
            {
                "xbook": parsed_book,
                "use_body": False,
                # "body": self.body,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
                "no_bib": self.no_bib,
                "solr_commit": False,
            }
        ).do()
class updateBibitemCitationXmlCmd(baseCmd):
    """Regenerate a bibitem's citation fields from its current matched ids."""

    def __init__(self, params=None):
        self.bibitem = None

        super().__init__(params)

        self.required_params.extend(["bibitem"])

    def set_bibitem(self, bibitem):
        self.bibitem = bibitem

    def internal_do(self):
        super().internal_do()

        # Collect the bibitem's external ids, keyed by id type.
        new_ids = {
            entry.id_type: {
                "id_type": entry.id_type,
                "id_value": entry.id_value,
                "checked": entry.checked,
                "false_positive": entry.false_positive,
            }
            for entry in self.bibitem.bibitemid_set.all()
        }

        refreshed = jats_parser.update_bibitem_xml(self.bibitem, new_ids)
        self.warnings.extend(refreshed.warnings)

        self.bibitem.citation_xml = refreshed.citation_xml
        self.bibitem.citation_html = refreshed.citation_html
        self.bibitem.citation_tex = refreshed.citation_tex
        self.bibitem.save()

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")
1836######################################################################################
1837######################################################################################
1838#
1839# Import Commands
1840#
1841######################################################################################
1842######################################################################################
class collectEntireCollectionXmlCmd(baseCmd):
    """
    Get the PIDs of all the XML of a collection (collection.xml, issues.xml) of a given folder

    results:
    """

    def __init__(self, params=None):
        self.pid = None
        self.folder = None

        super().__init__(params)

        self.required_params.extend(["pid", "folder"])

    def internal_do(self):
        super().internal_do()
        # iterate_collection_folder yields (pid, file) pairs; keep the pids only.
        return [
            issue_pid
            for issue_pid, _ in resolver.iterate_collection_folder(self.folder, self.pid)
        ]
class importEntireCollectionXmlCmd(baseCmd):
    """
    Import all the XML of a collection (collection.xml, issues.xml) of a given folder

    Required params: pid (collection pid), from_folder.
    Creates the collection in the database if it does not exist yet, then
    imports every issue found in the folder one by one.

    results:
    """

    def __init__(self, params=None):
        self.pid = None
        self.from_folder = None
        self.to_folder = None
        self.backup_folder = None
        self.keep_metadata = False
        self.keep_translations = False

        self.with_cedrics = True
        self.from_cedrics = False  # The entire collection is in Cedrics format
        self.date_for_pii = False  # Fetch publication_date for Elsevier articles
        self.first_issue = ""
        self.fake = False  # Parse the XML but do not import

        self.no_bib = False  # Ignore the references during the import (used in Geodesic)
        self.embargo = False  # Import only the open articles (used in Geodesic)

        # Progress reporting: callback(job, i) is called after each imported issue.
        self.caller = None
        self.callback = None
        self.job = None

        super().__init__(params)

        self.required_params.extend(["pid", "from_folder"])

    def internal_do(self):
        super().internal_do()

        pid = self.pid
        resource = model_helpers.get_resource(pid)
        if not resource and not self.fake:
            # The collection is not in the database yet: create it from
            # the collection.xml of the archive folder.
            body = resolver.get_archive_body(self.from_folder, pid, None)
            journals = addCollectionsXmlCmd(
                {"body": body, "from_folder": self.from_folder, "to_folder": self.to_folder}
            ).do()
            if not journals:
                raise ValueError(self.from_folder + " does not contain a collection")
            resource = journals[0]

        obj = resource.cast()

        if obj.classname != "Collection":
            raise ValueError(pid + " does not contain a collection")

        if self.with_cedrics:
            # with_cedrics means that you want to import everything from scratch
            # Delete solr documents (01/28/2020: Solr can have multiple docs with the same PID)
            cmd = solr_cmds.solrDeleteCmd({"q": "pid:" + self.pid + "*"})
            cmd.do()

        # Import each issue of the folder (optionally starting at first_issue).
        i = 0
        for pid, file_ in resolver.iterate_collection_folder(
            self.from_folder, self.pid, self.first_issue
        ):
            if self.callback is None:
                print(pid)

            if self.from_cedrics:
                cmd = importCedricsIssueDirectlyXmlCmd(
                    {
                        "colid": self.pid,
                        "input_file": file_,
                        "remove_email": False,
                        "remove_date_prod": True,
                        "copy_files": True,
                        "force_dois": False,
                    }
                )
            else:
                body = resolver.get_body(file_)
                xml_file_folder = os.path.dirname(file_)
                cmd = addOrUpdateContainerXmlCmd(
                    {
                        "body": body,
                        "from_folder": self.from_folder,
                        "to_folder": self.to_folder,
                        "backup_folder": self.backup_folder,  # Read extra data (if any) stored in a json file
                        "xml_file_folder": xml_file_folder,  # when article.XML are in separate files
                        "keep_metadata": self.keep_metadata,  # Backup/Restore existing data not in the XML
                        "keep_translations": self.keep_translations,  # Backup/Restore existing translations
                        "no_bib": self.no_bib,
                        "embargo": self.embargo,
                        # Needed in Trammel
                        "fake": self.fake,
                    }
                )
            cmd.do()

            i += 1
            if self.callback:
                self.callback(self.job, i)

        if self.with_cedrics:
            # Also import the Cedrics issues found in the collection's
            # metadata folder under CEDRAM_XML_FOLDER.
            src_folder = os.path.join(settings.CEDRAM_XML_FOLDER, self.pid, "metadata")

            xml_files = [
                os.path.join(src_folder, f)
                for f in os.listdir(src_folder)
                if os.path.isfile(os.path.join(src_folder, f)) and f.endswith(".xml")
            ]
            for xml_file in xml_files:
                if self.callback is None:
                    print(xml_file)

                cmd = importCedricsIssueXmlCmd(
                    {
                        "colid": self.pid,
                        "input_file": xml_file,
                        "from_folder": self.from_folder,
                        "to_folder": self.to_folder,
                    }
                )
                cmd.do()
class importCedricsIssueXmlCmd(baseCmd):
    """
    Import a Cedrics issue: convert the Cedrics XML into JATS (external
    cedram2ptf.py script), parse the result and add/update the issue in the
    database, or only compare it with the database (diff_only).

    Required param: colid (collection pid).
    Optional params: input_file (Cedrics XML path), xissue (pre-parsed issue,
    skips the conversion), diff_only, copy_files, remove_email,
    remove_date_prod.
    """

    def __init__(self, params=None):
        self.colid = None
        self.input_file = None
        self.remove_email = True
        self.remove_date_prod = True
        self.diff_only = False
        self.body = None
        self.xissue = None
        self.copy_files = True

        super().__init__(params)

        self.required_params.extend(["colid"])

    def import_full_text(self, issue):
        """
        Some journals want to display the full text in HTML (CRCHIM/CRGEOS/CEBIOL)
        Read the XML file and convert the body in HTML
        """
        tex_src_folder = resolver.get_cedram_issue_tex_folder(self.colid, issue.pid)
        tex_folders, _ = resolver.get_cedram_tex_folders(self.colid, issue.pid)

        if len(tex_folders) > 0:
            # tex_folders is assumed to be aligned with the issue's articles:
            # item i is the TeX folder of the i-th article.
            for i, article in enumerate(issue.article_set.all()):
                article_folder = tex_folders[i]
                xml_file = os.path.join(
                    tex_src_folder, article_folder, "FullText", article_folder + ".xml"
                )

                # Store the article folder name as an "ojs-id" resource id.
                cmd = ptf_cmds.updateResourceIdPtfCmd(
                    {"id_type": "ojs-id", "id_value": article_folder}
                )
                cmd.set_resource(article)
                cmd.do()

                if os.path.isfile(xml_file):
                    with open(xml_file, encoding="utf-8") as f:
                        body = f.read()

                    cmd = addBodyInHtmlXmlCmd(
                        {
                            "body": body,
                            "from_folder": settings.CEDRAM_XML_FOLDER,
                            # needed to copy the binary files (images...)
                            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER,  # idem
                        }
                    )
                    cmd.set_article(article)
                    cmd.do()

    def import_in_db(self):
        """
        Import Cedrics issue from /cedram_dev/exploitation/cedram
        This worflow is no longer used.
        """

        # Cedrics: the full text for SolR is in a separate file
        full_text_folder = os.path.dirname(os.path.dirname(self.input_file)) + "/plaintext/"

        params = {
            "assign_doi": False,
            "full_text_folder": full_text_folder,
            "keep_metadata": True,
            "keep_translations": True,
            "use_body": False,
            "xissue": self.xissue,
            "backup_folder": settings.MERSENNE_TMP_FOLDER,
            "from_folder": settings.CEDRAM_XML_FOLDER,
            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER if self.copy_files else None,
        }

        cmd = addOrUpdateIssueXmlCmd(params)
        issue = cmd.do()
        self.warnings.extend(cmd.get_warnings())

        self.import_full_text(issue)

        return issue

    def compare_issue(self):
        """
        Compare the parsed issue (self.xissue) with the issue stored in the
        database.

        Returns (result, issues_diff, xissue): result is True when the data
        match (or when the issue is not in the database), issues_diff holds
        the differences found.
        """
        xissue = self.xissue
        issues_diff = {}
        result = True

        time1 = timezone.now()

        new_dois = [article.doi for article in xissue.articles]

        # Prefetch everything db_to_issue_data will navigate, to avoid N+1 queries.
        article_qs = Article.objects.filter(doi__in=new_dois).prefetch_related(
            "abstract_set",
            "kwd_set",
            "subj_set",
            "datastream_set",
            "relatedobject_set",
            "resourcecount_set",
            "contributions",
            "contributions__contribaddress_set",
            "bibitem_set__bibitemid_set",
            "bibitem_set__contributions",
            "bibitem_set__contributions__contribaddress_set",
        )

        issue = None
        try:
            issue = (
                Container.objects.select_related("my_collection", "my_publisher")
                .prefetch_related(
                    Prefetch("article_set", queryset=article_qs, to_attr="articles_from_doi")
                )
                .get(sites__id=settings.SITE_ID, pid=xissue.pid)
            )
        except Container.DoesNotExist:
            # Issue absent from the database: nothing to compare.
            pass

        if issue:
            data_issue = model_data_converter.db_to_issue_data(issue, issue.articles_from_doi)

            time2 = timezone.now()
            delta = time2 - time1
            print(delta)

            # Handle xml cmds side effects (ex: "numdam" changed into "mathdoc", ...)
            model_data_comparator.prepare_issue_for_comparison(xissue)

            issue_comparator = model_data_comparator.IssueDataComparator()

            result = issue_comparator.compare(data_issue, xissue, issues_diff)

        return (result, issues_diff, xissue)

    def delete_previous_file(self, output_folder):
        """
        Remove the previous conversion output for this input file (if any)
        and make sure the destination folders exist.
        Returns the output file path.
        """
        basename = os.path.basename(self.input_file)

        output_file = os.path.join(output_folder, self.colid, basename)
        if os.path.isfile(output_file):
            os.remove(output_file)

        os.makedirs(output_folder, exist_ok=True)
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

        return output_file

    def import_cedrics_issue(self):
        """
        Import Cedrics issue from /cedram_dev/exploitation/cedram
        This worflow is no longer used.
        Cedrics issues are imported from /cedram_dev/production_tex/CEDRAM
        (see importCedricsIssueDirectlyXmlCmd below)
        """

        output_folder = settings.MERSENNE_TMP_FOLDER
        ptf_xsl_folder = settings.PTF_XSL_FOLDER
        log_file = os.path.join(output_folder, settings.MERSENNE_LOG_FILE)

        # 1. Delete the previous file
        output_file = self.delete_previous_file(output_folder)

        # 2. Transform the cedrics XML into JATS
        cmd_folder = os.path.join(ptf_xsl_folder, "cedram")

        cmd_str = 'cd {}; {} cedram2ptf.py -v -x {} -p {} -o {} -b "" -l {} {} {} > {} 2>&1'.format(
            cmd_folder,
            os.path.join(settings.VIRTUALENV_DIR, "bin/python"),
            "-s" if self.colid in settings.MERSENNE_SEMINARS else "",
            self.input_file,
            output_folder,
            log_file + "1",
            # option -e for cedram2ptf.py for not removing email
            "-e" if not self.remove_email else "",
            "-t" if self.remove_date_prod else "",
            log_file,
        )

        log_file2 = log_file + "2"
        # utf-8 (not ascii): the error path below writes the converter's log
        # body, which may contain non-ascii characters.
        with open(log_file2, "w", encoding="utf-8") as file_:
            file_.write(cmd_str + "\n")

            # NOTE(review): appended for the conversion tooling; presumably
            # required by cedram2ptf.py's libraries — confirm.
            sys.path.append(ptf_xsl_folder + "/lib")

            try:
                result = subprocess.check_output(cmd_str, shell=True)
            except Exception as e:
                # Forward the converter's log in the exception so the caller
                # sees why the transformation failed.
                with open(log_file) as logfile_:
                    logfile_body = logfile_.read()
                message = str(e) + "\n" + logfile_body + "\n"
                file_.write(message)
                # no explicit close: the with statement closes the file
                raise RuntimeError(message)

            file_.write(str(result) + "\n")

        # Check if the output_file has been created
        if not os.path.isfile(output_file):
            raise RuntimeError("The file was not converted in JATS")

        with open(output_file, encoding="utf-8") as f:
            self.body = f.read()

        # 3. Parse the generated JATS issue.
        parser = etree.XMLParser(
            huge_tree=True, recover=True, remove_blank_text=True, remove_comments=True
        )
        tree = etree.fromstring(self.body.encode("utf-8"), parser=parser)
        self.xissue = jats_parser.JatsIssue(tree=tree)
        self.warnings.extend(self.xissue.warnings)

    def internal_do(self):
        super().internal_do()

        # Convert/parse the Cedrics XML unless a pre-parsed issue was given.
        if not self.xissue:
            self.import_cedrics_issue()

        if self.diff_only:
            result = self.compare_issue()
        else:
            result = self.import_in_db()

        return result
2222# import from /cedram_dev/production_tex/CEDRAM
class importCedricsIssueDirectlyXmlCmd(importCedricsIssueXmlCmd):
    """
    Import a Cedrics issue directly from /cedram_dev/production_tex/CEDRAM,
    parsing the Cedrics XML without the Cedrics -> JATS transformation.
    """

    def __init__(self, params=None):
        self.is_seminar = False
        self.article_folders = None
        self.force_dois = True
        super().__init__(params)

    def read_file(self, filename, skip_lines=2):
        """
        Return the lines of *filename*, skipping the first skip_lines + 1
        lines (indexes 0..skip_lines: the XML prolog of each article file).

        Files are read as utf-8; legacy files fall back to iso-8859-1.
        """
        try:
            with open(filename, encoding="utf-8") as fr:
                return [line for i, line in enumerate(fr) if i > skip_lines]
        except UnicodeDecodeError:
            # Legacy encoding: iso-8859-1 accepts any byte sequence.
            with open(filename, encoding="iso-8859-1") as fr:
                return [line for i, line in enumerate(fr) if i > skip_lines]

    def import_cedrics_issue(self):
        """
        Parse the Cedrics XML directly, without Cedrics -> JATS transformation
        The deplace_fasc script is no longer needed, but the Cedrics issue XML has to be created
        Workflow
        1. Get the list of articles from /cedram_dev/production_tex/CEDRAM
        2. Cat the article XML files into one issue.XML
        3. Read the Cedrics issue.XML

        :return:
        """

        output_folder = settings.MERSENNE_TMP_FOLDER
        output_file = self.delete_previous_file(output_folder)

        basename = os.path.basename(self.input_file)
        if "-cdrxml" in basename:
            pid = basename.split("-cdrxml.")[0]
        else:
            pid = basename.split(".xml")[0]

        # 1. Get the list of articles
        tex_src_folder = resolver.get_cedram_issue_tex_folder(self.colid, pid)
        self.article_folders, self.dois = resolver.get_cedram_tex_folders(self.colid, pid)

        # 2. Create the issue XML file
        with open(output_file, "w", encoding="utf-8") as fw:
            # 2.a. Start the issue.xml based on @pid-cdrxml.xml
            fw.write('<?xml version="1.0" encoding="utf-8" standalone="no"?>\n')
            fw.write('<!DOCTYPE cedram SYSTEM "/home/cedram/XML/dtd/cedram.dtd">\n')
            fw.write("<cedram>\n")

            fw.writelines(self.read_file(self.input_file))

            # 2.b. Cat the article XML files
            for basename in self.article_folders:
                src_file = os.path.join(tex_src_folder, basename, basename + "-cdrxml.xml")
                fw.writelines(self.read_file(src_file))

            fw.write("</cedram>\n")

        # 3. Read the Cedrics issue.XML
        with open(output_file, encoding="utf-8") as f:
            self.body = f.read()

        parser = etree.XMLParser(
            huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
        )
        tree = etree.fromstring(self.body.encode("utf-8"), parser=parser)
        self.xissue = cedrics_parser.CedricsIssue(
            tree=tree,
            is_seminar=self.is_seminar,
            ignore_date_published=self.remove_date_prod,
            article_folders=self.article_folders,
            dois=self.dois,
        )
        if self.force_dois:
            # In this workflow every article is required to have a DOI.
            for xarticle in self.xissue.articles:
                if xarticle.doi is None:
                    raise ValueError(xarticle.pid, "n'a pas de doi")

        self.warnings.extend(self.xissue.warnings)

    def import_in_db(self):
        """
        Add/update the parsed issue (self.xissue) in Django/Solr, then import
        the HTML full text of its articles.
        """
        params = {
            "assign_doi": False,
            "full_text_folder": settings.CEDRAM_TEX_FOLDER,  # the full text for SolR is in a separate file
            "keep_metadata": True,
            "keep_translations": True,  # The cedrics XML does not have the translations. backup/restore them.
            "use_body": False,
            "xissue": self.xissue,
            "backup_folder": settings.MERSENNE_TMP_FOLDER,  # temp folder used to backup/restore info during the import
            "from_folder": settings.CEDRAM_TEX_FOLDER,
            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER if self.copy_files else None,
        }

        cmd = addOrUpdateIssueXmlCmd(params)
        issue = cmd.do()
        self.warnings.extend(cmd.get_warnings())

        self.import_full_text(issue)

        return issue
class addCedricsIssueXmlCmd(addXmlCmd):
    """Parse a Cedrics issue XML tree into an xissue data object."""

    # Class-level defaults; params may override them on the instance.
    assign_doi = False
    full_text_folder = ""
    import_folder = None
    prod_deployed_date_iso_8601_date_str = None
    xissue = None
    remove_blank_text = False
    is_seminar = False

    def internal_do(self):
        super().internal_do()

        # Build the issue data object from the lxml tree prepared by addXmlCmd.
        parsed_issue = cedrics_parser.CedricsIssue(tree=self.tree, is_seminar=self.is_seminar)
        self.xissue = parsed_issue
        return parsed_issue
class addorUpdateCedricsArticleXmlCmd(baseCmd):
    """
    Add or update a single Cedrics article inside an existing issue.

    Required params: container_pid (pid of the issue), article_folder_name
    (TeX folder of the article under CEDRAM_TEX_FOLDER/<colid>/<issue>).
    Existing article data (extids, deployed dates, translations) is backed up
    before the re-import and restored afterwards.
    """

    def __init__(self, params=None):
        self.container_pid = None
        self.article_folder_name = None

        super().__init__(params)

        self.required_params.extend(["container_pid", "article_folder_name"])

    def internal_do(self):
        super().internal_do()

        issue = model_helpers.get_container(self.container_pid)
        if not issue:
            raise exceptions.ResourceDoesNotExist(f"Issue {self.container_pid} does not exist")

        colid = issue.my_collection.pid
        article_folder = os.path.join(
            settings.CEDRAM_TEX_FOLDER, colid, self.container_pid, self.article_folder_name
        )

        # 1. Read the Cedrics article.XML
        input_file = os.path.join(article_folder, f"{self.article_folder_name}-cdrxml.xml")
        with open(input_file, encoding="utf-8") as f:
            body = f.read()

        # 2. Parse the file and create an xarticle
        is_seminar = colid in settings.MERSENNE_SEMINARS
        parser = etree.XMLParser(
            huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
        )
        tree = etree.fromstring(body.encode("utf-8"), parser=parser)
        xarticle = cedrics_parser.CedricsArticle(
            tree=tree,
            colid=colid,
            issue_id=self.container_pid,
            is_seminar=is_seminar,
            ignore_date_published=True,
            article_folder=self.article_folder_name,
        )
        if xarticle.doi is None:
            raise ValueError(xarticle.pid, "n'a pas de doi")

        # Get the article position in its issue (seq) to preserve its order
        article_folders, dois = resolver.get_cedram_tex_folders(colid, self.container_pid)
        i = 1
        for folder in article_folders:
            if folder == self.article_folder_name:
                xarticle.seq = i
            i += 1

        existing_article = model_helpers.get_article(xarticle.pid)
        temp_folder = settings.MERSENNE_TMP_FOLDER

        # 3. Backup/delete the existing article
        if existing_article:
            # Start with a backup of the whole issue, in case of a bug.
            ptf_cmds.exportPtfCmd(
                {
                    "pid": self.container_pid,
                    "with_internal_data": True,
                    "with_binary_files": False,
                    "for_archive": False,
                    "export_folder": os.path.join(temp_folder, "backup"),
                }
            ).do()

            # Save the extra data (extid, deployed_date, ...) in a json file
            params = {
                "pid": existing_article.pid,
                "export_folder": temp_folder,
                "export_all": True,
                "with_binary_files": True,
            }
            ptf_cmds.exportExtraDataPtfCmd(params).do()

            backup_obj_not_in_metadata(existing_article)
            backup_translation(existing_article)

            # No need to delete the existing article:
            # addArticleXmlCmd does it in standalone mode

        # 4. Add the article in Django/SolR
        params = {
            "xarticle": xarticle,
            "issue": issue,
            "standalone": True,
            "use_body": False,  # No self.body with the content of the XML file; xarticle is passed directly
            "full_text_folder": settings.CEDRAM_TEX_FOLDER,  # the full text for SolR is in a separate file
            # temp folder used to backup/restore info during the import
            "from_folder": settings.CEDRAM_TEX_FOLDER,
            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER,
            "keep_translations": True,
        }

        cmd = addArticleXmlCmd(params)
        cmd.set_collection(issue.my_collection)
        article = cmd.do()

        # 5. Read the full text in HTML
        xml_file = os.path.join(article_folder, "FullText", self.article_folder_name + ".xml")
        if os.path.isfile(xml_file):
            with open(xml_file, encoding="utf-8") as f:
                body = f.read()

            cmd = addBodyInHtmlXmlCmd(
                {
                    "body": body,
                    "from_folder": settings.CEDRAM_XML_FOLDER,
                    # needed to copy the binary files (images...)
                    "to_folder": settings.MERSENNE_TEST_DATA_FOLDER,  # idem
                    "remove_blank_text": False,
                }
            )
            cmd.set_article(article)
            cmd.do()

        # 6. Add the ojs-id for ptf-tools
        cmd = ptf_cmds.updateResourceIdPtfCmd(
            {"id_type": "ojs-id", "id_value": self.article_folder_name}
        )
        cmd.set_resource(article)
        cmd.do()

        # 7. Restore the extra data (extid, deployed_date, ...)
        if existing_article:
            ptf_cmds.importExtraDataPtfCmd(
                {"pid": existing_article.pid, "import_folder": temp_folder}
            ).do()

            restore_obj_not_in_metadata(article)
            restore_translation(article)

        return article
class transformBodyInHtmlXmlCmd(addXmlCmd):
    """
    Transform the JATS body of an article into HTML with the PTF stylesheet.

    TODO: handle images,...
    """

    use_body = False

    def internal_do(self):
        super().internal_do()

        # Compile the stylesheet, then apply it to the parsed JATS tree.
        transform = etree.XSLT(etree.parse(settings.PTF_HTML_XSL))
        html_root = transform(self.tree).getroot()

        # Only the inner XML of the generated <main> element is returned.
        main_node = html_root.find("body/article/main")
        return xmldata_jats.innerxml(main_node).decode("utf-8")
class addBodyInHtmlXmlCmd(addXmlCmd):
    """
    Read the JATS body of an article and store the corresponding HTML.

    Either the article object or its pid must be supplied; the missing one
    is derived in pre_do().

    TODO: handle images,... manage warnings for unused tag ?
    """

    def __init__(self, params=None):
        self.article = None
        self.pid = None

        super().__init__(params)

    def set_article(self, article):
        self.article = article

    def pre_do(self):
        super().pre_do()

        if self.article is None and self.pid is None:
            raise ValueError("pid et article sont vides")

        # Derive whichever of (article, pid) was not supplied.
        if self.article is None:
            self.article = model_helpers.get_article(self.pid)
        if self.pid is None:
            self.pid = self.article.pid

    def internal_do(self):
        super().internal_do()

        parsed = jats_parser.JatsArticle(tree=self.tree, pid=self.pid)
        # Should we collect the warnings of the HTML parsing?
        # self.warnings.extend(parsed.warnings)

        # Replace the html-image related objects with the freshly parsed figures.
        self.article.relatedobject_set.filter(rel="html-image").delete()
        self.add_objects_with_location(parsed.figures, self.article, "RelatedObject")

        update_params = {
            "body_html": parsed.body_html,
            "body_tex": parsed.body_tex,
            "body_xml": parsed.body_xml,
            "use_page_count": False,
        }
        update_cmd = ptf_cmds.updateArticlePtfCmd(update_params)
        update_cmd.set_article(self.article)
        update_cmd.do()

        # updateArticlePtfCmd is not an addPtfCmd, so copy_binary_files (which
        # calls resolver.copy_html_images) is not triggered: copy the article
        # images explicitly here.
        resolver.copy_html_images(
            self.article, settings.MERSENNE_TEST_DATA_FOLDER, settings.CEDRAM_XML_FOLDER
        )
class updateCacheXmlCmd(baseCmd):
    """
    recreate the citation_html field of the bibitems

    Params: colid: pid of the collection to process
            start_id: optional container pid to resume the processing from
    """

    def __init__(self, params=None):
        # pid of the collection to process
        self.colid = None
        # optional container pid: processing starts at this issue
        self.start_id = None

        super().__init__(params)

        self.required_params.extend(["colid"])
2591 def update_article(self, xarticle):
2592 article = model_helpers.get_article(xarticle.pid)
2593 if article is None:
2594 raise exceptions.ResourceDoesNotExist(f"Article {xarticle.pid} does not exist")
2596 article.title_html = xarticle.title_html
2597 article.title_tex = xarticle.title_tex
2598 article.trans_title_html = xarticle.trans_title_html
2599 article.trans_title_tex = xarticle.trans_title_tex
2600 article.save()
2602 for xabstract, abstract in zip(xarticle.abstracts, article.abstract_set.all()):
2603 abstract.value_html = xabstract["value_html"]
2604 abstract.value_tex = xabstract["value_tex"]
2605 abstract.save()
2607 # for xkwd_group, kwd_group in zip(xarticle.kwd_groups, article.kwdgroup_set.all()):
2608 # kwd_group.value_html = xkwd_group['value_html']
2609 # kwd_group.value_tex = xkwd_group['value_tex']
2610 # kwd_group.save()
2612 for xbib, bib in zip(xarticle.bibitems, article.bibitem_set.all()):
2613 bib.citation_html = xbib.citation_html
2614 bib.citation_tex = xbib.citation_tex
2615 bib.article_title_tex = xbib.article_title_tex
2616 bib.chapter_title_tex = xbib.chapter_title_tex
2617 bib.source_tex = xbib.source_tex
2618 bib.volume = xbib.volume
2619 bib.save()
2621 if hasattr(settings, "SHOW_BODY") and settings.SHOW_BODY:
2622 params = {
2623 "body_html": xarticle.body_html,
2624 "body_tex": xarticle.body_tex,
2625 "body_xml": xarticle.body_xml,
2626 "use_page_count": False,
2627 }
2629 cmd = ptf_cmds.updateArticlePtfCmd(params)
2630 cmd.set_article(article)
2631 cmd.do()
2633 def internal_do(self):
2634 super().internal_do()
2636 collection = model_helpers.get_collection(self.colid)
2637 if collection is None:
2638 raise exceptions.ResourceDoesNotExist(f"Collection {self.colid} does not exist")
2640 qs = collection.content.all().order_by("pid")
2641 start = self.start_id is None
2642 for container in qs:
2643 if not start and container.pid == self.start_id:
2644 start = True
2646 if start:
2647 print(container.pid)
2648 with_body = hasattr(settings, "SHOW_BODY") and settings.SHOW_BODY
2649 xml_body = ptf_cmds.exportPtfCmd(
2650 {"pid": container.pid, "with_body": with_body}
2651 ).do()
2653 parser = etree.XMLParser(
2654 huge_tree=True,
2655 recover=True,
2656 remove_blank_text=False,
2657 remove_comments=True,
2658 resolve_entities=True,
2659 )
2660 tree = etree.fromstring(xml_body.encode("utf-8"), parser=parser)
2661 xissue = jats_parser.JatsIssue(tree=tree)
2663 for xarticle in xissue:
2664 self.update_article(xarticle)