Coverage for src/ptf/cmds/xml/cedrics/cedrics_parser.py: 10%
1098 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-05 09:56 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-05 09:56 +0000
1##################################################################################################
2#
3# README
4#
5# cedrics_parser.py is the equivalent of jats_parser for Cedrics XML
6#
7# Bugs fixed:
8# - <xref> with url in "dx.doi.org" were filtered in ptf-xsl
9# - Non-structured references (bibitemdata in Cedrics) got ext_links only if the <xref> node
10# was a direct child of the <bibitemdata> node
11# - comments that started with ' ' were ignored (AIF_2008__58_2_689_0 [9])
12#
13##################################################################################################
15import html
16import re
17from operator import attrgetter
19from django.conf import settings
20from django.utils import timezone
22from ptf.cmds.xml.citation_html import get_citation_html
23from ptf.cmds.xml.xml_base import RefBase
24from ptf.cmds.xml.xml_base import XmlParserBase
25from ptf.cmds.xml.xml_utils import clean_doi
26from ptf.cmds.xml.xml_utils import escape
27from ptf.cmds.xml.xml_utils import fix_mfenced_in_mathml
28from ptf.cmds.xml.xml_utils import get_contrib_xml
29from ptf.cmds.xml.xml_utils import get_normalized_attrib
30from ptf.cmds.xml.xml_utils import get_text_from_node
31from ptf.cmds.xml.xml_utils import get_xml_from_node
32from ptf.cmds.xml.xml_utils import helper_update_name_params
33from ptf.cmds.xml.xml_utils import int_to_Roman
34from ptf.cmds.xml.xml_utils import make_links_clickable
35from ptf.cmds.xml.xml_utils import normalize
36from ptf.cmds.xml.xml_utils import normalize_space
37from ptf.cmds.xml.xml_utils import replace_html_entities
38from ptf.cmds.xml.xml_utils import split_kwds
39from ptf.model_data import ArticleData
40from ptf.model_data import Foo
41from ptf.model_data import IssueData
42from ptf.model_data import JournalData
43from ptf.model_data import PublisherData
44from ptf.model_data import create_contributor
def helper_add_link_from_node(node):
    """
    Build the display text of a link node.

    The link data comes from the module-level ``get_data_from_<tag>`` function
    matching the normalized tag of the node (dashes replaced by underscores).

    :param node: XML node describing a link
    :return: the node text (or "") when the link data has a "rel"; otherwise ""
        when the location contains "www.numdam.org", else the result of
        make_links_clickable(location, metadata)
    """
    extractor = globals()["get_data_from_" + normalize(node.tag).replace("-", "_")]
    link = extractor(node)

    if link["rel"]:
        return node.text or ""

    location = link["location"]
    if "www.numdam.org" in location:
        return ""

    return make_links_clickable(location, link["metadata"])
def get_data_from_custom_meta(node):
    """
    Extract the name and the value stored in the children of a node.

    :param node: XML node whose children may include <meta-name> and <meta-value>
    :return: tuple (name, value). Each item is "" when the corresponding child
        is absent; when a child is repeated, the last one wins.
    """
    found = {"meta-name": "", "meta-value": ""}

    for child in node:
        key = normalize(child.tag)
        if key in found:
            found[key] = child.text

    return found["meta-name"], found["meta-value"]
def get_data_from_date(node):
    """
    Return the date stored in a node as a string.

    The "iso-8601-date" attribute is returned unchanged when it is present.
    Otherwise the date is assembled as year[-month][-day] from the <year>,
    <month> and <day> children; month and day are only appended to a
    non-empty year.

    :param node: XML node with an ``attrib`` mapping and date children
    :return: the date string
    """
    if "iso-8601-date" in node.attrib:
        return node.attrib["iso-8601-date"]

    parts = {"year": "", "month": "", "day": ""}
    for child in node:
        name = normalize(child.tag)
        if name in parts:
            parts[name] = child.text

    date_str = parts["year"]
    for extra in (parts["month"], parts["day"]):
        if date_str and extra:
            date_str += "-" + extra

    return date_str
def get_data_from_ext_link(node):
    """
    Describe an <ext-link> node as a link dictionary.

    :param node: XML node; "ext-link-type" is read with node.get, "href" and
        "base" with get_normalized_attrib
    :return: dict with the keys "rel", "mimetype" (always ""), "location",
        "base" and "metadata" (the node text). Missing values become "".
    """
    return {
        "rel": node.get("ext-link-type") or "",
        "mimetype": "",
        "location": get_normalized_attrib(node, "href") or "",
        "base": get_normalized_attrib(node, "base") or "",
        "metadata": node.text or "",
    }
def get_data_from_history(node):
    """
    Collect the typed dates found among the children of a node.

    Children without a "date-type" attribute are skipped.

    :param node: XML node whose children hold the dates
    :return: list of {"type": <date-type>, "date": <date string>} in document order
    """
    # TODO: transform history_dates in a hash where date-type is the key
    #       => Change database_cmds
    return [
        {"type": child.attrib["date-type"], "date": get_data_from_date(child)}
        for child in node
        if "date-type" in child.attrib
    ]
def get_data_from_uri(node):
    """
    Describe a <uri> node as a link dictionary.

    :param node: XML node; its "href" is read with get_normalized_attrib
    :return: dict with "location" (the href) and "metadata" (the node text),
        both defaulting to ""; "rel", "mimetype" and "base" are always ""
    """
    location = get_normalized_attrib(node, "href") or ""
    label = node.text or ""

    return {"rel": "", "mimetype": "", "location": location, "base": "", "metadata": label}
138class CedricsBase(XmlParserBase):
def __init__(self, *args, **kwargs):
    """
    Initialize the parent parser and start with an empty list of warnings.

    Positional and keyword arguments are accepted but are not forwarded to
    the parent constructor.
    """
    super().__init__()
    self.warnings = []
def parse_tree(self, tree):
    """
    Hook that does nothing in this class.

    :param tree: unused here
    :return: None
    """
def set_titles(self):
    """
    Hook called by post_parse_tree; does nothing in this class.

    :return: None
    """
def post_parse_tree(self):
    """
    Run the post-processing step: call set_titles().

    :return: None
    """
    self.set_titles()
def filter_text(self, text):
    """
    Remove every "<allowbreak/>" marker from a text.

    :param text: string to clean
    :return: the text without the markers
    """
    return text.replace("<allowbreak/>", "")
def get_location_from_xref(self, node, **kwargs):
    """
    Return the target of an <xref> node.

    :param node: the <xref> node
    :param kwargs: unused
    :return: the "url" attribute when it is set, otherwise the text of the
        node passed through filter_text
    """
    url = get_normalized_attrib(node, "url") or ""
    if url:
        return url

    return self.filter_text(get_text_from_node(node))
def get_data_from_xref(self, node, **kwargs):
    """
    Describe an <xref> node as a link dictionary.

    :param node: the <xref> node
    :param kwargs: parsing flags; "is_comment" and "is_bibitemdata" are read
        here and all of them are forwarded to parse_node_inner
    :return: dict with "rel", "mimetype", "location", "base", "metadata" (HTML
        of the node content) and "xml_text". Inside a bibitemdata, the HTML is
        also stored in "bibitemdata_display" and "metadata" is emptied when
        the node has no text of its own.
    """
    location = get_normalized_attrib(node, "url") or ""

    # TODO: BUG in JATS. JEP_2017__4__435_0 [9]
    #       The comment has an ext-link with a display embedded in <monospace>
    #       jats_parser produces 2 <a> (1 for the <ext-link>, 1 for the text inside the <monospace>
    #       The code below should be removed
    if kwargs.get("is_comment") and node.text is None:
        kwargs["add_HTML_link"] = True

    display_html, _, display_xml = self.parse_node_inner(node, None, **kwargs)

    in_bibitemdata = kwargs.get("is_bibitemdata", False)

    if not location:
        # No url attribute: the text of the node is the location
        location = self.filter_text(get_text_from_node(node))

    metadata = "" if (in_bibitemdata and node.text is None) else display_html

    data = {
        "rel": "",
        "mimetype": "",
        "location": location,
        "base": "",
        "metadata": metadata,
        "xml_text": display_xml,
    }
    if in_bibitemdata:
        data["bibitemdata_display"] = display_html

    return data
def get_numeric_value(self, node):
    """
    Return the value held by a node, honoring its numbering system.

    :param node: XML node; its "systnum" attribute is compared case-insensitively
    :return: the node text, converted with int_to_Roman(int(text)) when
        "systnum" is "romain"
    """
    numbering = (node.get("systnum") or "").lower()
    if numbering == "romain":
        return int_to_Roman(int(node.text))

    return node.text
def parse_node_inner(self, node, tex_node, **kwargs):
    """
    Parse the content of a node: its leading text, then each child.

    Used by parse_node_with_mixed_content for nodes that have a different tag in JATS or HTML.
    Children are parsed with "is_top" forced to False, each one paired with the
    child of tex_node at the same index (None when there is none).

    :param node: XML node to parse
    :param tex_node: sibling node holding the TeX version, or None
    :param kwargs: parsing flags; "add_HTML_link" is read here
    :return: tuple (HTML text, TeX text, JATS XML text)
    """
    kwargs["is_top"] = False
    html_parts, tex_parts, xml_parts = [], [], []

    leading = node.text
    if leading:
        # The newline right after the opening tag of a list or an item is dropped
        if leading[0] == "\n" and node.tag in ("list", "item"):
            leading = leading[1:]

        xml_parts.append(escape(leading))
        html_parts.append(leading)
        tex_parts.append(leading)

    for index, child in enumerate(node):
        has_tex_child = tex_node is not None and len(tex_node) > index
        tex_child = tex_node[index] if has_tex_child else None

        child_html, child_tex, child_xml = self.parse_node_with_mixed_content(
            child, tex_child, **kwargs
        )
        html_parts.append(child_html)
        tex_parts.append(child_tex)
        xml_parts.append(child_xml)

    inner_html_text = "".join(html_parts)

    # Links are only added when the HTML does not start with a newline or a space
    if kwargs.get("add_HTML_link") and not re.match(r"[\n ]+", inner_html_text):
        inner_html_text = make_links_clickable(inner_html_text, inner_html_text)

    return inner_html_text, "".join(tex_parts), "".join(xml_parts)
def parse_node_with_b(self, node, tex_node, **kwargs):
    """
    Parse a bold node.

    :return: tuple (HTML, TeX, JATS XML): the inner content wrapped in <strong>
        for HTML and TeX, and in <bold> for the XML (<bold/> when empty)
    """
    inner_html, inner_tex, inner_xml = self.parse_node_inner(node, tex_node, **kwargs)

    xml_text = f"<bold>{inner_xml}</bold>" if inner_xml else "<bold/>"

    return f"<strong>{inner_html}</strong>", f"<strong>{inner_tex}</strong>", xml_text
def parse_node_with_cit(self, node, tex_node, **kwargs):
    """
    Parse a <cit> node: only its text is kept.

    :return: tuple (HTML, TeX, JATS XML); HTML and TeX are the text of the
        node, the XML is the same text escaped
    """
    text = get_text_from_node(node)

    return text, text, escape(text)
def parse_node_with_hi(self, node, tex_node, **kwargs):
    """
    Parse a <hi> node by dispatching on its "rend" attribute.

    "it" is handled by parse_node_with_i and "bold" by parse_node_with_b. Any
    other value is looked up as a ``parse_node_with_<rend>`` method (dashes
    replaced by underscores). When the attribute is missing, or when no such
    method exists, only the inner content of the node is parsed.

    :param node: the <hi> node
    :param tex_node: sibling node holding the TeX version, or None
    :param kwargs: parsing flags, forwarded unchanged
    :return: tuple (HTML text, TeX text, JATS XML text)
    """
    rend = node.get("rend")

    if rend == "it":
        return self.parse_node_with_i(node, tex_node, **kwargs)
    if rend == "bold":
        return self.parse_node_with_b(node, tex_node, **kwargs)

    # A missing "rend" used to raise an AttributeError on rend.replace(), and
    # rend="hi" would dispatch back to this very method forever.
    if rend and rend != "hi":
        ftor = getattr(self, "parse_node_with_" + rend.replace("-", "_"), None)
        if callable(ftor):
            return ftor(node, tex_node, **kwargs)

    return self.parse_node_inner(node, tex_node, **kwargs)
def parse_node_with_i(self, node, tex_node, **kwargs):
    """
    Parse an italic node.

    :param kwargs: parsing flags; "is_citation", "is_bibitemdata" and
        "is_comment" decide whether the HTML gets a span
    :return: tuple (HTML, TeX, JATS XML). TeX is wrapped in <i>, the XML in
        <italic> (<italic/> when empty). The HTML is wrapped in
        <span class="italique"> unless it is empty or the node belongs to a
        citation that is neither a bibitemdata nor a comment.
    """
    # TODO: BUG in JATS: unlike <monospace>, no HTML links are added in italics
    kwargs["add_HTML_link"] = False

    inner_html, inner_tex, inner_xml = self.parse_node_inner(node, tex_node, **kwargs)

    plain_citation = (
        kwargs.get("is_citation", False)
        and not kwargs.get("is_bibitemdata", False)
        and not kwargs.get("is_comment", False)
    )

    if inner_html == "" or plain_citation:
        html_text = inner_html
    else:
        html_text = f'<span class="italique">{inner_html}</span>'

    xml_text = f"<italic>{inner_xml}</italic>" if inner_xml else "<italic/>"

    return html_text, f"<i>{inner_tex}</i>", xml_text
def parse_node_with_label(self, node, tex_node, **kwargs):
    """
    Remember the text of a <label> node in self.list_item_label.

    The label itself produces no output here; parse_node_with_item reads
    self.list_item_label.

    :return: tuple of 3 empty strings (HTML, TeX, JATS XML)
    """
    self.list_item_label = get_text_from_node(node)

    return "", "", ""
def parse_node_with_large(self, node, tex_node, **kwargs):
    """
    Parse a <large> node.

    :return: tuple (HTML, TeX, JATS XML); HTML and TeX are the inner content
        unchanged, the XML is wrapped in <large>
    """
    html_text, tex_text, inner_xml = self.parse_node_inner(node, tex_node, **kwargs)

    return html_text, tex_text, f"<large>{inner_xml}</large>"
def parse_node_with_list(self, node, tex_node, **kwargs):
    """
    Parse a <list> node.

    The "type" attribute selects the HTML list: no type, "bullet" and "simple"
    give a <ul>; "order"/"number", "alpha-lower", "alpha-upper", "roman-lower"
    and "roman-upper" give an <ol> of the matching type; anything else gives a
    <ul> without bullets.

    :return: tuple (HTML, TeX, JATS XML); HTML and TeX share the same list
        tags, the XML is a <list> carrying the type as "list-type"
    """
    self.list_item_label = None

    inner_html, inner_tex, inner_xml = self.parse_node_inner(node, tex_node, **kwargs)

    list_type = node.get("type")

    opening = "<list>" if list_type is None else '<list list-type="' + list_type + '">'
    xml_text = opening + inner_xml + "</list>"

    ordered_types = {
        "order": "1",
        "number": "1",
        "alpha-lower": "a",
        "alpha-upper": "A",
        "roman-lower": "i",
        "roman-upper": "I",
    }

    if list_type is None or list_type in ("bullet", "simple"):
        prefix, suffix = "<ul>", "</ul>"
    elif list_type in ordered_types:
        prefix, suffix = f'<ol type="{ordered_types[list_type]}">', "</ol>"
    else:
        prefix, suffix = '<ul class="no-bullet" style="list-style-type:none;">', "</ul>"

    return prefix + inner_html + suffix, prefix + inner_tex + suffix, xml_text
def parse_node_with_item(self, node, tex_node, **kwargs):
    """
    Parse a list item.

    <list-item><label>LABEL</label><p>TEXT</p> becomes in HTML
    <li>LABEL TEXT</li>
    (same with <title>)

    The label is self.list_item_label when it is set, otherwise the "label"
    attribute of the node. self.list_item_label is reset to None and the
    content is parsed with "no_p" set to True.

    :param node: the item node
    :param tex_node: sibling node holding the TeX version, or None
    :return: tuple (HTML, TeX, JATS XML)
    """
    label = self.list_item_label or node.get("label") or ""
    self.list_item_label = None

    kwargs["no_p"] = True
    inner_html, inner_tex, inner_xml = self.parse_node_inner(node, tex_node, **kwargs)

    label_xml = ("<label>" + label + "</label>") if label else ""
    xml_text = "<list-item>" + label_xml + inner_xml + "</list-item>"

    opening = "<li>" + ((label + " ") if label else "")

    return opening + inner_html + "</li>", opening + inner_tex + "</li>", xml_text
409 def parse_node_with_formula(self, node, tex_node, **kwargs):
 # Build the HTML, TeX and JATS XML texts of a formula node.
 # node[0] is serialized as the math markup; tex_node, when given, supplies the
 # TeX source (its text) and the TeX environment (its "textype" attribute).
 # Returns the tuple (html_text, tex_text, jats_xml_text).
 # NOTE(review): the indentation of this block was lost in this dump; the nesting
 # of a few statements (flagged below) has to be checked against the real file.
410 # '\n' are added in this function, because the Cedrics -> XML transformation
411 # does not add xml:space="preserve" with formulas (?)
412 # An abstract with <p> and <formula> will have mix of "preserve".
414 html_text = tex_text = jats_xml_text = ""
 # NOTE(review): subscript access - if attrib behaves like a dict, a missing "type"
 # raises KeyError and the `or "inline"` fallback only covers an empty value. Confirm.
415 type_ = node.attrib["type"] or "inline"
416 tex_type = tex_node.attrib["textype"] if tex_node is not None else "inline"
418 math_node = node[0]
419 math_node_text = get_xml_from_node(math_node)
420 math_node_text = replace_html_entities(math_node_text)
421 # The Cedrics Mathml transform rounds up the width value
422 math_node_text = math_node_text.replace(".em", "em")
423 math_node_text = math_node_text.replace(".pt", "pt")
 # TeX delimiters: "$" by default, "\[" "\]" when the type is not "inline",
 # a \begin{...}/\end{...} pair when tex_type is neither "inline" nor "display".
 # NOTE(review): cannot tell from this dump whether the tex_type test is nested
 # inside the `type_ != "inline"` test.
425 tex_prefix = tex_suffix = "$"
426 if type_ != "inline":
427 tex_prefix = "\n\\["
428 tex_suffix = "\\]\n"
429 if tex_node is not None and tex_type not in ("inline", "display"):
430 tex_prefix = "\n\\begin{" + tex_type + "}\n"
431 tex_suffix = "\n\\end{" + tex_type + "}\n"
433 math_node_text = fix_mfenced_in_mathml(math_node_text)
 # "is_citation" must be present in kwargs; parse_node_with_mixed_content sets a
 # default for it before dispatching to the parse_node_with_* methods.
 # NOTE(review): cannot tell from this dump whether the mode="display" replacement
 # is inside this test.
435 if not kwargs["is_citation"]:
436 math_node_text = math_node_text.replace(
437 ' xmlns:xlink="http://www.w3.org/1999/xlink"', ""
438 )
439 math_node_text = math_node_text.replace('mode="display"', 'display="block"')
441 if tex_node is None:
442 # TODO: BUG in JATS. No need for a '$$' in the title if there is no tex formula
443 # The '$$' at the end of the next line is to be compatible with jats_parser
445 if type_ == "inline":
446 tex_node_text = "$$"
447 else:
448 tex_node_text = ""
449 else:
450 tex_node_text = tex_prefix + tex_node.text + tex_suffix
 # JATS XML: <inline-formula> or <disp-formula>, holding <alternatives> with the
 # math markup followed by the escaped TeX in <tex-math>.
452 if type_ == "inline":
453 jats_xml_text = "<inline-formula>"
454 else:
455 jats_xml_text = '<disp-formula xml:space="preserve">\n'
457 jats_xml_text += "<alternatives>" + math_node_text
458 jats_tex_text = escape(tex_node_text)
460 if type_ != "inline":
461 jats_xml_text += "\n"
463 jats_xml_text += "<tex-math>" + jats_tex_text + "</tex-math>"
465 if type_ != "inline":
466 jats_xml_text += "\n"
468 jats_xml_text += "</alternatives>"
470 if type_ == "inline":
471 jats_xml_text += "</inline-formula>"
472 else:
473 jats_xml_text += "\n</disp-formula>"
 # NOTE(review): the tail of the node is cleared here; presumably only for
 # non-inline formulas (inside the else above) - confirm against the real file.
474 node.tail = ""
476 if "bug_cedrics" in kwargs and kwargs["bug_cedrics"]:
477 # TODO: Bug in Cedrics. AIF_2012__62_6_2053_0 [16]
478 # If there is no texmath, a <tex-math>$$</tex-math> is added and
479 # get_text_from_node appends the 2.
480 tex_text = get_text_from_node(node)
481 if tex_node is None:
482 tex_text += "$$"
483 else:
484 tex_text = tex_node_text
 # HTML: a mathjax span with the TeX in data-tex (newlines removed when the type
 # is not "inline"); non-inline formulas are also wrapped in a "formula" table.
486 data_tex = tex_node_text if type_ == "inline" else tex_node_text.replace("\n", "")
487 html_text = f'<span class="mathjax-formula" data-tex="{data_tex}">{math_node_text}</span>'
489 if type_ != "inline":
490 prefix = '<table class="formula"><tr><td class="formula-inner">'
491 suffix = '</td><td class="formula-label"></td></tr></table>'
493 html_text = prefix + html_text + suffix
495 # tex_text = escape(tex_text)
497 return html_text, tex_text, jats_xml_text
def parse_node_with_mixed_content(self, node, tex_node, **kwargs):
    """
    Parse and return the text of an XML node which mixes text and XML sub-nodes.
    Ex: <node>text1 <a>text_a</a> text2 <b>text_b</b>b_tail</node>
    Some inner nodes are removed, others are kept or replaced.

    Cedrics XMLs store the MathML and the TeX formulas in 2 siblings.
    Parse the 2 nodes at the same time.

    The JATS xml string is constructed at the same time because it is used during a PTF export

    :param node: XML Node (with MathML); None gives 3 empty strings
    :param tex_node: XML Node (with TexMath), or None
    :param kwargs: params of the function; missing flags get a default value
    :return: HTML text, TeX text, XML text
    """

    html_text = tex_text = jats_xml_text = ""

    if node is None:
        return html_text, tex_text, jats_xml_text

    # Found 1 exception with <title>Дополнение к работе (AIF_2013__63_4)
    # The XML parser creates a different node with no tag for " "
    if type(node).__name__ != "_Element":
        html_text = tex_text = jats_xml_text = html.unescape(node.text or "")
        # "is_top" may not be set yet at this point: use its default (True)
        # instead of raising a KeyError on a top-level call.
        if node.tail and not kwargs.get("is_top", True):
            html_text += node.tail
            tex_text += node.tail
            jats_xml_text += escape(node.tail)
        return html_text, tex_text, jats_xml_text

    # The tail is the text following the end of the node
    # Ex: <node>text1<a>text_a</a>a_tail</node>
    # The HTML text has to include the tail
    #   only if this function was called recursively
    kwargs.setdefault("is_top", True)

    # sec_level is used to add <h1>, <h2>,... in the HTML text while parsing nodes like <sec>
    kwargs.setdefault("sec_level", 2)

    # Text in <comment> is parsed to add HTML link.
    kwargs.setdefault("add_HTML_link", False)

    # base_url to image links
    kwargs.setdefault("base_url", "")

    kwargs.setdefault("is_citation", False)
    kwargs.setdefault("is_comment", False)

    # TODO remove once jats_parser has been validated against xmldata
    kwargs.setdefault("temp_math", False)
    kwargs.setdefault("temp_tex", False)
    kwargs.setdefault("temp_mixed_citation", False)

    tag = normalize(node.tag)

    # pub-id/object-id are ignored by default as they are treated separately
    if not kwargs["is_comment"] and tag in ("pub-id", "object-id"):
        # NOTE(review): this print looks like a debugging leftover; kept to
        # preserve the current output.
        print(tag, "in", jats_xml_text)
        return html_text, tex_text, jats_xml_text

    if tag in ("bibitemdata", "toc"):
        kwargs["is_citation"] = True
        kwargs["temp_mixed_citation"] = True
    elif tag == "comment":
        kwargs["is_comment"] = True

    # I. Add the node's text.
    # Some tags have a corresponding parse_node_with_@tag method to generate the texts.
    tag_mapped = {
        "statement": "sec",
        "disp-formula": "inline-formula",
        "chapter-title": "article-title",
        "bold": "strong",
        "table": "table-generic",
        "th": "table-generic",
        "tr": "table-generic",
        "td": "table-generic",
        "thead": "table-generic",
        "tbody": "table-generic",
        "colgroup": "table-generic",
        "col": "table-generic",
        "em": "i",
    }

    # Check if the parse_node_with_@tag method exists
    fct_name = "parse_node_with_" + tag_mapped.get(tag, tag).replace("-", "_")
    ftor = getattr(self, fct_name, None)
    if callable(ftor):
        inner_texts = ftor(node, tex_node, **kwargs)
    else:
        # II. No dedicated method: add the node text, then the children texts
        # TODO Add HTML links to the text with URLs (<ext-link> and <uri> nodes)
        inner_texts = self.parse_node_inner(node, tex_node, **kwargs)

    inner_html_text, inner_tex_text, inner_jats_xml_text = inner_texts
    html_text += inner_html_text
    tex_text += inner_tex_text
    jats_xml_text += inner_jats_xml_text

    # III. Add the node's tail for children
    if node.tail and not kwargs["is_top"] and tag not in ("p", "list", "item", "label"):
        html_text += node.tail
        tex_text += node.tail
        jats_xml_text += escape(node.tail)

    return html_text, tex_text, jats_xml_text
def parse_node_with_p(self, node, tex_node, **kwargs):
    """
    Parse a <p> node.

    :param kwargs: parsing flags; with "no_p" set, the HTML gets no <p> tag
    :return: tuple (HTML, TeX, JATS XML). The HTML is wrapped in <p> (with the
        "specific-use" attribute of the node as class, if any), the TeX is the
        inner content unchanged and the XML is a <p xml:space="preserve">.
    """
    inner_html, inner_tex, inner_xml = self.parse_node_inner(node, tex_node, **kwargs)

    if kwargs.get("no_p"):
        # <p> inside <item> are removed in HTML to avoid a carriage return
        html_text = inner_html
    else:
        css_class = node.get("specific-use")
        opening = ('<p class="' + css_class + '">') if css_class else "<p>"
        html_text = opening + inner_html + "</p>"

    if inner_xml:
        xml_text = '<p xml:space="preserve">' + inner_xml + "</p>"
    else:
        xml_text = '<p xml:space="preserve"/>'

    # TODO: BUG in JATS (no <p> in the tex version)
    return html_text, inner_tex, xml_text
def parse_node_with_ref(self, node, tex_node, **kwargs):
    """
    Parse a <ref> node: it only appears in the JATS XML.

    :return: tuple (HTML, TeX, JATS XML); HTML and TeX are empty, the XML is
        an <xref ref-type="bibr"> holding the escaped text of the node
    """
    xml_text = '<xref ref-type="bibr">' + escape(node.text) + "</xref>"

    return "", "", xml_text
def parse_node_with_sansserif(self, node, tex_node, **kwargs):
    """
    Parse a <sansserif> node.

    :return: tuple (HTML, TeX, JATS XML); HTML and TeX are the inner content
        unchanged, the XML is wrapped in <sans-serif>
    """
    html_text, tex_text, inner_xml = self.parse_node_inner(node, tex_node, **kwargs)

    return html_text, tex_text, f"<sans-serif>{inner_xml}</sans-serif>"
def parse_node_with_sc(self, node, tex_node, **kwargs):
    """
    Parse a small caps node.

    :return: tuple (HTML, TeX, JATS XML); HTML and TeX are wrapped in
        <span class="smallcaps">, the XML in <sc> (<sc/> when empty)
    """
    inner_html, inner_tex, inner_xml = self.parse_node_inner(node, tex_node, **kwargs)

    span_open, span_close = '<span class="smallcaps">', "</span>"
    xml_text = f"<sc>{inner_xml}</sc>" if inner_xml else "<sc/>"

    return span_open + inner_html + span_close, span_open + inner_tex + span_close, xml_text
def parse_node_with_slanted(self, node, tex_node, **kwargs):
    """
    Parse a <slanted> node.

    :return: tuple (HTML, TeX, JATS XML); HTML and TeX are the inner content
        unchanged, the XML is wrapped in <slanted>
    """
    html_text, tex_text, inner_xml = self.parse_node_inner(node, tex_node, **kwargs)

    return html_text, tex_text, f"<slanted>{inner_xml}</slanted>"
def parse_node_with_small(self, node, tex_node, **kwargs):
    """
    Parse a <small> node.

    :return: tuple (HTML, TeX, JATS XML); HTML and TeX are the inner content
        unchanged, the XML is wrapped in <small>
    """
    html_text, tex_text, inner_xml = self.parse_node_inner(node, tex_node, **kwargs)

    return html_text, tex_text, f"<small>{inner_xml}</small>"
def parse_node_with_sub(self, node, tex_node, **kwargs):
    """
    Parse a subscript node.

    :return: tuple (HTML, TeX, JATS XML), each one being the matching inner
        content wrapped in <sub>
    """
    inner_texts = self.parse_node_inner(node, tex_node, **kwargs)
    html_text, tex_text, xml_text = (f"<sub>{inner}</sub>" for inner in inner_texts)

    return html_text, tex_text, xml_text
def parse_node_with_sup(self, node, tex_node, **kwargs):
    """
    Parse a superscript node.

    :return: tuple (HTML, TeX, JATS XML), each one being the matching inner
        content wrapped in <sup>
    """
    inner_texts = self.parse_node_inner(node, tex_node, **kwargs)
    html_text, tex_text, xml_text = (f"<sup>{inner}</sup>" for inner in inner_texts)

    return html_text, tex_text, xml_text
def parse_node_with_texmath(self, node, tex_node, **kwargs):
    """
    Parse a <texmath> node: only the TeX text is produced.

    :return: tuple (HTML, TeX, JATS XML); HTML and XML are empty, the TeX is
        the text of the node between two "$"
    """
    return "", "$" + get_text_from_node(node) + "$", ""
def parse_node_with_tt(self, node, tex_node, **kwargs):
    """
    Parse a <tt> node.

    :return: tuple (HTML, TeX, JATS XML); HTML and TeX are the inner content
        unchanged, the XML is wrapped in <monospace> (<monospace/> when empty)
    """
    html_text, tex_text, inner_xml = self.parse_node_inner(node, tex_node, **kwargs)

    xml_text = f"<monospace>{inner_xml}</monospace>" if inner_xml else "<monospace/>"

    return html_text, tex_text, xml_text
def parse_node_with_underline(self, node, tex_node, **kwargs):
    """
    Parse an <underline> node.

    :return: tuple (HTML, TeX, JATS XML); HTML and TeX are the inner content
        unchanged, the XML is wrapped in <underline>
    """
    html_text, tex_text, inner_xml = self.parse_node_inner(node, tex_node, **kwargs)

    return html_text, tex_text, f"<underline>{inner_xml}</underline>"
def parse_node_with_xref(self, node, tex_node, **kwargs):
    """
    Parse an xref.
    Extract extids (doi, mr-item-id,...) and ext_links

    The content of the node is parsed without its TeX sibling and with
    "add_HTML_link" forced to False. Unless the xref is in a comment or
    "add_ext_link" is False, the link is passed to
    add_extids_from_node_with_link; it is appended to self.ext_links when the
    first item of the returned value is None and the link is not already there.

    :param node: the <xref> node
    :param tex_node: not used for the content of the xref
    :param kwargs: parsing flags; "is_comment" and "add_ext_link" are read here
    :return: html_text, tex_text, xml_text
    """
    location = self.get_location_from_xref(node)

    kwargs["add_HTML_link"] = False
    inner_html, inner_tex, inner_xml = self.parse_node_inner(node, None, **kwargs)

    html_text = make_links_clickable(location, inner_html)
    tex_text = make_links_clickable(location, inner_tex)

    # No ext-links is added while parsing titles or abstracts
    wants_ext_link = kwargs.get("add_ext_link", True) and not kwargs.get("is_comment")

    xref_data = {
        "rel": "",
        "mimetype": "",
        "location": location,
        "base": "",
        "metadata": inner_html,
    }

    extid_value = (None, None)
    if wants_ext_link:
        extid_value = self.add_extids_from_node_with_link(xref_data)

    # The <ext-link> is always written in the XML, whether or not the xref is
    # converted into an extid
    xml_text = '<ext-link xlink:href="' + html.escape(location) + '">' + inner_xml + "</ext-link>"

    if wants_ext_link and extid_value[0] is None and xref_data not in self.ext_links:
        self.ext_links.append(xref_data)

    return html_text, tex_text, xml_text
def parse_article_subject(self, node):
    """
    Add the comma separated subjects of a node to self.subjs.

    Each subject is stored with the type "subject" and the language of the
    node (self.lang when the node has none); leading spaces are removed.

    :param node: XML node whose text holds the subjects
    """
    lang = get_normalized_attrib(node, "lang") or self.lang

    self.subjs.extend(
        [
            {"type": "subject", "lang": lang, "value": item.lstrip()}
            for item in node.text.split(",")
        ]
    )
def parse_article_subjects(self, node):
    """
    Parse every <article-subject> child of a node; other children are ignored.

    :param node: XML node holding the subjects
    """
    for child in node:
        if normalize(child.tag) == "article-subject":
            self.parse_article_subject(child)
def parse_article_type(self, node):
    """
    Add the text of a node to self.subjs with the type "type".

    The language is the one of the node, or self.lang when the node has none.

    :param node: XML node whose text is the article type
    """
    lang = get_normalized_attrib(node, "lang") or self.lang

    self.subjs.append({"type": "type", "lang": lang, "value": node.text})
def parse_article_types(self, node):
    """
    Parse every <article-type> child of a node.

    Nothing is done when self.has_articletype is set.

    :param node: XML node holding the article types
    """
    # 2023/12/05 <articletype> has been added to store the type
    if not self.has_articletype:
        for child in node:
            if normalize(child.tag) == "article-type":
                self.parse_article_type(child)
def parse_articletype(self, node):
    """
    Store the text of an <articletype> node in self.atype and set
    self.has_articletype.

    :param node: XML node whose text is the article type
    """
    self.has_articletype = True
    self.atype = node.text
def parse_auteur(self, node, is_ref=False):
    """
    Parse an <auteur> node as a contributor with the role "author".

    :param node: the <auteur> node
    :param is_ref: forwarded to parse_common_contrib
    """
    role = "author"
    self.parse_common_contrib(node, role, is_ref)
def _get_abstract_data(self, node, abstract_type: str = None):
    """
    Build the dictionary describing an abstract.

    The node is parsed together with its next sibling (node.getnext()) and
    without ext-links. Without an abstract_type, the XML is an <abstract> when
    the node language equals self.lang or when self.lang is "und", and a
    <trans-abstract> otherwise. With an abstract_type, the XML is an
    <abstract> carrying it as "abstract-type".

    :param node: XML node holding the abstract
    :param abstract_type: optional type of the abstract
    :return: dict with "tag" ("" without abstract_type), "lang", "value_xml",
        "value_html" and "value_tex"
    """
    value_html, value_tex, inner_xml = self.parse_node_with_mixed_content(
        node, node.getnext(), add_ext_link=False
    )

    lang = get_normalized_attrib(node, "lang") or ""

    if abstract_type is not None:
        value_xml = (
            f'<abstract xml:lang="{lang}" abstract-type="{abstract_type}">{inner_xml}</abstract>'
        )
    elif lang == self.lang:
        value_xml = f"<abstract>{inner_xml}</abstract>"
    elif self.lang == "und":
        value_xml = f'<abstract xml:lang="{lang}">{inner_xml}</abstract>'
    else:
        value_xml = f'<trans-abstract xml:lang="{lang}">{inner_xml}</trans-abstract>'

    return {
        "tag": "" if abstract_type is None else abstract_type,
        "lang": lang,
        "value_xml": value_xml,
        "value_html": value_html,
        "value_tex": value_tex,
    }
def parse_avertissement(self, node):
    """
    Append an <avertissement> node to self.abstracts with the type "avertissement".

    :param node: the <avertissement> node
    """
    abstract_data = self._get_abstract_data(node, "avertissement")
    self.abstracts.append(abstract_data)
def parse_note(self, node):
    """
    Append a <note> node to self.abstracts with the type "note".

    :param node: the <note> node
    """
    abstract_data = self._get_abstract_data(node, "note")
    self.abstracts.append(abstract_data)
def parse_biblio(self, node):
    """
    Parse the <bib_entry> children of a bibliography node.

    Each entry becomes a CedricsRef appended to self.bibitems, and its
    citation_html is appended to self.bibitem. An entry is a mixed citation
    when its "type" (or, failing that, the "type" of the bibliography) is
    "flat". The bibitems are sorted at the end.

    :param node: the bibliography node
    """
    default_type = node.get("type") or ""

    entries = (child for child in node if normalize(child.tag) == "bib_entry")
    for entry in entries:
        entry_type = entry.get("type") or default_type

        ref = CedricsRef(tree=entry, lang="und", is_mixed_citation=(entry_type == "flat"))
        self.bibitems.append(ref)
        # TODO: Remove bibitem. This is used for solrCmds.
        #       solrCmds should use bibitems instead.
        self.bibitem.append(ref.citation_html)

    self.sort_bibitems()
903 def parse_common_contrib(self, node, role, is_ref=False):
 # Build a contributor from a node and append it to self.contributors.
 # node: XML node describing one person (or holding only a name as text)
 # role: role of the contributor; a trailing "s" is removed
 # is_ref: read to skip <nomcomplet>, to choose initials and passed to get_contrib_xml
 # NOTE(review): the indentation of this block was lost in this dump; the nesting
 # of a few statements (flagged below) has to be checked against the real file.
904 contributor = create_contributor()
906 if role and role[-1] == "s":
907 role = role[0:-1]
908 contributor["role"] = role
910 equal_contrib_ = node.get("equal-contrib") or "no"
911 contributor["equal_contrib"] = equal_contrib_ == "yes"
913 corresp = node.get("author-role") or ""
914 if corresp == "corresponding":
915 contributor["corresponding"] = True
917 is_etal = False
918 has_children = False
919 middlename = ""
921 for child in node:
922 has_children = True
923 tag = normalize(child.tag)
925 if tag == "nomcomplet":
926 # TODO: Bug in Cedrics <nomcomplet> is ignored inside <bauteur> and <bediteur>
 # NOTE(review): cannot tell from this dump whether the "deceased" lines below
 # are also guarded by `if not is_ref`.
927 if not is_ref:
928 contributor["string_name"] = child.text
929 deceased_ = child.get("deceased") or "no"
930 contributor["deceased_before_publication"] = deceased_ == "yes"
931 elif tag == "prenom":
 # A middle name met before <prenom> is appended again after the first name
932 contributor["first_name"] = child.text or ""
933 if middlename != "":
934 contributor["first_name"] += " " + middlename
935 middlename = ""
936 elif tag in ("middlename", "particule"):
937 contributor["first_name"] += " " + child.text
938 middlename = child.text
939 elif tag == "initiale":
940 pass
941 # if len(contributor['first_name']) > 0:
942 # contributor['first_initials'] = child.text or ''
943 elif tag == "junior":
944 contributor["suffix"] = child.text
945 elif tag == "nom":
946 contributor["last_name"] = child.text or ""
947 elif tag == "adresse":
948 text = get_text_from_node(child)
949 text = normalize_space(text).replace("\n", " ")
950 if len(text) > 0:
951 contributor["addresses"].append(text)
952 elif tag == "author-orcid":
953 contributor["orcid"] = child.text
954 elif tag == "mel":
 # The email is the "url" of an <xref> child, else the text of the <mel> node;
 # several emails are joined with "{{{".
955 email = None
956 for greatchild in child:
957 tag = normalize(greatchild.tag)
958 if tag == "xref":
959 email = greatchild.get("url")
960 if email is None:
961 email = child.text
962 if email is not None:
963 if len(contributor["email"]) > 0:
964 contributor["email"] += "{{{"
965 contributor["email"] += email
966 elif tag == "etal":
967 is_etal = True
 # With children: the name params are updated and contrib_xml is "<etal/>" or the
 # result of get_contrib_xml. Without children: the text of the node, if any,
 # is the whole name.
969 if has_children:
970 use_initials = is_ref and getattr(settings, "REF_JEP_STYLE", False)
971 helper_update_name_params(contributor, use_initials)
973 contributor["contrib_xml"] = (
974 "<etal/>" if is_etal else get_contrib_xml(contributor, is_ref=is_ref)
975 )
976 elif node.text is not None:
977 contributor["string_name"] = node.text
978 contributor["contrib_xml"] = (
979 '<string-name xml:space="preserve">' + escape(node.text) + "</string-name>"
980 )
982 contributor["addresses"].sort()
984 # email is ignored by jats_parser
985 contributor["email"] = ""
987 self.contributors.append(contributor)
def parse_financement(self, node):
    """
    Add the award described by a <financement> node to self.awards.

    The award id is the text of the <bourse> child and the abbrev is the text
    of the <financeur> child; nothing is added unless both were found.

    :param node: the <financement> node
    """
    abbrev = None
    award_id = None

    for child in node:
        name = normalize(child.tag)
        if name == "bourse":
            award_id = child.text
        elif name == "financeur":
            abbrev = get_text_from_node(child)

    if abbrev is None or award_id is None:
        return

    self.awards.append({"abbrev": abbrev, "award_id": award_id})
def parse_financements(self, node):
    """
    Parse every <financement> child of a node; other children are ignored.

    :param node: XML node holding the <financement> nodes
    """
    for child in node:
        if normalize(child.tag) == "financement":
            self.parse_financement(child)
def parse_langue(self, node):
    """
    Store the text of a <langue> node in self.lang.

    :param node: XML node whose text is the language
    """
    language = node.text
    self.lang = language
def parse_motcle(self, node):
    """
    Add the keywords of a <motcle> node to self.kwds.

    The keywords are the TeX texts of the <mot> children of the next sibling
    of the node. When there is none, the node is parsed with its sibling and
    the resulting TeX text is split with split_kwds.

    :param node: the <motcle> node
    """
    lang = get_normalized_attrib(node, "lang") or self.lang
    tex_node = node.getnext()

    kwds = [
        self.parse_node_with_mixed_content(child, None)[1]
        for child in tex_node
        if normalize(child.tag) == "mot"
    ]

    if not kwds:
        _, whole_tex, _ = self.parse_node_with_mixed_content(node, tex_node)
        kwds = split_kwds(whole_tex)

    self.kwds.extend([{"type": "", "lang": lang, "value": kwd} for kwd in kwds])
def parse_msc(self, node):
    """Split a comma-separated list of MSC codes and add them to ``self.kwds``.

    Every code is stored as a keyword of type ``"msc"``.  The language is
    the ``lang`` attribute of *node*, falling back to ``self.lang``.

    Codes are stripped *before* the emptiness test, so that input such as
    ``"11A05, "`` or ``"11A05, ,11B10"`` no longer produces an empty
    keyword.  A node without text yields no keyword instead of raising.
    """
    lang = get_normalized_attrib(node, "lang") or self.lang

    raw_codes = (node.text or "").split(",")
    codes = [code.strip() for code in raw_codes]

    self.kwds.extend([{"type": "msc", "lang": lang, "value": code} for code in codes if code])
def parse_resp(self, node):
    """Parse a contributor whose role is given by the ``role`` attribute.

    A missing or empty role defaults to ``"editeur"``.  The French roles
    ``editeur`` and ``organisateur`` are translated to ``editor`` and
    ``organizer``; any other role is passed through unchanged to
    ``parse_common_contrib``.
    """
    translations = {"editeur": "editor", "organisateur": "organizer"}

    role = node.get("role") or "editeur"
    self.parse_common_contrib(node, translations.get(role, role))
def parse_resume(self, node):
    """Build the abstract data of *node* and store it in ``self.abstracts``.

    The abstract written in the language of the object is inserted at the
    front of the list; abstracts in other languages are appended.
    """
    lang = get_normalized_attrib(node, "lang") or self.lang

    if lang != self.lang:
        self.abstracts.append(self._get_abstract_data(node))
        return

    # JATS puts the trans_abstract after the abstract
    self.abstracts.insert(0, self._get_abstract_data(node, None))
def parse_supplement(self, node):
    """Parse one ``supplement`` node into ``self.supplementary_materials``.

    The location is taken from the ``xref`` child and the caption from the
    ``caption`` child.  When the location contains ``/attach/`` and the
    object knows its collection and issue ids, the location is rewritten
    relative to ``colid/issue_id/`` (using ``article_folder`` when it is
    set, ``pid`` otherwise).

    Raises:
        AssertionError: if the ``content-type`` attribute is neither
            ``"supplementary-material"`` nor ``"review"``.
    """
    location = None
    caption = ""

    for child in node:
        tag = normalize(child.tag)

        if tag == "xref":
            location = self.get_location_from_xref(child)
        elif tag == "caption":
            # The caption text belongs to the <caption> child, not to the
            # enclosing <supplement> node.
            if child.text is not None:
                caption = escape(child.text)

    if location:
        pos = location.find("/attach/")
        if pos > -1 and hasattr(self, "colid") and hasattr(self, "issue_id"):
            text = location
            location = self.colid + "/" + self.issue_id + "/"

            if hasattr(self, "article_folder") and self.article_folder is not None:
                # 8 == len("/attach/"): keep only what follows the marker
                location += self.article_folder + "/Attach/" + text[pos + 8 :]
            else:
                location += self.pid + text[pos:]

    relation = node.attrib.get("content-type")
    if relation not in ["supplementary-material", "review"]:
        # Raised explicitly (same exception type as the former assert) so
        # that the check is not stripped when Python runs with -O.
        raise AssertionError(
            f"Dans la balise supplement de {self.pid}, "
            f'content-type être "supplementary-material" ou "review" '
            f'au lieu de "{relation}"'
        )

    material = {
        "rel": relation,
        "mimetype": node.attrib.get("mimetype"),
        "location": location,
        "base": "",
        "metadata": "",
        "caption": caption,
    }
    self.supplementary_materials.append(material)
def parse_supplements(self, node):
    """Delegate every ``supplement`` child of *node* to ``parse_supplement``."""
    for entry in node:
        if normalize(entry.tag) == "supplement":
            self.parse_supplement(entry)
# TODO: It is a node with mix content
#       Transform the function in parse_node_with_motcle to handle formulas
def parse_texmotcle(self, node):
    """Extract keywords from a TeX keyword node and append them to ``self.kwds``.

    Each ``mot`` child of the sibling that follows *node* gives one
    keyword (its TeX value).  When there is none, the mixed content of
    *node* is parsed and split with ``split_kwds``.
    """
    lang = get_normalized_attrib(node, "lang") or self.lang
    tex_node = node.getnext()

    keywords = []
    for word in tex_node:
        if normalize(word.tag) != "mot":
            continue
        _, tex_value, _ = self.parse_node_with_mixed_content(word)
        keywords.append(tex_value)

    if len(keywords) == 0:
        _, tex_value, _ = self.parse_node_with_mixed_content(node)
        keywords = split_kwds(tex_value)

    self.kwds.extend([{"type": "", "lang": lang, "value": keyword} for keyword in keywords])
def parse_titre(self, node):
    """Parse a title node and store the result in ``self.titres``.

    The language is the ``lang`` attribute of *node* (``"und"`` when
    absent).  Titles whose XML value is empty are ignored.
    """
    lang = get_normalized_attrib(node, "lang") or "und"
    tex_node = node.getnext()

    html_value, tex_value, xml_value = self.parse_node_with_mixed_content(node, tex_node)

    if len(xml_value) == 0:
        return

    entry = {
        "lang": lang,
        "title_html": html_value,
        "title_tex": tex_value,
        "title_xml": xml_value,
    }
    self.titres.append(entry)
def sort_bibitems(self):
    """Sort ``self.bibitems`` according to their labels.

    Nothing happens when there is no bibitem or when the first label is
    empty once its surrounding brackets are removed.  Otherwise every
    label is split (``split_label``); if the first label is numeric the
    items are sorted on the integer value of ``label_prefix``, else on
    the tuple ``(label_prefix, year, label_suffix)``.
    """
    if not len(self.bibitems):
        return

    # Sometimes, labels are surrounded by []
    first_label = self.bibitems[0].label.strip("[]")
    if not len(first_label):
        return

    # First, we split each label into label_prefix and label_suffix
    for item in self.bibitems:
        item.split_label()

    if first_label.isdigit():

        def ordering(item):
            return int(item.label_prefix)

    else:
        ordering = attrgetter("label_prefix", "year", "label_suffix")

    self.bibitems = sorted(self.bibitems, key=ordering)
class CedricsPublisher(PublisherData):
    """Publisher data read from a Cedrics XML node passed as ``tree``."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        tree = kwargs["tree"]
        self.parse_tree(tree)

    def parse_tree(self, tree):
        """Use the text of *tree* as the publisher name."""
        publisher_name = tree.text
        self.name = publisher_name
class CedricsJournal(JournalData, CedricsBase):
    """Journal data read from a Cedrics XML node passed as ``tree``."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        tree = kwargs["tree"]
        self.parse_tree(tree)

    def parse_tree(self, tree):
        """Fill the journal attributes from the children of *tree*.

        ``jtitre`` opens the ``journal-title-group`` XML and ``jtitrecourt``
        completes and closes it with the abbreviated title.
        """
        super().parse_tree(tree)

        # Tags whose text is copied as is into an attribute
        plain_attributes = {"acrocedram": "pid", "issn": "issn", "E-issn": "e_issn"}

        for node in tree:
            tag = normalize(node.tag)

            if tag in plain_attributes:
                setattr(self, plain_attributes[tag], node.text)
            elif tag == "jtitre":
                self.title_html = node.text
                self.title_tex = node.text
                self.title_xml = "<journal-title-group><journal-title>" + escape(node.text)
            elif tag == "jtitrecourt":
                self.title_xml += (
                    '</journal-title><abbrev-journal-title abbrev-type="short-title">'
                    + escape(node.text)
                    + "</abbrev-journal-title></journal-title-group>"
                )
            elif tag == "jediteur":
                self.publisher = CedricsPublisher(tree=node)
class CedricsIssue(IssueData, CedricsBase):
    """Issue data (and its articles) read from a Cedrics XML tree.

    Recognised keyword arguments: ``tree`` (mandatory),
    ``ignore_date_published``, ``is_seminar``, ``article_folders`` and
    ``dois``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Jats has a title/trans_title
        # Cedrics has multiples <titre xml:lang>
        # Use self.titres to store the titles temporary.
        # self.title_* and self_trans_title* are set at the end of the concrete parse_tree
        self.titres = []

        self.ignore_date_published = kwargs.get("ignore_date_published", False)
        self.is_seminar = kwargs.get("is_seminar", False)
        self.colid = None
        self.provider = "mathdoc"
        self.article_folders = kwargs.get("article_folders", [])
        self.dois = kwargs.get("dois", [])

        self.parse_tree(kwargs["tree"])
        self.post_parse_tree()

    def parse_tree(self, tree):
        """Parse the ``notice`` node and create one article per ``article`` node.

        Articles are numbered from 1 (``seq``).  The n-th article receives
        the n-th entry of ``article_folders`` and of ``dois`` when these
        lists are not empty, an empty string otherwise.
        """
        super().parse_tree(tree)

        position = 1

        for node in tree:
            tag = normalize(node.tag)

            if tag == "notice":
                self.parse_notice(node)
                continue
            if tag != "article":
                continue

            folder = self.article_folders[position - 1] if len(self.article_folders) > 0 else ""
            doi = self.dois[position - 1] if len(self.dois) > 0 else ""
            article = CedricsArticle(
                tree=node,
                colid=self.colid,
                issue_id=self.pid,
                doi=doi,
                ignore_date_published=self.ignore_date_published,
                is_seminar=self.is_seminar,
                article_folder=folder,
            )
            article.seq = str(position)
            self.articles.append(article)
            position += 1

    def parse_gestion(self, node):
        """Read the ``efirst`` flag ("yes" enables online first)."""
        for child in node:
            if normalize(child.tag) == "efirst":
                self.with_online_first = child.text == "yes"

    def parse_notice(self, node):
        """Fill the issue metadata from the children of the ``notice`` node.

        Tags without a direct attribute are dispatched to a ``parse_<tag>``
        method when one exists ('-' replaced by '_').
        """
        plain_attributes = {
            "idvol": "pid",
            "tome": "volume",
            "fascicule": "number",
            "serie": "vseries",
            "annee": "year",
        }

        for child in node:
            tag = normalize(child.tag)

            attribute = plain_attributes.get(tag)
            if attribute is not None:
                setattr(self, attribute, child.text)
                continue

            handler = getattr(self, "parse_" + tag.replace("-", "_"), None)
            if callable(handler):
                handler(child)

        if self.last_modified_iso_8601_date_str is None:
            self.last_modified_iso_8601_date_str = timezone.now().isoformat()

    def parse_revue(self, node):
        """Create the journal and copy its pid and publisher to the issue."""
        journal = CedricsJournal(tree=node)
        self.journal = journal
        self.colid = self.journal.pid
        self.publisher = self.journal.publisher

    def set_titles(self):
        """Set ``title_*`` / ``trans_title_*`` from the titles in ``self.titres``.

        A title is a main title when its language is the language of the
        issue, or when the language of the issue is ``"und"``; it is a
        translated title otherwise.  ``title_xml`` is built only when a
        main HTML title exists.
        """

        def is_main(titre):
            return titre["lang"] == self.lang or self.lang == "und"

        # TODO: BUG in JATS: title_html is the one of the last title (bug if title in multiple langs)
        for titre in self.titres:
            if is_main(titre):
                self.title_html = titre["title_html"]
                self.title_tex = titre["title_tex"]
            else:
                self.trans_lang = titre["lang"]
                self.trans_title_html = titre["title_html"]
                self.trans_title_tex = titre["title_tex"]

        if not self.title_html:
            return

        parts = ["<issue-title-group>"]

        # Main titles first...
        for titre in self.titres:
            if is_main(titre):
                parts.append(
                    '<issue-title xml:space="preserve" xml:lang="'
                    + titre["lang"]
                    + '">'
                    + titre["title_xml"]
                    + "</issue-title>"
                )

        # ...then the translated ones
        for titre in self.titres:
            if not is_main(titre):
                parts.append(
                    '<trans-title-group xml:lang="'
                    + titre["lang"]
                    + '">'
                    + '<trans-title xml:space="preserve">'
                    + titre["title_xml"]
                    + "</trans-title>"
                    + "</trans-title-group>"
                )

        parts.append("</issue-title-group>")
        self.title_xml = "".join(parts)
class CedricsArticle(ArticleData, CedricsBase):
    """Article data read from a Cedrics XML tree.

    Mandatory keyword arguments: ``tree``, ``colid`` and ``issue_id``.
    Optional ones: ``pid``, ``doi``, ``ignore_date_published``,
    ``is_seminar`` and ``article_folder``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.ignore_date_published = kwargs.get("ignore_date_published", False)
        self.is_seminar = kwargs.get("is_seminar", False)
        self.article_folder = kwargs.get("article_folder")

        # Jats has a title/trans_title
        # Cedrics has multiples <titre xml:lang>
        # Use self.titres to store the titles temporary.
        # self.title_* and self_trans_title* are set at the end of the concrete parse_tree
        self.titres = []

        self.pid = kwargs.get("pid")
        self.colid = kwargs["colid"]
        self.issue_id = kwargs["issue_id"]
        self.atype = "normal"

        if kwargs.get("doi") is not None:
            self.doi = clean_doi(kwargs["doi"])
            self.ids.append(("doi", self.doi))

        self.publishTeX = False
        self.tex_filename = None
        # 2023/12/05 <articletype> has been added. Ignore <article-types>
        self.has_articletype = False

        self.parse_tree(kwargs["tree"])
        self.post_parse_tree()

    def parse_tree(self, tree):
        """Fill the article from the children of *tree*.

        Tags that are not handled here are dispatched to a ``parse_<tag>``
        method when one exists ('-' replaced by '_'); unknown tags are
        ignored.
        """
        super().parse_tree(tree)

        for node in tree:
            tag = normalize(node.tag)

            if tag == "idart":
                self.pid = node.text
            elif tag == "doi":
                self.doi = clean_doi(node.text)
                # TODO: Remove as ResourceId do not seem useful (needs to upate templates)
                value = ("doi", self.doi)
                if value not in self.ids:
                    self.ids.append(value)
            elif tag == "pagedeb":
                self.fpage = self.get_numeric_value(node)
            elif tag == "pagefin":
                self.lpage = self.get_numeric_value(node)
            elif tag == "ordreart":
                # Set article_number or talk_number
                # Side effect in Cedrics: set page-count (handled in post_parse_tree)
                if self.is_seminar:
                    self.talk_number = node.text
                else:
                    self.article_number = node.text
            elif tag == "msn-id":
                self.extids.append(("mr-item-id", node.text))
            elif tag == "zbl-id":
                self.extids.append(("zbl-item-id", node.text))
            else:
                # Dynamic dispatch (the debug print that used to be here
                # wrote one line to stdout per tag and has been removed).
                handler = getattr(self, "parse_" + tag.replace("-", "_"), None)
                if callable(handler):
                    handler(node)

    def parse_gestion(self, node):
        """Read the history dates and the ``publishTeX`` flag.

        The online date is skipped when ``ignore_date_published`` is set.
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "date_online" and not self.ignore_date_published:
                self.history_dates.append({"type": "online", "date": child.text})
            elif tag == "date_acceptation":
                self.history_dates.append({"type": "accepted", "date": child.text})
            elif tag == "date_reception":
                self.history_dates.append({"type": "received", "date": child.text})
            elif tag == "date_revision":
                self.history_dates.append({"type": "revised", "date": child.text})
            elif tag == "publishTeX":
                self.publishTeX = child.text == "yes"

    def parse_production(self, node):
        """Read the publication date (unless ignored) and the TeX file name."""
        for child in node:
            tag = normalize(child.tag)

            if tag == "date_prod_PDF" and not self.ignore_date_published:
                self.date_published_iso_8601_date_str = child.text
            elif tag == "fichier_tex":
                self.tex_filename = child.text

    def parse_relations(self, node):
        """Record a relation with another resource in ``self.relations``.

        The ``type`` attribute of *node* is translated from the Cedrics
        vocabulary; relations of an unknown type are ignored.  The text
        of *node* is the id of the related resource.
        """
        rel_type = get_normalized_attrib(node, "type") or ""
        id_value = node.text

        relations = {
            "corrige": "corrects",
            "estcorrige": "corrected-by",
            "complete": "complements",
            "estcomplete": "complemented-by",
            "suitede": "follows",
            "estsuivide": "followed-by",
            "pagesprec": "prev-pages",
            "pagessuiv": "next-pages",
            "solutionde": "resolves",
            "apoursolution": "resolved-by",
            "commente": "comments",
            "estcommente": "commented-by",
            "remplace": "replaces",
            "estremplace": "replaced-by",
        }

        if rel_type in relations:
            obj = Foo()
            obj.rel_type = relations[rel_type]
            obj.id_value = id_value

            self.relations.append(obj)

    def post_parse_tree(self):
        """Finish the job once the whole tree has been parsed.

        Computes the page count (only when an article or talk number is
        set and both pages are integers) and adds the PDF stream, plus
        the TeX stream when ``publishTeX`` is set and a TeX file is known.
        """
        # Some values in Cedrics XMLs are not embedded in groups (ex: authors)
        # We need to wait at the end of the parsing to finish the job

        super().post_parse_tree()

        if len(self.talk_number) > 0 or len(self.article_number) > 0:
            try:
                fpage_int = int(self.fpage)
                lpage_int = int(self.lpage)
                count_value = lpage_int - fpage_int + 1
                self.counts.append(("page-count", str(count_value)))
            except ValueError:
                # Non numeric pages (ex: roman numerals): no page count
                pass

        # The (data)streams of the article's PDF and TeX are added automatically
        if hasattr(self, "colid") and hasattr(self, "issue_id"):
            location = self.colid + "/" + self.issue_id + "/"
            if self.article_folder:
                location += self.article_folder + "/" + self.article_folder + ".pdf"
            else:
                location += self.pid + "/" + self.pid + ".pdf"

            data = {
                "rel": "full-text",
                "mimetype": "application/pdf",
                "location": location,
                "base": "",
                "text": "Full (PDF)",
            }
            self.streams.append(data)

            if self.publishTeX and self.tex_filename:
                location = self.colid + "/" + self.issue_id + "/"
                if self.article_folder:
                    location += self.article_folder + "/" + self.tex_filename + ".tex"
                else:
                    location += self.pid + "/src/tex/" + self.tex_filename + ".tex"

                data = {
                    "rel": "full-text",
                    "mimetype": "application/x-tex",
                    "location": location,
                    "base": "",
                    "text": "TeX source",
                }
                self.streams.append(data)

    def set_titles(self):
        """Set ``title_*`` / ``trans_title_*`` from the titles in ``self.titres``.

        A title is the main title when its language is the language of
        the article, or when the language of the article is ``"und"``; it
        is the translated title otherwise.  When a main XML title exists,
        ``title_xml`` is wrapped (with the translated title) in a
        ``title-group``.
        """
        for titre in self.titres:
            if titre["lang"] == self.lang or self.lang == "und":
                self.title_html = titre["title_html"]
                self.title_tex = titre["title_tex"]
                if len(titre["title_xml"]) > 0:
                    self.title_xml = (
                        '<article-title xml:space="preserve">'
                        + titre["title_xml"]
                        + "</article-title>"
                    )
            else:
                self.trans_title_html = titre["title_html"]
                self.trans_title_tex = titre["title_tex"]
                if len(titre["title_xml"]):
                    self.trans_title_xml = '<trans-title-group xml:lang="' + titre["lang"] + '">'
                    self.trans_title_xml += '<trans-title xml:space="preserve">'
                    self.trans_title_xml += (
                        titre["title_xml"] + "</trans-title></trans-title-group>"
                    )
                self.trans_lang = titre["lang"]

        if len(self.title_xml) > 0:
            self.title_xml = (
                "<title-group>" + self.title_xml + self.trans_title_xml + "</title-group>"
            )
1573class CedricsRef(RefBase, CedricsBase):
def __init__(self, *args, **kwargs):
    """Initialise the reference and parse the node passed as ``tree``.

    ``is_mixed_citation`` (default ``False``) selects between the mixed
    and the structured citation parsers.
    """
    super().__init__(*args, **kwargs)

    self.citation_xml = ""
    self.citation_html = ""
    self.citation_tex = ""
    self.REF_JEP_STYLE = getattr(settings, "REF_JEP_STYLE", False)

    self.is_mixed_citation = kwargs.get("is_mixed_citation", False)
    self.eprint_id = None
    self.archive_name = None
    self.has_doi = False

    # bediteur is not in the correct order. Store the xml temporarily
    self.editeur_citation_xml = ""

    self.parse_tree(kwargs["tree"])
def parse_address(self, node):
    """Store the publisher location and append ``<publisher-loc>`` to the XML."""
    self.publisher_loc = normalize_space(get_text_from_node(node))
    self.citation_xml += "".join(
        ["<publisher-loc>", escape(self.publisher_loc), "</publisher-loc>"]
    )
def parse_archive_name(self, node):
    """Store the lower-cased text of *node* as the archive name."""
    # TODO 1 JEP ref has a formula in its archive-name (for biorxiv)
    #      It should be modified to use common names "biorxiv"
    raw_name = node.text
    self.archive_name = raw_name.lower()
def parse_article_id(self, node):
    """Register the text of *node* as an ``eid`` and append its ``<pub-id>``."""
    article_eid = node.text
    self.extids.append(("eid", article_eid))

    self.citation_xml += "".join(['<pub-id pub-id-type="eid">', escape(article_eid), "</pub-id>"])
def parse_bauteur(self, node):
    """Parse a reference author and append its XML to the citation."""
    self.parse_auteur(node, is_ref=True)

    # parse_auteur stored the new contributor at the end of the list
    newest = self.contributors[-1]
    self.citation_xml += newest["contrib_xml"]
def parse_bediteur(self, node):
    """Parse a reference editor; its XML is kept aside in ``editeur_citation_xml``."""
    self.parse_common_contrib(node, "editor", is_ref=True)

    # parse_common_contrib stored the new contributor at the end of the list
    newest = self.contributors[-1]
    self.editeur_citation_xml += newest["contrib_xml"]
def parse_bibitemdata(self, node):
    """Parse an unstructured reference and append it to the citation values.

    The XML value is wrapped in a ``<mixed-citation xml:space="preserve">``.
    """
    # TODO: Bug in Cedrics. if bibitemdata has no text between the nodes,
    #       the XML is pretty printed. But since space="preserve" is added on the fly on mixed-citation
    #       The \n and spaces should be preserved.
    #       This bug is ignored (JTNB_2014__26_3_757_0 [1])
    sibling = node.getnext()
    html_part, tex_part, xml_part = self.parse_node_with_mixed_content(
        node, sibling, is_bibitemdata=True
    )

    self.citation_html += html_part
    self.citation_tex += tex_part
    self.citation_xml += "".join(
        ['<mixed-citation xml:space="preserve">', xml_part, "</mixed-citation>"]
    )
def parse_booktitle(self, node):
    """Store the book title as ``source_tex`` and append ``<source>`` if not empty."""
    sibling = node.getnext()
    _, tex_part, xml_part = self.parse_node_with_mixed_content(node, sibling, is_citation=True)

    self.source_tex = tex_part
    if xml_part == "":
        return

    self.citation_xml += "".join(['<source xml:space="preserve">', xml_part, "</source>"])
def parse_burl(self, node):
    """Append the XML of every ``xref`` child of *node* to the citation."""
    for link in node:
        if normalize(link.tag) != "xref":
            continue

        _, _, link_xml = self.parse_node_with_xref(link, None, keep_link=True, is_citation=True)
        self.citation_xml += link_xml
def parse_chapter(self, node):
    """Append a ``<chapter-title>`` and store its TeX value.

    Raises:
        ValueError: if the reference is not of type ``inbook``.
    """
    # TODO: Bug in Cedrics <chapter> for types other than inbook
    #       becomes a text outside tags (AIF_2017__67_1_237_0 [16], CML_2013__5_1)
    #       The info is not present in the PDF. It should not be in the Cedrics XML
    if self.type != "inbook":
        raise ValueError("<chapter> can be used only for an inbook")

    sibling = node.getnext()
    _, tex_part, xml_part = self.parse_node_with_mixed_content(node, sibling, is_citation=True)

    self.citation_xml += "".join(
        ['<chapter-title xml:space="preserve">', xml_part, "</chapter-title>"]
    )
    self.chapter_title_tex = tex_part
def parse_doi(self, node):
    """Validate the DOI of the reference, register it and append its ``<pub-id>``.

    Raises:
        ValueError: if the DOI is empty, contains ``http``, differs from
            a DOI already set on the reference, contains surrounding or
            non-breaking spaces, or starts with ``doi:``.
    """
    raw_doi = node.text

    if raw_doi is None:
        raise ValueError("a doi can not be empty")

    if "http" in raw_doi:
        raise ValueError(raw_doi, "should not have http in it")

    doi_value = clean_doi(raw_doi)
    already_set = self.doi is not None
    if already_set and self.doi != doi_value:
        raise ValueError(
            "Multiple dois for the same ref "
            + self.label
            + ": "
            + self.doi
            + " and "
            + doi_value
        )

    if not already_set:
        self.doi = doi_value
        self.extids.append(("doi", self.doi))

    # NOTE(review): the statements below are assumed to run for every call
    # (not only when the DOI was just set) - confirm against the original layout.
    self.has_doi = True

    # TODO: bug in Cedrics if the doi has a non-breaking space in it
    #       the doi and the burl might not match and the dx.doi.org is no longer filtered
    #       (bug²)
    #       A doi should not have a space in it. raise an exception
    without_spaces = self.doi.strip().replace(chr(160), "")
    if without_spaces != self.doi:
        raise ValueError(self.doi, "has a space in it")

    if self.doi.lower().startswith("doi:"):
        raise ValueError('Remove "DOI:" in ' + self.doi)

    self.citation_xml += "".join(['<pub-id pub-id-type="doi">', escape(raw_doi), "</pub-id>"])
def parse_edition(self, node):
    """Store the edition and append ``<edition>`` (see ``parse_node_common``)."""
    # TODO: BUG in JATS (The edition is ignored in the HTML version)
    field = "edition"
    self.parse_node_common(node, field, field)
def parse_editor(self, node):
    """Parse an ``editor`` node like an author and append its XML to the citation."""
    # TODO: Bug in Cedrics <editeur> becomes a <string-name> and we lose the info author vs editor
    self.parse_auteur(node, is_ref=True)

    # parse_auteur stored the new contributor at the end of the list
    newest = self.contributors[-1]
    self.citation_xml += newest["contrib_xml"]
def parse_eprint_id(self, node):
    """Keep the escaped eprint id for later (see ``post_parse_eprint``)."""
    # Cannot add an ext_ids yet. Need to see if there's a archive-name
    escaped_id = escape(node.text)
    self.eprint_id = escaped_id
def parse_institution(self, node):
    """Store the institution and append ``<institution>`` (see ``parse_node_common``)."""
    field = "institution"
    self.parse_node_common(node, field, field)
def parse_journal(self, node):
    """Store the journal title (HTML value) as ``source_tex`` and append ``<source>``.

    The ``<source>`` element is appended only when the XML value is not empty.
    """
    sibling = node.getnext()
    html_part, _, xml_part = self.parse_node_with_mixed_content(node, sibling, is_citation=True)

    self.source_tex = html_part
    if len(xml_part) == 0:
        return

    self.citation_xml += "".join(['<source xml:space="preserve">', xml_part, "</source>"])
def parse_mixed_citation(self, node):
    """Parse an unstructured reference (``reference`` label + ``bibitemdata``).

    After a ``reference`` child, a non-empty label followed by a space
    becomes the start of the HTML and TeX citations.
    """
    for child in node:
        tag = normalize(child.tag)

        if tag == "bibitemdata":
            self.parse_bibitemdata(child)
        elif tag == "reference":
            self.parse_reference(child)
            if len(self.label) > 0:
                prefix = self.label + " "
                self.citation_html = prefix
                self.citation_tex = prefix
def parse_month(self, node):
    """Store the month and append ``<month>`` (see ``parse_node_common``)."""
    # TODO: Bug in Cedrics. month is ignored in the PDF ? JEP_2019__6__737_0 [Hoe63]
    field = "month"
    self.parse_node_common(node, field, field)
def parse_msn_id(self, node):
    """Register the MathSciNet id and append its ``<ext-link>`` to the citation."""
    identifier = node.text
    self.extids.append(("mr-item-id", identifier))
    self.citation_xml += "".join(
        ['<ext-link ext-link-type="mr-item-id">', escape(node.text), "</ext-link>"]
    )
def parse_node_common(self, node, variable_name, jats_tag, **kwargs):
    """Store the text of *node* in an attribute and append it as a JATS element.

    Args:
        node: the XML node to read.
        variable_name: name of the attribute of ``self`` that receives the text.
        jats_tag: name of the JATS element appended to ``citation_xml``.
        **kwargs: ``keep_space`` (its mere presence disables the space
            normalization) and ``jats_params`` (attributes added to the
            opening tag when not empty).
    """
    value = get_text_from_node(node)
    if "keep_space" not in kwargs:
        value = normalize_space(value)
    setattr(self, variable_name, value)

    opening = "<" + jats_tag
    self.citation_xml += opening
    if "jats_params" in kwargs and len(kwargs["jats_params"]) > 0:
        self.citation_xml += " " + kwargs["jats_params"]

    closing = "</" + jats_tag + ">"
    self.citation_xml += ">" + escape(value) + closing
def parse_note(self, node):
    """Store the note as the comment and append ``<comment>`` when not empty."""
    options = {
        "is_citation": True,
        "is_comment": True,
        "add_HTML_link": True,
        "temp_math": True,
    }
    html_part, _, xml_part = self.parse_node_with_mixed_content(node, None, **options)

    self.comment = html_part

    if len(html_part) == 0:
        return

    self.citation_xml += "".join(['<comment xml:space="preserve">', xml_part, "</comment>"])
def parse_number(self, node):
    """Store the issue number (spaces kept) and append ``<issue>``."""
    field = "issue"
    self.parse_node_common(node, field, field, keep_space=True)
def parse_pagedeb(self, node):
    """Store the first page (spaces kept) and append ``<fpage>``."""
    field = "fpage"
    self.parse_node_common(node, field, field, keep_space=True)
def parse_pagefin(self, node):
    """Store the last page (spaces kept) and append ``<lpage>``."""
    field = "lpage"
    self.parse_node_common(node, field, field, keep_space=True)
def parse_pages(self, node):
    """Handle a ``pages`` node when no first/last page has been set yet.

    For a book or any kind of thesis the value is a number of pages
    (``<size units="pages">``); otherwise it is stored as the first page.
    """
    if len(self.fpage) != 0 or len(self.lpage) != 0:
        return

    if self.type == "book" or "thesis" in self.type:
        target, params = "size", 'units="pages"'
    else:
        target, params = "fpage", ""

    self.parse_node_common(node, target, target, jats_params=params)
def parse_page_total_number(self, node):
    """Store the number of pages and append ``<size units="pages">``."""
    field = "size"
    self.parse_node_common(node, field, field, jats_params='units="pages"')
def parse_publisher(self, node):
    """Store the publisher name and append ``<publisher-name>`` to the XML."""
    self.publisher_name = normalize_space(get_text_from_node(node))
    self.citation_xml += "".join(
        ["<publisher-name>", escape(self.publisher_name), "</publisher-name>"]
    )
def parse_reference(self, node):
    """Set the label of the reference and append ``<label>`` to the XML.

    ``self.label`` is the Cedrics label surrounded by brackets (added
    when the label does not start with ``[``).  The ``<label>`` element
    holds the bracketed label for a mixed citation and the raw Cedrics
    label otherwise; nothing is appended for an empty label.
    """
    raw_label = get_text_from_node(node)

    needs_brackets = bool(raw_label) and raw_label[0] != "["
    self.label = "[" + raw_label + "]" if needs_brackets else raw_label

    if not self.label:
        return

    displayed = self.label if self.is_mixed_citation else raw_label
    self.citation_xml += "<label>" + escape(displayed) + "</label>"
def parse_series(self, node):
    """Store the series and append ``<series>`` (see ``parse_node_common``)."""
    field = "series"
    self.parse_node_common(node, field, field)
def parse_structured_citation(self, node):
    """Parse a structured reference into an ``<element-citation>``.

    Every child is dispatched to its ``parse_<tag>`` method.  The label
    (``reference``) stays outside the ``<element-citation>`` wrapper, the
    eprint id is flushed once the archive information has been read, and
    the editors are emitted at the end inside a ``<person-group>``.
    Finally the HTML/TeX citations are computed from the parsed values.

    Fixes over the previous version:
    - the eprint is flushed only once (it used to be emitted again for
      every child following the eprint id);
    - the opening ``<element-citation>`` tag is always emitted, so that
      the closing tag is never left dangling.
    """
    wrapper_tag_added = False
    eprint_done = False

    for child in node:
        tag = normalize(child.tag)

        # The <label> is outside the <element-citation> in JATS
        if tag != "reference" and not wrapper_tag_added:
            self.citation_xml += '<element-citation publication-type="' + self.type + '">'
            wrapper_tag_added = True

        # The eprint can only be emitted after its optional archive-* tags
        if (
            not eprint_done
            and self.eprint_id is not None
            and tag not in ("archive-prefix", "archive-name")
        ):
            self.post_parse_eprint()
            eprint_done = True

        # TODO: brevue bcoll bconference bseries btome... (util/bibitem.xsl)

        if tag in ["howpublished"]:
            self.parse_title(child)
        elif tag in ("institution", "organization", "school"):
            self.parse_institution(child)
        elif tag not in ("TeXtitle", "TeXbooktitle", "archive-prefix"):
            fct_name = "parse_" + tag.replace("-", "_")
            ftor = getattr(self, fct_name, None)
            if callable(ftor):
                ftor(child)

    if not wrapper_tag_added:
        # No child besides the label: still open the wrapper closed below
        self.citation_xml += '<element-citation publication-type="' + self.type + '">'

    # The eprint id (and its archive tags) were the last children
    if self.eprint_id is not None and not eprint_done:
        self.post_parse_eprint()

    # ptf-xsl puts the <bediteur> at the end in JATS
    if len(self.editeur_citation_xml) > 0:
        self.citation_xml += '<person-group person-group-type="editor">'
        self.citation_xml += self.editeur_citation_xml
        self.citation_xml += "</person-group>"

    self.citation_xml += "</element-citation>"

    text = get_citation_html(self)
    self.citation_html = self.citation_tex = text
def parse_title(self, node):
    """Append the title of the reference under the JATS element that fits its type.

    - ``incollection``: ``<chapter-title>`` (stored in ``chapter_title_tex``)
    - book-like types, or a ``booktitle``/``howpublished`` node:
      ``<source>`` (stored in ``source_tex``)
    - anything else: ``<article-title>`` (stored in ``article_title_tex``)

    The stored value is the HTML value of the title.
    """
    source_types = [
        "book",
        "inbook",
        "unpublished",
        "phdthesis",
        "masterthesis",
        "mastersthesis",
        "manual",
        "techreport",
        "coursenotes",
        "proceedings",
    ]

    sibling = node.getnext()
    html_part, _, xml_part = self.parse_node_with_mixed_content(
        node, sibling, is_citation=True, add_ext_link=True
    )

    if self.type == "incollection":
        attribute, element = "chapter_title_tex", "chapter-title"
    elif self.type in source_types or node.tag in ["booktitle", "howpublished"]:
        attribute, element = "source_tex", "source"
    else:
        attribute, element = "article_title_tex", "article-title"

    setattr(self, attribute, html_part)
    self.citation_xml += "<" + element + ' xml:space="preserve">' + xml_part + "</" + element + ">"
def parse_tree(self, tree):
    """Read the attributes of the reference node, then parse its content.

    The type comes from the ``doctype`` attribute (``"misc"`` when it is
    missing or ``"none"``).  The content is parsed as a mixed or as a
    structured citation depending on ``is_mixed_citation``.
    """
    super().parse_tree(tree)

    self.user_id = get_normalized_attrib(tree, "user-id") or ""

    doctype = get_normalized_attrib(tree, "doctype") or "misc"
    self.type = "misc" if doctype == "none" else doctype

    parser = self.parse_mixed_citation if self.is_mixed_citation else self.parse_structured_citation
    parser(tree)
def parse_type(self, node):
    """Store the TeX value as the annotation and append ``<annotation>`` if not empty."""
    sibling = node.getnext()
    _, tex_part, xml_part = self.parse_node_with_mixed_content(node, sibling, bug_cedrics=True)

    self.annotation = tex_part

    if len(xml_part) == 0:
        return

    self.citation_xml += "".join(
        ['<annotation><p xml:space="preserve">', xml_part, "</p></annotation>"]
    )
def parse_url_last_visited(self, node):
    """Append the access date of the URL as a ``<date-in-citation>``.

    The text of *node* is used both as the ``iso-8601-date`` attribute and
    as the content of the element.  It is escaped, like every other value
    written in ``citation_xml``.  A node without text is ignored: it used
    to leave a half-written element in ``citation_xml`` before raising.
    """
    visited = node.text
    if visited is None:
        return

    escaped_date = escape(visited)
    self.citation_xml += (
        '<date-in-citation content-type="access-date" iso-8601-date="'
        + escaped_date
        + '">'
        + escaped_date
        + "</date-in-citation>"
    )
def parse_volume(self, node):
    """Store the volume and append ``<volume>``; an empty value is ignored."""
    value = normalize_space(get_text_from_node(node))

    if value is None or len(value) == 0:
        return

    self.volume = value
    self.citation_xml += "".join(["<volume>", escape(self.volume), "</volume>"])
def parse_year(self, node):
    """Store the year and append ``<year>`` (see ``parse_node_common``)."""
    field = "year"
    self.parse_node_common(node, field, field)
def parse_zbl_id(self, node):
    """Register the zbMATH id and append its ``<ext-link>`` to the citation."""
    identifier = node.text
    self.extids.append(("zbl-item-id", identifier))
    self.citation_xml += "".join(
        ['<ext-link ext-link-type="zbl-item-id">', escape(node.text), "</ext-link>"]
    )
def post_parse_eprint(self):
    """Register the eprint id once the archive name is known.

    Without an archive name, ``arxiv`` is assumed.  For the archives
    ``arxiv``, ``tel``, ``hal`` and ``theses.fr`` the id is added to the
    external ids and a ``<pub-id>`` is appended to the citation.
    """
    if self.eprint_id is None:
        return

    if self.archive_name is None:
        # Assumption made by the XSLT transform
        self.archive_name = "arxiv"

    if self.archive_name not in ["arxiv", "tel", "hal", "theses.fr"]:
        # NOTE(review): assumes that nothing is emitted for other archives
        # (nesting inferred from the code layout) - confirm.
        return

    # The Cedrics archive-prefix is ignored (the URL could change overtime)
    self.extids.append((self.archive_name, self.eprint_id))

    self.citation_xml += "".join(
        ['<pub-id pub-id-type="', self.archive_name, '">', self.eprint_id, "</pub-id>"]
    )
def split_label(self):
    """
    Used when sorting non-digit bibitems

    Splits the lower-cased label, without its first and last characters,
    around its digits: ``label_prefix`` is what precedes them and
    ``label_suffix`` what follows.  A numeric label only sets
    ``label_prefix``.
    """
    key = self.label.lower()

    # CRAS <reference> do not allow a simple sort (?!?)
    # labels with "XXX et al." need to be put after "XXX"
    key = key.replace(" et al.", "ZZZ")
    # NOTE(review): this second replacement looks identical to the first one;
    # it may originally have targeted another kind of space - confirm.
    key = key.replace(" et al.", "ZZZ")

    if len(key) > 1:
        key = key[1:-1]

    if key.isdigit():
        self.label_prefix = key
        return

    pieces = re.split(r"[\d]+", key)
    if len(pieces) == 2:
        self.label_prefix, self.label_suffix = pieces
    else:
        # Special case where label is similar as "Sma" instead of "Sma15"
        self.label_prefix, self.label_suffix = key, ""