Coverage for src/ptf/cmds/xml/jats/jats_parser.py: 67%
2067 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-05 09:56 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-05 09:56 +0000
1##################################################################################################
2#
3# README
4#
5# jats_parser.py is a replacement for xmldata.py
6# The purpose is to parse a JATS xml (or BITS) tree from top to bottom.
7# Each node is read only once.
8#
9# JatsArticle, JatsIssue, JatsJournal, BitsBook are the objects created by xml_cmds.
10# The xml tree is parsed in the class constructor (__init__)
11# These classes have parse_<tag> functions to parse the xml nodes and set instance variables.
12# Some parse_<tag> functions are called directly.
13# Ex: if tag == "article-meta":
14# self.parse_article_meta(child)
15# Other parse_<tag> functions are called "automatically"
16# fct_name = 'parse_' + tag.replace('-', '_')
17# ftor = getattr(self, fct_name, None)
18# if callable(ftor):
19# ftor(child)
20#
21# JatsBase and JatsArticleBase are base classes.
22# They provide common instance variables and their corresponding parse_<tag> functions
23#
24# html_from_<tag> are used to generate the HTML text of a node with mixed content:
25# a node that mixes text, children and tail
26# These functions can also extract data and set instance variables (ex: self.figures)
27#
28# get_data_from_* parse a node, but simply return data (text, dict,...) without side effects
29#
30# At the end of this file, there are some functions that are/were called by ptf-tools.
31# They are kept here for simplicity: this way, jats_parser can replace xmldata entirely.
32#
33# TODO: the OAI import or the import of a collection could simply call the first function
34# (def parser(tree))
35#
36##################################################################################################
38import copy
39import inspect
40import os
41import re
43from lxml import etree
44from pylatexenc.latexencode import unicode_to_latex
46from django.conf import settings
47from django.urls import reverse
48from django.utils import timezone
50from matching import scrapping
51from ptf.cmds.xml.citation_html import add_span_class_to_html_from_article_title
52from ptf.cmds.xml.citation_html import add_span_class_to_html_from_authors
53from ptf.cmds.xml.citation_html import add_span_class_to_html_from_chapter_title
54from ptf.cmds.xml.citation_html import add_span_class_to_html_from_source
55from ptf.cmds.xml.citation_html import add_span_class_to_html_from_volume
56from ptf.cmds.xml.citation_html import get_citation_html
57from ptf.cmds.xml.jats.builder.issue import get_single_title_xml
58from ptf.cmds.xml.jats.builder.issue import get_title_xml
59from ptf.cmds.xml.xml_base import RefBase
60from ptf.cmds.xml.xml_base import XmlParserBase
61from ptf.cmds.xml.xml_utils import escape
62from ptf.cmds.xml.xml_utils import get_contrib_xml
63from ptf.cmds.xml.xml_utils import get_elsevier_image_extensions
64from ptf.cmds.xml.xml_utils import get_normalized_attrib
65from ptf.cmds.xml.xml_utils import get_text_from_node
66from ptf.cmds.xml.xml_utils import get_xml_from_node
67from ptf.cmds.xml.xml_utils import helper_update_name_params
68from ptf.cmds.xml.xml_utils import make_links_clickable
69from ptf.cmds.xml.xml_utils import normalize
70from ptf.cmds.xml.xml_utils import normalize_space
71from ptf.cmds.xml.xml_utils import split_kwds
72from ptf.display import resolver
73from ptf.model_data import ArticleData
74from ptf.model_data import BookData
75from ptf.model_data import BookPartData
76from ptf.model_data import CollectionData
77from ptf.model_data import ExtLinkDict
78from ptf.model_data import Foo
79from ptf.model_data import IssueData
80from ptf.model_data import JournalData
81from ptf.model_data import MathdocPublicationData
82from ptf.model_data import PublisherData
83from ptf.model_data import RefData
84from ptf.model_data import create_contributor
85from ptf.model_data import create_extlink
88class JatsBase(XmlParserBase):
def __init__(self, *args, **kwargs):
    """
    Set up the state shared by all the JATS parsers.

    The positional and keyword arguments are accepted but not forwarded to the base class.
    """
    super().__init__()

    # XML tree being parsed (set by parse_tree)
    self.tree = None

    # Accumulators filled while the tree is parsed
    self.warnings, self.fns = [], []

    # Output flags:
    # - add_span_around_tex_formula: used to convert an XML value for CKEditor (ie abstract)
    # - for_tex_file: used to create a Tex file from an XML value (ie abstract)
    self.add_span_around_tex_formula = False
    self.for_tex_file = False
def parse_tree(self, tree):
    """
    Remember the tree and read its language.

    :param tree: root node of the XML tree
    :return: None. self.tree and self.lang are set ("und" when no lang is found)
    """
    self.tree = tree

    lang = get_normalized_attrib(tree, "lang")
    self.lang = lang if lang else "und"
def post_parse_tree(self):
    """
    Hook called after the tree has been parsed.

    When self.no_bib is set (Geodesic), an ext_link with rel "source" pointing to the
    Numdam page of self.pid is appended to self.ext_links.
    """
    if not self.no_bib:
        return

    # For Geodesic
    numdam_link = create_extlink()
    numdam_link["rel"] = "source"
    numdam_link["location"] = "http://www.numdam.org/item/" + self.pid
    # Used as the source id to find the source in the GDML Views
    numdam_link["metadata"] = "NUMDAM"
    self.ext_links.append(numdam_link)
def parse_node_with_article_title(self, node, **kwargs):
    """
    Parse an article-title node.

    :param node: XML node
    :param kwargs: if "is_mixed_citation" is truthy, the HTML is decorated by
                   add_span_class_to_html_from_article_title
    :return: (tex, html)
    """
    title_tex, title_html = self.parse_inner_node(node, **kwargs)

    if kwargs.get("is_mixed_citation", False):
        title_html = add_span_class_to_html_from_article_title(title_html, **kwargs)

    return title_tex, title_html
def parse_node_with_break(self, node, **kwargs):
    """
    Convert a break node into a line break.

    :return: (tex, html): the TeX is "\\newline" for a TeX file, a single space otherwise;
             the HTML is always "<br/>"
    """
    if self.for_tex_file:
        return "\\newline\n", "<br/>"
    return " ", "<br/>"
def parse_node_with_chem_struct_wrap(self, node, **kwargs):
    """
    Render a chem-struct-wrap node as a 2-cell "formula" table:
    the content of the children on the left, the label on the right.

    :param node: XML node
    :return: (text, text): the same HTML is returned for the TeX and the HTML values
    """
    struct_id = node.attrib.get("id")
    label = None
    cells = []

    for child in node:
        _, child_html = self.parse_node_with_mixed_content(child, **kwargs) if False else (None, None)
        if normalize(child.tag) == "label":
            _, label = self.parse_node_with_mixed_content(child, **kwargs)
        else:
            _, child_html = self.parse_node_with_mixed_content(child, **kwargs)
            cells.append(child_html)

    pieces = ["<table "]
    if struct_id:
        pieces.append(f'id="{struct_id}" ')
    pieces.append(f'class="formula"><tr><td class="formula-inner">{"".join(cells)}</td>')
    pieces.append('<td class="formula-label">')
    if label:
        pieces.append(label)
    pieces.append("</td></tr>")
    pieces.append("</table>")

    text = "".join(pieces)
    return text, text
def parse_node_with_disp_quote(self, node, **kwargs):
    """
    Wrap the content of a disp-quote node in a <div class="disp-quote">.

    :return: (tex, html): both values get the same <div> wrapper
    """
    opening, closing = '<div class="disp-quote">', "</div>"
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    return opening + inner_tex + closing, opening + inner_html + closing
def parse_node_with_boxed_text(self, node, **kwargs):
    """
    Wrap the content of a boxed-text node in a <div class="boxed-text">.
    The id of the node, if any, is copied to the <div>.

    :return: ("", html): no TeX is produced
    """
    box_id = node.attrib.get("id")
    _, inner_html = self.parse_inner_node(node, **kwargs)

    opening = f'<div id="{box_id}" class="boxed-text">' if box_id else '<div class="boxed-text">'

    return "", f"{opening}{inner_html}</div>"
def parse_node_with_fig(self, node, **kwargs):
    """
    Ex: <fig><label>LABEL</label><caption><title>TITLE</title>CAPTION</caption><graphic/></fig>
    becomes: <figure><img><figcaption>LABEL : TITLE<p>CAPTION</p></figcaption></figure>

    <attrib> and <permissions>/<copyright-statement> children are appended to the caption
    as "fig-small-caption" paragraphs. Unknown children are reported in self.warnings.

    :param node: XML node of a fig
    :param kwargs: forwarded to the child parsers. If "append_floats" is truthy, the instance
                   has a "floats" attribute and the fig has an id, the HTML is also stored
                   in self.floats[fig id]
    :return: ("", html): no TeX is produced for a figure
    """
    fig_id = node.attrib.get("id")
    label_html = title_html = caption_html = None
    img_html = ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "label":
            _, label_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "caption":
            for caption_child in child:
                caption_tag = normalize(caption_child.tag)
                if caption_tag == "title":
                    _, title_html = self.parse_node_with_mixed_content(caption_child, **kwargs)
                elif caption_tag == "p":
                    _, caption_p_html = self.parse_node_with_mixed_content(
                        caption_child, **kwargs
                    )
                    if caption_html:
                        # 2nd paragraph and beyond: the first one becomes the main caption,
                        # the new one is displayed as a small caption
                        caption_html = caption_html.replace(
                            "<p>", '<p class="fig-first-caption">', 1
                        )
                        caption_html += caption_p_html.replace(
                            "<p>", '<p class="fig-small-caption">', 1
                        )
                    else:
                        caption_html = caption_p_html
                else:
                    self.warnings.append(
                        {
                            self.pid: self.__class__.__name__
                            + "."
                            + inspect.currentframe().f_code.co_name
                            + " "
                            + caption_tag
                        }
                    )
        elif tag == "graphic":
            _, graphic_html = self.parse_node_with_graphic(child, **kwargs)
            img_html += graphic_html
        elif tag == "attrib":
            _, attrib_html = self.parse_node_with_mixed_content(child, **kwargs)
            # caption_html is None when there was no caption paragraph before:
            # do not let the f-string turn it into the text "None"
            caption_html = f'{caption_html or ""}<p class="fig-small-caption">{attrib_html}</p>'
        elif tag == "permissions":
            for gchild in child:
                if gchild.tag == "copyright-statement":
                    _, statement_html = self.parse_node_with_mixed_content(gchild, **kwargs)
                    caption_html = (
                        f'{caption_html or ""}<p class="fig-small-caption">{statement_html}</p>'
                    )
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    html = f'<figure id="{fig_id}">' if fig_id else "<figure>"
    html += img_html

    if label_html or title_html or caption_html:
        html += "<figcaption>"

        if label_html:
            html += label_html
        if label_html and title_html:
            html += " : "
        if title_html:
            html += title_html
        if caption_html:
            html += caption_html

        html += "</figcaption>"

    html += "</figure>"

    if kwargs.get("append_floats") and hasattr(self, "floats") and fig_id is not None:
        self.floats[fig_id] = html

    return "", html
def parse_node_with_fn(self, node, **kwargs):
    """
    Ex: <fn><label>LABEL</label><p>TEXT</p></fn>

    :param node: XML node of a fn
    :param kwargs: "keep_fn" (default False): return the footnote HTML instead of storing it
                   in self.fns; "keep_fn_label" (default True): prefix the text with the label
    :return: ("", html). html is '' unless keep_fn is set: the footnote is removed from the
             text and appended (once) to the self.fns list
    """
    fn_id = node.attrib.get("id")
    label_html = None
    fn_html = ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "label":
            _, label_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "p":
            # NOTE(review): with several <p> children only the last one is kept
            _, fn_html = self.parse_node_with_mixed_content(child, **kwargs)
            fn_html = fn_html.replace("<p>", "").replace("</p>", "")
        else:
            warning = (
                self.__class__.__name__
                + "."
                + inspect.currentframe().f_code.co_name
                + " "
                + tag
            )
            self.warnings.append({self.pid: warning})

    html = f'<p id="{fn_id}">' if fn_id else "<p>"

    if label_html and kwargs.get("keep_fn_label", True):
        html += f"<sup>{label_html}</sup> "

    html += fn_html + "</p>"

    # "keep_fn" is optional, like "keep_fn_label": a missing key must not raise a KeyError
    if kwargs.get("keep_fn", False):
        return "", html

    if html not in self.fns:
        self.fns.append(html)
    return "", ""
def parse_node_with_graphic(self, node, **kwargs):
    """
    The href value of graphics used in our XML can have the following values
       - relative path to the issue XML folder (Elsevier JATS)
       - full path starting with "file:/" (Elsevier JATS created in early 2022)
       - simple file name (with no relative path) in the RVT FullText XML

    After the import, we want
       - the files located in the src/tex/figures article folder
       - the url pointing to the image, built thanks to kwargs['base_url']

    addRelatedObjectPtfCmd will copy the images to the src/tex/figures folder if the location starts with file:/
    => change the location to "file:/..." for Elsevier JATS (the xarticle has a pii attribute)

    :param node: XML node of a graphic
    :param kwargs: "base_url" is required as soon as the node has an href
    :return: ("", html). html is '' when the node has no href attribute.
             The image dict is appended to self.figures and self.related_objects
    """
    href = ""
    for attrib in node.attrib:
        if normalize(attrib) == "href":
            href = node.attrib[attrib]

    if not href:
        # Nothing to display. The image text used to be unbound here (UnboundLocalError)
        return "", ""

    basename = os.path.basename(href)
    ext = basename.split(".")[-1]
    is_png = ext == "png"
    mimetype = "image/png" if is_png else "image/jpeg"

    img_url = "src/tex/figures/" + basename

    if ext in get_elsevier_image_extensions():  # Elsevier uses "jc3" instead of jpg. WTF ?
        img_url = img_url[0 : -len(ext)] + "jpg"

    data_location = href if "file:/" in href else img_url
    if (
        hasattr(self, "pii")
        and hasattr(self, "issue")
        and "file:/" not in href
        and self.from_folder
    ):
        base_dir = self.issue.journal.pid
        if os.path.dirname(href) != base_dir:
            href = os.path.join(self.from_folder, base_dir, self.issue.pid, href)
        # NOTE(review): the nesting of this assignment was reconstructed from a flattened
        # listing; it is assumed to apply to every Elsevier href - confirm
        data_location = "file:" + href

    data = {
        "rel": "html-image",
        "mimetype": mimetype,
        "location": data_location,
        "base": None,
        "metadata": node.text if node.text is not None else "",
    }

    img_url = os.path.join(kwargs["base_url"], "png" if is_png else "jpg", img_url)
    img_text = '<a href="' + img_url + '" data-lightbox="image-'
    img_text += str(len(self.figures)) + '" title="">'
    img_text += '<img src="' + img_url + '" class="article-body-img" />'
    img_text += "</a>"

    if data not in self.figures:
        self.figures.append(data)
        self.related_objects.append(data)

    return "", img_text
def parse_node_with_inline_formula(self, node, **kwargs):
    """
    Parse an inline-formula (or disp-formula) node.

    MathJAX is doing a good job with formulae and is now the standard.
    MathML could be ignored in HTML (the original XML value is preserved with value_xml)
    and we could simply return the tex-math text.
    But there are multiple errors in the TeX of the Mersenne articles:
    those mistakes need to be fixed before switching to TeX.

    :param node: XML node of the formula
    :return: (tex, html). The HTML is a <span class="mathjax-formula"> holding the MathML
             (or the TeX if there is no MathML), wrapped in a "formula" table when the node
             is a disp-formula or has a label
    """
    is_display = node.tag == "disp-formula"
    formula_id = node.attrib.get("id")
    label = None
    tex_math = math_text = ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "alternatives":
            for alternative in child:
                alt_tag = normalize(alternative.tag)
                if alt_tag == "tex-math":
                    tex_math = alternative.text or ""
                elif alt_tag == "math":
                    # Elsevier sometimes provides the formula as an alternative image. Remove it.
                    alternative.attrib.pop("altimg", None)

                    math_text = get_xml_from_node(alternative)
                    for unwanted in (
                        "mml:",
                        'xmlns:xlink="http://www.w3.org/1999/xlink"',
                        'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"',
                    ):
                        math_text = math_text.replace(unwanted, "")
                    if is_display:
                        math_text = math_text.replace("<math", '<math display="block"')
        elif tag == "label":
            label = child.text or ""
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    # Exactly one of the two alternatives is missing: report the calling parse_<tag> functions
    if bool(math_text) != bool(tex_math):
        ignored = ("parse_node", "parse_inner", "parse_tree", "parse_article_meta")
        callers = []
        for frameinfo in inspect.stack()[1:]:
            fct_name = frameinfo[3]
            if fct_name.startswith("parse_") and not any(name in fct_name for name in ignored):
                callers.append(fct_name)
        stack_str = " ".join(callers)
        print(f"{self.pid} no math formula for {stack_str}")

    # Inline formulas are delimited with $
    if not is_display and tex_math != "":
        if not tex_math.startswith("$"):
            tex_math = "$" + tex_math
        if not tex_math.endswith("$"):
            tex_math = tex_math + "$"

    tex = tex_math
    in_table = bool(label) or is_display

    chunks = []
    if in_table:
        chunks.append('<table class="formula"><tr><td class="formula-inner">')

    chunks.append('<span class="mathjax-formula" ')
    if formula_id:
        chunks.append('id="' + formula_id + '" ')
    alt_text = tex_math.replace("\n", "") if is_display else tex_math
    displayed = math_text if math_text else tex_math
    # NOTE(review): the TeX is written as-is in the data-tex attribute (no escaping)
    chunks.append(f'data-tex="{alt_text}">{displayed}</span>')

    if in_table:
        chunks.append('</td><td class="formula-label">')
        if label:
            chunks.append(label)
        chunks.append("</td></tr>")
        chunks.append("</table>")

    html = "".join(chunks)

    if self.add_span_around_tex_formula:
        tex = f'<span class="mathjax-formula">\\({tex[1:-1]}\\)</span>'

    return tex, html
def parse_node_with_institution_id(self, node, **kwargs):
    """
    institution-id nodes are ignored.

    :return: ("", ""): nothing is added to the TeX nor to the HTML
    """
    nothing = ""
    return nothing, nothing
def parse_node_with_italic(self, node, **kwargs):
    """
    Render an italic node.

    :return: (tex, html): the HTML is always a <span class="italique">;
             the TeX is {\\it ...} for a TeX file, <i>...</i> otherwise
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    # The span is added in every context (citations, comments,... included)
    html = '<span class="italique">' + inner_html + "</span>"
    tex = "{\\it " + inner_tex + "}" if self.for_tex_file else "<i>" + inner_tex + "</i>"

    return tex, html
def parse_node_with_list(self, node, **kwargs):
    """
    Render a list node.

    "bullet" and "simple" lists become <ul> (itemize in TeX). The other lists become
    enumerate in TeX and, in HTML, an <ol> whose type depends on list-type
    ("order"/"number", "alpha-lower", "alpha-upper", "roman-lower", "roman-upper")
    or a <ul> without bullets for any other list-type.
    A numbered list with a "continued-from" attribute gets a start value.

    The HTML value is wrapped in every mode (it used to be left unwrapped for
    non-bullet lists when self.for_tex_file was set).

    :param node: XML node of a list
    :return: (tex, html)
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    start = None
    if node.get("continued-from") is not None:
        start = self.get_list_start_value(node) + 1

    ordered_types = {
        "alpha-lower": "a",
        "alpha-upper": "A",
        "roman-lower": "i",
        "roman-upper": "I",
    }

    list_type = node.get("list-type")
    if list_type in ("bullet", "simple"):
        tex_env = "itemize"
        opening, closing = "<ul>", "</ul>"
    else:
        tex_env = "enumerate"
        if list_type in ("order", "number"):
            closing = "</ol>"
            if start is not None:
                opening = f'<ol type="1" start="{start}">'
            else:
                opening = '<ol type="1">'
        elif list_type in ordered_types:
            opening, closing = f'<ol type="{ordered_types[list_type]}">', "</ol>"
        else:
            opening, closing = '<ul class="no-bullet" style="list-style-type:none;">', "</ul>"

    html = f"{opening}{html}{closing}"
    if self.for_tex_file:
        tex = "\n\\begin{" + tex_env + "}\n" + tex + "\\end{" + tex_env + "}\n"
    else:
        tex = f"{opening}{tex}{closing}"

    return tex, html
def parse_node_with_list_item(self, node, **kwargs):
    """
    <list-item><label>LABEL</label><p>TEXT</p> becomes
    <li>LABEL TEXT</li>
    (same with <title>)

    The first <p> is inlined in the <li>. The following <p> (wrapped in <p> in the HTML)
    and the embedded <list> are appended after it, in document order: they used to
    overwrite one another, so only the last one was kept.
    Unknown children are reported in self.warnings.

    :param node: XML node of a list-item
    :return: (tex, html)
    """
    title_tex = title_html = label_tex = label_html = ""
    p_tex = p_html = content_tex = content_html = ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "label":
            label_tex, label_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "title":
            title_tex, title_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "p":
            if p_html == "" and content_html == "":
                p_tex, p_html = self.parse_inner_node(child, **kwargs)
            else:
                extra_tex, extra_html = self.parse_inner_node(child, **kwargs)
                content_tex += extra_tex
                content_html += f"<p>{extra_html}</p>"
        elif tag == "list":
            list_tex, list_html = self.parse_node_with_mixed_content(child, **kwargs)
            content_tex += list_tex
            content_html += list_html
        # TODO if tag == "def-list":
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    inner_tex = ""
    if label_tex:
        inner_tex += label_tex + " "
    if title_tex:
        inner_tex += title_tex + " "
    inner_tex += p_tex + content_tex

    if self.for_tex_file:
        tex = "\\item " + inner_tex + "\n"
    else:
        tex = f"<li>{inner_tex}</li>"

    html = "<li>"
    if label_html:
        html += label_html + " "
    if title_html:
        html += title_html + " "
    html += p_html + content_html + "</li>"

    return tex, html
def parse_node_with_name_content(self, node, **kwargs):
    """
    Parse a node that only holds the content of a name: the values of the inner nodes
    are returned untouched.

    :return: (tex, html)
    """
    content_tex, content_html = self.parse_inner_node(node, **kwargs)
    return content_tex, content_html
def parse_node_with_p(self, node, **kwargs):
    """
    Render a paragraph.

    The "specific-use" attribute, if any, becomes the class of the HTML <p>.
    If the instance has both "floats_to_insert" and "floats", the pending floats are
    flushed right after the paragraph (and removed from self.floats).

    :param node: XML node of a p
    :return: (tex, html): the TeX is not wrapped in <p> for a TeX file
    """
    tex, inner_html = self.parse_inner_node(node, **kwargs)

    if not self.for_tex_file:
        tex = "<p>" + tex + "</p>"

    css_class = node.get("specific-use")
    opening = f'<p class="{css_class}">' if css_class else "<p>"
    html = opening + inner_html + "</p>"

    if hasattr(self, "floats_to_insert") and hasattr(self, "floats"):
        while len(self.floats_to_insert) > 0:
            pending_id = self.floats_to_insert.pop(0)
            if pending_id not in self.floats:
                continue
            html += self.floats[pending_id]
            self.floats.pop(pending_id)

    return tex, html
def parse_node_with_h1(self, node, **kwargs):
    """
    Render an h1 node. Same behaviour as a paragraph, with <h1> instead of <p>:
    "specific-use" gives the HTML class and the pending floats are flushed after the title.

    :param node: XML node of an h1
    :return: (tex, html): the TeX is not wrapped in <h1> for a TeX file
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    tex = inner_tex if self.for_tex_file else f"<h1>{inner_tex}</h1>"

    css_class = node.get("specific-use")
    if css_class:
        html = f'<h1 class="{css_class}">{inner_html}</h1>'
    else:
        html = f"<h1>{inner_html}</h1>"

    can_flush = hasattr(self, "floats_to_insert") and hasattr(self, "floats")
    while can_flush and len(self.floats_to_insert) > 0:
        float_key = self.floats_to_insert.pop(0)
        if float_key in self.floats:
            html += self.floats[float_key]
            self.floats.pop(float_key)

    return tex, html
def parse_node_with_sc(self, node, **kwargs):
    """
    Render small caps: the HTML is wrapped in a <span class="smallcaps">,
    the TeX is left untouched.

    :return: (tex, html)
    """
    tex, inner_html = self.parse_inner_node(node, **kwargs)

    return tex, '<span class="smallcaps">' + inner_html + "</span>"
def parse_node_with_sec(self, node, **kwargs):
    """
    <sec><title>TITLE</title><p>TEXT</p> becomes
    <section><h@i>TITLE</h@i><p>TEXT</p> (i is the current level and is increased for children)

    :param node: XML node of a sec
    :param kwargs: "sec_level" is the level of this section (2 by default)
    :return: (tex, html)
    """
    level = kwargs.get("sec_level", 2)
    # The children (sub-sections) are parsed one level deeper
    kwargs["sec_level"] = level + 1

    label_tex = label_html = title_tex = title_html = None
    body_tex = body_html = ""

    for child in node:
        child_tag = normalize(child.tag)
        if child_tag == "label":
            # NOTE(review): label and title are parsed without the kwargs
            label_tex, label_html = self.parse_node_with_mixed_content(child)
        elif child_tag == "title":
            title_tex, title_html = self.parse_node_with_mixed_content(child)
        else:
            child_tex, child_html = self.parse_node_with_mixed_content(child, **kwargs)
            body_tex += child_tex
            body_html += child_html

    tex = ""
    html = "<section>"

    if label_html or title_html:
        heading = f"h{str(level)}"
        html += "<" + heading + ">"
        if label_html:
            tex += label_tex
            html += label_html
            if title_html:
                tex += " "
                html += " "
        if title_html:
            tex += title_tex
            html += title_html
        html += "</" + heading + ">"

    return tex + body_tex, html + body_html + "</section>"
def parse_node_with_string_name(self, node, **kwargs):
    """
    Parse a string-name node.

    :param kwargs: if "is_mixed_citation" is truthy, the HTML is title-cased then decorated
                   by add_span_class_to_html_from_authors
    :return: (tex, html)
    """
    name_tex, name_html = self.parse_inner_node(node, **kwargs)

    if kwargs.get("is_mixed_citation", False):
        name_html = add_span_class_to_html_from_authors(name_html.title(), **kwargs)

    return name_tex, name_html
def parse_node_with_strong(self, node, **kwargs):
    """
    Render bold text.

    :return: (tex, html): the TeX is {\\bf ...} for a TeX file, <strong>...</strong> otherwise
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    tex = "{\\bf " + inner_tex + "}" if self.for_tex_file else "<strong>" + inner_tex + "</strong>"
    # NOTE(review): nesting reconstructed from a flattened listing; the HTML is assumed
    # to be wrapped in both modes - confirm
    html = "<strong>" + inner_html + "</strong>"

    return tex, html
def parse_node_with_styled_content(self, node, **kwargs):
    """
    Render a styled-content node: a non-empty "style" attribute is copied
    to a <span> around the HTML. The TeX is left untouched.

    :return: (tex, html)
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    css = node.attrib.get("style", "")
    if css:
        html = '<span style="' + css + '">' + html + "</span>"

    return tex, html
def parse_node_with_sub(self, node, **kwargs):
    """
    Render a subscript.

    :return: (tex, html): the TeX is \\textsubscript{...} for a TeX file, <sub>...</sub> otherwise
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    tex = "\\textsubscript{" + inner_tex + "}" if self.for_tex_file else "<sub>" + inner_tex + "</sub>"
    # NOTE(review): nesting reconstructed from a flattened listing; the HTML is assumed
    # to be wrapped in both modes - confirm
    html = "<sub>" + inner_html + "</sub>"

    return tex, html
def parse_node_with_sup(self, node, **kwargs):
    """
    Render a superscript.

    :return: (tex, html): the TeX is \\textsuperscript{...} for a TeX file, <sup>...</sup> otherwise
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    tex = "\\textsuperscript{" + inner_tex + "}" if self.for_tex_file else "<sup>" + inner_tex + "</sup>"
    # NOTE(review): nesting reconstructed from a flattened listing; the HTML is assumed
    # to be wrapped in both modes - confirm
    html = "<sup>" + inner_html + "</sup>"

    return tex, html
def parse_node_with_table_generic(self, node, **kwargs):
    """
    Render a table element (table, tr, td,...) with the same tag in HTML.
    "row" and "entry" are renamed "tr" and "td".

    A table gets the "table" class, plus "nowrap-col-<i>" for each colgroup/col
    that has a width. rowspan, colspan, align and style are copied;
    valign becomes a "td-valign-<value>" class.

    :param node: XML node
    :return: ("", html): no TeX is produced
    """
    _, inner_html = self.parse_inner_node(node, **kwargs)

    tag = normalize(node.tag)
    tag = {"row": "tr", "entry": "td"}.get(tag, tag)

    attributes = []
    if tag == "table":
        classes = ["table"]
        # NOTE(review): <i> is assumed to be the 1-based position of the col - confirm
        for position, col in enumerate(node.xpath("colgroup/col"), start=1):
            if "width" in col.attrib:
                classes.append(f"nowrap-col-{position}")
        class_table = " ".join(classes)
        attributes.append(f' class="{class_table}"')

    for name in ("rowspan", "colspan", "align"):
        if name in node.attrib:
            attributes.append(" " + name + '="' + node.attrib[name] + '"')
    if "valign" in node.attrib:
        attributes.append(' class="td-valign-' + node.attrib["valign"] + '"')
    if "style" in node.attrib:
        attributes.append(' style="' + node.attrib["style"] + '"')

    open_tag = "<" + tag + "".join(attributes) + ">"

    return "", f"{open_tag}{inner_html}</{tag}>"
def parse_node_with_table_wrap(self, node, **kwargs):
    """
    Create a <div class="table-wrap"> around the table.
    The label (in bold) and the caption go in a <div class="table-wrap-header">.

    :param node: XML node of a table-wrap
    :param kwargs: if "append_floats" is truthy, the instance has a "floats" attribute
                   and the node has an id, the HTML is also stored in self.floats[id]
    :return: ("", html): no TeX is produced
    """
    table_id = node.attrib.get("id")
    label = caption = None
    body = ""

    for child in node:
        child_tag = normalize(child.tag)
        _, child_html = self.parse_node_with_mixed_content(child, **kwargs)
        if child_tag == "label":
            label = child_html
        elif child_tag == "caption":
            caption = child_html
        else:
            body += child_html

    pieces = []
    if table_id:
        pieces.append('<div class="table-wrap table-responsive" id="' + table_id + '">')
    else:
        pieces.append('<div class="table-wrap table-responsive">')

    if label or caption:
        header = []
        if label:
            header.append("<strong>" + label + "</strong>")
        if caption:
            header.append(caption)
        pieces.append('<div class="table-wrap-header">' + " ".join(header) + "</div>")

    pieces.append(body)
    pieces.append("</div>")
    text = "".join(pieces)

    if kwargs.get("append_floats") and hasattr(self, "floats") and table_id is not None:
        self.floats[table_id] = text

    return "", text
def parse_node_with_table_wrap_foot(self, node, **kwargs):
    """
    Create a <div class="table-wrap-foot"> at the bottom of the table.
    The footnotes are kept inside this div (keep_fn is forced to True);
    only the fn-group children are rendered.

    :param node: XML node of a table-wrap-foot
    :return: ("", html): no TeX is produced
    """
    kwargs["keep_fn"] = True

    parts = ['<div class="table-wrap-foot">']
    for child in node:
        if normalize(child.tag) != "fn-group":
            continue
        _, group_html = self.parse_node_with_mixed_content(child, **kwargs)
        parts.append(group_html)
    parts.append("</div>")

    return "", "".join(parts)
def parse_node_with_toc(self, node, **kwargs):
    """
    Wrap the HTML of the node's content in a <table> element.

    :param node: the toc node
    :param kwargs: parsing options, forwarded to parse_inner_node
    :return: ("", html): the TeX produced by the inner nodes is discarded
    """
    _, rows = self.parse_inner_node(node, **kwargs)
    return "", f"<table>{rows}</table>"
def parse_node_with_toc_entry(self, node, **kwargs):
    """
    Render a <toc-entry> as a table row, followed by the rows of its nested entries.

    The row has two cells: "label title" and "p. page". When a nav-pointer with
    specific-use="pageindex" is found inside a nav-pointer-group, both cells are
    wrapped in a link to the "item-pdf" URL of self.pid, with a #page= fragment
    equal to the page index + 1.

    :param node: the toc-entry node
    :param kwargs: parsing options; a truthy inside_toc_entry marks a nested entry
    :return: ("", html)
    """
    label = title = page = nested_rows = ""
    anchor = ""
    css_class = "inside-toc" if kwargs.get("inside_toc_entry") else ""

    for child in node:
        tag = normalize(child.tag)

        if tag == "title":
            title = self.parse_node_with_mixed_content(child, **kwargs)[1]
        elif tag == "label":
            label = self.parse_node_with_mixed_content(child, **kwargs)[1]
        elif tag == "nav-pointer":
            page = self.parse_node_with_mixed_content(child, **kwargs)[1]
        elif tag == "nav-pointer-group":
            for pointer in child:
                if pointer.tag != "nav-pointer":
                    continue
                usage = pointer.attrib.get("specific-use")
                if usage == "pagenum":
                    page = self.parse_node_with_mixed_content(pointer, **kwargs)[1]
                elif usage == "pageindex":
                    anchor = int(pointer.text) + 1
        elif tag == "toc-entry":
            # A nested entry is parsed with inside_toc_entry as its only option
            nested = self.parse_node_with_mixed_content(child, inside_toc_entry=True)[1]
            nested_rows += nested

    entry_text = f"{label} {title}"
    page_text = f"p. {page}"

    if anchor:
        href = reverse("item-pdf", kwargs={"pid": self.pid, "extension": "pdf"})
        href += f"#page={anchor}"
        entry_text = f'<a href="{href}">{entry_text}</a>'
        page_text = f'<a href="{href}">{page_text}</a>'

    row = f'<tr><td class="{css_class}">{entry_text}</td><td class="toc-page">{page_text}</td></tr>'

    return "", row + nested_rows
def parse_node_with_underline(self, node, **kwargs):
    """
    Wrap the content of the node in <u>...</u>, in both the TeX and HTML outputs.

    :param node: the underline node
    :param kwargs: parsing options, forwarded to parse_inner_node
    :return: (tex, html)
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)
    return f"<u>{inner_tex}</u>", f"<u>{inner_html}</u>"
def parse_node_with_volume(self, node, **kwargs):
    """
    Return the TeX and HTML of a volume node.

    Inside a mixed citation (truthy is_mixed_citation option), the HTML is
    post-processed by add_span_class_to_html_from_volume.

    :param node: the volume node
    :param kwargs: parsing options
    :return: (tex, html)
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    if kwargs.get("is_mixed_citation", False):
        html = add_span_class_to_html_from_volume(html, **kwargs)

    return tex, html
def parse_node_with_xref(self, node, **kwargs):
    """
    Render an <xref> as an HTML anchor pointing to its first target.

    The rid attribute may hold several whitespace-separated ids. The link
    targets the first one; a leading "bib" in that id is replaced by "r".
    When ref-type is fig, table or textbox and the instance has a
    floats_to_insert list, every id is appended to that list.

    :param node: the xref node
    :param kwargs: parsing options; a truthy ignore_xref drops the xref entirely
    :return: (tex, html); two empty strings when the xref is ignored or has no usable rid
    """
    if kwargs.get("ignore_xref"):
        return "", ""

    # split() yields an empty list for a missing or whitespace-only rid:
    # bail out instead of raising an IndexError on rids[0]
    rids = (node.get("rid") or "").split()
    if not rids:
        return "", ""

    tex, html = self.parse_inner_node(node, **kwargs)

    target = rids[0]
    if target.startswith("bib"):
        target = "r" + target[3:]
    html = f'<a href="#{target}">{html}</a>'

    # ref-type does not depend on the rid: read it once
    ref_type = node.get("ref-type") or None
    if ref_type in ("fig", "table", "textbox") and hasattr(self, "floats_to_insert"):
        self.floats_to_insert.extend(rids)

    return tex, html
def parse_inner_node(self, node, **kwargs):
    """
    Return the TeX and HTML of the content of a node: its text, then its children.

    The node's own tag and tail are not rendered. Children are parsed with
    is_top set to False; is_body_html defaults to False.

    :param node: the XML node
    :param kwargs: parsing options, forwarded to the children
    :return: (tex, html)
    """
    kwargs["is_top"] = False
    kwargs["is_body_html"] = kwargs.get("is_body_html", False)

    tex = html = ""
    text = node.text
    if text:
        # The TeX side is LaTeX-encoded only when generating a TeX file
        tex = unicode_to_latex(text) if self.for_tex_file else text
        html = escape(text)

    for child in node:
        parsed = self.parse_node_with_mixed_content(child, **kwargs)
        tex += parsed[0]
        html += parsed[1]

    return tex, html
def parse_node_with_mixed_content(self, node, **kwargs):
    """
    Return the TeX and HTML text of an XML node which mixes text and XML sub-nodes.
    Ex: <node>text1 <a>text_a</a> text2 <b>text_b</b>b_tail</node>
    Some inner nodes are removed, others are kept or replaced by their HTML equivalent.
    The function calls itself recursively to get the text of the children.

    :param node: XML Node
    :param kwargs: parsing options; missing options receive a default value
    :return: (tex, html)
    """
    if node is None:
        return "", ""

    option_defaults = (
        # The tail is the text following the end of the node
        # Ex: <node>text1<a>text_a</a>a_tail</node>
        # The tail is included only for nodes parsed recursively (is_top False)
        ("is_top", True),
        # sec_level is used to add <h1>, <h2>,... in the HTML text while parsing nodes like <sec>
        ("sec_level", 2),
        # Text in <comment> is parsed to add HTML links
        ("add_HTML_link", False),
        # base_url to image links
        ("base_url", ""),
        # footnotes are removed from the fulltext (and put at the end) except for those in a table
        ("keep_fn", False),
        ("is_citation", False),
        ("is_comment", False),
        # mixed-citation ignores ext-link
        ("add_ext_link", False),
        # TODO remove once jats_parser has been validated against xmldata
        ("temp_math", False),
        ("temp_tex", False),
        ("is_mixed_citation", False),
        ("is_body_html", False),
    )
    for option, default in option_defaults:
        kwargs.setdefault(option, default)

    tag = normalize(node.tag)

    # pub-id/object-id are ignored by default as they are treated separately
    if tag in ("pub-id", "object-id") and not kwargs["is_comment"]:
        return "", ""

    if tag in ("mixed-citation", "toc"):
        kwargs["is_citation"] = True
    elif tag == "comment":
        kwargs["is_comment"] = True

    tex = html = inner_tex = inner_html = ""

    # I. Add the node's text.
    # Some tags have a corresponding parse_node_with_<tag> function to generate the text;
    # a few tags share the function of another tag.
    table_tags = (
        "table",
        "th",
        "tr",
        "td",
        "thead",
        "tbody",
        "colgroup",
        "col",
        "tgroup",
        "entry",
        "row",
    )
    renamed = {
        "statement": "sec",
        "disp-formula": "inline-formula",
        "chapter-title": "article-title",
        "bold": "strong",
        **dict.fromkeys(table_tags, "table-generic"),
    }
    handler_name = "parse_node_with_" + renamed.get(tag, tag).replace("-", "_")
    handler = getattr(self, handler_name, None)

    if callable(handler):
        inner_tex, inner_html = handler(node, **kwargs)
    elif tag in ("ext-link", "uri"):
        # Add HTML links
        inner_tex = inner_html = self.helper_add_link_from_node(node, **kwargs)
        # Update self.ext_links. Useful for <ext-link> deep in a <mixed_citation>,
        # and not caught by parse_citation_node
        if tag == "ext-link" and kwargs["add_ext_link"] and not kwargs["is_comment"]:
            found_extid = self.parse_ext_link(node, **kwargs)
            if found_extid and kwargs["is_mixed_citation"]:
                # an extid has been found in a mixed_citation, no need to add the text of the id here
                inner_tex = inner_html = ""
    elif tag == "supplementary-material":
        self.parse_supplementary_material(node, **kwargs)
    else:
        # II.1. Add the node text (before the children text)
        text = node.text
        if text is not None:
            inner_tex += unicode_to_latex(text) if self.for_tex_file else text
            inner_html += escape(text)

        # II.2. children
        child_kwargs = dict(kwargs)
        child_kwargs["is_top"] = False

        for child in node:
            child_tex, child_html = self.parse_node_with_mixed_content(child, **child_kwargs)

            # Case where an ext-link has been removed in a mixed-citation
            # We may have "title. , (year)"
            # Remove the comma that is now useless
            if (
                kwargs["is_mixed_citation"]
                and child_html
                and child_html[0] in (",", ".")
                and inner_html.endswith(". ")
            ):
                inner_html, child_html = inner_html[:-1], child_html[1:]
                inner_tex, child_tex = inner_tex[:-1], child_tex[1:]

            inner_tex += child_tex
            inner_html += child_html

        # II.3. wrap the children text with html links
        # (skipped when the node text starts with a newline or a space)
        if kwargs["add_HTML_link"] and node.text and not re.match(r"[\n ]+", node.text):
            inner_html = make_links_clickable(node.text, inner_html)

    tex += inner_tex
    html += inner_html

    # III. Add the node's tail for children
    tail = node.tail
    if tail and not kwargs["is_top"]:
        tex += unicode_to_latex(tail) if self.for_tex_file else tail
        html += escape(tail)

    return tex, html
def parse_abstract(self, node, **kwargs):
    """
    Append an entry describing the abstract node to self.abstracts.

    The entry tag comes from the abstract-type attribute; it is "abstract" when
    no value is found or when the value is "author". The language falls back
    to self.lang.

    :param node: the abstract node
    :param kwargs: unused
    """
    abstract_type = get_normalized_attrib(node, "abstract-type") or "abstract"
    if abstract_type == "author":
        abstract_type = "abstract"

    lang = get_normalized_attrib(node, "lang") or self.lang
    tex, html = self.parse_node_with_mixed_content(node)

    entry = {
        "tag": abstract_type,
        "lang": lang,
        "value_xml": get_xml_from_node(node),
        "value_html": html,
        "value_tex": tex,
    }
    self.abstracts.append(entry)
def parse_aff_alternatives(self, node, **kwargs):
    """
    Extract an address from an <aff-alternatives> node and add it to contributors.

    When an <aff> child contains a <label>, the address is the text of the <aff>
    minus the first len(label) characters, and it is added only to the contributors
    whose "xrefs" contain the id of the node. Otherwise the address is the text of
    the <aff> and it is added to every contributor. Contributors that already have
    the address are left untouched.

    :param node: the aff-alternatives node
    :param kwargs: unused
    """
    xref_id = get_normalized_attrib(node, "id") or ""
    address = ""
    applies_to_everyone = True

    for child in node:
        tag = normalize(child.tag)

        if tag != "aff":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        # Skip the formatted aff and use only the complete address text
        # TODO support <aff> properly
        for part in child:
            if part.tag == "label" and address == "":
                label = get_text_from_node(part)
                address = get_text_from_node(child)[len(label) :]
                applies_to_everyone = False
        if address == "" and child.text:
            address = child.text

    if address == "":
        return

    for contrib in self.contributors:
        if address in contrib["addresses"]:
            continue
        is_referenced = "xrefs" in contrib and xref_id in contrib["xrefs"]
        if is_referenced or applies_to_everyone:
            contrib["addresses"].append(address)
            contrib["contrib_xml"] = get_contrib_xml(contrib)
def parse_award_group(self, node, **kwargs):
    """
    Append an award to self.awards from the <award-id> and <funding-source> children.

    Nothing is appended unless both values have been found. Any other child
    adds a warning.

    :param node: the award-group node
    :param kwargs: unused
    """
    funder = identifier = None

    for child in node:
        tag = normalize(child.tag)

        if tag == "funding-source":
            funder = get_text_from_node(child)
        elif tag == "award-id":
            identifier = child.text
        else:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})

    if funder is None or identifier is None:
        return

    self.awards.append({"abbrev": funder, "award_id": identifier})
def parse_contrib_group(self, node, **kwargs):
    """
    Parse the children of a <contrib-group>: contributors, affiliations and footnotes.

    The group role is the content-type attribute without its trailing "s", if any.
    It prefixes the role of each contributor ("group_role|contrib_role") or replaces
    it when the contributor has none.

    :param node: the contrib-group node
    :param kwargs: unused
    """
    group_role = node.get("content-type") or ""
    if group_role.endswith("s"):
        group_role = group_role[:-1]

    for child in node:
        tag = normalize(child.tag)

        if tag == "contrib":
            contrib = self.get_data_from_contrib(child)
            own_role = contrib["role"]
            contrib["role"] = f"{group_role}|{own_role}" if own_role else group_role
            contrib["contrib_xml"] = get_xml_from_node(child)
            self.contributors.append(contrib)
        elif tag == "aff-alternatives":
            self.parse_aff_alternatives(child)
        elif tag == "fn":
            _, fn_html = self.parse_node_with_fn(child, keep_fn=True, keep_fn_label=False)
            fn_xml = get_xml_from_node(child)
            self.footnotes_xml += fn_xml
            self.footnotes_html += fn_html
        else:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
def parse_counts(self, node, **kwargs):
    """
    Append a (tag, value) tuple to self.counts for each child that has a value.

    The value is the count attribute of the child or, failing that, its text.
    The book-page-count tag is stored as page-count.

    :param node: the counts node
    :param kwargs: unused
    """
    for child in node:
        value = child.get("count")
        if value is None:
            value = child.text
        if value is None:
            continue

        tag = normalize(child.tag)
        self.counts.append(("page-count" if tag == "book-page-count" else tag, value))
def parse_ext_link(self, node, **kwargs):
    """
    Extract the data of an <ext-link> and store it as an extid or an ext_link.

    The link is appended to self.ext_links only if the add_ext_link option is truthy,
    no extid was found, the link is not already stored and its rel is not "cover".

    :param node: the ext-link node
    :param kwargs: parsing options; only add_ext_link is read
    :return: True if the link has been recognized as an extid
    """
    link = self.get_data_from_ext_link(node)
    extid = self.add_extids_from_node_with_link(link)
    is_extid = extid[0] is not None

    wanted = kwargs.get("add_ext_link", False)
    if wanted and not is_extid and link not in self.ext_links and link["rel"] != "cover":
        self.ext_links.append(link)

    return is_extid
def parse_front_matter(self, node, **kwargs):
    """
    Store the XML of the front matter and the HTML of its <foreword> and <toc> children.

    frontmatter_foreword_html is reset to "" first; frontmatter_toc_html is only
    assigned when a <toc> child is present.

    :param node: the front-matter node
    :param kwargs: unused
    """
    self.frontmatter_xml = get_xml_from_node(node)
    self.frontmatter_foreword_html = ""

    targets = {"foreword": "frontmatter_foreword_html", "toc": "frontmatter_toc_html"}

    for child in node:
        attr_name = targets.get(normalize(child.tag))
        if attr_name is not None:
            _, html = self.parse_node_with_mixed_content(child)
            setattr(self, attr_name, html)
def parse_id(self, node, **kwargs):
    """
    Store the identifier held by the node according to its type.

    The type is read from the first attribute found among pub-id-type,
    book-id-type and book-part-id-type:
    - pii: stored in self.pii, only if self.pid is longer than 2 characters
      and starts with "CR"
    - numdam-id, mathdoc-id: stored in self.pid
    - ark: appended to self.extids
    - doi, eid: appended to self.ids (a doi is also stored in self.doi)
    Other types are ignored.

    :param node: the id node
    :param kwargs: unused
    """
    value = node.text

    id_type = ""
    for attr_name in ("pub-id-type", "book-id-type", "book-part-id-type"):
        if attr_name in node.attrib:
            id_type = node.attrib[attr_name]
            break

    if id_type == "pii":
        # Elsevier ids get a special treatment: web scrapping to find the date_published
        if self.pid and len(self.pid) > 2 and self.pid[:2] == "CR":
            self.pii = value
        return

    if id_type in ("numdam-id", "mathdoc-id"):
        self.pid = value
    elif id_type == "ark":
        self.extids.append((id_type, value))
    elif id_type == "doi":
        self.ids.append((id_type, value))
        self.doi = value
    elif id_type == "eid":
        self.ids.append((id_type, value))
def parse_kwd_group(self, node, **kwargs):
    """
    Append the keywords of a <kwd-group> to self.kwds.

    Keywords come from the text of the <kwd> children. An <unstructured-kwd-group>
    child replaces the keywords collected so far with the result of split_kwds on
    its TeX text. Any other child adds a warning.

    :param node: the kwd-group node
    :param kwargs: unused
    """
    values = []

    for child in node:
        tag = normalize(child.tag)

        if tag == "kwd":
            values.append(child.text)
        elif tag == "unstructured-kwd-group":
            unstructured_tex, _ = self.parse_node_with_mixed_content(child)
            values = split_kwds(unstructured_tex)
        else:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})

    # NOTE(review): "content-node_type" looks like a search/replace leftover of
    # "content-type" - confirm the intended attribute name
    group_type = node.get("content-node_type") or ""
    if group_type == "":
        group_type = node.get("kwd-group-type") or ""
    lang = get_normalized_attrib(node, "lang") or self.lang

    for value in values:
        self.kwds.append({"type": group_type, "lang": lang, "value": value})
def parse_ref_list(self, node, **kwargs):
    """
    Parse the children of a <ref-list>.

    Each <ref> creates a JatsRef, stored in self.bibitems (the object) and
    self.bibitem (its citation_html); its warnings are added to self.warnings.
    <p> children are parsed for their side effects only. Any other child adds
    a warning.

    :param node: the ref-list node
    :param kwargs: unused
    """
    for child in node:
        tag = normalize(child.tag)

        if tag == "p":
            # Elsevier can store supplementary-material inside ref-list / p
            self.parse_node_with_mixed_content(child)
            continue

        if tag != "ref":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        ref = JatsRef(tree=child, lang=self.lang)
        self.warnings.extend(ref.warnings)
        self.bibitems.append(ref)
        self.bibitem.append(ref.citation_html)
def parse_related_article(self, node, **kwargs):
    """
    Append a relation (rel_type, id_value) to self.relations.

    The id is the text of the node. If the instance has a pii attribute and the
    id is neither empty, "NONE", nor a string containing "10.", the id is
    replaced by the result of scrapping.fetch_article.

    :param node: the related-article node
    :param kwargs: unused
    """
    relation = Foo()
    relation.rel_type = get_normalized_attrib(node, "related-article-type") or ""

    target = node.text
    if hasattr(self, "pii") and target and "10." not in target and target != "NONE":
        # a pii is used instead of a DOI
        # Call Elsevier to get the doi
        target = scrapping.fetch_article(self.doi, target, pii_doi_equivalence=True)

    relation.id_value = target
    self.relations.append(relation)
def parse_related_object(self, node, **kwargs):
    """
    Store a <related-object> either as a relation or as a related object.

    With a document-id-type attribute, the document-id attribute becomes the target
    of a "refers to" relation appended to self.relations, unless it is "NONE".
    A non-empty id that does not contain "10." is first replaced by the result of
    scrapping.fetch_article.
    Without the attribute, the description of the node is appended to
    self.related_objects.

    :param node: the related-object node
    :param kwargs: unused
    """
    description = {
        "rel": node.get("link-type") or "",
        "mimetype": node.get("content-type") or "",
        "location": get_normalized_attrib(node, "href") or "",
        "base": get_normalized_attrib(node, "base") or "",
        "metadata": get_xml_from_node(node),
    }

    if not (node.get("document-id-type") or ""):
        self.related_objects.append(description)
        return

    target = node.get("document-id") or ""
    if target == "NONE":
        return

    if target and "10." not in target:
        # a pii is used instead of a DOI
        # Call Elsevier to get the doi
        target = scrapping.fetch_article(self.doi, target, pii_doi_equivalence=True)

    relation = Foo()
    relation.rel_type = "refers to"
    relation.id_value = target
    self.relations.append(relation)
def parse_sec(self, node, **kwargs):
    """
    Parse the <ref-list> children of a <sec>.

    <title> children are skipped; any other child adds a warning.

    :param node: the sec node
    :param kwargs: unused
    """
    for child in node:
        tag = normalize(child.tag)

        if tag == "ref-list":
            self.parse_ref_list(child)
        elif tag != "title":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
def parse_self_uri(self, node, **kwargs):
    """
    Append the full-text location described by a <self-uri> to self.streams.

    The mimetype is the content-type attribute ("text/html" by default).
    Nodes with the "application/xml" mimetype are not stored.

    :param node: the self-uri node
    :param kwargs: unused
    """
    mimetype = node.get("content-type") or "text/html"
    location = get_normalized_attrib(node, "href") or ""
    base = get_normalized_attrib(node, "base") or ""

    # The XML of the Elsevier archive do not declare the PDF location like the other Mathdoc collections:
    # The collection folder is missing: add it back
    if hasattr(self, "pii") and hasattr(self, "issue"):
        collection_dir = self.issue.journal.pid
        if os.path.dirname(location) != collection_dir:
            location = os.path.join(collection_dir, self.issue.pid, location)

    if self.no_bib:
        location = "http://www.numdam.org/item/" + os.path.basename(location)

    text = "" if node.text is None else normalize_space(node.text)

    # Ext-links, Related-objects used metadata instead of text. Strange difference ?
    # xml_cmds ignore "application/xml" in add_objects_with_location: they are ignored here.
    if mimetype == "application/xml":
        return

    self.streams.append(
        {
            "rel": "full-text",
            "mimetype": mimetype,
            "location": location,
            "base": base,
            "text": text,
        }
    )
def parse_sub_article(self, node, **kwargs):
    """
    Build a JatsArticle from the node and append it to self.translations.

    :param node: the sub-article node (used for translations)
    :param kwargs: unused
    """
    self.translations.append(JatsArticle(tree=node))
def parse_subj_group(self, node, **kwargs):
    """
    Append the <subject> children of a <subj-group> to self.subjs.

    Each subject carries the subj-group-type attribute of the group and its
    language (self.lang by default). Any other child adds a warning.

    :param node: the subj-group node
    :param kwargs: unused
    """
    lang = get_normalized_attrib(node, "lang") or self.lang
    group_type = node.get("subj-group-type") or ""

    for child in node:
        tag = normalize(child.tag)

        if tag != "subject":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        subject = {"type": group_type, "lang": lang, "value": get_text_from_node(child)}
        self.subjs.append(subject)
def parse_supplementary_material(self, node, **kwargs):
    """
    Append a <supplementary-material> to self.supplementary_materials.

    The location is the href of the node or, failing that, its id. The mimetype
    attribute is used if present; otherwise resolver.get_mimetype is called with
    the location. Nothing is appended if a stored material already has the same
    file name (basename of the location).

    :param node: the supplementary-material node
    :param kwargs: unused
    """
    caption = ""
    for child in node:
        if child.tag == "caption":
            _, caption = self.parse_node_with_mixed_content(child)

    location = get_normalized_attrib(node, "href") or None
    if location is None:
        location = get_normalized_attrib(node, "id") or ""

    mimetype = node.attrib.get("mimetype") or None
    if mimetype is None:
        mimetype = resolver.get_mimetype(location)

    file_name = os.path.basename(location)
    already_stored = any(
        os.path.basename(item["location"]) == file_name
        for item in self.supplementary_materials
    )
    if already_stored:
        return

    self.supplementary_materials.append(
        {
            "rel": node.attrib.get("content-type") or "supplementary-material",
            "mimetype": mimetype,
            "location": location,
            "base": "",
            "metadata": "",
            "caption": caption or "",
        }
    )
def parse_title(self, node, **kwargs):
    """
    Set self.title_tex and self.title_html from the node; xrefs are ignored.

    self.title_xml is not set here: in xmldata.py, title_xml had the
    <title_group> tag.

    :param node: the title node
    :param kwargs: unused
    """
    tex, html = self.parse_node_with_mixed_content(node, ignore_xref=True)
    self.title_tex = tex
    self.title_html = html
def parse_title_group(self, node, **kwargs):
    """
    Parse a <title-group>: title, subtitle, translated title, abbreviation and footnotes.

    A subtitle is appended to the title, after a space. The <fn> children of a
    <fn-group> are added to self.footnotes_xml / self.footnotes_html.
    self.title_xml is the XML of the title-group, without its <fn-group> children.

    :param node: the title-group node
    :param kwargs: unused
    """
    title_tags = ("title", "journal-title", "article-title", "book-title", "issue-title")
    found_fn_group = False

    for child in node:
        tag = normalize(child.tag)

        if tag in title_tags:
            self.parse_title(child)
        elif tag == "subtitle":
            sub_tex, sub_html = self.parse_node_with_mixed_content(child)
            self.title_tex += " " + sub_tex
            self.title_html += " " + sub_html
        elif tag == "trans-title-group":
            self.parse_trans_title_group(child)
        elif tag == "abbrev-title":
            _, self.abbrev = self.parse_node_with_mixed_content(child)
        elif tag == "fn-group":
            found_fn_group = True
            for fn_node in child:
                if fn_node.tag != "fn":
                    continue
                _, fn_html = self.parse_node_with_fn(fn_node, keep_fn=True, keep_fn_label=False)
                fn_xml = get_xml_from_node(fn_node)
                self.footnotes_xml += fn_xml
                self.footnotes_html += fn_html
        else:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})

    if not found_fn_group:
        self.title_xml = get_xml_from_node(node)
        return

    # fn-group is now a funding statement and will be exported separately in the XML:
    # => remove it from the title-group
    stripped = etree.Element("title-group")
    for child in node:
        if normalize(child.tag) != "fn-group":
            stripped.append(copy.deepcopy(child))
    self.title_xml = get_xml_from_node(stripped)
def parse_trans_abstract(self, node, **kwargs):
    """
    Append an entry describing the translated abstract to self.abstracts.

    The entry tag comes from the abstract-type attribute; it is "abstract" when
    no value is found or when the value is "author". The language is "und" when
    none is found on the node.

    :param node: the trans-abstract node
    :param kwargs: unused
    """
    abstract_type = get_normalized_attrib(node, "abstract-type") or "abstract"
    if abstract_type == "author":
        abstract_type = "abstract"

    lang = get_normalized_attrib(node, "lang") or "und"
    tex, html = self.parse_node_with_mixed_content(node)

    entry = {
        "tag": abstract_type,
        "lang": lang,
        "value_xml": get_xml_from_node(node),
        "value_html": html,
        "value_tex": tex,
    }
    self.abstracts.append(entry)
def parse_trans_title(self, node, **kwargs):
    """
    Set self.trans_title_tex, self.trans_title_html and self.trans_title_xml from the node.

    :param node: the trans-title node
    :param kwargs: unused
    """
    tex, html = self.parse_node_with_mixed_content(node)
    self.trans_title_tex = tex
    self.trans_title_html = html
    self.trans_title_xml = get_xml_from_node(node)
def parse_trans_title_group(self, node, **kwargs):
    """
    Parse the <trans-title> children of the group and set self.trans_lang.

    Any other child adds a warning. The language is "und" when none is found
    on the node.

    :param node: the trans-title-group node
    :param kwargs: unused
    """
    for child in node:
        tag = normalize(child.tag)

        if tag != "trans-title":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        self.parse_trans_title(child)

    self.trans_lang = get_normalized_attrib(node, "lang") or "und"
def get_data_from_contrib(self, node):
    """
    Build the dict describing the person defined by a <contrib> node.

    <contrib> creates 1 person, defined in <name>, <string-name> or <name-alternatives>.
    In a <mixed-citation>, each <name> creates 1 person: the same code can't be used.

    :param node: the contrib node
    :return: the contributor dict, initialized by create_contributor()
    """
    person = create_contributor()

    for child in node:
        tag = child.tag

        if tag == "name":
            self.update_data_from_name(child, person)
        elif tag == "string-name":
            self.update_data_from_name(child, person)
            if person["first_name"] == "" and person["last_name"] == "":
                person["string_name"] = child.text or ""
        elif tag == "name-alternatives":
            person["mid"] = self.get_data_from_name_alternatives(child)
        elif tag == "contrib-id":
            id_type = child.get("contrib-id-type") or ""
            if id_type in ("orcid", "idref"):
                person[id_type] = child.text or ""
        elif tag == "address":
            person["addresses"].append(get_text_from_node(child))
        elif tag == "email":
            person["email"] = child.text or ""
        elif tag == "xref":
            # Elsevier uses xref/aff-alternatives to store affiliations
            if (child.get("ref-type") or "") == "aff":
                target = child.get("rid") or ""
                if target == "":
                    target = get_text_from_node(child)
                if target != "":
                    person.setdefault("xrefs", []).append(target)
        elif tag == "collab":
            person["string_name"] = child.text or ""
        elif tag == "role":
            # Role is used in BJHTUP11 as a textual description of the role (ex "Présidente").
            # The node value can not be assigned to person['role'] as we want a controlled vocabulary
            # (author /editor / organizer...)
            # Ignore the value
            pass
        else:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})

    # The addresses are not sorted: sorting causes differences between the HTML
    # and the PDF (discovered in PCJ)

    helper_update_name_params(person)

    if (node.get("corresp") or "") == "yes":
        person["corresponding"] = True

    person["deceased_before_publication"] = (node.get("deceased") or "no") == "yes"
    person["equal_contrib"] = (node.get("equal-contrib") or "no") == "yes"

    return person
def get_data_from_custom_meta(self, node):
    """
    Return the name and the value held by a <custom-meta> node.

    They are the text of the <meta-name> and <meta-value> children ("" when the
    child is absent). Any other child adds a warning.

    :param node: the custom-meta node
    :return: (name, value)
    """
    found = {"meta-name": "", "meta-value": ""}

    for child in node:
        tag = normalize(child.tag)

        if tag in found:
            found[tag] = child.text
        else:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})

    return found["meta-name"], found["meta-value"]
def get_data_from_date(self, node, ignore_month=False):
    """
    Build a date string from a date node.

    The "iso-8601-date" attribute wins when present. Otherwise the string is
    assembled as year[-month[-day]] from the <year>, <month> and <day> children.

    :param node: the date node
    :param ignore_month: when True, a <month> child is not used
                         (it is then reported in self.warnings like any unhandled tag)
    :return: the date string
    """
    if "iso-8601-date" in node.attrib:
        return node.attrib["iso-8601-date"]

    parts = {"year": "", "month": "", "day": ""}
    for child in node:
        tag = normalize(child.tag)
        skipped_month = ignore_month and tag == "month"

        if tag in parts and not skipped_month:
            parts[tag] = child.text
        else:
            self.warnings.append(
                {
                    self.pid: f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name} "
                    + tag
                }
            )

    # month and day are only appended to a non-empty string
    date_str = parts["year"]
    for component in (parts["month"], parts["day"]):
        if date_str and component:
            date_str += "-" + component

    return date_str
def get_data_from_ext_link(self, node, **kwargs):
    """
    Describe an <ext-link> node as a dict.

    :param node: the link node
    :param kwargs: forwarded to parse_inner_node (add_HTML_link is forced to False)
    :return: dict with the keys "rel" (ext-link-type attribute), "mimetype" (always ""),
             "location" (href), "base" and "metadata" (second value returned by parse_inner_node)
    """
    rel = node.get("ext-link-type") or ""
    location = get_normalized_attrib(node, "href") or ""
    base = get_normalized_attrib(node, "base") or ""

    kwargs["add_HTML_link"] = False
    _unused, inner_metadata = self.parse_inner_node(node, **kwargs)

    return {
        "rel": rel,
        "mimetype": "",
        "location": location,
        "base": base,
        "metadata": inner_metadata,
    }
def get_data_from_history(self, node):
    """
    Collect the dates listed under a <history> node.

    Children without a "date-type" attribute are reported in self.warnings.

    :param node: the history node
    :return: list of {"type": <date-type>, "date": <string from get_data_from_date>}
    """
    # TODO: transform history_dates in a hash where date-type is the key
    #       => Change database_cmds
    collected = []

    for child in node:
        kind = child.attrib.get("date-type")

        if kind is None:
            self.warnings.append(
                {
                    self.pid: f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name} "
                    + child.tag
                }
            )
            continue

        collected.append({"type": kind, "date": self.get_data_from_date(child)})

    return collected
def update_data_from_name(self, node, contributor):
    """
    Copy the parts of a <name> node into a contributor dict (modified in place).

    Children without text are skipped silently; children with text but an
    unhandled tag are reported in self.warnings.

    :param node: the name node
    :param contributor: dict receiving "first_name", "last_name", "prefix", "suffix"
    """
    key_for_tag = {
        "given-names": "first_name",
        "surname": "last_name",
        "prefix": "prefix",
        "suffix": "suffix",
    }

    for child in node:
        if child.text is None:
            continue

        key = key_for_tag.get(child.tag)
        if key is not None:
            contributor[key] = child.text
            continue

        self.warnings.append(
            {
                self.pid: f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name} "
                + child.tag
            }
        )
def get_data_from_name_alternatives(self, node):
    """
    Extract the text of the <string-name specific-use="index"> child.

    Children without text are skipped; children with text whose tag is not
    "string-name" are reported in self.warnings.

    :param node: the name-alternatives node
    :return: the text of the last matching child, "" if there is none
    """
    mid = ""

    for child in node:
        if child.text is None:
            continue

        if child.tag != "string-name":
            self.warnings.append(
                {
                    self.pid: f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name} "
                    + child.tag
                }
            )
        elif child.get("specific-use") == "index":
            mid = child.text

    return mid
def get_data_from_uri(self, node, **kwargs):
    """
    Describe a <uri> node with the same dict layout as get_data_from_ext_link.

    :param node: the uri node
    :param kwargs: forwarded to parse_inner_node (add_HTML_link is forced to False)
    :return: dict with "rel" set to None, empty "mimetype" and "base",
             "location" (href) and "metadata" (second value returned by parse_inner_node)
    """
    location = get_normalized_attrib(node, "href") or ""

    kwargs["add_HTML_link"] = False
    _unused, inner_metadata = self.parse_inner_node(node, **kwargs)

    return {
        "rel": None,
        "mimetype": "",
        "location": location,
        "base": "",
        "metadata": inner_metadata,
    }
def helper_add_link_from_node(self, node, **kwargs):
    """
    Return the text to insert for a link node.

    The node is first described by the get_data_from_<tag> method matching its tag.
    When the link has no "rel" or a "uri" rel, the result is a \\href{...}{...}
    command (self.for_tex_file) or the output of make_links_clickable.
    In every other case the node text (or "") is returned unchanged.
    """
    fallback_text = node.text or ""

    getter_name = "get_data_from_" + normalize(node.tag).replace("-", "_")
    data = getattr(self, getter_name)(node, **kwargs)

    if data["rel"] and data["rel"] != "uri":
        return fallback_text

    href = data["location"]
    if self.for_tex_file:
        return "".join(("\\href{", href, "}{", data["metadata"], "}"))

    return make_links_clickable(href, data["metadata"])
def get_list_start_value(self, list_node):
    """
    Compute the numbering offset of a list that continues previous lists.

    The "continued-from" attribute holds the id of the list being continued.
    The offset is the number of children of every list found by following
    the "continued-from" chain through self.tree.

    A reference to an id that is not in the tree ends the chain (the original
    code raised UnboundLocalError in that case). A chain that loops back on an
    id already visited is also stopped, to avoid endless recursion.

    :param list_node: the list node to number
    :return: the number of items that precede the list (0 if it is not a continuation)
    """
    start = 0
    visited_ids = set()
    current = list_node

    while True:
        continued_from = current.get("continued-from")
        if continued_from is None or continued_from in visited_ids:
            break
        visited_ids.add(continued_from)

        from_node = self.tree.find(f'.//*[@id="{continued_from}"]')
        if from_node is None:
            # Dangling reference: nothing more to count
            break

        start += len(from_node)
        current = from_node

    return start
class MathdocPublication(MathdocPublicationData, JatsBase):
    """
    Publication (collection) parsed from an XML tree.

    The tree given in the "tree" keyword argument is parsed in the constructor.
    Unhandled tags are reported in self.warnings.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """Read the direct children of the publication node and set the instance variables."""
        super().parse_tree(tree)

        for node in tree:
            tag = normalize(node.tag)

            if tag in ("publication-id", "collection-id"):
                node_type = node.get("publication-id-type")
                if node_type is None or node_type in ["numdam-id", "mathdoc-id"]:
                    self.pid = node.text
            elif tag == "title-group":
                self.parse_title_group(node)
            elif tag == "issn":
                node_type = node.get("pub-type")
                if node_type == "ppub":
                    self.issn = node.text
                    self.ids.append(("issn", node.text))
                elif node_type == "epub":
                    self.e_issn = node.text
                    self.ids.append(("e-issn", node.text))
            elif tag == "ext-link":
                data = self.get_data_from_ext_link(node)
                self.ext_links.append(data)
            elif tag == "custom-meta-group":
                self.parse_custom_meta_group(node)
            elif tag == "description":
                self.parse_description(node)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_custom_meta_group(self, node, **kwargs):
        """Store the "serial-type", "wall" and "provider" custom metadata."""
        for child in node:
            tag = normalize(child.tag)

            if tag == "custom-meta":
                name, value = self.get_data_from_custom_meta(child)

                if name == "serial-type":
                    self.coltype = value
                elif name == "wall":
                    self.wall = int(value)
                elif name == "provider":
                    self.provider = value
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_description(self, node, **kwargs):
        """
        Append the <description> node to self.abstracts.

        value_xml keeps the full serialized node; value_html and value_tex hold
        the same XML without the enclosing <description ...> ... </description> tags.
        """
        # tag = get_normalized_attrib(node, "abstract-node_type") or "abstract"
        tag = "description"
        lang = get_normalized_attrib(node, "lang") or self.lang
        value_xml = get_xml_from_node(node)

        # Strip the wrapper tags. The previous code replaced the misspelled "<decription",
        # which never matched, so the opening tag was left in value_html / value_tex.
        # The opening tag may carry attributes (ex: xml:lang), hence the regular expression.
        inner_xml = re.sub(r"^\s*<description\b[^>]*>", "", value_xml, count=1)
        inner_xml = re.sub(r"</description>\s*$", "", inner_xml, count=1)
        value_tex = value_html = inner_xml

        self.abstracts.append(
            {
                "tag": tag,
                "lang": lang,
                "value_xml": value_xml,
                "value_html": value_html,
                "value_tex": value_tex,
            }
        )
class JatsPublisher(PublisherData):
    """
    Publisher parsed from a <publisher> node.

    The node given in the "tree" keyword argument is parsed in the constructor.
    Unhandled tags are reported in self.warnings.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # warnings has to exist before parse_tree is called.
        # It is no longer reset after the parsing: the previous code emptied the list
        # right after parse_tree, so the warnings were always lost.
        self.warnings = []
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """Read <publisher-name> and <publisher-loc>; report any other child tag."""
        for node in tree:
            tag = normalize(node.tag)

            if tag == "publisher-name":
                self.name = node.text
            elif tag == "publisher-loc":
                self.loc = node.text
            else:
                # NOTE(review): PublisherData is not visible here; getattr avoids an
                # AttributeError in case it does not define "pid" - confirm and simplify.
                self.warnings.append(
                    {
                        getattr(self, "pid", None): self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )
class JatsJournal(JournalData, JatsBase):
    """
    Journal parsed from a <journal-meta> like node.

    The node given in the "tree" keyword argument is parsed in the constructor.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """Set pid, titles, publisher and ISSNs from the children of the node."""
        super().parse_tree(tree)

        # pub-type => (instance variable, id type stored in self.ids)
        issn_targets = {"ppub": ("issn", "issn"), "epub": ("e_issn", "e-issn")}

        for node in tree:
            tag = normalize(node.tag)

            if tag == "journal-id":
                # A journal-id without type is considered as a numdam-id
                id_type = node.get("journal-id-type") or "numdam-id"
                if id_type in ("numdam-id", "mathdoc-id"):
                    self.pid = node.text
            elif tag == "journal-title-group":
                self.parse_title_group(node)
            elif tag == "publisher":
                self.publisher = JatsPublisher(tree=node)
            elif tag == "issn":
                # An issn without pub-type is considered as a print issn
                target = issn_targets.get(node.get("pub-type") or "ppub")
                if target is not None:
                    attr_name, id_name = target
                    setattr(self, attr_name, node.text)
                    self.ids.append((id_name, node.text))
            else:
                self.warnings.append(
                    {
                        self.pid: f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name} "
                        + tag
                    }
                )
class JatsEdito(ArticleData, JatsBase):
    """
    Editorial parsed from a tree of <p> and <h1> nodes.

    The tree given in the "tree" keyword argument is parsed in the constructor
    and its content is accumulated in self.body_html.
    """

    def __init__(self, *args, **kwargs):  # , tree, pid=None):
        super().__init__(*args, **kwargs)

        # (option name, value used when the keyword argument is absent)
        option_defaults = (
            ("pid", None),
            ("issue", None),
            ("add_span_around_tex_formula", False),
            ("for_tex_file", False),
            ("from_folder", None),
            ("no_bib", False),
        )
        for option, default in option_defaults:
            setattr(self, option, kwargs.get(option, default))

        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Append every non-empty <p> / <h1> child to self.body_html, wrapped in its own tag.

        :return: self.body_html
        """
        super().parse_tree(tree)

        for node in tree:
            tag = normalize(node.tag)
            if tag not in ("p", "h1"):
                continue

            text_html = get_text_from_node(node)
            if text_html:
                self.body_html += "<" + tag + ">" + text_html + "</" + tag + ">"

        return self.body_html
class JatsIssue(IssueData, JatsBase):
    """
    Issue parsed from an XML tree with <journal-meta>, <issue-meta> and <body> children.

    The tree given in the "tree" keyword argument is parsed in the constructor.
    Each <article> of the <body> becomes a JatsArticle stored in self.articles.
    Unhandled tags are reported in self.warnings.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # from_folder is used to change the location of Elsevier graphics to a full path location
        self.from_folder = kwargs.get("from_folder")
        self.no_bib = kwargs.get("no_bib", False)

        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """Parse the issue, then post-process its articles (editors, Elsevier fixes)."""
        super().parse_tree(tree)

        for node in tree:
            tag = normalize(node.tag)
            if tag == "journal-meta":
                self.journal = JatsJournal(tree=node)
            elif tag == "issue-meta":
                ctype = get_normalized_attrib(node, "issue_type")
                if ctype == "issue_special":
                    self.ctype = "issue_special"
                self.parse_issue_meta(node)
            elif tag == "body":
                for child in node:
                    tag = normalize(child.tag)

                    if tag == "article":
                        article = JatsArticle(
                            tree=child,
                            issue=self,
                            from_folder=self.from_folder,
                            no_bib=self.no_bib,
                        )
                        self.warnings.extend(article.warnings)
                        self.articles.append(article)
                    else:
                        self.warnings.append(
                            {
                                self.pid: self.__class__.__name__
                                + "."
                                + inspect.currentframe().f_code.co_name
                                + " "
                                + tag
                            }
                        )
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

        if self.journal is not None:
            self.publisher = self.journal.publisher

        self._remove_replicated_editors()

        # Articles with a "pii" attribute are treated as Elsevier articles
        if any(hasattr(xarticle, "pii") for xarticle in self.articles):
            self._fix_elsevier_icons()
            for xarticle in self.articles:
                self._fix_elsevier_article_type(xarticle)

    def _remove_replicated_editors(self):
        """Remove the editors of an article when they are exactly the editors of the issue."""
        # Issue editors may be replicated in all the articles, remove them
        issue_editors = [contrib for contrib in self.contributors if contrib["role"] == "editor"]

        for xarticle in self.articles:
            editors = [contrib for contrib in xarticle.contributors if contrib["role"] == "editor"]

            # Same number of editors, with the same names in the same order
            is_equal = len(editors) == len(issue_editors) and all(
                editor["last_name"] == issue_editor["last_name"]
                and editor["first_name"] == issue_editor["first_name"]
                for editor, issue_editor in zip(editors, issue_editors)
            )
            if is_equal:
                xarticle.contributors = [
                    contrib for contrib in xarticle.contributors if contrib["role"] != "editor"
                ]

    def _fix_elsevier_icons(self):
        """Turn the location of the icon links into a "file:" location."""
        # Fix location of icons
        for link in self.ext_links:
            if link["rel"] in ["icon", "small_icon"]:
                base_dir = self.journal.pid
                location = link["location"]
                if os.path.dirname(location) != base_dir:
                    location = os.path.join(base_dir, self.pid, location)
                if self.from_folder:
                    location = os.path.join(self.from_folder, location)
                    location = "file:" + location
                link["location"] = location

    def _fix_elsevier_article_type(self, xarticle):
        """Deduce xarticle.atype from its pages and subjects; keep only the relevant subjects."""
        # Fix article types and subjects
        article_type = "research-article"
        old_type = ""
        new_subjs = []

        # fpage can be None (empty <fpage/>): int(None) raises a TypeError that was not caught.
        # A missing fpage is handled like an empty one.
        if xarticle.fpage:
            try:
                int(xarticle.fpage)
            except ValueError:
                # fpage is not a number: the article is an editorial
                article_type = "editorial"

        if article_type == "research-article":
            for subj in xarticle.subjs:
                if subj["type"] == "type":
                    # Fix article types
                    value = subj["value"].lower()
                    old_type = value
                    if value == "discussion":
                        article_type = "letter"
                    elif value == "editorial":
                        if xarticle.title_tex.lower().startswith("foreword"):
                            article_type = "foreword"
                        else:
                            article_type = "editorial"
                    elif value in ["mini review", "review article", "book review"]:
                        article_type = "review"
                    elif value == "research article":
                        article_type = "research-article"
                    elif value == "short communication":
                        article_type = "foreword"
                    elif value == "correspondence":
                        article_type = "letter"
                    elif value.startswith("conference"):
                        article_type = "congress"
                elif subj["type"] == "heading" and not xarticle.title_tex:
                    # The title may be stored in the heading: fix it
                    xarticle.title_tex = xarticle.title_html = subj["value"]
                    xarticle.title_xml = get_title_xml(subj["value"])
                elif subj["type"] == "heading":
                    value = subj["value"].lower().strip()
                    issue_title = self.title_tex.lower()
                    if issue_title.startswith("dossier: "):
                        issue_title = issue_title[9:]
                        self.title_tex = self.title_html = self.title_tex[9:]
                        # NOTE(review): title_xml is built from the lowercased title whereas
                        # title_tex keeps its case - confirm this is intended.
                        self.title_xml = (
                            "<issue-title>"
                            + get_single_title_xml(issue_title)
                            + "</issue-title>"
                        )

                    # Some heading values are in fact article type
                    if value.startswith("erratum"):
                        article_type = "erratum"
                    elif value.startswith("corrigendum"):
                        article_type = "corrigendum"
                    elif value.startswith("foreword"):
                        article_type = "foreword"
                    elif value.startswith(
                        (
                            "nécrologie",
                            "obituary",
                            "block calendar/éphéméride",
                            "chronique",
                            "histoire",
                            "historic",
                            "tribute/hommage",
                        )
                    ):
                        article_type = "history-of-sciences"
                    elif value.startswith("note historique"):
                        article_type = "historical-commentary"
                    elif value.startswith(
                        ("le point sur", "le point-sur", "review", "revue", "concise review")
                    ):
                        article_type = "review"
                    elif value.startswith("conférence"):
                        article_type = "congress"
                    elif value.startswith(("communication", "preliminary")):
                        article_type = "preliminary-communication"
                    elif value.startswith("perspective") and old_type in [
                        "correspondence",
                        "short communication",
                    ]:
                        article_type = "opinion"
                    elif value.startswith("debate"):
                        article_type = "opinion"
                    elif value.startswith(
                        (
                            "index",
                            "keyword",
                            "sommaire",
                            "table auteurs",
                            "table sommaire",
                            "page présentation des index",
                        )
                    ):
                        article_type = "editorial"
                    elif value.startswith("fac-similé"):
                        # crbiol article, Pubmed stores them as "Classical Article"
                        article_type = "historical-commentary"
                        # The subject is added in this case to keep the "fac-similé" (== copy) mention
                        new_subjs.append(subj)
                    # Ignore the issue titles
                    elif not self.title_tex or not value.startswith(
                        self.title_tex.lower().strip()
                    ):
                        # Exclude headings that are redundant with article types
                        exclude_list = (
                            "editorial",
                            "éditorial",
                            "avant-propos",
                            "book review",
                            "comment",
                            "concise review paper",
                            "answer",
                            "commentaire",
                            "commentary",
                            "reply",
                            "foreword",
                            "full paper",
                            "mémoire",
                        )
                        if not value.startswith(exclude_list):
                            new_subjs.append(subj)
                else:
                    new_subjs.append(subj)

        xarticle.atype = article_type
        xarticle.subjs = new_subjs

    def parse_custom_meta_group(self, node, **kwargs):
        """Store the "provider" and "efirst" custom metadata."""
        for child in node:
            tag = normalize(child.tag)

            if tag == "custom-meta":
                name, value = self.get_data_from_custom_meta(child)

                if name == "provider":
                    self.provider = value
                elif name == "efirst":
                    self.with_online_first = value == "yes"
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_issue_meta(self, node, **kwargs):
        """
        Read the children of <issue-meta>.

        Tags without a dedicated branch are dispatched to the parse_<tag> method
        if it exists, and reported in self.warnings otherwise.
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "issue-id":
                self.parse_id(child)
            elif tag == "volume-series":
                self.vseries = child.text
            elif tag == "volume":
                self.volume = child.text
            elif tag == "issue":
                self.number = child.text
            elif tag == "pub-date":
                self.year = self.get_data_from_date(child, ignore_month=True)
            elif tag == "history":
                history_dates = self.get_data_from_history(child)
                for date in history_dates:
                    if date["type"] == "last-modified":
                        self.last_modified_iso_8601_date_str = date["date"]
                    elif date["type"] == "prod-deployed-date":
                        self.prod_deployed_date_iso_8601_date_str = date["date"]
            elif tag == "issue-title":
                content_type = child.get("content-type") or ""
                if content_type != "subtitle" and content_type != "cover-date":
                    # Elsevier stores contributors in subtitles. Ignore.
                    lang = get_normalized_attrib(child, "lang") or "und"
                    if not self.title_tex and (
                        self.lang == "und" or lang == "und" or lang == self.lang
                    ):
                        self.parse_title(child)
                        # In xmldata, title_xml had the <title_group> tag:
                        # self.title_xml can't be set in parse_title
                        self.title_xml += get_xml_from_node(child)
                    else:
                        self.trans_lang = lang
                        (
                            self.trans_title_tex,
                            self.trans_title_html,
                        ) = self.parse_node_with_mixed_content(child)
                        self.title_xml += get_xml_from_node(child)
            elif tag == "issue-title-group":
                self.parse_title_group(child)
            else:
                fct_name = "parse_" + tag.replace("-", "_")
                ftor = getattr(self, fct_name, None)
                if callable(ftor):
                    ftor(child, add_ext_link=True)
                else:
                    self.warnings.append(
                        {
                            self.pid: self.__class__.__name__
                            + "."
                            + inspect.currentframe().f_code.co_name
                            + " "
                            + tag
                        }
                    )

        if self.last_modified_iso_8601_date_str is None:
            self.last_modified_iso_8601_date_str = timezone.now().isoformat()
class JatsArticleBase(JatsBase):
    """Base class providing the custom metadata parsing of articles."""

    def parse_custom_meta_group(self, node, **kwargs):
        """
        Store the custom metadata of an article.

        "article-number", "talk-number" and "provider" are copied to instance variables.
        "presented" adds a contributor with the "presenter" role.
        Children other than <custom-meta> are reported in self.warnings.
        """
        # meta-name => instance variable receiving the meta-value
        plain_fields = {
            "article-number": "article_number",
            "talk-number": "talk_number",
            "provider": "provider",
        }

        for child in node:
            tag = normalize(child.tag)

            if tag != "custom-meta":
                self.warnings.append(
                    {
                        self.pid: f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name} "
                        + tag
                    }
                )
                continue

            name, value = self.get_data_from_custom_meta(child)

            if name in plain_fields:
                setattr(self, plain_fields[name], value)
            elif name == "presented":
                # Only the name of the presenter is kept
                presenter_name = value.replace("Presented by ", "")
                presenter_name = presenter_name.replace("Présenté par ", "")

                presenter = create_contributor()
                presenter["role"] = "presenter"
                presenter["string_name"] = presenter_name
                presenter["contrib_xml"] = get_contrib_xml(presenter)
                self.contributors.append(presenter)
2449class JatsArticle(ArticleData, JatsArticleBase):
def __init__(self, *args, **kwargs):  # , tree, pid=None):
    """
    Store the optional keyword arguments, then parse kwargs["tree"].

    Optional keyword arguments: pid, issue, add_span_around_tex_formula,
    for_tex_file, from_folder, no_bib.
    """
    super().__init__(*args, **kwargs)

    # (option name, value used when the keyword argument is absent)
    option_defaults = (
        ("pid", None),
        ("issue", None),
        ("add_span_around_tex_formula", False),
        ("for_tex_file", False),
        ("from_folder", None),
        ("no_bib", False),
    )
    for option, default in option_defaults:
        setattr(self, option, kwargs.get(option, default))

    self.parse_tree(kwargs["tree"])
def parse_tree(self, tree):
    """
    Parse an <article> tree in two passes.

    The first pass handles <front>, <front-stub> and <floats-group> so that the
    metadata and the floats are known before the body is read.
    The second pass handles <back>, <body> and <sub-article>.
    Footnotes are then appended to body_html, the funding statement is wrapped in
    <name-content>, and post_parse_tree is called.

    Unhandled tags are reported in self.warnings. A <front-stub>, or a <ref-list>
    skipped because of no_bib, is handled on purpose and is not reported.
    """
    super().parse_tree(tree)

    # The class and function names do not change during the parsing
    warning_prefix = f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name} "

    self.atype = get_normalized_attrib(tree, "article-type") or ""

    # First loop to catch float-groups that are inserted inside the body
    for node in tree:
        tag = normalize(node.tag)

        if tag == "front":
            for child in node:
                tag = normalize(child.tag)

                if tag == "article-meta":
                    self.parse_article_meta(child)
                else:
                    self.warnings.append({self.pid: warning_prefix + tag})
        elif tag == "front-stub":
            self.parse_article_meta(node)
        elif tag == "floats-group":
            self.parse_floats_group(node)

    for node in tree:
        tag = normalize(node.tag)
        if tag == "back":
            for child in node:
                tag = normalize(child.tag)

                if tag == "ref-list":
                    # The bibliography is ignored on request (no_bib): this is not an unknown tag
                    if not self.no_bib:
                        self.parse_ref_list(child)
                elif tag == "ack":
                    self.parse_ack(child)
                elif tag == "sec":
                    self.parse_sec(child)
                elif tag == "app-group":
                    self.parse_app_group(child)
                elif tag == "fn-group":
                    self.parse_fn_group(child)
                else:
                    self.warnings.append({self.pid: warning_prefix + tag})

        elif tag == "body":
            self.parse_body(node)
        elif tag == "sub-article":
            self.parse_sub_article(node)
        elif tag in ("floats-group", "front", "front-stub"):
            # Handled above
            pass
        else:
            self.warnings.append({self.pid: warning_prefix + tag})

    # Add the footnotes at the end
    if len(self.fns) > 0:
        fn_text = '<div class="footnotes">' + "".join(self.fns) + "</div>"

        self.body_html = fn_text if not self.body_html else self.body_html + fn_text

    if (
        len(self.funding_statement_xml) > 0
        and self.funding_statement_xml.find('<name-content content-type="fn"') == -1
    ):
        self.funding_statement_xml = (
            f'<name-content content-type="fn">{self.funding_statement_xml}</name-content>'
        )

    # Case for XML with <body>, then <back> and <floats_group>
    # The figures/tables of the floats_group are added inside the body_html
    # (close to their first <xref>)
    # It's too complicated to do the same for the body_xml as we use the get_xml_from_node function.
    # Instead, we append the floats_group_xml to the body_xml
    if hasattr(self, "floats_group_xml"):
        self.body_xml += self.floats_group_xml

    # Special treatment for Elsevier articles: web scrapping to find the date_published
    # Moved to the import management commands since Elsevier blocks IP after 1000+ requests

    self.post_parse_tree()
def update_body_content(self, node, **kwargs):
    """
    Append the content of a node to body_tex, body_html and body_xml.

    Nothing is done for a node without children, or for a node that only
    brought supplementary materials.

    :param node: the node to add to the body
    :param kwargs: use_sec=True parses the node with parse_node_with_sec and
                   stores it as a <sec> in body_xml (hack for Elsevier <ack>)
    """
    if not len(node):
        # Most journals do not display the Full text
        # the <body> is then used to store the text for the search engine and has no children
        # Let's not compute body_html in this case.
        # We want the same behavior for journals that display the Full text,
        # but with old articles without Full text.
        return

    # <front> has to be put before <body> so self.pid is defined here
    if hasattr(settings, "SITE_URL_PREFIX"):
        base_url = "/" + settings.SITE_URL_PREFIX + settings.ARTICLE_BASE_URL + self.pid
    else:
        base_url = os.path.join(settings.ARTICLE_BASE_URL, self.pid)
    kwargs["base_url"] = base_url

    use_sec = bool(kwargs.get("use_sec"))
    supplements_before = len(self.supplementary_materials)

    # Hack for Elsevier: convert <ack> into <sec> of the <body>
    parse_node = self.parse_node_with_sec if use_sec else self.parse_node_with_mixed_content
    body_tex, body_html = parse_node(node, **kwargs)

    if len(self.supplementary_materials) == supplements_before:
        append_to_body = True
    else:
        # Elsevier stores supplementary-material in app-group.
        # They are extracted, but ignored in the body_html if the appendix has only supplements
        append_to_body = any(
            gchild.tag != "supplementary-material"
            for child in node
            if child.tag == "p"
            for gchild in child
        )

    if not append_to_body:
        return

    self.body_tex = self.body_tex + body_tex if self.body_tex else body_tex
    self.body_html = self.body_html + body_html if self.body_html else body_html

    body_xml = get_xml_from_node(node)
    if not self.body_xml:
        self.body_xml = body_xml
        return

    # The new XML is inserted before the last 7 characters of the current body_xml
    # ("</body>" is appended again)
    current_xml = self.body_xml[0:-7]
    if use_sec:
        # The first 5 and last 6 characters of the new XML are replaced by <sec> ... </sec>
        self.body_xml = f"{current_xml}<sec>{body_xml[5:-6]}</sec></body>"
    else:
        self.body_xml = f"{current_xml}{body_xml}</body>"
def parse_ack(self, node, **kwargs):
    """
    Handle an <ack> node.

    A node with content-type "COI-statement" sets self.coi_statement;
    any other <ack> is added to the body as a section.
    """
    is_coi_statement = (node.get("content-type") or "") == "COI-statement"

    if not is_coi_statement:
        # Hack for Elsevier: convert <ack> into <sec> of the <body>
        self.update_body_content(node, use_sec=True)
        return

    self.coi_statement = get_text_from_node(node)
def parse_app(self, node, **kwargs):
    """Add every <sec> of an <app> node to the body; report the other children."""
    for child in node:
        tag = normalize(child.tag)

        if tag != "sec":
            self.warnings.append(
                {
                    self.pid: f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name} "
                    + tag
                }
            )
            continue

        # Elsevier can store all appendixes inside one <app> ?!?
        # One of them can store the supplements and has to be ignored in the body_html
        self.update_body_content(child)
def parse_app_group(self, node, **kwargs):
    """Parse every <app> of an <app-group> node; report the other children."""
    for child in node:
        tag = normalize(child.tag)

        if tag != "app":
            self.warnings.append(
                {
                    self.pid: f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name} "
                    + tag
                }
            )
            continue

        self.parse_app(child)
def parse_article_categories(self, node, **kwargs):
    """
    Parse <article-categories>: each <subj-group> child is handed to parse_subj_group.

    Any other child is reported in self.warnings.
    """
    for category in node:
        category_tag = normalize(category.tag)

        if category_tag != "subj-group":
            # Unexpected tag: record "<class>.<method> <tag>" under the pid
            origin = f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name}"
            self.warnings.append({self.pid: origin + " " + category_tag})
            continue

        self.parse_subj_group(category)
def parse_article_meta(self, node, **kwargs):
    """
    Parse <article-meta> and store ids, pages, dates and history on the instance.

    Tags without a dedicated branch are dispatched to the method named
    parse_<tag> (dashes replaced by underscores), called with add_ext_link=True.
    Tags without such a method are reported in self.warnings.
    """
    # Tags that are read but deliberately not stored.
    # TODO: store permissions in XML
    # author-notes: 2022/11/15 Mersenne meeting. ignore author-notes
    skipped_tags = ["volume", "issue-id", "permissions", "pub-date-not-available", "author-notes"]

    for meta_child in node:
        tag = normalize(meta_child.tag)

        if tag == "article-id":
            self.parse_id(meta_child)
        elif tag == "fpage":
            self.fpage = meta_child.text
            self.page_type = meta_child.get("content-type") or ""
        elif tag == "lpage":
            self.lpage = meta_child.text or ""
        elif tag == "page-range":
            self.page_range = meta_child.text
        elif tag in ("page-count", "size"):
            self.size = meta_child.text
        elif tag == "elocation-id":
            self.elocation = meta_child.text
        elif tag == "pub-date":
            # A pub-date without date-type is handled like date-type="pub"
            is_publication_date = (meta_child.get("date-type") or "pub") == "pub"
            iso_date = self.get_data_from_date(meta_child)
            if is_publication_date:
                self.date_published_iso_8601_date_str = iso_date
            else:
                self.history_dates.append({"type": "online", "date": iso_date})
        elif tag == "history":
            self.history_dates += self.get_data_from_history(meta_child)
            for history_date in self.history_dates:
                if history_date["type"] == "prod-deployed-date":
                    self.prod_deployed_date_iso_8601_date_str = history_date["date"]
        elif tag in skipped_tags:
            continue
        else:
            # Automatic dispatch to parse_<tag>
            handler = getattr(self, "parse_" + tag.replace("-", "_"), None)
            if callable(handler):
                handler(meta_child, add_ext_link=True)
            else:
                origin = f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name}"
                self.warnings.append({self.pid: origin + " " + tag})
def parse_author_notes(self, node, **kwargs):
    """
    Parse <author-notes>: the XML and HTML of each <fn> child are appended to
    self.footnotes_xml and self.footnotes_html. Other children are ignored.
    """
    for note in node:
        if normalize(note.tag) != "fn":
            continue

        _, fn_html = self.parse_node_with_fn(note, keep_fn=True, keep_fn_label=False)
        fn_xml = get_xml_from_node(note)
        self.footnotes_xml += fn_xml
        self.footnotes_html += fn_html
def parse_body(self, node, **kwargs):
    """
    Parse <body>: store its text in self.body, then update the body
    tex/html/xml through update_body_content.
    """
    self.body = get_text_from_node(node)

    # self.floats is created by parse_floats_group
    has_floats = hasattr(self, "floats")
    if has_floats:
        self.floats_to_insert = []

    self.update_body_content(node, **kwargs)

    # Fall back to the XML of the node when body_xml is still empty
    if not self.body_xml:
        self.body_xml = get_xml_from_node(node)
def parse_boxed_text(self, node, **kwargs):
    """
    Parse a <boxed-text> (child of <floats-group>) and store its HTML in the
    self.floats dictionary, using the id attribute of the node as key.
    A <boxed-text> without id is parsed but not stored.
    """
    box_id = node.attrib.get("id")

    _, box_html = self.parse_node_with_boxed_text(node, **kwargs)

    if box_id is None:
        return
    self.floats[box_id] = box_html
def parse_floats_group(self, node, **kwargs):
    """
    Parse <floats-group>: reset self.floats, parse each <fig>, <table-wrap> and
    <boxed-text> child, then store the XML of the node in self.floats_group_xml.

    Any other child is reported in self.warnings.
    """
    # Base URL of the article, passed to the parsers of the floats
    if not hasattr(settings, "SITE_URL_PREFIX"):
        base_url = os.path.join(settings.ARTICLE_BASE_URL, self.pid)
    else:
        site_prefix = settings.SITE_URL_PREFIX
        article_base = settings.ARTICLE_BASE_URL
        base_url = "/" + site_prefix + article_base + self.pid

    self.floats = {}
    for float_node in node:
        float_tag = normalize(float_node.tag)

        if float_tag == "fig":
            self.parse_node_with_fig(float_node, append_floats=True, base_url=base_url)
        elif float_tag == "table-wrap":
            self.parse_node_with_table_wrap(float_node, append_floats=True, base_url=base_url)
        elif float_tag == "boxed-text":
            self.parse_boxed_text(float_node, base_url=base_url)
        else:
            origin = f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name}"
            self.warnings.append({self.pid: origin + " " + float_tag})

    self.floats_group_xml = get_xml_from_node(node)
def parse_fn_group(self, node, **kwargs):
    """
    Parse <fn-group>: the HTML and XML of each <fn> child are appended to
    self.footnotes_html and self.footnotes_xml.

    Any other child is reported in self.warnings.
    """
    for footnote in node:
        footnote_tag = normalize(footnote.tag)

        if footnote_tag != "fn":
            # Unexpected tag: record "<class>.<method> <tag>" under the pid
            origin = f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name}"
            self.warnings.append({self.pid: origin + " " + footnote_tag})
            continue

        _, fn_html = self.parse_node_with_fn(footnote, keep_fn=True)
        fn_xml = get_xml_from_node(footnote)

        self.footnotes_html += fn_html
        self.footnotes_xml += fn_xml
def parse_funding_group(self, node, **kwargs):
    """
    Parse <funding-group>.

    <award-group> children are handed to parse_award_group.
    For <funding-statement>, only the <fn> found inside <name-content> are used:
    their HTML is appended to self.funding_statement_html and the XML of the
    <name-content> is stored in self.funding_statement_xml.
    Any other child is reported in self.warnings.
    """
    for group_child in node:
        group_tag = normalize(group_child.tag)

        if group_tag == "award-group":
            self.parse_award_group(group_child)
        elif group_tag == "funding-statement":
            # Raw tags are compared here (no normalize), as in the original code
            name_contents = (item for item in group_child if item.tag == "name-content")
            for name_content in name_contents:
                for inner in name_content:
                    if inner.tag != "fn":
                        continue
                    _, fn_html = self.parse_node_with_fn(inner, keep_fn=True)
                    self.funding_statement_html += fn_html
                    self.funding_statement_xml = get_xml_from_node(name_content)

            # TODO: handle funding-statement with simple texts
        else:
            origin = f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name}"
            self.warnings.append({self.pid: origin + " " + group_tag})
def parse_issue(self, node, **kwargs):
    """
    Set self.seq from the seq attribute of the node.

    self.seq is "0" when the instance has a pii attribute, or when the node has
    no (or an empty) seq attribute.
    """
    # Elsevier (instances with a pii) stores unusable values in the seq attribute
    if hasattr(self, "pii"):
        self.seq = "0"
    else:
        self.seq = node.get("seq") or "0"
class JatsRef(RefBase, JatsBase):
    """
    Bibliographic reference parsed from a JATS <ref> tree.

    The tree is parsed in the constructor. The children of <ref> (<label>,
    <mixed-citation>, <note>, <element-citation>) set the instance variables
    (label, type, contributors, titles, comment...) as well as citation_xml,
    citation_html and citation_tex.
    """

    def __init__(self, *args, tree, lang="und", **kwargs):
        """
        :param tree: the <ref> node to parse (keyword only)
        :param lang: language forwarded to the base classes
        """
        super().__init__(*args, lang=lang, **kwargs)
        self.parse_tree(tree)

    def parse_tree(self, tree):
        """
        Parse the children of the <ref> node.

        Unknown children are reported in self.warnings. The XML of every child,
        known or not, is appended to self.citation_xml.
        """
        super().parse_tree(tree)

        self.user_id = get_normalized_attrib(tree, "id") or ""

        for node in tree:
            tag = normalize(node.tag)

            if tag == "label":
                self.label = node.text or ""

                # Surround the label with brackets unless it already starts with one
                if self.label and self.label[0] != "[":
                    self.label = "[" + self.label + "]"

            elif tag == "mixed-citation" or tag == "note":
                self.parse_citation_node(node)

                self.citation_tex, self.citation_html = self.parse_node_with_mixed_content(
                    node,
                    is_citation=True,
                    is_mixed_citation=True,
                    add_ext_link=True,
                    ref_type="misc",
                )

                # The label (parsed from a previous <label>) prefixes the citation
                if self.label:
                    self.citation_html = self.label + " " + self.citation_html
                    self.citation_tex = self.label + " " + self.citation_tex

            elif tag == "element-citation":
                self.parse_citation_node(node)

                self.citation_tex = self.citation_html = get_citation_html(self)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

            # With xmldata, citation_xml does not have '<ref>', but only the text of the children
            self.citation_xml += get_xml_from_node(node)

    def get_data_from_name_in_ref(self, node, role):
        """
        Build the contributor dict of a <name>, <string-name>, <name-alternatives>,
        <collab> or <etal> node of a reference.

        :param node: the XML node of the contributor
        :param role: value stored in params["role"]
        :return: the contributor dict; for <etal>, contrib_xml is "<etal/>"
        """
        params = create_contributor()
        params["role"] = role

        if node.tag == "name":
            self.update_data_from_name(node, params)
        elif node.tag == "string-name":
            self.update_data_from_name(node, params)
            # A <string-name> without structured children: keep its raw text
            if params["first_name"] == "" and params["last_name"] == "":
                params["string_name"] = node.text or ""
        elif node.tag == "name-alternatives":
            params["mid"] = self.get_data_from_name_alternatives(node)
        elif node.tag == "collab":
            params["string_name"] = node.text or ""

        use_initials = getattr(settings, "REF_JEP_STYLE", False)
        helper_update_name_params(params, use_initials)
        params["contrib_xml"] = "<etal/>" if node.tag == "etal" else get_xml_from_node(node)

        return params

    def parse_node_with_chapter_title(self, node, **kwargs):
        """
        Return the (tex, html) of a <chapter-title>.
        With is_mixed_citation=True in kwargs, the html is post-processed by
        add_span_class_to_html_from_chapter_title.
        """
        tex, html = self.parse_inner_node(node, **kwargs)

        if kwargs.get("is_mixed_citation", False):
            html = add_span_class_to_html_from_chapter_title(html, **kwargs)

        return tex, html

    def parse_node_with_source(self, node, **kwargs):
        """
        Return the (tex, html) of a <source>.
        With is_mixed_citation=True in kwargs, the html is post-processed by
        add_span_class_to_html_from_source.
        """
        tex, html = self.parse_inner_node(node, **kwargs)

        if kwargs.get("is_mixed_citation", False):
            html = add_span_class_to_html_from_source(html, **kwargs)

        return tex, html

    def parse_citation_node(self, node, **kwargs):
        """
        Extract the data of a <mixed-citation>, <note> or <element-citation>
        (type, titles, source, series, contributors, ids, comment...) and set
        the corresponding instance variables.

        Once a <comment> has been met, the tags whose variable is already set
        are appended to self.comment instead of overwriting the variable.
        """
        self.type = get_normalized_attrib(node, "publication-type") or "misc"

        # Elsevier can store data about a translation after comments (<source>...)
        # Append these tags in the comment
        has_comment = False

        for child in node:
            tag = normalize(child.tag)

            if tag in ("page-count", "size"):
                if not self.size:
                    self.size = child.text
            elif tag == "comment":
                has_comment = True
                # comments may have ext-links or uri. HTML <a> links will be added
                _, comment = self.parse_node_with_mixed_content(
                    child, is_citation=True, is_comment=True, add_HTML_link=True
                )
                if self.comment:
                    self.comment += " "
                self.comment += comment
            elif tag == "source":
                # TODO: migration to store source_tex and source_html
                _, source_tex = self.parse_node_with_mixed_content(child, is_citation=True)

                if self.type in ["book", "inproceedings"] and len(self.source_tex) > 0:
                    # Multiple source for a book, store the extra source in series
                    if self.series and has_comment:
                        self.comment += " " + source_tex
                    else:
                        if self.series:
                            self.series += ", "
                        self.series += get_text_from_node(child)
                else:
                    if self.source_tex and has_comment:
                        self.comment += " " + source_tex
                    else:
                        self.source_tex = source_tex
            elif tag == "series":
                series = get_text_from_node(child)
                if self.series and has_comment:
                    self.comment += ", " + series
                else:
                    if self.series:
                        self.series += ", "
                    self.series += series
            elif tag == "annotation":
                if not self.annotation:
                    self.annotation = get_text_from_node(child)
            elif tag == "article-title":
                # TODO: migration to store article_title_tex and article_title_html
                _, article_title_tex = self.parse_node_with_mixed_content(child, is_citation=True)

                if self.type == "book":
                    # Elsevier uses article-title for books !?!
                    if len(self.source_tex) == 0:
                        if has_comment:
                            self.comment += " " + article_title_tex
                        else:
                            self.source_tex = article_title_tex
                    else:
                        if self.series and has_comment:
                            self.comment += ", " + article_title_tex
                        else:
                            self.series += get_text_from_node(child)
                elif self.type == "inproceedings":
                    if self.chapter_title_tex and has_comment:
                        self.comment += " " + article_title_tex
                    else:
                        self.chapter_title_tex = article_title_tex
                else:
                    if self.article_title_tex and has_comment:
                        self.comment += " " + article_title_tex
                    else:
                        self.article_title_tex = article_title_tex
            elif tag == "chapter-title":
                # TODO: migration to store chapter_title_tex and chapter_title_html
                _, chapter_title_tex = self.parse_node_with_mixed_content(child, is_citation=True)
                if self.chapter_title_tex and has_comment:
                    self.comment += " " + chapter_title_tex
                else:
                    self.chapter_title_tex = chapter_title_tex
            elif tag == "conf-name":
                _, conf_tex = self.parse_node_with_mixed_content(child, is_citation=True)
                if self.source_tex and has_comment:
                    self.comment += ", " + conf_tex
                else:
                    self.source_tex = conf_tex
            elif tag in ("name", "string-name", "name-alternatives", "etal", "collab"):
                # Names outside a <person-group> are stored as authors
                params = self.get_data_from_name_in_ref(child, "author")
                self.contributors.append(params)
            elif tag == "person-group":
                self.parse_person_group(child)
            elif tag == "ext-link":
                self.parse_ext_link(child, add_ext_link=True)
            elif tag == "pub-id":
                self.parse_pub_id(child)
            elif tag == "date":
                self.year = get_text_from_node(child)
            elif tag == "date-in-citation":
                date_ = child.get("iso-8601-date") or ""
                if date_:
                    if self.comment:
                        self.comment += ", "
                    self.comment += "Accessed " + date_
            elif tag in ("isbn", "issn"):
                # An empty <isbn/> or <issn/> has no text (child.text is None):
                # skip it instead of failing on the string concatenation
                if child.text:
                    prefix = "ISBN: " if tag == "isbn" else "ISSN: "
                    if self.annotation:
                        self.annotation += ", "
                    self.annotation += prefix + child.text
            elif child.text is not None:
                # Generic case: the tag name gives the name of the instance variable
                variable_name = tag.replace("-", "_")
                if has_comment and hasattr(self, variable_name) and getattr(self, variable_name):
                    # The variable is already set: the value goes in the comment
                    if tag == "fpage":
                        self.comment += ", pp. "
                    elif tag == "lpage":
                        self.comment += "-"
                    else:
                        self.comment += ", "
                    self.comment += child.text
                elif not hasattr(self, variable_name) or not getattr(self, variable_name):
                    setattr(self, variable_name, child.text)

    def parse_person_group(self, node, **kwargs):
        """
        Parse a <person-group>: every name child is appended to self.contributors.

        The role is the person-group-type attribute without its trailing "s", if any.
        Children that are not names are reported in self.warnings.
        """
        role = node.get("person-group-type") or ""
        if role and role[-1] == "s":
            role = role[:-1]

        for child in node:
            tag = normalize(child.tag)

            if tag in ("name", "string-name", "name-alternatives", "etal", "collab"):
                contrib = self.get_data_from_name_in_ref(child, role)
                self.contributors.append(contrib)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_pub_id(self, node, **kwargs):
        """
        Parse a <pub-id>: its pub-id-type and text are passed to
        add_extids_from_node_with_link as the "rel" and "metadata" of a link dict.
        """
        node_type = node.get("pub-id-type") or ""

        data: ExtLinkDict = {
            "rel": node_type,
            "mimetype": "",
            "location": "",
            "base": "",
            "metadata": node.text,
        }

        self.add_extids_from_node_with_link(data)

    def split_label(self):
        """
        Used when sorting non-digit bibitems

        Split the lowercased label, stripped of its first and last characters
        (the brackets), around its digits into self.label_prefix and
        self.label_suffix.
        """
        label = self.label.lower()
        if len(label) > 1:
            label = label[1:-1]

        try:
            # Unpacking fails unless the label has exactly one run of digits
            self.label_prefix, self.label_suffix = re.split(r"[\d]+", label)
        except ValueError:
            # Special case where label is similar as "Sma" instead of "Sma15"
            self.label_prefix, self.label_suffix = [label, ""]
class BitsCollection(CollectionData, JatsBase):
    """
    Collection of a book, parsed from a BITS <collection-meta> or <in-collection>
    tree given with the `tree` keyword argument.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Parse a <collection-meta> node, or the children of an <in-collection> node.

        self.seq is computed when a <collection-meta> has been found; otherwise a
        warning is recorded. self.collection is always (re)created with the pid.
        """
        super().parse_tree(tree)

        if tree is not None:
            tag = normalize(tree.tag)
            collection_meta_node = None
            if tag == "collection-meta":
                self.parse_collection_meta(tree)
                collection_meta_node = tree
            elif tag == "in-collection":
                for node in tree:
                    tag = normalize(node.tag)

                    if tag == "collection-meta":
                        self.parse_collection_meta(node)
                        collection_meta_node = node
                    elif tag == "volume":
                        self.parse_volume(node)
                    elif tag == "volume-series":
                        self.parse_volume_series(node)
                    elif tag == "volume-title":
                        self.parse_volume_title(node)
                    else:
                        self.warnings.append(
                            {
                                self.pid: self.__class__.__name__
                                + "."
                                + inspect.currentframe().f_code.co_name
                                + " "
                                + tag
                            }
                        )

            if collection_meta_node is not None:
                self.set_seq(collection_meta_node)
            else:
                # No <collection-meta>: the warning carries the last tag read above
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

        self.collection = Foo()
        self.collection.pid = self.pid

    def parse_collection_meta(self, node, **kwargs):
        """
        Parse <collection-meta>: collection type, pid, title, ISSNs, ext-links and
        volume data. Unknown children are reported in self.warnings.
        """
        self.coltype = node.get("collection-type")

        for child in node:
            tag = normalize(child.tag)

            if tag == "collection-id":
                self.pid = child.text
            elif tag == "title-group":
                self.parse_title_group(child)
            elif tag == "issn":
                # Only the print (ppub) and electronic (epub) ISSNs are stored
                node_type = child.get("pub-type")
                if node_type == "ppub":
                    self.issn = child.text
                    self.ids.append(("issn", child.text))
                elif node_type == "epub":
                    self.e_issn = child.text
                    self.ids.append(("e-issn", child.text))
            elif tag == "ext-link":
                data = self.get_data_from_ext_link(child)
                self.ext_links.append(data)
            elif tag == "volume-in-collection":
                self.parse_volume_in_collection(child)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_volume(self, node, **kwargs):
        """Store the text of the node in self.volume."""
        self.volume = node.text

    def parse_volume_in_collection(self, node, **kwargs):
        """
        Parse <volume-in-collection>: <volume-number>, <volume-series> and
        <volume-title> children. Other children are reported in self.warnings.
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "volume-number":
                self.parse_volume(child)
            elif tag == "volume-series":
                self.parse_volume_series(child)
            elif tag == "volume-title":
                self.parse_volume_title(child)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_volume_series(self, node, **kwargs):
        """Store the text of the node (None for an empty node) in self.vseries."""
        self.vseries = node.text

    def parse_volume_title(self, node, **kwargs):
        """Store the tex, html and xml of the volume title."""
        self.title_tex, self.title_html = self.parse_node_with_mixed_content(node)
        self.title_xml = get_xml_from_node(node)

    def set_seq(self, node):
        """
        Compute self.seq (an int).

        The seq attribute of the node is used when it is an integer. Otherwise,
        seq is the first number of self.volume (0 if unusable), offset by
        self.vseries * 10000 when self.vseries is an integer.
        """
        try:
            # First, use the seq attribute, if any
            self.seq = int(node.get("seq") or "")
        except ValueError:
            # Second, use self.volume (which can be like "158-159")
            if not self.volume:
                self.seq = 0
            else:
                text = self.volume.split("-")[0]
                try:
                    self.seq = int(text)
                except ValueError:
                    self.seq = 0

            # Third, use self.vseries as an offset
            # NOTE(review): the extracted source lost its indentation; this step is
            # assumed to belong to the fallback path only - confirm against the file
            try:
                # no more than 10000 books in a series (gasp)
                self.seq = int(self.vseries) * 10000 + self.seq
            except (TypeError, ValueError):
                # vseries is None (empty <volume-series/>) or not a number: no offset
                pass
class BitsBook(BookData, JatsBase):
    """
    Book parsed from a BITS tree given with the `tree` keyword argument.
    The optional `no_bib` keyword argument is forwarded to the book parts.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.no_bib = kwargs.get("no_bib", False)

        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Parse the children of the book node, then complete the contributors and
        the title, and call post_parse_tree.
        """
        super().parse_tree(tree)

        self.ctype = "book-" + (get_normalized_attrib(tree, "book-type") or "Book")

        for node in tree:
            # Only handle children of the exact same type as the root node
            if type(tree) != type(node):
                continue

            tag = normalize(node.tag)

            if tag in ("collection-meta", "in-collection"):
                self.incollection.append(BitsCollection(tree=node))
            elif tag == "book-meta":
                self.parse_book_meta(node)
            elif tag == "book-body":
                self.parse_book_body(node)
            elif tag == "front-matter":
                self.parse_front_matter(node)
            elif tag == "book-back":
                # Only the bibliography is read in the back matter
                for back_child in node:
                    tag = normalize(back_child.tag)
                    if tag == "ref-list":
                        self.parse_ref_list(back_child)
                    else:
                        origin = (
                            f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name}"
                        )
                        self.warnings.append({self.pid: origin + " " + tag})
            else:
                origin = f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name}"
                self.warnings.append({self.pid: origin + " " + tag})

        self.set_contribs()
        self.set_title()
        self.post_parse_tree()

    def parse_book_body(self, node, **kwargs):
        """
        Parse <book-body>: each <book-part> becomes a BitsBookPart appended to
        self.parts (its warnings are merged in self.warnings).
        Without any part, the text of the node is stored in self.body.
        """
        for body_child in node:
            # Only handle children of the exact same type as the node
            if type(body_child) != type(node):
                continue

            tag = normalize(body_child.tag)

            if tag != "book-part":
                origin = f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name}"
                self.warnings.append({self.pid: origin + " " + tag})
                continue

            part = BitsBookPart(tree=body_child, no_bib=self.no_bib)
            self.warnings.extend(part.warnings)
            self.parts.append(part)

        if not self.parts:
            self.body = get_text_from_node(node)

    def parse_book_meta(self, node, **kwargs):
        """
        Parse <book-meta>: ids, year, volume, history dates, title and publisher.

        Other tags are dispatched to the method named parse_<tag> (called with
        add_ext_link=True), or reported in self.warnings when there is none.
        If no last-modified date has been found, the current time is used.
        """
        for meta_child in node:
            tag = normalize(meta_child.tag)

            if tag == "book-id":
                self.parse_id(meta_child)
            elif tag == "pub-date":
                self.year = self.get_data_from_date(meta_child)
            elif tag == "book-volume-number":
                self.volume = meta_child.text
                self.volume_int = meta_child.text
            elif tag == "pub-history":
                for history_date in self.get_data_from_history(meta_child):
                    date_type = history_date["type"]
                    if date_type == "last-modified":
                        self.last_modified_iso_8601_date_str = history_date["date"]
                    elif date_type == "prod-deployed-date":
                        self.prod_deployed_date_iso_8601_date_str = history_date["date"]
            elif tag == "book-title-group":
                self.parse_title_group(meta_child)
            elif tag == "publisher":
                self.publisher = JatsPublisher(tree=meta_child)
            else:
                # Automatic dispatch to parse_<tag>
                handler = getattr(self, "parse_" + tag.replace("-", "_"), None)
                if callable(handler):
                    handler(meta_child, add_ext_link=True)
                else:
                    origin = f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name}"
                    self.warnings.append({self.pid: origin + " " + tag})

        if self.last_modified_iso_8601_date_str is None:
            self.last_modified_iso_8601_date_str = timezone.now().isoformat()

    def parse_custom_meta_group(self, node, **kwargs):
        """
        Parse <custom-meta-group>: the value of the <custom-meta> named
        "provider" is stored in self.provider.
        """
        for meta in node:
            if normalize(meta.tag) != "custom-meta":
                continue

            name, value = self.get_data_from_custom_meta(meta)
            if name == "provider":
                self.provider = value

    def set_contribs(self):
        """
        Set the contributors of a book that does not declare any author,
        based on the contributors of its parts:
        - a monograph takes the contributors of its first part
        - an edited book or lecture notes whose parts all have the same
          contributors takes them (an edited book becomes a monograph)
        - otherwise, a "Collectif" author is added

        :return:
        """
        authors = [contrib for contrib in self.contributors if contrib["role"] == "author"]
        if authors or not self.parts:
            return

        if self.ctype == "book-monograph":
            self.contributors = self.parts[0].contributors
        elif self.ctype in ("book-edited-book", "book-lecture-notes"):
            # check if authors of the book-parts are identical
            reference = self.parts[0].contributors
            parts_differ = any(part.contributors != reference for part in self.parts[1:])

            if parts_differ:
                collective = create_contributor()
                collective["string_name"] = "Collectif"
                collective["role"] = "author"
                collective["contrib_xml"] = get_contrib_xml(collective)
                self.contributors.append(collective)
            else:
                if self.ctype == "book-edited-book":
                    self.ctype = "book-monograph"
                self.contributors = reference

    def set_title(self):
        """Use the title of the first collection when the book has no title_xml."""
        if self.title_xml != "" or len(self.incollection) == 0:
            return

        first_collection = self.incollection[0]
        self.title_xml = first_collection.title_xml
        self.title_html = self.incollection[0].title_html
        self.title_tex = self.incollection[0].title_tex
class BitsBookPart(BookPartData, JatsArticleBase):
    """
    Part of a book (<book-part>), parsed from the tree given with the `tree`
    keyword argument. Parts can be nested: see parse_body.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.no_bib = kwargs.get("no_bib", False)

        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Parse the children of the <book-part> node, then fix the title and call
        post_parse_tree. Unknown tags are reported in self.warnings.
        """
        super().parse_tree(tree)

        self.atype = get_normalized_attrib(tree, "book-part-type") or ""
        try:
            # self.seq is left untouched when the seq attribute is not an integer
            self.seq = int(get_normalized_attrib(tree, "seq") or "")
        except ValueError:
            pass

        for part_child in tree:
            tag = normalize(part_child.tag)

            if tag == "book-part-meta":
                self.parse_book_part_meta(part_child)
            elif tag == "body":
                self.parse_body(part_child)
            elif tag == "front-matter":
                self.parse_front_matter(part_child)
            elif tag == "back":
                # Only the bibliography is read in the back matter
                for back_child in part_child:
                    tag = normalize(back_child.tag)

                    if tag == "ref-list":
                        self.parse_ref_list(back_child)
                    else:
                        origin = (
                            f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name}"
                        )
                        self.warnings.append({self.pid: origin + " " + tag})
            else:
                origin = f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name}"
                self.warnings.append({self.pid: origin + " " + tag})

        # Work around a numdam-plus bug where a book-part can have a trans-title without a title
        # TODO: Fix numdam-plus, the books impacted and remove the hack
        self.set_title()

        self.post_parse_tree()

    def parse_book_part_meta(self, node, **kwargs):
        """
        Parse <book-part-meta>: id and pages.

        Other tags are dispatched to the method named parse_<tag>, or reported
        in self.warnings when there is none.
        """
        for meta_child in node:
            tag = normalize(meta_child.tag)

            if tag == "book-part-id":
                self.parse_id(meta_child)
            elif tag == "fpage":
                self.fpage = meta_child.text
                self.page_type = get_normalized_attrib(meta_child, "content-type") or ""
            elif tag == "lpage":
                self.lpage = meta_child.text
            elif tag == "page-range":
                self.page_range = meta_child.text
            else:
                # Automatic dispatch to parse_<tag>
                handler = getattr(self, "parse_" + tag.replace("-", "_"), None)
                if callable(handler):
                    handler(meta_child)
                else:
                    origin = f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name}"
                    self.warnings.append({self.pid: origin + " " + tag})

    def parse_body(self, node, **kwargs):
        """
        Parse <body>: each nested <book-part> becomes a BitsBookPart appended to
        self.parts (its warnings are merged in self.warnings); any other child
        is reported in self.warnings. The text of the node is stored in self.body.
        """
        for body_child in node:
            tag = normalize(body_child.tag)

            if tag != "book-part":
                origin = f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name}"
                self.warnings.append({self.pid: origin + " " + tag})
                continue

            nested_part = BitsBookPart(tree=body_child, no_bib=self.no_bib)
            self.warnings.extend(nested_part.warnings)
            self.parts.append(nested_part)

        self.body = get_text_from_node(node)

    def set_title(self):
        """
        Bug in some books: some chapters may have a trans-title, but no title !
        Hack: copy the trans-title (html and tex) into the title in that case.
        :return:
        """
        if not self.trans_title_html or self.title_html:
            return

        self.title_html = self.trans_title_html
        self.title_tex = self.trans_title_tex
3561######################################################################################
3562#
3563# Functions used by ptf-tools
3564#
3565######################################################################################
def update_bibitem_xml(bibitem, new_ids):
    """
    Rebuild a reference with a new set of external ids.

    The <ext-link> and <pub-id> of the citation whose type is one of the
    replaceable ids are removed, then a node is appended for each entry of
    new_ids that is checked and is not a false positive. The resulting tree is
    parsed again.

    :param bibitem: object whose citation_xml holds the children of <ref>
    :param new_ids: dict id_type -> dict with "checked", "false_positive", "id_value"
    :return: a new JatsRef built from the updated XML
    """
    removable_ext_link_types = (
        "zbl-item-id",
        "mr-item-id",
        "doi",
        "numdam-id",
        "mathdoc-id",
        "eid",
    )
    removable_pub_id_types = ("zbl-item-id", "mr-item-id", "doi", "numdam-id", "mathdoc-id")
    # Ids written as <pub-id>; every other id is written as <ext-link>
    pub_id_types = ("doi", "arxiv", "tel", "hal", "theses.fr")

    xml_parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
    )
    ref_tree = etree.fromstring("<ref>" + bibitem.citation_xml + "</ref>", parser=xml_parser)

    citation = ref_tree.find("element-citation")
    if citation is None:
        citation = ref_tree.find("mixed-citation")

    if citation is not None:
        # Collect first, remove afterwards: the node is not modified while iterated
        outdated = [
            child
            for child in citation
            if (child.tag == "ext-link" and child.get("ext-link-type") in removable_ext_link_types)
            or (child.tag == "pub-id" and child.get("pub-id-type") in removable_pub_id_types)
        ]
        for child in outdated:
            citation.remove(child)

        for id_type, value_dict in new_ids.items():
            if not value_dict["checked"] or value_dict["false_positive"]:
                continue

            if id_type in pub_id_types:
                id_node = etree.Element("pub-id")
                id_node.set("pub-id-type", id_type)
            else:
                id_node = etree.Element("ext-link")
                id_node.set("ext-link-type", id_type)

            id_node.text = value_dict["id_value"]
            citation.append(id_node)

    # TODO Modify the call to update_bibitem_xml and pass the parent's lang
    return JatsRef(tree=ref_tree, lang="und")
def check_bibitem_xml(bibitem: RefData):
    """
    Parse again the citation_xml of a reference.

    :param bibitem: reference whose citation_xml holds the children of <ref>
    :return: a new JatsRef built from "<ref>" + citation_xml + "</ref>"
    """
    xml_parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
    )
    ref_tree = etree.fromstring(f"<ref>{bibitem.citation_xml}</ref>", parser=xml_parser)

    return JatsRef(tree=ref_tree, lang="und")
3634# Create XML strings based on internal data
def get_tex_from_xml(xml, tag, **kwargs):
    """
    Parse an XML string with JatsArticle and return the TeX of the given tag.

    For "abstract" and "title", the XML is wrapped in
    <article><front><article-meta> before being parsed.

    :param xml: XML string to parse
    :param tag: "abstract" or "title"
    :param kwargs: forwarded to JatsArticle
    :return: "abstract": the value_tex of the first abstract ("" if none was parsed);
             "title": the tuple (title_tex, trans_title_tex);
             any other tag: ""
    """
    parser_ = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
    )
    etree.register_namespace("mml", "http://www.w3.org/1998/Math/MathML")
    text = xml

    if tag in ("abstract", "title"):
        text = f"<article><front><article-meta>{text}</article-meta></front></article>"

    tree = etree.fromstring(text.encode("utf-8"), parser=parser_)
    xarticle = JatsArticle(tree=tree, **kwargs)

    if tag == "abstract":
        # No abstract parsed: return the same default as for an unknown tag
        # instead of failing with an IndexError
        return xarticle.abstracts[0]["value_tex"] if xarticle.abstracts else ""
    if tag == "title":
        return xarticle.title_tex, xarticle.trans_title_tex

    return ""