Coverage for src/ptf/cmds/xml/jats/xmldata.py: 15%
1850 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-05 09:56 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-05 09:56 +0000
1import os
2import re
3import unicodedata
5from lxml import etree
7from django.conf import settings
8from django.utils import timezone
9from django.utils.translation import gettext_lazy as _
11from ptf.cmds.xml.xml_utils import escape
12from ptf.cmds.xml.xml_utils import normalize
13from ptf.cmds.xml.xml_utils import remove_namespace
def get_attribute_value(node, fullname, basename=None, name=None):
    """Return the attribute ``fullname`` of ``node``, or ``""``.

    The attribute is only read when ``basename`` equals ``name`` (both default
    to ``None``, so the lookup happens by default). A missing attribute yields
    an empty string instead of raising ``KeyError``.
    """
    if basename != name:
        return ""
    try:
        return node.attrib[fullname]
    except KeyError:
        return ""
def get_lang_attrib(node):
    """Return the value of the attribute whose normalized name is ``lang``.

    Returns ``"und"`` when ``node`` is ``None`` or carries no such attribute.
    If several attributes normalize to ``lang``, the last one wins.
    """
    if node is None:
        return "und"

    found = "und"
    for key, value in node.attrib.items():
        if normalize(key) == "lang":
            found = value
    return found
def get_href_attrib(node):
    """Return the value of the attribute whose normalized name is ``href``.

    Returns ``None`` when ``node`` is ``None`` or carries no such attribute.
    If several attributes normalize to ``href``, the last one wins.
    """
    if node is None:
        return None

    found = None
    for key, value in node.attrib.items():
        if normalize(key) == "href":
            found = value
    return found
def innerxml(node):
    """Serialize the content of ``node`` without its enclosing tag.

    The leading text is passed through ``escape`` and every child is
    serialized with ``etree.tostring``. The joined result is stripped of
    surrounding whitespace and returned as UTF-8 encoded ``bytes``.
    """
    parts = [escape(node.text)] if node.text else []
    # Iterate the element itself: ``getchildren()`` is deprecated in lxml.
    parts.extend(etree.tostring(child, encoding="unicode") for child in node)
    return "".join(parts).strip().encode("utf-8")
def get_node_text(node):
    """Return the text content of ``node`` (markup removed, tail excluded).

    Returns an empty string when ``node`` is ``None``.
    """
    if node is None:
        return ""
    return etree.tostring(
        node, encoding="unicode", method="text", xml_declaration=False, with_tail=False
    )
68##########################################################################
69#
70# get_mixed_content: recreate the xml string from a node
71#
72# Used to export data (OAI)
73#
74##########################################################################
def get_mixed_content(node):
    """Return ``node`` serialized back to an XML string (tail excluded).

    Returns an empty string when ``node`` is ``None``.
    """
    if node is None:
        return ""
    return etree.tostring(
        node, encoding="unicode", method="xml", xml_declaration=False, with_tail=False
    )
86##########################################################################
87#
88# get_tex: get the tex version of a node with mixed-content
89#
90# Strip the mathml alternative of formula
91#
92# Used to prepare the HTML pages. A Django template can simply display title_tex
93#
94##########################################################################
def get_tex(node, is_top=True, is_citation=False):
    """Return the TeX flavoured text of a mixed-content node.

    ``math`` elements are skipped. ``element-citation`` nodes are delegated to
    ``get_element_citation_str``. When ``is_citation`` is true, author and
    title elements are wrapped in ``<span>`` tags.

    :param node: element to convert; ``None`` yields ``""``
    :param is_top: when true, the tail of ``node`` is not appended
    :param is_citation: whether ``node`` itself is to be treated as citation content
    :return: the assembled string
    """
    if node is None:
        return ""

    tag = normalize(node.tag)
    text = ""

    if tag == "element-citation":
        text += get_element_citation_str(node, is_top)
    elif tag != "math":
        wrap_author = False
        wrap_title = False
        # NOTE(review): children are converted with the default
        # is_citation=False, so these flags only apply to the node that was
        # passed in directly - confirm whether that is intended.
        if tag != "mixed-citation" and is_citation:
            if tag == "string-name":
                wrap_author = True
            elif tag in ("article-title", "chapter-title", "italic"):
                wrap_title = True

        text += node.text or ""
        text += "".join(get_tex(child, False) for child in node)

        if wrap_title:
            text = '<span class="citation-title">' + text + "</span>"
        elif wrap_author:
            text = '<span class="citation-author">' + text.title() + "</span>"

    if node.tail and not is_top:
        text += node.tail

    return text
def make_links_clickable(href, string):
    """Wrap ``string`` in an HTML anchor when ``href`` looks like a link.

    * ``href`` starting with ``http`` -> anchor opening in a new tab
    * ``href`` starting with ``/``    -> plain (site-relative) anchor
    * anything else, including ``None`` or ``""`` -> ``string`` unchanged

    :param href: candidate URL (may be ``None``)
    :param string: text to display
    :return: the anchor markup, or ``string`` itself
    """
    # A missing href used to raise TypeError inside re.match.
    if not href:
        return string
    # The former regex r"http+" was only a prefix test.
    if href.startswith("http"):
        return f'<a href="{href}" target="_blank">{string}</a>'
    if href.startswith("/"):
        return f'<a href="{href}">{string}</a>'
    return string
142##########################################################################
143#
144# get_html_mixed_content_with_figures: get the mathml version of a node with mixed-content
145#
146# Strip the tex alternative of formula, add the tex version to the tooltip (HTML <title> tag)
147#
148# Used to prepare the HTML pages. A Django template can simply display the_html
149#
150# TODO: Use a dict to pass the params
151#
152##########################################################################
# Elements whose <label>/<title> children are extracted into a heading/label.
_LABELLED_TAGS = ("sec", "statement", "fig", "list-item", "table-wrap")
# Elements rendered as the document title inside a citation.
_CITATION_TITLE_TAGS = ("article-title", "chapter-title", "italic")
# Table elements copied through as HTML tags of the same name.
_TABLE_TAGS = ("table", "th", "thead", "tr", "td")
# list-type attribute value -> HTML <ol type="..."> value.
_ORDERED_LIST_TYPES = {
    "order": "1",
    "alpha-lower": "a",
    "alpha-upper": "A",
    "roman-lower": "i",
    "roman-upper": "I",
}


def get_html_mixed_content_with_figures(
    node,
    is_top=True,
    is_citation=False,
    is_comment=False,
    is_figure=False,
    prefix="",
    suffix="",
    sec_level=2,
    label_title="",
    figures=None,
    base_url="",
):
    """Convert a mixed-content node to an HTML string, collecting figures.

    The function recurses over the children of ``node``. ``<label>``,
    ``<title>`` and ``<caption>`` children of the handled elements are removed
    from ``node`` while it is processed, so the input tree is modified.

    :param node: element to convert; ``None`` yields ``("", figures)``
    :param is_top: when true, the tail of ``node`` is not appended
    :param is_citation: the node is inside a citation (or a ``toc``)
    :param is_comment: the node is inside a ``comment`` element
    :param is_figure: the node is inside a ``fig`` element
    :param prefix: string added before the content of ``node``
    :param suffix: string added after the content (and tail) of ``node``
    :param sec_level: heading level used for ``sec``/``statement`` titles
    :param label_title: label inherited from an enclosing ``list-item``,
        prepended to ``p`` elements
    :param figures: list receiving one dict per ``graphic`` found inside a
        figure; created on demand when ``None``
    :param base_url: base used to build the ``src`` of generated ``<img>`` tags
    :return: tuple ``(html, figures)``
    """
    text = ""
    if node is None:
        return text, figures

    is_citation_author = False
    is_citation_title = False
    is_citation_volume = False
    normalized_tag = normalize(node.tag)

    # Specific case for element-citation: the order of its children might not
    # be the order of display.
    if normalized_tag == "element-citation":
        text = get_element_citation_str(node, is_top)
    # pub-id/object-id are ignored by default as they are treated separately.
    # Inside citations or comments, ext-links are converted to HTML links.
    elif is_comment or normalized_tag not in ("pub-id", "object-id"):
        if normalized_tag in ("mixed-citation", "toc"):
            is_citation = True
        elif normalized_tag == "comment":
            is_comment = True
        elif is_citation and normalized_tag == "string-name":
            is_citation_author = True
        elif is_citation and normalized_tag in _CITATION_TITLE_TAGS:
            is_citation_title = True
        elif is_citation and normalized_tag == "volume":
            is_citation_volume = True

        text += prefix

        if is_citation and normalized_tag == "ext-link":
            if node.get("ext-link-type") is None:
                href = get_href_attrib(node) or node.text
                # href is still None when the link has neither href nor text.
                if href and "www.numdam.org" not in href:
                    text += make_links_clickable(href, node.text) or ""
        elif is_citation and normalized_tag == "uri":
            href = get_href_attrib(node) or node.text
            if href:
                text += make_links_clickable(href, node.text) or ""
        elif is_comment and node.text:
            # NOTE(review): re.match anchors at the start only, so any text
            # that merely *begins* with a space/newline is dropped - confirm.
            if not re.match(r"[\n ]+", node.text):
                text += make_links_clickable(node.text, node.text)
        elif node.text:
            text += node.text

        label = ""
        if normalized_tag in _LABELLED_TAGS:
            child = node.find("label")
            if child is not None:
                # An empty <label/> has text None.
                label += child.text or ""
                node.remove(child)
            child = node.find("title")
            if child is not None:
                if label:
                    label += " "
                label += child.text or ""
                node.remove(child)

        if normalized_tag in ("sec", "statement"):
            text = f"<h{sec_level}>{label}</h{sec_level}>"
            sec_level += 1

        if normalized_tag == "table-wrap":
            text = "<strong>" + label + "</strong>"

        if normalized_tag == "fig":
            is_figure = True
            child = node.find("caption")
            if child is not None:
                child_text, figures = get_html_mixed_content_with_figures(
                    child,
                    False,
                    is_citation,
                    is_comment,
                    is_figure,
                    "",
                    "",
                    sec_level,
                    "",
                    figures,
                    base_url,
                )
                label += " : " + child_text
                node.remove(child)

        if normalized_tag == "list-item":
            label_title = label

        if normalized_tag == "p" and label_title:
            text = label_title + " " + text
            label_title = ""

        if normalized_tag in ("inline-formula", "disp-formula"):
            for child in node:
                if child.tag == "alternatives":
                    math_text = ""
                    tex_text = ""
                    for great_child in child:
                        # Use a separate test here: overwriting normalized_tag
                        # would corrupt the element dispatch further down.
                        if normalize(great_child.tag) == "math":
                            math_text = get_mixed_content(great_child)
                        else:
                            tex_text = get_tex(great_child)
                    text += '<span title="' + tex_text + '">' + math_text + "</span>"
        else:
            for child in node:
                child_text, figures = get_html_mixed_content_with_figures(
                    child,
                    False,
                    is_citation,
                    is_comment,
                    is_figure,
                    "",
                    "",
                    sec_level,
                    label_title,
                    figures,
                    base_url,
                )
                text += child_text

        if is_citation_title:
            text = '<span class="citation-document-title">' + text + "</span>"
        elif is_citation_author:
            text = '<span class="citation-author">' + text.title() + "</span>"
        elif is_citation_volume:
            text = '<span class="citation-volume">' + text + "</span>"
        elif normalized_tag == "list":
            list_type = node.get("list-type")
            if list_type is None or list_type == "bullet":
                text = "<ul>" + text + "</ul>"
            elif list_type in _ORDERED_LIST_TYPES:
                text = '<ol type="' + _ORDERED_LIST_TYPES[list_type] + '">' + text + "</ol>"
            else:
                text = '<ul class="no-bullet" style="list-style-type:none;">' + text + "</ul>"
        elif normalized_tag == "list-item":
            text = "<li>" + text + "</li>"
        elif normalized_tag in ("strong", "bold"):
            text = "<strong>" + text + "</strong>"
        elif normalized_tag == "italic":
            text = '<span class="italique">' + text + "</span>"
        elif normalized_tag == "p":
            specific_use = node.get("specific-use")
            if specific_use:
                text = '<p class="' + specific_use + '">' + text + "</p>"
            else:
                text = "<p>" + text + "</p>"
        elif normalized_tag == "caption" and not is_figure:
            text = '<div class="caption">' + text + "</div>"
        elif normalized_tag in ("sec", "statement"):
            text = "<section>" + text + "</section>"
        elif normalized_tag == "fig":
            fig_id = node.get("id")
            tag = '<figure id="' + fig_id + '">' if fig_id else "<figure>"
            text = tag + text
            if label:
                text += "<figcaption>" + label + "</figcaption>"
            text += "</figure>"
        elif normalized_tag in ("sub", "sup"):
            text = "<" + normalized_tag + ">" + text + "</" + normalized_tag + ">"
        elif normalized_tag == "xref":
            rid = node.get("rid")
            if rid:
                text = '<a href="#' + rid + '">' + text + "</a>"
        elif normalized_tag == "graphic" and is_figure:
            href = ""
            for attrib in node.attrib:
                # Only assign on a match: resetting href for every other
                # attribute lost the value whenever href was not the last one.
                if normalize(attrib) == "href":
                    href = node.attrib[attrib]

            if href:
                basename = os.path.basename(href)
                is_png = basename.split(".")[-1] == "png"
                location = "src/tex/figures/" + basename
                figure = {
                    "rel": "image",
                    "mimetype": "image/png" if is_png else "image/jpeg",
                    "location": location,
                    "base": None,
                    "text": node.text if node.text is not None else "",
                }

                location = os.path.join(base_url, "png" if is_png else "jpg", location)
                text = '<img src="' + location + '" class="article-body-img" />'

                # Callers such as get_html_mixed_content pass figures=None.
                if figures is None:
                    figures = []
                figures.append(figure)
        elif normalized_tag in _TABLE_TAGS:
            tag = "<" + normalized_tag
            if "rowspan" in node.attrib:
                tag += ' rowspan="' + node.attrib["rowspan"] + '"'
            text = tag + ">" + text + "</" + normalized_tag + ">"
        elif normalized_tag == "table-wrap":
            tag = '<div class="table-wrap"'
            wrap_id = node.get("id")
            if wrap_id:
                tag += ' id="' + wrap_id + '"'
            text = tag + ">" + text + "</div>"

    if node.tail and not is_top:
        text += node.tail

    text += suffix

    return text, figures
def get_html_mixed_content(
    node,
    is_top=True,
    is_citation=False,
    is_comment=False,
    prefix="",
    suffix="",
    sec_level=2,
    label="",
):
    """Return only the HTML string of ``get_html_mixed_content_with_figures``.

    The call is made with ``is_figure=False`` and ``figures=None``; the figure
    list returned by the underlying function is discarded.
    """
    html, _figures = get_html_mixed_content_with_figures(
        node,
        is_top=is_top,
        is_citation=is_citation,
        is_comment=is_comment,
        is_figure=False,
        prefix=prefix,
        suffix=suffix,
        sec_level=sec_level,
        label_title=label,
        figures=None,
    )
    return html
452##########################################################################
453#
454# get_element_citation_str: get the mixed content of an element-citation node
455#
456# An element-citation node is specific as the order of its children might not be
457# the correct order for display
458#
459# Used to prepare the HTML pages. A Django template can simply display title_html
460#
461##########################################################################
def get_element_citation_str(node, is_top=False, is_html=True):
    """Build the display string of an ``element-citation`` node.

    The children of an element-citation are not necessarily stored in display
    order, so the pieces are fetched one by one: authors, article/chapter
    title, source (with editors for books), series, volume, publisher data,
    year, issue, eid, pages, plain ext-links and comment. The separators
    depend on the ``REF_JEP_STYLE`` setting and on the ``publication-type``
    attribute.

    :param node: the element-citation element; ``None`` yields ``""``
    :param is_top: accepted for interface compatibility; not used here
    :param is_html: when false, the titles are produced with ``get_tex``
    :return: the assembled citation string
    """
    if node is None:
        return ""

    jep_style = getattr(settings, "REF_JEP_STYLE", False)
    pub_type = node.get("publication-type")
    is_book_like = pub_type in ("book", "incollection")
    jep_incollection = jep_style and pub_type == "incollection"
    jep_book = jep_style and pub_type == "book"

    # --- authors ---------------------------------------------------------
    name_str = get_author_str(node)
    text = name_str

    # --- article / chapter title -------------------------------------------
    document_title = ""
    if is_html:
        prefix, suffix = (" - “", "”") if jep_style else (" ", "")
        document_title += get_html_mixed_content(
            node.find("article-title"), True, True, False, prefix, suffix
        )
        if jep_incollection:
            document_title += get_html_mixed_content(
                node.find("chapter-title"), True, True, False, prefix, suffix
            )
        else:
            document_title += get_html_mixed_content(
                node.find("chapter-title"), True, True, False, " "
            )
    else:
        document_title += " " + get_tex(node.find("article-title"))
        document_title += " " + get_tex(node.find("chapter-title"))
    text += document_title

    # --- source -------------------------------------------------------------
    suffix = "</span>"
    if document_title:
        if jep_incollection:
            prefix = ', in <span class="citation-publication-title">'
        else:
            prefix = ', <span class="citation-publication-title">'
    else:
        # Without a document title, the source plays the role of the title.
        prefix = ""
        if name_str:
            prefix = " - " if jep_style else " "
        if pub_type in ("unpublished", "misc"):
            prefix += "“"
            suffix += "”"
        prefix += '<span class="citation-document-title">'

    source = get_html_mixed_content(node.find("source"), True, True, False, prefix, suffix)
    if jep_book:
        source = f"<i>{source}</i>"
    if is_book_like:
        source += get_editor_str(node.find("person-group"))
    text += source

    # --- series -------------------------------------------------------------
    if document_title:
        if jep_incollection:
            # The ", " prefix opens nothing, so there must be nothing to
            # close; the suffix was previously a left-over from an earlier
            # step and produced unbalanced output.
            prefix, suffix = ", ", ""
        else:
            prefix, suffix = " (", ")"
    else:
        if jep_book:
            prefix = ', <span class="citation-publication-title-book">'
        else:
            prefix = ', <span class="citation-publication-title">'
        suffix = "</span>"

    serie = get_html_mixed_content(node.find("series"), True, True, False, prefix, suffix)
    text += serie

    # --- volume -------------------------------------------------------------
    if jep_style:
        prefix = ", vol. " if is_book_like else " "
    else:
        if document_title:
            prefix = " " if serie else ", "
        else:
            prefix = ", " if serie else " "
        prefix += str(_("Tome")) + " "
    text += get_html_mixed_content(node.find("volume"), True, True, False, prefix)

    # --- publisher / year / issue --------------------------------------------
    if is_book_like:
        text = text.replace("citation-volume", "citation-volume-incollection")
        text += get_html_mixed_content(node.find("publisher-name"), True, True, False, ", ")
        text += get_html_mixed_content(node.find("publisher-loc"), True, True, False, ", ")
        text += get_html_mixed_content(node.find("institution"), True, True, False, ", ")
        prefix, suffix = ", ", ""
    elif pub_type == "misc":
        prefix, suffix = ", ", ""
    else:
        prefix, suffix = " (", ")"
    text += get_html_mixed_content(node.find("year"), True, True, False, prefix, suffix)
    text += get_html_mixed_content(node.find("issue"), True, True, False, " no. ")

    # --- electronic ids -------------------------------------------------------
    for child in node.findall("pub-id"):
        if child.get("pub-id-type") == "eid":
            # An empty element has text None.
            text += ", " + (child.text or "")

    for child in node.findall("ext-link"):
        if child.get("ext-link-type") == "eid":
            eid_prefix = ", article ID " if jep_style else ", "
            text += eid_prefix + (child.text or "")

    # --- pages ------------------------------------------------------------------
    if not jep_book:
        text += get_pages_str(node)

    # --- untyped external links ---------------------------------------------------
    for child in node.findall("ext-link"):
        if child.get("ext-link-type") is not None:
            continue
        href = get_href_attrib(child) or child.text
        # bibitems with ext-links pointing to numdam.org have a numdam-id and
        # ext-links to doi.org are transformed into an extid: both are ignored.
        if (
            href
            and "www.numdam.org" not in href
            and "doi.org" not in href
            and not jep_style
        ):
            text += " " + (make_links_clickable(href, child.text) or "")

    # --- comment --------------------------------------------------------------------
    if jep_style:
        text += get_html_mixed_content(node.find("comment"), True, True, True, ", ")
    else:
        text += get_html_mixed_content(node.find("comment"), True, True, True, " (", ")")

    return text
def _append_name(text, string_name, is_last, jep_style):
    """Append ``string_name`` to ``text`` with the separator of the active style."""
    if text:
        if jep_style:
            text += " & " if is_last else ", "
        else:
            text += "; "
    return text + string_name


def get_name_str(node):
    """Return the display string of all persons found directly under ``node``.

    ``<name>`` children are handled first, then ``<string-name>`` children
    (converted with ``get_tex``). Names are separated by ``"; "``, or by
    ``", "`` / ``" & "`` (before the last name of each group) when the
    ``REF_JEP_STYLE`` setting is on. With ``REF_JEP_STYLE`` the ``initials``
    attribute of ``<given-names>`` replaces its text. The
    ``DISPLAY_FIRST_NAME_FIRST`` setting selects "First Last" instead of
    "Last, First".

    Empty elements (text ``None``) are treated as empty strings; the input
    tree is left untouched.

    :param node: parent element of the names; ``None`` yields ``""``
    :return: the joined names
    """
    if node is None:
        return ""

    jep_style = getattr(settings, "REF_JEP_STYLE", False)
    first_name_first = getattr(settings, "DISPLAY_FIRST_NAME_FIRST", False)
    text = ""

    names = node.findall("name")
    for position, name_node in enumerate(names, start=1):
        first_name = last_name = prefix = suffix = ""

        for child in name_node:
            if child.tag == "given-names":
                if jep_style:
                    first_name += child.get("initials", "")
                else:
                    first_name += child.text or ""
            elif child.tag == "surname":
                last_name += child.text or ""
            elif child.tag == "prefix":
                prefix += child.text or ""
            elif child.tag == "suffix":
                suffix += child.text or ""

        string_name = prefix + " " if prefix else ""
        if first_name_first:
            if first_name:
                string_name += first_name + " "
            string_name += last_name
        else:
            string_name += last_name
            if first_name:
                string_name += ", " + first_name
        if suffix:
            string_name += " " + suffix

        text = _append_name(text, string_name, position == len(names), jep_style)

    string_names = node.findall("string-name")
    for position, name_node in enumerate(string_names, start=1):
        text = _append_name(
            text, get_tex(name_node), position == len(string_names), jep_style
        )

    return text
def get_author_str(node):
    """Return the names found under ``node`` wrapped in a citation-author span.

    The span is emitted even when no name is found.
    """
    return '<span class="citation-author">' + get_name_str(node) + "</span>"
def get_editor_str(node):
    """Return the editors found under ``node`` as `` (names, ed.)``.

    ``eds.`` is used when the name string contains a ``;`` or ``&`` separator
    (``&`` is the separator used in JEP style). Returns ``""`` when there is
    no editor.
    """
    editors = get_name_str(node)
    if editors:
        several = any(separator in editors for separator in "&;")
        label = "eds." if several else "ed."
        return " (" + editors + ", " + label + ")"
    return ""
def _page_as_int(page_text):
    """Return ``page_text`` as an int, or 0 when it is not a plain integer."""
    try:
        return int(page_text)
    except ValueError:
        # Only a failed conversion is expected here; anything else (e.g.
        # KeyboardInterrupt) must propagate.
        return 0


def get_pages_str(node):
    """Return the page information of a citation node, with its separator.

    The first available source wins, in this order: ``page-count``, ``size``,
    ``fpage``/``lpage``, ``page-range``. Returns ``""`` when none is present.

    :param node: citation element holding the page elements
    :return: string such as ``", 12 pages"`` or ``", pp. 3-17"``
    """
    jep_style = getattr(settings, "REF_JEP_STYLE", False)

    for tag in ("page-count", "size"):
        child = node.find(tag)
        if child is not None:
            text = get_html_mixed_content(child, True, True, False, ", ", " pages")
            if text:
                return text

    first_page_child = node.find("fpage")
    if first_page_child is not None:
        fpage_text = get_html_mixed_content(first_page_child, True, True, False)
        fpage_int = _page_as_int(fpage_text)

        lpage_text = ""
        lpage_int = 0
        last_page_child = node.find("lpage")
        if last_page_child is not None:
            lpage_text = get_html_mixed_content(last_page_child, True, True, False)
            lpage_int = _page_as_int(lpage_text)

        if lpage_int > 0 and lpage_int - fpage_int > 1 and not jep_style:
            text = ", pp. "
        else:
            text = ", p. "
        text += fpage_text
        if lpage_text:
            text += "-" + lpage_text
        return text

    child = node.find("page-range")
    if child is not None:
        prefix = ", p. " if jep_style else ", pp. "
        return get_html_mixed_content(child, True, True, False, prefix, "")

    return ""
766##########################################################################
767#
768# Parse a name node ("name", "string-name", or "name-alternatives") and find the fields related to a person name:
769# first_name <given-names>
770# last_name <surname>
771# prefix <prefix>
772# suffix <suffix>
773# string_name <string_name> or built with "<prefix> <last_name>, <first_name>, <suffix>"
774# reference_name <string_name specific-use="index"> or string_name
775# Used in Solr for facets (regroup multiple orthographies under the same person)
776#
777# Note: parse_name and get_name_str can not be merged...today
778# string-names in mixed-citation mix structured data (ex: "surname") and non structured content.
779# Ex: <surname>ROBERTSON</surname>, <given-names>D. H.</given-names></string-name>
780# Notice the ", " inside.
781# get_name_str is used for web pages and needs to preserve everything (the ", " in particular)
782# parse_name is used to export bibtex: only structured data are preserved.
783# TODO: discuss this workflow. Why add or preserve the mix content of a string-name ?
784#
785# TODO: merge parse_name and parse_contrib
786# 1. <contrib> can have multiple entries (ex: <name> then <string-name specific-use="index") for 1 single person,
787# whereas <mixed-citation> or <element-citation> use 1 entry per person.
788# 2. string-name in a contrib is simple text, whereas string-name in a mixed-citation is a tree
789#
790##########################################################################
def get_name_params(first_name, last_name, prefix, suffix, string_name="", reference_name=""):
    """Complete the fields describing a person name and return them as a dict.

    * ``reference_name`` defaults to ``string_name`` when the latter is given.
    * ``string_name`` is built as ``"<prefix> <last_name>, <first_name> <suffix>"``
      when only structured fields are given.
    * Conversely, a ``string_name`` containing a comma is split into
      ``last_name`` / ``first_name`` (the pieces are not stripped).
    * A still missing ``reference_name`` is built from the last and first
      names, ordered according to the ``DISPLAY_FIRST_NAME_FIRST`` setting.

    :return: dict with the keys ``first_name``, ``last_name``, ``prefix``,
        ``suffix``, ``string_name`` and ``reference_name``
    """
    if string_name and not reference_name:
        reference_name = string_name

    if last_name and not string_name:
        head = prefix + " " if prefix else string_name
        string_name = head + last_name
        if first_name:
            string_name = string_name + ", " + first_name
        if suffix:
            string_name = string_name + " " + suffix
    elif string_name and not last_name:
        pieces = string_name.split(",")
        if len(pieces) > 1:
            last_name, first_name = pieces[0], pieces[1]

    if last_name and not reference_name:
        if getattr(settings, "DISPLAY_FIRST_NAME_FIRST", False):
            reference_name = (first_name + " " if first_name else "") + last_name
        else:
            reference_name = last_name + (", " + first_name if first_name else "")

    return dict(
        first_name=first_name,
        last_name=last_name,
        prefix=prefix,
        suffix=suffix,
        string_name=string_name,
        reference_name=reference_name,
    )
def parse_name(node):
    """Extract the person-name fields of a name node.

    Handles ``name``, ``string-name`` and ``name-alternatives`` elements:

    * ``name`` / ``string-name``: the text of the ``given-names``,
      ``surname``, ``prefix`` and ``suffix`` children is collected;
    * ``string-name`` without given name and surname: the text of the node
      itself becomes ``string_name``;
    * ``name-alternatives``: the text of ``string-name`` children with
      ``specific-use="index"`` becomes ``reference_name``.

    Empty elements (text ``None``) count as empty strings.

    :param node: element to parse; ``None`` yields empty fields
    :return: the dict built by ``get_name_params``
    """
    first_name = last_name = prefix = suffix = string_name = reference_name = ""

    if node is not None:
        if node.tag in ("name", "string-name"):
            for child in node:
                value = child.text or ""
                if child.tag == "given-names":
                    first_name += value
                elif child.tag == "surname":
                    last_name += value
                elif child.tag == "prefix":
                    prefix += value
                elif child.tag == "suffix":
                    suffix += value

            if node.tag == "string-name" and not first_name and not last_name:
                string_name = node.text or ""
        elif node.tag == "name-alternatives":
            for child in node:
                if child.tag == "string-name" and child.get("specific-use") == "index":
                    reference_name += child.text or ""

    return get_name_params(first_name, last_name, prefix, suffix, string_name, reference_name)
878##########################################################################
879#
880# Parse a Contrib node and find the fields related to a person name:
881# first_name <given-names>
882# last_name <surname>
883# prefix <prefix>
884# suffix <suffix>
885# string_name <string_name> or built with "<prefix> <last_name>, <first_name>, <suffix>"
886# reference_name <string_name specific-use="index"> or string_name
887# Used in Solr for facets (regroup multiple orthographies under the same person)
888#
889##########################################################################
def parse_contrib(node):
    """Extract the person-name fields of a ``contrib`` node.

    The children of ``node`` are scanned:

    * ``name``: the text of its ``given-names``, ``surname``, ``prefix`` and
      ``suffix`` children is collected;
    * ``string-name``: its text becomes ``string_name``;
    * ``name-alternatives``: the text of its ``string-name`` children with
      ``specific-use="index"`` becomes ``reference_name``.

    Elements without text are ignored. Missing fields are then completed by
    ``get_name_params`` (the same post-processing as ``parse_name``).

    :param node: element to parse; ``None`` yields empty fields
    :return: dict with the keys ``first_name``, ``last_name``, ``prefix``,
        ``suffix``, ``string_name`` and ``reference_name``
    """
    first_name = last_name = prefix = suffix = string_name = reference_name = ""

    if node is not None:
        for child in node:
            if child.tag == "name":
                for great_child in child:
                    if great_child.text is None:
                        continue
                    if great_child.tag == "given-names":
                        first_name += great_child.text
                    elif great_child.tag == "surname":
                        last_name += great_child.text
                    elif great_child.tag == "prefix":
                        prefix += great_child.text
                    elif great_child.tag == "suffix":
                        suffix += great_child.text
            elif child.tag == "string-name":
                if child.text is not None:
                    string_name += child.text
            elif child.tag == "name-alternatives":
                for great_child in child:
                    if (
                        great_child.text is not None
                        and great_child.tag == "string-name"
                        and great_child.get("specific-use") == "index"
                    ):
                        reference_name += great_child.text

    # Shared with parse_name, instead of a second copy of the same rules.
    return get_name_params(first_name, last_name, prefix, suffix, string_name, reference_name)
def make_int(value):
    """Return the integer found in the part of ``value`` before the first ``-``.

    ``"12-15"`` gives ``12``. When that part is not a plain integer, its
    decimal digits are kept and the rest is dropped (``"S12"`` gives ``12``).

    :param value: string such as a page number or a page range
    :return: the extracted integer
    :raises ValueError: if the part before the first ``-`` has no digit
    """
    head = value.split("-")[0]
    try:
        return int(head)
    except ValueError:
        # The digits must be joined back into a string: int() does not accept
        # a list, so the former fallback always raised TypeError.
        digits = "".join(char for char in head if char.isdecimal())
        return int(digits)
def uni2ascii(s):
    """Return ``str(s)`` reduced to ASCII, as ``bytes``.

    The text is NFKD-decomposed first, then every non-ASCII code point is
    dropped.
    """
    decomposed = unicodedata.normalize("NFKD", str(s))
    return decomposed.encode("ascii", "ignore")
# Module-wide id types; None until the matching setter below is called.
sid_type = None
pid_type = None


def set_sid_type(id_type):
    """Store ``id_type`` in the module-level ``sid_type`` variable."""
    global sid_type
    sid_type = id_type


def set_pid_type(id_type):
    """Store ``id_type`` in the module-level ``pid_type`` variable."""
    global pid_type
    pid_type = id_type
993class XmlData:
994 ids_xpath = None
995 id_type_attr = "pub-id-type"
997 extids_xpath = None
998 extid_type_attr = None
999 title_group_elt_path = None
1000 title_path = None
1001 trans_title_group_elt_path = None
1002 trans_title_path = None
1003 alternate_title_path = None
1004 alternate_title_group_elt_path = None
1005 meta_root_xpath = ""
1006 custom_meta_path = "custom-meta-group"
1007 counts_path = "counts"
1008 remove_links = False
1010 def __init__(self, tree):
1011 self.tree = tree
1012 if self.meta_root_xpath:
1013 self.meta_root = tree.find(self.meta_root_xpath)
1014 else:
1015 self.meta_root = None
1017 def __getattr__(self, name):
1018 mname = "get_" + name if "self" not in name else name
1019 getter = getattr(self, mname)
1020 obj = getter()
1021 setattr(self, name, obj)
1022 return obj
1024 def get_doi(self):
1025 return None
1027 def xpath(self, xpath):
1028 return self.tree.xpath(xpath)
1030 def xget_subtree(self, xpath):
1031 subtree = self.tree.xpath(xpath)
1032 if subtree:
1033 return subtree[0]
1034 return None
1036 def xget_subtrees(self, xpath):
1037 return self.tree.xpath(xpath)
1039 def get_subtree(self, path):
1040 return self.tree.find(path)
1042 def get_subtrees(self, path):
1043 return self.tree.findall(path)
1045 def get_node_text(self, path, return_none=""):
1046 node = self.tree.find(path)
1047 if node is None:
1048 return return_none
1049 if node.text is None:
1050 return return_none
1051 xml_text = etree.tostring(
1052 node, encoding="unicode", method="text", xml_declaration=False, with_tail=False
1053 )
1054 return xml_text
1056 def get_nodes_text(self, path):
1057 return [x.text for x in self.tree.findall(path)]
1059 def get_ascii_text(self, path, return_none=""):
1060 return uni2ascii(self.get_node_text(path, return_none=return_none))
1062 def xget_node_text(self, xpath, return_none=None):
1063 try:
1064 return self.tree.xpath(xpath)[0].text
1065 except BaseException:
1066 return return_none
1068 def xget_ascii_text(self, xpath, return_none=""):
1069 return uni2ascii(self.xget_node_text(xpath, return_none=return_none))
1071 def tostring(self):
1072 self.prune()
1073 return etree.tostring(self.tree, encoding="utf-8", xml_declaration=False)
1075 __str__ = tostring
1077 def prune(self):
1078 pass
1080 def get_ids(self):
1081 if self.ids_xpath is not None:
1082 nodes = self.xget_subtrees(self.ids_xpath)
1083 return [(x.get(self.id_type_attr), x.text) for x in nodes if x.text is not None]
1084 return []
1086 def get_mathdoc_id(self):
1087 if self.mathdoc_id_xpath is not None:
1088 try:
1089 node = self.xget_subtrees(self.mathdoc_id_xpath)[0]
1090 except IndexError:
1091 return None
1092 else:
1093 return node.text
1094 return None
1096 def get_title_xml(self):
1097 title_xml = ""
1098 node = self.tree.find(self.title_group_elt_path)
1099 if node is None and self.alternate_title_group_elt_path:
1100 node = self.tree.find(self.alternate_title_group_elt_path)
1101 if node is not None:
1102 title_xml = get_mixed_content(node)
1103 return title_xml
1105 def inner_get_title_html(self, path, alternate_path=None):
1106 title_html = ""
1107 node = self.tree.find(path)
1108 if node is None and alternate_path:
1109 node = self.tree.find(alternate_path)
1110 if node is not None:
1111 title_html = get_html_mixed_content(node)
1112 return title_html
1114 def get_title_html(self):
1115 return self.inner_get_title_html(self.title_path, self.alternate_title_path)
1117 def get_trans_title_html(self):
1118 return self.inner_get_title_html(self.trans_title_path)
1120 def inner_get_title_tex(self, path, alternate_path=None):
1121 title_tex = ""
1122 node = self.tree.find(path)
1123 if node is None and alternate_path:
1124 node = self.tree.find(alternate_path)
1125 if node is not None:
1126 title_tex = get_tex(node)
1127 return title_tex
1129 def get_title_tex(self):
1130 return self.inner_get_title_tex(self.title_path, self.alternate_title_path)
1132 def get_trans_title_tex(self):
1133 return self.inner_get_title_tex(self.trans_title_path)
def get_lang(self):
    """Return the language of this node, falling back to the topmost ancestor.

    When the node itself yields "und", the parent chain is followed up to
    the element that has no parent, and that element's language is returned.
    """
    own_lang = get_lang_attrib(self.tree)
    if own_lang != "und":
        return own_lang

    topmost = self.tree.getparent()
    while topmost is not None:
        above = topmost.getparent()
        if above is None:
            break
        topmost = above

    # topmost is None when the node has no parent; get_lang_attrib then
    # yields "und" again.
    return get_lang_attrib(topmost)
def get_trans_lang(self):
    """Return the language of the translated title group.

    ``get_lang_attrib`` yields "und" when the group is missing.
    """
    group = self.tree.find(self.trans_title_group_elt_path)
    return get_lang_attrib(group)
def get_extids(self):
    """Collect ``(id_type, value)`` pairs from the nodes matched by ``extids_xpath``.

    Only the id types listed in ``known_types`` are kept. A node without a
    type whose text holds a ``doi.org/`` URL is treated as a DOI, and
    "numdam-id" is reported as "mathdoc-id". DOI values are stripped of their
    resolver URL or ``doi:`` prefix. When ``remove_links`` is set, every node
    that produced a pair is removed from its parent.

    Returns:
        A list of ``(id_type, value)`` tuples; empty when ``extids_xpath`` is
        unset.
    """
    if self.extids_xpath is None:
        return []

    known_types = (
        "mr-item-id",
        "zbl-item-id",
        "sps-id",
        "numdam-id",
        "mathdoc-id",
        "jfm-item-id",
        "eudml-item-id",
        "doi",
        "eid",
    )
    doi_prefixes = ("http://dx.doi.org/", "https://doi.org/", "doi:")

    links = []
    for n in self.xget_subtrees(self.extids_xpath):
        if n.text is None:
            # An empty element carries no identifier (get_ids skips these too);
            # calling .strip() on it used to raise AttributeError.
            continue
        id_type = n.get(self.extid_type_attr)
        value = n.text.strip()
        if id_type is None and value.find("doi.org/") > 0:
            id_type = "doi"
        if id_type not in known_types:
            continue
        if id_type == "numdam-id":
            id_type = "mathdoc-id"
        if id_type == "doi":
            # Strip every known prefix unconditionally, as BibItem does for
            # <pub-id>: a bare "doi:10.x/y" value contains no "doi.org" and was
            # previously left untouched.
            for prefix in doi_prefixes:
                value = value.replace(prefix, "")
        links.append((id_type, value))
        if self.remove_links:
            n.getparent().remove(n)
    return links
def get_xml(self, path, return_none=""):
    """Return the subtree at ``path`` serialized as UTF-8 XML.

    ``return_none`` is returned when ``get_subtree`` finds nothing.
    """
    node = self.get_subtree(path)
    if node is None:
        return return_none
    return etree.tostring(node, encoding="utf-8", xml_declaration=False)
def get_inner_xml(self, path, return_none=""):
    """Return the inner XML of the subtree at ``path``.

    ``return_none`` is returned when ``get_subtree`` finds nothing.
    """
    node = self.get_subtree(path)
    if node is None:
        return return_none
    return innerxml(node)
def xget_xml(self, path):
    """Return the subtree found by ``xget_subtree(path)`` as UTF-8 XML, or "" if none."""
    node = self.xget_subtree(path)
    if node is None:
        return ""
    return etree.tostring(node, encoding="utf-8", xml_declaration=False)
def get_catxml(self, path):
    """Concatenate the XML serialization of every node matched by ``path``.

    Nodes are serialized with ``encoding="unicode"`` so that each piece is a
    ``str``. Without an encoding ``etree.tostring`` returns bytes, and
    ``"".join`` raises ``TypeError`` on a list of bytes.

    Returns:
        The concatenated XML as ``str``; "" when nothing matches.
    """
    return "".join(
        etree.tostring(node, encoding="unicode") for node in self.get_subtrees(path)
    )
def get_streams(self):
    """Describe every ``self-uri`` child of the metadata root as a full-text stream.

    ``meta_root`` is searched when it is set, otherwise ``tree``. Attribute
    names are compared after ``normalize()``. Each stream is a dict with the
    keys ``rel``, ``mimetype``, ``location``, ``base`` and ``text``.
    """
    root = self.meta_root if self.meta_root is not None else self.tree

    streams = []
    for uri_node in root.findall("self-uri"):
        found = {"href": "", "base": "", "content-type": ""}
        for attr_name in uri_node.attrib:
            key = normalize(attr_name)
            if key in found:
                found[key] = uri_node.attrib[attr_name]

        streams.append(
            {
                "rel": "full-text",
                # a missing or empty content type defaults to HTML
                "mimetype": found["content-type"] or "text/html",
                "location": found["href"],
                "base": found["base"],
                "text": uri_node.text if uri_node.text else "Link",
            }
        )
    return streams
def get_related_objects(self):
    """Describe every ``related-object`` child of the metadata root.

    ``meta_root`` is searched when it is set, otherwise ``tree``. Each entry
    is a dict with the keys ``rel``, ``mimetype``, ``location``, ``base`` and
    ``metadata`` (the inner XML of the element).
    """
    root = self.meta_root if self.meta_root is not None else self.tree

    related = []
    for obj in root.findall("related-object"):
        found = {"link-type": "", "href": "", "base": "", "content-type": ""}
        for attr_name in obj.attrib:
            key = normalize(attr_name)
            if key in found:
                found[key] = obj.attrib[attr_name]

        related.append(
            {
                "rel": found["link-type"],
                "mimetype": found["content-type"],
                "location": found["href"],
                "base": found["base"],
                "metadata": innerxml(obj),
            }
        )
    return related
def get_supplementary_materials(self):
    """Describe every ``supplementary-material`` child of the metadata root.

    ``meta_root`` is searched when it is set, otherwise ``tree``. The location
    is the ``href`` attribute, or the ``id`` attribute when there is no
    ``href``.

    Returns:
        A list of dicts with the keys ``rel``, ``mimetype``, ``location``,
        ``base``, ``metadata`` and ``caption``.

    Raises:
        KeyError: if an element has neither an ``href`` nor an ``id`` attribute.
    """
    root = self.meta_root if self.meta_root is not None else self.tree

    materials = []
    for node in root.findall("supplementary-material"):
        try:
            location = node.attrib["href"]
        except KeyError:
            location = node.attrib["id"]

        # A missing or empty <caption> used to raise IndexError; fall back to
        # an empty caption instead.
        caption_texts = node.xpath("caption/text()")
        materials.append(
            {
                "rel": node.attrib.get("content-type"),
                "mimetype": node.attrib.get("mimetype"),
                "location": location,
                "base": "",
                "metadata": "",
                "caption": caption_texts[0] if caption_texts else "",
            }
        )
    return materials
def get_metadataparts(self):
    """Return the metadata parts; this implementation always gives a new empty list."""
    return list()
def get_custom_meta(self):
    """Return the custom metadata as a ``{name: value}`` dict.

    For each child of the node at ``custom_meta_path``, the text of its first
    sub-element is the key and the text of its second sub-element the value.
    An unset path or a missing node gives an empty dict.
    """
    if not self.custom_meta_path:
        return {}
    group = self.tree.find(self.custom_meta_path)
    if group is None:
        return {}
    return {entry[0].text: entry[1].text for entry in group}
def get_wall(self):
    """Return the "wall" custom metadata entry as an int, or 0 when it is absent."""
    try:
        raw_wall = self.custom_meta["wall"]
    except KeyError:
        return 0
    else:
        return int(raw_wall)
def get_pid(self):
    """Return the value of the first id whose type matches ``pid_type``.

    "numdam-id" and "mathdoc-id" are treated as the same type. ``None`` is
    returned when no id matches.
    """
    mathdoc_aliases = ("numdam-id", "mathdoc-id")
    for kind, ident in self.ids:
        if kind == pid_type:
            return ident
        if kind in mathdoc_aliases and pid_type in mathdoc_aliases:
            return ident
    return None
def get_provider(self):
    """Return the "provider" custom metadata entry, or ``None`` when it is absent."""
    meta = self.custom_meta
    return meta.get("provider")
def get_sid(self):
    """Return the value of the first id whose type equals ``sid_type``, else ``None``."""
    return next((ident for kind, ident in self.ids if kind == sid_type), None)
def get_counts(self):
    """Return the page count found under ``counts_path``.

    ``page-count`` is looked up first, then ``book-page-count``. The value is
    the ``count`` attribute, or the text of the whole counts node when that
    attribute is empty or missing.

    Returns:
        ``[("page-count", value)]``, or an empty list when ``counts_path`` is
        unset, the counts node is missing, or it holds no page count element.
    """
    counts = []
    if not self.counts_path:
        return counts

    node = self.tree.find(self.counts_path)
    if node is None:
        return counts

    page_count = node.find("page-count")
    if page_count is None:
        page_count = node.find("book-page-count")
    if page_count is None:
        # <counts> may hold only other counters; reading .get() on None used
        # to raise AttributeError here.
        return counts

    count = page_count.get("count")
    if not count:
        count = get_node_text(node)
    counts.append(("page-count", count))
    return counts
def get_ext_links(self):
    """Describe the ``ext-link`` children of the metadata root that are not referential ids.

    ``meta_root`` is searched when it is set, otherwise ``tree``. Links whose
    ``ext-link-type`` is one of the referential id types below are skipped.
    Each remaining link is a dict with the keys ``rel``, ``mimetype``,
    ``location``, ``base`` and ``metadata`` (the inner XML of the element).
    """
    referentials = (
        "jfm-item-id",
        "zbl-item-id",
        "mr-item-id",
        "nmid",
        "numdam-id",
        "mathdoc-id",
        "sps-id",
        "dmlid",
        "eudml-item-id",
    )
    root = self.meta_root if self.meta_root is not None else self.tree

    result = []
    for link in root.findall("ext-link"):
        found = {"ext-link-type": "", "href": "", "base": ""}
        for attr_name in link.attrib:
            key = normalize(attr_name)
            if key in found:
                found[key] = link.attrib[attr_name]

        if found["ext-link-type"] in referentials:
            continue

        result.append(
            {
                "rel": found["ext-link-type"],
                "mimetype": "",
                "location": found["href"],
                "base": found["base"],
                "metadata": innerxml(link),
            }
        )
    return result
def get_last_modified_iso_8601_date_str(self):
    """Return the ``iso-8601-date`` attribute of the node at ``last_modified_path``.

    When the path is unset or matches nothing, the current time is returned
    in ISO format instead.
    """
    node = self.tree.find(self.last_modified_path) if self.last_modified_path else None
    if node is not None:
        return node.attrib["iso-8601-date"]
    # Handles the case where the container arrives through ptf-tools: the last
    # modification date is then the import date.
    return timezone.now().isoformat()
def get_date_published_iso_8601_date_str(self):
    """Return the publication date found at ``published_path`` as a string.

    The ``iso-8601-date`` attribute is used when present. Otherwise the date
    is assembled from the ``year``, ``month`` and ``day`` children, joined
    with "-"; month and day are only appended to a non-empty date. ``None``
    is returned when the path is unset or matches nothing.
    """
    if not self.published_path:
        return None
    node = self.tree.find(self.published_path)
    if node is None:
        return None
    if "iso-8601-date" in node.attrib:
        return node.attrib["iso-8601-date"]

    parts = {}
    for tag in ("year", "month", "day"):
        sub_node = node.find(tag)
        parts[tag] = sub_node.text if sub_node is not None else ""

    date_str = parts["year"]
    for tag in ("month", "day"):
        if date_str and parts[tag]:
            date_str += "-" + parts[tag]
    return date_str
def get_prod_deployed_date_iso_8601_date_str(self):
    """Return the ``iso-8601-date`` attribute of the node at ``prod_deployed_date_path``.

    ``None`` is returned when the path is unset or matches nothing.
    """
    if not self.prod_deployed_date_path:
        return None
    node = self.tree.find(self.prod_deployed_date_path)
    if node is None:
        return None
    return node.attrib["iso-8601-date"]
class StreamGroup:
    """Streams listed under one stream group element.

    Attributes:
        use: the lower-cased ``use`` attribute of the group element.
        streams: one dict per child holding a ``link`` element, with the keys
            ``rel``, ``mimetype``, ``location``, ``seq`` and ``text``.
    """

    def __init__(self, tree):
        """Read the ``use`` attribute and the ``link`` of every child of ``tree``.

        Attribute names of each link are compared after ``normalize()``.
        Children without a ``link`` element are skipped.
        """
        self.use = tree.get("use").lower()
        streams = []
        for node in tree:
            link = node.find("link")
            if link is None:
                # Nothing to describe without a link; this used to raise
                # AttributeError on link.attrib.
                continue

            rel = href = seq = mimetype = ""
            for attrib in link.attrib:
                name = normalize(attrib)
                value = link.attrib[attrib]
                if name == "rel":
                    rel = value
                elif name == "href":
                    href = value
                elif name == "seq":
                    seq = value
                elif name == "content-type":
                    # The loop iterates the attributes of `link`, so the value
                    # is read from `link` too. It used to be read from the
                    # parent `node`, which raised KeyError unless the parent
                    # happened to carry the same attribute.
                    mimetype = value

            streams.append(
                {
                    "rel": rel,
                    "mimetype": mimetype,
                    "location": href,
                    "seq": seq,
                    "text": link.text or "",
                }
            )
        self.streams = streams
1451###
1452#
class Work(XmlData):
    """Extraction helpers shared by work-like wrappers.

    Covers abstracts, contributors, keywords, subjects, awards and the
    bibliography. Subclasses supply the element paths (``abstract_path``,
    ``trans_abstract_path``, ``contrib_path``, ``kwd_path``, ``subj_path``,
    ``funding_path``) read by these methods.
    """

    lang = "und"
    back_paths = ("back", "book-back")
    biblio_xpath = "ref-list"

    def inner_get_lang(self, node):
        """Return the language of ``node``, or ``self.lang`` when the node yields "und"."""
        the_lang = get_lang_attrib(node)
        if the_lang == "und":
            the_lang = self.lang

        return the_lang

    def inner_get_abstract(self, node, tag, attrs):
        """Append the XML, HTML and TeX renderings of ``node`` to ``attrs``.

        Nothing is appended when ``node`` is ``None``.
        """
        if node is None:
            return

        attrs.append(
            {
                "tag": tag,
                "lang": self.inner_get_lang(node),
                "value_xml": get_mixed_content(node),
                "value_html": get_html_mixed_content(node),
                "value_tex": get_tex(node),
            }
        )

    def get_abstracts(self):
        """Return the abstracts followed by the translated abstracts.

        The tag is the ``abstract-type`` attribute (default "abstract"),
        prefixed with "trans-" for translated abstracts.
        """
        attrs = []
        for node in self.tree.findall(self.abstract_path):
            tag = node.get("abstract-type") or "abstract"
            self.inner_get_abstract(node, tag, attrs)

        for node in self.tree.findall(self.trans_abstract_path):
            tag = "trans-" + (node.get("abstract-type") or "abstract")
            self.inner_get_abstract(node, tag, attrs)

        return attrs

    def get_contrib_groups(self):
        """Return the contributor groups found at ``contrib_path``.

        Contributors without any name field are dropped, and so are groups
        left with no contributor.
        """
        groups = []
        for g in self.tree.findall(self.contrib_path):
            gc = []
            for contrib in g.findall("contrib"):
                params = parse_contrib(contrib)
                params["contrib_type"] = contrib.get("contrib-type") or ""
                params["deceased"] = contrib.get("deceased") or ""
                params["contrib_xml"] = get_mixed_content(contrib)
                if (
                    params["first_name"]
                    or params["last_name"]
                    or params["string_name"]
                    or params["reference_name"]
                ):
                    gc.append(params)
            if gc:
                groups.append({"content_type": g.get("content-type") or "", "contribs": gc})
        return groups

    def get_kwd_groups(self):
        """Return the keyword groups found at ``kwd_path``.

        A group holding an ``unstructured-kwd-group`` is reported through its
        XML, HTML and TeX renderings with an empty ``kwds`` list. Any other
        group is reported as the inner XML of each ``kwd`` child.
        """
        groups = []
        for g in self.tree.findall(self.kwd_path):
            ugrp = g.find("unstructured-kwd-group")
            the_lang = self.inner_get_lang(g)
            if ugrp is not None:
                value_xml = get_mixed_content(ugrp)
                value_tex = get_tex(ugrp)
                value_html = get_html_mixed_content(ugrp)
                groups.append(
                    {
                        "content_type": g.get("content-type") or "",
                        "lang": the_lang,
                        "value_xml": value_xml,
                        "value_html": value_html,
                        "value_tex": value_tex,
                        "kwds": [],
                    }
                )
            else:
                groups.append(
                    {
                        "content_type": g.get("kwd-group-type") or "",
                        "lang": the_lang,
                        "value": "",
                        "kwds": [innerxml(x) for x in g.findall("kwd")],
                    }
                )
        return groups

    def get_subj_groups(self):
        """Return the subject groups found at ``subj_path``, each with the inner XML of its subjects."""
        groups = []
        for g in self.tree.findall(self.subj_path):
            groups.append(
                {
                    "content_type": g.get("subj-group-type") or "",
                    "lang": self.inner_get_lang(g),
                    "value": "",
                    "subjects": [innerxml(x) for x in g.findall("subject")],
                }
            )
        return groups

    def get_awards(self):
        """Return the awards found at ``funding_path``.

        Each award is ``{"abbrev": ..., "award_id": ...}``. Only award groups
        that have both a funder abbreviation and an ``award-id`` are kept.
        """
        awards = []

        for node in self.tree.findall(self.funding_path):
            abbrev = award_id = None
            for name_node in node.findall("funding-source/named-content"):
                tag = name_node.get("content-type") or ""
                # NOTE(review): "abbrevation" (sic) presumably matches the
                # spelling used in the source XML; confirm before correcting.
                if tag == "abbrevation":
                    abbrev = innerxml(name_node)
            id_node = node.find("award-id")
            if id_node is not None:
                award_id = innerxml(id_node)

            # The original tested the builtin `id`, which is never None, so
            # awards without an award-id were appended with award_id=None.
            if abbrev is not None and award_id is not None:
                awards.append({"abbrev": abbrev, "award_id": award_id})

        return awards

    def get_keywords(self):
        """Return the concatenated XML of the keyword groups at ``kwd_path``."""
        return self.get_catxml(self.kwd_path)

    def get_bibitems(self):
        """Return a ``BibItem`` for each ``ref`` child of the reference list.

        The first element found among ``back_paths`` is searched for
        ``biblio_xpath``. An empty list is returned when either is missing.
        """
        back = None  # stays None when back_paths is empty
        for back_path in self.back_paths:
            back = self.tree.find(back_path)
            if back is not None:
                break
        if back is None:
            return []

        ref_list = back.find(self.biblio_xpath)
        if ref_list is None:
            return []

        return [BibItem(ref) for ref in ref_list if ref.tag == "ref"]
class InCollection(XmlData):
    """Membership of a work in a collection: volume, sequence, series and the collection itself."""

    def __init__(self, tree):
        """Read the volume data of ``tree`` and wrap its ``collection-meta`` child."""
        super().__init__(tree)
        volume, seq, vseries = get_volume_and_seq(tree)
        self.volume = volume
        self.seq = seq
        self.vseries = vseries
        self.collection = Collection(tree.find("collection-meta"))
class BitsCollection(XmlData):
    """Collection membership read from a BITS ``collection-meta`` element.

    Attributes:
        volume: text of ``volume-in-collection/volume-number`` ("" if absent).
        seq: the ``seq`` attribute as an int, else the volume number as an int,
            else 0.
        vseries: text of ``volume-in-collection/volume-series`` ("" if absent).
        collection: ``Collection`` wrapper built on the same element.
    """

    def __init__(self, tree):
        """Extract volume, sequence and series from ``tree``.

        Only the failures that missing or non-numeric data can produce are
        caught. The previous ``except BaseException`` also swallowed
        ``KeyboardInterrupt`` and ``SystemExit``.
        """
        # NOTE(review): XmlData.__init__ is not invoked here, exactly as in the
        # original code; confirm that this is intentional.
        number_path = "volume-in-collection/volume-number"
        series_path = "volume-in-collection/volume-series"

        try:
            seq = int(tree.get("seq"))
        except (AttributeError, TypeError, ValueError):
            try:
                seq = int(tree.find(number_path).text)
            except (AttributeError, TypeError, ValueError):
                seq = 0

        try:
            volume = tree.find(number_path).text
        except AttributeError:
            volume = ""

        try:
            series = tree.find(series_path).text
        except AttributeError:
            series = ""

        self.volume = volume
        self.seq = seq
        self.vseries = series
        self.collection = Collection(tree)
class Publisher(XmlData):
    """Accessors for a ``publisher`` element."""

    mathdoc_id_xpath = 'publisher-id[@publisher-id-type="mathdoc-id"]'

    def get_name(self):
        """Return the text of ``publisher-name``."""
        name = self.get_node_text("publisher-name")
        return name

    def get_loc(self):
        """Return the text of ``publisher-loc``."""
        location = self.get_node_text("publisher-loc")
        return location
class EventSeries(XmlData):
    """Accessors for an event series element."""

    def __init__(self, tree):
        """Wrap ``tree`` and record its ``event-type`` attribute."""
        super().__init__(tree)
        event_type = tree.get("event-type")
        self.event_type = event_type

    def get_title(self):
        """Return the text of ``event-name``."""
        title = self.get_node_text("event-name")
        return title

    def get_acro(self):
        """Return the text of ``event-acronym``."""
        acronym = self.get_node_text("event-acronym")
        return acronym

    def get_short_title(self):
        """Return the short title, which is always empty here."""
        return str()
class Event(XmlData):
    """Accessors for an ``event`` element."""

    def __init__(self, tree):
        """Wrap ``tree`` and record its ``event-type`` attribute."""
        super().__init__(tree)
        event_type = tree.get("event-type")
        self.event_type = event_type

    def get_title(self):
        """Return the text of ``event-name``."""
        title = self.get_node_text("event-name")
        return title

    def get_acro(self):
        """Return the text of ``event-acronym``."""
        acronym = self.get_node_text("event-acronym")
        return acronym

    def get_year(self):
        """Return the text of ``event-date``."""
        year = self.get_node_text("event-date")
        return year

    def get_number(self):
        """Return the text of ``event-num``."""
        number = self.get_node_text("event-num")
        return number

    def get_loc(self):
        """Return the text of ``event-loc``."""
        location = self.get_node_text("event-loc")
        return location
# <collection-meta> of a <book>


class Collection(Work):
    """Accessors for the ``collection-meta`` element of a book."""

    lang = "und"
    title_group_elt_path = "title-group"
    title_path = "title-group/title"
    subtitle_path = "title-group/subtitle"
    abstract_path = "abstract"
    trans_abstract_path = "trans-abstract"
    kwd_path = "kwd-group"
    subj_path = "Not-supported"
    ids_xpath = "collection-id"
    mathdoc_id_xpath = 'collection-id[@collection-id-type="mathdoc-id"]'
    trans_title_group_elt_path = "title-group/trans-title-group"
    trans_title_path = "title-group/trans-title-group/trans-title"
    # Same "Not-supported" placeholder as subj_path and as the sibling classes.
    # The former value contained a space ("Not supported"), which is not a
    # plain tag name when used as an element path.
    funding_path = "Not-supported"

    contrib_path = "contrib-group"
    id_type_attr = "collection-id-type"

    def get_coltype(self):
        """Return the ``collection-type`` attribute, defaulting to "collection"."""
        return self.tree.get("collection-type") or "collection"

    def get_publisher(self):
        """Return a ``Publisher`` for the ``publisher`` child, or ``None`` if absent."""
        node = self.tree.find("publisher")
        if node is not None:
            return Publisher(node)
        return None

    def get_title(self):
        """Return the text of ``title-group/title``."""
        return self.get_node_text("title-group/title")

    def get_abbrev(self):
        """Return the text of ``title-group/abbrev-title``."""
        return self.get_node_text("title-group/abbrev-title")

    def get_ids(self):
        """Return the collection ids plus the print and electronic ISSNs.

        ``issn`` elements with ``pub-type`` "ppub" are reported as "issn" and
        those with "epub" as "e-issn". Other pub types and ISSN elements
        without text are ignored, as ``Journal.get_ids`` does.
        """
        ids = XmlData.get_ids(self)
        for issn in self.tree.findall("issn"):
            if not issn.text:
                # An empty <issn/> used to produce an ("issn", None) entry.
                continue
            itp = issn.get("pub-type")
            if itp == "ppub":
                ids.append(("issn", issn.text))
            elif itp == "epub":
                ids.append(("e-issn", issn.text))
        return ids
# <journal-meta> of a <journal-issue>


class Journal(Work):
    """Accessors for the ``journal-meta`` element of a journal issue."""

    ids_xpath = "journal-id"
    id_type_attr = "journal-id-type"
    title_group_elt_path = "journal-title-group"
    title_path = "journal-title-group/journal-title"
    abbrev_title_path = "journal-title-group/abbrev-title"
    trans_title_group_elt_path = "journal-title-group/trans-title-group"
    trans_title_path = "journal-title-group/trans-title-group/trans-title"
    abstract_path = "abstract"
    trans_abstract_path = "trans-abstract"
    contrib_path = "contrib-group"
    kwd_path = "kwd-group"
    subj_path = "Not-supported"
    funding_path = "Not-supported"

    def get_ids(self):
        """Return the journal ids plus the print ("issn") and electronic ("e-issn") ISSNs.

        ISSN elements without text or with another ``pub-type`` are ignored.
        """
        ids = XmlData.get_ids(self)
        issn_kinds = {"ppub": "issn", "epub": "e-issn"}
        for issn in self.tree.findall("issn"):
            kind = issn_kinds.get(issn.get("pub-type"))
            if kind is not None and issn.text:
                ids.append((kind, issn.text))
        return ids

    def get_publisher(self):
        """Return a ``Publisher`` for the ``publisher`` child, or ``None`` if absent."""
        node = self.tree.find("publisher")
        return Publisher(node) if node is not None else None

    def get_title_group(self):
        """Return the inner XML of the title group, or "" if absent."""
        group = self.tree.find(self.title_group_elt_path)
        if group is None:
            return ""
        return innerxml(group)

    def get_title_xml(self):
        """Return the title group as an XML string, or "" if absent."""
        group = self.tree.find(self.title_group_elt_path)
        if group is None:
            return ""
        return get_mixed_content(group)

    def get_title_html(self):
        """Return the journal title as HTML, or "" if absent."""
        title_node = self.tree.find(self.title_path)
        if title_node is None:
            return ""
        return get_html_mixed_content(title_node)

    def get_title_tex(self):
        """Return the journal title as TeX, or "" if absent."""
        title_node = self.tree.find(self.title_path)
        if title_node is None:
            return ""
        return get_tex(title_node)

    def get_abbrev(self):
        """Return the text found at ``abbrev_title_path``."""
        path = self.abbrev_title_path
        return self.get_node_text(path)

    def get_coltype(self):
        """Return the "serial-type" custom metadata entry (``None`` if absent)."""
        meta = self.custom_meta
        return meta.get("serial-type")
class Publication(Journal):
    """``Journal`` variant whose ids and titles live under publication-specific paths."""

    # identifiers
    ids_xpath = "publication-id"
    id_type_attr = "publication-id-type"

    # titles
    title_group_elt_path = "title-group"
    trans_title_group_elt_path = "title-group/trans-title-group"
    title_path = "title-group/title"
    trans_title_path = "title-group/trans-title-group/trans-title"
    abbrev_title_path = "title-group/abbrev-title"
class Issue(Work):
    """Accessors for a journal issue element.

    Iterating an ``Issue`` yields an ``Article`` for each child of ``body``.
    """

    mathdoc_id_xpath = 'issue-meta/issue-id[@issue-id-type="mathdoc-id"]'
    ids_xpath = "issue-meta/issue-id"
    abstract_path = "issue-meta/abstract"
    trans_abstract_path = "issue-meta/trans-abstract"
    kwd_path = "issue-meta/kwd-group"
    subj_path = "Not-supported"
    contrib_path = "issue-meta/contrib-group"
    title_group_elt_path = "issue-meta/issue-title"
    title_path = "issue-meta/issue-title"
    # TODO support langs in issue-title
    subtitle_path = ""
    trans_title_path = ""
    trans_title_group_elt_path = ""
    counts_path = "issue-meta/counts"
    last_modified_path = 'issue-meta/history/date[@date-type="last-modified"]'
    published_path = 'issue-meta/pub-date[@date-type="pub"]'
    prod_deployed_date_path = 'issue-meta/history/date[@date-type="prod-deployed-date"]'
    funding_path = "Not-supported"

    lang = "und"
    meta_root_xpath = "issue-meta"
    custom_meta_path = "issue-meta/custom-meta-group"

    def get_journal(self):
        """Return a ``Journal`` built on the ``journal-meta`` child."""
        node = self.tree.find("journal-meta")
        return Journal(node)

    def get_ctype(self):
        """Return the container type, always "issue"."""
        return "issue"

    def get_vseries(self):
        """Return the text of ``issue-meta/volume-series``."""
        return self.get_node_text("issue-meta/volume-series")

    def get_vseries_int(self):
        """Return the volume series through ``make_int``, or 0 when it is empty."""
        v = self.get_node_text("issue-meta/volume-series")
        if v:
            return make_int(v)
        return 0

    def get_volume(self):
        """Return the text of ``issue-meta/volume``."""
        return self.get_node_text("issue-meta/volume")

    def get_volume_int(self):
        """Return the volume through ``make_int``, or 0 when it is empty."""
        v = self.get_node_text("issue-meta/volume")
        if v:
            return make_int(v)
        return 0

    def get_number(self):
        """Return the text of ``issue-meta/issue``."""
        return self.get_node_text("issue-meta/issue")

    def get_number_int(self):
        """Return the issue number through ``make_int``, or 0 when it is empty."""
        v = self.get_node_text("issue-meta/issue")
        if v:
            return make_int(v)
        return 0

    def get_year(self):
        """Return the text of ``issue-meta/pub-date/year``."""
        return self.get_node_text("issue-meta/pub-date/year")

    def get_event(self):
        """Return an ``Event`` for the ``event`` child, or ``None`` if absent."""
        node = self.tree.find("event")
        if node is not None:
            return Event(node)
        return None

    def get_publisher(self):
        """Return the ``publisher`` attribute of the issue's journal."""
        xpublisher = None
        xjournal = self.get_journal()
        if xjournal is not None:
            xpublisher = xjournal.publisher
        return xpublisher

    def __iter__(self):
        """Yield an ``Article`` for each child of ``body``.

        An issue without a ``body`` yields nothing. It used to raise
        ``TypeError`` when iterating over ``None``.
        """
        body = self.tree.find("body")
        if body is None:
            return
        for node in body:
            yield Article(node)
class BibItem(XmlData):
    """One bibliography entry (a ``ref`` element).

    Attributes:
        extids: ``(id_type, value)`` pairs gathered from the ``ext-link`` and
            ``pub-id`` elements of the citation.
    """

    extids_xpath = "*/ext-link"
    extid_type_attr = "ext-link-type"

    # remove_links = True
    def __init__(self, tree):
        """Wrap ``tree`` and collect its external ids.

        ``pub-id`` elements without text are skipped. A DOI ``pub-id`` with
        no text used to raise ``AttributeError`` on ``.replace``.
        """
        super().__init__(tree)
        self.extids = self.get_extids()

        # Temporary code
        # Some xml only have a pub-id (doi) and do not have an ext-link with a
        # ext-link-type=doi. We need to manually create the link.
        has_doi = any(id_type == "doi" for id_type, _id_value in self.extids)

        for node in self.tree.findall("*/pub-id"):
            value = node.text
            if value is None:
                continue
            id_type = node.get("pub-id-type")
            if id_type == "doi" and not has_doi:
                for prefix in ("http://dx.doi.org/", "https://doi.org/", "doi:"):
                    value = value.replace(prefix, "")
                self.extids.append(("doi", value))
            elif id_type in ("eid", "arxiv", "tel", "hal", "theses.fr"):
                self.extids.append((id_type, value))

    def get_ref(self):
        """Return the serialized ``ref`` element."""
        return self.tostring()

    def split_label(self):
        """
        Used when sorting non-digit bibitems.

        Splits the lower-cased label around its digits into ``label_prefix``
        and ``label_suffix``.
        """
        label = self.label.lower()

        try:
            self.label_prefix, self.label_suffix = re.split(r"[\d]+", label)
        except ValueError:
            # Special case where label is similar as "Sma" instead of "Sma15"
            self.label_prefix, self.label_suffix = [label, ""]

    def get_label(self):
        """Return the text of the ``label`` child, or "" when there is none."""
        node = self.tree.find("label")
        if node is not None:
            return node.text
        return ""

    def get_user_id(self):
        """Return the ``id`` attribute of the ``ref`` element ("" if absent)."""
        return self.tree.get("id", "")

    def get_citation_xml(self):
        """Return the citation as XML, prefixed with ``<label>`` when a label exists.

        ``mixed-citation`` is used first, ``element-citation`` when it gives
        nothing.
        """
        text = ""
        for name in ("mixed-citation", "element-citation"):
            if not text:
                node = self.tree.find(name)
                text = get_mixed_content(node)

        label = self.get_label()
        if label:
            text = "<label>" + label + "</label>" + text

        return text

    def get_citation_html(self):
        """Return the citation as HTML, prefixed with the bracketed label when one exists."""
        text = ""
        for name in ("mixed-citation", "element-citation"):
            if not text:
                node = self.tree.find(name)
                text = get_html_mixed_content(node)

        label = self.get_label()
        if label:
            if label[0] != "[":
                label = "[" + label + "]"
            text = label + " " + text

        return text

    def get_citation_tex(self):
        """Return the citation as TeX, prefixed with the bracketed label when one exists."""
        text = ""
        for name in ("mixed-citation", "element-citation"):
            if not text:
                node = self.tree.find(name)
                text = get_tex(node)

        label = self.get_label()
        if label:
            if label[0] != "[":
                label = "[" + label + "]"
            text = label + " " + text

        return text

    def get_citation_node(self):
        """Return the ``element-citation`` child, else ``mixed-citation``, else ``None``."""
        tree = self.tree.find("element-citation")
        if tree is None:
            tree = self.tree.find("mixed-citation")

        return tree

    def get_type(self):
        """Return the ``publication-type`` of the citation node, defaulting to "misc"."""
        tree = self.get_citation_node()
        if tree is None:
            return "misc"
        return tree.get("publication-type", "misc")

    def get_node_text(self, node_name, tex=False):
        """Return the content of the ``node_name`` child of the citation node.

        With ``tex`` set the TeX rendering is returned, otherwise the node's
        ``text``. "" is returned when the citation node or the child is
        missing.
        """
        text = ""
        tree = self.get_citation_node()
        if tree is not None:
            node = tree.find(node_name)
            if node is not None:
                if tex:
                    text = get_tex(node)
                else:
                    text = node.text
        return text

    def get_publisher_name(self):
        """Return the text of ``publisher-name``."""
        return self.get_node_text("publisher-name")

    def get_publisher_loc(self):
        """Return the text of ``publisher-loc``."""
        return self.get_node_text("publisher-loc")

    def get_institution(self):
        """Return the text of ``institution``."""
        return self.get_node_text("institution")

    def get_series(self):
        """Return the text of ``series``."""
        return self.get_node_text("series")

    def get_volume(self):
        """Return the text of ``volume``."""
        return self.get_node_text("volume")

    def get_issue(self):
        """Return the text of ``issue``."""
        return self.get_node_text("issue")

    def get_year(self):
        """Return the text of ``year``."""
        return self.get_node_text("year")

    # TODO: comments may have ext-link like arxiv. Add ExtId ?
    def get_comment(self):
        """Return the TeX rendering of ``comment``."""
        return self.get_node_text("comment", tex=True)

    def get_fpage(self):
        """Return the text of ``fpage``."""
        return self.get_node_text("fpage")

    def get_lpage(self):
        """Return the text of ``lpage``."""
        return self.get_node_text("lpage")

    def get_page_range(self):
        """Return the text of ``page-range``."""
        return self.get_node_text("page-range")

    def get_size(self):
        """Return the text of ``page-count``, or of ``size`` when that is empty."""
        text = self.get_node_text("page-count")
        if not text:
            text = self.get_node_text("size")
        return text

    def get_source_tex(self):
        """Return the TeX rendering of ``source``."""
        return self.get_node_text("source", tex=True)

    def get_article_title_tex(self):
        """Return the TeX rendering of ``article-title``."""
        return self.get_node_text("article-title", tex=True)

    def get_chapter_title_tex(self):
        """Return the TeX rendering of ``chapter-title``."""
        return self.get_node_text("chapter-title", tex=True)

    def get_contrib_groups(self):
        """Return a single contributor group built from the names of the citation.

        ``name``, ``string-name`` and ``name-alternatives`` children of the
        citation node are parsed with ``parse_name``. The group is returned
        even when it holds no contributor. No citation node gives an empty
        list.
        """
        groups = []

        tree = self.get_citation_node()
        if tree is not None:
            gc = []

            for child in tree:
                if child.tag in ("name", "string-name", "name-alternatives"):
                    params = parse_name(child)
                    params["contrib_type"] = ""
                    params["contrib_xml"] = get_mixed_content(child)
                    gc.append(params)

            groups.append({"content_type": "", "contribs": gc})
        return groups
class Relation(XmlData):
    """Accessors for a ``related-article`` element."""

    def get_id_type(self):
        """Return the ``ext-link-type`` attribute, or "" when absent or empty."""
        id_type = self.tree.get("ext-link-type")
        return id_type if id_type else ""

    def get_rel_type(self):
        """Return the ``related-article-type`` attribute, or "" when absent or empty."""
        rel_type = self.tree.get("related-article-type")
        return rel_type if rel_type else ""

    def get_id_value(self):
        """Return the text of the element, or "" when it has none."""
        text = self.tree.text
        return text if text else ""

    def get_right_pid(self):
        """Return ``id_value`` as the pid of the related item."""
        right_pid = self.id_value
        return right_pid
class Article(Work):
    """Accessors for a JATS ``article`` element."""

    mathdoc_id_xpath = 'article-id[@pub-id-type="mathdoc-id"]'
    ids_xpath = "front/article-meta/article-id"
    article_xpath = "front/article-meta"

    extids_xpath = (
        'front/article-meta/ext-link[@ext-link-type="mr-item-id"]'
        '|front/article-meta/ext-link[@ext-link-type="zbl-item-id"]'
        '|front/article-meta/ext-link[@ext-link-type="sps-id"]'
        '|front/article-meta/ext-link[@ext-link-type="jfm-item-id"]'
    )
    extid_type_attr = "ext-link-type"
    title_group_elt_path = "front/article-meta/title-group"
    title_path = "front/article-meta/title-group/article-title"
    subtitle_path = "front/article-meta/title-group/subtitle"
    trans_title_group_elt_path = "front/article-meta/title-group/trans-title-group"
    trans_title_path = "front/article-meta/title-group/trans-title-group/trans-title"
    abstract_path = "front/article-meta/abstract"
    trans_abstract_path = "front/article-meta/trans-abstract"
    kwd_path = "front/article-meta/kwd-group"
    subj_path = "front/article-meta/article-categories/subj-group"
    contrib_path = "front/article-meta/contrib-group"
    meta_root_xpath = "front/article-meta"
    custom_meta_path = "front/article-meta/custom-meta-group"
    counts_path = "front/article-meta/counts"
    published_path = 'front/article-meta/pub-date[@date-type="pub"]'
    prod_deployed_date_path = 'front/article-meta/history/date[@date-type="prod-deployed-date"]'
    history_path = "front/article-meta/history/date"
    funding_path = "front/article-meta/funding-group/award-group"

    def __init__(self, tree):
        """Wrap an ``article`` element, unwrapping it first when it comes from OAI.

        Raises:
            IndexError: if ``tree`` is not an article and holds no
                ``metadata/article`` element.
        """
        # Case when we import the JATS article from OAI.
        # The <article> tag is surrounded by a <header> tag. Remove this tag.
        if tree.tag != "article":
            remove_namespace(tree)
            tree = tree.xpath("metadata/article")[0]

        super().__init__(tree)
        self.article_meta = self.get_subtree(self.article_xpath)
        self.atype = tree.get("article-type") or ""
        self.numbering = ""
        self.lang = self.get_lang()

    def get_doi(self):
        """Return the text of the DOI ``article-id``, or ``None`` when there is none."""
        try:
            return self.tree.xpath('front/article-meta/article-id[@pub-id-type="doi"]')[0].text
        except IndexError:
            return None

    # When the JATS XML has only an <article>, we need to construct the Journal on the fly
    def get_journal(self):
        """Return a ``Journal`` built on ``front/journal-meta``.

        Raises:
            IndexError: if the article has no ``journal-meta``.
        """
        node = self.tree.xpath("front/journal-meta")[0]
        return Journal(node)

    def get_issue_id(self):
        """Return the text of ``issue-id``, or "" when there is none."""
        try:
            return self.tree.xpath("front/article-meta/issue-id")[0].text
        except IndexError:
            return ""

    def get_volume(self):
        """Return the text of ``volume``, or "" when there is none."""
        try:
            return self.tree.xpath("front/article-meta/volume")[0].text
        except IndexError:
            return ""

    def get_fpage(self):
        """Return the text of ``fpage``."""
        return self.get_node_text("front/article-meta/fpage")

    def get_lpage(self):
        """Return the text of ``lpage``."""
        return self.get_node_text("front/article-meta/lpage")

    def get_page_type(self):
        """Return the ``content-type`` attribute of ``fpage``, or "" when absent."""
        page_type = ""
        node = self.tree.find("front/article-meta/fpage")
        if node is not None:
            page_type = node.get("content-type")

        if page_type is None:
            page_type = ""

        return page_type

    # Olivier 2016-01-13 add page-range & elocation
    def get_page_range(self):
        """Return the text of ``page-range``."""
        return self.get_node_text("front/article-meta/page-range")

    def get_elocation(self):
        """Return the text of ``elocation-id``."""
        return self.get_node_text("front/article-meta/elocation-id")

    def get_body(self):
        """Return the plain text of ``body`` ("" when there is no body)."""
        node = self.tree.find("body")
        return get_node_text(node)

    def body_jats_to_html(self, base_url):
        """Convert ``body`` to HTML.

        Returns:
            ``(body_html, figures)``: ``("", [])`` when there is no body.
        """
        body_html = ""
        figures = []
        node = self.tree.find("body")
        if node is not None:
            body_html, figures = get_html_mixed_content_with_figures(
                node,
                is_top=True,
                is_citation=False,
                is_comment=False,
                is_figure=False,
                prefix="",
                suffix="",
                sec_level=2,
                label_title="",
                figures=figures,
                base_url=base_url,
            )
        return body_html, figures

    def get_body_tex(self):
        """Return the TeX rendering of ``body``."""
        node = self.tree.find("body")
        # TODO: body_tex should actually be the HTML with the TeX formulas as text
        return get_tex(node)

    def get_body_xml(self):
        """Return ``body`` as an XML string."""
        node = self.tree.find("body")
        return get_mixed_content(node)

    def get_seq(self):
        """Return the sequence number of the article as an int.

        The ``seq`` attribute of ``issue`` is used first, then that of
        ``fpage``. 0 is returned when neither gives a valid integer.
        """
        issue = self.get_subtree("front/article-meta/issue")
        seq = 0
        if issue is not None:
            seq = issue.get("seq") or 0
        if not seq:
            fpage = self.get_subtree("front/article-meta/fpage")
            if fpage is not None:
                seq = fpage.get("seq") or 0
        try:
            seq = int(seq)
        except (TypeError, ValueError):
            seq = 0
        return seq

    def get_relations(self):
        """Return a ``Relation`` per ``related-article``, with ``left_pid`` set to this article's pid."""
        relations = []
        for n in self.tree.findall("front/article-meta/related-article"):
            rel = Relation(n)
            rel.left_pid = self.pid
            relations.append(rel)
        return relations

    def get_history_dates(self):
        """Return ``{"type", "date"}`` dicts for the dates at ``history_path``.

        Raises:
            KeyError: if a date lacks ``date-type`` or ``iso-8601-date``.
        """
        dates = []
        for node in self.tree.findall(self.history_path):
            date_type = node.attrib["date-type"]
            date = node.attrib["iso-8601-date"]
            dates.append({"type": date_type, "date": date})

        return dates

    def get_article_number(self):
        """Return the "article-number" custom metadata entry ("" if absent)."""
        return self.custom_meta.get("article-number", "")

    def get_talk_number(self):
        """Return the "talk-number" custom metadata entry ("" if absent)."""
        return self.custom_meta.get("talk-number", "")
class BookSeries(XmlData):
    """Accessors for the series (collection) metadata of a book."""

    mathdoc_id_xpath = 'collection-id[@collection-id-type="mathdoc-id"]'
    ids_xpath = "collection-id"
    extid_type_attr = "collection-id-type"
    title_group_elt_path = "title-group"
    title_path = "title-group/title"
    subtitle_path = "title-group/subtitle"
    lang = "und"

    def get_ids(self):
        """Return the ISSN (when there is one) followed by every ``collection-id``."""
        issn = self.get_node_text("issn")
        ids = [("issn", issn)] if issn else []
        ids.extend(
            (id_node.get("collection-id-type"), id_node.text)
            for id_node in self.tree.findall("collection-id")
        )
        return ids

    def get_title(self):
        """Return the text found at ``title_path``."""
        path = self.title_path
        return self.get_node_text(path)

    def get_abbrev(self):
        """Return the text of ``title-group/abbrev-title``."""
        abbrev = self.get_node_text("title-group/abbrev-title")
        return abbrev

    def get_publisher(self):
        """Return a ``Publisher`` for the ``publisher`` child, or ``None`` if absent."""
        node = self.tree.find("publisher")
        return Publisher(node) if node is not None else None

    def get_stype(self):
        """Return the "serial-type" custom metadata entry (``None`` if absent)."""
        meta = self.custom_meta
        return meta.get("serial-type")
2341# Mixin
# HasParts adds get_parts() to classes that expose xget_subtrees(),
# get_subtree(), a `tree` attribute and a get_book_part_class() callable.
2342class HasParts:
# get_parts: look for "book-part" subtrees under "book-body" first, then under
# "body"; wrap each one with the class returned by get_book_part_class() and
# return the resulting list. The "book-body"/"body" node is also removed from
# self.tree.
# NOTE(review): indentation was lost in this copy of the file, so it cannot be
# told from here whether the body-removal step below is nested under the
# `if parts:` test or runs unconditionally -- confirm against the real source.
2343 def get_parts(self):
2344 xparts = []
2345 for name in ("book-body", "body"):
2346 parts = self.xget_subtrees("%s/book-part" % name)
2347 if parts:
2348 break
2349 if parts:
2350 for tree in parts:
2351 part = self.__class__.get_book_part_class()(tree)
2352 xparts.append(part)
2353 for name in ("book-body", "body"):
2354 body = self.get_subtree(name)
2355 if body is not None:
2356 break
2357 if body is not None:
# self.tree may be a tree object (has getroot()) or a bare element (has
# remove()); the first form is tried and any failure falls back to the second.
# NOTE(review): `except BaseException` also traps KeyboardInterrupt/SystemExit.
2358 try:
2359 self.tree.getroot().remove(body) # XSLT result tree
2360 except BaseException:
2361 self.tree.remove(body) # Element tree
2362 return xparts
class BookPart(Work, HasParts):
    """Work wrapper whose metadata lives under "book-part-meta".

    Instances read their type, numbering and "indexed" flag from the
    attributes of the wrapped tree and collect nested parts via get_parts().
    """

    id_type_attr = "book-part-id-type"
    part_xpath = "book-part-meta"
    ids_xpath = "book-part-meta/book-part-id"
    mathdoc_id_xpath = 'book-part-meta/book-part-id[@book-part-id-type="mathdoc-id"]'
    meta_xpath = "book-part-meta"
    extids_xpath = (
        'book-part-meta/ext-link[@ext-link-type="mr-item-id"]'
        '|book-part-meta/ext-link[@ext-link-type="zbl-item-id"]'
        '|book-part-meta/ext-link[@ext-link-type="jfm-item-id"]'
    )
    extid_type_attr = "ext-link-type"
    title_group_elt_path = "book-part-meta/title-group"
    title_path = "book-part-meta/title-group/title"
    subtitle_path = "book-part-meta/title-group/subtitle"
    trans_title_group_elt_path = "book-part-meta/title-group/trans-title-group"
    trans_title_path = "book-part-meta/title-group/trans-title-group/trans-title"
    abstract_path = "book-part-meta/abstract"
    trans_abstract_path = "book-part-meta/trans-abstract"
    kwd_path = "book-part-meta/kwd-group"
    # NOTE(review): the only path here prefixed with "front/" -- confirm it is intended.
    subj_path = "front/book-part-meta/article-categories/subj-group"
    contrib_path = "book-part-meta/contrib-group"
    meta_root_xpath = "book-part-meta"
    custom_meta_path = "book-part-meta/custom-meta-group"
    funding_path = "book-part-meta/funding-group/award-group"

    def __init__(self, tree):
        """Wrap *tree* and cache the values read from its attributes."""
        super().__init__(tree)
        self.part_meta = self.get_subtree(self.part_xpath)
        # A part is indexed unless the attribute says otherwise.
        self.indexed = tree.get("indexed", "true") == "true"
        self.atype = tree.get("book-part-type") or ""
        self.numbering = tree.get("book-part-number") or ""
        self.parts = self.get_parts()
        self.lang = self.get_lang()

    def get_fpage(self):
        """Return the text of ``book-part-meta/fpage``."""
        return self.get_node_text("book-part-meta/fpage")

    def get_lpage(self):
        """Return the text of ``book-part-meta/lpage``."""
        return self.get_node_text("book-part-meta/lpage")

    def get_page_range(self):
        """Return an empty string: no page range is read for book parts."""
        return ""

    def get_page_type(self):
        """Return the "content-type" attribute of the fpage node ("" if absent)."""
        fpage_node = self.tree.find("book-part-meta/fpage")
        if fpage_node is None:
            return ""
        return fpage_node.get("content-type") or ""

    def get_seq(self):
        """Return ``self.fpage`` as an int, or 0 when it is not a number."""
        try:
            return int(self.fpage)
        except (TypeError, ValueError):
            # Only conversion failures are expected here; BaseException would
            # also hide KeyboardInterrupt/SystemExit.
            return 0

    def get_body(self):
        """Return the "body" node serialized as UTF-8 bytes, or "" without one."""
        body_node = self.tree.find("body")
        if body_node is None:
            return ""
        return etree.tostring(body_node, encoding="utf-8", xml_declaration=False)

    def get_relations(self):
        """Return one Relation per ``book-part-meta/related-article`` node."""
        relations = []
        for related_node in self.tree.findall("book-part-meta/related-article"):
            relation = Relation(related_node)
            relation.left_pid = self.pid
            relations.append(relation)
        return relations

    def get_article_number(self):
        """Return the "article-number" entry of ``self.custom_meta`` ("" if absent)."""
        return self.custom_meta.get("article-number", "")

    def get_talk_number(self):
        """Return the "talk-number" entry of ``self.custom_meta`` ("" if absent)."""
        return self.custom_meta.get("talk-number", "")
def get_volume_and_seq(incol):
    """Return ``(volume, seq, vseries)`` for an in-collection node.

    * ``volume`` is the text of the "volume" child ("" without that child).
    * ``seq`` is the integer "seq" attribute of *incol*. When that attribute is
      missing or not a number, it is built from the digits found in the part
      of the volume text that precedes the first "-" (0 when there are none).
    * ``vseries`` is the text of the "volume-series" child ("" without that
      child). When it is a number, ``seq`` becomes ``vseries * 10000 + seq``.

    :param incol: element exposing ``find()`` and ``get()``
    """
    volume_node = incol.find("volume")
    volume = volume_node.text if volume_node is not None else ""

    try:
        seq = int(incol.get("seq"))
    except (TypeError, ValueError):
        # No usable "seq" attribute: fall back on the volume label.
        # The digits are joined into a string before conversion; int() on
        # the list of characters always failed, forcing seq to 0.
        leading_part = (volume or "").split("-")[0]
        digits = "".join(char for char in leading_part if char.isdigit())
        try:
            seq = int(digits)
        except ValueError:
            seq = 0

    series_node = incol.find("volume-series")
    vseries = series_node.text if series_node is not None else ""

    if vseries:
        try:
            # no more than 10000 volumes in a series (gasp)
            seq = int(vseries) * 10000 + seq
        except ValueError:
            # Non-numeric series label: keep the volume-based seq.
            pass

    return (volume, seq, vseries)
class Book(Work, HasParts):
    """Work wrapper whose metadata lives under "book-meta".

    On construction the book collects its parts, works out its contributor
    groups (falling back on those of its parts when it has none of its own)
    and reads the collections it belongs to.
    """

    id_type_attr = "book-id-type"
    mathdoc_id_xpath = 'book-meta/book-id[@book-id-type="mathdoc-id"]'
    ids_xpath = "book-meta/book-id"
    book_xpath = "book-meta"
    extids_xpath = (
        'book-meta/ext-link[@ext-link-type="mr-item-id"]'
        '|book-meta/ext-link[@ext-link-type="zbl-item-id"]'
        '|book-meta/ext-link[@ext-link-type="jfm-item-id"]'
    )
    extid_type_attr = "ext-link-type"

    title_group_elt_path = "book-meta/book-title-group"
    title_path = "book-meta/book-title-group/book-title"
    alternate_title_group_elt_path = "collection-meta/volume-in-collection/volume-title"
    alternate_title_path = "collection-meta/volume-in-collection/volume-title"
    trans_title_group_elt_path = "book-meta/book-title-group/trans-title-group"
    trans_title_path = "book-meta/book-title-group/trans-title-group/trans-title"
    subtitle_path = "book-meta/book-title-group/subtitle"

    abstract_path = "book-meta/abstract"
    trans_abstract_path = "book-meta/trans-abstract"
    kwd_path = "book-meta/kwd-group"
    subj_path = "Not-supported"
    contrib_path = "book-meta/contrib-group"
    meta_root_xpath = "book-meta"
    custom_meta_path = "book-meta/custom-meta-group"
    counts_path = "book-meta/counts"
    last_modified_path = 'book-meta/pub-history/date[@date-type="last-modified"]'
    published_path = 'book-meta/pub-date[@date-type="pub"]'
    prod_deployed_date_path = 'book-meta/pub-history/date[@date-type="prod-deployed-date"]'
    year_path = "book-meta/pub-date/year"
    funding_path = "Not-supported"

    mbook_seq = 0
    mbook_volume = ""
    mbook_vseries = ""

    def __init__(self, tree):
        """Wrap *tree*, unwrapping the OAI envelope first when there is one."""
        # Case when we import the book from OAI.
        # The <book> tag is surrounded by a <header> tag. Remove this tag.
        if tree.tag != "book":
            remove_namespace(tree)
            tree = tree.xpath("metadata/book")[0]
        # Metadata may be wrapped in a leading <front> element; work on it
        # directly. The length guard avoids an IndexError on an empty book.
        if len(tree) and tree[0].tag == "front":
            tree = tree.xpath("front")[0]

        super().__init__(tree)
        self.book_meta = self.get_subtree(self.book_xpath)
        self.contrib_groups = []
        try:
            self.book_type = tree.get("book-type") or "Book"
        except AttributeError:
            # Tree objects have no get(); read the attribute on their root.
            self.book_type = tree.getroot().get("book-type") or "Book"

        # if self.book_type == 'proceedings' or self.book_type == 'edited-book'
        # or self.book_type == 'monograph' :
        if self.book_type:
            self.parts = self.get_parts()

            # patch for book without contrib-group:
            # 1 : monograph with book_parts : contrib-group of book equal to the
            #     contrib-group of the first book-part
            # OR 2 : edited-books with same author for all of its book_parts :
            #     book-type becomes 'monograph' and contrib-group of book equal to
            #     the contrib-group of the first book-part
            # OR 3 : edited-books but not same author for all book-parts :
            #     contrib-group of book becomes "Collectif"
            self.contrib_groups = self.get_contrib_groups()
            if not self.contrib_groups:
                if self.book_type == "monograph" and self.parts:
                    self.contrib_groups = self.parts[0].get_contrib_groups()
                elif self.book_type == "edited-book" and self.parts:
                    # check if authors of the book-parts are identical
                    first_part_groups = self.parts[0].get_contrib_groups()
                    same_authors = all(
                        part.get_contrib_groups() == first_part_groups for part in self.parts
                    )
                    if same_authors:
                        # Case 2 above: this used to be a no-op comparison
                        # (`==`) although an assignment is what is described.
                        self.book_type = "monograph"
                        self.contrib_groups = first_part_groups
                    else:
                        self.contrib_groups = [
                            {
                                "contribs": [
                                    {
                                        "first_name": "",
                                        "last_name": "Collectif",
                                        "suffix": "",
                                        "string_name": "Collectif",
                                        "reference_name": "Collectif",
                                        "contrib_xml": "<contrib><name><surname>Collectif</surname><given-names>"
                                        + "</given-names></name><name-alternatives>"
                                        + '<string-name specific-use="index">Collectif</string-name></name-alternatives></contrib>',
                                        "prefix": "",
                                        "contrib_type": "author",
                                    }
                                ],
                                "content_type": "authors",
                            }
                        ]

            self.body = ""
        # else: #or self.book_type == 'monograph': a monograph has no book-part,
        # its body holds the full text
        #     self.parts = []
        self.incollection = self.get_incollection()

        self.lang = self.get_lang()

    @staticmethod
    def get_book_part_class():
        """Return the class used by get_parts() to wrap each book part."""
        return BookPart

    def get_doi(self):
        """Return the text of the first book-id of type "doi", or None."""
        try:
            return self.tree.xpath('book-meta/book-id[@book-id-type="doi"]')[0].text
        except (IndexError, AttributeError):
            # No DOI element (or no usable tree): report "no DOI".
            return None

    def get_ctype(self):
        """Return "book-" followed by the book type."""
        return "book-%s" % self.book_type

    def get_contrib_groups(self):
        """Return the cached contributor groups, else those of the parent class."""
        if self.contrib_groups:
            return self.contrib_groups
        return super().get_contrib_groups()

    def get_publisher(self):
        """Return a Publisher built from ``book-meta/publisher``, or None."""
        publisher_node = self.tree.find("book-meta/publisher")
        if publisher_node is None:
            return None
        return Publisher(publisher_node)

    def get_year(self):
        """Return the text of the node at ``self.year_path``."""
        return self.get_node_text(self.year_path)

    def get_title(self):
        """Return the book title, falling back on the volume title of its collection."""
        # NOTE(review): this path differs from title_path
        # ("book-meta/book-title-group/book-title") -- confirm it is intended.
        text = self.get_node_text("book-meta/title-group/title")
        if not text:
            # The fallback value used to be computed and then thrown away.
            text = self.get_node_text("collection-meta/volume-in-collection/volume-title")
        return text

    def get_body(self):
        """Return the "book-body" node serialized as UTF-8 bytes, or "" without one."""
        body_node = self.tree.find("book-body")
        if body_node is None:
            return ""
        return etree.tostring(body_node, encoding="utf-8", xml_declaration=False)

    def get_incollection(self):
        """Return the collections of the book.

        "in-collection" nodes are wrapped as InCollection; when there are none,
        "collection-meta" nodes are wrapped as BitsCollection instead.
        """
        incols = [InCollection(node) for node in self.tree.findall("in-collection")]
        if incols:
            return incols
        return [BitsCollection(node) for node in self.tree.findall("collection-meta")]

    def get_event(self):
        """Return an Event built from ``book-meta/event``, or None."""
        event_node = self.tree.find("book-meta/event")
        if event_node is None:
            return None
        return Event(event_node)

    def get_event_series(self):
        """Return an EventSeries built from ``book-meta/event-series``, or None."""
        series_node = self.tree.find("book-meta/event-series")
        if series_node is None:
            return None
        return EventSeries(series_node)

    def get_vseries(self):
        """Return the text of ``book-meta/volume-series``."""
        return self.get_node_text("book-meta/volume-series")

    def get_frontmatter(self):
        """Return innerxml() of the "front-matter" node, or "" without one."""
        front_node = self.tree.find("front-matter")
        if front_node is None:
            return ""
        return innerxml(front_node)

    def get_relations(self):
        """Return one Relation per ``book-meta/related-article`` node."""
        relations = []
        for related_node in self.tree.findall("book-meta/related-article"):
            relation = Relation(related_node)
            relation.left_pid = self.pid
            relations.append(relation)
        return relations
# Wrapper class to instantiate for each object kind name.
factories = dict(
    collection=Collection,
    publisher=Publisher,
    journal=Journal,
    issue=Issue,
    article=Article,
    book=Book,
)
def xobj_fromtree(classname, tree):
    """Build the object registered under *classname* in ``factories`` from *tree*.

    :raises KeyError: when *classname* is not a key of ``factories``
    """
    return factories[classname](tree)
def xobj_fromstring(classname, metadata):
    """Parse *metadata* with ``etree.fromstring`` and pass the result to xobj_fromtree()."""
    return xobj_fromtree(classname, etree.fromstring(metadata))
def xobj_fromfile(classname, path):
    """Read the file at *path* as bytes and pass its content to xobj_fromstring().

    :param classname: forwarded unchanged to xobj_fromstring()
    :param path: location of the file to read
    """
    # The context manager closes the handle even if reading fails; the
    # previous bare open().read() left that to the garbage collector.
    with open(path, "rb") as stream:
        metadata = stream.read()
    return xobj_fromstring(classname, metadata)
def update_bibitem_xml(bibitem, new_ids):
    """Rebuild a BibItem from ``bibitem.citation_xml`` with refreshed identifiers.

    The citation XML is wrapped in a <ref> element and parsed. In its
    "element-citation" child (or "mixed-citation" when there is none), every
    "ext-link"/"pub-id" child whose type is a key of *new_ids* is removed.
    One element is then appended for each entry of *new_ids* that is checked
    and not a false positive. Nothing is modified when neither citation
    element exists.

    :param bibitem: object exposing a ``citation_xml`` string
    :param new_ids: dict mapping an id type to a dict with the keys
        "checked", "false_positive" and "id_value"
    :return: a BibItem built from the updated <ref> tree
    """
    # Id types stored as <pub-id>; every other type becomes an <ext-link>.
    pub_id_types = ("doi", "arxiv", "tel", "hal", "theses.fr")
    # Attribute carrying the id type for each kind of identifier element.
    type_attr_by_tag = {"ext-link": "ext-link-type", "pub-id": "pub-id-type"}

    parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=True, remove_comments=True
    )
    tree = etree.fromstring("<ref>" + bibitem.citation_xml + "</ref>", parser=parser)

    citation = tree.find("element-citation")
    if citation is None:
        citation = tree.find("mixed-citation")

    if citation is not None:
        # Collect first, remove afterwards: the children are not removed
        # while they are being iterated over.
        outdated = []
        for child in citation:
            type_attr = type_attr_by_tag.get(child.tag)
            if type_attr is None:
                continue
            id_type = child.get(type_attr)
            if id_type and id_type in new_ids:
                outdated.append(child)
        for child in outdated:
            citation.remove(child)

        for id_type, value_dict in new_ids.items():
            if not value_dict["checked"] or value_dict["false_positive"]:
                continue
            tag = "pub-id" if id_type in pub_id_types else "ext-link"
            new_node = etree.Element(tag)
            new_node.set(type_attr_by_tag[tag], id_type)
            new_node.text = value_dict["id_value"]
            citation.append(new_node)

    return BibItem(tree)
2743#########################################################################################
2744#
2745# Create XML strings based on internal data
2746#
2747#########################################################################################
def get_contrib_xml(type, first_name, last_name, prefix, suffix, deceased):
    """Return a <contrib> XML string built from the given name parts.

    Empty values are skipped: the matching attribute or element is left out.
    Values are inserted as they are, without XML escaping.

    :param type: value of the "contrib-type" attribute
    :param deceased: when true, a ``deceased="yes"`` attribute is added
    """
    pieces = ["<contrib"]
    if type:
        pieces.extend((' contrib-type="', type, '"'))
    if deceased:
        pieces.append(' deceased="yes"')
    pieces.append("><name>")

    # Child elements of <name>, in output order.
    name_elements = (
        ("<prefix>", prefix, "</prefix>"),
        ("<given-names>", first_name, "</given-names>"),
        ("<surname>", last_name, "</surname>"),
        ("<suffix>", suffix, "</suffix>"),
    )
    for opening, value, closing in name_elements:
        if value:
            pieces.extend((opening, value, closing))

    pieces.append("</name></contrib>")
    return "".join(pieces)
def get_title_xml(title):
    """Return a <title-group> XML string with *title* inside an <article-title>.

    *title* is inserted as it is, without XML escaping.
    """
    opening = '<title-group xmlns:xlink="http://www.w3.org/1999/xlink"><article-title xml:space="preserve">'
    closing = "</article-title></title-group>"
    return opening + title + closing