Coverage for src/ptf/cmds/xml/xml_utils.py: 54%
372 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-05 09:56 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-05 09:56 +0000
1import html
2import os
4from lxml import etree
5from lxml import objectify
6from lxml.html import fromstring
8from ptf.model_data import ContributorDict
9from ptf.model_data import ExtLinkDict
# Unicode to XML
def escape(string: str):
    """Escape the three XML-reserved characters in *string*.

    '&' must be replaced first so the '&' introduced by '&lt;'/'&gt;' is not
    double-escaped.  (The entity strings were rendered away in the coverage
    dump this file was recovered from; restored here.)
    """
    return string.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
# Replace html entities like &phiv; by their corresponding unicode characters
# except for XML reserved characters (& < >)
def replace_html_entities(text):
    """Decode HTML entities in *text* while keeping &amp; &lt; &gt; escaped.

    NOTE(review): the entity names below were reconstructed from a rendered
    dump of this file (the dump converted them to their unicode characters);
    confirm them against the original source.
    """
    # The MathML 2 entity names do not always denote the same characters as
    # the HTML 5 entities known to html.unescape().
    # See https://www.w3.org/TR/xml-entity-names/#changes20080721
    # Manually map the differences first.
    text = text.replace("&epsiv;", chr(949))
    text = text.replace("&OverBar;", chr(175))
    text = text.replace("&UnderBar;", " " + chr(818))
    text = text.replace("&eacute;", chr(233))
    text = text.replace("&Eacute;", chr(201))
    text = text.replace("&ccedil;", chr(231))
    text = text.replace("&Ccedil;", chr(199))

    # cdrxml.xml files have XML/MathML (?) entities like &phiv;
    # They are converted to unicode characters in recent /cedram_dev/exploitation
    # files (AIF > 2013) but are kept intact in old ones.
    # Need to map the differences (phi/phiv swapped meaning in 2008).
    text = text.replace("&phiv;", chr(966))
    text = text.replace("&phi;", chr(981))

    # text has html entities like &phiv; that need to be replaced by the unicode character.
    # But html.unescape() would also decode &lt; &gt; &amp;
    # The proper solution would be to not call get_xml_from_node and continue the
    # recursive parsing of mathml nodes.
    # A hack is used: rename the reserved entities, call html.unescape, then restore them.
    text = text.replace("&lt;", "&mylt;").replace("&gt;", "&mygt;").replace("&amp;", "&myamp;")
    text = html.unescape(text)
    text = text.replace("&mylt;", "&lt;").replace("&mygt;", "&gt;").replace("&myamp;", "&amp;")

    # Bug in html.unescape ? Why does this module replace a unicode by another ?
    text = text.replace(chr(10216), chr(9001)).replace(chr(10217), chr(9002))
    text = text.replace(chr(10214), chr(12314)).replace(chr(10215), chr(12315))
    text = text.replace(chr(9183), chr(65080))

    return text
def normalize(name):
    """Strip a Clark-notation namespace from an lxml tag: "{uri}tag" -> "tag".

    Names without a namespace are returned unchanged.  Uses startswith() so an
    empty string is returned as-is (the original ``name[0]`` indexing raised
    IndexError on "").
    """
    if name.startswith("{"):
        _, tag = name[1:].split("}")
        return tag
    return name
def get_xml_file_count(folder):
    """Count the ``<dir>/<dir>.xml`` files among the sub-directories of *folder*.

    A directory is only considered while the walked root contains fewer than
    3 path separators (depth guard; note the count is on the *path string*,
    so it also depends on how deep *folder* itself is).
    """
    count = 0
    for root, dirs, _files in os.walk(folder):
        # Depth guard, hoisted out of the inner loop (it only depends on root).
        if root.count(os.path.sep) >= 3:
            continue
        for name in dirs:
            # Note: joined from *folder*, not *root*, as in the original code.
            candidate = os.path.join(folder, name, name + ".xml")
            if os.path.isfile(candidate):
                count += 1
    return count
def get_xml_from_text(tag, text):
    """Serialize ``<tag>text</tag>`` as a unicode string."""
    element = etree.Element(tag)
    element.text = text
    # tostring() with encoding="UTF-8" returns bytes: decode back to str.
    return etree.tostring(element, encoding="UTF-8").decode("utf-8")
def remove_namespace(tree):
    """Strip namespace prefixes from every tag of *tree*, in place.

    Uses ``tree.iter()`` instead of the deprecated ``getiterator()`` (same
    iteration behaviour in lxml).  Finally removes now-unused namespace
    declarations via objectify.deannotate().
    """
    for elem in tree.iter():
        if not hasattr(elem.tag, "find"):
            # Comments / processing instructions have a callable .tag: skip them.
            continue
        i = elem.tag.find("}")
        if i >= 0:
            # "{uri}tag" -> "tag"
            elem.tag = elem.tag[i + 1 :]
    objectify.deannotate(tree, cleanup_namespaces=True, xsi_nil=True)
def get_normalized_attrib(node, attrib_name):
    """Return the value of attribute *attrib_name* on *node*, ignoring namespaces.

    Scans every attribute, so when several namespaced attributes normalize to
    the same name the last one wins (same as the original loop).  Returns
    None when *node* is None or the attribute is absent.
    """
    if node is None:
        return None
    found = None
    for raw_name in node.attrib:
        if normalize(raw_name) == attrib_name:
            found = node.attrib[raw_name]
    return found
def get_xml_from_node(node):
    """Serialize *node* to a unicode XML string ("" when node is None, tail excluded)."""
    if node is None:
        return ""
    return etree.tostring(
        node, encoding="unicode", method="xml", xml_declaration=False, with_tail=False
    )
def get_xml_from_node2(node, with_tail=False):
    """Hand-rolled recursive serializer: emits tags (namespace-stripped) and text.

    Attributes are deliberately not serialized.  The tail of the top-level
    node is skipped; children are always serialized with their tail.
    """
    tag = normalize(node.tag)
    parts = ["<" + tag + ">"]
    if node.text:
        parts.append(node.text)
    for child in node:
        parts.append(get_xml_from_node2(child, True))
    parts.append("</" + tag + ">")
    if with_tail and node.tail:
        parts.append(node.tail)
    return "".join(parts)
# tostring is a useless function for 'text': it simply removes the HTML entities!
def get_old_text_from_node(node):
    """Extract raw text with lxml's text serializer ("" when node is None, tail excluded)."""
    if node is None:
        return ""
    return etree.tostring(
        node, encoding="unicode", method="text", xml_declaration=False, with_tail=False
    )
def get_text_from_node(node, **kwargs):
    """Concatenate the text of *node* and its descendants, decoding HTML entities.

    The tail of the top-level node is skipped; tails of inner nodes are kept.
    ``kwargs["is_top"]`` tracks the recursion level.
    """
    is_top = kwargs.setdefault("is_top", True)
    if node is None:
        return ""

    parts = []
    if node.text is not None:
        parts.append(replace_html_entities(node.text))

    kwargs["is_top"] = False
    for child in node:
        parts.append(get_text_from_node(child, **kwargs))

    if not is_top and node.tail is not None:
        parts.append(replace_html_entities(node.tail))

    return "".join(parts)
def fix_mfenced_in_mathml(text):
    """Move stray fence characters back into <mfenced> open=/close= attributes.

    lxml's tostring() can leave the delimiters as loose text right before an
    ``<mfenced open="" close="">`` element; this scans for such 1- or 2-char
    runs and writes them into the empty attributes.
    """
    pos = 0
    while True:
        pos = text.find("<mfenced", pos)
        if pos == -1:
            break
        if pos > 0 and text[pos - 1] != ">":
            # Walk back to the end of the previous tag to isolate the stray run.
            start = pos - 1
            while start > 0 and text[start] != ">":
                start -= 1
            stray = text[start + 1 : pos].strip()
            if 0 < len(stray) < 3:
                if len(stray) == 1:
                    first, second = stray, ""
                else:
                    first, second = stray[0], stray[1]

                head = text[: start + 1]
                tail = text[pos:]

                if second == "":
                    # A single delimiter: decide whether it opens or closes.
                    if stray in ("{", "("):
                        open_c, close_c = stray, ""
                    else:
                        open_c, close_c = "", stray
                else:
                    # Two delimiters: assign them by attribute order in the tag.
                    if tail.find('open=""') < tail.find('close=""'):
                        open_c, close_c = first, second
                    else:
                        open_c, close_c = second, first

                tail = tail.replace('open=""', 'open="' + open_c + '"', 1)
                tail = tail.replace('close=""', 'close="' + close_c + '"', 1)
                text = head + tail
        pos += 1

    return text

    # chars = ('∥', '|')
    # for c in chars:
    #     if c + c in math_node_text:
    #         l = math_node_text.split(c + c)
    #         # Bug in lxml. A formula with open="∥" becomes wrong with tostring
    #         # A proper solution would be to rewrite get_xml_from_node and stop using tostring
    #         end_ = l[1].replace('open=""', 'open="' + c + '"', 1).replace('close=""', 'close="' + c + '"', 1)
    #         math_node_text = l[0] + end_
def add_mml_ns(node):
    """Recursively move *node* and all its descendants into the MathML namespace."""
    if node is None:
        return
    node.tag = etree.QName("http://www.w3.org/1998/Math/MathML", normalize(node.tag))
    for sub in node:
        add_mml_ns(sub)
def get_text_from_original_title_with_mathml(xml, **kwargs):
    """Extract the (possibly MathML-bearing) text of a title element from *xml*.

    Only the main language is kept.  With ``get_trans_title=True`` the
    translated title inside <trans-title-group> is extracted instead.
    Returns None when no matching element is found.
    """
    parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
    )
    etree.register_namespace("mml", "http://www.w3.org/1998/Math/MathML")
    cleaned = xml.replace('xmlns:xlink="http://www.w3.org/1999/xlink"', "")
    tree = etree.fromstring(cleaned.encode("utf-8"), parser=parser)

    get_trans_title = kwargs.get("get_trans_title", False)

    for node in tree:
        tag = normalize(node.tag)
        if get_trans_title and tag == "trans-title-group":
            for child in node:
                if normalize(child.tag) == "trans-title":
                    return get_text_from_node_with_mathml(child, **kwargs)
        elif not get_trans_title and tag in (
            "title",
            "journal-title",
            "article-title",
            "book-title",
        ):
            return get_text_from_node_with_mathml(node, **kwargs)
def get_text_from_xml_with_mathml(xml, **kwargs):
    """Parse *xml* (dropping the xlink namespace declaration) and extract its text."""
    parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
    )
    etree.register_namespace("mml", "http://www.w3.org/1998/Math/MathML")
    cleaned = xml.replace('xmlns:xlink="http://www.w3.org/1999/xlink"', "")

    tree = etree.fromstring(cleaned.encode("utf-8"), parser=parser)
    return get_text_from_node_with_mathml(tree, **kwargs)
def get_text_from_node_with_mathml(node, **kwargs):
    """Extract the text of *node*, preserving formulas.

    For <inline-formula>/<disp-formula> elements the <alternatives> child is
    inspected: the <math> alternative is serialized when
    ``kwargs["with_mathml"]`` is true, the <tex-math> one otherwise.  Other
    nodes contribute their escaped text, their children recursively, and
    their tail.
    """
    text = ""

    if node is None:
        return text

    kwargs.setdefault("is_top", True)
    kwargs.setdefault("with_mathml", False)

    tag = normalize(node.tag)

    if tag == "inline-formula" or tag == "disp-formula":
        remove_namespace(node)
        for child in node:
            if normalize(child.tag) == "alternatives":
                for alternative in child:
                    alt_tag = normalize(alternative.tag)
                    if alt_tag == "math" and kwargs["with_mathml"]:
                        add_mml_ns(alternative)
                        text = get_xml_from_node(alternative)
                    elif alt_tag == "tex-math" and not kwargs["with_mathml"]:
                        text = get_xml_from_node(alternative)
    else:
        if node.text:
            text += node.text
            text = escape(text)

        kwargs["is_top"] = False
        for child in node:
            text += get_text_from_node_with_mathml(child, **kwargs)

        # NOTE(review): kwargs["is_top"] was just forced to False above, so the
        # tail is appended even for the top-level call (unlike
        # get_text_from_node, which saves the flag first).  Presumably harmless
        # because the root node usually has no tail — confirm this is intended.
        if node.tail and not kwargs["is_top"]:
            text += node.tail

    return text
def make_links_clickable(href, string):
    """Wrap *string* in an <a> tag when *href* (or *string* itself) is a URL or path.

    Absolute paths ("/...") open in the same tab, http(s) URLs in a new one.
    Non-link hrefs are returned as plain *string*.
    """
    if not href:
        href = string

    if href == "":
        return string

    is_link = href[0] == "/" or href.startswith("http")
    if is_link:
        if "<" in href:
            # TODO: Bug in Cedrics. URLs can have formulas (https://aif.centre-mersenne.org/item/AIF_2013__63_1_155_0/ [6])
            href = href.split("<")[0]

        # Keep the label from its first markup tag onwards.
        tag_pos = string.find("<")
        if tag_pos > 0:
            string = string[tag_pos:]

        if not string:
            string = href

        if href[0] == "/":
            return f'<a href="{href}">{string}</a>'
        return f'<a href="{href}" target="_blank">{string}</a>'

    return string
def get_contrib_xml(contrib: "ContributorDict", is_ref=False):
    """Serialize a contributor dict into a JATS <contrib> XML fragment.

    :param contrib: contributor data (role, name parts, addresses, email, ids...)
    :param is_ref: when True, emit only the inner name/id elements without the
                   surrounding <contrib> tag (used inside references)
    :return: the XML string

    All optional keys are read with ``dict.get`` so a missing key never raises
    (the original ``contrib["string_name"]`` could raise KeyError).
    """
    xml = ""
    if not is_ref:
        xml = f'<contrib contrib-type="{contrib["role"]}"'
        if contrib.get("corresponding"):
            xml += ' corresp="yes"'
        if contrib.get("deceased_before_publication"):
            xml += ' deceased="yes"'
        # Any non-empty, truthy value marks an equal contribution.
        if contrib.get("equal_contrib"):
            xml += ' equal-contrib="yes"'
        xml += ">"

    name = ""

    if contrib.get("prefix"):
        name += f'<prefix>{escape(contrib["prefix"])}</prefix>'
    if contrib.get("last_name"):
        name += f'<surname>{escape(contrib["last_name"])}</surname>'
    if contrib.get("first_name"):
        name += f'<given-names>{escape(contrib["first_name"])}</given-names>'
    if contrib.get("suffix"):
        name += f'<suffix>{escape(contrib["suffix"])}</suffix>'

    if name == "":
        if contrib.get("string_name"):
            xml += f"<string-name>{contrib['string_name']}</string-name>"
        else:
            # TODO: Bug in Cedrics <nomcomplet> is ignored inside <bauteur> and <bediteur>
            xml += "<name/>"
    else:
        xml += f"<name>{name}</name>"

    for address in contrib.get("addresses", []):
        xml += "<address><addr-line>" + escape(address) + "</addr-line></address>"

    if contrib.get("email"):
        # Multiple addresses are stored joined with "{{{".
        emails = contrib["email"].split("{{{")
        for email in emails:
            xml += "<email>" + escape(email) + "</email>"
    if contrib.get("orcid"):
        xml += '<contrib-id contrib-id-type="orcid">' + escape(contrib["orcid"]) + "</contrib-id>"

    if contrib.get("idref"):
        xml += '<contrib-id contrib-id-type="idref">' + escape(contrib["idref"]) + "</contrib-id>"
    if not is_ref:
        xml += "</contrib>"

    return xml
def helper_update_name_params(params, use_initials=False):
    """Derive first/last name from string_name and clamp field lengths, in place.

    Splits ``string_name`` on the first comma only when ``last_name`` is empty
    (the part after the comma keeps its leading space, as before).  Then each
    field is truncated to its database column size.

    Note: *use_initials* is currently unused; kept for interface compatibility.
    """
    if params["string_name"] and not params["last_name"]:
        parts = params["string_name"].split(",")
        if len(parts) > 1:
            params["last_name"] = parts[0]
            params["first_name"] = parts[1]

    # Truncate to the maximum stored lengths.
    for key, limit in (
        ("first_name", 128),
        ("last_name", 128),
        ("string_name", 256),
        ("mid", 256),
    ):
        if len(params[key]) > limit:
            params[key] = params[key][:limit]
def normalise_span(value):
    """Remove superfluous <span ...> wrappers (and all </span>) from *value*.

    Bug fix: when a "<span" fragment has no closing '>' the original loop
    never advanced and spun forever; we now bail out instead, leaving the
    malformed fragment in place.
    """
    i = 0
    while i != -1:
        i = value.find("<span")
        if i > -1:
            j = value.find(">", i)
            if j > -1:
                value = value[0:i] + value[j + 1 :]
            else:
                # Malformed "<span" without '>': stop to avoid an infinite loop.
                i = -1
        value = value.replace("</span>", "")
    return value
def remove_html(string):
    """Return the concatenated text content of an HTML fragment ("" for empty input)."""
    if not string:
        return ""
    parsed = fromstring(string)
    return "".join(parsed.itertext())
def normalize_space(value):
    """Collapse extra whitespace in *value*.

    Leading whitespace is dropped, each run of space/tab/newline is reduced to
    its first character, and one trailing whitespace character is removed.
    Unlike the common ``" ".join(s.split())`` this preserves a non-breaking
    space: Python splits on it while XSLT ignores it.
    """
    skips = (" ", "\t", "\n")
    out = []
    for char in value:
        if char in skips and (not out or out[-1] in skips):
            # At the start of the string, or right after another whitespace
            # character: drop it.
            continue
        out.append(char)

    result = "".join(out)
    if len(result) > 1 and result[-1] in skips:
        result = result[:-1]
    return result
def clean_doi(value: str):
    """Strip any prefix before the "10." DOI stem and normalize its whitespace.

    NOTE(review): the whitespace normalization only runs when a prefix was
    stripped (pos > 0), mirroring the original grouping reconstructed from
    the dump — confirm against the original indentation.
    """
    pos = value.find("10.")
    if pos > 0:
        # Drop leading junk such as "https://doi.org/".
        value = value[pos:]
        value = normalize_space(value)

    return value
def int_to_Roman(num):
    """Convert a positive integer to a lowercase roman numeral ("" for num <= 0)."""
    table = (
        (1000, "m"), (900, "cm"), (500, "d"), (400, "cd"),
        (100, "c"), (90, "xc"), (50, "l"), (40, "xl"),
        (10, "x"), (9, "ix"), (5, "v"), (4, "iv"), (1, "i"),
    )
    pieces = []
    for value, symbol in table:
        while num >= value:
            pieces.append(symbol)
            num -= value
    return "".join(pieces)
def roman_to_int(s):
    """Parse a roman numeral (case-insensitive) into an integer.

    :type s: str
    :rtype: int
    :raises KeyError: on characters that are not roman digits
    """
    singles = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}
    # Subtractive pairs are matched greedily before single digits.
    pairs = {"IV": 4, "IX": 9, "XL": 40, "XC": 90, "CD": 400, "CM": 900}

    s = s.upper()
    total = 0
    pos = 0
    while pos < len(s):
        pair = s[pos : pos + 2]
        if pair in pairs:
            total += pairs[pair]
            pos += 2
        else:
            total += singles[s[pos]]
            pos += 1
    return total
def get_extid_value_from_link_data(link_data: ExtLinkDict):
    """
    Some links have an id to an external database (MR, ZBL, DOI, Numdam).
    Extract the link_type and value.

    :param link_data: dict with link data (ref, mimetype, location...)
    :return: (link_type, value), or (None, None) for unknown types
    """
    # rdoi: recommendation doi, used by PCI
    # preprint: id of the preprint, used by PCI
    known_types = [
        "jfm-item-id",
        "zbl-item-id",
        "mr-item-id",
        "nmid",
        "numdam-id",
        "mathdoc-id",
        "sps-id",
        "dmlid",
        "eudml-item-id",
        "doi",
        "eid",
        "arxiv",
        "tel",
        "hal",
        "theses.fr",
        "rdoi",
        "preprint",
        "pmid",
        "ark",
    ]

    # link_data['rel'] is the ext-link-type or the pub-id-type
    link_type = link_data["rel"] or ""

    # The value attribute is not required: fall back on the node's text.
    value = link_data["location"]
    if value == "":
        value = link_data["metadata"]
    value = value.strip()

    # Guess the type from the URL when 'rel' is missing.
    if link_type == "":
        if value.find("doi.org") > 0:
            link_type = "doi"
        elif value.find("arxiv.org") > 0:
            link_type = "arxiv"
        elif value.find("hal-") > 0:
            link_type = "hal"

    if link_type not in known_types:
        return (None, None)

    if link_type == "numdam-id":
        link_type = "mathdoc-id"

    if link_type == "doi":
        value = clean_doi(value)
    elif link_type == "arxiv":
        if link_data["metadata"] != "":
            value = link_data["metadata"].replace("arXiv:", "")
        else:
            value = link_data["location"]
            value = value.replace("http://arxiv.org/abs/", "").replace(
                "https://arxiv.org/abs/", ""
            )
    else:
        # As in the original flow, the raw (unstripped) metadata is kept here.
        value = link_data["metadata"]

    return (link_type, value)
def handle_pages(page_range):
    """Parse an "fpage-lpage" string into a pair of ints.

    Returns (None, None) when *page_range* is None (AttributeError) or is not
    exactly two dash-separated integers (ValueError).
    """
    try:
        first, last = page_range.split("-")
        return int(first), int(last)
    except (AttributeError, ValueError):
        return None, None
590def split_kwds(text):
591 list_ = text.split("$")
593 if len(list_) % 2 == 0:
594 # Formulas are encapsulated inside $$
595 # If the list_ size is odd (number of '$' is odd), do not attempt to split keywords
596 return [text]
598 kwds = []
599 cur_kwd = ""
600 for i, item in enumerate(list_):
601 if i % 2 == 0:
602 items = item.replace(";", ",").split(",")
603 if len(items) > 1:
604 kwds.append(cur_kwd + items[0])
605 kwds.extend(items[1:-1])
606 cur_kwd = items[-1]
607 else:
608 cur_kwd += item
609 else:
610 cur_kwd += "$" + item + "$"
612 if cur_kwd:
613 kwds.append(cur_kwd)
615 kwds = [kwd.strip() for kwd in kwds]
616 return kwds
def get_elsevier_image_extensions():
    """File extensions of the image formats found in Elsevier deliveries."""
    return "tif tiff gif png jpg jpeg jc3 eps jc4".split()