Coverage for src/ptf/cmds/xml/xml_utils.py: 54%

372 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-11-05 09:56 +0000

1import html 

2import os 

3 

4from lxml import etree 

5from lxml import objectify 

6from lxml.html import fromstring 

7 

8from ptf.model_data import ContributorDict 

9from ptf.model_data import ExtLinkDict 

10 

11 

# Unicode to XML
def escape(string: str):
    """Escape the XML-reserved characters (&, <, >) in *string*."""
    # single-pass translation, equivalent to chained str.replace calls
    return string.translate(str.maketrans({"&": "&amp;", "<": "&lt;", ">": "&gt;"}))

15 

16 

# Replace html entities like &phi; by their corresponding unicode characters
# except for XML reserved characters (& < >)
def replace_html_entities(text):
    # The MathML 2 entities are not always identical to the HTML entities.
    # See https://www.w3.org/TR/xml-entity-names/#changes20080721
    # Map the differences manually. Order matters: &phiv; must be handled
    # before &phi; (the latter is a prefix of the former).
    # cdrxml.xml files have XML/MathML (?) entities like &phiv;: they are
    # converted to unicode characters in recent /cedram_dev/exploitation
    # files (AIF > 2013) but kept intact in old ones.
    mathml_overrides = (
        ("&varepsilon;", chr(949)),
        ("&OverBar;", chr(175)),
        ("&UnderBar;", " " + chr(818)),
        ("&eacute;", chr(233)),
        ("&Eacute;", chr(201)),
        ("&ccedil;", chr(231)),
        ("&Ccedil;", chr(199)),
        ("&phiv;", chr(966)),
        ("&phi;", chr(981)),
    )
    for entity, replacement in mathml_overrides:
        text = text.replace(entity, replacement)

    # text has html entities like &phi; that must become unicode characters,
    # but html.unescape() would also resolve &lt; &gt; &amp;.
    # The proper solution would be to skip get_xml_from_node and keep parsing
    # the MathML nodes recursively; instead we hide the reserved entities,
    # unescape, then restore them.
    text = text.replace("&lt;", "&mylt;").replace("&gt;", "&mygt;").replace("&amp;", "&myamp;")
    text = html.unescape(text)
    text = text.replace("&mylt;", "&lt;").replace("&mygt;", "&gt;").replace("&myamp;", "&amp;")

    # Bug in html.unescape ? It maps some code points onto different ones;
    # put the expected characters back.
    unescape_fixups = (
        (chr(10216), chr(9001)),
        (chr(10217), chr(9002)),
        (chr(10214), chr(12314)),
        (chr(10215), chr(12315)),
        (chr(9183), chr(65080)),
    )
    for produced, expected in unescape_fixups:
        text = text.replace(produced, expected)

    return text

52 

53 

def normalize(name):
    """Return the local part of a Clark-notation tag name.

    "{http://ns}tag" -> "tag"; any other name is returned unchanged.

    Fix: use startswith instead of indexing name[0], so an empty string no
    longer raises IndexError and is returned unchanged.
    """
    if name.startswith("{"):
        _, tag = name[1:].split("}")
        return tag
    return name

59 

60 

def get_xml_file_count(folder):
    """Count subdirectories of *folder* holding an XML file named after them.

    A directory <d> is counted when folder/<d>/<d>.xml exists. Only walk
    roots whose path contains fewer than 3 separators are considered
    (NOTE(review): this depth heuristic depends on the absolute path of
    *folder* — confirm it matches the expected tree layout).
    """
    count = 0
    for root, dirs, _files in os.walk(folder):
        if root.count(os.path.sep) >= 3:
            continue  # too deep: skip this root entirely
        for sub in dirs:
            candidate = os.path.join(folder, sub, sub + ".xml")
            if os.path.isfile(candidate):
                count += 1
    return count

71 

72 

def get_xml_from_text(tag, text):
    """Wrap *text* inside a <tag> element and return its XML serialization."""
    element = etree.Element(tag)
    element.text = text
    return etree.tostring(element, encoding="UTF-8").decode("utf-8")

79 

80 

def remove_namespace(tree):
    """Strip the namespace part from every tag of *tree*, in place.

    Also removes the now-unused namespace declarations and xsi:nil
    annotations via objectify.deannotate.

    Fix: Element.getiterator() has been deprecated for a long time and was
    removed in lxml 5; Element.iter() is the equivalent replacement.
    """
    for elem in tree.iter():
        if not hasattr(elem.tag, "find"):
            continue  # comments / processing instructions: tag is not a string
        i = elem.tag.find("}")
        if i >= 0:
            elem.tag = elem.tag[i + 1 :]
    objectify.deannotate(tree, cleanup_namespaces=True, xsi_nil=True)

89 

90 

def get_normalized_attrib(node, attrib_name):
    """Return the value of *attrib_name* on *node*, ignoring namespaces.

    When several attributes normalize to the same local name, the last one
    wins (matching the original scan order). Returns None when *node* is
    None or no attribute matches.
    """
    found = None
    if node is not None:
        for qualified_name, value in node.attrib.items():
            if normalize(qualified_name) == attrib_name:
                found = value
    return found

100 

101 

def get_xml_from_node(node):
    """Serialize *node* to an XML unicode string ('' when node is None)."""
    if node is None:
        return ""
    return etree.tostring(
        node, encoding="unicode", method="xml", xml_declaration=False, with_tail=False
    )

109 

110 

def get_xml_from_node2(node, with_tail=False):
    """Recursively serialize *node* using namespace-free tag names.

    NOTE: attributes are dropped and text is emitted unescaped; only tags,
    text and (for non-top nodes) tails are preserved.
    """
    tag = normalize(node.tag)

    parts = ["<" + tag + ">"]
    if node.text:
        parts.append(node.text)
    # children always carry their tails (with_tail=True)
    parts.extend(get_xml_from_node2(child, True) for child in node)
    parts.append("</" + tag + ">")
    if with_tail and node.tail:
        parts.append(node.tail)

    return "".join(parts)

127 

128 

# tostring is a useless fonction for 'text': it simply removes the HTML entities !
def get_old_text_from_node(node):
    """Serialize *node* as plain text via lxml ('' when node is None)."""
    if node is None:
        return ""
    return etree.tostring(
        node, encoding="unicode", method="text", xml_declaration=False, with_tail=False
    )

137 

138 

def get_text_from_node(node, **kwargs):
    """Concatenate the text of *node* and its descendants.

    HTML entities are resolved via replace_html_entities. Tails of
    descendants are included; the top-level node's own tail is skipped
    (tracked through kwargs["is_top"]).
    """
    is_top = kwargs.setdefault("is_top", True)

    if node is None:
        return ""

    parts = []
    if node.text is not None:
        parts.append(replace_html_entities(node.text))

    kwargs["is_top"] = False  # every recursive call below is non-top
    parts.extend(get_text_from_node(child, **kwargs) for child in node)

    if not is_top and node.tail is not None:
        parts.append(replace_html_entities(node.tail))

    return "".join(parts)

156 

157 

def fix_mfenced_in_mathml(text):
    """Move stray fence characters back into <mfenced> open/close attributes.

    After serialization, an <mfenced open="X" close="Y"> can end up as loose
    fence characters placed as text just before the <mfenced tag, while the
    attributes become open=""/close="". This scans the string for such stray
    characters (one or two of them) and writes them back into the first empty
    open=""/close="" attributes that follow.
    NOTE(review): inferred from the code and the lxml/tostring comments
    below — confirm against the serialization bug it works around.
    """
    i = 0
    keep_testing = True
    while keep_testing:
        # next occurrence of <mfenced; -1 ends the loop after this pass
        i = text.find("<mfenced", i)
        keep_testing = i > -1
        # only act when the tag is preceded by loose text, not directly by '>'
        if i > 0 and text[i - 1] != ">":
            # back up to the previous '>' to isolate the stray text
            j = i - 1
            while j > 0 and text[j] != ">":
                j -= 1
            mfenced = text[j + 1 : i].strip()
            # one char: a single fence; two chars: both fences
            if 0 < len(mfenced) < 3:
                if len(mfenced) == 1:
                    first = mfenced
                    second = ""
                else:
                    first = mfenced[0]
                    second = mfenced[1]

                # split the string around the stray text (which is dropped)
                left = text[: j + 1]
                right = text[i:]

                if second == "":
                    # single character: decide open vs close from its shape
                    if mfenced in ("{", "("):
                        open_c = mfenced
                        close_c = ""
                    else:
                        close_c = mfenced
                        open_c = ""
                else:
                    # two characters: assign them in the order the empty
                    # open=""/close="" attributes appear in the remainder
                    ri = right.find('open=""')
                    rj = right.find('close=""')
                    if ri < rj:
                        open_c = first
                        close_c = second
                    else:
                        open_c = second
                        close_c = first
                # fill only the first empty pair (count=1)
                right = right.replace('open=""', 'open="' + open_c + '"', 1)
                right = right.replace('close=""', 'close="' + close_c + '"', 1)
                text = left + right
        # advance past the current match (also turns the final -1 into 0)
        i += 1

    return text

    # chars = ('∥', '|')
    # for c in chars:
    #     if c + c in math_node_text:
    #         l = math_node_text.split(c + c)
    #         # Bug in lxml. A formula with open="∥" becomes wrong with tostring
    #         # A proper solution would be to rewrite get_xml_from_node and stop using tostring
    #         end_ = l[1].replace('open=""', 'open="' + c + '"', 1).replace('close=""', 'close="' + c + '"', 1)
    #         math_node_text = l[0] + end_

212 

def add_mml_ns(node):
    """Recursively requalify *node* and its descendants into the MathML namespace."""
    if node is None:
        return

    local_name = normalize(node.tag)
    node.tag = etree.QName("http://www.w3.org/1998/Math/MathML", local_name)

    for descendant in node:
        add_mml_ns(descendant)

223 

224 

def get_text_from_original_title_with_mathml(xml, **kwargs):
    """Extract the title text (with optional MathML) from a title-group XML string.

    Only the main language is kept. With kwargs["get_trans_title"]=True the
    first <trans-title> inside a <trans-title-group> is used instead of the
    main title node. Returns None when no matching title node is found.
    """
    parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
    )
    etree.register_namespace("mml", "http://www.w3.org/1998/Math/MathML")
    cleaned = xml.replace('xmlns:xlink="http://www.w3.org/1999/xlink"', "")
    tree = etree.fromstring(cleaned.encode("utf-8"), parser=parser)

    get_trans_title = kwargs.get("get_trans_title", False)
    main_title_tags = ("title", "journal-title", "article-title", "book-title")

    for node in tree:
        tag = normalize(node.tag)
        if get_trans_title:
            if tag == "trans-title-group":
                for child in node:
                    if normalize(child.tag) == "trans-title":
                        return get_text_from_node_with_mathml(child, **kwargs)
        elif tag in main_title_tags:
            return get_text_from_node_with_mathml(node, **kwargs)

250 

251 

def get_text_from_xml_with_mathml(xml, **kwargs):
    """Parse an XML string and return its text, optionally keeping MathML formulas."""
    parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
    )
    etree.register_namespace("mml", "http://www.w3.org/1998/Math/MathML")
    cleaned = xml.replace('xmlns:xlink="http://www.w3.org/1999/xlink"', "")
    root = etree.fromstring(cleaned.encode("utf-8"), parser=parser)
    return get_text_from_node_with_mathml(root, **kwargs)

262 

263 

def get_text_from_node_with_mathml(node, **kwargs):
    """Return the text of *node*, serializing formulas as MathML or TeX.

    kwargs:
        is_top (bool, default True): True only for the root call.
        with_mathml (bool, default False): inside <alternatives>, keep the
            <math> alternative (with the mml namespace re-added) instead of
            the <tex-math> alternative.
    """
    text = ""

    if node is None:
        return text

    # default the recursion flags on the first call; children inherit them
    # through the same kwargs dict
    kwargs["is_top"] = kwargs["is_top"] if "is_top" in kwargs else True
    kwargs["with_mathml"] = kwargs["with_mathml"] if "with_mathml" in kwargs else False

    tag = normalize(node.tag)

    if tag == "inline-formula" or tag == "disp-formula":
        remove_namespace(node)

        # a formula node carries <alternatives> with <math> and <tex-math>
        # children; keep exactly one of them depending on with_mathml
        for child in node:
            tag = normalize(child.tag)
            if tag == "alternatives":
                for alternative in child:
                    tag = normalize(alternative.tag)
                    if tag == "math" and kwargs["with_mathml"]:
                        add_mml_ns(alternative)
                        text = get_xml_from_node(alternative)
                    elif tag == "tex-math" and not kwargs["with_mathml"]:
                        text = get_xml_from_node(alternative)

    else:
        if node.text:
            text += node.text
            # NOTE(review): escape() is applied to the node's own text only;
            # child text and tails below are appended unescaped — confirm
            # this asymmetry is intended.
            text = escape(text)

        kwargs["is_top"] = False

        for child in node:
            child_text = get_text_from_node_with_mathml(child, **kwargs)
            text += child_text

        # NOTE(review): is_top was just forced to False above, so this
        # condition is always true here and even the root node's tail is
        # appended — confirm that is the intended behavior.
        if node.tail and not kwargs["is_top"]:
            text += node.tail

    return text

304 

305 

def make_links_clickable(href, string):
    """Wrap *string* in an <a> element when *href* looks like a link.

    Falls back to *string* as the href when href is empty; absolute paths
    get a plain anchor, http(s) URLs open in a new tab, anything else is
    returned unchanged.
    """
    if not href:
        href = string
    if href == "":
        return string

    if href[0] == "/" or href.startswith("http"):
        # TODO: Bug in Cedrics. URLs can have formulas (https://aif.centre-mersenne.org/item/AIF_2013__63_1_155_0/ [6])
        if "<" in href:
            href = href.split("<")[0]

        # keep only the markup part of the label when text precedes a tag
        lt_pos = string.find("<")
        if lt_pos > 0:
            string = string[lt_pos:]

        if not string:
            string = href

    if href[0] == "/" or href.startswith("http"):
        if href[0] == "/":
            return f'<a href="{href}">{string}</a>'
        return f'<a href="{href}" target="_blank">{string}</a>'

    return string

332 

333 

def get_contrib_xml(contrib: ContributorDict, is_ref=False):
    """Serialize a contributor dict to JATS-like XML.

    With is_ref=True only the inner name/address/id parts are emitted,
    without the surrounding <contrib> element.
    """
    fragments = []

    if not is_ref:
        opening = f'<contrib contrib-type="{contrib["role"]}"'
        if contrib.get("corresponding"):
            opening += ' corresp="yes"'
        if contrib.get("deceased_before_publication"):
            opening += ' deceased="yes"'
        # truthiness subsumes the original's explicit != "" test
        if contrib.get("equal_contrib"):
            opening += ' equal-contrib="yes"'
        fragments.append(opening + ">")

    # structured name parts, in JATS order
    name_parts = []
    for key, jats_tag in (
        ("prefix", "prefix"),
        ("last_name", "surname"),
        ("first_name", "given-names"),
        ("suffix", "suffix"),
    ):
        if contrib.get(key):
            name_parts.append(f"<{jats_tag}>{escape(contrib[key])}</{jats_tag}>")
    name = "".join(name_parts)

    if name:
        fragments.append(f"<name>{name}</name>")
    elif contrib["string_name"]:
        fragments.append(f"<string-name>{contrib['string_name']}</string-name>")
    else:
        # TODO: Bug in Cedrics <nomcomplet> is ignored inside <bauteur> and <bediteur>
        fragments.append("<name/>")

    for address in contrib.get("addresses", []):
        fragments.append("<address><addr-line>" + escape(address) + "</addr-line></address>")

    if contrib.get("email"):
        # multiple addresses are joined with the "{{{" marker
        for email in contrib["email"].split("{{{"):
            fragments.append("<email>" + escape(email) + "</email>")

    if contrib.get("orcid"):
        fragments.append(
            '<contrib-id contrib-id-type="orcid">' + escape(contrib["orcid"]) + "</contrib-id>"
        )
    if contrib.get("idref"):
        fragments.append(
            '<contrib-id contrib-id-type="idref">' + escape(contrib["idref"]) + "</contrib-id>"
        )

    if not is_ref:
        fragments.append("</contrib>")

    return "".join(fragments)

387 

388 

def helper_update_name_params(params, use_initials=False):
    """Derive first/last name from string_name and clamp field lengths, in place.

    NOTE: *use_initials* is currently unused; it is kept for interface
    compatibility with existing callers.
    """
    # Split "Last,First" only when last_name has not been provided
    if params["string_name"] and not params["last_name"]:
        pieces = params["string_name"].split(",")
        if len(pieces) > 1:
            params["last_name"] = pieces[0]
            params["first_name"] = pieces[1]

    # Clamp each field to its storage size
    for field, max_len in (
        ("first_name", 128),
        ("last_name", 128),
        ("string_name", 256),
        ("mid", 256),
    ):
        if len(params[field]) > max_len:
            params[field] = params[field][:max_len]

405 

406 

def normalise_span(value):
    """Remove superfluous <span ...> wrappers (and their </span>) from *value*.

    Closing </span> tags are only stripped when at least one opening <span
    was found, matching the original behavior.

    Fix: the original looped forever on a malformed "<span" with no closing
    '>' (the string was never modified, so find() kept returning the same
    index); we now stop scanning in that case.
    """
    i = value.find("<span")
    while i != -1:
        j = value.find(">", i)
        if j == -1:
            break  # malformed opening tag: nothing more we can safely strip
        # drop the whole opening tag, then all closing tags
        value = value[:i] + value[j + 1 :]
        value = value.replace("</span>", "")
        i = value.find("<span")
    return value

419 

420 

def remove_html(string):
    """Return the text content of an HTML fragment ('' for falsy input)."""
    if not string:
        return ""
    tree = fromstring(string)
    return "".join(tree.itertext())

425 

426 

def normalize_space(value):
    # Supprime les espaces en trop dans les textes
    #
    # The common answer " ".join(s.split()) does not work here: Python's
    # split() also consumes a nbsp, while xslt ignores it. We therefore only
    # treat space, tab and newline as whitespace, keep the FIRST whitespace
    # character of each run, and trim at both ends.
    whitespace = (" ", "\t", "\n")
    chars = []
    prev_was_ws = True  # True at start so leading whitespace is dropped
    for ch in value:
        is_ws = ch in whitespace
        if is_ws and prev_was_ws:
            continue
        chars.append(ch)
        prev_was_ws = is_ws

    result = "".join(chars)
    # at most one trailing whitespace char can remain; drop it
    if len(result) > 1 and result[-1] in whitespace:
        result = result[:-1]

    return result

451 

452 

def clean_doi(value: str):
    """Strip any URL prefix before the '10.' DOI root and normalize whitespace."""
    start = value.find("10.")
    if start > 0:
        value = value[start:]
    return normalize_space(value)

460 

461 

def int_to_Roman(num):
    """Convert a non-negative integer to a lowercase Roman numeral ('' for 0)."""
    table = (
        (1000, "m"), (900, "cm"), (500, "d"), (400, "cd"),
        (100, "c"), (90, "xc"), (50, "l"), (40, "xl"),
        (10, "x"), (9, "ix"), (5, "v"), (4, "iv"), (1, "i"),
    )
    numeral = ""
    for value, symbol in table:
        # greedy: take each symbol as many times as it fits
        while num >= value:
            numeral += symbol
            num -= value
    return numeral

473 

474 

def roman_to_int(s):
    """
    :type s: str
    :rtype: int
    """
    # Two-character subtractive pairs are listed explicitly so that only the
    # standard pairs (IV, IX, XL, XC, CD, CM) are treated as subtractive.
    values = {
        "I": 1,
        "V": 5,
        "X": 10,
        "L": 50,
        "C": 100,
        "D": 500,
        "M": 1000,
        "IV": 4,
        "IX": 9,
        "XL": 40,
        "XC": 90,
        "CD": 400,
        "CM": 900,
    }
    s = s.upper()
    total = 0
    pos = 0
    length = len(s)
    while pos < length:
        pair = s[pos : pos + 2]
        if len(pair) == 2 and pair in values:
            total += values[pair]
            pos += 2
        else:
            total += values[s[pos]]
            pos += 1
    return total

506 

507 

def get_extid_value_from_link_data(link_data: ExtLinkDict):
    """
    Some links have an id to an external database (MR, ZBL, DOI, Numdam).
    Extract the link_type and value

    :param link_data: dict with link data (ref, mimetype, location...)
    :return: (link_type, value), or (None, None) for unknown link types
    """

    # rdoi: recommendation doi, used by PCI
    # preprint: id of the preprint, used by PCI
    referentials = [
        "jfm-item-id",
        "zbl-item-id",
        "mr-item-id",
        "nmid",
        "numdam-id",
        "mathdoc-id",
        "sps-id",
        "dmlid",
        "eudml-item-id",
        "doi",
        "eid",
        "arxiv",
        "tel",
        "hal",
        "theses.fr",
        "rdoi",
        "preprint",
        "pmid",
        "ark",
    ]

    # link_data['rel'] is the ext-link-type or the pub-id-type
    link_type = link_data["rel"] or ""

    # The value attribute is not required. Use the node's text when href is empty.
    value = link_data["location"]
    if value == "":
        value = link_data["metadata"]
    value = value.strip()

    # Guess the type from the URL when no rel is given.
    # Fix: test with "!= -1" — the original used "> 0", which missed markers
    # appearing at the very start of the value (e.g. "hal-01234...").
    if link_type == "":
        if value.find("doi.org") != -1:
            link_type = "doi"
        elif value.find("arxiv.org") != -1:
            link_type = "arxiv"
        elif value.find("hal-") != -1:
            link_type = "hal"

    extid_value = (None, None)

    if link_type in referentials:
        if link_type == "numdam-id":
            link_type = "mathdoc-id"

        if link_type == "doi":
            value = clean_doi(value)
        elif link_type == "arxiv":
            if link_data["metadata"] != "":
                value = link_data["metadata"].replace("arXiv:", "")
            else:
                value = link_data["location"]
                value = value.replace("http://arxiv.org/abs/", "").replace(
                    "https://arxiv.org/abs/", ""
                )
        else:
            # for the other referentials the id is carried by the node's text
            value = link_data["metadata"]

        extid_value = (link_type, value)

    return extid_value

579 

580 

def handle_pages(page_range):
    """Parse a "first-last" page range into an int pair.

    Returns (None, None) when page_range is None, is not a single
    "first-last" pair, or contains non-numeric pages.
    """
    try:
        first, last = page_range.split("-")
        return int(first), int(last)
    except (AttributeError, ValueError):
        # AttributeError: page_range is None; ValueError: bad shape or non-int
        return None, None

588 

589 

def split_kwds(text):
    """Split a keyword string on ',' and ';' while keeping $...$ formulas intact.

    Splitting on '$' yields formula pieces at odd indices. When the number of
    '$' is odd (even piece count) the formulas cannot be delimited reliably
    and the whole text is returned as a single keyword.
    """
    pieces = text.split("$")

    if len(pieces) % 2 == 0:
        return [text]

    keywords = []
    pending = ""
    for index, piece in enumerate(pieces):
        if index % 2 == 1:
            # formula piece: re-wrap in $ and glue to the current keyword
            pending += "$" + piece + "$"
            continue
        parts = piece.replace(";", ",").split(",")
        if len(parts) == 1:
            pending += piece
        else:
            keywords.append(pending + parts[0])
            keywords.extend(parts[1:-1])
            pending = parts[-1]

    if pending:
        keywords.append(pending)

    return [kwd.strip() for kwd in keywords]

617 

618 

def get_elsevier_image_extensions():
    """Return the image file extensions used by Elsevier deliveries."""
    return "tif tiff gif png jpg jpeg jc3 eps jc4".split()