Coverage for src/ptf/cmds/xml/xml_utils.py: 54%

372 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-11-05 09:56 +0000

1import html 

2import os 

3 

4from lxml import etree 

5from lxml import objectify 

6from lxml.html import fromstring 

7 

8from ptf.model_data import ContributorDict 

9from ptf.model_data import ExtLinkDict 

10 

11 

# Unicode to XML
def escape(string: str):
    """Escape the XML-reserved characters (&, <, >) in *string*."""
    # single-pass translation, equivalent to chained str.replace calls
    return string.translate(str.maketrans({"&": "&amp;", "<": "&lt;", ">": "&gt;"}))

15 

16 

# Replace html entities like &phi; by their corresponding unicode characters
# except for XML reserved characters (& < >)
def replace_html_entities(text):
    # The MathML 2 entities are not always identical to the HTML entities.
    # See https://www.w3.org/TR/xml-entity-names/#changes20080721
    # Map the differences manually. Order matters: &phiv; must be handled
    # before &phi; (the latter is a prefix of the former).
    # cdrxml.xml files have XML/MathML (?) entities like &phiv;: they are
    # converted to unicode characters in recent /cedram_dev/exploitation
    # files (AIF > 2013) but kept intact in old ones.
    mathml_overrides = (
        ("&varepsilon;", chr(949)),
        ("&OverBar;", chr(175)),
        ("&UnderBar;", " " + chr(818)),
        ("&eacute;", chr(233)),
        ("&Eacute;", chr(201)),
        ("&ccedil;", chr(231)),
        ("&Ccedil;", chr(199)),
        ("&phiv;", chr(966)),
        ("&phi;", chr(981)),
    )
    for entity, replacement in mathml_overrides:
        text = text.replace(entity, replacement)

    # text has html entities like &phi; that must become unicode characters,
    # but html.unescape() would also resolve &lt; &gt; &amp;.
    # The proper solution would be to skip get_xml_from_node and keep parsing
    # the MathML nodes recursively; instead we hide the reserved entities,
    # unescape, then restore them.
    text = text.replace("&lt;", "&mylt;").replace("&gt;", "&mygt;").replace("&amp;", "&myamp;")
    text = html.unescape(text)
    text = text.replace("&mylt;", "&lt;").replace("&mygt;", "&gt;").replace("&myamp;", "&amp;")

    # Bug in html.unescape ? It maps some code points onto different ones;
    # put the expected characters back.
    unescape_fixups = (
        (chr(10216), chr(9001)),
        (chr(10217), chr(9002)),
        (chr(10214), chr(12314)),
        (chr(10215), chr(12315)),
        (chr(9183), chr(65080)),
    )
    for produced, expected in unescape_fixups:
        text = text.replace(produced, expected)

    return text

52 

53 

def normalize(name):
    """Return the local part of a Clark-notation tag name.

    "{http://ns}tag" -> "tag"; any other name is returned unchanged.

    Fix: use startswith instead of indexing name[0], so an empty string no
    longer raises IndexError and is returned unchanged.
    """
    if name.startswith("{"):
        _, tag = name[1:].split("}")
        return tag
    return name

59 

60 

def get_xml_file_count(folder):
    """Count subdirectories of *folder* holding an XML file named after them.

    A directory <d> is counted when folder/<d>/<d>.xml exists. Only walk
    roots whose path contains fewer than 3 separators are considered
    (NOTE(review): this depth heuristic depends on the absolute path of
    *folder* — confirm it matches the expected tree layout).
    """
    count = 0
    for root, dirs, _files in os.walk(folder):
        if root.count(os.path.sep) >= 3:
            continue  # too deep: skip this root entirely
        for sub in dirs:
            candidate = os.path.join(folder, sub, sub + ".xml")
            if os.path.isfile(candidate):
                count += 1
    return count

71 

72 

def get_xml_from_text(tag, text):
    """Wrap *text* inside a <tag> element and return its XML serialization."""
    element = etree.Element(tag)
    element.text = text
    return etree.tostring(element, encoding="UTF-8").decode("utf-8")

79 

80 

def remove_namespace(tree):
    """Strip the namespace part from every tag of *tree*, in place.

    Also removes the now-unused namespace declarations and xsi:nil
    annotations via objectify.deannotate.

    Fix: Element.getiterator() has been deprecated for a long time and was
    removed in lxml 5; Element.iter() is the equivalent replacement.
    """
    for elem in tree.iter():
        if not hasattr(elem.tag, "find"):
            continue  # comments / processing instructions: tag is not a string
        i = elem.tag.find("}")
        if i >= 0:
            elem.tag = elem.tag[i + 1 :]
    objectify.deannotate(tree, cleanup_namespaces=True, xsi_nil=True)

89 

90 

def get_normalized_attrib(node, attrib_name):
    """Return the value of *attrib_name* on *node*, ignoring namespaces.

    When several attributes normalize to the same local name, the last one
    wins (matching the original scan order). Returns None when *node* is
    None or no attribute matches.
    """
    found = None
    if node is not None:
        for qualified_name, value in node.attrib.items():
            if normalize(qualified_name) == attrib_name:
                found = value
    return found

100 

101 

def get_xml_from_node(node):
    """Serialize *node* to an XML unicode string ('' when node is None)."""
    if node is None:
        return ""
    return etree.tostring(
        node, encoding="unicode", method="xml", xml_declaration=False, with_tail=False
    )

109 

110 

def get_xml_from_node2(node, with_tail=False):
    """Recursively serialize *node* using namespace-free tag names.

    NOTE: attributes are dropped and text is emitted unescaped; only tags,
    text and (for non-top nodes) tails are preserved.
    """
    tag = normalize(node.tag)

    parts = ["<" + tag + ">"]
    if node.text:
        parts.append(node.text)
    # children always carry their tails (with_tail=True)
    parts.extend(get_xml_from_node2(child, True) for child in node)
    parts.append("</" + tag + ">")
    if with_tail and node.tail:
        parts.append(node.tail)

    return "".join(parts)

127 

128 

# tostring is a useless fonction for 'text': it simply removes the HTML entities !
def get_old_text_from_node(node):
    """Serialize *node* as plain text via lxml ('' when node is None)."""
    if node is None:
        return ""
    return etree.tostring(
        node, encoding="unicode", method="text", xml_declaration=False, with_tail=False
    )

137 

138 

def get_text_from_node(node, **kwargs):
    """Concatenate the text of *node* and its descendants.

    HTML entities are resolved via replace_html_entities. Tails of
    descendants are included; the top-level node's own tail is skipped
    (tracked through kwargs["is_top"]).
    """
    is_top = kwargs.setdefault("is_top", True)

    if node is None:
        return ""

    parts = []
    if node.text is not None:
        parts.append(replace_html_entities(node.text))

    kwargs["is_top"] = False  # every recursive call below is non-top
    parts.extend(get_text_from_node(child, **kwargs) for child in node)

    if not is_top and node.tail is not None:
        parts.append(replace_html_entities(node.tail))

    return "".join(parts)

156 

157 

def fix_mfenced_in_mathml(text):
    """Move stray fence characters back into <mfenced> open/close attributes.

    After serialization, an <mfenced open="X" close="Y"> can end up as loose
    fence characters placed as text just before the <mfenced tag, while the
    attributes become open=""/close="". This scans the string for such stray
    characters (one or two of them) and writes them back into the first empty
    open=""/close="" attributes that follow.
    NOTE(review): inferred from the code and the lxml/tostring comments
    below — confirm against the serialization bug it works around.
    """
    i = 0
    keep_testing = True
    while keep_testing:
        # next occurrence of <mfenced; -1 ends the loop after this pass
        i = text.find("<mfenced", i)
        keep_testing = i > -1
        # only act when the tag is preceded by loose text, not directly by '>'
        if i > 0 and text[i - 1] != ">":
            # back up to the previous '>' to isolate the stray text
            j = i - 1
            while j > 0 and text[j] != ">":
                j -= 1
            mfenced = text[j + 1 : i].strip()
            # one char: a single fence; two chars: both fences
            if 0 < len(mfenced) < 3:
                if len(mfenced) == 1:
                    first = mfenced
                    second = ""
                else:
                    first = mfenced[0]
                    second = mfenced[1]

                # split the string around the stray text (which is dropped)
                left = text[: j + 1]
                right = text[i:]

                if second == "":
                    # single character: decide open vs close from its shape
                    if mfenced in ("{", "("):
                        open_c = mfenced
                        close_c = ""
                    else:
                        close_c = mfenced
                        open_c = ""
                else:
                    # two characters: assign them in the order the empty
                    # open=""/close="" attributes appear in the remainder
                    ri = right.find('open=""')
                    rj = right.find('close=""')
                    if ri < rj:
                        open_c = first
                        close_c = second
                    else:
                        open_c = second
                        close_c = first
                # fill only the first empty pair (count=1)
                right = right.replace('open=""', 'open="' + open_c + '"', 1)
                right = right.replace('close=""', 'close="' + close_c + '"', 1)
                text = left + right
        # advance past the current match (also turns the final -1 into 0)
        i += 1

    return text

    # chars = ('∥', '|')
    # for c in chars:
    #     if c + c in math_node_text:
    #         l = math_node_text.split(c + c)
    #         # Bug in lxml. A formula with open="∥" becomes wrong with tostring
    #         # A proper solution would be to rewrite get_xml_from_node and stop using tostring
    #         end_ = l[1].replace('open=""', 'open="' + c + '"', 1).replace('close=""', 'close="' + c + '"', 1)
    #         math_node_text = l[0] + end_

212 

def add_mml_ns(node):
    """Recursively requalify *node* and its descendants into the MathML namespace."""
    if node is None:
        return

    local_name = normalize(node.tag)
    node.tag = etree.QName("http://www.w3.org/1998/Math/MathML", local_name)

    for descendant in node:
        add_mml_ns(descendant)

223 

224 

def get_text_from_original_title_with_mathml(xml, **kwargs):
    """Extract the title text (with optional MathML) from a title-group XML string.

    Only the main language is kept. With kwargs["get_trans_title"]=True the
    first <trans-title> inside a <trans-title-group> is used instead of the
    main title node. Returns None when no matching title node is found.
    """
    parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
    )
    etree.register_namespace("mml", "http://www.w3.org/1998/Math/MathML")
    cleaned = xml.replace('xmlns:xlink="http://www.w3.org/1999/xlink"', "")
    tree = etree.fromstring(cleaned.encode("utf-8"), parser=parser)

    get_trans_title = kwargs.get("get_trans_title", False)
    main_title_tags = ("title", "journal-title", "article-title", "book-title")

    for node in tree:
        tag = normalize(node.tag)
        if get_trans_title:
            if tag == "trans-title-group":
                for child in node:
                    if normalize(child.tag) == "trans-title":
                        return get_text_from_node_with_mathml(child, **kwargs)
        elif tag in main_title_tags:
            return get_text_from_node_with_mathml(node, **kwargs)

250 

251 

def get_text_from_xml_with_mathml(xml, **kwargs):
    """Parse an XML string and return its text, optionally keeping MathML formulas."""
    parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
    )
    etree.register_namespace("mml", "http://www.w3.org/1998/Math/MathML")
    cleaned = xml.replace('xmlns:xlink="http://www.w3.org/1999/xlink"', "")
    root = etree.fromstring(cleaned.encode("utf-8"), parser=parser)
    return get_text_from_node_with_mathml(root, **kwargs)

262 

263 

def get_text_from_node_with_mathml(node, **kwargs):
    """Return the text of *node*, serializing formulas as MathML or TeX.

    kwargs:
        is_top (bool, default True): True only for the root call.
        with_mathml (bool, default False): inside <alternatives>, keep the
            <math> alternative (with the mml namespace re-added) instead of
            the <tex-math> alternative.
    """
    text = ""

    if node is None:
        return text

    # default the recursion flags on the first call; children inherit them
    # through the same kwargs dict
    kwargs["is_top"] = kwargs["is_top"] if "is_top" in kwargs else True
    kwargs["with_mathml"] = kwargs["with_mathml"] if "with_mathml" in kwargs else False

    tag = normalize(node.tag)

    if tag == "inline-formula" or tag == "disp-formula":
        remove_namespace(node)

        # a formula node carries <alternatives> with <math> and <tex-math>
        # children; keep exactly one of them depending on with_mathml
        for child in node:
            tag = normalize(child.tag)
            if tag == "alternatives":
                for alternative in child:
                    tag = normalize(alternative.tag)
                    if tag == "math" and kwargs["with_mathml"]:
                        add_mml_ns(alternative)
                        text = get_xml_from_node(alternative)
                    elif tag == "tex-math" and not kwargs["with_mathml"]:
                        text = get_xml_from_node(alternative)

    else:
        if node.text:
            text += node.text
            # NOTE(review): escape() is applied to the node's own text only;
            # child text and tails below are appended unescaped — confirm
            # this asymmetry is intended.
            text = escape(text)

        kwargs["is_top"] = False

        for child in node:
            child_text = get_text_from_node_with_mathml(child, **kwargs)
            text += child_text

        # NOTE(review): is_top was just forced to False above, so this
        # condition is always true here and even the root node's tail is
        # appended — confirm that is the intended behavior.
        if node.tail and not kwargs["is_top"]:
            text += node.tail

    return text

304 

305 

def make_links_clickable(href, string):
    """Wrap *string* in an <a> element when *href* looks like a link.

    Falls back to *string* as the href when href is empty; absolute paths
    get a plain anchor, http(s) URLs open in a new tab, anything else is
    returned unchanged.
    """
    if not href:
        href = string
    if href == "":
        return string

    if href[0] == "/" or href.startswith("http"):
        # TODO: Bug in Cedrics. URLs can have formulas (https://aif.centre-mersenne.org/item/AIF_2013__63_1_155_0/ [6])
        if "<" in href:
            href = href.split("<")[0]

        # keep only the markup part of the label when text precedes a tag
        lt_pos = string.find("<")
        if lt_pos > 0:
            string = string[lt_pos:]

        if not string:
            string = href

    if href[0] == "/" or href.startswith("http"):
        if href[0] == "/":
            return f'<a href="{href}">{string}</a>'
        return f'<a href="{href}" target="_blank">{string}</a>'

    return string

332 

333 

def get_contrib_xml(contrib: ContributorDict, is_ref=False):
    """Serialize a contributor dict to JATS-like XML.

    With is_ref=True only the inner name/address/id parts are emitted,
    without the surrounding <contrib> element.
    """
    fragments = []

    if not is_ref:
        opening = f'<contrib contrib-type="{contrib["role"]}"'
        if contrib.get("corresponding"):
            opening += ' corresp="yes"'
        if contrib.get("deceased_before_publication"):
            opening += ' deceased="yes"'
        # truthiness subsumes the original's explicit != "" test
        if contrib.get("equal_contrib"):
            opening += ' equal-contrib="yes"'
        fragments.append(opening + ">")

    # structured name parts, in JATS order
    name_parts = []
    for key, jats_tag in (
        ("prefix", "prefix"),
        ("last_name", "surname"),
        ("first_name", "given-names"),
        ("suffix", "suffix"),
    ):
        if contrib.get(key):
            name_parts.append(f"<{jats_tag}>{escape(contrib[key])}</{jats_tag}>")
    name = "".join(name_parts)

    if name:
        fragments.append(f"<name>{name}</name>")
    elif contrib["string_name"]:
        fragments.append(f"<string-name>{contrib['string_name']}</string-name>")
    else:
        # TODO: Bug in Cedrics <nomcomplet> is ignored inside <bauteur> and <bediteur>
        fragments.append("<name/>")

    for address in contrib.get("addresses", []):
        fragments.append("<address><addr-line>" + escape(address) + "</addr-line></address>")

    if contrib.get("email"):
        # multiple addresses are joined with the "{{{" marker
        for email in contrib["email"].split("{{{"):
            fragments.append("<email>" + escape(email) + "</email>")

    if contrib.get("orcid"):
        fragments.append(
            '<contrib-id contrib-id-type="orcid">' + escape(contrib["orcid"]) + "</contrib-id>"
        )
    if contrib.get("idref"):
        fragments.append(
            '<contrib-id contrib-id-type="idref">' + escape(contrib["idref"]) + "</contrib-id>"
        )

    if not is_ref:
        fragments.append("</contrib>")

    return "".join(fragments)

387 

388 

def helper_update_name_params(params, use_initials=False):
    """Derive first/last name from string_name and clamp field lengths, in place.

    NOTE: *use_initials* is currently unused; it is kept for interface
    compatibility with existing callers.
    """
    # Split "Last,First" only when last_name has not been provided
    if params["string_name"] and not params["last_name"]:
        pieces = params["string_name"].split(",")
        if len(pieces) > 1:
            params["last_name"] = pieces[0]
            params["first_name"] = pieces[1]

    # Clamp each field to its storage size
    for field, max_len in (
        ("first_name", 128),
        ("last_name", 128),
        ("string_name", 256),
        ("mid", 256),
    ):
        if len(params[field]) > max_len:
            params[field] = params[field][:max_len]

405 

406 

def normalise_span(value):
    """Remove superfluous <span ...> wrappers (and their </span>) from *value*.

    Closing </span> tags are only stripped when at least one opening <span
    was found, matching the original behavior.

    Fix: the original looped forever on a malformed "<span" with no closing
    '>' (the string was never modified, so find() kept returning the same
    index); we now stop scanning in that case.
    """
    i = value.find("<span")
    while i != -1:
        j = value.find(">", i)
        if j == -1:
            break  # malformed opening tag: nothing more we can safely strip
        # drop the whole opening tag, then all closing tags
        value = value[:i] + value[j + 1 :]
        value = value.replace("</span>", "")
        i = value.find("<span")
    return value

419 

420 

def remove_html(string):
    """Return the text content of an HTML fragment ('' for falsy input)."""
    if not string:
        return ""
    tree = fromstring(string)
    return "".join(tree.itertext())

425 

426 

def normalize_space(value):
    # Supprime les espaces en trop dans les textes
    #
    # The common answer " ".join(s.split()) does not work here: Python's
    # split() also consumes a nbsp, while xslt ignores it. We therefore only
    # treat space, tab and newline as whitespace, keep the FIRST whitespace
    # character of each run, and trim at both ends.
    whitespace = (" ", "\t", "\n")
    chars = []
    prev_was_ws = True  # True at start so leading whitespace is dropped
    for ch in value:
        is_ws = ch in whitespace
        if is_ws and prev_was_ws:
            continue
        chars.append(ch)
        prev_was_ws = is_ws

    result = "".join(chars)
    # at most one trailing whitespace char can remain; drop it
    if len(result) > 1 and result[-1] in whitespace:
        result = result[:-1]

    return result

451 

452 

def clean_doi(value: str):
    """Strip any URL prefix before the '10.' DOI root and normalize whitespace."""
    start = value.find("10.")
    if start > 0:
        value = value[start:]
    return normalize_space(value)

460 

461 

def int_to_Roman(num):
    """Convert a non-negative integer to a lowercase Roman numeral ('' for 0)."""
    table = (
        (1000, "m"), (900, "cm"), (500, "d"), (400, "cd"),
        (100, "c"), (90, "xc"), (50, "l"), (40, "xl"),
        (10, "x"), (9, "ix"), (5, "v"), (4, "iv"), (1, "i"),
    )
    numeral = ""
    for value, symbol in table:
        # greedy: take each symbol as many times as it fits
        while num >= value:
            numeral += symbol
            num -= value
    return numeral

473 

474 

def roman_to_int(s):
    """
    :type s: str
    :rtype: int
    """
    # Two-character subtractive pairs are listed explicitly so that only the
    # standard pairs (IV, IX, XL, XC, CD, CM) are treated as subtractive.
    values = {
        "I": 1,
        "V": 5,
        "X": 10,
        "L": 50,
        "C": 100,
        "D": 500,
        "M": 1000,
        "IV": 4,
        "IX": 9,
        "XL": 40,
        "XC": 90,
        "CD": 400,
        "CM": 900,
    }
    s = s.upper()
    total = 0
    pos = 0
    length = len(s)
    while pos < length:
        pair = s[pos : pos + 2]
        if len(pair) == 2 and pair in values:
            total += values[pair]
            pos += 2
        else:
            total += values[s[pos]]
            pos += 1
    return total

506 

507 

def get_extid_value_from_link_data(link_data: ExtLinkDict):
    """
    Some links have an id to an external database (MR, ZBL, DOI, Numdam).
    Extract the link_type and value

    :param link_data: dict with link data (ref, mimetype, location...)
    :return: (link_type, value), or (None, None) for unknown link types
    """

    # rdoi: recommendation doi, used by PCI
    # preprint: id of the preprint, used by PCI
    referentials = [
        "jfm-item-id",
        "zbl-item-id",
        "mr-item-id",
        "nmid",
        "numdam-id",
        "mathdoc-id",
        "sps-id",
        "dmlid",
        "eudml-item-id",
        "doi",
        "eid",
        "arxiv",
        "tel",
        "hal",
        "theses.fr",
        "rdoi",
        "preprint",
        "pmid",
        "ark",
    ]

    # link_data['rel'] is the ext-link-type or the pub-id-type
    link_type = link_data["rel"] or ""

    # The value attribute is not required. Use the node's text when href is empty.
    value = link_data["location"]
    if value == "":
        value = link_data["metadata"]
    value = value.strip()

    # Guess the type from the URL when no rel is given.
    # Fix: test with "!= -1" — the original used "> 0", which missed markers
    # appearing at the very start of the value (e.g. "hal-01234...").
    if link_type == "":
        if value.find("doi.org") != -1:
            link_type = "doi"
        elif value.find("arxiv.org") != -1:
            link_type = "arxiv"
        elif value.find("hal-") != -1:
            link_type = "hal"

    extid_value = (None, None)

    if link_type in referentials:
        if link_type == "numdam-id":
            link_type = "mathdoc-id"

        if link_type == "doi":
            value = clean_doi(value)
        elif link_type == "arxiv":
            if link_data["metadata"] != "":
                value = link_data["metadata"].replace("arXiv:", "")
            else:
                value = link_data["location"]
                value = value.replace("http://arxiv.org/abs/", "").replace(
                    "https://arxiv.org/abs/", ""
                )
        else:
            # for the other referentials the id is carried by the node's text
            value = link_data["metadata"]

        extid_value = (link_type, value)

    return extid_value

579 

580 

def handle_pages(page_range):
    """Parse a "first-last" page range into an int pair.

    Returns (None, None) when page_range is None, is not a single
    "first-last" pair, or contains non-numeric pages.
    """
    try:
        first, last = page_range.split("-")
        return int(first), int(last)
    except (AttributeError, ValueError):
        # AttributeError: page_range is None; ValueError: bad shape or non-int
        return None, None

588 

589 

def split_kwds(text):
    """Split a keyword string on ',' and ';' while keeping $...$ formulas intact.

    Splitting on '$' yields formula pieces at odd indices. When the number of
    '$' is odd (even piece count) the formulas cannot be delimited reliably
    and the whole text is returned as a single keyword.
    """
    pieces = text.split("$")

    if len(pieces) % 2 == 0:
        return [text]

    keywords = []
    pending = ""
    for index, piece in enumerate(pieces):
        if index % 2 == 1:
            # formula piece: re-wrap in $ and glue to the current keyword
            pending += "$" + piece + "$"
            continue
        parts = piece.replace(";", ",").split(",")
        if len(parts) == 1:
            pending += piece
        else:
            keywords.append(pending + parts[0])
            keywords.extend(parts[1:-1])
            pending = parts[-1]

    if pending:
        keywords.append(pending)

    return [kwd.strip() for kwd in keywords]

617 

618 

def get_elsevier_image_extensions():
    """Return the image file extensions used by Elsevier deliveries."""
    return "tif tiff gif png jpg jpeg jc3 eps jc4".split()