Coverage for src/ptf/cmds/xml/cedrics/cedrics_parser.py: 10%

1098 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-11-05 09:56 +0000

1################################################################################################## 

2# 

3# README 

4# 

5# cedrics_parser.py is the equivalent of jats_parser for Cedrics XML 

6# 

7# Bugs fixed: 

8# - <xref> with url in "dx.doi.org" were filtered in ptf-xsl 

9# - Non structured references (bibitemdata in Cedrics) got ext_links only if the <xref> node 

10# is one step below the <bibitemdata> node 

11# - comments that started with ' ' were ignored (AIF_2008__58_2_689_0 [9]) 

12# 

13################################################################################################## 

14 

15import html 

16import re 

17from operator import attrgetter 

18 

19from django.conf import settings 

20from django.utils import timezone 

21 

22from ptf.cmds.xml.citation_html import get_citation_html 

23from ptf.cmds.xml.xml_base import RefBase 

24from ptf.cmds.xml.xml_base import XmlParserBase 

25from ptf.cmds.xml.xml_utils import clean_doi 

26from ptf.cmds.xml.xml_utils import escape 

27from ptf.cmds.xml.xml_utils import fix_mfenced_in_mathml 

28from ptf.cmds.xml.xml_utils import get_contrib_xml 

29from ptf.cmds.xml.xml_utils import get_normalized_attrib 

30from ptf.cmds.xml.xml_utils import get_text_from_node 

31from ptf.cmds.xml.xml_utils import get_xml_from_node 

32from ptf.cmds.xml.xml_utils import helper_update_name_params 

33from ptf.cmds.xml.xml_utils import int_to_Roman 

34from ptf.cmds.xml.xml_utils import make_links_clickable 

35from ptf.cmds.xml.xml_utils import normalize 

36from ptf.cmds.xml.xml_utils import normalize_space 

37from ptf.cmds.xml.xml_utils import replace_html_entities 

38from ptf.cmds.xml.xml_utils import split_kwds 

39from ptf.model_data import ArticleData 

40from ptf.model_data import Foo 

41from ptf.model_data import IssueData 

42from ptf.model_data import JournalData 

43from ptf.model_data import PublisherData 

44from ptf.model_data import create_contributor 

45 

46 

def helper_add_link_from_node(node):
    """
    Return the display text for a link node.

    The node's tag selects a module-level ``get_data_from_<tag>`` function
    (dashes replaced by underscores) which extracts the link data.

    :param node: XML node describing a link
    :return: the node text if the link has a "rel", an empty string for
        www.numdam.org links, otherwise the result of make_links_clickable
    """
    extractor = globals()["get_data_from_" + normalize(node.tag).replace("-", "_")]
    link = extractor(node)

    if link["rel"]:
        return node.text or ""

    target = link["location"]
    if "www.numdam.org" in target:
        return ""
    return make_links_clickable(target, link["metadata"])

59 

60 

def get_data_from_custom_meta(node):
    """
    Extract the (name, value) pair of a custom-meta node.

    :param node: XML node whose children may include <meta-name> and <meta-value>
    :return: tuple (name, value); each defaults to "" when the child is absent
    """
    found = {"meta-name": "", "meta-value": ""}

    for entry in node:
        key = normalize(entry.tag)
        if key in found:
            # The last matching child wins, as with successive assignments
            found[key] = entry.text

    return found["meta-name"], found["meta-value"]

74 

75 

def get_data_from_date(node):
    """
    Build a date string from a date node.

    The "iso-8601-date" attribute is returned as is when present. Otherwise the
    string is assembled as year[-month[-day]] from the <year>, <month> and <day>
    children.

    :param node: XML date node
    :return: the date string ("" when no year is available)
    """
    if "iso-8601-date" in node.attrib:
        return node.attrib["iso-8601-date"]

    parts = {"year": "", "month": "", "day": ""}
    for child in node:
        tag = normalize(child.tag)
        if tag in parts:
            # An empty element has text None: keep a string so that the
            # function always returns a str
            parts[tag] = child.text or ""

    date_str = parts["year"]
    if date_str and parts["month"]:
        date_str += "-" + parts["month"]
    if date_str and parts["day"]:
        date_str += "-" + parts["day"]

    return date_str

98 

99 

def get_data_from_ext_link(node):
    """
    Extract the link data of an <ext-link> node.

    :param node: XML node
    :return: dict with the keys rel, mimetype, location, base and metadata
    """
    return {
        "rel": node.get("ext-link-type") or "",
        "mimetype": "",
        "location": get_normalized_attrib(node, "href") or "",
        "base": get_normalized_attrib(node, "base") or "",
        "metadata": node.text or "",
    }

114 

115 

def get_data_from_history(node):
    """
    Collect the dates of a history node.

    Only children carrying a "date-type" attribute are kept.

    :param node: XML history node
    :return: list of {"type": date-type, "date": date string}
    """
    # TODO: transform history_dates in a hash where date-type is the key
    # => Change database_cmds
    return [
        {"type": child.attrib["date-type"], "date": get_data_from_date(child)}
        for child in node
        if "date-type" in child.attrib
    ]

126 

127 

def get_data_from_uri(node):
    """
    Extract the link data of a <uri> node.

    :param node: XML node
    :return: dict with the keys rel, mimetype, location, base and metadata;
        only location (href attribute) and metadata (node text) are filled
    """
    return {
        "rel": "",
        "mimetype": "",
        "location": get_normalized_attrib(node, "href") or "",
        "base": "",
        "metadata": node.text or "",
    }

136 

137 

138class CedricsBase(XmlParserBase): 

def __init__(self, *args, **kwargs):
    """
    Initialise the parser with an empty list of warnings.

    The positional and keyword arguments are accepted but not forwarded
    to the base class constructor.
    """
    super().__init__()
    self.warnings = list()

142 

def parse_tree(self, tree):
    """
    Do nothing in this base implementation.

    :param tree: XML tree (unused here)
    :return: None
    """
    return None

145 

def set_titles(self):
    """
    Do nothing in this base implementation (called by post_parse_tree).

    :return: None
    """
    return None

148 

def post_parse_tree(self):
    """
    Step run once the tree has been parsed: delegates to set_titles().

    :return: None
    """
    self.set_titles()

151 

def filter_text(self, text):
    """
    Remove every "<allowbreak/>" marker from a string.

    :param text: string to clean
    :return: the string without the markers
    """
    return text.replace("<allowbreak/>", "")

155 

def get_location_from_xref(self, node, **kwargs):
    """
    Return the target of an <xref> node.

    :param node: XML xref node
    :param kwargs: unused
    :return: the "url" attribute if set, otherwise the filtered text of the node
    """
    url = get_normalized_attrib(node, "url") or ""
    if url:
        return url

    # No url attribute: the text of the node is the location
    return self.filter_text(get_text_from_node(node))

164 

def get_data_from_xref(self, node, **kwargs):
    """
    Extract the link data of an <xref> node.

    :param node: XML xref node
    :param kwargs: parsing flags; "is_comment" and "is_bibitemdata" are read here
    :return: dict with the keys rel, mimetype, location, base, metadata, xml_text
        (and bibitemdata_display when is_bibitemdata is set)
    """
    location = get_normalized_attrib(node, "url") or ""
    has_no_text = node.text is None

    # TODO: BUG in JATS. JEP_2017__4__435_0 [9]
    # The comment has an ext-link with a display embedded in <monospace>
    # jats_parser produces 2 <a> (1 for the <ext-link>, 1 for the text inside the <monospace>
    # The code below should be removed
    if kwargs.get("is_comment") and has_no_text:
        kwargs["add_HTML_link"] = True

    display, _, xml_text = self.parse_node_inner(node, None, **kwargs)

    in_bibitemdata = kwargs.get("is_bibitemdata", False)

    if location == "":
        location = self.filter_text(get_text_from_node(node))

    # The display is always kept for bibitemdata, but the metadata is
    # emptied when the xref has no direct text
    metadata = "" if (in_bibitemdata and has_no_text) else display

    data = {
        "rel": "",
        "mimetype": "",
        "location": location,
        "base": "",
        "metadata": metadata,
        "xml_text": xml_text,
    }
    if in_bibitemdata:
        data["bibitemdata_display"] = display

    return data

204 

def get_numeric_value(self, node):
    """
    Return the text of a node, converted with int_to_Roman when its
    "systnum" attribute is "romain" (case-insensitive).

    :param node: XML node
    :return: the node text, or its Roman numeral conversion
    """
    numbering = (node.get("systnum") or "").lower()
    if numbering == "romain":
        return int_to_Roman(int(node.text))
    return node.text

213 

def parse_node_inner(self, node, tex_node, **kwargs):
    """
    Used by parse_node_with_mixed_content for nodes that have a different tag in JATS or HTML.
    Concatenates the node text and the parsed content of its children.

    :param node: XML node
    :param tex_node: node holding the TeX version (may be None); its children
        are paired by index with the children of node
    :param kwargs: parsing flags, forwarded to the children with is_top=False
    :return: HTML text, TeX text, JATS XML text
    """
    kwargs["is_top"] = False
    html_parts, tex_parts, xml_parts = [], [], []

    if node.text:
        leading = node.text
        # Drop the first line feed inside <list> and <item>
        if leading[0] == "\n" and node.tag in ("list", "item"):
            leading = leading[1:]

        xml_parts.append(escape(leading))
        html_parts.append(leading)
        tex_parts.append(leading)

    for index, child in enumerate(node):
        has_tex_child = tex_node is not None and len(tex_node) > index
        tex_child = tex_node[index] if has_tex_child else None

        child_html, child_tex, child_xml = self.parse_node_with_mixed_content(
            child, tex_child, **kwargs
        )
        html_parts.append(child_html)
        tex_parts.append(child_tex)
        xml_parts.append(child_xml)

    inner_html_text = "".join(html_parts)

    # Links are added unless the text starts with spaces or line feeds
    if kwargs.get("add_HTML_link") and not re.match(r"[\n ]+", inner_html_text):
        inner_html_text = make_links_clickable(inner_html_text, inner_html_text)

    return inner_html_text, "".join(tex_parts), "".join(xml_parts)

254 

def parse_node_with_b(self, node, tex_node, **kwargs):
    """
    Parse a bold node: <strong> in HTML/TeX, <bold> in JATS.

    :return: HTML text, TeX text, JATS XML text
    """
    html_inner, tex_inner, xml_inner = self.parse_node_inner(node, tex_node, **kwargs)

    xml_text = "<bold>" + xml_inner + "</bold>" if xml_inner else "<bold/>"

    return (
        "<strong>" + html_inner + "</strong>",
        "<strong>" + tex_inner + "</strong>",
        xml_text,
    )

268 

def parse_node_with_cit(self, node, tex_node, **kwargs):
    """
    Parse a <cit> node: only its text is kept.

    :return: HTML text, TeX text (both the raw text), escaped XML text
    """
    text = get_text_from_node(node)
    return text, text, escape(text)

274 

def parse_node_with_hi(self, node, tex_node, **kwargs):
    """
    Parse a <hi> node by dispatching on its "rend" attribute.

    "it" and "bold" map to parse_node_with_i / parse_node_with_b. Any other
    value is looked up as a parse_node_with_<rend> method (dashes replaced by
    underscores). Without a matching method, or without a "rend" attribute,
    the content is parsed with no decoration.

    :param node: XML node
    :param tex_node: node holding the TeX version (may be None)
    :param kwargs: parsing flags, forwarded as is
    :return: HTML text, TeX text, JATS XML text
    """
    rend = node.get("rend")

    if rend == "it":
        return self.parse_node_with_i(node, tex_node, **kwargs)
    if rend == "bold":
        return self.parse_node_with_b(node, tex_node, **kwargs)

    # rend is None when the attribute is missing: skip the lookup instead
    # of failing on None.replace
    if rend:
        ftor = getattr(self, "parse_node_with_" + rend.replace("-", "_"), None)
        if callable(ftor):
            return ftor(node, tex_node, **kwargs)

    return self.parse_node_inner(node, tex_node, **kwargs)

289 

def parse_node_with_i(self, node, tex_node, **kwargs):
    """
    Parse an italic node.

    The HTML text is wrapped in <span class="italique"> unless it is empty or
    the node belongs to a citation that is neither a bibitemdata nor a comment.

    :return: HTML text, TeX text (<i>), JATS XML text (<italic>)
    """
    # TODO: BUG in JATS: unlike <monospace>, no HTML links are added in italics
    kwargs["add_HTML_link"] = False

    html_inner, tex_inner, xml_inner = self.parse_node_inner(node, tex_node, **kwargs)

    plain_citation = (
        kwargs.get("is_citation", False)
        and not kwargs.get("is_bibitemdata", False)
        and not kwargs.get("is_comment", False)
    )

    if html_inner == "" or plain_citation:
        html_text = html_inner
    else:
        html_text = f'<span class="italique">{html_inner}</span>'

    xml_text = f"<italic>{xml_inner}</italic>" if len(xml_inner) > 0 else "<italic/>"

    return html_text, "<i>" + tex_inner + "</i>", xml_text

315 

def parse_node_with_label(self, node, tex_node, **kwargs):
    """
    Store the text of a <label> node in self.list_item_label.

    The label itself produces no output here; parse_node_with_item reads
    the stored value.

    :return: three empty strings (HTML, TeX, XML)
    """
    self.list_item_label = get_text_from_node(node)
    return "", "", ""

322 

def parse_node_with_large(self, node, tex_node, **kwargs):
    """
    Parse a <large> node: the tag is kept in the JATS XML only.

    :return: HTML text, TeX text, JATS XML text
    """
    html_inner, tex_inner, xml_inner = self.parse_node_inner(node, tex_node, **kwargs)
    return html_inner, tex_inner, f"<large>{xml_inner}</large>"

331 

def parse_node_with_list(self, node, tex_node, **kwargs):
    """
    Parse a <list> node.

    The "type" attribute selects the HTML container: <ul> for no type, "bullet"
    or "simple"; <ol type="..."> for the numbered/alphabetic/roman types; a
    <ul> without bullets for anything else.

    :return: HTML text, TeX text (same container as HTML), JATS XML text
    """
    self.list_item_label = None

    html_inner, tex_inner, xml_inner = self.parse_node_inner(node, tex_node, **kwargs)

    list_type = node.get("type")

    opening = "<list>" if list_type is None else '<list list-type="' + list_type + '">'
    xml_text = opening + xml_inner + "</list>"

    # Cedrics list type -> HTML <ol> type
    ordered_types = {
        "order": "1",
        "number": "1",
        "alpha-lower": "a",
        "alpha-upper": "A",
        "roman-lower": "i",
        "roman-upper": "I",
    }

    if list_type is None or list_type in ("bullet", "simple"):
        prefix, suffix = "<ul>", "</ul>"
    elif list_type in ordered_types:
        prefix, suffix = '<ol type="' + ordered_types[list_type] + '">', "</ol>"
    else:
        prefix, suffix = '<ul class="no-bullet" style="list-style-type:none;">', "</ul>"

    return prefix + html_inner + suffix, prefix + tex_inner + suffix, xml_text

372 

def parse_node_with_item(self, node, tex_node, **kwargs):
    """
    Parse a list item.

    <list-item><label>LABEL</label><p>TEXT</p> becomes in HTML
    <li>LABEL TEXT</li>
    (same with <title>)

    The label comes from self.list_item_label, or from the "label"
    attribute of the node when no label is stored. The stored label is reset.

    :param node: XML item node
    :param tex_node: node holding the TeX version (may be None)
    :return: HTML text, TeX text, JATS XML text
    """
    label = self.list_item_label or node.get("label") or ""
    self.list_item_label = None

    # <p> tags are dropped inside an item (see parse_node_with_p)
    kwargs["no_p"] = True
    html_inner, tex_inner, xml_inner = self.parse_node_inner(node, tex_node, **kwargs)

    xml_label = "<label>" + label + "</label>" if label else ""
    xml_text = "<list-item>" + xml_label + xml_inner + "</list-item>"

    opening = "<li>" + label + " " if label else "<li>"

    return opening + html_inner + "</li>", opening + tex_inner + "</li>", xml_text

408 

def parse_node_with_formula(self, node, tex_node, **kwargs):
    """
    Parse a <formula> node (MathML) together with its TeX counterpart.

    :param node: XML formula node; its first child holds the MathML
    :param tex_node: node holding the TeX version (may be None)
    :param kwargs: parsing flags; "is_citation" and "bug_cedrics" are read here
    :return: HTML text, TeX text, JATS XML text
    """
    # '\n' are added in this function, because the Cedrics -> XML transformation
    # does not add xml:space="preserve" with formulas (?)
    # An abstract with <p> and <formula> will have mix of "preserve".

    # A missing "type" attribute means an inline formula
    type_ = node.get("type") or "inline"
    is_inline = type_ == "inline"
    tex_type = tex_node.get("textype", "inline") if tex_node is not None else "inline"

    math_node_text = get_xml_from_node(node[0])
    math_node_text = replace_html_entities(math_node_text)
    # The Cedrics Mathml transform rounds up the width value
    math_node_text = math_node_text.replace(".em", "em")
    math_node_text = math_node_text.replace(".pt", "pt")

    if is_inline:
        tex_prefix = tex_suffix = "$"
    else:
        tex_prefix = "\n\\["
        tex_suffix = "\\]\n"
    if tex_node is not None and tex_type not in ("inline", "display"):
        # Named TeX environment (any textype other than inline/display)
        tex_prefix = "\n\\begin{" + tex_type + "}\n"
        tex_suffix = "\n\\end{" + tex_type + "}\n"

    math_node_text = fix_mfenced_in_mathml(math_node_text)

    if not kwargs.get("is_citation", False):
        math_node_text = math_node_text.replace(
            ' xmlns:xlink="http://www.w3.org/1999/xlink"', ""
        )
        # NOTE(review): assumed to belong to the "not a citation" branch — confirm
        math_node_text = math_node_text.replace('mode="display"', 'display="block"')

    if tex_node is None:
        # TODO: BUG in JATS. No need for a '$$' in the title if there is no tex formula
        # The '$$' at the end of the next line is to be compatible with jats_parser
        tex_node_text = "$$" if is_inline else ""
    else:
        # An empty TeX node has text None
        tex_node_text = tex_prefix + (tex_node.text or "") + tex_suffix

    jats_tex_text = escape(tex_node_text)

    if is_inline:
        jats_xml_text = (
            "<inline-formula><alternatives>"
            + math_node_text
            + "<tex-math>"
            + jats_tex_text
            + "</tex-math></alternatives></inline-formula>"
        )
    else:
        jats_xml_text = (
            '<disp-formula xml:space="preserve">\n<alternatives>'
            + math_node_text
            + "\n<tex-math>"
            + jats_tex_text
            + "</tex-math>\n</alternatives>\n</disp-formula>"
        )
        # NOTE(review): assumed to apply to display formulas only — confirm
        node.tail = ""

    if kwargs.get("bug_cedrics"):
        # TODO: Bug in Cedrics. AIF_2012__62_6_2053_0 [16]
        # If there is no texmath, a <tex-math>$$</tex-math> is added and
        # get_text_from_node appends the 2.
        tex_text = get_text_from_node(node)
        if tex_node is None:
            tex_text += "$$"
    else:
        tex_text = tex_node_text

    data_tex = tex_node_text if is_inline else tex_node_text.replace("\n", "")
    html_text = f'<span class="mathjax-formula" data-tex="{data_tex}">{math_node_text}</span>'

    if not is_inline:
        prefix = '<table class="formula"><tr><td class="formula-inner">'
        suffix = '</td><td class="formula-label"></td></tr></table>'
        html_text = prefix + html_text + suffix

    return html_text, tex_text, jats_xml_text

498 

def parse_node_with_mixed_content(self, node, tex_node, **kwargs):
    """
    Parse and return the text of an XML node which mixes text and XML sub-nodes.
    Ex: <node>text1 <a>text_a</a> text2 <b>text_b</b>b_tail</node>
    Some inner nodes are removed, others are kept or replaced.

    Cedrics XMLs store the MathML and the TeX formulas in 2 siblings.
    Parse the 2 nodes at the same time.

    The JATS xml string is constructed at the same time because it is used during a PTF export

    :param node: XML Node (with MathML)
    :param tex_node: XML Node (with TexMath), may be None
    :param kwargs: params of the function
    :return: HTML text, TeX text, XML text
    """

    html_text = tex_text = jats_xml_text = ""

    if node is None:
        return html_text, tex_text, jats_xml_text

    # Found 1 exception with <title>Дополнение к&nbsp;работе (AIF_2013__63_4)
    # The XML parser creates a different node with no tag for "&nbsp;"
    if type(node).__name__ != "_Element":
        html_text = tex_text = jats_xml_text = html.unescape(node.text)
        # "is_top" may be absent on a direct call: use the same default (True)
        # as for regular elements instead of raising a KeyError
        if node.tail and not kwargs.get("is_top", True):
            html_text += node.tail
            tex_text += node.tail
            jats_xml_text += escape(node.tail)
        return html_text, tex_text, jats_xml_text

    # The tail is the text following the end of the node
    # Ex: <node>text1<a>text_a</a>a_tail</node>
    # The HTML text has to include the tail
    #   only if html_from_mixed_content was called recursively
    kwargs.setdefault("is_top", True)

    # sec_level is used to add <h1>, <h2>,... in the HTML text while parsing nodes like <sec>
    kwargs.setdefault("sec_level", 2)

    # Text in <comment> is parsed to add HTML link.
    kwargs.setdefault("add_HTML_link", False)

    # base_url to image links
    kwargs.setdefault("base_url", "")

    kwargs.setdefault("is_citation", False)
    kwargs.setdefault("is_comment", False)

    # TODO remove once jats_parser has been validated against xmldata
    kwargs.setdefault("temp_math", False)
    kwargs.setdefault("temp_tex", False)
    kwargs.setdefault("temp_mixed_citation", False)

    tag = normalize(node.tag)

    # pub-id/object-id are ignored by default as they are treated separately
    if not kwargs["is_comment"] and tag in ("pub-id", "object-id"):
        # NOTE(review): jats_xml_text is always empty at this point
        print(tag, "in", jats_xml_text)
        return html_text, tex_text, jats_xml_text

    if tag in ("bibitemdata", "toc"):
        kwargs["is_citation"] = True
        kwargs["temp_mixed_citation"] = True
    elif tag == "comment":
        kwargs["is_comment"] = True

    # I. Add the node's text.
    # Some tags have a corresponding parse_node_with_@tag method.
    tag_mapped = {
        "statement": "sec",
        "disp-formula": "inline-formula",
        "chapter-title": "article-title",
        "bold": "strong",
        "table": "table-generic",
        "th": "table-generic",
        "tr": "table-generic",
        "td": "table-generic",
        "thead": "table-generic",
        "tbody": "table-generic",
        "colgroup": "table-generic",
        "col": "table-generic",
        "em": "i",
    }

    fct_name = "parse_node_with_" + tag_mapped.get(tag, tag).replace("-", "_")
    ftor = getattr(self, fct_name, None)
    if callable(ftor):
        inner_html_text, inner_tex_text, inner_jats_xml_text = ftor(node, tex_node, **kwargs)
    else:
        # II. No dedicated method: node text followed by the children
        # TODO Add HTML links to the text with URLs (ext-link, uri)
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, tex_node, **kwargs
        )

    html_text += inner_html_text
    tex_text += inner_tex_text
    jats_xml_text += inner_jats_xml_text

    # III. Add the node's tail for children
    if node.tail and not kwargs["is_top"] and tag not in ("p", "list", "item", "label"):
        html_text += node.tail
        tex_text += node.tail
        jats_xml_text += escape(node.tail)

    return html_text, tex_text, jats_xml_text

633 

def parse_node_with_p(self, node, tex_node, **kwargs):
    """
    Parse a paragraph.

    :param kwargs: with no_p set, the HTML text is not wrapped in <p>
    :return: HTML text, TeX text, JATS XML text
    """
    html_inner, tex_inner, xml_inner = self.parse_node_inner(node, tex_node, **kwargs)

    if kwargs.get("no_p"):
        # <p> inside <item> are removed in HTML to avoid a carriage return
        html_text = html_inner
    else:
        css_class = node.get("specific-use")
        opening = f'<p class="{css_class}">' if css_class else "<p>"
        html_text = opening + html_inner + "</p>"

    if xml_inner:
        xml_text = f'<p xml:space="preserve">{xml_inner}</p>'
    else:
        xml_text = '<p xml:space="preserve"/>'

    # TODO: BUG in JATS (no <p> in the tex version)
    return html_text, tex_inner, xml_text

658 

def parse_node_with_ref(self, node, tex_node, **kwargs):
    """
    Parse a <ref> node: only a JATS <xref ref-type="bibr"> is produced.

    :return: empty HTML text, empty TeX text, JATS XML text
    """
    return "", "", '<xref ref-type="bibr">' + escape(node.text) + "</xref>"

667 

def parse_node_with_sansserif(self, node, tex_node, **kwargs):
    """
    Parse a sans-serif node: the tag is kept in the JATS XML only.

    :return: HTML text, TeX text, JATS XML text
    """
    html_inner, tex_inner, xml_inner = self.parse_node_inner(node, tex_node, **kwargs)
    return html_inner, tex_inner, f"<sans-serif>{xml_inner}</sans-serif>"

676 

def parse_node_with_sc(self, node, tex_node, **kwargs):
    """
    Parse a small-caps node: <span class="smallcaps"> in HTML/TeX, <sc> in JATS.

    :return: HTML text, TeX text, JATS XML text
    """
    html_inner, tex_inner, xml_inner = self.parse_node_inner(node, tex_node, **kwargs)

    opening, closing = '<span class="smallcaps">', "</span>"
    xml_text = f"<sc>{xml_inner}</sc>" if xml_inner else "<sc/>"

    return opening + html_inner + closing, opening + tex_inner + closing, xml_text

691 

def parse_node_with_slanted(self, node, tex_node, **kwargs):
    """
    Parse a <slanted> node: the tag is kept in the JATS XML only.

    :return: HTML text, TeX text, JATS XML text
    """
    html_inner, tex_inner, xml_inner = self.parse_node_inner(node, tex_node, **kwargs)
    return html_inner, tex_inner, f"<slanted>{xml_inner}</slanted>"

700 

def parse_node_with_small(self, node, tex_node, **kwargs):
    """
    Parse a <small> node: the tag is kept in the JATS XML only.

    :return: HTML text, TeX text, JATS XML text
    """
    html_inner, tex_inner, xml_inner = self.parse_node_inner(node, tex_node, **kwargs)
    return html_inner, tex_inner, f"<small>{xml_inner}</small>"

709 

def parse_node_with_sub(self, node, tex_node, **kwargs):
    """
    Parse a subscript node: wrapped in <sub> in the 3 outputs.

    :return: HTML text, TeX text, JATS XML text
    """
    inner_texts = self.parse_node_inner(node, tex_node, **kwargs)
    html_text, tex_text, xml_text = (f"<sub>{inner}</sub>" for inner in inner_texts)
    return html_text, tex_text, xml_text

720 

def parse_node_with_sup(self, node, tex_node, **kwargs):
    """
    Parse a superscript node: wrapped in <sup> in the 3 outputs.

    :return: HTML text, TeX text, JATS XML text
    """
    inner_texts = self.parse_node_inner(node, tex_node, **kwargs)
    html_text, tex_text, xml_text = (f"<sup>{inner}</sup>" for inner in inner_texts)
    return html_text, tex_text, xml_text

731 

def parse_node_with_texmath(self, node, tex_node, **kwargs):
    """
    Parse a <texmath> node: only the TeX text is produced, between '$'.

    :return: empty HTML text, TeX text, empty XML text
    """
    return "", "$" + get_text_from_node(node) + "$", ""

738 

def parse_node_with_tt(self, node, tex_node, **kwargs):
    """
    Parse a <tt> node: becomes <monospace> in the JATS XML only.

    :return: HTML text, TeX text, JATS XML text
    """
    html_inner, tex_inner, xml_inner = self.parse_node_inner(node, tex_node, **kwargs)

    xml_text = f"<monospace>{xml_inner}</monospace>" if xml_inner else "<monospace/>"

    return html_inner, tex_inner, xml_text

750 

def parse_node_with_underline(self, node, tex_node, **kwargs):
    """
    Parse an <underline> node: the tag is kept in the JATS XML only.

    :return: HTML text, TeX text, JATS XML text
    """
    html_inner, tex_inner, xml_inner = self.parse_node_inner(node, tex_node, **kwargs)
    return html_inner, tex_inner, f"<underline>{xml_inner}</underline>"

759 

def parse_node_with_xref(self, node, tex_node, **kwargs):
    """
    Parse an xref.
    Extract extids (doi, mr-item-id,...) and ext_links

    :param node: XML xref node
    :param tex_node: unused (the content is parsed without a TeX node)
    :param kwargs: parsing flags; "is_comment" and "add_ext_link" are read here
    :return: html_text, tex_text, xml_text
    """
    location = self.get_location_from_xref(node)

    kwargs["add_HTML_link"] = False
    inner_html, inner_tex, inner_xml = self.parse_node_inner(node, None, **kwargs)

    # No ext-links is added while parsing titles or abstracts, nor in comments
    link_allowed = kwargs.get("add_ext_link", True) and not kwargs.get("is_comment")

    xref_data = {
        "rel": "",
        "mimetype": "",
        "location": location,
        "base": "",
        "metadata": inner_html,
    }

    extid_value = (None, None)
    if link_allowed:
        extid_value = self.add_extids_from_node_with_link(xref_data)

    # The xref is kept as an ext-link only if it was not converted into an extid
    if link_allowed and extid_value[0] is None and xref_data not in self.ext_links:
        self.ext_links.append(xref_data)

    # <ext-link> is always written in the XML
    xml_text = '<ext-link xlink:href="' + html.escape(location) + '">' + inner_xml + "</ext-link>"

    return (
        make_links_clickable(location, inner_html),
        make_links_clickable(location, inner_tex),
        xml_text,
    )

812 

def parse_article_subject(self, node):
    """
    Add the comma-separated subjects of a node to self.subjs.

    Each subject is stored as {"type": "subject", "lang": ..., "value": ...};
    leading spaces of each value are stripped. A node without text adds nothing.

    :param node: XML node with the subjects in its text
    """
    # An empty element has text None: nothing to split
    if node.text is None:
        return

    lang = get_normalized_attrib(node, "lang") or self.lang

    self.subjs.extend(
        {"type": "subject", "lang": lang, "value": subject.lstrip()}
        for subject in node.text.split(",")
    )

820 

def parse_article_subjects(self, node):
    """
    Parse every <article-subject> child of a node.

    :param node: XML node
    """
    for entry in node:
        if normalize(entry.tag) != "article-subject":
            continue
        self.parse_article_subject(entry)

827 

def parse_article_type(self, node):
    """
    Add the text of a node to self.subjs as a subject of type "type".

    :param node: XML node
    """
    lang = get_normalized_attrib(node, "lang") or self.lang
    self.subjs.append({"type": "type", "lang": lang, "value": node.text})

835 

def parse_article_types(self, node):
    """
    Parse every <article-type> child of a node.

    Nothing is done once self.has_articletype is set.

    :param node: XML node
    """
    # 2023/12/05 <articletype> has been added to store the type
    if not self.has_articletype:
        for entry in node:
            if normalize(entry.tag) == "article-type":
                self.parse_article_type(entry)

846 

def parse_articletype(self, node):
    """
    Store the text of an <articletype> node in self.atype and set
    self.has_articletype.

    :param node: XML node
    """
    self.atype, self.has_articletype = node.text, True

850 

def parse_auteur(self, node, is_ref=False):
    """
    Parse an <auteur> node as a contributor with the "author" role.

    :param node: XML node
    :param is_ref: forwarded to parse_common_contrib
    """
    author_role = "author"
    self.parse_common_contrib(node, author_role, is_ref)

853 

def _get_abstract_data(self, node, abstract_type: str = None):
    """
    Build the abstract dict of a node.

    The next sibling of the node is passed as its TeX counterpart.

    :param node: XML node with the abstract
    :param abstract_type: value of the "abstract-type" attribute; without it, the
        XML tag depends on the language of the node compared to self.lang
    :return: dict with the keys tag, lang, value_xml, value_html, value_tex
    """
    value_html, value_tex, inner_xml = self.parse_node_with_mixed_content(
        node, node.getnext(), add_ext_link=False
    )

    lang = get_normalized_attrib(node, "lang") or ""

    if abstract_type is not None:
        tag = abstract_type
        value_xml = (
            f'<abstract xml:lang="{lang}" abstract-type="{abstract_type}">{inner_xml}</abstract>'
        )
    else:
        tag = ""
        if lang == self.lang:
            value_xml = f"<abstract>{inner_xml}</abstract>"
        elif self.lang == "und":
            value_xml = f'<abstract xml:lang="{lang}">{inner_xml}</abstract>'
        else:
            value_xml = f'<trans-abstract xml:lang="{lang}">{inner_xml}</trans-abstract>'

    return {
        "tag": tag,
        "lang": lang,
        "value_xml": value_xml,
        "value_html": value_html,
        "value_tex": value_tex,
    }

879 

def parse_avertissement(self, node):
    """
    Add an <avertissement> node to self.abstracts (type "avertissement").

    :param node: XML node
    """
    abstract_data = self._get_abstract_data(node, "avertissement")
    self.abstracts.append(abstract_data)

882 

def parse_note(self, node):
    """
    Add a <note> node to self.abstracts (type "note").

    :param node: XML node
    """
    abstract_data = self._get_abstract_data(node, "note")
    self.abstracts.append(abstract_data)

885 

def parse_biblio(self, node):
    """
    Parse the <bib_entry> children of a bibliography node.

    Each entry becomes a CedricsRef; an entry is a mixed citation when its
    type (or by default the type of the bibliography) is "flat".
    The bibitems are sorted at the end.

    :param node: XML bibliography node
    """
    default_type = node.get("type") or ""

    for entry in node:
        if normalize(entry.tag) != "bib_entry":
            continue

        entry_type = entry.get("type") or default_type
        ref = CedricsRef(tree=entry, lang="und", is_mixed_citation=(entry_type == "flat"))
        self.bibitems.append(ref)
        # TODO: Remove bibitem. This is used for solrCmds.
        # solrCmds should use bibitems instead.
        self.bibitem.append(ref.citation_html)

    self.sort_bibitems()

902 

def parse_common_contrib(self, node, role, is_ref=False):
    """
    Parse a contributor node and append the result to self.contributors.

    :param node: XML contributor node
    :param role: role of the contributor; a final "s" is removed
    :param is_ref: True when the contributor belongs to a reference
    """
    contributor = create_contributor()

    if role and role.endswith("s"):
        role = role[:-1]
    contributor["role"] = role

    contributor["equal_contrib"] = node.get("equal-contrib") == "yes"

    if node.get("author-role") == "corresponding":
        contributor["corresponding"] = True

    is_etal = False
    has_children = False
    middlename = ""

    for child in node:
        has_children = True
        tag = normalize(child.tag)

        if tag == "nomcomplet":
            # TODO: Bug in Cedrics <nomcomplet> is ignored inside <bauteur> and <bediteur>
            if not is_ref:
                contributor["string_name"] = child.text
            # NOTE(review): assumed to apply whatever is_ref is — confirm
            contributor["deceased_before_publication"] = child.get("deceased") == "yes"
        elif tag == "prenom":
            contributor["first_name"] = child.text or ""
            # A middle name found before the first name is appended now
            if middlename != "":
                contributor["first_name"] += " " + middlename
                middlename = ""
        elif tag in ("middlename", "particule"):
            contributor["first_name"] += " " + child.text
            middlename = child.text
        elif tag == "initiale":
            # Initials are not taken from the XML
            pass
        elif tag == "junior":
            contributor["suffix"] = child.text
        elif tag == "nom":
            contributor["last_name"] = child.text or ""
        elif tag == "adresse":
            address = normalize_space(get_text_from_node(child)).replace("\n", " ")
            if len(address) > 0:
                contributor["addresses"].append(address)
        elif tag == "author-orcid":
            contributor["orcid"] = child.text
        elif tag == "mel":
            # The url of an inner <xref> is preferred to the text of the node
            email = None
            for greatchild in child:
                if normalize(greatchild.tag) == "xref":
                    email = greatchild.get("url")
            if email is None:
                email = child.text
            if email is not None:
                if len(contributor["email"]) > 0:
                    contributor["email"] += "{{{"
                contributor["email"] += email
        elif tag == "etal":
            is_etal = True

    if has_children:
        use_initials = is_ref and getattr(settings, "REF_JEP_STYLE", False)
        helper_update_name_params(contributor, use_initials)

        if is_etal:
            contributor["contrib_xml"] = "<etal/>"
        else:
            contributor["contrib_xml"] = get_contrib_xml(contributor, is_ref=is_ref)
    elif node.text is not None:
        # No child: the node only holds the name as a string
        contributor["string_name"] = node.text
        contributor["contrib_xml"] = (
            '<string-name xml:space="preserve">' + escape(node.text) + "</string-name>"
        )

    contributor["addresses"].sort()

    # email is ignored by jats_parser
    contributor["email"] = ""

    self.contributors.append(contributor)

988 

def parse_financement(self, node):
    """Append an award to ``self.awards`` when ``node`` has both a funder and an award id.

    The funder comes from the <financeur> child, the award id from the <bourse> child.
    """
    abbrev = None
    award_id = None

    for child in node:
        tag = normalize(child.tag)

        if tag == "financeur":
            abbrev = get_text_from_node(child)
        elif tag == "bourse":
            award_id = child.text

    if abbrev is None or award_id is None:
        return

    self.awards.append({"abbrev": abbrev, "award_id": award_id})

1002 

def parse_financements(self, node):
    """Parse every <financement> child of ``node``."""
    for child in node:
        if normalize(child.tag) != "financement":
            continue
        self.parse_financement(child)

1009 

def parse_langue(self, node):
    """Use the text of ``node`` as the language of the object."""
    language = node.text
    self.lang = language

1012 

def parse_motcle(self, node):
    """Add the keywords of a <motcle> node to ``self.kwds``.

    The keywords are the TeX values of the <mot> children of the next sibling node.
    Without any <mot>, the TeX value of the whole node is split with split_kwds.
    """
    lang = get_normalized_attrib(node, "lang") or self.lang
    tex_node = node.getnext()

    kwds = []
    for child in tex_node:
        if normalize(child.tag) != "mot":
            continue
        _, value_tex, _ = self.parse_node_with_mixed_content(child, None)
        # text = normalize_space(get_text_from_node(child))
        kwds.append(value_tex)

    if len(kwds) == 0:
        _, value_tex, _ = self.parse_node_with_mixed_content(node, tex_node)
        kwds = split_kwds(value_tex)

    for kwd in kwds:
        self.kwds.append({"type": "", "lang": lang, "value": kwd})

1035 

def parse_msc(self, node):
    """Add the comma separated MSC codes of ``node`` to ``self.kwds`` (type "msc").

    An empty node adds nothing. Each code is stripped before the empty ones are
    dropped, so that an item made only of spaces does not create an empty keyword.
    """
    lang = get_normalized_attrib(node, "lang") or self.lang

    # node.text is None for an empty element
    codes = (code.strip() for code in (node.text or "").split(","))

    self.kwds.extend([{"type": "msc", "lang": lang, "value": code} for code in codes if code])

1042 

def parse_resp(self, node):
    """Parse a contributor whose role comes from the "role" attribute of ``node``.

    The role defaults to "editeur". "editeur" and "organisateur" are translated to
    "editor" and "organizer"; any other value is passed unchanged.
    """
    translations = {"editeur": "editor", "organisateur": "organizer"}
    cedrics_role = node.get("role") or "editeur"
    role = translations.get(cedrics_role, cedrics_role)

    self.parse_common_contrib(node, role)

1051 

def parse_resume(self, node):
    """Add the abstract of a <resume> node to ``self.abstracts``.

    The abstract written in the language of the object is inserted first,
    the other ones are appended.
    """
    lang = get_normalized_attrib(node, "lang") or self.lang

    if lang != self.lang:
        self.abstracts.append(self._get_abstract_data(node))
        return

    # JATS puts the trans_abstract after the abstract
    # NOTE(review): the second argument is an explicit None here but is omitted for the
    # other languages - confirm this is intended.
    self.abstracts.insert(0, self._get_abstract_data(node, None))

1092 

def parse_supplement(self, node):
    """Add the supplementary material described by ``node`` to ``self.supplementary_materials``.

    The location comes from the <xref> child, the caption from the <caption> child.
    A location with "/attach/" in it is rewritten relatively to the collection
    and the issue.

    :raises AssertionError: if the "content-type" attribute of ``node`` is neither
                            "supplementary-material" nor "review".
    """
    location = None
    caption = ""

    for child in node:
        tag = normalize(child.tag)

        if tag == "xref":
            location = self.get_location_from_xref(child)
        elif tag == "caption":
            # The caption is the text of the <caption> child, not the text of its parent
            caption = escape(child.text) if child.text else ""

    # NOTE(review): the indentation of the source was lost. The material is assumed
    # to be recorded only when a location was found - confirm.
    if location:
        pos = location.find("/attach/")
        if pos > -1:
            if hasattr(self, "colid") and hasattr(self, "issue_id"):
                text = location
                location = self.colid + "/" + self.issue_id + "/"

                if hasattr(self, "article_folder") and self.article_folder is not None:
                    # 8 is the length of "/attach/"
                    location += self.article_folder + "/Attach/" + text[pos + 8 :]
                else:
                    location += self.pid + text[pos:]

        relation = node.attrib.get("content-type")
        assert relation in ["supplementary-material", "review"], (
            f"Dans la balise supplement de {self.pid}, "
            f'content-type être "supplementary-material" ou "review" '
            f'au lieu de "{relation}"'
        )

        material = {
            "rel": relation,
            "mimetype": node.attrib.get("mimetype"),
            "location": location,
            "base": "",
            "metadata": "",
            "caption": caption,
        }
        self.supplementary_materials.append(material)

1133 

def parse_supplements(self, node):
    """Parse every <supplement> child of ``node``."""
    for child in node:
        if normalize(child.tag) != "supplement":
            continue
        self.parse_supplement(child)

1140 

1141 # TODO: It is a node with mix content 

1142 # Transform the function in parse_node_with_motcle to handle formulas 

def parse_texmotcle(self, node):
    """Add the keywords of a <texmotcle> node to ``self.kwds``.

    The keywords are the TeX values of the <mot> children of the next sibling node.
    Without any <mot>, the TeX value of ``node`` is split with split_kwds.
    """
    lang = get_normalized_attrib(node, "lang") or self.lang
    tex_node = node.getnext()

    kwds = []
    for child in tex_node:
        if normalize(child.tag) != "mot":
            continue
        _, value_tex, _ = self.parse_node_with_mixed_content(child)
        kwds.append(value_tex)

    if len(kwds) == 0:
        _, value_tex, _ = self.parse_node_with_mixed_content(node)
        kwds = split_kwds(value_tex)

    for kwd in kwds:
        self.kwds.append({"type": "", "lang": lang, "value": kwd})

1160 

def parse_titre(self, node):
    """Queue the title of a <titre> node in ``self.titres``.

    The language defaults to "und". A title with an empty XML value is ignored.
    """
    lang = get_normalized_attrib(node, "lang") or "und"
    tex_node = node.getnext()

    html_value, tex_value, xml_value = self.parse_node_with_mixed_content(node, tex_node)
    if len(xml_value) == 0:
        return

    titre = {
        "lang": lang,
        "title_html": html_value,
        "title_tex": tex_value,
        "title_xml": xml_value,
    }
    self.titres.append(titre)

1179 

def sort_bibitems(self):
    """Sort ``self.bibitems`` according to their labels.

    Nothing is done without references or when the first label is empty.
    When the first label is a number, the references are sorted on the integer
    value of their label prefix; otherwise on (label_prefix, year, label_suffix).
    """
    if not len(self.bibitems):
        return

    # Sometimes, labels are surrounded by []
    first_label = self.bibitems[0].label.strip("[]")
    if not len(first_label):
        return

    # First, we split each label into label_prefix and label_suffix
    for ref in self.bibitems:
        ref.split_label()

    if first_label.isdigit():

        def sort_key(ref):
            return int(ref.label_prefix)

    else:
        sort_key = attrgetter("label_prefix", "year", "label_suffix")

    self.bibitems = sorted(self.bibitems, key=sort_key)

1198 

1199 

class CedricsPublisher(PublisherData):
    """Publisher whose name is read from a Cedrics XML node."""

    def __init__(self, *args, **kwargs):
        """Initialize the publisher and parse the node given as the ``tree`` keyword."""
        super().__init__(*args, **kwargs)
        tree = kwargs["tree"]
        self.parse_tree(tree)

    def parse_tree(self, tree):
        """Use the text of ``tree`` as the name of the publisher."""
        publisher_name = tree.text
        self.name = publisher_name

1207 

1208 

class CedricsJournal(JournalData, CedricsBase):
    """Journal (collection) parsed from a Cedrics XML node."""

    def __init__(self, *args, **kwargs):
        """Initialize the journal and parse the node given as the ``tree`` keyword."""
        super().__init__(*args, **kwargs)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """Read the pid, the titles, the publisher and the ISSNs from the children of ``tree``.

        ``self.title_xml`` is assembled once all the children have been read, so that
        the <journal-title-group> is always well-formed, whether <jtitrecourt>
        is missing or precedes <jtitre>.
        """
        super().parse_tree(tree)

        title_xml = None
        short_title_xml = None

        for node in tree:
            tag = normalize(node.tag)

            if tag == "acrocedram":
                self.pid = node.text
            elif tag == "jtitre":
                self.title_html = self.title_tex = node.text
                title_xml = "<journal-title>" + escape(node.text) + "</journal-title>"
            elif tag == "jtitrecourt":
                short_title_xml = (
                    '<abbrev-journal-title abbrev-type="short-title">'
                    + escape(node.text)
                    + "</abbrev-journal-title>"
                )
            elif tag == "jediteur":
                self.publisher = CedricsPublisher(tree=node)
            elif tag == "issn":
                self.issn = node.text
            elif tag == "E-issn":
                self.e_issn = node.text

        # title_xml is left untouched when the tree has no title at all
        if title_xml is not None or short_title_xml is not None:
            self.title_xml = (
                "<journal-title-group>"
                + (title_xml or "")
                + (short_title_xml or "")
                + "</journal-title-group>"
            )

1237 

1238 

class CedricsIssue(IssueData, CedricsBase):
    """Issue parsed from a Cedrics XML tree, with its <notice> and its <article> nodes."""

    def __init__(self, *args, **kwargs):
        """Initialize the issue and parse the tree given as the ``tree`` keyword.

        Optional keywords: ``ignore_date_published``, ``is_seminar``, ``article_folders``
        and ``dois`` (the last two are indexed by the position of the article in the issue).
        """
        super().__init__(*args, **kwargs)

        # Jats has a title/trans_title
        # Cedrics has multiples <titre xml:lang>
        # Use self.titres to store the titles temporary.
        # self.title_* and self_trans_title* are set at the end of the concrete parse_tree
        self.titres = []

        self.ignore_date_published = kwargs.get("ignore_date_published", False)
        self.is_seminar = kwargs.get("is_seminar", False)
        self.colid = None
        self.provider = "mathdoc"
        self.article_folders = kwargs.get("article_folders", [])
        self.dois = kwargs.get("dois", [])

        self.parse_tree(kwargs["tree"])
        self.post_parse_tree()

    def parse_tree(self, tree):
        """Parse the <notice> of the issue and create a CedricsArticle for each <article>."""
        super().parse_tree(tree)

        seq = 1

        for node in tree:
            tag = normalize(node.tag)

            if tag == "notice":
                self.parse_notice(node)
                continue
            if tag != "article":
                continue

            # article_folders and dois follow the order of the articles
            index = seq - 1
            article_folder = self.article_folders[index] if len(self.article_folders) > 0 else ""
            doi = self.dois[index] if len(self.dois) > 0 else ""

            article = CedricsArticle(
                tree=node,
                colid=self.colid,
                issue_id=self.pid,
                doi=doi,
                ignore_date_published=self.ignore_date_published,
                is_seminar=self.is_seminar,
                article_folder=article_folder,
            )
            article.seq = str(seq)
            seq += 1
            self.articles.append(article)

    def parse_gestion(self, node):
        """Read the online first flag from the <efirst> child of ``node``."""
        for child in node:
            if normalize(child.tag) == "efirst":
                self.with_online_first = child.text == "yes"

    def parse_notice(self, node):
        """Read the metadata of the issue from the children of ``node``.

        The other children are dispatched to the ``parse_<tag>`` method, if it exists.
        """
        simple_fields = {
            "idvol": "pid",
            "tome": "volume",
            "fascicule": "number",
            "serie": "vseries",
            "annee": "year",
        }

        for child in node:
            tag = normalize(child.tag)

            attribute = simple_fields.get(tag)
            if attribute is not None:
                setattr(self, attribute, child.text)
                continue

            handler = getattr(self, "parse_" + tag.replace("-", "_"), None)
            if callable(handler):
                handler(child)

        if self.last_modified_iso_8601_date_str is None:
            self.last_modified_iso_8601_date_str = timezone.now().isoformat()

    def parse_revue(self, node):
        """Create the journal from ``node`` and copy its pid and its publisher."""
        journal = CedricsJournal(tree=node)
        self.journal = journal
        self.colid = journal.pid
        self.publisher = journal.publisher

    def set_titles(self):
        """Set the title and translated title attributes from ``self.titres``."""

        def is_main(titre):
            # Every title is a main title when the language of the issue is unknown
            return titre["lang"] == self.lang or self.lang == "und"

        # TODO: BUG in JATS: title_html is the one of the last title (bug if title in multiple langs)
        for titre in self.titres:
            if is_main(titre):
                self.title_html = titre["title_html"]
                self.title_tex = titre["title_tex"]
            else:
                self.trans_lang = titre["lang"]
                self.trans_title_html = titre["title_html"]
                self.trans_title_tex = titre["title_tex"]

        if not self.title_html:
            return

        parts = ["<issue-title-group>"]

        for titre in self.titres:
            if is_main(titre):
                parts.append(
                    '<issue-title xml:space="preserve" xml:lang="'
                    + titre["lang"]
                    + '">'
                    + titre["title_xml"]
                    + "</issue-title>"
                )

        for titre in self.titres:
            if not is_main(titre):
                parts.append('<trans-title-group xml:lang="' + titre["lang"] + '">')
                parts.append(
                    '<trans-title xml:space="preserve">' + titre["title_xml"] + "</trans-title>"
                )
                parts.append("</trans-title-group>")

        parts.append("</issue-title-group>")
        self.title_xml = "".join(parts)

1359 

1360 

class CedricsArticle(ArticleData, CedricsBase):
    """Article parsed from a Cedrics <article> tree."""

    def __init__(self, *args, **kwargs):
        """Initialize the article and parse the tree given as the ``tree`` keyword.

        Required keywords: ``tree``, ``colid`` and ``issue_id``.
        Optional keywords: ``pid``, ``doi``, ``ignore_date_published``, ``is_seminar``
        and ``article_folder``.
        """
        super().__init__(*args, **kwargs)

        self.ignore_date_published = kwargs.get("ignore_date_published", False)
        self.is_seminar = kwargs.get("is_seminar", False)
        self.article_folder = kwargs.get("article_folder")

        # Jats has a title/trans_title
        # Cedrics has multiples <titre xml:lang>
        # Use self.titres to store the titles temporary.
        # self.title_* and self_trans_title* are set at the end of the concrete parse_tree
        self.titres = []

        self.pid = kwargs.get("pid")
        self.colid = kwargs["colid"]
        self.issue_id = kwargs["issue_id"]
        self.atype = "normal"

        if kwargs.get("doi") is not None:
            self.doi = clean_doi(kwargs["doi"])
            self.ids.append(("doi", self.doi))

        self.publishTeX = False
        self.tex_filename = None
        # 2023/12/05 <articletype> has been added. Ignore <article-types>
        self.has_articletype = False

        self.parse_tree(kwargs["tree"])
        self.post_parse_tree()

    def parse_tree(self, tree):
        """Read the identifiers and the pages of the article from the children of ``tree``.

        The other children are dispatched to the ``parse_<tag>`` method, if it exists.
        """
        super().parse_tree(tree)

        for node in tree:
            tag = normalize(node.tag)

            if tag == "idart":
                self.pid = node.text
            elif tag == "doi":
                self.doi = clean_doi(node.text)
                # TODO: Remove as ResourceId do not seem useful (needs to upate templates)
                value = ("doi", self.doi)
                if value not in self.ids:
                    self.ids.append(value)
            elif tag == "pagedeb":
                self.fpage = self.get_numeric_value(node)
            elif tag == "pagefin":
                self.lpage = self.get_numeric_value(node)
            elif tag == "ordreart":
                # Set article_number or talk_number
                # Side effect in Cedrics: set page-count (handled in post_parse_tree)
                if self.is_seminar:
                    self.talk_number = node.text
                else:
                    self.article_number = node.text
            elif tag == "msn-id":
                self.extids.append(("mr-item-id", node.text))
            elif tag == "zbl-id":
                self.extids.append(("zbl-item-id", node.text))

            # elif tag == 'pub-date':
            #     date_type = child.get('date-type') or 'pub'
            #     if date_type == 'pub':
            #         self.date_published_iso_8601_date_str = get_data_from_date(child)
            #     else:
            #         date_str = get_data_from_date(child)
            #         self.history_dates.append({'type': 'online', 'date': date_str})
            # elif tag == "history":
            #     self.history_dates += get_data_from_history(child)
            #     for date in self.history_dates:
            #         if date['type'] == 'prod-deployed-date':
            #             self.prod_deployed_date_iso_8601_date_str = date['date']

            else:
                # No debug output here: the parser must not write on stdout
                fct_name = "parse_" + tag.replace("-", "_")
                ftor = getattr(self, fct_name, None)
                if callable(ftor):
                    ftor(node)

    def parse_gestion(self, node):
        """Read the history dates and the publishTeX flag from the children of ``node``.

        The online date is skipped when ``ignore_date_published`` is set.
        """
        date_types = {
            "date_acceptation": "accepted",
            "date_reception": "received",
            "date_revision": "revised",
        }

        for child in node:
            tag = normalize(child.tag)

            if tag == "date_online" and not self.ignore_date_published:
                self.history_dates.append({"type": "online", "date": child.text})
            elif tag in date_types:
                self.history_dates.append({"type": date_types[tag], "date": child.text})
            elif tag == "publishTeX":
                self.publishTeX = child.text == "yes"

    def parse_production(self, node):
        """Read the publication date and the name of the TeX file from the children of ``node``.

        The publication date is skipped when ``ignore_date_published`` is set.
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "date_prod_PDF" and not self.ignore_date_published:
                self.date_published_iso_8601_date_str = child.text
            elif tag == "fichier_tex":
                self.tex_filename = child.text

    def parse_relations(self, node):
        """Add a relation to ``self.relations`` when the type of ``node`` is a known one.

        The Cedrics relation type is translated; the related id is the text of ``node``.
        """
        rel_type = get_normalized_attrib(node, "type") or ""
        id_value = node.text

        relations = {
            "corrige": "corrects",
            "estcorrige": "corrected-by",
            "complete": "complements",
            "estcomplete": "complemented-by",
            "suitede": "follows",
            "estsuivide": "followed-by",
            "pagesprec": "prev-pages",
            "pagessuiv": "next-pages",
            "solutionde": "resolves",
            "apoursolution": "resolved-by",
            "commente": "comments",
            "estcommente": "commented-by",
            "remplace": "replaces",
            "estremplace": "replaced-by",
        }

        if rel_type in relations:
            obj = Foo()
            obj.rel_type = relations[rel_type]
            obj.id_value = id_value

            self.relations.append(obj)

    def post_parse_tree(self):
        """Finish the parsing: compute the page count and add the PDF and TeX streams."""
        # Some values in Cedrics XMLs are not embedded in groups (ex: authors)
        # We need to wait at the end of the parsing to finish the job

        super().post_parse_tree()

        # Truthiness instead of len(): an empty <ordreart/> sets the number to None
        if self.talk_number or self.article_number:
            try:
                fpage_int = int(self.fpage)
                lpage_int = int(self.lpage)
                count_value = lpage_int - fpage_int + 1
                self.counts.append(("page-count", str(count_value)))
            except ValueError:
                # The pages are not numbers: no page count
                pass

        # The (data)streams of the article's PDF and TeX are added automatically
        # NOTE(review): the indentation of the source was lost. The TeX stream is assumed
        # to be nested in the same test as the PDF stream - confirm.
        if hasattr(self, "colid") and hasattr(self, "issue_id"):
            location = self.colid + "/" + self.issue_id + "/"
            if self.article_folder:
                location += self.article_folder + "/" + self.article_folder + ".pdf"
            else:
                location += self.pid + "/" + self.pid + ".pdf"

            data = {
                "rel": "full-text",
                "mimetype": "application/pdf",
                "location": location,
                "base": "",
                "text": "Full (PDF)",
            }
            self.streams.append(data)

            if self.publishTeX and self.tex_filename:
                location = self.colid + "/" + self.issue_id + "/"
                if self.article_folder:
                    location += self.article_folder + "/" + self.tex_filename + ".tex"
                else:
                    location += self.pid + "/src/tex/" + self.tex_filename + ".tex"

                data = {
                    "rel": "full-text",
                    "mimetype": "application/x-tex",
                    "location": location,
                    "base": "",
                    "text": "TeX source",
                }
                self.streams.append(data)

    def set_titles(self):
        """Set the title and translated title attributes from ``self.titres``."""
        for titre in self.titres:
            if titre["lang"] == self.lang or self.lang == "und":
                self.title_html = titre["title_html"]
                self.title_tex = titre["title_tex"]
                if len(titre["title_xml"]) > 0:
                    self.title_xml = (
                        '<article-title xml:space="preserve">'
                        + titre["title_xml"]
                        + "</article-title>"
                    )
            else:
                self.trans_title_html = titre["title_html"]
                self.trans_title_tex = titre["title_tex"]
                if len(titre["title_xml"]):
                    self.trans_title_xml = '<trans-title-group xml:lang="' + titre["lang"] + '">'
                    self.trans_title_xml += '<trans-title xml:space="preserve">'
                    self.trans_title_xml += (
                        titre["title_xml"] + "</trans-title></trans-title-group>"
                    )
                self.trans_lang = titre["lang"]

        if len(self.title_xml) > 0:
            self.title_xml = (
                "<title-group>" + self.title_xml + self.trans_title_xml + "</title-group>"
            )

1571 

1572 

1573class CedricsRef(RefBase, CedricsBase): 

def __init__(self, *args, **kwargs):
    """Initialize the reference and parse the node given as the ``tree`` keyword.

    The optional ``is_mixed_citation`` keyword (False by default) selects the parsing
    of a non structured citation.
    """
    super().__init__(*args, **kwargs)

    self.citation_xml = ""
    self.citation_html = ""
    self.citation_tex = ""
    self.REF_JEP_STYLE = getattr(settings, "REF_JEP_STYLE", False)

    self.is_mixed_citation = kwargs.get("is_mixed_citation", False)
    self.eprint_id = None
    self.archive_name = None
    self.has_doi = False

    # bediteur is not in the correct order. Store the xml temporarily
    self.editeur_citation_xml = ""

    self.parse_tree(kwargs["tree"])

1592 

def parse_address(self, node):
    """Store the space-normalized text of ``node`` as the publisher location."""
    location = normalize_space(get_text_from_node(node))
    self.publisher_loc = location
    self.citation_xml += "<publisher-loc>" + escape(self.publisher_loc) + "</publisher-loc>"

1596 

def parse_archive_name(self, node):
    """Store the text of ``node`` in lower case as the name of the archive."""
    # TODO 1 JEP ref has a formula in its archive-name (for biorxiv)
    #      It should be modified to use common names "biorxiv"
    name = node.text
    self.archive_name = name.lower()

1602 

def parse_article_id(self, node):
    """Record the text of ``node`` as an "eid" external id and add it to the citation XML."""
    eid = node.text
    self.extids.append(("eid", eid))

    pub_id_xml = '<pub-id pub-id-type="eid">' + escape(eid) + "</pub-id>"
    self.citation_xml += pub_id_xml

1608 

def parse_bauteur(self, node):
    """Parse an author of the reference and add its XML to the citation XML."""
    self.parse_auteur(node, is_ref=True)

    # The contributor that was just parsed is the last one
    contrib_xml = self.contributors[-1]["contrib_xml"]
    self.citation_xml += contrib_xml

1614 

def parse_bediteur(self, node):
    """Parse an editor of the reference and keep its XML aside in ``editeur_citation_xml``."""
    self.parse_common_contrib(node, "editor", is_ref=True)

    # The contributor that was just parsed is the last one
    contrib_xml = self.contributors[-1]["contrib_xml"]
    self.editeur_citation_xml += contrib_xml

1620 

def parse_bibitemdata(self, node):
    """Append the content of a <bibitemdata> node to the HTML, TeX and XML citations."""
    # TODO: Bug in Cedrics. if bibitemdata has no text between the nodes,
    #       the XML is pretty printed. But since space="preserve" is added on the fly on mixed-citation
    #       The \n and spaces should be preserved.
    #       This bug is ignored (JTNB_2014__26_3_757_0 [1])
    tex_node = node.getnext()
    html_value, tex_value, xml_value = self.parse_node_with_mixed_content(
        node, tex_node, is_bibitemdata=True
    )

    self.citation_html += html_value
    self.citation_tex += tex_value
    self.citation_xml += '<mixed-citation xml:space="preserve">' + xml_value + "</mixed-citation>"

1636 ) 

1637 

def parse_booktitle(self, node):
    """Store the TeX value of ``node`` as the source and add a <source> to the citation XML.

    Nothing is added to the citation XML when the XML value is empty.
    """
    tex_node = node.getnext()
    _, source_tex, source_xml = self.parse_node_with_mixed_content(
        node, tex_node, is_citation=True
    )

    self.source_tex = source_tex
    if source_xml == "":
        return
    self.citation_xml += '<source xml:space="preserve">' + source_xml + "</source>"

1647 

def parse_burl(self, node):
    """Add the XML of every <xref> child of ``node`` to the citation XML."""
    for child in node:
        if normalize(child.tag) != "xref":
            continue

        _, _, xml_text = self.parse_node_with_xref(
            child, None, keep_link=True, is_citation=True
        )
        self.citation_xml += xml_text

1658 

def parse_chapter(self, node):
    """Store the title of a chapter and add a <chapter-title> to the citation XML.

    :raises ValueError: if the type of the reference is not "inbook".
    """
    # TODO: Bug in Cedrics <chapter> for types other than inbook
    #       becomes a text outside tags (AIF_2017__67_1_237_0 [16], CML_2013__5_1)
    #       The info is not present in the PDF. It should not be in the Cedrics XML
    if self.type != "inbook":
        raise ValueError("<chapter> can be used only for an inbook")

    tex_node = node.getnext()
    _, chapter_tex, chapter_xml = self.parse_node_with_mixed_content(
        node, tex_node, is_citation=True
    )

    self.citation_xml += '<chapter-title xml:space="preserve">' + chapter_xml + "</chapter-title>"
    self.chapter_title_tex = chapter_tex

1675 

def parse_doi(self, node):
    """Validate the DOI of ``node``, record it and add a <pub-id> to the citation XML.

    :raises ValueError: if the DOI is empty, has "http" in it, differs from a DOI
                        already set on the reference, has a space in it or starts
                        with "doi:".
    """
    raw_doi = node.text

    if raw_doi is None:
        raise ValueError("a doi can not be empty")
    if "http" in raw_doi:
        raise ValueError(raw_doi, "should not have http in it")

    doi_value = clean_doi(raw_doi)
    if self.doi is not None and self.doi != doi_value:
        message = (
            "Multiple dois for the same ref " + self.label + ": " + self.doi + " and " + doi_value
        )
        raise ValueError(message)

    if self.doi is None:
        self.doi = doi_value
        self.extids.append(("doi", self.doi))

    self.has_doi = True

    # TODO: bug in Cedrics if the doi has a &nbsp; in it
    #       the doi and the burl might not match and the dx.doi.org is no longer filtered
    #       (bug²)
    # A doi should not have a space in it. raise an exception
    without_spaces = self.doi.strip().replace(chr(160), "")
    if without_spaces != self.doi:
        raise ValueError(self.doi, "has a space in it")

    if self.doi.lower().startswith("doi:"):
        raise ValueError('Remove "DOI:" in ' + self.doi)

    self.citation_xml += '<pub-id pub-id-type="doi">' + escape(raw_doi) + "</pub-id>"

1712 

def parse_edition(self, node):
    """Store the text of ``node`` in ``self.edition`` and add an <edition> to the citation XML."""
    # TODO: BUG in JATS (The edition is ignored in the HTML version)
    name = "edition"
    self.parse_node_common(node, name, name)

1716 

def parse_editor(self, node):
    """Parse an <editor> of the reference and add its XML to the citation XML."""
    # TODO: Bug in Cedrics <editeur> becomes a <string-name> and we lose the info author vs editor
    self.parse_auteur(node, is_ref=True)

    # The contributor that was just parsed is the last one
    contrib_xml = self.contributors[-1]["contrib_xml"]
    self.citation_xml += contrib_xml

1723 

def parse_eprint_id(self, node):
    """Keep the escaped text of ``node`` as the eprint id of the reference."""
    # Cannot add an ext_ids yet. Need to see if there's a archive-name
    escaped_id = escape(node.text)
    self.eprint_id = escaped_id

1727 

def parse_institution(self, node):
    """Store the text of ``node`` in ``self.institution`` and add an <institution> to the XML."""
    name = "institution"
    self.parse_node_common(node, name, name)

1730 

def parse_journal(self, node):
    """Store the title of the journal as the source and add a <source> to the citation XML.

    ``self.source_tex`` receives the HTML value of the title.
    Nothing is added to the citation XML when the XML value is empty.
    """
    tex_node = node.getnext()
    source_html, _, source_xml = self.parse_node_with_mixed_content(
        node, tex_node, is_citation=True
    )

    self.source_tex = source_html
    if len(source_xml) == 0:
        return
    self.citation_xml += '<source xml:space="preserve">' + source_xml + "</source>"

1740 

def parse_mixed_citation(self, node):
    """Parse the <reference> and <bibitemdata> children of a non structured citation.

    After a <reference>, a non-empty label followed by a space replaces the HTML and
    TeX citations.
    """
    for child in node:
        tag = normalize(child.tag)

        if tag == "bibitemdata":
            self.parse_bibitemdata(child)
        elif tag == "reference":
            self.parse_reference(child)
            if len(self.label) > 0:
                prefix = self.label + " "
                self.citation_html = prefix
                self.citation_tex = prefix

1751 

def parse_month(self, node):
    """Store the text of ``node`` in ``self.month`` and add a <month> to the citation XML."""
    # TODO: Bug in Cedrics. month is ignored in the PDF ? JEP_2019__6__737_0 [Hoe63]
    name = "month"
    self.parse_node_common(node, name, name)

1755 

def parse_msn_id(self, node):
    """Record the text of ``node`` as a "mr-item-id" external id and add it to the XML."""
    mr_id = node.text
    self.extids.append(("mr-item-id", mr_id))

    ext_link_xml = '<ext-link ext-link-type="mr-item-id">' + escape(node.text) + "</ext-link>"
    self.citation_xml += ext_link_xml

1761 

def parse_node_common(self, node, variable_name, jats_tag, **kwargs):
    """Store the text of ``node`` in an attribute and add it to the citation XML.

    :param node: XML node to read.
    :param variable_name: name of the attribute of ``self`` that receives the text.
    :param jats_tag: name of the tag added to ``self.citation_xml``.
    :param kwargs: ``keep_space``: when the keyword is present (whatever its value),
                   the spaces of the text are not normalized;
                   ``jats_params``: attributes added to the tag, if not empty.
    """
    text = get_text_from_node(node)
    if "keep_space" not in kwargs:
        text = normalize_space(text)
    setattr(self, variable_name, text)

    attributes = kwargs.get("jats_params", "")
    opening_tag = "<" + jats_tag
    if len(attributes) > 0:
        opening_tag += " " + attributes
    self.citation_xml += opening_tag

    self.citation_xml += ">" + escape(text) + "</" + jats_tag + ">"

1773 

def parse_note(self, node):
    """Store the HTML value of ``node`` as the comment and add a <comment> to the citation XML.

    Nothing is added to the citation XML when the HTML value is empty.
    """
    comment_html, _, comment_xml = self.parse_node_with_mixed_content(
        node, None, is_citation=True, is_comment=True, add_HTML_link=True, temp_math=True
    )

    self.comment = comment_html

    if len(comment_html) == 0:
        return
    self.citation_xml += '<comment xml:space="preserve">' + comment_xml + "</comment>"

1783 

def parse_number(self, node):
    """Store the text of ``node``, spaces kept, in ``self.issue`` and add an <issue> to the XML."""
    name = "issue"
    self.parse_node_common(node, name, name, keep_space=True)

1786 

def parse_pagedeb(self, node):
    """Store the text of ``node``, spaces kept, in ``self.fpage`` and add an <fpage> to the XML."""
    name = "fpage"
    self.parse_node_common(node, name, name, keep_space=True)

1789 

def parse_pagefin(self, node):
    """Store the text of ``node``, spaces kept, in ``self.lpage`` and add an <lpage> to the XML."""
    name = "lpage"
    self.parse_node_common(node, name, name, keep_space=True)

1792 

def parse_pages(self, node):
    """Parse a <pages> node, unless a first or a last page is already set.

    The value is a size (in pages) for a book or a thesis, a first page otherwise.
    """
    if len(self.fpage) != 0 or len(self.lpage) != 0:
        return

    if self.type == "book" or "thesis" in self.type:
        tag, params = "size", 'units="pages"'
    else:
        tag, params = "fpage", ""

    self.parse_node_common(node, tag, tag, jats_params=params)

1798 

def parse_page_total_number(self, node):
    """Store the text of ``node`` in ``self.size`` and add a <size units="pages"> to the XML."""
    name = "size"
    self.parse_node_common(node, name, name, jats_params='units="pages"')

1801 

def parse_publisher(self, node):
    """Store the space-normalized text of ``node`` as the name of the publisher."""
    name = normalize_space(get_text_from_node(node))
    self.publisher_name = name
    self.citation_xml += "<publisher-name>" + escape(self.publisher_name) + "</publisher-name>"

1805 

def parse_reference(self, node):
    """Set the label of the reference from ``node`` and add a <label> to the citation XML.

    The label is surrounded by [] unless it already starts with "[".
    The <label> holds the label with its brackets for a mixed citation, the text of
    the node otherwise. Nothing is added to the XML when the label is empty.
    """
    cedrics_label = get_text_from_node(node)

    needs_brackets = cedrics_label and cedrics_label[0] != "["
    self.label = "[" + cedrics_label + "]" if needs_brackets else cedrics_label

    if not self.label:
        return

    label_text = self.label if self.is_mixed_citation else cedrics_label
    self.citation_xml += "<label>" + escape(label_text) + "</label>"

1819 

def parse_series(self, node):
    """Store the text of ``node`` in ``self.series`` and add a <series> to the citation XML."""
    name = "series"
    self.parse_node_common(node, name, name)

1822 

def parse_structured_citation(self, node):
    """Build the <element-citation> of a structured reference from the children of ``node``.

    Each child is dispatched to its ``parse_<tag>`` method, if it exists. The editors
    are added at the end of the XML. The HTML and TeX citations are then computed
    with get_citation_html.
    """
    wrapper_tag_added = False
    eprint_done = False
    wrapper_tag = '<element-citation publication-type="' + self.type + '">'

    for child in node:
        tag = normalize(child.tag)

        # The <label> is outside the <element-citation> in JATS
        if tag != "reference" and not wrapper_tag_added:
            self.citation_xml += wrapper_tag
            wrapper_tag_added = True

        # The eprint needs both the eprint-id and the archive name
        if self.eprint_id is not None and tag not in ("archive-prefix", "archive-name"):
            self.post_parse_eprint()
            eprint_done = True

        # TODO: brevue bcoll bconference bseries btome... (util/bibitem.xsl)

        if tag in ["howpublished"]:
            self.parse_title(child)
        elif tag in ("institution", "organization", "school"):
            self.parse_institution(child)
        elif tag not in ("TeXtitle", "TeXbooktitle", "archive-prefix"):
            fct_name = "parse_" + tag.replace("-", "_")
            ftor = getattr(self, fct_name, None)
            if callable(ftor):
                ftor(child)

    if not wrapper_tag_added:
        # No child other than <reference>: open the wrapper anyway, so that
        # the closing tag added below does not make the XML malformed
        self.citation_xml += wrapper_tag

    if self.eprint_id is not None and not eprint_done:
        self.post_parse_eprint()

    # ptf-xsl puts the <bediteur> at the end in JATS
    if len(self.editeur_citation_xml) > 0:
        self.citation_xml += '<person-group person-group-type="editor">'
        self.citation_xml += self.editeur_citation_xml
        self.citation_xml += "</person-group>"

    self.citation_xml += "</element-citation>"

    text = get_citation_html(self)
    self.citation_html = self.citation_tex = text

1864 

def parse_title(self, node):
    """
    Parse a title node and store it as chapter title, source or article title.

    The node and its next sibling are handed to parse_node_with_mixed_content.
    The returned HTML is stored on the instance and the returned XML is
    wrapped in the matching JATS element:
    - <chapter-title> when self.type is "incollection"
    - <source> for book-like types, or when the node tag is "booktitle" or
      "howpublished"
    - <article-title> otherwise
    """
    source_types = (
        "book",
        "inbook",
        "unpublished",
        "phdthesis",
        "masterthesis",
        "mastersthesis",
        "manual",
        "techreport",
        "coursenotes",
        "proceedings",
    )

    html_text, _tex_text, xml_text = self.parse_node_with_mixed_content(
        node, node.getnext(), is_citation=True, add_ext_link=True
    )

    if self.type == "incollection":
        target = "chapter_title_tex"
        opening = '<chapter-title xml:space="preserve">'
        closing = "</chapter-title>"
    elif self.type in source_types or node.tag in ("booktitle", "howpublished"):
        target = "source_tex"
        opening = '<source xml:space="preserve">'
        closing = "</source>"
    else:
        target = "article_title_tex"
        opening = '<article-title xml:space="preserve">'
        closing = "</article-title>"

    # The *_tex attributes receive the HTML version of the title
    setattr(self, target, html_text)
    self.citation_xml += opening + xml_text + closing

1896 

def parse_tree(self, tree):
    """
    Entry point for a bibitem tree.

    Runs the parent class parse_tree, reads the "user-id" and "doctype"
    attributes, then parses the tree as a mixed or a structured citation
    depending on self.is_mixed_citation.
    A missing, empty or "none" doctype is stored as "misc".
    """
    super().parse_tree(tree)

    self.user_id = get_normalized_attrib(tree, "user-id") or ""

    doctype = get_normalized_attrib(tree, "doctype")
    self.type = "misc" if (not doctype or doctype == "none") else doctype

    parse_citation = (
        self.parse_mixed_citation if self.is_mixed_citation else self.parse_structured_citation
    )
    parse_citation(tree)

1909 

def parse_type(self, node):
    """
    Parse a <type> node into the citation annotation.

    The node and its next sibling are handed to parse_node_with_mixed_content
    with bug_cedrics=True. The returned TeX value is stored in
    self.annotation; the returned XML, when not empty, is appended to the
    citation XML inside <annotation><p>.
    """
    _html_value, tex_value, xml_value = self.parse_node_with_mixed_content(
        node, node.getnext(), bug_cedrics=True
    )

    self.annotation = tex_value

    if len(xml_value) == 0:
        return

    opening = '<annotation><p xml:space="preserve">'
    self.citation_xml += opening + xml_value + "</p></annotation>"

1922 

def parse_url_last_visited(self, node):
    """
    Append the access date of a URL as a JATS <date-in-citation>.

    The text of the node is written both in the iso-8601-date attribute and
    as the content of the element. Nothing is appended when the node has no
    text.
    """
    access_date = node.text
    if access_date is None:
        # Concatenating None would raise a TypeError
        return

    # The value ends up in an attribute and in element content: escape it in
    # both places (quotes included for the attribute) to keep the XML valid
    self.citation_xml += '<date-in-citation content-type="access-date" iso-8601-date="'
    self.citation_xml += html.escape(access_date, quote=True)
    self.citation_xml += '">' + html.escape(access_date, quote=False)
    self.citation_xml += "</date-in-citation>"

1928 

def parse_volume(self, node):
    """
    Store the space-normalized text of the node as the volume and append it,
    escaped, as a <volume> element. Nothing happens when the text is None or
    empty.
    """
    volume = normalize_space(get_text_from_node(node))

    if volume is None or len(volume) == 0:
        return

    self.volume = volume
    self.citation_xml += "<volume>" + escape(volume) + "</volume>"

1935 

def parse_year(self, node):
    """
    Delegate the <year> node to parse_node_common, passing "year" for both
    of its name arguments.
    """
    name = "year"
    self.parse_node_common(node, name, name)

1938 

def parse_zbl_id(self, node):
    """
    Record the text of the node as a "zbl-item-id" ext id and append the
    matching <ext-link> (escaped) to the citation XML.
    """
    zbl_id = node.text

    self.extids.append(("zbl-item-id", zbl_id))

    opening = '<ext-link ext-link-type="zbl-item-id">'
    self.citation_xml += opening + escape(zbl_id) + "</ext-link>"

1944 

def post_parse_eprint(self):
    """
    Emit the eprint collected while parsing a citation.

    Does nothing when self.eprint_id is None. Otherwise the archive name
    defaults to "arxiv", the (archive name, eprint id) pair is added to
    self.extids for the known archives, and a <pub-id> is appended to the
    citation XML.
    """
    if self.eprint_id is None:
        return

    if self.archive_name is None:
        # Assumption made by the XSLT transform
        self.archive_name = "arxiv"

    if self.archive_name in ("arxiv", "tel", "hal", "theses.fr"):
        # The Cedrics archive-prefix is ignored (the URL could change over time)
        self.extids.append((self.archive_name, self.eprint_id))

    # NOTE(review): the nesting was reconstructed from a listing without
    # indentation; confirm that the <pub-id> must also be written for archive
    # names outside the list above
    pieces = ('<pub-id pub-id-type="', self.archive_name, '">', self.eprint_id, "</pub-id>")
    self.citation_xml += "".join(pieces)

1962 

1963 def split_label(self): 

1964 """ 

1965 Used when sorting non-digit bibitems 

1966 """ 

1967 label = self.label.lower() 

1968 # CRAS <reference> do not allow a simple sort (?!?) 

1969 # labels with "XXX et al." need to be put after "XXX" 

1970 label = label.replace(" et al.", "ZZZ").replace(" et al.", "ZZZ") 

1971 if len(label) > 1: 

1972 label = label[1:-1] 

1973 

1974 if label.isdigit(): 

1975 self.label_prefix = label 

1976 else: 

1977 try: 

1978 self.label_prefix, self.label_suffix = re.split(r"[\d]+", label) 

1979 except ValueError: 

1980 # Special case where label is similar as "Sma" instead of "Sma15" 

1981 self.label_prefix, self.label_suffix = [label, ""]