Coverage for src/ptf/cmds/xml/cedrics/cedrics

1##################################################################################################

3# README

5# cedrics_parser.py is the equivalent of jats_parser for Cedrics XML

7# Bugs fixed:

8# - <xref> with url in "dx.doi.org" were filtered in ptf-xsl

9# - Unstructured references (bibitemdata in Cedrics) got ext_links only if the <xref> node

10# was a direct child of the <bibitemdata> node

11# - comments that started with ' ' were ignored (AIF_2008__58_2_689_0 [9])

12#

13##################################################################################################

15import html

16import re

17from operator import attrgetter

19from django.conf import settings

20from django.utils import timezone

22from ptf.cmds.xml.citation_html import get_citation_html

23from ptf.cmds.xml.xml_base import RefBase

24from ptf.cmds.xml.xml_base import XmlParserBase

25from ptf.cmds.xml.xml_utils import clean_doi

26from ptf.cmds.xml.xml_utils import escape

27from ptf.cmds.xml.xml_utils import fix_mfenced_in_mathml

28from ptf.cmds.xml.xml_utils import get_contrib_xml

29from ptf.cmds.xml.xml_utils import get_normalized_attrib

30from ptf.cmds.xml.xml_utils import get_text_from_node

31from ptf.cmds.xml.xml_utils import get_xml_from_node

32from ptf.cmds.xml.xml_utils import helper_update_name_params

33from ptf.cmds.xml.xml_utils import int_to_Roman

34from ptf.cmds.xml.xml_utils import make_links_clickable

35from ptf.cmds.xml.xml_utils import normalize

36from ptf.cmds.xml.xml_utils import normalize_space

37from ptf.cmds.xml.xml_utils import replace_html_entities

38from ptf.cmds.xml.xml_utils import split_kwds

39from ptf.model_data import ArticleData

40from ptf.model_data import Foo

41from ptf.model_data import IssueData

42from ptf.model_data import JournalData

43from ptf.model_data import PublisherData

44from ptf.model_data import create_contributor

def helper_add_link_from_node(node):
    """
    Build the display text of a link node.

    The node's tag selects a module-level ``get_data_from_<tag>`` function
    (dashes become underscores) which returns the link data.

    :param node: XML node describing a link
    :return: the node's own text when the link data has a "rel", an empty
        string for www.numdam.org locations, a clickable link otherwise
    """
    fallback = node.text or ""
    extractor = globals()["get_data_from_" + normalize(node.tag).replace("-", "_")]
    link = extractor(node)

    if link["rel"]:
        return fallback

    target = link["location"]
    if "www.numdam.org" in target:
        return ""
    return make_links_clickable(target, link["metadata"])

def get_data_from_custom_meta(node):
    """
    Read the name/value pair stored in a custom-meta node.

    :param node: node whose children may include <meta-name> and <meta-value>
    :return: (name, value); an item stays "" when its child is absent, and
        the last matching child wins
    """
    found = {"meta-name": "", "meta-value": ""}

    for entry in node:
        key = normalize(entry.tag)
        if key in found:
            found[key] = entry.text

    return found["meta-name"], found["meta-value"]

def get_data_from_date(node):
    """
    Return the date carried by a date node as a string.

    The "iso-8601-date" attribute is returned as is when present. Otherwise
    the text of the <year>, <month> and <day> children is joined with "-";
    month and day are only appended to a non-empty value.

    :param node: date node
    :return: the date string
    """
    if "iso-8601-date" in node.attrib:
        return node.attrib["iso-8601-date"]

    parts = {"year": "", "month": "", "day": ""}
    for field in node:
        name = normalize(field.tag)
        if name in parts:
            parts[name] = field.text

    stamp = parts["year"]
    for extra in (parts["month"], parts["day"]):
        if stamp and extra:
            stamp += "-" + extra

    return stamp

def get_data_from_ext_link(node):
    """
    Describe an <ext-link> node as a link dictionary.

    :param node: ext-link node
    :return: dict with "rel" (the ext-link-type attribute), "mimetype",
        "location" (href), "base" and "metadata" (the node text); missing
        values are empty strings
    """
    return {
        "rel": node.get("ext-link-type") or "",
        "mimetype": "",
        "location": get_normalized_attrib(node, "href") or "",
        "base": get_normalized_attrib(node, "base") or "",
        "metadata": node.text or "",
    }

114

115

def get_data_from_history(node):
    """
    Collect the typed dates listed under a history node.

    Children without a "date-type" attribute are skipped.

    :param node: history node
    :return: list of {"type": ..., "date": ...} dicts, in document order
    """
    # TODO: transform history_dates in a hash where date-type is the key
    # => Change database_cmds
    return [
        {"type": entry.attrib["date-type"], "date": get_data_from_date(entry)}
        for entry in node
        if "date-type" in entry.attrib
    ]

126

127

def get_data_from_uri(node):
    """
    Describe a <uri> node as a link dictionary.

    :param node: uri node
    :return: dict whose "location" is the href attribute and whose
        "metadata" is the node text; the other entries are empty strings
    """
    return {
        "rel": "",
        "mimetype": "",
        "location": get_normalized_attrib(node, "href") or "",
        "base": "",
        "metadata": node.text or "",
    }

136

137

138class CedricsBase(XmlParserBase):

def __init__(self, *args, **kwargs):
    """
    Initialise the parser with an empty warnings list.

    The positional and keyword arguments are accepted but not forwarded
    to the parent constructor.
    """
    super().__init__()
    self.warnings = []

142

def parse_tree(self, tree):
    """Do nothing here; this base implementation ignores *tree*."""

145

def set_titles(self):
    """Do nothing here; called by post_parse_tree()."""

148

def post_parse_tree(self):
    """Finalise parsing: only the titles are set at this level."""
    self.set_titles()

151

def filter_text(self, text):
    """
    Remove every "<allowbreak/>" marker from a string.

    :param text: string to clean
    :return: the string without the markers
    """
    return "".join(text.split("<allowbreak/>"))

155

def get_location_from_xref(self, node, **kwargs):
    """
    Return the target of an <xref>.

    :param node: xref node
    :param kwargs: unused
    :return: the "url" attribute, or the filtered text of the node when the
        attribute is missing or empty
    """
    url = get_normalized_attrib(node, "url") or ""
    if url:
        return url

    return self.filter_text(get_text_from_node(node))

164

def get_data_from_xref(self, node, **kwargs):
    """
    Describe an <xref> node as a link dictionary.

    :param node: xref node
    :param kwargs: "is_comment" and "is_bibitemdata" are read here; everything
        is forwarded to parse_node_inner
    :return: dict with "rel", "mimetype", "location", "base", "metadata",
        "xml_text" and, inside a bibitemdata, "bibitemdata_display"
    """
    target = get_normalized_attrib(node, "url") or ""

    # TODO: BUG in JATS. JEP_2017__4__435_0 [9]
    # The comment has an ext-link with a display embedded in <monospace>
    # jats_parser produces 2 <a> (1 for the <ext-link>, 1 for the text inside the <monospace>
    # The code below should be removed
    if kwargs.get("is_comment") and node.text is None:
        kwargs["add_HTML_link"] = True

    rendered, _, xml_text = self.parse_node_inner(node, None, **kwargs)
    in_bibitemdata = kwargs.get("is_bibitemdata", False)

    if target == "":
        # No url attribute: the text of the node is the location.
        target = self.filter_text(get_text_from_node(node))

    # Inside a bibitemdata, an xref without direct text gets empty metadata.
    hide_metadata = in_bibitemdata and node.text is None

    link = {
        "rel": "",
        "mimetype": "",
        "location": target,
        "base": "",
        "metadata": "" if hide_metadata else rendered,
        "xml_text": xml_text,
    }
    if in_bibitemdata:
        link["bibitemdata_display"] = rendered

    return link

204

def get_numeric_value(self, node):
    """
    Return the text of a numbering node.

    :param node: node with an optional "systnum" attribute
    :return: the node text, converted with int_to_Roman when systnum is
        "romain" (case-insensitive)
    """
    numbering = (node.get("systnum") or "").lower()
    if numbering != "romain":
        return node.text

    return int_to_Roman(int(node.text))

213

def parse_node_inner(self, node, tex_node, **kwargs):
    """
    Parse the content of a node: its leading text, then each child.

    Used by parse_node_with_mixed_content and by the parse_node_with_<tag>
    methods. Children are parsed with "is_top" set to False.

    :param node: XML node
    :param tex_node: node holding the TeX counterpart (children are paired by
        position), or None
    :param kwargs: parsing options; "add_HTML_link" turns the resulting HTML
        into a clickable link unless it starts with newlines or spaces
    :return: inner HTML text, inner TeX text, inner JATS XML text
    """
    kwargs["is_top"] = False
    html_parts, tex_parts, xml_parts = [], [], []

    leading = node.text
    if leading:
        # The first newline of a <list> or <item> is dropped.
        if leading.startswith("\n") and node.tag in ("list", "item"):
            leading = leading[1:]

        xml_parts.append(escape(leading))
        html_parts.append(leading)
        tex_parts.append(leading)

    tex_count = 0 if tex_node is None else len(tex_node)
    for position, child in enumerate(node):
        tex_child = tex_node[position] if position < tex_count else None

        child_html, child_tex, child_xml = self.parse_node_with_mixed_content(
            child, tex_child, **kwargs
        )
        html_parts.append(child_html)
        tex_parts.append(child_tex)
        xml_parts.append(child_xml)

    inner_html = "".join(html_parts)
    if kwargs.get("add_HTML_link") and not re.match(r"[\n ]+", inner_html):
        inner_html = make_links_clickable(inner_html, inner_html)

    return inner_html, "".join(tex_parts), "".join(xml_parts)

254

def parse_node_with_b(self, node, tex_node, **kwargs):
    """
    Render bold content.

    :return: HTML and TeX wrapped in <strong>, JATS XML wrapped in <bold>
        (a self-closed <bold/> when there is no content)
    """
    body_html, body_tex, body_xml = self.parse_node_inner(node, tex_node, **kwargs)

    xml_text = "<bold>" + body_xml + "</bold>" if body_xml else "<bold/>"

    return (
        "<strong>" + body_html + "</strong>",
        "<strong>" + body_tex + "</strong>",
        xml_text,
    )

268

def parse_node_with_cit(self, node, tex_node, **kwargs):
    """
    Render a <cit> node as plain text.

    :return: the node text for HTML and TeX, the escaped text for JATS XML
    """
    plain = get_text_from_node(node)

    return plain, plain, escape(plain)

274

def parse_node_with_hi(self, node, tex_node, **kwargs):
    """
    Render a <hi> node according to its "rend" attribute.

    "it" and "bold" map to the italic and bold renderers; any other value is
    dispatched to parse_node_with_<rend> (dashes become underscores) when such
    a method exists. Everything else is rendered as plain inner content.

    :param node: hi node
    :param tex_node: node holding the TeX counterpart, or None
    :param kwargs: parsing options, forwarded unchanged
    :return: html_text, tex_text, xml_text
    """
    rend = node.get("rend")

    if rend == "it":
        return self.parse_node_with_i(node, tex_node, **kwargs)
    if rend == "bold":
        return self.parse_node_with_b(node, tex_node, **kwargs)

    # A missing rend attribute used to crash on None.replace(), and rend="hi"
    # would dispatch back to this method forever: both are plain content now.
    if rend and rend != "hi":
        handler = getattr(self, "parse_node_with_" + rend.replace("-", "_"), None)
        if callable(handler):
            return handler(node, tex_node, **kwargs)

    return self.parse_node_inner(node, tex_node, **kwargs)

289

def parse_node_with_i(self, node, tex_node, **kwargs):
    """
    Render italic content.

    :param kwargs: "is_citation", "is_bibitemdata" and "is_comment" decide
        whether the HTML gets the italic <span>
    :return: HTML (wrapped in <span class="italique"> unless it is empty or
        belongs to a citation that is neither a bibitemdata nor a comment),
        TeX wrapped in <i>, JATS XML wrapped in <italic>
    """
    # TODO: BUG in JATS: unlike <monospace>, no HTML links are added in italics
    kwargs["add_HTML_link"] = False

    body_html, body_tex, body_xml = self.parse_node_inner(node, tex_node, **kwargs)

    plain_citation = (
        kwargs.get("is_citation", False)
        and not kwargs.get("is_bibitemdata", False)
        and not kwargs.get("is_comment", False)
    )

    if body_html == "" or plain_citation:
        html_text = body_html
    else:
        html_text = '<span class="italique">' + body_html + "</span>"

    xml_text = "<italic>" + body_xml + "</italic>" if body_xml else "<italic/>"

    return html_text, f"<i>{body_tex}</i>", xml_text

315

def parse_node_with_label(self, node, tex_node, **kwargs):
    """
    Remember the text of a <label> for the list item being parsed.

    The label itself produces no output; it is stored in
    self.list_item_label.

    :return: three empty strings
    """
    self.list_item_label = get_text_from_node(node)

    return "", "", ""

322

def parse_node_with_large(self, node, tex_node, **kwargs):
    """
    Render <large> content.

    :return: inner HTML and TeX unchanged, JATS XML wrapped in <large>
    """
    html_text, tex_text, body_xml = self.parse_node_inner(node, tex_node, **kwargs)

    return html_text, tex_text, "<large>" + body_xml + "</large>"

331

def parse_node_with_list(self, node, tex_node, **kwargs):
    """
    Render a <list> node.

    The "type" attribute selects the HTML container: <ul> for no type,
    "bullet" or "simple"; a typed <ol> for the ordered types; a bullet-less
    <ul> for anything else.

    :return: HTML and TeX wrapped in the HTML container, JATS XML wrapped in
        <list> (with a list-type attribute when a type is given)
    """
    self.list_item_label = None

    body_html, body_tex, body_xml = self.parse_node_inner(node, tex_node, **kwargs)

    list_type = node.get("type")

    opening = "<list>" if list_type is None else '<list list-type="' + list_type + '">'
    xml_text = opening + body_xml + "</list>"

    ordered_prefixes = {
        "order": '<ol type="1">',
        "number": '<ol type="1">',
        "alpha-lower": '<ol type="a">',
        "alpha-upper": '<ol type="A">',
        "roman-lower": '<ol type="i">',
        "roman-upper": '<ol type="I">',
    }

    if list_type is None or list_type in ("bullet", "simple"):
        prefix, suffix = "<ul>", "</ul>"
    elif list_type in ordered_prefixes:
        prefix, suffix = ordered_prefixes[list_type], "</ol>"
    else:
        prefix, suffix = '<ul class="no-bullet" style="list-style-type:none;">', "</ul>"

    return prefix + body_html + suffix, prefix + body_tex + suffix, xml_text

372

def parse_node_with_item(self, node, tex_node, **kwargs):
    """
    Render a list item.

    <list-item><label>LABEL</label><p>TEXT</p> becomes in HTML
    <li>LABEL TEXT</li>
    (same with <title>)

    The label comes from self.list_item_label, or from the "label" attribute
    of the node when none was recorded; the recorded label is then reset.

    :param node: item node
    :return: html_text, tex_text, xml_text
    """
    label = self.list_item_label or node.get("label") or ""
    self.list_item_label = None

    # <p> children must not produce their own HTML paragraph inside the <li>.
    kwargs["no_p"] = True
    body_html, body_tex, body_xml = self.parse_node_inner(node, tex_node, **kwargs)

    xml_label = "<label>" + label + "</label>" if label else ""
    xml_text = "<list-item>" + xml_label + body_xml + "</list-item>"

    opening = "<li>" + (label + " " if label else "")

    return opening + body_html + "</li>", opening + body_tex + "</li>", xml_text

408

def parse_node_with_formula(self, node, tex_node, **kwargs):
    """
    Convert a formula node into its HTML, TeX and JATS XML strings.

    :param node: formula node; its "type" attribute tells inline from display
        formulas and its first child is taken as the math node
    :param tex_node: node holding the TeX source of the formula, or None
    :param kwargs: "is_citation" is required; "bug_cedrics" is optional
    :return: html_text, tex_text, jats_xml_text
    """
    # NOTE(review): the nesting below was rebuilt from a listing that had lost
    # its indentation; the statements flagged further down should be checked
    # against the original file.

    # '\n' are added in this function, because the Cedrics -> XML transformation
    # does not add xml:space="preserve" with formulas (?)
    # An abstract with <p> and <formula> will have mix of "preserve".

    html_text = tex_text = jats_xml_text = ""
    # NOTE(review): a missing "type" attribute raises KeyError here; the
    # "inline" fallback only covers an empty value.
    type_ = node.attrib["type"] or "inline"
    tex_type = tex_node.attrib["textype"] if tex_node is not None else "inline"

    math_node = node[0]
    math_node_text = get_xml_from_node(math_node)
    math_node_text = replace_html_entities(math_node_text)
    # The Cedrics Mathml transform rounds up the width value
    math_node_text = math_node_text.replace(".em", "em")
    math_node_text = math_node_text.replace(".pt", "pt")

    # TeX delimiters: $...$ inline, \[...\] for display formulas, or a named
    # environment when the TeX node declares another textype.
    tex_prefix = tex_suffix = "$"
    if type_ != "inline":
        tex_prefix = "\n\\["
        tex_suffix = "\\]\n"
        # NOTE(review): assumed to be nested under the display branch - confirm.
        if tex_node is not None and tex_type not in ("inline", "display"):
            tex_prefix = "\n\\begin{" + tex_type + "}\n"
            tex_suffix = "\n\\end{" + tex_type + "}\n"

    math_node_text = fix_mfenced_in_mathml(math_node_text)

    if not kwargs["is_citation"]:
        math_node_text = math_node_text.replace(
            ' xmlns:xlink="http://www.w3.org/1999/xlink"', ""
        )
        # NOTE(review): assumed to belong to the non-citation branch - confirm.
        math_node_text = math_node_text.replace('mode="display"', 'display="block"')

    if tex_node is None:
        # TODO: BUG in JATS. No need for a '$$' in the title if there is no tex formula
        # The '$$' at the end of the next line is to be compatible with jats_parser

        if type_ == "inline":
            tex_node_text = "$$"
        else:
            tex_node_text = ""
    else:
        tex_node_text = tex_prefix + tex_node.text + tex_suffix

    # JATS: <inline-formula> or <disp-formula>, holding <alternatives> with the
    # MathML followed by the escaped TeX in <tex-math>.
    if type_ == "inline":
        jats_xml_text = "<inline-formula>"
    else:
        jats_xml_text = '<disp-formula xml:space="preserve">\n'

    jats_xml_text += "<alternatives>" + math_node_text
    jats_tex_text = escape(tex_node_text)

    if type_ != "inline":
        jats_xml_text += "\n"

    jats_xml_text += "<tex-math>" + jats_tex_text + "</tex-math>"

    if type_ != "inline":
        jats_xml_text += "\n"

    jats_xml_text += "</alternatives>"

    if type_ == "inline":
        jats_xml_text += "</inline-formula>"
    else:
        jats_xml_text += "\n</disp-formula>"
        # NOTE(review): assumed to apply to display formulas only - confirm.
        node.tail = ""

    if "bug_cedrics" in kwargs and kwargs["bug_cedrics"]:
        # TODO: Bug in Cedrics. AIF_2012__62_6_2053_0 [16]
        # If there is no texmath, a <tex-math>$$</tex-math> is added and
        # get_text_from_node appends the 2.
        tex_text = get_text_from_node(node)
        if tex_node is None:
            tex_text += "$$"
    # NOTE(review): this else is assumed to pair with the "bug_cedrics" test.
    else:
        tex_text = tex_node_text

    # The data-tex attribute of display formulas holds the TeX without newlines.
    data_tex = tex_node_text if type_ == "inline" else tex_node_text.replace("\n", "")
    html_text = f'<span class="mathjax-formula" data-tex="{data_tex}">{math_node_text}</span>'

    if type_ != "inline":
        # Display formulas sit in a table with an (empty) label cell.
        prefix = '<table class="formula"><tr><td class="formula-inner">'
        suffix = '</td><td class="formula-label"></td></tr></table>'

        html_text = prefix + html_text + suffix

    # tex_text = escape(tex_text)

    return html_text, tex_text, jats_xml_text

498

def parse_node_with_mixed_content(self, node, tex_node, **kwargs):
    """
    Parse an XML node that mixes text and XML sub-nodes.
    Ex: <node>text1 <a>text_a</a> text2 <b>text_b</b>b_tail</node>
    Some inner nodes are removed, others are kept or replaced.

    Cedrics XMLs store the MathML and the TeX formulas in 2 siblings, which
    are parsed together. The JATS XML string is built in the same pass
    because it is used during a PTF export.

    A tag is rendered by the parse_node_with_<tag> method when it exists
    (after applying the tag mapping below), by parse_node_inner otherwise.

    :param node: XML node (with MathML)
    :param tex_node: XML node (with TeX), or None
    :param kwargs: parsing options; missing ones get their default value here
    :return: HTML text, TeX text, JATS XML text
    """
    html_text = tex_text = jats_xml_text = ""

    if node is None:
        return html_text, tex_text, jats_xml_text

    # Found 1 exception with <title>Дополнение к работе (AIF_2013__63_4)
    # The XML parser creates a different node with no tag for " "
    if type(node).__name__ != "_Element":
        html_text = tex_text = jats_xml_text = html.unescape(node.text)
        if node.tail and not kwargs["is_top"]:
            html_text += node.tail
            tex_text += node.tail
            jats_xml_text += escape(node.tail)
        return html_text, tex_text, jats_xml_text

    option_defaults = {
        # The tail (text following the end of the node) is only included when
        # this method is called recursively, i.e. when is_top is False.
        "is_top": True,
        # Used to add <h1>, <h2>,... in the HTML text while parsing nodes like <sec>
        "sec_level": 2,
        # Text in <comment> is parsed to add HTML links.
        "add_HTML_link": False,
        # base_url to image links
        "base_url": "",
        "is_citation": False,
        "is_comment": False,
        # TODO remove once jats_parser has been validated against xmldata
        "temp_math": False,
        "temp_tex": False,
        "temp_mixed_citation": False,
    }
    for option, fallback in option_defaults.items():
        kwargs.setdefault(option, fallback)

    tag = normalize(node.tag)

    # pub-id/object-id are ignored by default as they are treated separately
    if not kwargs["is_comment"] and tag in ("pub-id", "object-id"):
        print(tag, "in", jats_xml_text)
        return html_text, tex_text, jats_xml_text

    if tag in ("bibitemdata", "toc"):
        kwargs["is_citation"] = True
        kwargs["temp_mixed_citation"] = True
    elif tag == "comment":
        kwargs["is_comment"] = True

    # Tags rendered by the method of another tag.
    tag_mapped = {
        "statement": "sec",
        "disp-formula": "inline-formula",
        "chapter-title": "article-title",
        "bold": "strong",
        "table": "table-generic",
        "th": "table-generic",
        "tr": "table-generic",
        "td": "table-generic",
        "thead": "table-generic",
        "tbody": "table-generic",
        "colgroup": "table-generic",
        "col": "table-generic",
        "em": "i",
    }

    # I. Content of the node: dedicated method when there is one, generic
    # text + children parsing otherwise.
    # TODO Add HTML links to the text with URLs (<ext-link>, <uri>)
    handler_suffix = tag_mapped.get(tag, tag).replace("-", "_")
    handler = getattr(self, "parse_node_with_" + handler_suffix, None)
    if not callable(handler):
        handler = self.parse_node_inner

    inner_html_text, inner_tex_text, inner_jats_xml_text = handler(node, tex_node, **kwargs)

    html_text += inner_html_text
    tex_text += inner_tex_text
    jats_xml_text += inner_jats_xml_text

    # II. Tail of the node, for children only
    if node.tail and not kwargs["is_top"] and tag not in ("p", "list", "item", "label"):
        html_text += node.tail
        tex_text += node.tail
        jats_xml_text += escape(node.tail)

    return html_text, tex_text, jats_xml_text

633

def parse_node_with_p(self, node, tex_node, **kwargs):
    """
    Render a paragraph.

    :param kwargs: with "no_p" set, the HTML is not wrapped in <p>
    :return: HTML (wrapped in <p>, with the "specific-use" attribute as CSS
        class when present), bare inner TeX, JATS XML wrapped in
        <p xml:space="preserve">
    """
    body_html, body_tex, body_xml = self.parse_node_inner(node, tex_node, **kwargs)

    if kwargs.get("no_p"):
        # <p> inside <item> are removed in HTML to avoid a carriage return
        html_text = body_html
    else:
        css_class = node.get("specific-use")
        opening = '<p class="' + css_class + '">' if css_class else "<p>"
        html_text = opening + body_html + "</p>"

    if body_xml:
        xml_text = '<p xml:space="preserve">' + body_xml + "</p>"
    else:
        xml_text = '<p xml:space="preserve"/>'

    # TODO: BUG in JATS (no <p> in the tex version)
    return html_text, body_tex, xml_text

658

def parse_node_with_ref(self, node, tex_node, **kwargs):
    """
    Render a <ref> node: it only exists in the JATS XML output.

    :return: empty HTML, empty TeX, and an <xref ref-type="bibr"> holding the
        escaped text of the node
    """
    return "", "", '<xref ref-type="bibr">' + escape(node.text) + "</xref>"

667

def parse_node_with_sansserif(self, node, tex_node, **kwargs):
    """
    Render sans-serif content.

    :return: inner HTML and TeX unchanged, JATS XML wrapped in <sans-serif>
    """
    html_text, tex_text, body_xml = self.parse_node_inner(node, tex_node, **kwargs)

    return html_text, tex_text, "<sans-serif>" + body_xml + "</sans-serif>"

676

def parse_node_with_sc(self, node, tex_node, **kwargs):
    """
    Render small-caps content.

    :return: HTML and TeX wrapped in <span class="smallcaps">, JATS XML
        wrapped in <sc> (a self-closed <sc/> when there is no content)
    """
    body_html, body_tex, body_xml = self.parse_node_inner(node, tex_node, **kwargs)

    span_open = '<span class="smallcaps">'
    xml_text = "<sc>" + body_xml + "</sc>" if body_xml else "<sc/>"

    return span_open + body_html + "</span>", span_open + body_tex + "</span>", xml_text

691

def parse_node_with_slanted(self, node, tex_node, **kwargs):
    """
    Render slanted content.

    :return: inner HTML and TeX unchanged, JATS XML wrapped in <slanted>
    """
    html_text, tex_text, body_xml = self.parse_node_inner(node, tex_node, **kwargs)

    return html_text, tex_text, "<slanted>" + body_xml + "</slanted>"

700

def parse_node_with_small(self, node, tex_node, **kwargs):
    """
    Render <small> content.

    :return: inner HTML and TeX unchanged, JATS XML wrapped in <small>
    """
    html_text, tex_text, body_xml = self.parse_node_inner(node, tex_node, **kwargs)

    return html_text, tex_text, "<small>" + body_xml + "</small>"

709

def parse_node_with_sub(self, node, tex_node, **kwargs):
    """
    Render subscript content.

    :return: HTML, TeX and JATS XML, each wrapped in <sub>
    """
    inner_texts = self.parse_node_inner(node, tex_node, **kwargs)
    html_text, tex_text, xml_text = ("<sub>" + inner + "</sub>" for inner in inner_texts)

    return html_text, tex_text, xml_text

720

def parse_node_with_sup(self, node, tex_node, **kwargs):
    """
    Render superscript content.

    :return: HTML, TeX and JATS XML, each wrapped in <sup>
    """
    inner_texts = self.parse_node_inner(node, tex_node, **kwargs)
    html_text, tex_text, xml_text = ("<sup>" + inner + "</sup>" for inner in inner_texts)

    return html_text, tex_text, xml_text

731

def parse_node_with_texmath(self, node, tex_node, **kwargs):
    """
    Render a <texmath> node: it only exists in the TeX output.

    :return: empty HTML, the node text between "$" signs, empty XML
    """
    return "", "$" + get_text_from_node(node) + "$", ""

738

def parse_node_with_tt(self, node, tex_node, **kwargs):
    """
    Render teletype content.

    :return: inner HTML and TeX unchanged, JATS XML wrapped in <monospace>
        (a self-closed <monospace/> when there is no content)
    """
    html_text, tex_text, body_xml = self.parse_node_inner(node, tex_node, **kwargs)

    xml_text = "<monospace>" + body_xml + "</monospace>" if body_xml else "<monospace/>"

    return html_text, tex_text, xml_text

750

def parse_node_with_underline(self, node, tex_node, **kwargs):
    """
    Render underlined content.

    :return: inner HTML and TeX unchanged, JATS XML wrapped in <underline>
    """
    html_text, tex_text, body_xml = self.parse_node_inner(node, tex_node, **kwargs)

    return html_text, tex_text, "<underline>" + body_xml + "</underline>"

759

def parse_node_with_xref(self, node, tex_node, **kwargs):
    """
    Parse an xref.
    Extract extids (doi, mr-item-id,...) and ext_links

    Outside comments, and unless "add_ext_link" is False, the link is offered
    to add_extids_from_node_with_link; when no extid comes out of it, the
    link is recorded once in self.ext_links.

    :param node: xref node
    :param tex_node: unused, the content is parsed without TeX counterpart
    :param kwargs: parsing options ("add_ext_link", "is_comment")
    :return: html_text, tex_text, xml_text
    """
    location = self.get_location_from_xref(node)

    kwargs["add_HTML_link"] = False
    inner_html, inner_tex, inner_xml = self.parse_node_inner(node, None, **kwargs)

    html_text = make_links_clickable(location, inner_html)
    tex_text = make_links_clickable(location, inner_tex)

    # No ext-links is added while parsing titles or abstracts, nor in comments
    register_link = kwargs.get("add_ext_link", True) and not kwargs.get("is_comment")

    xref_data = {
        "rel": "",
        "mimetype": "",
        "location": location,
        "base": "",
        "metadata": inner_html,
    }

    if register_link:
        extid_value = self.add_extids_from_node_with_link(xref_data)
    else:
        extid_value = (None, None)

    # The JATS XML always keeps the <ext-link>, even when an extid was found.
    xml_text = '<ext-link xlink:href="' + html.escape(location) + '">' + inner_xml + "</ext-link>"

    if register_link and extid_value[0] is None and xref_data not in self.ext_links:
        self.ext_links.append(xref_data)

    return html_text, tex_text, xml_text

812

def parse_article_subject(self, node):
    """
    Add the comma-separated subjects of a node to self.subjs.

    Each subject is stripped of its leading whitespace and stored with the
    language of the node (self.lang by default).

    :param node: article-subject node
    """
    lang = get_normalized_attrib(node, "lang") or self.lang

    self.subjs.extend(
        [
            {"type": "subject", "lang": lang, "value": piece.lstrip()}
            for piece in node.text.split(",")
        ]
    )

820

def parse_article_subjects(self, node):
    """
    Parse every <article-subject> child of a node.

    :param node: container of article-subject nodes
    """
    for entry in node:
        if normalize(entry.tag) == "article-subject":
            self.parse_article_subject(entry)

827

def parse_article_type(self, node):
    """
    Add the text of a node to self.subjs as a "type" subject.

    :param node: article-type node; its language defaults to self.lang
    """
    lang = get_normalized_attrib(node, "lang") or self.lang

    self.subjs.append({"type": "type", "lang": lang, "value": node.text})

835

def parse_article_types(self, node):
    """
    Parse every <article-type> child of a node.

    Nothing is done once an <articletype> has been parsed.

    :param node: container of article-type nodes
    """
    # 2023/12/05 <articletype> has been added to store the type
    if self.has_articletype:
        return

    for entry in node:
        if normalize(entry.tag) == "article-type":
            self.parse_article_type(entry)

846

def parse_articletype(self, node):
    """
    Store the text of an <articletype> node as the article type.

    :param node: articletype node
    """
    self.atype, self.has_articletype = node.text, True

850

def parse_auteur(self, node, is_ref=False):
    """
    Parse an <auteur> node as a contributor with the "author" role.

    :param node: auteur node
    :param is_ref: forwarded to parse_common_contrib
    """
    self.parse_common_contrib(node, "author", is_ref)

853

def _get_abstract_data(self, node, abstract_type: str = None):
    """
    Build the abstract dictionary of a node.

    The node is parsed together with its next sibling (TeX counterpart),
    without adding ext-links. The JATS XML wrapper is <abstract> when a type
    is given, when the language of the node is self.lang, or when self.lang
    is "und"; it is <trans-abstract> otherwise.

    :param node: abstract-like node
    :param abstract_type: value of the abstract-type attribute, or None
    :return: dict with "tag", "lang", "value_xml", "value_html", "value_tex"
    """
    value_html, value_tex, inner_xml = self.parse_node_with_mixed_content(
        node, node.getnext(), add_ext_link=False
    )

    lang = get_normalized_attrib(node, "lang") or ""

    if abstract_type is not None:
        value_xml = f'<abstract xml:lang="{lang}" abstract-type="{abstract_type}">{inner_xml}</abstract>'
    elif lang == self.lang:
        value_xml = f"<abstract>{inner_xml}</abstract>"
    elif self.lang == "und":
        value_xml = f'<abstract xml:lang="{lang}">{inner_xml}</abstract>'
    else:
        value_xml = f'<trans-abstract xml:lang="{lang}">{inner_xml}</trans-abstract>'

    return {
        "tag": "" if abstract_type is None else abstract_type,
        "lang": lang,
        "value_xml": value_xml,
        "value_html": value_html,
        "value_tex": value_tex,
    }

879

def parse_avertissement(self, node):
    """
    Add an <avertissement> node to self.abstracts, typed "avertissement".

    :param node: avertissement node
    """
    abstract = self._get_abstract_data(node, "avertissement")
    self.abstracts.append(abstract)

882

def parse_note(self, node):
    """
    Add a <note> node to self.abstracts, typed "note".

    :param node: note node
    """
    abstract = self._get_abstract_data(node, "note")
    self.abstracts.append(abstract)

885

def parse_biblio(self, node):
    """
    Parse the <bib_entry> children of a bibliography node.

    Each entry becomes a CedricsRef; it is a mixed citation when its type
    (or, by default, the type of the bibliography) is "flat". The references
    are sorted once all entries are parsed.

    :param node: biblio node
    """
    default_type = node.get("type") or ""

    for entry in node:
        if normalize(entry.tag) != "bib_entry":
            continue

        flat = (entry.get("type") or default_type) == "flat"
        ref = CedricsRef(tree=entry, lang="und", is_mixed_citation=flat)

        self.bibitems.append(ref)
        # TODO: Remove bibitem. This is used for solrCmds.
        # solrCmds should use bibitems instead.
        self.bibitem.append(ref.citation_html)

    self.sort_bibitems()

902

def parse_common_contrib(self, node, role, is_ref=False):
    """
    Parse a contributor node and append the result to self.contributors.

    :param node: contributor node; its children hold the name parts,
        addresses, ORCID and e-mails
    :param role: role of the contributor; a trailing "s" is removed
    :param is_ref: True when the contributor belongs to a reference
    """
    contributor = create_contributor()

    # The role is stored in the singular ("authors" -> "author").
    if role and role[-1] == "s":
        role = role[0:-1]
    contributor["role"] = role

    equal_contrib_ = node.get("equal-contrib") or "no"
    contributor["equal_contrib"] = equal_contrib_ == "yes"

    corresp = node.get("author-role") or ""
    if corresp == "corresponding":
        contributor["corresponding"] = True

    is_etal = False
    has_children = False
    # Last <middlename>/<particule> seen that no <prenom> has consumed yet.
    middlename = ""

    for child in node:
        has_children = True
        tag = normalize(child.tag)

        if tag == "nomcomplet":
            # TODO: Bug in Cedrics <nomcomplet> is ignored inside <bauteur> and <bediteur>
            if not is_ref:
                contributor["string_name"] = child.text
            # NOTE(review): the nesting of the next two lines was rebuilt from a
            # listing that had lost its indentation; confirm that they are not
            # part of the "not is_ref" branch.
            deceased_ = child.get("deceased") or "no"
            contributor["deceased_before_publication"] = deceased_ == "yes"
        elif tag == "prenom":
            contributor["first_name"] = child.text or ""
            # A middle name met before the first name is appended after it.
            if middlename != "":
                contributor["first_name"] += " " + middlename
                middlename = ""
        elif tag in ("middlename", "particule"):
            contributor["first_name"] += " " + child.text
            middlename = child.text
        elif tag == "initiale":
            pass
            # if len(contributor['first_name']) > 0:
            #     contributor['first_initials'] = child.text or ''
        elif tag == "junior":
            contributor["suffix"] = child.text
        elif tag == "nom":
            contributor["last_name"] = child.text or ""
        elif tag == "adresse":
            text = get_text_from_node(child)
            text = normalize_space(text).replace("\n", " ")
            if len(text) > 0:
                contributor["addresses"].append(text)
        elif tag == "author-orcid":
            contributor["orcid"] = child.text
        elif tag == "mel":
            # The e-mail is the url of an inner <xref>, else the text of <mel>.
            email = None
            for greatchild in child:
                tag = normalize(greatchild.tag)
                if tag == "xref":
                    email = greatchild.get("url")
            if email is None:
                email = child.text
            if email is not None:
                # Several e-mails are joined with "{{{".
                if len(contributor["email"]) > 0:
                    contributor["email"] += "{{{"
                contributor["email"] += email
        elif tag == "etal":
            is_etal = True

    if has_children:
        # Initials are only requested for references, when the REF_JEP_STYLE
        # setting is on.
        use_initials = is_ref and getattr(settings, "REF_JEP_STYLE", False)
        helper_update_name_params(contributor, use_initials)

        contributor["contrib_xml"] = (
            "<etal/>" if is_etal else get_contrib_xml(contributor, is_ref=is_ref)
        )
    elif node.text is not None:
        # No child: the text of the node is the whole name.
        contributor["string_name"] = node.text
        contributor["contrib_xml"] = (
            '<string-name xml:space="preserve">' + escape(node.text) + "</string-name>"
        )

    contributor["addresses"].sort()

    # email is ignored by jats_parser
    contributor["email"] = ""

    self.contributors.append(contributor)

988

def parse_financement(self, node):
    """Record the funding award described by the children of *node*.

    ``<bourse>`` supplies the award id (its text) and ``<financeur>`` the funder
    (text extracted with ``get_text_from_node``). The award is appended to
    ``self.awards`` only when both values were found.
    """
    funder = grant = None

    for child in node:
        name = normalize(child.tag)
        if name == "financeur":
            funder = get_text_from_node(child)
        elif name == "bourse":
            grant = child.text

    # An incomplete funding statement is silently dropped
    if funder is None or grant is None:
        return

    self.awards.append({"abbrev": funder, "award_id": grant})

1002

def parse_financements(self, node):
    """Parse every ``<financement>`` child of *node*; other children are ignored."""
    for child in node:
        if normalize(child.tag) != "financement":
            continue
        self.parse_financement(child)

1009

def parse_langue(self, node):
    """Store the text of *node* in ``self.lang``."""
    language = node.text
    self.lang = language

1012

def parse_motcle(self, node):
    """Add the keywords attached to *node* to ``self.kwds``.

    The language is the node's ``lang`` attribute, falling back to ``self.lang``.
    Keywords come from the ``<mot>`` children of the element that follows *node*;
    when there is none, the mixed content of *node* itself is parsed and split
    with ``split_kwds``. Only the TeX value of each keyword is kept.
    """
    lang = get_normalized_attrib(node, "lang") or self.lang
    sibling = node.getnext()

    # Index 1 of the parsed triple (html, tex, xml) is the TeX value
    words = [
        self.parse_node_with_mixed_content(child, None)[1]
        for child in sibling
        if normalize(child.tag) == "mot"
    ]

    if not words:
        _, tex_value, _ = self.parse_node_with_mixed_content(node, sibling)
        words = split_kwds(tex_value)

    self.kwds.extend({"type": "", "lang": lang, "value": word} for word in words)

1035

def parse_msc(self, node):
    """Add the comma-separated MSC codes found in the text of *node* to ``self.kwds``.

    The language is the node's ``lang`` attribute, falling back to ``self.lang``.
    Each code is stripped of surrounding whitespace; entries that are empty once
    stripped are skipped, and an element without text adds nothing.
    """
    lang = get_normalized_attrib(node, "lang") or self.lang

    # node.text is None for an empty element: treat it as an empty list of codes
    codes = [code.strip() for code in (node.text or "").split(",")]

    # Filter after stripping so that "a, ,b" or a trailing ", " does not yield "" keywords
    self.kwds.extend({"type": "msc", "lang": lang, "value": code} for code in codes if code)

1042

def parse_resp(self, node):
    """Parse *node* as a contributor whose role comes from its ``role`` attribute.

    A missing or empty role counts as ``"editeur"``. The French role names
    ``"editeur"`` and ``"organisateur"`` are translated to ``"editor"`` and
    ``"organizer"``; any other value is passed through unchanged.
    """
    translations = {"editeur": "editor", "organisateur": "organizer"}

    raw_role = node.get("role") or "editeur"
    role = translations.get(raw_role, raw_role)

    self.parse_common_contrib(node, role)

1051

def parse_resume(self, node):
    """Add the abstract held by *node* to ``self.abstracts``.

    The language is the node's ``lang`` attribute, falling back to ``self.lang``.
    The abstract data itself is built by ``self._get_abstract_data``.
    """
    lang = get_normalized_attrib(node, "lang") or self.lang

    if lang == self.lang:
        # JATS puts the trans_abstract after the abstract
        self.abstracts.insert(0, self._get_abstract_data(node, None))
    else:
        self.abstracts.append(self._get_abstract_data(node))

1092

def parse_supplement(self, node):
    """Register the supplementary material described by a ``<supplement>`` node.

    The location comes from the ``<xref>`` child (via ``get_location_from_xref``)
    and the caption from the ``<caption>`` child. Nothing is registered when no
    location was found.

    Raises:
        AssertionError: if the ``content-type`` attribute of *node* is neither
            ``"supplementary-material"`` nor ``"review"``.
    """
    location = None
    caption = ""

    for child in node:
        tag = normalize(child.tag)

        if tag == "xref":
            location = self.get_location_from_xref(child)
        elif tag == "caption":
            # Read the text of the <caption> child itself, not of the enclosing <supplement>
            caption = escape(child.text) if child.text else ""

    if location:
        pos = location.find("/attach/")
        if pos > -1:
            if hasattr(self, "colid") and hasattr(self, "issue_id"):
                # Rebuild the location relative to the collection/issue folders
                text = location
                location = self.colid + "/" + self.issue_id + "/"

                if hasattr(self, "article_folder") and self.article_folder is not None:
                    # pos + 8 skips the "/attach/" segment itself
                    location += self.article_folder + "/Attach/" + text[pos + 8 :]
                else:
                    location += self.pid + text[pos:]

        relation = node.attrib.get("content-type")
        assert relation in ["supplementary-material", "review"], (
            f"Dans la balise supplement de {self.pid}, "
            f'content-type être "supplementary-material" ou "review" '
            f'au lieu de "{relation}"'
        )

        material = {
            "rel": node.attrib.get("content-type"),
            "mimetype": node.attrib.get("mimetype"),
            "location": location,
            "base": "",
            "metadata": "",
            "caption": caption,
        }
        self.supplementary_materials.append(material)

1133

def parse_supplements(self, node):
    """Parse every ``<supplement>`` child of *node*; other children are ignored."""
    for child in node:
        if normalize(child.tag) != "supplement":
            continue
        self.parse_supplement(child)

1140

1141 # TODO: It is a node with mix content

1142 # Transform the function in parse_node_with_motcle to handle formulas

def parse_texmotcle(self, node):
    """Add the keywords attached to *node* to ``self.kwds``.

    The language is the node's ``lang`` attribute, falling back to ``self.lang``.
    Keywords come from the ``<mot>`` children of the element that follows *node*;
    when there is none, the mixed content of *node* itself is parsed and split
    with ``split_kwds``. Only the TeX value of each keyword is kept.
    """
    lang = get_normalized_attrib(node, "lang") or self.lang
    sibling = node.getnext()

    # Index 1 of the parsed triple (html, tex, xml) is the TeX value
    words = [
        self.parse_node_with_mixed_content(child)[1]
        for child in sibling
        if normalize(child.tag) == "mot"
    ]

    if not words:
        _, tex_value, _ = self.parse_node_with_mixed_content(node)
        words = split_kwds(tex_value)

    self.kwds.extend({"type": "", "lang": lang, "value": word} for word in words)

1160

def parse_titre(self, node):
    """Collect the title held by *node* in ``self.titres``.

    The language is the node's ``lang`` attribute, or ``"und"`` when absent.
    *node* is parsed together with the element that follows it; a title whose
    XML value is empty is not stored.
    """
    lang = get_normalized_attrib(node, "lang") or "und"

    html_value, tex_value, xml_value = self.parse_node_with_mixed_content(node, node.getnext())

    if not xml_value:
        return

    entry = {
        "lang": lang,
        "title_html": html_value,
        "title_tex": tex_value,
        "title_xml": xml_value,
    }
    self.titres.append(entry)

1179

def sort_bibitems(self):
    """Sort ``self.bibitems`` by label.

    The label of the first item decides the ordering: when it is made of digits
    only, items are sorted by the integer value of ``label_prefix``; otherwise by
    the ``(label_prefix, year, label_suffix)`` triple. Nothing happens when the
    list is empty or when the first label is empty once its brackets are removed.
    """
    if not self.bibitems:
        return

    # Sometimes, labels are surrounded by []
    first_label = self.bibitems[0].label.strip("[]")
    if not first_label:
        return

    # Each label must be split into label_prefix / label_suffix before sorting
    for item in self.bibitems:
        item.split_label()

    if first_label.isdigit():
        self.bibitems = sorted(self.bibitems, key=lambda item: int(item.label_prefix))
    else:
        by_prefix_year_suffix = attrgetter("label_prefix", "year", "label_suffix")
        self.bibitems = sorted(self.bibitems, key=by_prefix_year_suffix)

1198

1199

class CedricsPublisher(PublisherData):
    """Publisher whose name is read from a Cedrics XML node passed as ``tree``."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        tree = kwargs["tree"]
        self.parse_tree(tree)

    def parse_tree(self, tree):
        """Use the text of *tree* as the publisher name."""
        publisher_name = tree.text
        self.name = publisher_name

1207

1208

class CedricsJournal(JournalData, CedricsBase):
    """Journal metadata read from a Cedrics XML node passed as ``tree``."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """Fill the journal attributes from the children of *tree*.

        ``<jtitre>`` opens the ``<journal-title-group>`` XML and ``<jtitrecourt>``
        completes it with the abbreviated title. When no ``<jtitrecourt>`` follows
        a ``<jtitre>``, the group is closed after the loop so that
        ``self.title_xml`` is always well-formed.
        """
        super().parse_tree(tree)

        # True while <journal-title> was opened by <jtitre> and not closed yet
        title_group_open = False

        for node in tree:
            tag = normalize(node.tag)

            if tag == "acrocedram":
                self.pid = node.text
            elif tag == "jtitre":
                self.title_html = self.title_tex = node.text
                self.title_xml = "<journal-title-group><journal-title>" + escape(node.text)
                title_group_open = True
            elif tag == "jtitrecourt":
                self.title_xml += (
                    '</journal-title><abbrev-journal-title abbrev-type="short-title">'
                    + escape(node.text)
                )
                self.title_xml += "</abbrev-journal-title></journal-title-group>"
                title_group_open = False
            elif tag == "jediteur":
                self.publisher = CedricsPublisher(tree=node)
            elif tag == "issn":
                self.issn = node.text
            elif tag == "E-issn":
                self.e_issn = node.text

        if title_group_open:
            # No short title was found: close the tags left open by <jtitre>
            self.title_xml += "</journal-title></journal-title-group>"

1237

1238

class CedricsIssue(IssueData, CedricsBase):
    """Issue parsed from a Cedrics XML tree, together with its articles.

    Keyword arguments read by the constructor: ``tree`` (required),
    ``ignore_date_published``, ``is_seminar``, ``article_folders`` and ``dois``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # JATS has a title/trans_title pair whereas Cedrics has several <titre xml:lang>.
        # The titles are collected in self.titres during parsing;
        # self.title_* and self.trans_title_* are derived from them in set_titles.
        self.titres = []

        self.ignore_date_published = kwargs.get("ignore_date_published", False)
        self.is_seminar = kwargs.get("is_seminar", False)
        self.colid = None
        self.provider = "mathdoc"
        self.article_folders = kwargs.get("article_folders", [])
        self.dois = kwargs.get("dois", [])

        self.parse_tree(kwargs["tree"])
        self.post_parse_tree()

    def parse_tree(self, tree):
        """Parse the ``<notice>`` and ``<article>`` children of *tree*.

        Articles are numbered from 1 in document order (``article.seq``). When
        ``article_folders`` / ``dois`` are not empty, the entry at the article's
        position is handed to the article.
        """
        super().parse_tree(tree)

        rank = 0  # number of <article> nodes handled so far

        for node in tree:
            tag = normalize(node.tag)

            if tag == "notice":
                self.parse_notice(node)
                continue
            if tag != "article":
                continue

            folder = self.article_folders[rank] if self.article_folders else ""
            doi = self.dois[rank] if self.dois else ""

            article = CedricsArticle(
                tree=node,
                colid=self.colid,
                issue_id=self.pid,
                doi=doi,
                ignore_date_published=self.ignore_date_published,
                is_seminar=self.is_seminar,
                article_folder=folder,
            )
            rank += 1
            article.seq = str(rank)
            self.articles.append(article)

    def parse_gestion(self, node):
        """Read the ``<efirst>`` flag: online-first is enabled when its text is "yes"."""
        for child in node:
            if normalize(child.tag) == "efirst":
                self.with_online_first = child.text == "yes"

    def parse_notice(self, node):
        """Fill the issue attributes from the children of a ``<notice>`` node.

        A few tags map directly to an attribute; any other tag is dispatched to the
        ``parse_<tag>`` method (dashes replaced by underscores) when it exists.
        """
        plain_attributes = {
            "idvol": "pid",
            "tome": "volume",
            "fascicule": "number",
            "serie": "vseries",
            "annee": "year",
        }

        for child in node:
            tag = normalize(child.tag)

            attribute = plain_attributes.get(tag)
            if attribute is not None:
                setattr(self, attribute, child.text)
                continue

            handler = getattr(self, "parse_" + tag.replace("-", "_"), None)
            if callable(handler):
                handler(child)

        # Default the modification date to the current time
        if self.last_modified_iso_8601_date_str is None:
            self.last_modified_iso_8601_date_str = timezone.now().isoformat()

    def parse_revue(self, node):
        """Build the journal from *node* and copy its pid and publisher onto the issue."""
        journal = CedricsJournal(tree=node)
        self.journal = journal
        self.colid = journal.pid
        self.publisher = journal.publisher

    def set_titles(self):
        """Derive the title / translated title attributes and ``title_xml`` from ``self.titres``.

        A title is a main title when its language is the issue language, or when
        the issue language is "und"; every other title is a translation.
        """

        def is_main_title(titre):
            return titre["lang"] == self.lang or self.lang == "und"

        # TODO: BUG in JATS: title_html is the one of the last title (bug if title in multiple langs)
        for titre in self.titres:
            if is_main_title(titre):
                self.title_html = titre["title_html"]
                self.title_tex = titre["title_tex"]
            else:
                self.trans_lang = titre["lang"]
                self.trans_title_html = titre["title_html"]
                self.trans_title_tex = titre["title_tex"]

        if not self.title_html:
            return

        # Main titles first, then the translations, all inside one <issue-title-group>
        chunks = ["<issue-title-group>"]

        for titre in self.titres:
            if is_main_title(titre):
                chunks.append(
                    '<issue-title xml:space="preserve" xml:lang="'
                    + titre["lang"]
                    + '">'
                    + titre["title_xml"]
                    + "</issue-title>"
                )

        for titre in self.titres:
            if not is_main_title(titre):
                chunks.append('<trans-title-group xml:lang="' + titre["lang"] + '">')
                chunks.append(
                    '<trans-title xml:space="preserve">' + titre["title_xml"] + "</trans-title>"
                )
                chunks.append("</trans-title-group>")

        chunks.append("</issue-title-group>")
        self.title_xml = "".join(chunks)

1359

1360

class CedricsArticle(ArticleData, CedricsBase):
    """Article parsed from a Cedrics XML tree.

    Keyword arguments read by the constructor: ``tree``, ``colid`` and ``issue_id``
    (required), ``pid``, ``doi``, ``ignore_date_published``, ``is_seminar`` and
    ``article_folder``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.ignore_date_published = kwargs.get("ignore_date_published", False)
        self.is_seminar = kwargs.get("is_seminar", False)
        self.article_folder = kwargs.get("article_folder")

        # JATS has a title/trans_title pair whereas Cedrics has several <titre xml:lang>.
        # The titles are collected in self.titres during parsing;
        # self.title_* and self.trans_title_* are derived from them in set_titles.
        self.titres = []

        self.pid = kwargs.get("pid")
        self.colid = kwargs["colid"]
        self.issue_id = kwargs["issue_id"]
        self.atype = "normal"

        if kwargs.get("doi") is not None:
            self.doi = clean_doi(kwargs["doi"])
            self.ids.append(("doi", self.doi))

        self.publishTeX = False
        self.tex_filename = None
        # 2023/12/05 <articletype> has been added. Ignore <article-types>
        self.has_articletype = False

        self.parse_tree(kwargs["tree"])
        self.post_parse_tree()

    def parse_tree(self, tree):
        """Fill the article attributes from the children of *tree*.

        Tags without a dedicated branch are dispatched to the ``parse_<tag>``
        method (dashes replaced by underscores) when it exists.
        """
        super().parse_tree(tree)

        for node in tree:
            tag = normalize(node.tag)

            if tag == "idart":
                self.pid = node.text
            elif tag == "doi":
                self.doi = clean_doi(node.text)
                # TODO: Remove as ResourceId do not seem useful (needs to update templates)
                value = ("doi", self.doi)
                if value not in self.ids:
                    self.ids.append(value)
            elif tag == "pagedeb":
                self.fpage = self.get_numeric_value(node)
            elif tag == "pagefin":
                self.lpage = self.get_numeric_value(node)
            elif tag == "ordreart":
                # Set article_number or talk_number
                # Side effect in Cedrics: set page-count (handled in post_parse_tree)
                if self.is_seminar:
                    self.talk_number = node.text
                else:
                    self.article_number = node.text
            elif tag == "msn-id":
                self.extids.append(("mr-item-id", node.text))
            elif tag == "zbl-id":
                self.extids.append(("zbl-item-id", node.text))
            else:
                # No debug output here: this runs once per child of every parsed article
                fct_name = "parse_" + tag.replace("-", "_")
                ftor = getattr(self, fct_name, None)
                if callable(ftor):
                    ftor(node)

    def parse_gestion(self, node):
        """Read the history dates and the ``publishTeX`` flag from *node*.

        The online date is skipped when ``self.ignore_date_published`` is set.
        """
        history_types = {
            "date_acceptation": "accepted",
            "date_reception": "received",
            "date_revision": "revised",
        }

        for child in node:
            tag = normalize(child.tag)

            if tag == "date_online":
                if not self.ignore_date_published:
                    self.history_dates.append({"type": "online", "date": child.text})
            elif tag in history_types:
                self.history_dates.append({"type": history_types[tag], "date": child.text})
            elif tag == "publishTeX":
                self.publishTeX = child.text == "yes"

    def parse_production(self, node):
        """Read the publication date and the TeX file name from *node*.

        The publication date is skipped when ``self.ignore_date_published`` is set.
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "date_prod_PDF":
                if not self.ignore_date_published:
                    self.date_published_iso_8601_date_str = child.text
            elif tag == "fichier_tex":
                self.tex_filename = child.text

    def parse_relations(self, node):
        """Append the relation described by *node* to ``self.relations``.

        The Cedrics relation name (``type`` attribute) is translated to its English
        counterpart; a node with an unknown type is ignored. The text of *node* is
        the id of the related object.
        """
        rel_type = get_normalized_attrib(node, "type") or ""
        id_value = node.text

        relations = {
            "corrige": "corrects",
            "estcorrige": "corrected-by",
            "complete": "complements",
            "estcomplete": "complemented-by",
            "suitede": "follows",
            "estsuivide": "followed-by",
            "pagesprec": "prev-pages",
            "pagessuiv": "next-pages",
            "solutionde": "resolves",
            "apoursolution": "resolved-by",
            "commente": "comments",
            "estcommente": "commented-by",
            "remplace": "replaces",
            "estremplace": "replaced-by",
        }

        if rel_type in relations:
            obj = Foo()
            obj.rel_type = relations[rel_type]
            obj.id_value = id_value

            self.relations.append(obj)

    def post_parse_tree(self):
        """Finish the work that can only be done once the whole tree was parsed.

        Adds the page count (only when an article or talk number is set and both
        page numbers are integers) and the streams of the PDF and, if published,
        of the TeX source.
        """
        # Some values in Cedrics XMLs are not embedded in groups (ex: authors)
        # We need to wait at the end of the parsing to finish the job
        super().post_parse_tree()

        if len(self.talk_number) > 0 or len(self.article_number) > 0:
            try:
                fpage_int = int(self.fpage)
                lpage_int = int(self.lpage)
                count_value = lpage_int - fpage_int + 1
                self.counts.append(("page-count", str(count_value)))
            except ValueError:
                # Non numeric pages (roman numerals for example): no page count
                pass

        # The (data)streams of the article's PDF and TeX are added automatically
        if hasattr(self, "colid") and hasattr(self, "issue_id"):
            location = self.colid + "/" + self.issue_id + "/"
            if self.article_folder:
                location += self.article_folder + "/" + self.article_folder + ".pdf"
            else:
                location += self.pid + "/" + self.pid + ".pdf"

            data = {
                "rel": "full-text",
                "mimetype": "application/pdf",
                "location": location,
                "base": "",
                "text": "Full (PDF)",
            }
            self.streams.append(data)

            if self.publishTeX and self.tex_filename:
                location = self.colid + "/" + self.issue_id + "/"
                if self.article_folder:
                    location += self.article_folder + "/" + self.tex_filename + ".tex"
                else:
                    location += self.pid + "/src/tex/" + self.tex_filename + ".tex"

                data = {
                    "rel": "full-text",
                    "mimetype": "application/x-tex",
                    "location": location,
                    "base": "",
                    "text": "TeX source",
                }
                self.streams.append(data)

    def set_titles(self):
        """Derive the title / translated title attributes from ``self.titres``.

        A title is the main title when its language is the article language, or
        when the article language is "und"; any other title is a translation.
        ``self.title_xml`` ends up as a ``<title-group>`` holding both.
        """
        for titre in self.titres:
            if titre["lang"] == self.lang or self.lang == "und":
                self.title_html = titre["title_html"]
                self.title_tex = titre["title_tex"]
                if len(titre["title_xml"]) > 0:
                    self.title_xml = (
                        '<article-title xml:space="preserve">'
                        + titre["title_xml"]
                        + "</article-title>"
                    )
            else:
                self.trans_title_html = titre["title_html"]
                self.trans_title_tex = titre["title_tex"]
                if len(titre["title_xml"]):
                    self.trans_title_xml = '<trans-title-group xml:lang="' + titre["lang"] + '">'
                    self.trans_title_xml += '<trans-title xml:space="preserve">'
                    self.trans_title_xml += (
                        titre["title_xml"] + "</trans-title></trans-title-group>"
                    )
                    self.trans_lang = titre["lang"]

        if len(self.title_xml) > 0:
            self.title_xml = (
                "<title-group>" + self.title_xml + self.trans_title_xml + "</title-group>"
            )

1571

1572

1573class CedricsRef(RefBase, CedricsBase):

1574 def __init__(self, *args, **kwargs):

1575 super().__init__(*args, **kwargs)

1576

1577 self.citation_xml = self.citation_html = self.citation_tex = ""

1578 self.REF_JEP_STYLE = getattr(settings, "REF_JEP_STYLE", False)

1579

1580 self.is_mixed_citation = (

1581 kwargs["is_mixed_citation"] if "is_mixed_citation" in kwargs else False

1582 )

1583 self.eprint_id = None

1584 self.archive_name = None

1585 self.has_doi = False

1586

1587 self.editeur_citation_xml = (

1588 "" # bediteur is not in the correct order. Store the xml temporarily

1589 )

1590

1591 self.parse_tree(kwargs["tree"])

1592

1593 def parse_address(self, node):

1594 self.publisher_loc = normalize_space(get_text_from_node(node))

1595 self.citation_xml += "<publisher-loc>" + escape(self.publisher_loc) + "</publisher-loc>"

1596

1597 def parse_archive_name(self, node):

1598 # TODO 1 JEP ref has a formula in its archive-name (for biorxiv)

1599 # It should be modified to use common names "biorxiv"

1600

1601 self.archive_name = node.text.lower()

1602

1603 def parse_article_id(self, node):

1604 eid = node.text

1605 self.extids.append(("eid", eid))

1606

1607 self.citation_xml += '<pub-id pub-id-type="eid">' + escape(eid) + "</pub-id>"

1608

1609 def parse_bauteur(self, node):

1610 self.parse_auteur(node, is_ref=True)

1611

1612 last_contribution = self.contributors[-1]

1613 self.citation_xml += last_contribution["contrib_xml"]

1614

1615 def parse_bediteur(self, node):

1616 self.parse_common_contrib(node, "editor", is_ref=True)

1617

1618 last_contribution = self.contributors[-1]

1619 self.editeur_citation_xml += last_contribution["contrib_xml"]

1620

1621 def parse_bibitemdata(self, node):

1622 tex_node = node.getnext()

1623

1624 # TODO: Bug in Cedrics. if bibitemdata has no text between the nodes,

1625 # the XML is pretty printed. But since space="preserve" is added on the fly on mixed-citation

1626 # The \n and spaces should be preserved.

1627 # This bug is ignored (JTNB_2014__26_3_757_0 [1])

1628

1629 value_html, value_tex, value_xml = self.parse_node_with_mixed_content(

1630 node, tex_node, is_bibitemdata=True

1631 )

1632 self.citation_html += value_html

1633 self.citation_tex += value_tex

1634 self.citation_xml += (

1635 '<mixed-citation xml:space="preserve">' + value_xml + "</mixed-citation>"

1636 )

1637

1638 def parse_booktitle(self, node):

1639 tex_node = node.getnext()

1640 title_html, title_tex, title_xml = self.parse_node_with_mixed_content(

1641 node, tex_node, is_citation=True

1642 )

1643

1644 self.source_tex = title_tex

1645 if title_xml != "":

1646 self.citation_xml += '<source xml:space="preserve">' + title_xml + "</source>"

1647

1648 def parse_burl(self, node):

1649 for child in node:

1650 tag = normalize(child.tag)

1651

1652 if tag == "xref":

1653 html_text, tex_text, xml_text = self.parse_node_with_xref(

1654 child, None, keep_link=True, is_citation=True

1655 )

1656

1657 self.citation_xml += xml_text

1658

1659 def parse_chapter(self, node):

1660 # TODO: Bug in Cedrics <chapter> for types other than inbook

1661 # becomes a text outside tags (AIF_2017__67_1_237_0 [16], CML_2013__5_1)

1662 # The info is not present in the PDF. It should not be in the Cedrics XML

1663 if self.type != "inbook":

1664 raise ValueError("<chapter> can be used only for an inbook")

1665

1666 tex_node = node.getnext()

1667 title_html, title_tex, title_xml = self.parse_node_with_mixed_content(

1668 node, tex_node, is_citation=True

1669 )

1670

1671 self.citation_xml += (

1672 '<chapter-title xml:space="preserve">' + title_xml + "</chapter-title>"

1673 )

1674 self.chapter_title_tex = title_tex

1675

1676 def parse_doi(self, node):

1677 if node.text is None:

1678 raise ValueError("a doi can not be empty")

1679

1680 if "http" in node.text:

1681 raise ValueError(node.text, "should not have http in it")

1682

1683 doi_value = clean_doi(node.text)

1684 if self.doi is not None and self.doi != doi_value:

1685 raise ValueError(

1686 "Multiple dois for the same ref "

1687 + self.label

1688 + ": "

1689 + self.doi

1690 + " and "

1691 + doi_value

1692 )

1693

1694 if self.doi is None:

1695 self.doi = doi_value

1696 self.extids.append(("doi", self.doi))

1697

1698 self.has_doi = True

1699

1700 # TODO: bug in Cedrics if the doi has a   in it

1701 # the doi and the burl might not match and the dx.doi.org is no longer filtered

1702 # (bug²)

1703 # A doi should not have a space in it. raise an exception

1704 other_doi = self.doi.strip().replace(chr(160), "")

1705 if other_doi != self.doi:

1706 raise ValueError(self.doi, "has a space in it")

1707

1708 if self.doi.lower().startswith("doi:"):

1709 raise ValueError('Remove "DOI:" in ' + self.doi)

1710

1711 self.citation_xml += '<pub-id pub-id-type="doi">' + escape(node.text) + "</pub-id>"

1712

1713 def parse_edition(self, node):

1714 # TODO: BUG in JATS (The edition is ignored in the HTML version)

1715 self.parse_node_common(node, "edition", "edition")

1716

1717 def parse_editor(self, node):

1718 # TODO: Bug in Cedrics <editeur> becomes a <string-name> and we lose the info author vs editor

1719 self.parse_auteur(node, is_ref=True)

1720

1721 last_contribution = self.contributors[-1]

1722 self.citation_xml += last_contribution["contrib_xml"]

1723

1724 def parse_eprint_id(self, node):

1725 # Cannot add an ext_ids yet. Need to see if there's a archive-name

1726 self.eprint_id = escape(node.text)

1727

1728 def parse_institution(self, node):

1729 self.parse_node_common(node, "institution", "institution")

1730

1731 def parse_journal(self, node):

1732 tex_node = node.getnext()

1733 title_html, title_tex, title_xml = self.parse_node_with_mixed_content(

1734 node, tex_node, is_citation=True

1735 )

1736

1737 self.source_tex = title_html

1738 if len(title_xml) > 0:

1739 self.citation_xml += '<source xml:space="preserve">' + title_xml + "</source>"

1740

1741 def parse_mixed_citation(self, node):

1742 for child in node:

1743 tag = normalize(child.tag)

1744

1745 if tag == "reference":

1746 self.parse_reference(child)

1747 if len(self.label) > 0:

1748 self.citation_html = self.citation_tex = self.label + " "

1749 elif tag == "bibitemdata":

1750 self.parse_bibitemdata(child)

1751

1752 def parse_month(self, node):

1753 # TODO: Bug in Cedrics. month is ignored in the PDF ? JEP_2019__6__737_0 [Hoe63]

1754 self.parse_node_common(node, "month", "month")

1755

1756 def parse_msn_id(self, node):

1757 self.extids.append(("mr-item-id", node.text))

1758 self.citation_xml += (

1759 '<ext-link ext-link-type="mr-item-id">' + escape(node.text) + "</ext-link>"

1760 )

1761

1762 def parse_node_common(self, node, variable_name, jats_tag, **kwargs):

1763 text = get_text_from_node(node)

1764 if "keep_space" not in kwargs:

1765 text = normalize_space(text)

1766 setattr(self, variable_name, text)

1767

1768 self.citation_xml += "<" + jats_tag

1769 if "jats_params" in kwargs and len(kwargs["jats_params"]) > 0:

1770 self.citation_xml += " " + kwargs["jats_params"]

1771

1772 self.citation_xml += ">" + escape(text) + "</" + jats_tag + ">"

1773

1774 def parse_note(self, node):

1775 value_html, value_tex, value_xml = self.parse_node_with_mixed_content(

1776 node, None, is_citation=True, is_comment=True, add_HTML_link=True, temp_math=True

1777 )

1778

1779 self.comment = value_html

1780

1781 if len(value_html) > 0:

1782 self.citation_xml += '<comment xml:space="preserve">' + value_xml + "</comment>"

1783

1784 def parse_number(self, node):

1785 self.parse_node_common(node, "issue", "issue", keep_space=True)

1786

1787 def parse_pagedeb(self, node):

1788 self.parse_node_common(node, "fpage", "fpage", keep_space=True)

1789

1790 def parse_pagefin(self, node):

1791 self.parse_node_common(node, "lpage", "lpage", keep_space=True)

1792

1793 def parse_pages(self, node):

1794 if len(self.fpage) == 0 and len(self.lpage) == 0:

1795 tag = "size" if (self.type == "book" or "thesis" in self.type) else "fpage"

1796 params = 'units="pages"' if tag == "size" else ""

1797 self.parse_node_common(node, tag, tag, jats_params=params)

1798

1799 def parse_page_total_number(self, node):

1800 self.parse_node_common(node, "size", "size", jats_params='units="pages"')

1801

1802 def parse_publisher(self, node):

1803 self.publisher_name = normalize_space(get_text_from_node(node))

1804 self.citation_xml += "<publisher-name>" + escape(self.publisher_name) + "</publisher-name>"

1805

1806 def parse_reference(self, node):

1807 cedrics_label = get_text_from_node(node)

1808

1809 if cedrics_label and cedrics_label[0] != "[":

1810 self.label = "[" + cedrics_label + "]"

1811 else:

1812 self.label = cedrics_label

1813

1814 if self.label:

1815 if self.is_mixed_citation:

1816 self.citation_xml += "<label>" + escape(self.label) + "</label>"

1817 else:

1818 self.citation_xml += "<label>" + escape(cedrics_label) + "</label>"

1819

1820 def parse_series(self, node):

1821 self.parse_node_common(node, "series", "series")

1822

1823 def parse_structured_citation(self, node):

1824 wrapper_tag_added = False

1825 eprint_done = False

1826

1827 for child in node:

1828 tag = normalize(child.tag)

1829

1830 # The <label> is outside the <element-citation> in JATS

1831 if tag != "reference" and not wrapper_tag_added:

1832 self.citation_xml += '<element-citation publication-type="' + self.type + '">'

1833 wrapper_tag_added = True

1834

1835 if self.eprint_id is not None and tag not in ("archive-prefix", "archive-name"):

1836 self.post_parse_eprint()

1837 eprint_done = True

1838

1839 # TODO: brevue bcoll bconference bseries btome... (util/bibitem.xsl)

1840

1841 if tag in ["howpublished"]:

1842 self.parse_title(child)

1843 elif tag in ("institution", "organization", "school"):

1844 self.parse_institution(child)

1845 elif tag not in ("TeXtitle", "TeXbooktitle", "archive-prefix"):

1846 fct_name = "parse_" + tag.replace("-", "_")

1847 ftor = getattr(self, fct_name, None)

1848 if callable(ftor):

1849 ftor(child)

1850

1851 if self.eprint_id is not None and not eprint_done:

1852 self.post_parse_eprint()

1853

1854 # ptf-xsl mets les <bediteur> à la fin en JATS

1855 if len(self.editeur_citation_xml) > 0:

1856 self.citation_xml += '<person-group person-group-type="editor">'

1857 self.citation_xml += self.editeur_citation_xml

1858 self.citation_xml += "</person-group>"

1859

1860 self.citation_xml += "</element-citation>"

1861

1862 text = get_citation_html(self)

1863 self.citation_html = self.citation_tex = text

1864

1865 def parse_title(self, node):

1866 tex_node = node.getnext()

1867

1868 title_html, title_tex, title_xml = self.parse_node_with_mixed_content(

1869 node, tex_node, is_citation=True, add_ext_link=True

1870 )

1871

1872 if self.type == "incollection":

1873 self.chapter_title_tex = title_html

1874 self.citation_xml += (

1875 '<chapter-title xml:space="preserve">' + title_xml + "</chapter-title>"

1876 )

1877 elif self.type in [

1878 "book",

1879 "inbook",

1880 "unpublished",

1881 "phdthesis",

1882 "masterthesis",

1883 "mastersthesis",

1884 "manual",

1885 "techreport",

1886 "coursenotes",

1887 "proceedings",

1888 ] or node.tag in ["booktitle", "howpublished"]:

1889 self.source_tex = title_html

1890 self.citation_xml += '<source xml:space="preserve">' + title_xml + "</source>"

1891 else:

1892 self.article_title_tex = title_html

1893 self.citation_xml += (

1894 '<article-title xml:space="preserve">' + title_xml + "</article-title>"

1895 )

1896

1897 def parse_tree(self, tree):

1898 super().parse_tree(tree)

1899

1900 self.user_id = get_normalized_attrib(tree, "user-id") or ""

1901 self.type = get_normalized_attrib(tree, "doctype") or "misc"

1902 if self.type == "none":

1903 self.type = "misc"

1904

1905 if self.is_mixed_citation:

1906 self.parse_mixed_citation(tree)

1907 else:

1908 self.parse_structured_citation(tree)

1909

1910 def parse_type(self, node):

1911 tex_node = node.getnext()

1912 value_html, value_tex, value_xml = self.parse_node_with_mixed_content(

1913 node, tex_node, bug_cedrics=True

1914 )

1915

1916 self.annotation = value_tex

1917

1918 if len(value_xml) > 0:

1919 self.citation_xml += (

1920 '<annotation><p xml:space="preserve">' + value_xml + "</p></annotation>"

1921 )

1922

def parse_url_last_visited(self, node):
    """
    Append the access date of a cited URL to ``self.citation_xml``.

    The text of ``node`` is written twice into a
    ``<date-in-citation content-type="access-date">`` element: once as the
    ``iso-8601-date`` attribute and once as the element content.

    Nothing is appended when the node has no text (a ``None`` text used to
    raise a TypeError during the concatenation). The text is escaped so that
    characters such as ``&``, ``<`` or ``"`` cannot break the generated XML.

    NOTE(review): the text is not checked to be a valid ISO 8601 date.
    """
    text = node.text
    if not text:
        return

    self.citation_xml += (
        '<date-in-citation content-type="access-date" iso-8601-date="'
        # quote=True: the value lives inside a double-quoted attribute
        + html.escape(text, quote=True)
        + '">'
        + html.escape(text, quote=False)
        + "</date-in-citation>"
    )

1928

def parse_volume(self, node):
    """
    Parse a volume node.

    The text of the node is extracted and passed through ``normalize_space``.
    A non-empty result is stored in ``self.volume`` and appended (escaped) to
    ``self.citation_xml`` as a ``<volume>`` element; otherwise nothing
    happens.
    """
    volume = normalize_space(get_text_from_node(node))

    if volume is None or len(volume) == 0:
        return

    self.volume = volume
    self.citation_xml += "<volume>" + escape(self.volume) + "</volume>"

1935

def parse_year(self, node):
    """
    Parse a year node.

    Delegates to ``parse_node_common``, passing "year" for both of its
    string arguments.
    """
    field = "year"
    self.parse_node_common(node, field, field)

1938

def parse_zbl_id(self, node):
    """
    Parse a Zbl identifier node.

    The text of the node is recorded in ``self.extids`` under the
    "zbl-item-id" type and appended (escaped) to ``self.citation_xml`` as an
    ``<ext-link>`` element.
    """
    zbl_entry = ("zbl-item-id", node.text)
    self.extids.append(zbl_entry)

    zbl_link = '<ext-link ext-link-type="zbl-item-id">' + escape(node.text) + "</ext-link>"
    self.citation_xml += zbl_link

1944

1945 def post_parse_eprint(self):
# Finalize the eprint data of the reference; does nothing unless self.eprint_id is set.
# - a missing archive name (None) defaults to "arxiv"
# - for the archive names listed below, (archive_name, eprint_id) is recorded in self.extids
# - a <pub-id pub-id-type="..."> element is appended to self.citation_xml
# NOTE(review): indentation is lost in this listing, so it does not show whether the
# <pub-id> append is nested under the archive-name check or runs for every eprint;
# confirm against the original file.
# NOTE(review): archive_name and eprint_id are concatenated into the XML without escaping.

1946 if self.eprint_id is not None:

1947 if self.archive_name is None:

1948 # Assumption made by the XSLT transform

1949 self.archive_name = "arxiv"

1950

1951 if self.archive_name in ["arxiv", "tel", "hal", "theses.fr"]:

1952 # The Cedrics archive-prefix is ignored (the URL could change over time)

1953 self.extids.append((self.archive_name, self.eprint_id))

1954

1955 self.citation_xml += (

1956 '<pub-id pub-id-type="'

1957 + self.archive_name

1958 + '">'

1959 + self.eprint_id

1960 + "</pub-id>"

1961 )

1962

def split_label(self):
    """
    Used when sorting non-digit bibitems

    Computes ``self.label_prefix`` (and ``self.label_suffix`` for labels that
    are not purely numeric) from the lowercased ``self.label``, with its
    first and last characters removed when it is longer than one character.
    """
    key = self.label.lower()
    # CRAS <reference> do not allow a simple sort (?!?)
    # labels with "XXX et al." need to be put after "XXX"
    # NOTE(review): the two replacements below look identical here; one of them
    # may originally have targeted a different kind of space - confirm.
    key = key.replace(" et al.", "ZZZ")
    key = key.replace(" et al.", "ZZZ")
    if len(key) > 1:
        key = key[1:-1]

    if key.isdigit():
        # Purely numeric label: only the prefix is set
        self.label_prefix = key
        return

    pieces = re.split(r"[\d]+", key)
    if len(pieces) == 2:
        self.label_prefix, self.label_suffix = pieces
    else:
        # No digits at all (label similar to "Sma" instead of "Sma15"),
        # or several groups of digits: keep the whole label as the prefix
        self.label_prefix, self.label_suffix = key, ""

Coverage for src/ptf/cmds/xml/cedrics/cedrics_parser.py: 10%

1098 statements