Coverage for src/ptf/cmds/xml/jats/jats

1##################################################################################################

3# README

5# jats_parser.py is a replacement of xmldata.py

6# The purpose is to parse a JATS xml (or BITS) tree from top to bottom.

7# Each node is read only once.

9# JatsArticle, JatsIssue, JatsJournal, BitsBook are the objects created by xml_cmds.

10# The xml tree is parsed in the class constructor (__init__)

11# These classes have parse_<tag> functions to parse the xml nodes and set instance variables.

12# Some parse_<tag> functions are called directly.

13# Ex: if tag == "article-meta":

14# self.parse_article_meta(child)

15# Other parse_<tag> functions are called "automatically"

16# fct_name = 'parse_' + tag.replace('-', '_')

17# ftor = getattr(self, fct_name, None)

18# if callable(ftor):

19# ftor(child)

20#

21# JatsBase and JatsArticleBase are base classes.

22# They provide common instance variables and their corresponding parse_<tag> functions

23#

24# html_from_<tag> are used to generate the HTML text of a node with mixed content:

25# a node that mixes text, children and tail

26# These functions can also extract data and set instance variables (ex: self.figures)

27#

28# get_data_from_* parse a node, but simply return data (text, dict,...) without side effects

29#

30# At the end of this file, there are some functions that are/were called by ptf-tools.

31# They are kept here for simplicity: we can switch xmldata entirely with jats_parser

32#

33# TODO: the import OAI or the import of a collection could simply call the first function

34# (def parser(tree))

35#

36##################################################################################################

38import copy

39import inspect

40import os

41import re

43from lxml import etree

44from pylatexenc.latexencode import unicode_to_latex

46from django.conf import settings

47from django.urls import reverse

48from django.utils import timezone

50from matching import scrapping

51from ptf.cmds.xml.citation_html import add_span_class_to_html_from_article_title

52from ptf.cmds.xml.citation_html import add_span_class_to_html_from_authors

53from ptf.cmds.xml.citation_html import add_span_class_to_html_from_chapter_title

54from ptf.cmds.xml.citation_html import add_span_class_to_html_from_source

55from ptf.cmds.xml.citation_html import add_span_class_to_html_from_volume

56from ptf.cmds.xml.citation_html import get_citation_html

57from ptf.cmds.xml.jats.builder.issue import get_single_title_xml

58from ptf.cmds.xml.jats.builder.issue import get_title_xml

59from ptf.cmds.xml.xml_base import RefBase

60from ptf.cmds.xml.xml_base import XmlParserBase

61from ptf.cmds.xml.xml_utils import escape

62from ptf.cmds.xml.xml_utils import get_contrib_xml

63from ptf.cmds.xml.xml_utils import get_elsevier_image_extensions

64from ptf.cmds.xml.xml_utils import get_normalized_attrib

65from ptf.cmds.xml.xml_utils import get_text_from_node

66from ptf.cmds.xml.xml_utils import get_xml_from_node

67from ptf.cmds.xml.xml_utils import helper_update_name_params

68from ptf.cmds.xml.xml_utils import make_links_clickable

69from ptf.cmds.xml.xml_utils import normalize

70from ptf.cmds.xml.xml_utils import normalize_space

71from ptf.cmds.xml.xml_utils import split_kwds

72from ptf.display import resolver

73from ptf.model_data import ArticleData

74from ptf.model_data import BookData

75from ptf.model_data import BookPartData

76from ptf.model_data import CollectionData

77from ptf.model_data import ExtLinkDict

78from ptf.model_data import Foo

79from ptf.model_data import IssueData

80from ptf.model_data import JournalData

81from ptf.model_data import MathdocPublicationData

82from ptf.model_data import PublisherData

83from ptf.model_data import RefData

84from ptf.model_data import create_contributor

85from ptf.model_data import create_extlink

88class JatsBase(XmlParserBase):

def __init__(self, *args, **kwargs):
    """
    Set the attributes shared by the JATS parsers to their defaults.

    The positional and keyword arguments are accepted but are not used here;
    the parent initialiser is called without arguments.
    """
    super().__init__()

    # Output-mode switches, both off by default.
    # Used to create a Tex file from an XML value (ie abstract)
    self.for_tex_file = False
    # Used to convert an XML value for CKEditor (ie abstract)
    self.add_span_around_tex_formula = False

    # Parsing state: the tree is stored by parse_tree(); footnotes and
    # warnings are accumulated by the parse_node_with_* functions.
    self.tree = None
    self.fns = []
    self.warnings = []

def parse_tree(self, tree):
    """
    Store the XML tree and read its "lang" attribute.

    :param tree: root node of the XML tree to parse
    :return: None. Sets self.tree and self.lang
    """
    self.tree = tree

    lang = get_normalized_attrib(tree, "lang")
    # "und" is stored whenever the lookup gives a falsy value
    self.lang = lang if lang else "und"

102

def post_parse_tree(self):
    """
    Append a "source" ext-link pointing to Numdam when self.no_bib is set.

    :return: None. self.ext_links is extended in place
    """
    if not self.no_bib:
        return

    # For Geodesic
    numdam_link = create_extlink()
    numdam_link["rel"] = "source"
    numdam_link["location"] = "http://www.numdam.org/item/" + self.pid
    # Used as the source id to find the source in the GDML Views
    numdam_link["metadata"] = "NUMDAM"
    self.ext_links.append(numdam_link)

113

def parse_node_with_article_title(self, node, **kwargs):
    """
    Parse an article-title node.

    :param node: XML node of the title
    :param kwargs: forwarded to parse_inner_node; when "is_mixed_citation"
        is truthy the HTML goes through
        add_span_class_to_html_from_article_title
    :return: (tex, html)
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    if kwargs.get("is_mixed_citation", False):
        html = add_span_class_to_html_from_article_title(html, **kwargs)

    return tex, html

122

def parse_node_with_break(self, node, **kwargs):
    """
    Convert a line break.

    :param node: XML node of the break (not read)
    :return: (tex, html). tex is a LaTeX newline when generating a TeX file,
        a single space otherwise; html is always "<br/>"
    """
    if self.for_tex_file:
        return "\\newline\n", "<br/>"
    return " ", "<br/>"

128

def parse_node_with_chem_struct_wrap(self, node, **kwargs):
    """
    Render a chem-struct-wrap as a one-row "formula" table:
    the content goes in the first cell, the label (if any) in the second.

    :param node: XML node of the chem-struct-wrap
    :return: the same HTML text twice (used for both tex and html)
    """
    label = None
    cells = []

    for child in node:
        _, child_html = self.parse_node_with_mixed_content(child, **kwargs) if False else (None, None)
        if normalize(child.tag) == "label":
            _, label = self.parse_node_with_mixed_content(child, **kwargs)
        else:
            _, child_html = self.parse_node_with_mixed_content(child, **kwargs)
            cells.append(child_html)

    table_id = node.attrib["id"] if "id" in node.attrib else None

    pieces = ["<table "]
    if table_id:
        pieces.append(f'id="{table_id}" ')
    pieces.append('class="formula"><tr><td class="formula-inner">')
    pieces.append("".join(cells))
    pieces.append("</td>")
    pieces.append('<td class="formula-label">')
    if label:
        pieces.append(label)
    pieces.append("</td></tr>")
    pieces.append("</table>")

    text = "".join(pieces)
    return text, text

156

def parse_node_with_disp_quote(self, node, **kwargs):
    """
    Wrap the content of a disp-quote in <div class="disp-quote">.

    :param node: XML node of the disp-quote
    :return: (tex, html), both wrapped in the same div
    """
    opening, closing = '<div class="disp-quote">', "</div>"
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    return opening + inner_tex + closing, opening + inner_html + closing

164

def parse_node_with_boxed_text(self, node, **kwargs):
    """
    Wrap the content of a boxed-text in <div class="boxed-text">.
    The node id, when present and non empty, becomes the id of the div.

    :param node: XML node of the boxed-text
    :return: ("", html). No TeX is produced
    """
    box_id = None
    if "id" in node.attrib:
        box_id = node.attrib["id"]

    _, inner_html = self.parse_inner_node(node, **kwargs)

    id_attr = f'id="{box_id}" ' if box_id else ""
    return "", f'<div {id_attr}class="boxed-text">{inner_html}</div>'

178

def parse_node_with_fig(self, node, **kwargs):
    """
    Ex: <fig><label>LABEL</label><caption><title>TITLE</title>CAPTION</caption><graphic/></fig>
    becomes: <figure><img><figcaption>LABEL : TITLE<p>CAPTION</p></figcaption></figure>

    Unexpected children are reported in self.warnings.
    When kwargs["append_floats"] is truthy, the instance has a "floats"
    attribute and the fig has an id, the HTML is also stored in
    self.floats[fig_id].

    :param node: XML node of a fig
    :return: ("", html). No TeX is produced
    """
    small_caption = '<p class="fig-small-caption">'

    fig_id = node.attrib["id"] if "id" in node.attrib else None
    label_html = title_html = caption_html = None
    img_html = ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "label":
            _, label_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "caption":
            for caption_child in child:
                tag = normalize(caption_child.tag)
                if tag == "title":
                    _, title_html = self.parse_node_with_mixed_content(caption_child, **kwargs)
                elif tag == "p":
                    _, caption_p_html = self.parse_node_with_mixed_content(
                        caption_child, **kwargs
                    )
                    if caption_html:
                        # From the second paragraph on: the first paragraph
                        # gets its own class, the new one is a small caption
                        caption_html = caption_html.replace(
                            "<p>", '<p class="fig-first-caption">', 1
                        )
                        caption_html += caption_p_html.replace("<p>", small_caption, 1)
                    else:
                        caption_html = caption_p_html
                else:
                    self.warnings.append(
                        {
                            self.pid: self.__class__.__name__
                            + "."
                            + inspect.currentframe().f_code.co_name
                            + " "
                            + tag
                        }
                    )
        elif tag == "graphic":
            _, graphic_html = self.parse_node_with_graphic(child, **kwargs)
            img_html += graphic_html
        elif tag == "attrib":
            _, attrib_html = self.parse_node_with_mixed_content(child, **kwargs)
            # caption_html is still None when no caption paragraph came first:
            # start from an empty string instead of printing "None"
            caption_html = f'{caption_html or ""}{small_caption}{attrib_html}</p>'
        elif tag == "permissions":
            for gchild in child:
                # NOTE(review): the raw tag is compared here (no normalize(),
                # unlike the other tests) - confirm this is intended
                if gchild.tag == "copyright-statement":
                    _, statement_html = self.parse_node_with_mixed_content(gchild, **kwargs)
                    caption_html = f'{caption_html or ""}{small_caption}{statement_html}</p>'
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    html = '<figure id="' + fig_id + '">' if fig_id else "<figure>"
    html += img_html

    if label_html or title_html or caption_html:
        html += "<figcaption>"
        if label_html:
            html += label_html
        if label_html and title_html:
            html += " : "
        if title_html:
            html += title_html
        if caption_html:
            html += caption_html
        html += "</figcaption>"

    html += "</figure>"

    if kwargs.get("append_floats") and hasattr(self, "floats") and fig_id is not None:
        self.floats[fig_id] = html

    return "", html

283

def parse_node_with_fn(self, node, **kwargs):
    """
    Ex: <fn><label>LABEL</label><p>TEXT</p></fn>

    By default the footnote is removed from the returned HTML and collected
    (once) in self.fns. With kwargs["keep_fn"] truthy, the footnote HTML is
    returned and self.fns is left untouched.
    A missing "keep_fn" is treated as False (it used to raise a KeyError);
    a missing "keep_fn_label" is treated as True.

    :param node: XML node of a fn
    :return: ("", html). html is '' unless keep_fn is set
    """
    keep_fn = kwargs.get("keep_fn", False)
    keep_fn_label = kwargs.get("keep_fn_label", True)

    fn_id = node.attrib["id"] if "id" in node.attrib else None
    label_html = None
    fn_html = ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "label":
            _, label_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "p":
            # NOTE(review): each <p> replaces the previous one, so only the
            # last paragraph of a multi-paragraph footnote is kept
            _, fn_html = self.parse_node_with_mixed_content(child, **kwargs)
            fn_html = fn_html.replace("<p>", "").replace("</p>", "")
        else:
            warning = (
                self.__class__.__name__
                + "."
                + inspect.currentframe().f_code.co_name
                + " "
                + tag
            )
            self.warnings.append({self.pid: warning})

    html = '<p id="' + fn_id + '">' if fn_id else "<p>"
    if label_html and keep_fn_label:
        html += f"<sup>{label_html}</sup> "
    html += fn_html + "</p>"

    if keep_fn:
        return "", html

    if html not in self.fns:
        self.fns.append(html)
    return "", ""

330

def parse_node_with_graphic(self, node, **kwargs):
    """
    The href value of graphics used in our XML can have the following values
    - relative path to the issue XML folder (Elsevier JATS)
    - full path starting with "file:/" (Elsevier JATS created in early 2022)
    - simple file name (with no relative path) in the RVT FullText XML

    After the import, we want
    - the files located in the src/tex/figures article folder
    - the url pointing to the image, built thanks to kwargs['base_url']

    addRelatedObjectPtfCmd will copy the images to the src/tex/figures folder if the location starts with file:/
    => change the location to "file:/..." for Elsevier JATS (the xarticle has a pii attribute)

    The dict describing the image is appended to self.figures and
    self.related_objects unless an equal dict is already in self.figures.

    :param node: XML node of a graphic
    :param kwargs: kwargs["base_url"] is required as soon as the node has an href
    :return: ("", html). html is '' when the node has no href
        (this case used to raise an UnboundLocalError)
    """
    href = ""
    for attrib in node.attrib:
        if normalize(attrib) == "href":
            href = node.attrib[attrib]

    if not href:
        # Nothing to display and nothing to register
        return "", ""

    basename = os.path.basename(href)
    ext = basename.split(".")[-1]
    is_png = ext == "png"
    mimetype = "image/png" if is_png else "image/jpeg"

    img_url = "src/tex/figures/" + basename
    if ext in get_elsevier_image_extensions():  # Elsevier uses "jc3" instead of jpg
        img_url = img_url[0 : -len(ext)] + "jpg"

    data_location = href if "file:/" in href else img_url
    if (
        hasattr(self, "pii")
        and hasattr(self, "issue")
        and "file:/" not in href
        and self.from_folder
    ):
        base_dir = self.issue.journal.pid
        if os.path.dirname(href) != base_dir:
            href = os.path.join(self.from_folder, base_dir, self.issue.pid, href)
        # NOTE(review): nesting rebuilt from a listing without indentation;
        # confirm this assignment is not meant to sit inside the "if" above
        data_location = "file:" + href

    data = {
        "rel": "html-image",
        "mimetype": mimetype,
        "location": data_location,
        "base": None,
        "metadata": node.text if node.text is not None else "",
    }

    img_url = os.path.join(kwargs["base_url"], "png" if is_png else "jpg", img_url)

    # The lightbox index is the number of figures registered so far
    img_text = '<a href="' + img_url + '" data-lightbox="image-'
    img_text += str(len(self.figures)) + '" title="">'
    img_text += '<img src="' + img_url + '" class="article-body-img" />'
    img_text += "</a>"

    if data not in self.figures:
        self.figures.append(data)
        self.related_objects.append(data)

    return "", img_text

399

def parse_node_with_inline_formula(self, node, **kwargs):
    """
    Convert an inline-formula or a disp-formula.

    The TeX comes from <alternatives>/<tex-math>, the MathML from
    <alternatives>/<math>. A formula with a label, or a disp-formula, is
    rendered as a one-row "formula" table. Unexpected children are reported
    in self.warnings; a formula with only one of the two alternatives is
    reported with print().

    :param node: XML node of the formula
    :return: (tex, html)
    """
    # MathJAX is doing a good job with formulae and is now the standard
    # MathML could be ignored in HTML (the original XML value is preserved with value_xml)
    # We could simply return the tex-math text
    # But there are multiple errors in the TeX of the Mersenne articles.
    # We first need to fix those mistakes before switching to TeX

    is_display = node.tag == "disp-formula"
    formula_id = node.attrib["id"] if "id" in node.attrib else None
    label = None
    tex_math = ""
    math_text = ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "alternatives":
            for alternative in child:
                tag = normalize(alternative.tag)
                if tag == "tex-math":
                    tex_math = alternative.text or ""
                elif tag == "math":
                    # Elsevier sometimes provides the formula as an alternative image. Remove it.
                    alternative.attrib.pop("altimg", None)

                    math_text = get_xml_from_node(alternative).replace("mml:", "")
                    math_text = math_text.replace(
                        'xmlns:xlink="http://www.w3.org/1999/xlink"', ""
                    )
                    math_text = math_text.replace(
                        'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"', ""
                    )
                    if is_display:
                        math_text = math_text.replace("<math", '<math display="block"')
        elif tag == "label":
            label = child.text or ""
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    if bool(math_text) != bool(tex_math):
        # Exactly one alternative is missing: print the names of the calling
        # parse_<tag> functions (generic helpers are filtered out)
        stack = inspect.stack()
        ignored = ("parse_node", "parse_inner", "parse_tree", "parse_article_meta")
        stack_str = " ".join(
            [
                frameinfo[3]
                for frameinfo in stack[1:]
                if frameinfo[3].startswith("parse_")
                and not any(name in frameinfo[3] for name in ignored)
            ]
        )
        print(f"{self.pid} no math formula for {stack_str}")
        # raise ValueError("No formula alternative")

    if not is_display:
        # Inline TeX is delimited with $...$
        if tex_math != "" and tex_math[0] != "$":
            tex_math = "$" + tex_math
        if tex_math != "" and tex_math[-1] != "$":
            tex_math = tex_math + "$"

    tex = tex_math
    in_table = bool(label) or is_display

    html = ""
    if in_table:
        html += '<table class="formula"><tr><td class="formula-inner">'

    html += '<span class="mathjax-formula" '
    if formula_id:
        html += 'id="' + formula_id + '" '
    alt_text = tex_math.replace("\n", "") if is_display else tex_math
    # The MathML is displayed when available, the TeX otherwise
    html += f'data-tex="{alt_text}">{math_text if math_text else tex_math}</span>'

    if in_table:
        html += '</td><td class="formula-label">'
        if label:
            html += label
        html += "</td></tr>"
        html += "</table>"

    if self.add_span_around_tex_formula:
        # The first and last characters of the TeX are replaced with "\$".
        # "\\$" gives the same text as the former "\$" without relying on an
        # invalid escape sequence.
        tex = f'<span class="mathjax-formula">\\${tex[1:-1]}\\$</span>'

    return tex, html

496

def parse_node_with_institution_id(self, node, **kwargs):
    """
    Ignore an institution-id: nothing is output for this node.

    :param node: XML node of the institution-id (not read)
    :return: ("", "")
    """
    tex = html = ""
    return tex, html

499

def parse_node_with_italic(self, node, **kwargs):
    """
    Convert an italic node.

    :param node: XML node of the italic
    :return: (tex, html). html is wrapped in <span class="italique">;
        tex is wrapped in {\\it ...} for a TeX file, in <i> otherwise
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    # The span is added unconditionally (citations, comments,... included)
    html = '<span class="italique">' + inner_html + "</span>"
    tex = "{\\it " + inner_tex + "}" if self.for_tex_file else "<i>" + inner_tex + "</i>"

    return tex, html

520

def parse_node_with_list(self, node, **kwargs):
    """
    Convert a list according to its list-type attribute:
    - "bullet" / "simple": <ul> (itemize in a TeX file)
    - "order" / "number": <ol type="1">, with a start value for a continued list
    - "alpha-lower", "alpha-upper", "roman-lower", "roman-upper": <ol type="a|A|i|I">
    - anything else: <ul> without bullets
    In a TeX file, every list that is not bullet/simple becomes an enumerate.

    :param node: XML node of the list
    :return: (tex, html)
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    start = None
    if node.get("continued-from") is not None:
        start = self.get_list_start_value(node) + 1

    list_type = node.get("list-type")

    if list_type in ("bullet", "simple"):
        if self.for_tex_file:
            tex = "\n\\begin{itemize}\n" + tex + "\\end{itemize}\n"
        else:
            tex = f"<ul>{tex}</ul>"
        return tex, f"<ul>{html}</ul>"

    if self.for_tex_file:
        # NOTE(review): unlike the bullet case, html is returned without its
        # <ol>/<ul> wrapper here - confirm the HTML is unused in this mode
        return "\n\\begin{enumerate}\n" + tex + "\\end{enumerate}\n", html

    ol_types = {
        "alpha-lower": "a",
        "alpha-upper": "A",
        "roman-lower": "i",
        "roman-upper": "I",
    }

    if list_type in ("order", "number"):
        # The start value is only used for numbered lists
        opening = '<ol type="1">' if start is None else f'<ol type="1" start="{start}">'
        closing = "</ol>"
    elif list_type in ol_types:
        opening = f'<ol type="{ol_types[list_type]}">'
        closing = "</ol>"
    else:
        opening = '<ul class="no-bullet" style="list-style-type:none;">'
        closing = "</ul>"

    return opening + tex + closing, opening + html + closing

565

def parse_node_with_list_item(self, node, **kwargs):
    """
    <list-item><label>LABEL</label><p>TEXT</p> becomes
    <li>LABEL TEXT</li>
    (same with <title>)

    The first <p> is inlined right after the label/title. The following <p>
    (each wrapped in <p> in the HTML) and the nested <list> are appended in
    document order: they used to overwrite each other, so only the last one
    was kept. Other children are reported in self.warnings.

    :param node: XML node of the list-item
    :return: (tex, html). tex is an \\item line when generating a TeX file
    """
    label_tex = label_html = title_tex = title_html = ""
    p_tex = p_html = content_tex = content_html = ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "label":
            label_tex, label_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "title":
            title_tex, title_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "p":
            if p_html == "" and content_html == "":
                p_tex, p_html = self.parse_inner_node(child, **kwargs)
            else:
                extra_tex, extra_html = self.parse_inner_node(child, **kwargs)
                content_tex += extra_tex
                content_html += f"<p>{extra_html}</p>"
        elif tag == "list":
            list_tex, list_html = self.parse_node_with_mixed_content(child, **kwargs)
            content_tex += list_tex
            content_html += list_html
        # TODO if tag == "def-list":
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    inner_tex = ""
    if label_tex:
        inner_tex += label_tex + " "
    if title_tex:
        inner_tex += title_tex + " "
    inner_tex += p_tex + content_tex

    if self.for_tex_file:
        tex = "\\item " + inner_tex + "\n"
    else:
        tex = f"<li>{inner_tex}</li>"

    html = "<li>"
    if label_html:
        html += label_html + " "
    if title_html:
        html += title_html + " "
    html += p_html + content_html + "</li>"

    return tex, html

626

def parse_node_with_name_content(self, node, **kwargs):
    """
    Convert a name-content node: the inner content is returned as is.

    :param node: XML node of the name-content
    :return: (tex, html) given by parse_inner_node
    """
    return self.parse_inner_node(node, **kwargs)

630

def parse_node_with_p(self, node, **kwargs):
    """
    Convert a paragraph.

    The specific-use attribute, if any, becomes the class of the HTML <p>.
    When the instance has both "floats_to_insert" and "floats" attributes,
    the pending floats are appended after the paragraph: floats_to_insert
    is emptied and each float found is removed from self.floats.

    :param node: XML node of the p
    :return: (tex, html). tex is wrapped in <p> unless generating a TeX file
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    if not self.for_tex_file:
        tex = "<p>" + tex + "</p>"

    css_class = node.get("specific-use")
    opening = f'<p class="{css_class}">' if css_class else "<p>"
    html = opening + html + "</p>"

    if hasattr(self, "floats_to_insert") and hasattr(self, "floats"):
        while self.floats_to_insert:
            float_id = self.floats_to_insert.pop(0)
            if float_id in self.floats:
                html += self.floats.pop(float_id)

    return tex, html

651

def parse_node_with_h1(self, node, **kwargs):
    """
    Convert a h1 heading.

    The specific-use attribute, if any, becomes the class of the HTML <h1>.
    When the instance has both "floats_to_insert" and "floats" attributes,
    the pending floats are appended after the heading: floats_to_insert
    is emptied and each float found is removed from self.floats.

    :param node: XML node of the h1
    :return: (tex, html). tex is wrapped in <h1> unless generating a TeX file
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    if not self.for_tex_file:
        tex = "<h1>" + tex + "</h1>"

    css_class = node.get("specific-use")
    opening = f'<h1 class="{css_class}">' if css_class else "<h1>"
    html = opening + html + "</h1>"

    if hasattr(self, "floats_to_insert") and hasattr(self, "floats"):
        while self.floats_to_insert:
            float_id = self.floats_to_insert.pop(0)
            if float_id in self.floats:
                html += self.floats.pop(float_id)

    return tex, html

672

def parse_node_with_sc(self, node, **kwargs):
    """
    Convert a small-caps node.

    :param node: XML node of the sc
    :return: (tex, html). Only html is wrapped (<span class="smallcaps">)
    """
    tex, inner_html = self.parse_inner_node(node, **kwargs)

    return tex, '<span class="smallcaps">' + inner_html + "</span>"

678

def parse_node_with_sec(self, node, **kwargs):
    """
    <sec><title>TITLE</title><p>TEXT</p> becomes
    <section><h@i>TITLE</h@i><p>TEXT</p> (i is the current level and is increased for children)

    :param node: XML node of the sec
    :param kwargs: kwargs["sec_level"] is the heading level of this section
        (2 when absent). The children are parsed with sec_level + 1
    :return: (tex, html)
    """
    sec_level = kwargs.get("sec_level", 2)
    kwargs["sec_level"] = sec_level + 1

    label_tex = label_html = title_tex = title_html = None
    body_tex = []
    body_html = []

    for child in node:
        tag = normalize(child.tag)
        # label and title are parsed without the kwargs
        if tag == "label":
            label_tex, label_html = self.parse_node_with_mixed_content(child)
        elif tag == "title":
            title_tex, title_html = self.parse_node_with_mixed_content(child)
        else:
            child_tex, child_html = self.parse_node_with_mixed_content(child, **kwargs)
            body_tex.append(child_tex)
            body_html.append(child_html)

    # Heading: "LABEL TITLE". Each part is kept only if its HTML is not empty
    heading_tex = []
    heading_html = []
    if label_html:
        heading_tex.append(label_tex)
        heading_html.append(label_html)
    if title_html:
        heading_tex.append(title_tex)
        heading_html.append(title_html)

    tex = ""
    html = "<section>"
    if heading_html:
        tex = " ".join(heading_tex)
        html += f"<h{sec_level}>" + " ".join(heading_html) + f"</h{sec_level}>"

    tex += "".join(body_tex)
    html += "".join(body_html) + "</section>"

    return tex, html

726

def parse_node_with_string_name(self, node, **kwargs):
    """
    Convert a string-name node.

    :param node: XML node of the string-name
    :param kwargs: when "is_mixed_citation" is truthy, the HTML is title-cased
        (str.title) and goes through add_span_class_to_html_from_authors
    :return: (tex, html)
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    if kwargs.get("is_mixed_citation", False):
        html = add_span_class_to_html_from_authors(html.title(), **kwargs)

    return tex, html

735

def parse_node_with_strong(self, node, **kwargs):
    """
    Convert a bold node.

    :param node: XML node of the bold text
    :return: (tex, html). html is wrapped in <strong>;
        tex is wrapped in {\\bf ...} for a TeX file, in <strong> otherwise
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    tex = "{\\bf " + inner_tex + "}" if self.for_tex_file else "<strong>" + inner_tex + "</strong>"

    return tex, "<strong>" + inner_html + "</strong>"

746

def parse_node_with_styled_content(self, node, **kwargs):
    """
    Convert a styled-content node.

    :param node: XML node of the styled-content
    :return: (tex, html). html is wrapped in <span style="..."> when the node
        has a non empty style attribute
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    style = node.attrib["style"] if "style" in node.attrib else ""
    if style == "":
        return tex, html

    return tex, f'<span style="{style}">{html}</span>'

756

def parse_node_with_sub(self, node, **kwargs):
    """
    Convert a subscript node.

    :param node: XML node of the sub
    :return: (tex, html). html is wrapped in <sub>;
        tex uses \\textsubscript for a TeX file, <sub> otherwise
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    tex = "\\textsubscript{" + inner_tex + "}" if self.for_tex_file else "<sub>" + inner_tex + "</sub>"

    return tex, "<sub>" + inner_html + "</sub>"

767

def parse_node_with_sup(self, node, **kwargs):
    """
    Convert a superscript node.

    :param node: XML node of the sup
    :return: (tex, html). html is wrapped in <sup>;
        tex uses \\textsuperscript for a TeX file, <sup> otherwise
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    tex = "\\textsuperscript{" + inner_tex + "}" if self.for_tex_file else "<sup>" + inner_tex + "</sup>"

    return tex, "<sup>" + inner_html + "</sup>"

778

def parse_node_with_table_generic(self, node, **kwargs):
    """
    Convert a node of a table (table, row, cell,...) to the HTML element with
    the same name ("row" becomes <tr> and "entry" becomes <td>).

    A <table> gets the "table" class, plus "nowrap-col-<i>" for each
    colgroup/col with a width attribute (i is the 1-based position of the
    col). The rowspan, colspan, align and style attributes are copied;
    valign becomes a "td-valign-<value>" class.

    :param node: XML node of the table element
    :return: ("", html). No TeX is produced
    """
    _, inner_html = self.parse_inner_node(node, **kwargs)

    tag = normalize(node.tag)
    tag = {"row": "tr", "entry": "td"}.get(tag, tag)

    attributes = []

    if tag == "table":
        classes = ["table"]
        for position, col in enumerate(node.xpath("colgroup/col"), start=1):
            if "width" in col.attrib:
                classes.append(f"nowrap-col-{position}")
        class_table = " ".join(classes)
        attributes.append(f' class="{class_table}"')

    for name in ("rowspan", "colspan", "align"):
        if name in node.attrib:
            attributes.append(" " + name + '="' + node.attrib[name] + '"')
    if "valign" in node.attrib:
        # NOTE(review): on a <table> this adds a second class attribute
        attributes.append(' class="td-valign-' + node.attrib["valign"] + '"')
    if "style" in node.attrib:
        attributes.append(' style="' + node.attrib["style"] + '"')

    open_tag = "<" + tag + "".join(attributes) + ">"

    return "", f"{open_tag}{inner_html}</{tag}>"

815

def parse_node_with_table_wrap(self, node, **kwargs):
    """
    Create a <div class="table-wrap"> around the table.
    The label (in bold) and the caption are put in a
    <div class="table-wrap-header"> before the other children.

    When kwargs["append_floats"] is truthy, the instance has a "floats"
    attribute and the table-wrap has an id, the HTML is also stored in
    self.floats[table_id].

    :param node: XML node of the table-wrap
    :return: ("", html). No TeX is produced
    """
    table_id = node.attrib["id"] if "id" in node.attrib else None
    label = caption = None
    body = []

    for child in node:
        tag = normalize(child.tag)
        _, child_html = self.parse_node_with_mixed_content(child, **kwargs)
        if tag == "label":
            label = child_html
        elif tag == "caption":
            caption = child_html
        else:
            body.append(child_html)

    if table_id:
        text = '<div class="table-wrap table-responsive" id="' + table_id + '">'
    else:
        text = '<div class="table-wrap table-responsive">'

    # Header: "<strong>LABEL</strong> CAPTION", each part being optional
    header = []
    if label:
        header.append("<strong>" + label + "</strong>")
    if caption:
        header.append(caption)
    if header:
        text += '<div class="table-wrap-header">' + " ".join(header) + "</div>"

    text += "".join(body) + "</div>"

    if kwargs.get("append_floats") and hasattr(self, "floats") and table_id is not None:
        self.floats[table_id] = text

    return "", text

871

def parse_node_with_table_wrap_foot(self, node, **kwargs):
    """
    Render a <table-wrap-foot> node as a <div class="table-wrap-foot">.

    Only the <fn-group> children are rendered; they are parsed with
    keep_fn=True so that the footnotes stay inside this div.

    :param node: the <table-wrap-foot> XML node
    :param kwargs: parsing options, forwarded to the children
    :return: ("", html)
    """

    kwargs["keep_fn"] = True
    chunks = ['<div class="table-wrap-foot">']

    for child in node:
        if normalize(child.tag) != "fn-group":
            continue
        _, group_html = self.parse_node_with_mixed_content(child, **kwargs)
        chunks.append(group_html)

    chunks.append("</div>")

    return "", "".join(chunks)

892

def parse_node_with_toc(self, node, **kwargs):
    """
    Render a <toc> node as an HTML <table> wrapping the HTML of its content.

    :param node: the <toc> XML node
    :param kwargs: parsing options, forwarded to parse_inner_node
    :return: ("", html): the TeX of the inner content is discarded
    """
    _, rows_html = self.parse_inner_node(node, **kwargs)

    return "", "<table>{}</table>".format(rows_html)

902

def parse_node_with_toc_entry(self, node, **kwargs):
    """
    Render a <toc-entry> node as one HTML table row, followed by the rows of
    its nested <toc-entry> children.

    The row has two cells: "<label> <title>" and "p. <page>". When a
    <nav-pointer specific-use="pageindex"> is found in a <nav-pointer-group>,
    both cells are wrapped in a link to the item PDF, opened at that page
    (page index + 1).

    :param node: the <toc-entry> XML node
    :param kwargs: parsing options. "inside_toc_entry" is truthy when the
        entry is nested inside another <toc-entry>; the first cell then gets
        the "inside-toc" class.
    :return: ("", html)
    """
    html = label = title = child_text = page = anchor = ""
    inside_toc_entry = "inside_toc_entry" in kwargs and kwargs["inside_toc_entry"]
    toc_class = "inside-toc" if inside_toc_entry else ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "title":
            _, title = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "label":
            _, label = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "nav-pointer":
            _, page = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "nav-pointer-group":
            for grandchild in child:
                if grandchild.tag != "nav-pointer":
                    continue
                specific_use = grandchild.attrib.get("specific-use")
                if specific_use == "pagenum":
                    _, page = self.parse_node_with_mixed_content(grandchild, **kwargs)
                elif specific_use == "pageindex":
                    # An empty or non-numeric page index must not abort the parsing
                    # of the whole document: the entry is then rendered without link.
                    try:
                        anchor = int(grandchild.text) + 1
                    except (TypeError, ValueError):
                        pass
        elif tag == "toc-entry":
            # Nested entries are parsed with a fresh set of options
            _, text = self.parse_node_with_mixed_content(child, inside_toc_entry=True)
            child_text += text

    toc_text = f"{label} {title}"
    page_text = f"p. {page}"

    if anchor:
        href = reverse("item-pdf", kwargs={"pid": self.pid, "extension": "pdf"})
        href += f"#page={anchor}"
        toc_text = f'<a href="{href}">{toc_text}</a>'
        page_text = f'<a href="{href}">{page_text}</a>'

    html += f'<tr><td class="{toc_class}">{toc_text}</td><td class="toc-page">{page_text}</td></tr>'
    if child_text:
        html += child_text

    return "", html

956

def parse_node_with_underline(self, node, **kwargs):
    """
    Wrap the content of an <underline> node in <u>...</u>.

    The same wrapper is applied to both the TeX and the HTML values.

    :param node: the <underline> XML node
    :param kwargs: parsing options, forwarded to parse_inner_node
    :return: (tex, html)
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    return "<u>{}</u>".format(inner_tex), "<u>{}</u>".format(inner_html)

963

def parse_node_with_volume(self, node, **kwargs):
    """
    Parse a <volume> node.

    Inside a mixed citation ("is_mixed_citation" option), the HTML is
    post-processed by add_span_class_to_html_from_volume.

    :param node: the <volume> XML node
    :param kwargs: parsing options
    :return: (tex, html)
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    if kwargs.get("is_mixed_citation", False):
        html = add_span_class_to_html_from_volume(html, **kwargs)

    return tex, html

972

def parse_node_with_xref(self, node, **kwargs):
    """
    Render an <xref> node as an HTML link to the first id of its "rid" attribute.

    An id starting with "bib" is rewritten as "r" + the rest of the id.
    When ref-type is "fig", "table" or "textbox" and the instance has a
    "floats_to_insert" list, every referenced id is appended to that list.

    :param node: the <xref> XML node
    :param kwargs: parsing options. A truthy "ignore_xref" drops the node.
    :return: (tex, html); ("", "") if the xref is ignored or has no usable rid
    """
    if kwargs.get("ignore_xref"):
        return "", ""

    # "rid" may hold several space-separated ids. A missing, empty or
    # whitespace-only attribute yields no id at all: nothing to link to.
    rids = (node.get("rid") or "").split()
    if not rids:
        return "", ""

    tex, html = self.parse_inner_node(node, **kwargs)

    target = rids[0]
    if target.startswith("bib"):
        target = "r" + target[3:]
    html = f'<a href="#{target}">{html}</a>'

    # ref-type belongs to the node, not to each id: look it up only once
    ref_type = node.get("ref-type") or None
    if ref_type in ("fig", "table", "textbox") and hasattr(self, "floats_to_insert"):
        self.floats_to_insert.extend(rids)

    return tex, html

995

def parse_inner_node(self, node, **kwargs):
    """
    Return the TeX and HTML values of the content of a node (its text and its
    children), without any wrapping tag and without the node's tail.

    Used by the parse_node_with_<tag> functions for nodes that have a
    different tag in HTML.

    :param node: XML node
    :param kwargs: parsing options, forwarded to the children with
        is_top forced to False
    :return: (tex, html)
    """
    kwargs["is_top"] = False
    kwargs.setdefault("is_body_html", False)

    tex_parts = []
    html_parts = []

    raw_text = node.text
    if raw_text:
        tex_parts.append(unicode_to_latex(raw_text) if self.for_tex_file else raw_text)
        html_parts.append(escape(raw_text))

    for child in node:
        child_tex, child_html = self.parse_node_with_mixed_content(child, **kwargs)
        tex_parts.append(child_tex)
        html_parts.append(child_html)

    return "".join(tex_parts), "".join(html_parts)

1020

def parse_node_with_mixed_content(self, node, **kwargs):
    """
    Parse an XML node which mixes text and XML sub-nodes, and return its TeX and HTML values.
    Ex: <node>text1 <a>text_a</a> text2 <b>text_b</b>b_tail</node>
    Some inner nodes are removed, others are kept or replaced by their HTML equivalent.
    The function is called recursively to get the values of the children.

    :param node: XML Node (None is accepted and gives empty values)
    :param kwargs: params of the function. Missing options get a default value below.
    :return: (tex, html)
    """

    if node is None:
        return "", ""

    # The tail is the text following the end of the node
    # Ex: <node>text1<a>text_a</a>a_tail</node>
    # The returned values include the tail only if the function was called
    # recursively (is_top is False)
    kwargs["is_top"] = kwargs["is_top"] if "is_top" in kwargs else True

    # sec_level is used to add <h1>, <h2>,... in the HTML text while parsing nodes like <sec>
    kwargs["sec_level"] = kwargs["sec_level"] if "sec_level" in kwargs else 2

    # Text in <comment> is parsed to add HTML link.
    kwargs["add_HTML_link"] = kwargs["add_HTML_link"] if "add_HTML_link" in kwargs else False

    # base_url to image links
    kwargs["base_url"] = kwargs["base_url"] if "base_url" in kwargs else ""

    # footnotes are removed from the fulltext (and put at the end) except for those in a table
    kwargs["keep_fn"] = kwargs["keep_fn"] if "keep_fn" in kwargs else False

    kwargs["is_citation"] = kwargs["is_citation"] if "is_citation" in kwargs else False
    kwargs["is_comment"] = kwargs["is_comment"] if "is_comment" in kwargs else False
    # mixed-citation ignores ext-link
    kwargs["add_ext_link"] = kwargs["add_ext_link"] if "add_ext_link" in kwargs else False

    # TODO remove once jats_parser has been validated against xmldata
    kwargs["temp_math"] = kwargs["temp_math"] if "temp_math" in kwargs else False
    kwargs["temp_tex"] = kwargs["temp_tex"] if "temp_tex" in kwargs else False
    kwargs["is_mixed_citation"] = (
        kwargs["is_mixed_citation"] if "is_mixed_citation" in kwargs else False
    )
    kwargs["is_body_html"] = kwargs["is_body_html"] if "is_body_html" in kwargs else False

    tag = normalize(node.tag)

    # pub-id/object-id are ignored by default as they are treated separately
    # (they are kept only inside a <comment>)
    if not (kwargs["is_comment"]) and tag in ("pub-id", "object-id"):
        return "", ""

    # The flags set here are passed down to the children of the node
    if tag in ("mixed-citation", "toc"):
        kwargs["is_citation"] = True
    elif tag == "comment":
        kwargs["is_comment"] = True

    tex = html = inner_tex = inner_html = ""

    # I. Add the node's text.
    # Some tags have a corresponding parse_node_with_@tag function to generate the HTML text.

    # Check if the parse_node_with_@tag exists.
    # Some tags share the function of another tag: tag -> suffix of the function name
    tag_mapped = {
        "statement": "sec",
        "disp-formula": "inline-formula",
        "chapter-title": "article-title",
        "bold": "strong",
        "table": "table-generic",
        "th": "table-generic",
        "tr": "table-generic",
        "td": "table-generic",
        "thead": "table-generic",
        "tbody": "table-generic",
        "colgroup": "table-generic",
        "col": "table-generic",
        "tgroup": "table-generic",
        "entry": "table-generic",
        "row": "table-generic",
    }

    fct_name = tag_mapped[tag] if tag in tag_mapped else tag
    fct_name = "parse_node_with_" + fct_name.replace("-", "_")
    ftor = getattr(self, fct_name, None)
    if callable(ftor):
        inner_tex, inner_html = ftor(node, **kwargs)
    elif tag in ("ext-link", "uri"):
        # Add HTML links
        inner_tex = inner_html = self.helper_add_link_from_node(node, **kwargs)
        # Update self.ext_links. Useful for <ext-link> deep in a <mixed_citation>,
        # and not caught by parse_citation_node
        if tag == "ext-link" and not kwargs["is_comment"] and kwargs["add_ext_link"]:
            is_extid_value = self.parse_ext_link(node, **kwargs)
            if is_extid_value and kwargs["is_mixed_citation"]:
                # an extid has been found in a mixed_citation, no need to add the text of the id here
                inner_tex = inner_html = ""
    elif tag == "supplementary-material":
        # Called for its side effect only: the node adds nothing to the returned values
        self.parse_supplementary_material(node, **kwargs)
    else:
        # II. Generic case: no dedicated function for this tag

        # II.1. Add the node text (before the children text)
        if node.text is not None:
            node_text = node.text
            if self.for_tex_file:
                node_text = unicode_to_latex(node_text)
            inner_tex += node_text
            inner_html += escape(node.text)

        # II.2. children
        # The children get a copy of the options, with is_top set to False
        # so that their tail is included

        child_kwargs = kwargs.copy()
        child_kwargs["is_top"] = False

        for child in node:
            child_tex, child_html = self.parse_node_with_mixed_content(child, **child_kwargs)

            # Case where an ext-link has been removed in a mixed-citation
            # We may have "title. , (year)"
            # Remove the punctuation that is now useless: the last character of the
            # text accumulated so far and the first character of the child's text
            if (
                kwargs["is_mixed_citation"]
                and child_html
                and child_html[0] in [",", "."]
                and inner_html[-2:] == ". "
            ):
                inner_html = inner_html[0:-1]
                child_html = child_html[1:]
                inner_tex = inner_tex[0:-1]
                child_tex = child_tex[1:]

            inner_tex += child_tex
            inner_html += child_html

        # II.3. wrap the children text with html links
        # (skipped when the node text starts with a newline or a space)
        if kwargs["add_HTML_link"] and node.text:
            match = re.match(r"[\n ]+", node.text)
            if not match:
                inner_html = make_links_clickable(node.text, inner_html)

    tex += inner_tex
    html += inner_html

    # III. Add the node's tail for children
    if node.tail and not kwargs["is_top"]:
        node_tail = node.tail
        if self.for_tex_file:
            node_tail = unicode_to_latex(node_tail)
        tex += node_tail
        html += escape(node.tail)

    return tex, html

1172

def parse_abstract(self, node, **kwargs):
    """
    Parse an <abstract> node and append its data to self.abstracts.

    The "tag" of the abstract is its abstract-type attribute ("abstract" if
    the attribute is missing or equal to "author"). The language falls back
    to the language of the document.

    :param node: the <abstract> XML node
    """
    abstract_type = get_normalized_attrib(node, "abstract-type") or "abstract"
    tag = "abstract" if abstract_type == "author" else abstract_type
    lang = get_normalized_attrib(node, "lang") or self.lang

    value_tex, value_html = self.parse_node_with_mixed_content(node)
    value_xml = get_xml_from_node(node)

    abstract = dict(
        tag=tag,
        lang=lang,
        value_xml=value_xml,
        value_html=value_html,
        value_tex=value_tex,
    )
    self.abstracts.append(abstract)

1190

def parse_aff_alternatives(self, node, **kwargs):
    """
    Parse an <aff-alternatives> node and add the address it holds to the
    contributors already stored in self.contributors.

    The address is added to the contributors that reference the id of the
    node in their "xrefs", or to every contributor when no <label> was found
    in the <aff> children. Children other than <aff> generate a warning.

    :param node: the <aff-alternatives> XML node
    """
    xref_id = get_normalized_attrib(node, "id") or ""
    address = ""
    aff_to_all = True

    for child in node:
        tag = normalize(child.tag)

        if tag != "aff":
            where = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: where + " " + tag})
            continue

        # Skip the formatted aff and use only the complete address text
        # TODO support <aff> properly
        for part in child:
            if part.tag == "label" and address == "":
                label = get_text_from_node(part)
                # The address is the text of the <aff> without its leading label
                address = get_text_from_node(child)[len(label) :]
                aff_to_all = False
        if address == "" and child.text:
            address = child.text

    if address == "":
        return

    for contrib in self.contributors:
        if address in contrib["addresses"]:
            continue
        if ("xrefs" in contrib and xref_id in contrib["xrefs"]) or aff_to_all:
            contrib["addresses"].append(address)
            contrib["contrib_xml"] = get_contrib_xml(contrib)

1227

def parse_award_group(self, node, **kwargs):
    """
    Parse an <award-group> node.

    An award is appended to self.awards only if both an <award-id> and a
    <funding-source> have been found. Other children generate a warning.

    :param node: the <award-group> XML node
    """
    award = {"abbrev": None, "award_id": None}

    for child in node:
        tag = normalize(child.tag)

        if tag == "award-id":
            award["award_id"] = child.text
        elif tag == "funding-source":
            award["abbrev"] = get_text_from_node(child)
        else:
            where = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: where + " " + tag})

    if all(value is not None for value in award.values()):
        self.awards.append(award)

1251

def parse_contrib_group(self, node, **kwargs):
    """
    Parse a <contrib-group> node.

    Each <contrib> is appended to self.contributors. Its role is the
    content-type of the group without its final "s", followed by "|" and
    the role of the contrib itself when there is one.
    <aff-alternatives> and <fn> children are handled as well; other
    children generate a warning.

    :param node: the <contrib-group> XML node
    """
    role = node.get("content-type") or ""
    if role.endswith("s"):
        role = role[:-1]

    for child in node:
        tag = normalize(child.tag)

        if tag == "contrib":
            contrib = self.get_data_from_contrib(child)
            own_role = contrib["role"]
            contrib["role"] = f"{role}|{own_role}" if own_role else role
            contrib["contrib_xml"] = get_xml_from_node(child)
            self.contributors.append(contrib)
            continue

        if tag == "aff-alternatives":
            self.parse_aff_alternatives(child)
            continue

        if tag == "fn":
            _, fn_html = self.parse_node_with_fn(child, keep_fn=True, keep_fn_label=False)
            fn_xml = get_xml_from_node(child)
            self.footnotes_xml += fn_xml
            self.footnotes_html += fn_html
            continue

        where = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
        self.warnings.append({self.pid: where + " " + tag})

1282

def parse_counts(self, node, **kwargs):
    """
    Parse a <counts> node: append a (tag, value) tuple to self.counts for
    each child that has a value.

    The value is the "count" attribute of the child or, if there is no such
    attribute, its text. "book-page-count" is stored as "page-count".

    :param node: the <counts> XML node
    """
    for child in node:
        value = child.get("count")
        if value is None:
            value = child.text
        if value is None:
            continue

        name = normalize(child.tag)
        self.counts.append(("page-count" if name == "book-page-count" else name, value))

1295

def parse_ext_link(self, node, **kwargs):
    """
    Parse an <ext-link> node.

    The data of the link are first given to add_extids_from_node_with_link.
    If the link is not an extid, and the "add_ext_link" option is truthy,
    the data are appended to self.ext_links (unless they are already there
    or the link is a cover).

    :param node: the <ext-link> XML node
    :param kwargs: parsing options
    :return: True if the link was recognized as an extid
    """
    datas = self.get_data_from_ext_link(node)
    extid_value = self.add_extids_from_node_with_link(datas)
    is_extid = extid_value[0] is not None

    if kwargs.get("add_ext_link", False) and not is_extid:
        if datas not in self.ext_links and datas["rel"] != "cover":
            self.ext_links.append(datas)

    return is_extid

1310

def parse_front_matter(self, node, **kwargs):
    """
    Parse a <front-matter> node: store its XML, and the HTML of its
    <foreword> and <toc> children.

    :param node: the <front-matter> XML node
    """
    self.frontmatter_xml = get_xml_from_node(node)
    self.frontmatter_foreword_html = ""

    # tag of the child -> instance variable that receives its HTML
    targets = {
        "foreword": "frontmatter_foreword_html",
        "toc": "frontmatter_toc_html",
    }

    for child in node:
        attr_name = targets.get(normalize(child.tag))
        if attr_name is None:
            continue
        _, child_html = self.parse_node_with_mixed_content(child)
        setattr(self, attr_name, child_html)

1322

def parse_id(self, node, **kwargs):
    """
    Parse an id node (attribute pub-id-type, book-id-type or book-part-id-type)
    and store its text according to the type of the id:
      - "pii": self.pii, only if the current pid starts with "CR" and is longer than 2 characters
      - "numdam-id", "mathdoc-id": self.pid
      - "ark": appended to self.extids
      - "doi", "eid": appended to self.ids (a doi is also stored in self.doi)
    Other types are ignored.

    :param node: the id XML node
    """
    node_id = node.text

    node_type = ""
    for attr_name in ("pub-id-type", "book-id-type", "book-part-id-type"):
        if attr_name in node.attrib:
            node_type = node.attrib[attr_name]
            break

    if node_type == "pii":
        # Elsevier ids get a special treatment: web scrapping to find the date_published
        if self.pid and len(self.pid) > 2 and self.pid[0:2] == "CR":
            self.pii = node_id
        return

    if node_type in ("numdam-id", "mathdoc-id"):
        self.pid = node_id
        return

    if node_type == "ark":
        self.extids.append((node_type, node_id))
        return

    if node_type in ("doi", "eid"):
        self.ids.append((node_type, node_id))
        if node_type == "doi":
            self.doi = node_id

1346

def parse_kwd_group(self, node, **kwargs):
    """
    Parse a <kwd-group> node and extend self.kwds with its keywords.

    Keywords come from the <kwd> children, or from the split text of an
    <unstructured-kwd-group> child. Other children generate a warning.
    Each keyword is stored as {"type", "lang", "value"}: the type is the
    "content-type" attribute of the group or, failing that, its
    "kwd-group-type" attribute; the language falls back to the language of
    the document.

    :param node: the <kwd-group> XML node
    """
    kwds = []

    for child in node:
        tag = normalize(child.tag)

        if tag == "kwd":
            kwds.append(child.text)
        elif tag == "unstructured-kwd-group":
            value_tex, _ = self.parse_node_with_mixed_content(child)
            kwds = split_kwds(value_tex)
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    # The attribute is "content-type": "content-node_type" was never matched
    content_type = node.get("content-type") or node.get("kwd-group-type") or ""
    lang = get_normalized_attrib(node, "lang") or self.lang

    self.kwds.extend([{"type": content_type, "lang": lang, "value": kwd} for kwd in kwds])

1376

def parse_ref_list(self, node, **kwargs):
    """
    Parse a <ref-list> node.

    Each <ref> creates a JatsRef: the object is appended to self.bibitems,
    its HTML citation to self.bibitem and its warnings to self.warnings.
    <p> children are parsed for their side effects only. Other children
    generate a warning.

    :param node: the <ref-list> XML node
    """
    for child in node:
        tag = normalize(child.tag)

        if tag == "ref":
            ref = JatsRef(tree=child, lang=self.lang)
            self.warnings.extend(ref.warnings)
            self.bibitems.append(ref)
            self.bibitem.append(ref.citation_html)
            continue

        if tag == "p":
            # Elsevier can store supplementary-material inside ref-list / p
            self.parse_node_with_mixed_content(child)
            continue

        where = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
        self.warnings.append({self.pid: where + " " + tag})

1399

def parse_related_article(self, node, **kwargs):
    """
    Parse a <related-article> node and append a relation (rel_type, id_value)
    to self.relations.

    If the instance has a pii and the id of the related article is neither
    "NONE" nor a value containing "10.", the id is replaced by the result of
    scrapping.fetch_article.

    :param node: the <related-article> XML node
    """
    rel_type = get_normalized_attrib(node, "related-article-type") or ""
    id_value = node.text

    is_pii = bool(id_value) and "10." not in id_value and id_value != "NONE"
    if hasattr(self, "pii") and is_pii:
        # a pii is used instead of a DOI
        # Call Elsevier to get the doi
        id_value = scrapping.fetch_article(self.doi, id_value, pii_doi_equivalence=True)

    relation = Foo()
    relation.rel_type = rel_type
    relation.id_value = id_value

    self.relations.append(relation)

1415

def parse_related_object(self, node, **kwargs):
    """
    Parse a <related-object> node.

    Without a document-id-type attribute, the data of the node are appended
    to self.related_objects. With one, a "refers to" relation to the
    document-id is appended to self.relations, unless the id is "NONE".
    A non-empty id that does not contain "10." is first replaced by the
    result of scrapping.fetch_article.

    :param node: the <related-object> XML node
    """
    data = {
        "rel": node.get("link-type") or "",
        "mimetype": node.get("content-type") or "",
        "location": get_normalized_attrib(node, "href") or "",
        "base": get_normalized_attrib(node, "base") or "",
        "metadata": get_xml_from_node(node),
    }

    if not (node.get("document-id-type") or ""):
        self.related_objects.append(data)
        return

    id_value = node.get("document-id") or ""
    if id_value == "NONE":
        return

    if id_value and "10." not in id_value:
        # a pii is used instead of a DOI
        # Call Elsevier to get the doi
        id_value = scrapping.fetch_article(self.doi, id_value, pii_doi_equivalence=True)

    relation = Foo()
    relation.rel_type = "refers to"
    relation.id_value = id_value

    self.relations.append(relation)

1448

def parse_sec(self, node, **kwargs):
    """
    Parse a <sec> node: only its <ref-list> children are parsed.

    <title> children are ignored; any other child generates a warning.

    :param node: the <sec> XML node
    """
    for child in node:
        tag = normalize(child.tag)

        if tag == "title":
            continue

        if tag == "ref-list":
            self.parse_ref_list(child)
            continue

        where = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
        self.warnings.append({self.pid: where + " " + tag})

1467

def parse_self_uri(self, node, **kwargs):
    """
    Parse a <self-uri> node and append a "full-text" stream to self.streams.

    The mimetype is the content-type attribute ("text/html" by default).
    Nodes with the "application/xml" mimetype are not stored.

    :param node: the <self-uri> XML node
    """
    mimetype = node.get("content-type") or "text/html"
    location = get_normalized_attrib(node, "href") or ""
    base = get_normalized_attrib(node, "base") or ""

    # The XML of the Elsevier archive do not declare the PDF location like the other Mathdoc collections:
    # The collection folder is missing: add it back
    if hasattr(self, "pii") and hasattr(self, "issue"):
        collection_dir = self.issue.journal.pid
        if os.path.dirname(location) != collection_dir:
            location = os.path.join(collection_dir, self.issue.pid, location)

    if self.no_bib:
        location = "http://www.numdam.org/item/" + os.path.basename(location)

    text = "" if node.text is None else normalize_space(node.text)
    stream = {
        "rel": "full-text",
        "mimetype": mimetype,
        "location": location,
        "base": base,
        "text": text,
    }

    # Ext-links, Related-objects used metadata instead of text. Strange difference ?
    # xml_cmds ignore "application/xml" in add_objects_with_location: they are ignored here.
    if mimetype == "application/xml":
        return

    self.streams.append(stream)

1495

def parse_sub_article(self, node, **kwargs):
    """
    Parse a <sub-article> node (used for translations): a JatsArticle is
    created from the node and appended to self.translations.

    :param node: the <sub-article> XML node
    """
    self.translations.append(JatsArticle(tree=node))

1500

def parse_subj_group(self, node, **kwargs):
    """
    Parse a <subj-group> node: each <subject> child is appended to self.subjs
    as {"type", "lang", "value"}. Other children generate a warning.

    The type is the subj-group-type attribute of the group; the language
    falls back to the language of the document.

    :param node: the <subj-group> XML node
    """
    lang = get_normalized_attrib(node, "lang") or self.lang
    group_type = node.get("subj-group-type") or ""

    for child in node:
        tag = normalize(child.tag)

        if tag != "subject":
            where = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: where + " " + tag})
            continue

        subject = {"type": group_type, "lang": lang, "value": get_text_from_node(child)}
        self.subjs.append(subject)

1522

def parse_supplementary_material(self, node, **kwargs):
    """
    Parse a <supplementary-material> node and append its data to
    self.supplementary_materials.

    The location is the href attribute or, failing that, the id attribute.
    The mimetype is given by the resolver when the node does not declare
    one. The material is ignored if a material with the same file name is
    already stored.

    :param node: the <supplementary-material> XML node
    """
    caption = ""
    for child in node:
        if child.tag == "caption":
            _, caption = self.parse_node_with_mixed_content(child)

    location = (
        get_normalized_attrib(node, "href") or get_normalized_attrib(node, "id") or ""
    )
    mimetype = node.attrib.get("mimetype") or resolver.get_mimetype(location)

    material = {
        "rel": node.attrib.get("content-type") or "supplementary-material",
        "mimetype": mimetype,
        "location": location,
        "base": "",
        "metadata": "",
        "caption": caption or "",
    }

    # Materials are compared on their file name only
    file_name = os.path.basename(location)
    same_name_count = sum(
        1
        for item in self.supplementary_materials
        if os.path.basename(item["location"]) == file_name
    )
    if same_name_count == 0:
        self.supplementary_materials.append(material)

1553

def parse_title(self, node, **kwargs):
    """
    Set self.title_tex and self.title_html from a title node.
    The xrefs of the title are ignored.

    :param node: the title XML node
    """
    # In xmldata.py, title_xml had the <title_group> tag:
    # self.title_xml can't be set in parse_title
    values = self.parse_node_with_mixed_content(node, ignore_xref=True)
    self.title_tex, self.title_html = values

1560

def parse_title_group(self, node, **kwargs):
    """
    Parse a <title-group> node: set the title (tex, html, xml), the
    translated title and the abbreviated title of the object.

    A <subtitle> is appended to the title, after a space. The footnotes of a
    <fn-group> are added to self.footnotes_xml / self.footnotes_html; in that
    case the <fn-group> is left out of self.title_xml. Other children
    generate a warning.

    :param node: the <title-group> XML node
    """
    title_tags = ("title", "journal-title", "article-title", "book-title", "issue-title")
    has_fn_group = False

    for child in node:
        tag = normalize(child.tag)

        if tag in title_tags:
            self.parse_title(child)
        elif tag == "subtitle":
            subtitle_tex, subtitle_html = self.parse_node_with_mixed_content(child)
            self.title_tex += " " + subtitle_tex
            self.title_html += " " + subtitle_html
        elif tag == "trans-title-group":
            self.parse_trans_title_group(child)
        elif tag == "abbrev-title":
            _, self.abbrev = self.parse_node_with_mixed_content(child)
        elif tag == "fn-group":
            has_fn_group = True
            for fn_node in child:
                if fn_node.tag != "fn":
                    continue
                _, fn_html = self.parse_node_with_fn(
                    fn_node, keep_fn=True, keep_fn_label=False
                )
                fn_xml = get_xml_from_node(fn_node)
                self.footnotes_xml += fn_xml
                self.footnotes_html += fn_html
        else:
            where = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: where + " " + tag})

    xml_source = node
    if has_fn_group:
        # fn-group is now a funding statement and will be exported separately in the XML:
        # => remove it from the title-group
        xml_source = etree.Element("title-group")
        for child in node:
            if normalize(child.tag) != "fn-group":
                xml_source.append(copy.deepcopy(child))

    self.title_xml = get_xml_from_node(xml_source)

1609

def parse_trans_abstract(self, node, **kwargs):
    """
    Parse a <trans-abstract> node and append its data to self.abstracts.

    The "tag" of the abstract is its abstract-type attribute ("abstract" if
    the attribute is missing or equal to "author"). The language is "und"
    when the node does not declare one.

    :param node: the <trans-abstract> XML node
    """
    abstract_type = get_normalized_attrib(node, "abstract-type") or "abstract"
    tag = "abstract" if abstract_type == "author" else abstract_type
    lang = get_normalized_attrib(node, "lang") or "und"

    value_tex, value_html = self.parse_node_with_mixed_content(node)
    value_xml = get_xml_from_node(node)

    abstract = dict(
        tag=tag,
        lang=lang,
        value_xml=value_xml,
        value_html=value_html,
        value_tex=value_tex,
    )
    self.abstracts.append(abstract)

1626

def parse_trans_title(self, node, **kwargs):
    """
    Set the translated title (tex, html and xml) from a <trans-title> node.

    :param node: the <trans-title> XML node
    """
    values = self.parse_node_with_mixed_content(node)
    self.trans_title_tex, self.trans_title_html = values
    self.trans_title_xml = get_xml_from_node(node)

1630

def parse_trans_title_group(self, node, **kwargs):
    """
    Parse a <trans-title-group> node: its <trans-title> children set the
    translated title, other children generate a warning.
    The language of the translation is "und" when the node does not declare one.

    :param node: the <trans-title-group> XML node
    """
    for child in node:
        tag = normalize(child.tag)

        if tag != "trans-title":
            where = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: where + " " + tag})
            continue

        self.parse_trans_title(child)

    self.trans_lang = get_normalized_attrib(node, "lang") or "und"

1649

def get_data_from_contrib(self, node):
    """
    Build the contributor dict described by a <contrib> node.

    A <contrib> creates exactly 1 person, defined in <name>, <string-name> or
    <name-alternatives>. In a <mixed-citation>, each <name> creates 1 person, so this
    code can not be shared with the citation parsing.

    :param node: the <contrib> element
    :return: the dict obtained from create_contributor(), filled with the node data
    """
    contributor = create_contributor()

    for element in node:
        kind = element.tag

        if kind == "name" or kind == "string-name":
            self.update_data_from_name(element, contributor)
            # A <string-name> without structured name parts is kept as raw text
            if (
                kind == "string-name"
                and contributor["first_name"] == ""
                and contributor["last_name"] == ""
            ):
                contributor["string_name"] = element.text or ""
        elif kind == "name-alternatives":
            contributor["mid"] = self.get_data_from_name_alternatives(element)
        elif kind == "contrib-id":
            id_type = element.get("contrib-id-type") or ""
            # Only ORCID and IdRef identifiers are stored
            if id_type in ("orcid", "idref"):
                contributor[id_type] = element.text or ""
        elif kind == "address":
            address_text = get_text_from_node(element)
            contributor["addresses"].append(address_text)
        elif kind == "email":
            contributor["email"] = element.text or ""
        elif kind == "xref":
            # Elsevier uses xref/aff-alternatives to store affiliations
            if (element.get("ref-type") or "") == "aff":
                target = element.get("rid") or ""
                if target == "":
                    # No rid attribute: fall back to the text of the <xref>
                    target = get_text_from_node(element)
                if target != "":
                    if "xrefs" in contributor:
                        contributor["xrefs"].append(target)
                    else:
                        contributor["xrefs"] = [target]
        elif kind == "collab":
            contributor["string_name"] = element.text or ""
        elif kind == "role":
            # Role is used in BJHTUP11 as a textual description of the role (ex "Présidente").
            # The node value can not be assigned to the "role" key as we want a controlled
            # vocabulary (author / editor / organizer...): the value is ignored.
            pass
        else:
            method_name = inspect.currentframe().f_code.co_name
            self.warnings.append(
                {self.pid: self.__class__.__name__ + "." + method_name + " " + element.tag}
            )

    # The addresses are deliberately NOT sorted: sorting caused differences between the
    # HTML and the PDF (discovered in PCJ). The sort had been introduced on 22/09/2020,
    # based on differences between the Cedrics->JATS XSLT and the Cedrics import.

    helper_update_name_params(contributor)

    # Flags stored as attributes of the <contrib> node itself
    if node.get("corresp") == "yes":
        contributor["corresponding"] = True
    contributor["deceased_before_publication"] = node.get("deceased") == "yes"
    contributor["equal_contrib"] = node.get("equal-contrib") == "yes"

    return contributor

1729

def get_data_from_custom_meta(self, node):
    """
    Extract the (name, value) pair of a <custom-meta> node.

    The text of <meta-name> and <meta-value> is returned as is (it may be None when
    the element is empty); both default to "" when the element is absent. Any other
    child is recorded in self.warnings.

    :param node: the <custom-meta> element
    :return: tuple (name, value)
    """
    found = {"meta-name": "", "meta-value": ""}

    for element in node:
        element_tag = normalize(element.tag)

        if element_tag in found:
            found[element_tag] = element.text
        else:
            method_name = inspect.currentframe().f_code.co_name
            self.warnings.append(
                {self.pid: self.__class__.__name__ + "." + method_name + " " + element_tag}
            )

    return found["meta-name"], found["meta-value"]

1753

def get_data_from_date(self, node, ignore_month=False):
    """
    Return the date stored in a date node as a string.

    The "iso-8601-date" attribute wins when present. Otherwise the string is rebuilt
    from the <year>, <month> and <day> children as "year[-month][-day]"; month and day
    are only appended when a year was found.

    :param node: the date element (ex: <pub-date>, <date>)
    :param ignore_month: when True, a <month> child is not used (it is then reported in
        self.warnings like any unexpected child)
    :return: the date string (the raw <year> text, possibly None, when nothing can be appended)
    """
    if "iso-8601-date" in node.attrib:
        return node.attrib["iso-8601-date"]

    year = month = day = ""
    for part in node:
        part_tag = normalize(part.tag)

        if part_tag == "year":
            year = part.text
        elif part_tag == "month" and not ignore_month:
            month = part.text
        elif part_tag == "day":
            day = part.text
        else:
            method_name = inspect.currentframe().f_code.co_name
            self.warnings.append(
                {self.pid: self.__class__.__name__ + "." + method_name + " " + part_tag}
            )

    if not year:
        # Without a year, month and day are never appended
        return year

    return "-".join([year] + [piece for piece in (month, day) if piece])

1787

def get_data_from_ext_link(self, node, **kwargs):
    """
    Return the link data described by an <ext-link> node, without side effects on self.

    :param node: the <ext-link> element
    :param kwargs: forwarded to parse_inner_node, with "add_HTML_link" forced to False
    :return: dict with the keys "rel" (the ext-link-type attribute), "mimetype" (always ""),
        "location" (the href attribute), "base" and "metadata" (second value returned
        by parse_inner_node)
    """
    rel = node.get("ext-link-type") or ""
    location = get_normalized_attrib(node, "href") or ""
    base = get_normalized_attrib(node, "base") or ""

    kwargs["add_HTML_link"] = False
    inner = self.parse_inner_node(node, **kwargs)
    _, metadata = inner

    return {
        "rel": rel,
        "mimetype": "",
        "location": location,
        "base": base,
        "metadata": metadata,
    }

1805

def get_data_from_history(self, node):
    """
    Return the dates listed in a <history> node.

    Each child with a "date-type" attribute yields {"type": <date-type>, "date": <str>},
    the date string being computed by get_data_from_date. Children without "date-type"
    are recorded in self.warnings.

    :param node: the <history> element
    :return: list of dicts, in document order
    """
    # TODO: transform history_dates in a hash where date-type is the key
    #       => Change database_cmds
    collected = []

    for entry in node:
        if "date-type" not in entry.attrib:
            method_name = inspect.currentframe().f_code.co_name
            self.warnings.append(
                {self.pid: self.__class__.__name__ + "." + method_name + " " + entry.tag}
            )
            continue

        kind = entry.attrib["date-type"]
        collected.append({"type": kind, "date": self.get_data_from_date(entry)})

    return collected

1827

def update_data_from_name(self, node, contributor):
    """
    Copy the name parts of a <name> / <string-name> node into a contributor dict.

    Children without text are skipped. <given-names>, <surname>, <prefix> and <suffix>
    fill "first_name", "last_name", "prefix" and "suffix"; any other child that has
    text is recorded in self.warnings.

    :param node: the <name> or <string-name> element
    :param contributor: dict updated in place
    """
    key_by_tag = {
        "given-names": "first_name",
        "surname": "last_name",
        "prefix": "prefix",
        "suffix": "suffix",
    }

    for part in node:
        if part.text is None:
            continue

        key = key_by_tag.get(part.tag)
        if key is not None:
            contributor[key] = part.text
        else:
            method_name = inspect.currentframe().f_code.co_name
            self.warnings.append(
                {self.pid: self.__class__.__name__ + "." + method_name + " " + part.tag}
            )

1849

def get_data_from_name_alternatives(self, node):
    """
    Return the index name stored in a <name-alternatives> node.

    The value is the text of the <string-name specific-use="index"> child (the last one
    wins), or "" when there is none. Children without text are skipped; children with
    text that are not <string-name> are recorded in self.warnings.

    :param node: the <name-alternatives> element
    :return: the index name, or ""
    """
    mid = ""

    for alternative in node:
        if alternative.text is None:
            continue

        if alternative.tag != "string-name":
            method_name = inspect.currentframe().f_code.co_name
            self.warnings.append(
                {self.pid: self.__class__.__name__ + "." + method_name + " " + alternative.tag}
            )
            continue

        if alternative.get("specific-use") == "index":
            mid = alternative.text

    return mid

1870

def get_data_from_uri(self, node, **kwargs):
    """
    Return the link data described by a <uri> node, without side effects on self.

    :param node: the <uri> element
    :param kwargs: forwarded to parse_inner_node, with "add_HTML_link" forced to False
    :return: dict with the same keys as get_data_from_ext_link; "rel" is None, "mimetype"
        and "base" are "", "location" is the href attribute
    """
    location = get_normalized_attrib(node, "href") or ""

    kwargs["add_HTML_link"] = False
    inner = self.parse_inner_node(node, **kwargs)
    _, metadata = inner

    return {
        "rel": None,
        "mimetype": "",
        "location": location,
        "base": "",
        "metadata": metadata,
    }

1880

def helper_add_link_from_node(self, node, **kwargs):
    """
    Return the text to output for a link node (ex: <ext-link>, <uri>).

    The link data is obtained from the get_data_from_<tag> function matching the node
    tag. When the link has no "rel" or its "rel" is "uri", the text becomes a link to
    the location: a \\href{}{} command if self.for_tex_file is set, otherwise the result
    of make_links_clickable. In the other cases the raw node text is returned.

    :param node: the link element
    :param kwargs: forwarded to the get_data_from_<tag> function
    :return: the text of the link
    """
    raw_text = node.text or ""

    getter_name = "get_data_from_" + normalize(node.tag).replace("-", "_")
    getter = getattr(self, getter_name)
    link = getter(node, **kwargs)

    rel = link["rel"]
    if rel and rel != "uri":
        return raw_text

    location = link["location"]
    if self.for_tex_file:
        return "\\href{" + location + "}{" + link["metadata"] + "}"
    return make_links_clickable(location, link["metadata"])

1894

def get_list_start_value(self, list_node):
    """
    Return the number of items that precede a list continued from previous lists.

    A list may reference, with its "continued-from" attribute, the id of the list it
    continues. The start value is the total number of children of all the lists found
    by following that chain in self.tree.

    :param list_node: the list element
    :return: 0 when the list does not continue another one; otherwise the cumulated
        number of children of the lists it continues. A "continued-from" that points to
        an id missing from the tree contributes 0 (it used to raise UnboundLocalError).
    """
    start = 0
    visited_ids = set()
    current = list_node

    while True:
        continued_from = current.get("continued-from")
        # Stop at the head of the chain, or on a cyclic reference in a malformed document
        if continued_from is None or continued_from in visited_ids:
            break
        visited_ids.add(continued_from)

        previous = self.tree.find(f'.//*[@id="{continued_from}"]')
        if previous is None:
            # Dangling reference: nothing more to count
            break

        start += len(previous)
        current = previous

    return start

1905

1906

class MathdocPublication(MathdocPublicationData, JatsBase):
    """
    Publication metadata parsed from a Mathdoc publication XML tree.

    The tree given in the "tree" keyword argument is parsed in the constructor.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Parse the direct children of the publication node and set the instance variables.

        Unexpected children are recorded in self.warnings.
        """
        super().parse_tree(tree)

        for node in tree:
            tag = normalize(node.tag)

            if tag in ("publication-id", "collection-id"):
                # Only an untyped id, or a numdam/mathdoc id, becomes the pid
                node_type = node.get("publication-id-type")
                if node_type is None or node_type in ["numdam-id", "mathdoc-id"]:
                    self.pid = node.text
            elif tag == "title-group":
                self.parse_title_group(node)
            elif tag == "issn":
                node_type = node.get("pub-type")
                if node_type == "ppub":
                    self.issn = node.text
                    self.ids.append(("issn", node.text))
                elif node_type == "epub":
                    self.e_issn = node.text
                    self.ids.append(("e-issn", node.text))
            elif tag == "ext-link":
                data = self.get_data_from_ext_link(node)
                self.ext_links.append(data)
            elif tag == "custom-meta-group":
                self.parse_custom_meta_group(node)
            elif tag == "description":
                self.parse_description(node)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_custom_meta_group(self, node, **kwargs):
        """
        Read the <custom-meta> children: "serial-type" sets self.coltype, "wall" sets
        self.wall (converted to int) and "provider" sets self.provider.
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "custom-meta":
                name, value = self.get_data_from_custom_meta(child)

                if name == "serial-type":
                    self.coltype = value
                elif name == "wall":
                    self.wall = int(value)
                elif name == "provider":
                    self.provider = value
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_description(self, node, **kwargs):
        """
        Store a <description> node as an abstract with the tag "description".

        value_xml keeps the XML returned by get_xml_from_node; value_html and value_tex
        hold the same content without the enclosing <description> element.
        """
        tag = "description"
        lang = get_normalized_attrib(node, "lang") or self.lang
        value_xml = get_xml_from_node(node)

        # Strip the wrapper element. The opening tag was previously searched as
        # "<decription" (typo) and therefore never removed; it is now removed together
        # with its attributes.
        # NOTE(review): assumes get_xml_from_node returns the serialized node including
        # its own <description ...> tag - confirm.
        inner = re.sub(r"^\s*<description\b[^>]*>", "", value_xml, count=1)
        value_tex = value_html = inner.replace("</description>", "")

        self.abstracts.append(
            {
                "tag": tag,
                "lang": lang,
                "value_xml": value_xml,
                "value_html": value_html,
                "value_tex": value_tex,
            }
        )

1989

1990

class JatsPublisher(PublisherData):
    """
    Publisher data parsed from a <publisher> node.

    The node given in the "tree" keyword argument is parsed in the constructor.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The warnings list must exist before parse_tree appends to it.
        # It used to be reset to [] again right after parsing, which discarded every
        # warning collected for unexpected children; the warnings are now kept.
        self.warnings = []
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Read <publisher-name> into self.name and <publisher-loc> into self.loc.

        Any other child is recorded in self.warnings.
        """
        for node in tree:
            tag = normalize(node.tag)

            if tag == "publisher-name":
                self.name = node.text
            elif tag == "publisher-loc":
                self.loc = node.text
            else:
                # NOTE(review): self.pid is not set in this class; presumably it comes
                # from PublisherData - confirm.
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

2016

2017

class JatsJournal(JournalData, JatsBase):
    """
    Journal data parsed from a <journal-meta> node.

    The node given in the "tree" keyword argument is parsed in the constructor.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Parse the direct children of the node: journal id, titles, publisher and ISSNs.

        Unexpected children are recorded in self.warnings.
        """
        super().parse_tree(tree)

        # pub-type -> (instance attribute, kind stored in self.ids)
        issn_slots = {"ppub": ("issn", "issn"), "epub": ("e_issn", "e-issn")}

        for node in tree:
            tag = normalize(node.tag)

            if tag == "journal-id":
                # An untyped journal-id is considered a numdam-id
                id_type = node.get("journal-id-type") or "numdam-id"
                if id_type in ("numdam-id", "mathdoc-id"):
                    self.pid = node.text
            elif tag == "journal-title-group":
                self.parse_title_group(node)
            elif tag == "publisher":
                self.publisher = JatsPublisher(tree=node)
            elif tag == "issn":
                # An untyped issn is considered a print issn
                slot = issn_slots.get(node.get("pub-type") or "ppub")
                if slot is not None:
                    attribute, id_kind = slot
                    setattr(self, attribute, node.text)
                    self.ids.append((id_kind, node.text))
            else:
                method_name = inspect.currentframe().f_code.co_name
                self.warnings.append(
                    {self.pid: self.__class__.__name__ + "." + method_name + " " + tag}
                )

2055

2056

class JatsEdito(ArticleData, JatsBase):
    """
    Article-like object whose body is built from the <p> and <h1> children of a tree.

    The tree given in the "tree" keyword argument is parsed in the constructor.
    """

    def __init__(self, *args, **kwargs):  # , tree, pid=None):
        super().__init__(*args, **kwargs)

        # Optional keyword arguments and their defaults
        self.pid = kwargs.get("pid")
        self.issue = kwargs.get("issue")
        self.add_span_around_tex_formula = kwargs.get("add_span_around_tex_formula", False)
        self.for_tex_file = kwargs.get("for_tex_file", False)
        self.from_folder = kwargs.get("from_folder")
        self.no_bib = kwargs.get("no_bib", False)

        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Append the text of each non-empty <p> / <h1> child to self.body_html, wrapped in
        the matching HTML tag. Other children are ignored.

        :return: self.body_html
        """
        super().parse_tree(tree)

        for node in tree:
            tag = normalize(node.tag)
            if tag == "p" or tag == "h1":
                text_html = get_text_from_node(node)
                if text_html:
                    self.body_html += "<" + tag + ">" + text_html + "</" + tag + ">"

        return self.body_html

2090

2091

class JatsIssue(IssueData, JatsBase):
    """
    Issue data parsed from an issue XML tree (<journal-meta>, <issue-meta>, <body>).

    The tree given in the "tree" keyword argument is parsed in the constructor.
    Each <article> of the <body> becomes a JatsArticle stored in self.articles.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # from_folder is used to change the location of Elsevier graphics to a full path location
        self.from_folder = kwargs["from_folder"] if "from_folder" in kwargs else None
        # no_bib is forwarded to each JatsArticle created in parse_tree
        self.no_bib = kwargs.get("no_bib", False)

        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Parse the issue tree, then post-process the articles:
        - editors identical to the issue editors are removed from each article
        - for Elsevier issues (an article has a "pii" attribute), the icon locations,
          the article types and the article subjects are fixed
        """
        super().parse_tree(tree)

        for node in tree:
            tag = normalize(node.tag)
            if tag == "journal-meta":
                self.journal = JatsJournal(tree=node)
            elif tag == "issue-meta":
                ctype = get_normalized_attrib(node, "issue_type")
                if ctype == "issue_special":
                    self.ctype = "issue_special"
                self.parse_issue_meta(node)
            elif tag == "body":
                for child in node:
                    tag = normalize(child.tag)

                    if tag == "article":
                        article = JatsArticle(
                            tree=child,
                            issue=self,
                            from_folder=self.from_folder,
                            no_bib=self.no_bib,
                        )
                        # The warnings of the articles are reported at the issue level
                        self.warnings.extend(article.warnings)
                        self.articles.append(article)

                    else:
                        self.warnings.append(
                            {
                                self.pid: self.__class__.__name__
                                + "."
                                + inspect.currentframe().f_code.co_name
                                + " "
                                + tag
                            }
                        )
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

        if self.journal is not None:
            self.publisher = self.journal.publisher

        # Issue editors may be replicated in all the articles, remove them
        issue_editors = [contrib for contrib in self.contributors if contrib["role"] == "editor"]

        is_elsevier = False
        for xarticle in self.articles:
            # An article with a "pii" attribute marks the issue as an Elsevier issue
            if hasattr(xarticle, "pii"):
                is_elsevier = True

            # Compare the article editors with the issue editors, pairwise and in order,
            # on last_name and first_name only
            editors = [contrib for contrib in xarticle.contributors if contrib["role"] == "editor"]
            is_equal = len(editors) == len(issue_editors)
            i = 0
            while is_equal and i < len(editors):
                if (
                    editors[i]["last_name"] != issue_editors[i]["last_name"]
                    or editors[i]["first_name"] != issue_editors[i]["first_name"]
                ):
                    is_equal = False
                i += 1
            if is_equal:
                xarticle.contributors = [
                    contrib for contrib in xarticle.contributors if contrib["role"] != "editor"
                ]

        if is_elsevier:
            # Fix location of icons
            for link in self.ext_links:
                if link["rel"] in ["icon", "small_icon"]:
                    base_dir = self.journal.pid
                    location = link["location"]
                    if os.path.dirname(location) != base_dir:
                        location = os.path.join(base_dir, self.pid, location)
                    if self.from_folder:
                        location = os.path.join(self.from_folder, location)
                        location = "file:" + location
                    link["location"] = location

            # Fix article types and subjects
            for xarticle in self.articles:
                article_type = "research-article"
                old_type = ""
                new_subjs = []

                if xarticle.fpage != "":
                    try:
                        # Only the conversion matters here; the converted value is not used
                        value = int(xarticle.fpage)
                    except ValueError:
                        # fpage is not a number: the article is an editorial
                        article_type = "editorial"

                if article_type == "research-article":
                    for subj in xarticle.subjs:
                        if subj["type"] == "type":
                            # Fix article types
                            # (subjects of type "type" are never copied to new_subjs)
                            value = subj["value"].lower()
                            old_type = value
                            if value == "discussion":
                                article_type = "letter"
                            elif value == "editorial":
                                if xarticle.title_tex.lower().find("foreword") == 0:
                                    article_type = "foreword"
                                else:
                                    article_type = "editorial"
                            elif value in ["mini review", "review article", "book review"]:
                                article_type = "review"
                            elif value == "research article":
                                article_type = "research-article"
                            elif value == "short communication":
                                article_type = "foreword"
                            elif value == "correspondence":
                                article_type = "letter"
                            elif value.find("conference") == 0:
                                article_type = "congress"
                        elif subj["type"] == "heading" and not xarticle.title_tex:
                            # The title may be stored in the heading: fix it
                            xarticle.title_tex = xarticle.title_html = subj["value"]
                            xarticle.title_xml = get_title_xml(subj["value"])
                        elif subj["type"] == "heading":
                            value = subj["value"].lower().strip()
                            issue_title = self.title_tex.lower()
                            # Remove the "dossier: " prefix (9 characters) from the issue title
                            if issue_title.find("dossier: ") == 0:
                                issue_title = issue_title[9:]
                                self.title_tex = self.title_html = self.title_tex[9:]
                                # NOTE(review): title_xml is rebuilt from the lowercased
                                # issue_title while title_tex/title_html keep their case -
                                # confirm this is intended.
                                self.title_xml = (
                                    "<issue-title>"
                                    + get_single_title_xml(issue_title)
                                    + "</issue-title>"
                                )

                            # Some heading values are in fact article type
                            if value.find("erratum") == 0:
                                article_type = "erratum"
                            elif value.find("corrigendum") == 0:
                                article_type = "corrigendum"
                            elif value.find("foreword") == 0:
                                article_type = "foreword"
                            elif value.find("nécrologie") == 0 or value.find("obituary") == 0:
                                article_type = "history-of-sciences"
                            elif (
                                value.find("block calendar/éphéméride") == 0
                                or value.find("chronique") == 0
                            ):
                                article_type = "history-of-sciences"
                            elif value.find("histoire") == 0 or value.find("historic") == 0:
                                article_type = "history-of-sciences"
                            elif value.find("tribute/hommage") == 0:
                                article_type = "history-of-sciences"
                            elif value.find("note historique") == 0:
                                article_type = "historical-commentary"
                            elif (
                                value.find("le point sur") == 0 or value.find("le point-sur") == 0
                            ):
                                article_type = "review"
                            elif (
                                value.find("review") == 0
                                or value.find("revue") == 0
                                or value.find("concise review") == 0
                            ):
                                article_type = "review"
                            elif value.find("conférence") == 0:
                                article_type = "congress"
                            elif (
                                value.find("communication") == 0 or value.find("preliminary") == 0
                            ):
                                article_type = "preliminary-communication"
                            elif value.find("perspective") == 0 and old_type in [
                                "correspondence",
                                "short communication",
                            ]:
                                article_type = "opinion"
                            elif value.find("debate") == 0:
                                article_type = "opinion"
                            elif (
                                value.find("index") == 0
                                or value.find("keyword") == 0
                                or value.find("sommaire") == 0
                            ):
                                article_type = "editorial"
                            elif (
                                value.find("table auteurs") == 0
                                or value.find("table sommaire") == 0
                            ):
                                article_type = "editorial"
                            elif value.find("page présentation des index") == 0:
                                article_type = "editorial"
                            elif value.find("fac-similé") == 0:
                                # crbiol article; Pubmed classifies these as "Classical Article"
                                article_type = "historical-commentary"
                                # The subject is kept in this case to preserve the
                                # "fac-similé" (== facsimile copy) mention
                                new_subjs.append(subj)
                            # Ignore the issue titles
                            elif (
                                not self.title_tex
                                or value.find(self.title_tex.lower().strip()) != 0
                            ):
                                # Exclude headings that are redundant with article types
                                exclude_list = [
                                    "editorial",
                                    "éditorial",
                                    "avant-propos",
                                    "book review",
                                    "comment",
                                    "concise review paper",
                                    "answer",
                                    "commentaire",
                                    "commentary",
                                    "reply",
                                    "foreword",
                                    "full paper",
                                    "mémoire",
                                ]
                                # Keep the heading only if it starts with none of the excluded values
                                if len([x for x in exclude_list if value.find(x) == 0]) == 0:
                                    new_subjs.append(subj)
                        else:
                            # Subjects that are neither a type nor a heading are kept as is
                            new_subjs.append(subj)

                # print(old_type, '-', old_heading, '-', article_type, '-', xarticle.pid, '-', xarticle.fpage)
                # Both values are always overwritten, even when the subjects were not inspected
                xarticle.atype = article_type
                xarticle.subjs = new_subjs

    def parse_custom_meta_group(self, node, **kwargs):
        """
        Read the <custom-meta> children: "provider" sets self.provider and "efirst" sets
        self.with_online_first (True when the value is "yes").
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "custom-meta":
                name, value = self.get_data_from_custom_meta(child)

                if name == "provider":
                    self.provider = value
                elif name == "efirst":
                    self.with_online_first = value == "yes"
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_issue_meta(self, node, **kwargs):
        """
        Parse the children of <issue-meta> and set the instance variables.

        Tags that are not handled explicitly are dispatched to the parse_<tag> function
        of the instance when it exists (called with add_ext_link=True); otherwise they
        are recorded in self.warnings.
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "issue-id":
                self.parse_id(child)
            elif tag == "volume-series":
                self.vseries = child.text
            elif tag == "volume":
                self.volume = child.text
            elif tag == "issue":
                self.number = child.text
            elif tag == "pub-date":
                # The month is ignored when computing the year of the issue
                self.year = self.get_data_from_date(child, ignore_month=True)
            elif tag == "history":
                history_dates = self.get_data_from_history(child)
                for date in history_dates:
                    if date["type"] == "last-modified":
                        self.last_modified_iso_8601_date_str = date["date"]
                    elif date["type"] == "prod-deployed-date":
                        self.prod_deployed_date_iso_8601_date_str = date["date"]
            elif tag == "issue-title":
                content_type = child.get("content-type") or ""
                if content_type != "subtitle" and content_type != "cover-date":
                    # Elsevier stores contributors in subtitles. Ignore.
                    lang = get_normalized_attrib(child, "lang") or "und"
                    # The first title with a compatible language is the main title;
                    # any other title is stored as the translated title
                    if not self.title_tex and (
                        self.lang == "und" or lang == "und" or lang == self.lang
                    ):
                        self.parse_title(child)
                        # In xmldata, title_xml had the <title_group> tag:
                        # self.title_xml can't be set in parse_title
                        self.title_xml += get_xml_from_node(child)
                    else:
                        self.trans_lang = lang
                        (
                            self.trans_title_tex,
                            self.trans_title_html,
                        ) = self.parse_node_with_mixed_content(child)
                        self.title_xml += get_xml_from_node(child)
            elif tag == "issue-title-group":
                self.parse_title_group(child)
            else:
                fct_name = "parse_" + tag.replace("-", "_")
                ftor = getattr(self, fct_name, None)
                if callable(ftor):
                    ftor(child, add_ext_link=True)
                else:
                    self.warnings.append(
                        {
                            self.pid: self.__class__.__name__
                            + "."
                            + inspect.currentframe().f_code.co_name
                            + " "
                            + tag
                        }
                    )

        # Without a "last-modified" date in the XML, the current time is used
        if self.last_modified_iso_8601_date_str is None:
            self.last_modified_iso_8601_date_str = timezone.now().isoformat()

2412

2413

class JatsArticleBase(JatsBase):
    """Base class providing the parse_<tag> functions shared by the article classes."""

    def parse_custom_meta_group(self, node, **kwargs):
        """
        Read the <custom-meta> children of a <custom-meta-group>.

        "article-number", "talk-number" and "provider" set the instance variable of the
        same name (with "_" instead of "-"). "presented" adds a contributor with the
        role "presenter". Children that are not <custom-meta> are recorded in
        self.warnings.
        """
        for entry in node:
            entry_tag = normalize(entry.tag)

            if entry_tag != "custom-meta":
                method_name = inspect.currentframe().f_code.co_name
                self.warnings.append(
                    {self.pid: self.__class__.__name__ + "." + method_name + " " + entry_tag}
                )
                continue

            name, value = self.get_data_from_custom_meta(entry)

            if name == "article-number":
                self.article_number = value
            elif name == "talk-number":
                self.talk_number = value
            elif name == "presented":
                # The value is a sentence: keep only the name of the presenter
                presenter_name = value.replace("Presented by ", "")
                presenter_name = presenter_name.replace("Présenté par ", "")

                presenter = create_contributor()
                presenter["role"] = "presenter"
                presenter["string_name"] = presenter_name
                presenter["contrib_xml"] = get_contrib_xml(presenter)
                self.contributors.append(presenter)
            elif name == "provider":
                self.provider = value

2447

2448

2449class JatsArticle(ArticleData, JatsArticleBase):

def __init__(self, *args, **kwargs):  # , tree, pid=None):
    """
    Create the article and parse the tree given in the "tree" keyword argument.

    Optional keyword arguments: pid and issue (default None), add_span_around_tex_formula
    and for_tex_file (default False), from_folder (default None), no_bib (default False).
    """
    super().__init__(*args, **kwargs)

    self.pid = kwargs.get("pid")
    self.issue = kwargs.get("issue")
    self.add_span_around_tex_formula = kwargs.get("add_span_around_tex_formula", False)
    self.for_tex_file = kwargs.get("for_tex_file", False)
    self.from_folder = kwargs.get("from_folder")
    self.no_bib = kwargs.get("no_bib", False)

    self.parse_tree(kwargs["tree"])

2465

def parse_tree(self, tree):
    """
    Parse an <article> tree from top to bottom and set the instance variables.

    The tree is walked twice: <front>, <front-stub> and <floats-group> are handled
    first, then <back>, <body> and <sub-article>. The footnotes, the funding statement
    and the floats group XML are finalized afterwards, before calling post_parse_tree.

    Unexpected nodes are recorded in self.warnings. Note that a <ref-list> met while
    self.no_bib is set is not parsed and ends up in the warnings as well.

    :param tree: the <article> element
    """
    super().parse_tree(tree)

    self.atype = get_normalized_attrib(tree, "article-type") or ""

    # First loop to catch float-groups that are inserted inside the body
    for node in tree:
        tag = normalize(node.tag)

        if tag == "front":
            for child in node:
                child_tag = normalize(child.tag)

                if child_tag == "article-meta":
                    self.parse_article_meta(child)
                else:
                    self.warnings.append(
                        {
                            self.pid: self.__class__.__name__
                            + "."
                            + inspect.currentframe().f_code.co_name
                            + " "
                            + child_tag
                        }
                    )
        elif tag == "front-stub":
            self.parse_article_meta(node)
        elif tag == "floats-group":
            self.parse_floats_group(node)

    for node in tree:
        tag = normalize(node.tag)

        if tag == "back":
            for child in node:
                child_tag = normalize(child.tag)

                if child_tag == "ref-list" and not self.no_bib:
                    # A debug print("Parse bib") used to be emitted here for every
                    # bibliography; it has been removed.
                    self.parse_ref_list(child)
                elif child_tag == "ack":
                    self.parse_ack(child)
                elif child_tag == "sec":
                    self.parse_sec(child)
                elif child_tag == "app-group":
                    self.parse_app_group(child)
                elif child_tag == "fn-group":
                    self.parse_fn_group(child)
                else:
                    self.warnings.append(
                        {
                            self.pid: self.__class__.__name__
                            + "."
                            + inspect.currentframe().f_code.co_name
                            + " "
                            + child_tag
                        }
                    )
        elif tag == "body":
            self.parse_body(node)
        elif tag == "sub-article":
            self.parse_sub_article(node)
        elif tag == "floats-group" or tag == "front":
            # Handled above
            pass
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    # Add the footnotes at the end
    if len(self.fns) > 0:
        fn_text = '<div class="footnotes">' + "".join(self.fns) + "</div>"
        self.body_html = fn_text if not self.body_html else self.body_html + fn_text

    # Wrap the funding statement in a <name-content> unless it already has one
    if (
        len(self.funding_statement_xml) > 0
        and self.funding_statement_xml.find('<name-content content-type="fn"') == -1
    ):
        self.funding_statement_xml = (
            f'<name-content content-type="fn">{self.funding_statement_xml}</name-content>'
        )

    # Case for XML with <body>, then <back> and <floats_group>
    # The figures/tables of the floats_group are added inside the body_html
    # (close to their first <xref>)
    # It's too complicated to do the same for the body_xml as we use the get_xml_from_node function.
    # Instead, we append the floats_group_xml to the body_xml
    if hasattr(self, "floats_group_xml"):
        self.body_xml += self.floats_group_xml

    # The special treatment for Elsevier articles (web scraping to find the
    # date_published) was moved to the import management commands since Elsevier blocks
    # IP after 1000+ requests.

    self.post_parse_tree()

2574

def update_body_content(self, node, **kwargs):
    """
    Parse a body-like node and merge the result into body_tex, body_html and body_xml.

    Called by parse_body for <body>, by parse_ack for <ack> (with use_sec=True)
    and by parse_app for the <sec> children of an <app>.

    :param node: the XML node to parse
    :param kwargs: forwarded to the node parser; "use_sec" selects parse_node_with_sec,
                   "base_url" is always overwritten here
    :return: None
    """
    if len(node) == 0:
        # Most journals do not display the Full text: the <body> then only stores the
        # text for the search engine and has no children. body_html is not computed in
        # this case. Same behavior for journals that display the Full text, but with
        # old articles without Full text.
        return

    # <front> has to be put before <body> so self.pid is defined here
    if hasattr(settings, "SITE_URL_PREFIX"):
        base_url = "/" + settings.SITE_URL_PREFIX + settings.ARTICLE_BASE_URL + self.pid
    else:
        base_url = os.path.join(settings.ARTICLE_BASE_URL, self.pid)
    kwargs["base_url"] = base_url

    use_sec = bool(kwargs.get("use_sec"))
    nb_supplements_before = len(self.supplementary_materials)

    # Hack for Elsevier: with use_sec, the <ack> is converted into a <sec> of the <body>
    parse = self.parse_node_with_sec if use_sec else self.parse_node_with_mixed_content
    body_tex, body_html = parse(node, **kwargs)

    append_to_body = True
    if len(self.supplementary_materials) != nb_supplements_before:
        # Elsevier stores supplementary-material in app-group.
        # They are extracted, but ignored in the body_html if the appendix has only
        # supplements: keep the node only if a <p> holds something else.
        append_to_body = any(
            gchild.tag != "supplementary-material"
            for child in node
            if child.tag == "p"
            for gchild in child
        )

    if not append_to_body:
        return

    self.body_tex = self.body_tex + body_tex if self.body_tex else body_tex
    self.body_html = self.body_html + body_html if self.body_html else body_html

    node_xml = get_xml_from_node(node)
    if not self.body_xml:
        self.body_xml = node_xml
    elif use_sec:
        # [0:-7] drops the last 7 characters of the current XML (the length of "</body>",
        # which is appended again); [5:-6] presumably strips "<ack>" and "</ack>"
        # from the node XML before wrapping it in a <sec>
        self.body_xml = f"{self.body_xml[0:-7]}<sec>{node_xml[5:-6]}</sec></body>"
    else:
        self.body_xml = f"{self.body_xml[0:-7]}{node_xml}</body>"

2625

def parse_ack(self, node, **kwargs):
    """
    Parse an <ack> node.

    An <ack content-type="COI-statement"> sets self.coi_statement with the text of the
    node. Any other <ack> is merged into the body content.
    """
    if (node.get("content-type") or "") == "COI-statement":
        self.coi_statement = get_text_from_node(node)
        return

    # Hack for Elsevier: convert <ack> into <sec> of the <body>
    self.update_body_content(node, use_sec=True)

2633

def parse_app(self, node, **kwargs):
    """
    Parse an <app> node: each <sec> child is merged into the body content.

    Any other child tag is reported in self.warnings.
    """
    for child in node:
        tag = normalize(child.tag)

        if tag != "sec":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        # Elsevier can store all appendixes inside one <app> ?!?
        # One of them can store the supplements and has to be ignored in the body_html
        self.update_body_content(child)

2652

def parse_app_group(self, node, **kwargs):
    """
    Parse an <app-group> node: each <app> child is handed to parse_app.

    Any other child tag is reported in self.warnings.
    """
    for child in node:
        tag = normalize(child.tag)

        if tag != "app":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        self.parse_app(child)

2669

def parse_article_categories(self, node, **kwargs):
    """
    Parse an <article-categories> node: each <subj-group> child is handed to
    parse_subj_group.

    Any other child tag is reported in self.warnings.
    """
    for child in node:
        tag = normalize(child.tag)

        if tag != "subj-group":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        self.parse_subj_group(child)

2686

def parse_article_meta(self, node, **kwargs):
    """
    Parse an <article-meta> node and set the corresponding instance variables.

    Tags without a dedicated branch are dispatched to the parse_<tag> method of the
    instance, if any (called with add_ext_link=True). A tag without any parser is
    reported in self.warnings.
    """
    # TODO: store permissions in XML
    # 2022/11/15 Mersenne meeting. ignore author-notes (parse_author_notes is not called)
    ignored_tags = (
        "volume",
        "issue-id",
        "permissions",
        "pub-date-not-available",
        "author-notes",
    )

    for child in node:
        tag = normalize(child.tag)

        if tag == "article-id":
            self.parse_id(child)
        elif tag == "fpage":
            self.fpage = child.text
            self.page_type = child.get("content-type") or ""
        elif tag == "lpage":
            self.lpage = child.text or ""
        elif tag == "page-range":
            self.page_range = child.text
        elif tag in ("page-count", "size"):
            self.size = child.text
        elif tag == "elocation-id":
            self.elocation = child.text
        elif tag == "pub-date":
            # A <pub-date> without date-type is considered as the publication date
            is_publication_date = (child.get("date-type") or "pub") == "pub"
            date_str = self.get_data_from_date(child)
            if is_publication_date:
                self.date_published_iso_8601_date_str = date_str
            else:
                self.history_dates.append({"type": "online", "date": date_str})
        elif tag == "history":
            self.history_dates.extend(self.get_data_from_history(child))
            for history_date in self.history_dates:
                if history_date["type"] == "prod-deployed-date":
                    self.prod_deployed_date_iso_8601_date_str = history_date["date"]
        elif tag in ignored_tags:
            continue
        else:
            # Automatic call of the parse_<tag> function
            parser = getattr(self, "parse_" + tag.replace("-", "_"), None)
            if callable(parser):
                parser(child, add_ext_link=True)
            else:
                origin = (
                    self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                )
                self.warnings.append({self.pid: origin + " " + tag})

2738

def parse_author_notes(self, node, **kwargs):
    """
    Parse an <author-notes> node: the XML and the HTML of each <fn> child are appended
    to self.footnotes_xml and self.footnotes_html. Other children are skipped.
    """
    for child in node:
        if normalize(child.tag) != "fn":
            continue

        _, fn_html = self.parse_node_with_fn(child, keep_fn=True, keep_fn_label=False)
        fn_xml = get_xml_from_node(child)
        self.footnotes_xml += fn_xml
        self.footnotes_html += fn_html

2747

def parse_body(self, node, **kwargs):
    """
    Parse a <body> node.

    self.body gets the text of the node. The tex/html/xml versions are computed by
    update_body_content; if body_xml is still empty afterwards, it is set to the XML of
    the node.
    """
    self.body = get_text_from_node(node)

    # self.floats only exists once a <floats-group> has been parsed
    has_floats = hasattr(self, "floats")
    if has_floats:
        self.floats_to_insert = []

    self.update_body_content(node, **kwargs)

    if self.body_xml:
        return
    self.body_xml = get_xml_from_node(node)

2758

def parse_boxed_text(self, node, **kwargs):
    """
    Parse a <boxed-text> of a <floats-group> and store its HTML in the self.floats
    dictionary, under the id of the node.

    A <boxed-text> without id is parsed, but its HTML is not stored.
    """
    box_id = node.attrib.get("id")

    _, box_html = self.parse_node_with_boxed_text(node, **kwargs)

    if box_id is None:
        return
    self.floats[box_id] = box_html

2770

def parse_floats_group(self, node, **kwargs):
    """
    Parse a <floats-group> node.

    self.floats is reset to an empty dictionary, then the <fig>, <table-wrap> and
    <boxed-text> children are parsed. Any other child tag is reported in self.warnings.
    The XML of the whole node is kept in self.floats_group_xml.
    """
    if hasattr(settings, "SITE_URL_PREFIX"):
        base_url = "/" + settings.SITE_URL_PREFIX + settings.ARTICLE_BASE_URL + self.pid
    else:
        base_url = os.path.join(settings.ARTICLE_BASE_URL, self.pid)

    self.floats = {}
    for child in node:
        tag = normalize(child.tag)

        if tag in ("fig", "table-wrap"):
            parse_float = (
                self.parse_node_with_fig if tag == "fig" else self.parse_node_with_table_wrap
            )
            parse_float(child, append_floats=True, base_url=base_url)
        elif tag == "boxed-text":
            self.parse_boxed_text(child, base_url=base_url)
        else:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})

    self.floats_group_xml = get_xml_from_node(node)

2801

def parse_fn_group(self, node, **kwargs):
    """
    Parse a <fn-group> node: the HTML and the XML of each <fn> child are appended to
    self.footnotes_html and self.footnotes_xml.

    Any other child tag is reported in self.warnings.
    """
    for child in node:
        tag = normalize(child.tag)

        if tag != "fn":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        _, fn_html = self.parse_node_with_fn(child, keep_fn=True)
        fn_xml = get_xml_from_node(child)

        self.footnotes_html += fn_html
        self.footnotes_xml += fn_xml

2822

def parse_funding_group(self, node, **kwargs):
    """
    Parse a <funding-group> node.

    - <award-group> children are handed to parse_award_group
    - <funding-statement> children: only their <name-content> children are handled.
      The HTML of each <fn> found inside is appended to self.funding_statement_html
      and the XML of the <name-content> is stored in self.funding_statement_xml
    - any other child tag is reported in self.warnings
    """
    for child in node:
        tag = normalize(child.tag)

        if tag == "award-group":
            self.parse_award_group(child)
        elif tag == "funding-statement":
            for funding_node in child:
                # Note: funding_node.tag is compared as is (normalize is not applied)
                if funding_node.tag == "name-content":
                    for funding_child in funding_node:
                        if funding_child.tag == "fn":
                            _, html = self.parse_node_with_fn(funding_child, keep_fn=True)
                            self.funding_statement_html += html
                    # NOTE(review): the nesting level of this assignment was reconstructed
                    # from a listing without indentation (it may belong to the "fn" test
                    # above) - confirm against the repository.
                    # The XML is replaced, not appended: the last <name-content> wins.
                    self.funding_statement_xml = get_xml_from_node(funding_node)

            # TODO: handle funding-statement with simple texts
        else:
            # Unexpected tag: record "<ClassName>.<function name> <tag>" under the pid
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

2849

def parse_issue(self, node, **kwargs):
    """
    Set self.seq from the seq attribute of the <issue> node ("0" if missing or empty).

    The attribute is ignored for articles with a pii: seq is then always "0".
    """
    if hasattr(self, "pii"):
        # Elsevier articles (they have a pii) store unusable values in the seq attribute
        self.seq = "0"
    else:
        self.seq = node.get("seq") or "0"

2853

2854

class JatsRef(RefBase, JatsBase):
    """
    Bibliographic reference parsed from a JATS <ref> node.

    The tree is parsed in the constructor. Parsing sets, among others, the label, the
    citation_xml / citation_html / citation_tex strings, the contributors and the
    bibliographic fields (source_tex, series, comment, annotation...).
    """

    def __init__(self, *args, tree, lang="und", **kwargs):
        """
        :param tree: the node of the reference to parse (keyword only)
        :param lang: forwarded to the base classes ("und" by default)
        """
        super().__init__(*args, lang=lang, **kwargs)
        self.parse_tree(tree)

    def parse_tree(self, tree):
        """
        Parse the children of the reference node.

        - <label>: stored in self.label, surrounded with [] if it does not start with "["
        - <mixed-citation> or <note>: fields are extracted with parse_citation_node;
          citation_tex/citation_html come from the mixed content, prefixed with the label
        - <element-citation>: fields are extracted with parse_citation_node;
          citation_tex/citation_html are both computed by get_citation_html
        - any other tag is reported in self.warnings

        The XML of every child (whatever its tag) is appended to self.citation_xml.
        """
        super().parse_tree(tree)

        self.user_id = get_normalized_attrib(tree, "id") or ""

        for node in tree:
            tag = normalize(node.tag)

            if tag == "label":
                self.label = node.text or ""

                if self.label and self.label[0] != "[":
                    self.label = "[" + self.label + "]"

            elif tag == "mixed-citation" or tag == "note":
                self.parse_citation_node(node)

                self.citation_tex, self.citation_html = self.parse_node_with_mixed_content(
                    node,
                    is_citation=True,
                    is_mixed_citation=True,
                    add_ext_link=True,
                    ref_type="misc",
                )

                if self.label:
                    self.citation_html = self.label + " " + self.citation_html
                    self.citation_tex = self.label + " " + self.citation_tex

            elif tag == "element-citation":
                self.parse_citation_node(node)

                self.citation_tex = self.citation_html = get_citation_html(self)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

            # With xmldata, citation_xml does not have '<ref>', but only the text of the children
            self.citation_xml += get_xml_from_node(node)

    def get_data_from_name_in_ref(self, node, role):
        """
        Create the contributor dict of a name found in a reference.

        :param node: a <name>, <string-name>, <name-alternatives>, <collab> or <etal> node
        :param role: the role of the contributor (ex: "author")
        :return: the dict created by create_contributor, updated with the data of the node
        """
        params = create_contributor()
        params["role"] = role

        if node.tag == "name":
            self.update_data_from_name(node, params)
        elif node.tag == "string-name":
            self.update_data_from_name(node, params)
            # A <string-name> without structured first/last name: keep its raw text
            if params["first_name"] == "" and params["last_name"] == "":
                params["string_name"] = node.text or ""
        elif node.tag == "name-alternatives":
            params["mid"] = self.get_data_from_name_alternatives(node)
        elif node.tag == "collab":
            params["string_name"] = node.text or ""

        use_initials = getattr(settings, "REF_JEP_STYLE", False)
        helper_update_name_params(params, use_initials)
        params["contrib_xml"] = "<etal/>" if node.tag == "etal" else get_xml_from_node(node)

        return params

    def parse_node_with_chapter_title(self, node, **kwargs):
        """
        Return the (tex, html) of a <chapter-title>.

        In a mixed citation (is_mixed_citation=True in kwargs), the html is post-processed
        by add_span_class_to_html_from_chapter_title.
        """
        tex, html = self.parse_inner_node(node, **kwargs)

        if kwargs.get("is_mixed_citation", False):
            html = add_span_class_to_html_from_chapter_title(html, **kwargs)

        return tex, html

    def parse_node_with_source(self, node, **kwargs):
        """
        Return the (tex, html) of a <source>.

        In a mixed citation (is_mixed_citation=True in kwargs), the html is post-processed
        by add_span_class_to_html_from_source.
        """
        tex, html = self.parse_inner_node(node, **kwargs)

        if kwargs.get("is_mixed_citation", False):
            html = add_span_class_to_html_from_source(html, **kwargs)

        return tex, html

    def parse_citation_node(self, node, **kwargs):
        """
        Extract the bibliographic fields of a <mixed-citation>, <element-citation> or
        <note> node and set the corresponding instance variables.

        Once a <comment> has been met, a field that is already set is not replaced:
        the new value is appended to self.comment instead.
        Tags without a dedicated branch set the instance variable named after the tag
        ("-" replaced by "_") with the text of the node, unless it already has a value.
        """
        self.type = get_normalized_attrib(node, "publication-type") or "misc"

        # Elsevier can store data about a translation after comments (<source>...)
        # Append these tags in the comment
        has_comment = False

        for child in node:
            tag = normalize(child.tag)

            if tag in ("page-count", "size"):
                if not self.size:
                    self.size = child.text
            elif tag == "comment":
                has_comment = True
                # comments may have ext-links or uri. HTML <a> links will be added
                _, comment = self.parse_node_with_mixed_content(
                    child, is_citation=True, is_comment=True, add_HTML_link=True
                )
                if self.comment:
                    self.comment += " "
                self.comment += comment
            elif tag == "source":
                # TODO: migration to store source_tex and source_html
                _, source_tex = self.parse_node_with_mixed_content(child, is_citation=True)

                if self.type in ["book", "inproceedings"] and len(self.source_tex) > 0:
                    # Multiple source for a book, store the extra source in series
                    if self.series and has_comment:
                        self.comment += " " + source_tex
                    else:
                        if self.series:
                            self.series += ", "
                        self.series += get_text_from_node(child)
                else:
                    if self.source_tex and has_comment:
                        self.comment += " " + source_tex
                    else:
                        self.source_tex = source_tex
            elif tag == "series":
                series = get_text_from_node(child)
                if self.series and has_comment:
                    self.comment += ", " + series
                else:
                    if self.series:
                        self.series += ", "
                    self.series += series
            elif tag == "annotation":
                if not self.annotation:
                    self.annotation = get_text_from_node(child)
            elif tag == "article-title":
                # TODO: migration to store article_title_tex and article_title_html
                _, article_title_tex = self.parse_node_with_mixed_content(child, is_citation=True)

                if self.type == "book":
                    # Elsevier uses article-title for books !?!
                    if len(self.source_tex) == 0:
                        if has_comment:
                            self.comment += " " + article_title_tex
                        else:
                            self.source_tex = article_title_tex
                    else:
                        if self.series and has_comment:
                            self.comment += ", " + article_title_tex
                        else:
                            self.series += get_text_from_node(child)
                elif self.type == "inproceedings":
                    if self.chapter_title_tex and has_comment:
                        self.comment += " " + article_title_tex
                    else:
                        self.chapter_title_tex = article_title_tex
                else:
                    if self.article_title_tex and has_comment:
                        self.comment += " " + article_title_tex
                    else:
                        self.article_title_tex = article_title_tex
            elif tag == "chapter-title":
                # TODO: migration to store chapter_title_tex and chapter_title_html
                _, chapter_title_tex = self.parse_node_with_mixed_content(child, is_citation=True)
                if self.chapter_title_tex and has_comment:
                    self.comment += " " + chapter_title_tex
                else:
                    self.chapter_title_tex = chapter_title_tex
            elif tag == "conf-name":
                _, conf_tex = self.parse_node_with_mixed_content(child, is_citation=True)
                if self.source_tex and has_comment:
                    self.comment += ", " + conf_tex
                else:
                    self.source_tex = conf_tex
            elif tag in ("name", "string-name", "name-alternatives", "etal", "collab"):
                params = self.get_data_from_name_in_ref(child, "author")
                self.contributors.append(params)
            elif tag == "person-group":
                self.parse_person_group(child)
            elif tag == "ext-link":
                self.parse_ext_link(child, add_ext_link=True)
            elif tag == "pub-id":
                self.parse_pub_id(child)
            elif tag == "date":
                self.year = get_text_from_node(child)
            elif tag == "date-in-citation":
                date_ = child.get("iso-8601-date") or ""
                if date_:
                    if self.comment:
                        self.comment += ", "
                    self.comment += "Accessed " + date_
            elif tag in ("isbn", "issn"):
                # An empty <isbn/> or <issn/> has no text (None): it is skipped instead of
                # raising a TypeError on the string concatenation
                if child.text is not None:
                    if self.annotation:
                        self.annotation += ", "
                    # "ISBN: " or "ISSN: "
                    self.annotation += tag.upper() + ": " + child.text
            elif child.text is not None:
                variable_name = tag.replace("-", "_")
                if has_comment and hasattr(self, variable_name) and getattr(self, variable_name):
                    # The field is already set and a comment was met: append to the comment
                    if tag == "fpage":
                        self.comment += ", pp. "
                    elif tag == "lpage":
                        self.comment += "-"
                    else:
                        self.comment += ", "
                    self.comment += child.text
                elif not hasattr(self, variable_name) or not getattr(self, variable_name):
                    setattr(self, variable_name, child.text)

    def parse_person_group(self, node, **kwargs):
        """
        Parse a <person-group>: each name found is appended to self.contributors.

        The role is the person-group-type attribute without its final "s", if any
        (ex: "authors" becomes "author"). Unexpected tags are reported in self.warnings.
        """
        role = node.get("person-group-type") or ""
        if role and role[-1] == "s":
            role = role[:-1]

        for child in node:
            tag = normalize(child.tag)

            if tag in ("name", "string-name", "name-alternatives", "etal", "collab"):
                contrib = self.get_data_from_name_in_ref(child, role)
                self.contributors.append(contrib)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_pub_id(self, node, **kwargs):
        """
        Parse a <pub-id>: its pub-id-type attribute and its text are passed, as the
        "rel" and "metadata" of an ExtLinkDict, to add_extids_from_node_with_link.
        """
        node_type = node.get("pub-id-type") or ""

        data: ExtLinkDict = {
            "rel": node_type,
            "mimetype": "",
            "location": "",
            "base": "",
            "metadata": node.text,
        }

        self.add_extids_from_node_with_link(data)

    def split_label(self):
        """
        Used when sorting non-digit bibitems

        Set self.label_prefix and self.label_suffix with the parts of the label found
        before and after its first run of digits. The label is lower-cased and, if it
        has more than one character, its first and last characters are removed
        (parse_tree surrounds the labels with []).
        """
        label = self.label.lower()
        if len(label) > 1:
            label = label[1:-1]

        # maxsplit=1: a label with several runs of digits ("ab12c3") is split around the
        # first run only. An unlimited split returns more than 2 parts for such labels.
        parts = re.split(r"\d+", label, maxsplit=1)
        if len(parts) == 2:
            self.label_prefix, self.label_suffix = parts
        else:
            # Special case where label is similar as "Sma" instead of "Sma15"
            self.label_prefix, self.label_suffix = label, ""

3122

3123

class BitsCollection(CollectionData, JatsBase):
    """
    Collection parsed from a BITS <collection-meta> or <in-collection> node.

    The tree is given with the "tree" keyword argument and parsed in the constructor.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Parse the collection tree, then compute self.seq from the <collection-meta>.

        A warning is recorded if no <collection-meta> was found.
        self.collection is always set (a Foo object that carries the pid).
        """
        super().parse_tree(tree)

        if tree is not None:
            tag = normalize(tree.tag)
            collection_meta_node = None

            if tag == "collection-meta":
                self.parse_collection_meta(tree)
                collection_meta_node = tree
            elif tag == "in-collection":
                child_parsers = {
                    "volume": self.parse_volume,
                    "volume-series": self.parse_volume_series,
                    "volume-title": self.parse_volume_title,
                }
                for node in tree:
                    # "tag" is reused on purpose: the warning recorded below when there
                    # is no <collection-meta> reports the last tag met
                    tag = normalize(node.tag)

                    if tag == "collection-meta":
                        self.parse_collection_meta(node)
                        collection_meta_node = node
                    elif tag in child_parsers:
                        child_parsers[tag](node)
                    else:
                        origin = (
                            self.__class__.__name__
                            + "."
                            + inspect.currentframe().f_code.co_name
                        )
                        self.warnings.append({self.pid: origin + " " + tag})

            if collection_meta_node is None:
                origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                self.warnings.append({self.pid: origin + " " + tag})
            else:
                self.set_seq(collection_meta_node)

        self.collection = Foo()
        self.collection.pid = self.pid

    def parse_collection_meta(self, node, **kwargs):
        """
        Parse a <collection-meta> node: collection type, pid, title, ISSNs, ext-links
        and volume data. Unexpected tags are reported in self.warnings.
        """
        # pub-type of an <issn> -> (instance variable to set, type stored in self.ids)
        issn_kinds = {"ppub": ("issn", "issn"), "epub": ("e_issn", "e-issn")}

        self.coltype = node.get("collection-type")

        for child in node:
            tag = normalize(child.tag)

            if tag == "collection-id":
                self.pid = child.text
            elif tag == "title-group":
                self.parse_title_group(child)
            elif tag == "issn":
                # An <issn> with another pub-type (or without pub-type) is ignored
                issn_kind = issn_kinds.get(child.get("pub-type"))
                if issn_kind is not None:
                    attribute_name, id_type = issn_kind
                    setattr(self, attribute_name, child.text)
                    self.ids.append((id_type, child.text))
            elif tag == "ext-link":
                self.ext_links.append(self.get_data_from_ext_link(child))
            elif tag == "volume-in-collection":
                self.parse_volume_in_collection(child)
            else:
                origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                self.warnings.append({self.pid: origin + " " + tag})

    def parse_volume(self, node, **kwargs):
        """Store the text of the node in self.volume."""
        self.volume = node.text

    def parse_volume_in_collection(self, node, **kwargs):
        """
        Parse a <volume-in-collection> node: volume number, series and title.
        Unexpected tags are reported in self.warnings.
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "volume-number":
                self.parse_volume(child)
            elif tag == "volume-series":
                self.parse_volume_series(child)
            elif tag == "volume-title":
                self.parse_volume_title(child)
            else:
                origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                self.warnings.append({self.pid: origin + " " + tag})

    def parse_volume_series(self, node, **kwargs):
        """Store the text of the node in self.vseries."""
        self.vseries = node.text

    def parse_volume_title(self, node, **kwargs):
        """Set title_tex, title_html and title_xml from the node."""
        self.title_tex, self.title_html = self.parse_node_with_mixed_content(node)
        self.title_xml = get_xml_from_node(node)

    def set_seq(self, node):
        """
        Compute self.seq (an int).

        The seq attribute of the node is used if it is an integer. Otherwise seq is
        based on the first number of self.volume (0 if there is none), plus
        10000 * self.vseries when vseries is an integer.
        """
        try:
            # First, use the seq attribute, if any
            self.seq = int(node.get("seq") or "")
        except ValueError:
            # Second, use self.volume (which can be like "158-159")
            if self.volume:
                first_volume = self.volume.partition("-")[0]
                try:
                    self.seq = int(first_volume)
                except ValueError:
                    self.seq = 0
            else:
                self.seq = 0

            # Third, use self.vseries as an offset
            # NOTE(review): the nesting of this step was reconstructed from a listing
            # without indentation; it is assumed to apply only when the seq attribute is
            # not usable - confirm against the repository.
            try:
                # no more than 10000 books in a series (gasp)
                self.seq = int(self.vseries) * 10000 + self.seq
            except ValueError:
                pass

3264

3265

3266class BitsBook(BookData, JatsBase):

3267 def __init__(self, *args, **kwargs):

3268 super().__init__(*args, **kwargs)

3269 self.no_bib = kwargs.get("no_bib", False)

3270

3271 self.parse_tree(kwargs["tree"])

3272

3273 def parse_tree(self, tree):

3274 super().parse_tree(tree)

3275

3276 book_type = get_normalized_attrib(tree, "book-type") or "Book"

3277 self.ctype = "book-" + book_type

3278

3279 for node in tree:

3280 if type(tree) == type(node): 3280 ↛ 3279line 3280 didn't jump to line 3279 because the condition on line 3280 was always true

3281 tag = normalize(node.tag)

3282

3283 if tag in ("collection-meta", "in-collection"):

3284 col = BitsCollection(tree=node)

3285 self.incollection.append(col)

3286 elif tag == "book-meta":

3287 self.parse_book_meta(node)

3288 elif tag == "book-body":

3289 self.parse_book_body(node)

3290 elif tag == "front-matter":

3291 self.parse_front_matter(node)

3292 elif tag == "book-back": 3292 ↛ 3308line 3292 didn't jump to line 3308 because the condition on line 3292 was always true

3293 for child in node:

3294 tag = normalize(child.tag)

3295 if tag == "ref-list":

3296 self.parse_ref_list(child)

3297 else:

3298 self.warnings.append(

3299 {

3300 self.pid: self.__class__.__name__

3301 + "."

3302 + inspect.currentframe().f_code.co_name

3303 + " "

3304 + tag

3305 }

3306 )

3307 else:

3308 self.warnings.append(

3309 {

3310 self.pid: self.__class__.__name__

3311 + "."

3312 + inspect.currentframe().f_code.co_name

3313 + " "

3314 + tag

3315 }

3316 )

3317

3318 self.set_contribs()

3319 self.set_title()

3320 self.post_parse_tree()

3321

3322 def parse_book_body(self, node, **kwargs):

3323 for child in node:

3324 if type(child) == type(node): 3324 ↛ 3323line 3324 didn't jump to line 3323 because the condition on line 3324 was always true

3325 tag = normalize(child.tag)

3326

3327 if tag == "book-part": 3327 ↛ 3332line 3327 didn't jump to line 3332 because the condition on line 3327 was always true

3328 book_part = BitsBookPart(tree=child, no_bib=self.no_bib)

3329 self.warnings.extend(book_part.warnings)

3330 self.parts.append(book_part)

3331 else:

3332 self.warnings.append(

3333 {

3334 self.pid: self.__class__.__name__

3335 + "."

3336 + inspect.currentframe().f_code.co_name

3337 + " "

3338 + tag

3339 }

3340 )

3341

3342 if not self.parts:

3343 self.body = get_text_from_node(node)

3344

3345 def parse_book_meta(self, node, **kwargs):

3346 for child in node:

3347 tag = normalize(child.tag)

3348

3349 if tag == "book-id":

3350 self.parse_id(child)

3351 elif tag == "pub-date":

3352 self.year = self.get_data_from_date(child)

3353 elif tag == "book-volume-number": 3353 ↛ 3354line 3353 didn't jump to line 3354 because the condition on line 3353 was never true

3354 self.volume = child.text

3355 self.volume_int = child.text

3356 elif tag == "pub-history":

3357 history_dates = self.get_data_from_history(child)

3358 for date in history_dates:

3359 if date["type"] == "last-modified":

3360 self.last_modified_iso_8601_date_str = date["date"]

3361 elif date["type"] == "prod-deployed-date": 3361 ↛ 3362line 3361 didn't jump to line 3362 because the condition on line 3361 was never true

3362 self.prod_deployed_date_iso_8601_date_str = date["date"]

3363 elif tag == "book-title-group":

3364 self.parse_title_group(child)

3365 elif tag == "publisher":

3366 self.publisher = JatsPublisher(tree=child)

3367 else:

3368 fct_name = "parse_" + tag.replace("-", "_")

3369 ftor = getattr(self, fct_name, None)

3370 if callable(ftor):

3371 ftor(child, add_ext_link=True)

3372 else:

3373 self.warnings.append(

3374 {

3375 self.pid: self.__class__.__name__

3376 + "."

3377 + inspect.currentframe().f_code.co_name

3378 + " "

3379 + tag

3380 }

3381 )

3382

3383 if self.last_modified_iso_8601_date_str is None: 3383 ↛ 3384line 3383 didn't jump to line 3384 because the condition on line 3383 was never true

3384 self.last_modified_iso_8601_date_str = timezone.now().isoformat()

3385

3386 def parse_custom_meta_group(self, node, **kwargs):

3387 for child in node:

3388 tag = normalize(child.tag)

3389

3390 if tag == "custom-meta": 3390 ↛ 3387line 3390 didn't jump to line 3387 because the condition on line 3390 was always true

3391 name, value = self.get_data_from_custom_meta(child)

3392

3393 if name == "provider": 3393 ↛ 3387line 3393 didn't jump to line 3387 because the condition on line 3393 was always true

3394 self.provider = value

3395

def set_contribs(self):
    """
    Fill in the contributors of the book when none of them has the "author" role.

    Nothing is changed if an author is already declared or if the book has no parts.
    Otherwise, depending on self.ctype:
      - "book-monograph": the contributors of the first part are reused
      - "book-edited-book" or "book-lecture-notes":
          - every part has the same contributors: they are reused
            (and an edited book becomes a "book-monograph")
          - otherwise: a generic "Collectif" author is appended
      - any other type: nothing is changed

    :return: None
    """
    declared_authors = [c for c in self.contributors if c["role"] == "author"]
    if declared_authors or not self.parts:
        return

    if self.ctype == "book-monograph":
        self.contributors = self.parts[0].contributors
        return

    if self.ctype not in ("book-edited-book", "book-lecture-notes"):
        return

    # Compare the contributors of each part with those of the first part;
    # any() stops at the first difference
    reference = self.parts[0].contributors
    same_everywhere = not any(part.contributors != reference for part in self.parts[1:])

    if same_everywhere:
        if self.ctype == "book-edited-book":
            self.ctype = "book-monograph"
        self.contributors = reference
        return

    collective = create_contributor()
    collective["string_name"] = "Collectif"
    collective["role"] = "author"
    collective["contrib_xml"] = get_contrib_xml(collective)
    self.contributors.append(collective)

3433

def set_title(self):
    """
    Borrow the title of the first entry of self.incollection when title_xml is empty.

    The xml, html and tex versions of the title are copied.
    Nothing is done if title_xml is not the empty string or if incollection is empty.

    :return: None
    """
    if self.title_xml != "" or len(self.incollection) == 0:
        return

    title_source = self.incollection[0]
    self.title_xml = title_source.title_xml
    self.title_html = title_source.title_html
    self.title_tex = title_source.title_tex

3439

3440

class BitsBookPart(BookPartData, JatsArticleBase):
    """
    Parser of a BITS <book-part> node.

    The node is parsed in the constructor. Book-parts nested in the <body> are parsed
    recursively and collected in self.parts. Tags that are not handled are reported
    in self.warnings.
    """

    def __init__(self, *args, **kwargs):
        """
        :param kwargs: "tree" is mandatory: the book-part node to parse.
                       "no_bib" is optional (False by default): it is stored and passed on
                       to the nested book-parts.
        """
        super().__init__(*args, **kwargs)
        self.no_bib = kwargs.get("no_bib", False)

        self.parse_tree(kwargs["tree"])

    def _warn_unhandled_tag(self, fct_name, tag):
        """
        Record in self.warnings that fct_name met a tag it does not handle.

        :param fct_name: name of the parse function that met the tag
        :param tag: the normalized tag
        :return: None
        """
        self.warnings.append({self.pid: self.__class__.__name__ + "." + fct_name + " " + tag})

    def parse_tree(self, tree):
        """
        Parse the book-part node: its attributes first, then its children
        (book-part-meta, body, front-matter, back).

        :param tree: the book-part node
        :return: None
        """
        super().parse_tree(tree)

        self.atype = get_normalized_attrib(tree, "book-part-type") or ""
        try:
            self.seq = int(get_normalized_attrib(tree, "seq") or "")
        except ValueError:
            # The seq attribute is missing or is not an integer: self.seq is left as it is
            pass

        for node in tree:
            tag = normalize(node.tag)

            if tag == "back":
                # Only the ref-list of the back matter is handled
                for back_child in node:
                    back_tag = normalize(back_child.tag)

                    if back_tag == "ref-list":
                        self.parse_ref_list(back_child)
                    else:
                        self._warn_unhandled_tag(
                            inspect.currentframe().f_code.co_name, back_tag
                        )
            elif tag == "book-part-meta":
                self.parse_book_part_meta(node)
            elif tag == "body":
                self.parse_body(node)
            elif tag == "front-matter":
                self.parse_front_matter(node)
            else:
                self._warn_unhandled_tag(inspect.currentframe().f_code.co_name, tag)

        # Work around a numdam-plus bug where a book-part can have a trans-title without a title
        # TODO: Fix numdam-plus, the books impacted and remove the hack
        self.set_title()

        self.post_parse_tree()

    def parse_book_part_meta(self, node, **kwargs):
        """
        Parse the children of a book-part-meta node.

        The id and the pages are handled here. Any other child is passed to the
        parse_<tag> function of the same name if there is one, otherwise a warning is recorded.

        :param node: the book-part-meta node
        :param kwargs: not used by this function
        :return: None
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "book-part-id":
                self.parse_id(child)
                continue

            if tag == "fpage":
                self.fpage = child.text
                self.page_type = get_normalized_attrib(child, "content-type") or ""
                continue

            if tag == "lpage":
                self.lpage = child.text
                continue

            if tag == "page-range":
                self.page_range = child.text
                continue

            # Automatic call: <my-tag> is parsed by parse_my_tag
            handler = getattr(self, "parse_" + tag.replace("-", "_"), None)
            if callable(handler):
                handler(child)
            else:
                self._warn_unhandled_tag(inspect.currentframe().f_code.co_name, tag)

    def parse_body(self, node, **kwargs):
        """
        Parse the body: create a BitsBookPart for each nested book-part
        (its warnings are added to ours), then store the text of the body in self.body.

        :param node: the body node
        :param kwargs: not used by this function
        :return: None
        """
        for child in node:
            tag = normalize(child.tag)

            if tag != "book-part":
                self._warn_unhandled_tag(inspect.currentframe().f_code.co_name, tag)
                continue

            nested_part = BitsBookPart(tree=child, no_bib=self.no_bib)
            self.warnings.extend(nested_part.warnings)
            self.parts.append(nested_part)

        self.body = get_text_from_node(node)

    def set_title(self):
        """
        Bug in some books: some chapters may have a trans-title, but no title !
        Hack: use the html and tex versions of the trans-title as the title.

        :return: None
        """
        if not self.trans_title_html or self.title_html:
            return

        self.title_html = self.trans_title_html
        self.title_tex = self.trans_title_tex

3559

3560

3561######################################################################################

3562#

3563# Functions used by ptf-tools

3564#

3565######################################################################################

3566

3567

def update_bibitem_xml(bibitem, new_ids):
    """
    Rebuild a reference after replacing the ids stored in its citation.

    The citation XML of the bibitem is wrapped in a <ref> and parsed.
    In its <element-citation> (or, if there is none, its <mixed-citation>):
      - the <ext-link> and <pub-id> children holding one of the managed id types are removed
      - a child is appended for each entry of new_ids that is checked and is not
        a false positive: a <pub-id> for doi, arxiv, tel, hal and theses.fr,
        an <ext-link> for any other id type
    If the citation has neither node, the parsed XML is used unchanged.

    :param bibitem: object whose citation_xml attribute holds the XML of the citation
    :param new_ids: dict {id_type: {"checked": ..., "false_positive": ..., "id_value": ...}}
    :return: the JatsRef built from the updated XML tree (lang is "und")
    """
    xml_parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
    )
    ref_tree = etree.fromstring(
        "".join(("<ref>", bibitem.citation_xml, "</ref>")), parser=xml_parser
    )

    # Explicit None tests: the truth value of an lxml element is not "the element exists"
    citation = ref_tree.find("element-citation")
    if citation is None:
        citation = ref_tree.find("mixed-citation")

    if citation is not None:
        ext_link_types = ("zbl-item-id", "mr-item-id", "doi", "numdam-id", "mathdoc-id", "eid")
        pub_id_types = ("zbl-item-id", "mr-item-id", "doi", "numdam-id", "mathdoc-id")

        # Collect first, remove afterwards: the children are not removed while iterating on them
        outdated_children = [
            child
            for child in citation
            if (child.tag == "ext-link" and child.get("ext-link-type") in ext_link_types)
            or (child.tag == "pub-id" and child.get("pub-id-type") in pub_id_types)
        ]
        for outdated_child in outdated_children:
            citation.remove(outdated_child)

        for id_type, id_data in new_ids.items():
            if not id_data["checked"] or id_data["false_positive"]:
                continue

            if id_type in ("doi", "arxiv", "tel", "hal", "theses.fr"):
                tag_name, type_attrib = "pub-id", "pub-id-type"
            else:
                tag_name, type_attrib = "ext-link", "ext-link-type"

            id_node = etree.Element(tag_name)
            id_node.set(type_attrib, id_type)
            id_node.text = id_data["id_value"]
            citation.append(id_node)

    # TODO Modify the call to update_bibitem_xml and pass the parent's lang
    return JatsRef(tree=ref_tree, lang="und")

3621

3622

def check_bibitem_xml(bibitem: RefData):
    """
    Parse the citation XML of a reference again and return the resulting JatsRef.

    The citation XML is wrapped in a <ref> and parsed with a recovering parser
    that removes the comments.

    :param bibitem: the reference; only its citation_xml attribute is read
    :return: the JatsRef built from the parsed XML tree (lang is "und")
    """
    ref_xml = "".join(("<ref>", bibitem.citation_xml, "</ref>"))
    ref_tree = etree.fromstring(
        ref_xml,
        parser=etree.XMLParser(
            huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
        ),
    )

    return JatsRef(tree=ref_tree, lang="und")

3632

3633

3634# Create XML strings based on internal data

3635

3636

def get_tex_from_xml(xml, tag, **kwargs):
    """
    Convert the XML of a title or of an abstract to TeX.

    For the "abstract" and "title" tags, the XML is wrapped in a minimal
    <article><front><article-meta> tree. The result is parsed by JatsArticle
    and the TeX is read from the parsed article.

    :param xml: the XML string to convert
    :param tag: "abstract" or "title"; with any other value the XML is parsed as is
                and "" is returned
    :param kwargs: passed on to the JatsArticle constructor
    :return: - tag == "abstract": the value_tex of the first abstract,
               or "" if the parsed article has no abstract
             - tag == "title": the tuple (title_tex, trans_title_tex)
             - otherwise: ""
    """
    xml_parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
    )
    etree.register_namespace("mml", "http://www.w3.org/1998/Math/MathML")

    text = xml
    if tag in ("abstract", "title"):
        text = f"<article><front><article-meta>{xml}</article-meta></front></article>"

    tree = etree.fromstring(text.encode("utf-8"), parser=xml_parser)
    xarticle = JatsArticle(tree=tree, **kwargs)

    if tag == "abstract":
        # The abstracts list can be empty (presumably when no abstract could be parsed
        # from the XML): return the default value instead of raising an IndexError
        if not xarticle.abstracts:
            return ""
        return xarticle.abstracts[0]["value_tex"]

    if tag == "title":
        return xarticle.title_tex, xarticle.trans_title_tex

    return ""

Coverage for src/ptf/cmds/xml/jats/jats_parser.py: 67%

2067 statements