Coverage for src/ptf/cmds/xml/jats/jats_parser.py: 67%

2067 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-11-05 09:56 +0000

1################################################################################################## 

2# 

3# README 

4# 

5# jats_parser.py is a replacement of xmldata.py 

6# The purpose is to parse a JATS xml (or BITS) tree from top to bottom. 

7# Each node is read only once. 

8# 

9# JatsArticle, JatsIssue, JatsJournal, BitsBook are the objects created by xml_cmds. 

10# The xml tree is parsed in the class constructor (__init__) 

11# These classes have parse_<tag> functions to parse the xml nodes and set instance variables. 

12# Some parse_<tag> functions are called directly. 

13# Ex: if tag == "article-meta": 

14# self.parse_article_meta(child) 

15# Other parse_<tag> functions are called "automatically" 

16# fct_name = 'parse_' + tag.replace('-', '_') 

17# ftor = getattr(self, fct_name, None) 

18# if callable(ftor): 

19# ftor(child) 

20# 

21# JatsBase and JatsArticleBase are base classes. 

22# They provide common instance variables and their corresponding parse_<tag> functions 

23# 

24# html_from_<tag> are used to generate the HTML text of a node with mixed content: 

25# a node that mixes text, children and tail 

26# These functions can also extract data and set instance variables (ex: self.figures) 

27# 

28# get_data_from_* parse a node, but simply return data (text, dict,...) without side effects 

29# 

30# At the end of this file, there are some functions that are/were called by ptf-tools. 

31# They are kept here for simplicity: we can switch xmldata entirely with jats_parser 

32# 

33# TODO: the import OAI or the import of a collection could simply call the first function 

34# (def parser(tree)) 

35# 

36################################################################################################## 

37 

38import copy 

39import inspect 

40import os 

41import re 

42 

43from lxml import etree 

44from pylatexenc.latexencode import unicode_to_latex 

45 

46from django.conf import settings 

47from django.urls import reverse 

48from django.utils import timezone 

49 

50from matching import scrapping 

51from ptf.cmds.xml.citation_html import add_span_class_to_html_from_article_title 

52from ptf.cmds.xml.citation_html import add_span_class_to_html_from_authors 

53from ptf.cmds.xml.citation_html import add_span_class_to_html_from_chapter_title 

54from ptf.cmds.xml.citation_html import add_span_class_to_html_from_source 

55from ptf.cmds.xml.citation_html import add_span_class_to_html_from_volume 

56from ptf.cmds.xml.citation_html import get_citation_html 

57from ptf.cmds.xml.jats.builder.issue import get_single_title_xml 

58from ptf.cmds.xml.jats.builder.issue import get_title_xml 

59from ptf.cmds.xml.xml_base import RefBase 

60from ptf.cmds.xml.xml_base import XmlParserBase 

61from ptf.cmds.xml.xml_utils import escape 

62from ptf.cmds.xml.xml_utils import get_contrib_xml 

63from ptf.cmds.xml.xml_utils import get_elsevier_image_extensions 

64from ptf.cmds.xml.xml_utils import get_normalized_attrib 

65from ptf.cmds.xml.xml_utils import get_text_from_node 

66from ptf.cmds.xml.xml_utils import get_xml_from_node 

67from ptf.cmds.xml.xml_utils import helper_update_name_params 

68from ptf.cmds.xml.xml_utils import make_links_clickable 

69from ptf.cmds.xml.xml_utils import normalize 

70from ptf.cmds.xml.xml_utils import normalize_space 

71from ptf.cmds.xml.xml_utils import split_kwds 

72from ptf.display import resolver 

73from ptf.model_data import ArticleData 

74from ptf.model_data import BookData 

75from ptf.model_data import BookPartData 

76from ptf.model_data import CollectionData 

77from ptf.model_data import ExtLinkDict 

78from ptf.model_data import Foo 

79from ptf.model_data import IssueData 

80from ptf.model_data import JournalData 

81from ptf.model_data import MathdocPublicationData 

82from ptf.model_data import PublisherData 

83from ptf.model_data import RefData 

84from ptf.model_data import create_contributor 

85from ptf.model_data import create_extlink 

86 

87 

88class JatsBase(XmlParserBase): 

def __init__(self, *args, **kwargs):
    """
    Initialise the state shared by the JATS/BITS parsers.

    The positional and keyword arguments are accepted but are not forwarded
    to the parent constructor.
    """
    super().__init__()

    # XML tree currently parsed (set by parse_tree)
    self.tree = None
    # Messages collected while parsing (unknown tags,...)
    self.warnings = []
    # HTML of the footnotes stripped from the text (see parse_node_with_fn)
    self.fns = []

    # Used to create a Tex file from an XML value (ie abstract)
    self.for_tex_file = False
    # Used to convert an XML value for CKEditor (ie abstract)
    self.add_span_around_tex_formula = False

98 

def parse_tree(self, tree):
    """
    Keep a reference to the XML tree and read its "lang" attribute.

    :param tree: root node of the XML tree to parse
    """
    self.tree = tree

    lang = get_normalized_attrib(tree, "lang")
    # "und" is used when no (or an empty) language is returned
    self.lang = lang if lang else "und"

102 

def post_parse_tree(self):
    """
    Add a "source" external link pointing to Numdam when self.no_bib is set.

    Nothing is done otherwise.
    """
    if not self.no_bib:
        return

    # For Geodesic
    source_link = create_extlink()
    source_link["rel"] = "source"
    source_link["location"] = "http://www.numdam.org/item/" + self.pid
    # Used as the source id to find the source in the GDML Views
    source_link["metadata"] = "NUMDAM"

    self.ext_links.append(source_link)

113 

def parse_node_with_article_title(self, node, **kwargs):
    """
    Parse an article-title node.

    Inside a mixed citation (kwargs["is_mixed_citation"] is true), the HTML is
    passed to add_span_class_to_html_from_article_title.

    :param node: XML node of the title
    :return: the (tex, html) pair
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    if kwargs.get("is_mixed_citation", False):
        html = add_span_class_to_html_from_article_title(html, **kwargs)

    return tex, html

122 

def parse_node_with_break(self, node, **kwargs):
    """
    Convert a break node.

    :param node: XML node of the break (its content is not read)
    :return: (tex, html): tex is a LaTeX newline for a TeX file, a space otherwise;
             html is always "<br/>"
    """
    if self.for_tex_file:
        return "\\newline\n", "<br/>"

    return " ", "<br/>"

128 

def parse_node_with_chem_struct_wrap(self, node, **kwargs):
    """
    Render a chem-struct-wrap node as a 2-cell "formula" table:
    the content on one side, the label on the other side.

    :param node: XML node of the chem-struct-wrap
    :return: the same HTML text twice (used for both the tex and html values)
    """
    table_id = node.attrib.get("id")
    label = None
    inner_parts = []

    for child in node:
        is_label = normalize(child.tag) == "label"
        _, child_html = self.parse_node_with_mixed_content(child, **kwargs)
        if is_label:
            label = child_html
        else:
            inner_parts.append(child_html)

    id_attribute = f'id="{table_id}" ' if table_id else ""
    label_cell = label if label else ""

    text = (
        "<table "
        + id_attribute
        + 'class="formula"><tr><td class="formula-inner">'
        + "".join(inner_parts)
        + "</td>"
        + '<td class="formula-label">'
        + label_cell
        + "</td></tr>"
        + "</table>"
    )

    return text, text

156 

def parse_node_with_disp_quote(self, node, **kwargs):
    """
    Wrap the content of a disp-quote node in a <div class="disp-quote">.

    :param node: XML node of the disp-quote
    :return: (tex, html), both wrapped in the same div
    """

    def wrap(content):
        return f'<div class="disp-quote">{content}</div>'

    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    return wrap(inner_tex), wrap(inner_html)

164 

def parse_node_with_boxed_text(self, node, **kwargs):
    """
    Wrap the content of a boxed-text node in a <div class="boxed-text">.
    The id of the node (if any) is copied to the div.

    :param node: XML node of the boxed-text
    :return: ("", html): there is no tex value
    """
    box_id = node.attrib.get("id")

    _, inner_html = self.parse_inner_node(node, **kwargs)

    opening = f'<div id="{box_id}" class="boxed-text">' if box_id else '<div class="boxed-text">'

    return "", f"{opening}{inner_html}</div>"

178 

def parse_node_with_fig(self, node, **kwargs):
    """
    Ex: <fig><label>LABEL</label><caption><title>TITLE</title>CAPTION</caption><graphic/></fig>
    becomes: <figure><img><figcaption>LABEL : TITLE<p>CAPTION</p></figcaption></figure>

    <attrib> and <permissions>/<copyright-statement> children are appended to the
    caption as "fig-small-caption" paragraphs.
    Unknown children are reported in self.warnings.
    If kwargs["append_floats"] is true and the parser has a "floats" attribute,
    the HTML is also stored in self.floats under the figure id.

    :param node: XML node of a fig
    :return: ("", html): there is no tex value for a figure
    """
    small_caption_tag = '<p class="fig-small-caption">'

    def warn_unknown(unknown_tag):
        self.warnings.append(
            {self.pid: self.__class__.__name__ + ".parse_node_with_fig " + unknown_tag}
        )

    fig_id = node.attrib.get("id")
    label_html = title_html = None
    # Start with an empty string (not None): <attrib>/<permissions> may come
    # without any caption paragraph and must not produce the text "None".
    caption_html = ""
    img_html = ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "label":
            _, label_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "caption":
            for caption_child in child:
                caption_tag = normalize(caption_child.tag)
                if caption_tag == "title":
                    _, title_html = self.parse_node_with_mixed_content(caption_child, **kwargs)
                elif caption_tag == "p":
                    _, paragraph_html = self.parse_node_with_mixed_content(
                        caption_child, **kwargs
                    )
                    if caption_html:
                        # Several paragraphs: the first one gets its own class,
                        # the following ones are displayed as small captions
                        caption_html = caption_html.replace(
                            "<p>", '<p class="fig-first-caption">', 1
                        )
                        caption_html += paragraph_html.replace("<p>", small_caption_tag, 1)
                    else:
                        caption_html = paragraph_html
                else:
                    warn_unknown(caption_tag)
        elif tag == "graphic":
            _, graphic_html = self.parse_node_with_graphic(child, **kwargs)
            img_html += graphic_html
        elif tag == "attrib":
            _, attrib_html = self.parse_node_with_mixed_content(child, **kwargs)
            caption_html += f"{small_caption_tag}{attrib_html}</p>"
        elif tag == "permissions":
            for gchild in child:
                # Normalize the tag like everywhere else in this parser
                if normalize(gchild.tag) == "copyright-statement":
                    _, copyright_html = self.parse_node_with_mixed_content(gchild, **kwargs)
                    caption_html += f"{small_caption_tag}{copyright_html}</p>"
        else:
            warn_unknown(tag)

    html = '<figure id="' + fig_id + '">' if fig_id else "<figure>"
    html += img_html

    if label_html or title_html or caption_html:
        html += "<figcaption>"
        if label_html:
            html += label_html
        if label_html and title_html:
            html += " : "
        if title_html:
            html += title_html
        html += caption_html
        html += "</figcaption>"

    html += "</figure>"

    if kwargs.get("append_floats") and hasattr(self, "floats") and fig_id is not None:
        self.floats[fig_id] = html

    return "", html

283 

def parse_node_with_fn(self, node, **kwargs):
    """
    Ex: <fn><label>LABEL</label><p>TEXT</p></fn>

    The footnote becomes <p><sup>LABEL</sup> TEXT</p>.
    kwargs["keep_fn"] is mandatory:
    - if true, the HTML of the footnote is returned
    - otherwise '' is returned and the HTML is added to self.fns (only once)
    The label is dropped if kwargs["keep_fn_label"] is given and false.

    :param node: XML node of a fn
    :return: ("", html)
    """
    fn_id = node.attrib.get("id")
    label_html = None
    fn_html = ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "label":
            _, label_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "p":
            _, p_html = self.parse_node_with_mixed_content(child, **kwargs)
            # The footnote is rebuilt inside a single <p>: remove the inner ones
            fn_html = p_html.replace("<p>", "").replace("</p>", "")
        else:
            method_name = inspect.currentframe().f_code.co_name
            self.warnings.append(
                {self.pid: self.__class__.__name__ + "." + method_name + " " + tag}
            )

    pieces = ['<p id="' + fn_id + '">' if fn_id else "<p>"]
    if label_html and kwargs.get("keep_fn_label", True):
        pieces.append(f"<sup>{label_html}</sup> ")
    pieces.append(fn_html + "</p>")
    html = "".join(pieces)

    if kwargs["keep_fn"]:
        return "", html

    if html not in self.fns:
        self.fns.append(html)
    return "", ""

330 

def parse_node_with_graphic(self, node, **kwargs):
    """
    The href value of graphics used in our XML can have the following values
    - relative path to the issue XML folder (Elsevier JATS)
    - full path starting with "file:/" (Elsevier JATS created in early 2022)
    - simple file name (with no relative path) in the RVT FullText XML

    After the import, we want
    - the files located in the src/tex/figures article folder
    - the url pointing to the image, built thanks to kwargs['base_url']

    addRelatedObjectPtfCmd will copy the images to the src/tex/figures folder if the location starts with file:/
    => change the location to "file:/..." for Elsevier JATS (the xarticle has a pii attribute)

    The dict describing the image is appended to self.figures and
    self.related_objects (unless an equal dict is already in self.figures).

    :param node: XML node of a graphic
    :param kwargs: "base_url" is required when the node has an href
    :return: ("", html). html is '' if the node has no href
    """
    href = ""
    for attrib in node.attrib:
        if normalize(attrib) == "href":
            href = node.attrib[attrib]

    if not href:
        # No image to display. (Returning here avoids an UnboundLocalError
        # on the HTML text, which is only built when there is an href.)
        return "", ""

    basename = os.path.basename(href)
    ext = basename.split(".")[-1]
    is_png = ext == "png"
    mimetype = "image/png" if is_png else "image/jpeg"

    img_url = "src/tex/figures/" + basename
    if ext in get_elsevier_image_extensions():
        # Elsevier uses "jc3" instead of jpg
        img_url = img_url[0 : -len(ext)] + "jpg"

    data_location = href if "file:/" in href else img_url
    if (
        hasattr(self, "pii")
        and hasattr(self, "issue")
        and "file:/" not in href
        and self.from_folder
    ):
        base_dir = self.issue.journal.pid
        if os.path.dirname(href) != base_dir:
            href = os.path.join(self.from_folder, base_dir, self.issue.pid, href)
        data_location = "file:" + href

    data = {
        "rel": "html-image",
        "mimetype": mimetype,
        "location": data_location,
        "base": None,
        "metadata": node.text if node.text is not None else "",
    }

    img_url = os.path.join(kwargs["base_url"], "png" if is_png else "jpg", img_url)
    # The lightbox id is the number of figures found before this one
    img_text = (
        f'<a href="{img_url}" data-lightbox="image-{len(self.figures)}" title="">'
        f'<img src="{img_url}" class="article-body-img" />'
        "</a>"
    )

    if data not in self.figures:
        self.figures.append(data)
        self.related_objects.append(data)

    return "", img_text

399 

def parse_node_with_inline_formula(self, node, **kwargs):
    """
    Convert a formula node (the code also handles the "disp-formula" tag).

    MathJAX is doing a good job with formulae and is now the standard.
    MathML could be ignored in HTML (the original XML value is preserved with value_xml)
    and we could simply return the tex-math text.
    But there are multiple errors in the TeX of the Mersenne articles:
    those mistakes need to be fixed before switching to TeX.

    :param node: XML node of the formula
    :return: (tex, html). html is a <span class="mathjax-formula">, put inside
             a "formula" table if there is a label or for a disp-formula
    """
    is_display = node.tag == "disp-formula"
    formula_id = node.attrib.get("id")
    label = None
    tex_math = math_text = ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "label":
            label = child.text or ""
        elif tag == "alternatives":
            for alternative in child:
                alternative_tag = normalize(alternative.tag)
                if alternative_tag == "tex-math":
                    tex_math = alternative.text or ""
                elif alternative_tag == "math":
                    # Elsevier sometimes provides the formula as an alternative image. Remove it.
                    alternative.attrib.pop("altimg", None)

                    math_text = get_xml_from_node(alternative)
                    for unwanted in (
                        "mml:",
                        'xmlns:xlink="http://www.w3.org/1999/xlink"',
                        'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"',
                    ):
                        math_text = math_text.replace(unwanted, "")
                    if is_display:
                        math_text = math_text.replace("<math", '<math display="block"')
        else:
            method_name = inspect.currentframe().f_code.co_name
            self.warnings.append(
                {self.pid: self.__class__.__name__ + "." + method_name + " " + tag}
            )

    # Exactly one of the 2 alternatives (MathML, TeX) is missing: report the caller(s)
    if (math_text == "") != (tex_math == ""):
        ignored_prefixes = (
            "parse_node",
            "parse_inner",
            "parse_tree",
            "parse_article_meta",
        )
        callers = []
        for frameinfo in inspect.stack()[1:]:
            name = frameinfo[3]
            if name.startswith("parse_") and all(part not in name for part in ignored_prefixes):
                callers.append(name)
        stack_str = " ".join(callers)
        print(f"{self.pid} no math formula for {stack_str}")

    if not is_display and tex_math != "":
        # Inline TeX is delimited by $
        if not tex_math.startswith("$"):
            tex_math = "$" + tex_math
        if not tex_math.endswith("$"):
            tex_math = tex_math + "$"

    tex = tex_math

    in_table = bool(label) or is_display
    alt_text = tex_math.replace("\n", "") if is_display else tex_math
    formula_content = math_text or tex_math

    pieces = []
    if in_table:
        pieces.append('<table class="formula"><tr><td class="formula-inner">')
    pieces.append('<span class="mathjax-formula" ')
    if formula_id:
        pieces.append('id="' + formula_id + '" ')
    pieces.append(f'data-tex="{alt_text}">{formula_content}</span>')
    if in_table:
        pieces.append('</td><td class="formula-label">')
        if label:
            pieces.append(label)
        pieces.append("</td></tr>")
        pieces.append("</table>")
    html = "".join(pieces)

    if self.add_span_around_tex_formula:
        # The first and last characters of the TeX value are dropped
        tex = f'<span class="mathjax-formula">\\({tex[1:-1]}\\)</span>'

    return tex, html

496 

def parse_node_with_institution_id(self, node, **kwargs):
    """
    An institution-id node is ignored.

    :param node: XML node (not read)
    :return: empty tex and html values
    """
    ignored = ""
    return ignored, ignored

499 

def parse_node_with_italic(self, node, **kwargs):
    """
    Convert an italic node.

    :param node: XML node of the italic
    :return: (tex, html). html is a <span class="italique">;
             tex uses {\\it } for a TeX file, <i> otherwise
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    if self.for_tex_file:
        tex = "{\\it " + inner_tex + "}"
    else:
        tex = f"<i>{inner_tex}</i>"

    return tex, f'<span class="italique">{inner_html}</span>'

520 

def parse_node_with_list(self, node, **kwargs):
    """
    Convert a list node according to its "list-type" attribute:
    - "bullet", "simple": <ul> (itemize in a TeX file)
    - "order", "number", "alpha-lower", "alpha-upper", "roman-lower", "roman-upper":
      <ol> with the corresponding type (enumerate in a TeX file)
    - anything else: <ul> without bullets (enumerate in a TeX file)

    A numbered list with a "continued-from" attribute starts after the value
    returned by self.get_list_start_value.

    The HTML value is always wrapped in its list tag, including when a TeX file
    is generated.

    :param node: XML node of the list
    :return: (tex, html)
    """
    ol_types = {
        "order": "1",
        "number": "1",
        "alpha-lower": "a",
        "alpha-upper": "A",
        "roman-lower": "i",
        "roman-upper": "I",
    }

    tex, html = self.parse_inner_node(node, **kwargs)

    start = None
    if node.get("continued-from") is not None:
        start = self.get_list_start_value(node) + 1

    list_type = node.get("list-type")
    ol_type = ol_types.get(list_type)

    if list_type in ("bullet", "simple"):
        tex_env = "itemize"
        open_tag, close_tag = "<ul>", "</ul>"
    elif ol_type is None:
        tex_env = "enumerate"
        open_tag = '<ul class="no-bullet" style="list-style-type:none;">'
        close_tag = "</ul>"
    else:
        tex_env = "enumerate"
        if ol_type == "1" and start is not None:
            # Only the numbered lists can be continued
            open_tag = f'<ol type="1" start="{start}">'
        else:
            open_tag = f'<ol type="{ol_type}">'
        close_tag = "</ol>"

    html = f"{open_tag}{html}{close_tag}"

    if self.for_tex_file:
        tex = "\n\\begin{" + tex_env + "}\n" + tex + "\\end{" + tex_env + "}\n"
    else:
        tex = f"{open_tag}{tex}{close_tag}"

    return tex, html

565 

def parse_node_with_list_item(self, node, **kwargs):
    """
    <list-item><label>LABEL</label><p>TEXT</p> becomes
    <li>LABEL TEXT</li>
    (same with <title>)

    The first <p> is inlined in the <li>. The following <p> (kept inside <p> tags
    in the HTML) and the nested <list> are appended after it, in document order.
    Unknown children are reported in self.warnings.

    :param node: XML node of the list-item
    :return: (tex, html). tex is an \\item for a TeX file, a <li> otherwise
    """
    label_tex = label_html = ""
    title_tex = title_html = ""
    p_tex = p_html = ""
    content_tex = content_html = ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "label":
            label_tex, label_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "title":
            title_tex, title_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "p":
            if p_html == "" and content_html == "":
                p_tex, p_html = self.parse_inner_node(child, **kwargs)
            else:
                # Append (do not overwrite): the item may have several paragraphs/lists
                extra_tex, extra_html = self.parse_inner_node(child, **kwargs)
                content_tex += extra_tex
                content_html += f"<p>{extra_html}</p>"
        elif tag == "list":
            list_tex, list_html = self.parse_node_with_mixed_content(child, **kwargs)
            content_tex += list_tex
            content_html += list_html
        # TODO if tag == "def-list":
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    inner_tex = ""
    if label_tex:
        inner_tex += label_tex + " "
    if title_tex:
        inner_tex += title_tex + " "
    inner_tex += p_tex + content_tex

    if self.for_tex_file:
        tex = "\\item " + inner_tex + "\n"
    else:
        tex = f"<li>{inner_tex}</li>"

    html = "<li>"
    if label_html:
        html += label_html + " "
    if title_html:
        html += title_html + " "
    html += p_html + content_html + "</li>"

    return tex, html

626 

def parse_node_with_name_content(self, node, **kwargs):
    """
    Convert a name-content node: the content is returned without decoration.

    :param node: XML node of the name-content
    :return: the (tex, html) pair of the inner content
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    return inner_tex, inner_html

630 

def parse_node_with_p(self, node, **kwargs):
    """
    Convert a p node into a <p>.
    The "specific-use" attribute (if any) becomes the class of the HTML <p>.

    If the parser has "floats_to_insert" and "floats" attributes, the pending
    floats are inserted after the paragraph (and removed from self.floats).

    :param node: XML node of the p
    :return: (tex, html). tex is not wrapped in <p> for a TeX file
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    tex = inner_tex if self.for_tex_file else f"<p>{inner_tex}</p>"

    css_class = node.get("specific-use")
    opening = f'<p class="{css_class}">' if css_class else "<p>"
    html = f"{opening}{inner_html}</p>"

    if hasattr(self, "floats_to_insert") and hasattr(self, "floats"):
        while len(self.floats_to_insert) > 0:
            pending_id = self.floats_to_insert.pop(0)
            if pending_id in self.floats:
                html += self.floats.pop(pending_id)

    return tex, html

651 

def parse_node_with_h1(self, node, **kwargs):
    """
    Convert a h1 node into a <h1>.
    The "specific-use" attribute (if any) becomes the class of the HTML <h1>.

    If the parser has "floats_to_insert" and "floats" attributes, the pending
    floats are inserted after the title (and removed from self.floats).

    :param node: XML node of the h1
    :return: (tex, html). tex is not wrapped in <h1> for a TeX file
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    tex = inner_tex if self.for_tex_file else f"<h1>{inner_tex}</h1>"

    css_class = node.get("specific-use")
    opening = f'<h1 class="{css_class}">' if css_class else "<h1>"
    html = f"{opening}{inner_html}</h1>"

    if hasattr(self, "floats_to_insert") and hasattr(self, "floats"):
        while len(self.floats_to_insert) > 0:
            pending_id = self.floats_to_insert.pop(0)
            if pending_id in self.floats:
                html += self.floats.pop(pending_id)

    return tex, html

672 

def parse_node_with_sc(self, node, **kwargs):
    """
    Convert a sc (small caps) node.

    :param node: XML node of the sc
    :return: (tex, html). Only the html is decorated, with a <span class="smallcaps">
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    return inner_tex, f'<span class="smallcaps">{inner_html}</span>'

678 

def parse_node_with_sec(self, node, **kwargs):
    """
    <sec><title>TITLE</title><p>TEXT</p> becomes
    <section><h@i>TITLE</h@i><p>TEXT</p> (i is the current level and is increased for children)

    The current level is kwargs["sec_level"] (2 if not given).
    The label and the title are parsed without the kwargs.

    :param node: XML node of the sec
    :param kwargs: "sec_level" is the level of the title of this section
    :return: (tex, html)
    """
    sec_level = kwargs.get("sec_level", 2)
    # Level given to the children sections
    kwargs["sec_level"] = sec_level + 1

    label_tex = label_html = title_tex = title_html = None
    children_tex = []
    children_html = []

    for child in node:
        tag = normalize(child.tag)
        if tag == "label":
            label_tex, label_html = self.parse_node_with_mixed_content(child)
        elif tag == "title":
            title_tex, title_html = self.parse_node_with_mixed_content(child)
        else:
            child_tex, child_html = self.parse_node_with_mixed_content(child, **kwargs)
            children_tex.append(child_tex)
            children_html.append(child_html)

    # Heading: "LABEL TITLE" (only the parts with an HTML value are kept)
    heading = [
        (part_tex, part_html)
        for part_tex, part_html in ((label_tex, label_html), (title_tex, title_html))
        if part_html
    ]

    tex = ""
    html = "<section>"
    if heading:
        tex += " ".join(part_tex for part_tex, _ in heading)
        html += f"<h{str(sec_level)}>"
        html += " ".join(part_html for _, part_html in heading)
        html += f"</h{str(sec_level)}>"

    tex += "".join(children_tex)
    html += "".join(children_html) + "</section>"

    return tex, html

726 

def parse_node_with_string_name(self, node, **kwargs):
    """
    Parse a string-name node.

    Inside a mixed citation (kwargs["is_mixed_citation"] is true), str.title() is
    applied to the HTML, then it is passed to add_span_class_to_html_from_authors.

    :param node: XML node of the string-name
    :return: the (tex, html) pair
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    if kwargs.get("is_mixed_citation", False):
        html = add_span_class_to_html_from_authors(html.title(), **kwargs)

    return tex, html

735 

def parse_node_with_strong(self, node, **kwargs):
    """
    Convert a strong node.

    :param node: XML node of the strong
    :return: (tex, html). html is a <strong>;
             tex uses {\\bf } for a TeX file, <strong> otherwise
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    tex = "{\\bf " + inner_tex + "}" if self.for_tex_file else f"<strong>{inner_tex}</strong>"

    return tex, f"<strong>{inner_html}</strong>"

746 

def parse_node_with_styled_content(self, node, **kwargs):
    """
    Convert a styled-content node.
    A non-empty "style" attribute is copied to a <span> around the HTML.

    :param node: XML node of the styled-content
    :return: (tex, html). The tex value is not decorated
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    style = node.attrib.get("style", "")
    if style:
        html = f'<span style="{style}">{html}</span>'

    return tex, html

756 

def parse_node_with_sub(self, node, **kwargs):
    """
    Convert a sub (subscript) node.

    :param node: XML node of the sub
    :return: (tex, html). html is a <sub>;
             tex uses \\textsubscript for a TeX file, <sub> otherwise
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    tex = "\\textsubscript{" + inner_tex + "}" if self.for_tex_file else f"<sub>{inner_tex}</sub>"

    return tex, f"<sub>{inner_html}</sub>"

767 

def parse_node_with_sup(self, node, **kwargs):
    """
    Convert a sup (superscript) node.

    :param node: XML node of the sup
    :return: (tex, html). html is a <sup>;
             tex uses \\textsuperscript for a TeX file, <sup> otherwise
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    tex = "\\textsuperscript{" + inner_tex + "}" if self.for_tex_file else f"<sup>{inner_tex}</sup>"

    return tex, f"<sup>{inner_html}</sup>"

778 

def parse_node_with_table_generic(self, node, **kwargs):
    """
    Convert a node of a table (table, tr, td,...) into the HTML tag with the same name.
    "row" and "entry" tags become "tr" and "td".

    A <table> gets the "table" class, plus a "nowrap-col-<i>" class for each
    colgroup/col (numbered from 1) that has a width.
    rowspan, colspan, align and style attributes are copied;
    valign becomes a "td-valign-<value>" class.

    :param node: XML node of the table element
    :return: ("", html): there is no tex value
    """
    _, inner_html = self.parse_inner_node(node, **kwargs)

    html_tags = {"row": "tr", "entry": "td"}
    tag = normalize(node.tag)
    tag = html_tags.get(tag, tag)

    attributes = []

    if tag == "table":
        classes = ["table"]
        for position, col in enumerate(node.xpath("colgroup/col"), start=1):
            if "width" in col.attrib:
                classes.append(f"nowrap-col-{position}")
        class_table = " ".join(classes)
        attributes.append(f' class="{class_table}"')

    for name in ("rowspan", "colspan", "align"):
        if name in node.attrib:
            attributes.append(" " + name + '="' + node.attrib[name] + '"')
    if "valign" in node.attrib:
        attributes.append(' class="td-valign-' + node.attrib["valign"] + '"')
    if "style" in node.attrib:
        attributes.append(' style="' + node.attrib["style"] + '"')

    open_tag = "<" + tag + "".join(attributes) + ">"

    return "", f"{open_tag}{inner_html}</{tag}>"

815 

def parse_node_with_table_wrap(self, node, **kwargs):
    """
    Create a <div class="table-wrap"> around the table.
    The label (in bold) and the caption are put in a header div, before the other children.

    If kwargs["append_floats"] is true and the parser has a "floats" attribute,
    the HTML is also stored in self.floats under the id of the node.

    :param node: XML node of the table-wrap
    :return: ("", html): there is no tex value
    """
    table_id = node.attrib.get("id")
    label = caption = None
    body_parts = []

    for child in node:
        tag = normalize(child.tag)
        _, child_html = self.parse_node_with_mixed_content(child, **kwargs)
        if tag == "label":
            label = child_html
        elif tag == "caption":
            caption = child_html
        else:
            body_parts.append(child_html)

    pieces = []
    if table_id:
        pieces.append('<div class="table-wrap table-responsive" id="' + table_id + '">')
    else:
        pieces.append('<div class="table-wrap table-responsive">')

    if label or caption:
        pieces.append('<div class="table-wrap-header">')
        if label:
            pieces.append("<strong>" + label + "</strong>")
        if label and caption:
            pieces.append(" ")
        if caption:
            pieces.append(caption)
        pieces.append("</div>")

    pieces.extend(body_parts)
    pieces.append("</div>")
    text = "".join(pieces)

    if kwargs.get("append_floats") and hasattr(self, "floats") and table_id is not None:
        self.floats[table_id] = text

    return "", text

871 

def parse_node_with_table_wrap_foot(self, node, **kwargs):
    """
    Create a <div class="table-wrap-foot"> at the bottom of the table.

    Only the <fn-group> children are rendered. They are parsed with keep_fn=True so that
    the footnotes stay inside this div.

    :param node: XML node (table-wrap-foot)
    :param kwargs: options forwarded to parse_node_with_mixed_content
    :return: ("", html)
    """
    kwargs["keep_fn"] = True

    fn_groups_html = "".join(
        self.parse_node_with_mixed_content(child, **kwargs)[1]
        for child in node
        if normalize(child.tag) == "fn-group"
    )

    return "", '<div class="table-wrap-foot">' + fn_groups_html + "</div>"

892 

def parse_node_with_toc(self, node, **kwargs):
    """
    Wrap the HTML of the toc content in a <table>.

    :param node: XML node (toc)
    :param kwargs: options forwarded to parse_inner_node
    :return: ("", html): the TeX value returned by parse_inner_node is discarded
    """
    _, rows_html = self.parse_inner_node(node, **kwargs)

    return "", f"<table>{rows_html}</table>"

902 

def parse_node_with_toc_entry(self, node, **kwargs):
    """
    Generate the HTML table row of a <toc-entry>, followed by the rows of its nested
    <toc-entry> children.

    The row has 2 cells: "<label> <title>" and "p. <page>". The page comes from a
    <nav-pointer> child, or from the <nav-pointer specific-use="pagenum"> of a
    <nav-pointer-group>. If the group has a <nav-pointer specific-use="pageindex"> with a
    numeric value, both cells link to page (value + 1) of the PDF of self.pid.

    :param node: XML node (toc-entry)
    :param kwargs: options; inside_toc_entry is set for nested entries
    :return: ("", html)
    """
    label = title = page = nested_rows = ""
    anchor = ""
    toc_class = "inside-toc" if kwargs.get("inside_toc_entry") else ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "title":
            _, title = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "label":
            _, label = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "nav-pointer":
            _, page = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "nav-pointer-group":
            for grandchild in child:
                # Tags are normalized like the children's tags above
                if normalize(grandchild.tag) != "nav-pointer":
                    continue
                specific_use = grandchild.attrib.get("specific-use")
                if specific_use == "pagenum":
                    _, page = self.parse_node_with_mixed_content(grandchild, **kwargs)
                elif specific_use == "pageindex":
                    try:
                        anchor = int(grandchild.text) + 1
                    except (TypeError, ValueError):
                        # Empty or non-numeric page index: the entry is rendered
                        # without a link instead of aborting the whole parsing
                        pass
        elif tag == "toc-entry":
            # Nested entries are parsed with the inside_toc_entry option only
            _, text = self.parse_node_with_mixed_content(child, inside_toc_entry=True)
            nested_rows += text

    toc_text = f"{label} {title}"
    page_text = f"p. {page}"

    if anchor:
        href = reverse("item-pdf", kwargs={"pid": self.pid, "extension": "pdf"})
        href += f"#page={anchor}"
        toc_text = f'<a href="{href}">{toc_text}</a>'
        page_text = f'<a href="{href}">{page_text}</a>'

    row = f'<tr><td class="{toc_class}">{toc_text}</td><td class="toc-page">{page_text}</td></tr>'

    return "", row + nested_rows

956 

def parse_node_with_underline(self, node, **kwargs):
    """
    Wrap the content of an underline node in <u>, for both the TeX and the HTML values.

    :param node: XML node
    :param kwargs: options forwarded to parse_inner_node
    :return: (tex, html) tuple
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    return f"<u>{inner_tex}</u>", f"<u>{inner_html}</u>"

963 

def parse_node_with_volume(self, node, **kwargs):
    """
    Generate the text of a volume node.

    Inside a mixed citation (is_mixed_citation option), the HTML is post-processed by
    add_span_class_to_html_from_volume.

    :param node: XML node
    :param kwargs: options forwarded to parse_inner_node
    :return: (tex, html) tuple
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    if kwargs.get("is_mixed_citation", False):
        html = add_span_class_to_html_from_volume(html, **kwargs)

    return tex, html

972 

def parse_node_with_xref(self, node, **kwargs):
    """
    Generate the text of an <xref>: the HTML is a link to the first id listed in rid.

    A first id starting with "bib" is renamed ("bib12" -> "r12") in the link. If the
    ref-type is fig, table or textbox and the object has a floats_to_insert attribute,
    all the ids of rid are appended to it.

    Nothing is returned when the ignore_xref option is set or when rid is missing or
    blank.

    :param node: XML node (xref)
    :param kwargs: options forwarded to parse_inner_node
    :return: (tex, html) tuple
    """
    tex = html = ""

    if kwargs.get("ignore_xref"):
        return tex, html

    # split() gives an empty list for a missing, empty or whitespace-only rid:
    # there is no target to link to (a blank rid used to raise an IndexError).
    rids = (node.get("rid") or "").split()
    if not rids:
        return tex, html

    tex, html = self.parse_inner_node(node, **kwargs)

    target = rids[0]
    if target.startswith("bib"):
        target = "r" + target[3:]
    html = f'<a href="#{target}">{html}</a>'

    # The ref-type does not depend on the rid: read it once
    ref_type = node.get("ref-type") or None
    if ref_type in ["fig", "table", "textbox"] and hasattr(self, "floats_to_insert"):
        self.floats_to_insert.extend(rids)

    return tex, html

995 

def parse_inner_node(self, node, **kwargs):
    """
    Return the text of the inside of a node: its own text followed by the text of its
    children (the tail of the node itself is not added here).

    Used by the parse_node_with_<tag> functions of nodes that have a different tag in HTML.
    The children are parsed with is_top=False.

    :param node: XML node
    :param kwargs: options forwarded to parse_node_with_mixed_content
    :return: (tex, html) tuple
    """
    kwargs["is_top"] = False
    kwargs.setdefault("is_body_html", False)

    tex = html = ""
    if node.text:
        tex = unicode_to_latex(node.text) if self.for_tex_file else node.text
        html = escape(node.text)

    for child in node:
        child_tex, child_html = self.parse_node_with_mixed_content(child, **kwargs)
        tex += child_tex
        html += child_html

    return tex, html

1020 

def parse_node_with_mixed_content(self, node, **kwargs):
    """
    Parse an XML node which mixes text and XML sub-nodes and return its TeX and HTML text.
    Ex: <node>text1 <a>text_a</a> text2 <b>text_b</b>b_tail</node>
    Some inner nodes are removed, others are kept or replaced by their HTML equivalent.
    The function is called recursively to get the text of the children.

    :param node: XML Node
    :param kwargs: params of the function (missing options get a default value)
    :return: (tex, html) tuple; ("", "") if node is None
    """
    if node is None:
        return "", ""

    option_defaults = (
        # The tail is the text following the end of the node
        # Ex: <node>text1<a>text_a</a>a_tail</node>
        # The text has to include the tail only if the function was called recursively
        ("is_top", True),
        # sec_level is used to add <h1>, <h2>,... in the HTML text while parsing <sec>
        ("sec_level", 2),
        # Text in <comment> is parsed to add HTML links
        ("add_HTML_link", False),
        # base_url to image links
        ("base_url", ""),
        # footnotes are removed from the fulltext (and put at the end),
        # except for those in a table
        ("keep_fn", False),
        ("is_citation", False),
        ("is_comment", False),
        # mixed-citation ignores ext-link
        ("add_ext_link", False),
        # TODO remove once jats_parser has been validated against xmldata
        ("temp_math", False),
        ("temp_tex", False),
        ("is_mixed_citation", False),
        ("is_body_html", False),
    )
    for option, fallback in option_defaults:
        kwargs.setdefault(option, fallback)

    tag = normalize(node.tag)

    # pub-id/object-id are ignored by default as they are treated separately
    if tag in ("pub-id", "object-id") and not kwargs["is_comment"]:
        return "", ""

    if tag in ("mixed-citation", "toc"):
        kwargs["is_citation"] = True
    elif tag == "comment":
        kwargs["is_comment"] = True

    tex = html = inner_tex = inner_html = ""

    # I. Add the node's text.
    # Some tags have a corresponding parse_node_with_<tag> function to generate the text;
    # some tags share the function of another tag.
    handler_aliases = {
        "statement": "sec",
        "disp-formula": "inline-formula",
        "chapter-title": "article-title",
        "bold": "strong",
    }
    table_tags = (
        "table", "th", "tr", "td", "thead", "tbody",
        "colgroup", "col", "tgroup", "entry", "row",
    )
    handler_aliases.update(dict.fromkeys(table_tags, "table-generic"))

    handler_name = "parse_node_with_" + handler_aliases.get(tag, tag).replace("-", "_")
    handler = getattr(self, handler_name, None)

    if callable(handler):
        inner_tex, inner_html = handler(node, **kwargs)
    elif tag in ("ext-link", "uri"):
        # Add HTML links
        inner_tex = inner_html = self.helper_add_link_from_node(node, **kwargs)
        # Update self.ext_links. Useful for <ext-link> deep in a <mixed_citation>,
        # and not caught by parse_citation_node
        if tag == "ext-link" and not kwargs["is_comment"] and kwargs["add_ext_link"]:
            is_extid_value = self.parse_ext_link(node, **kwargs)
            if is_extid_value and kwargs["is_mixed_citation"]:
                # an extid has been found in a mixed_citation:
                # no need to add the text of the id here
                inner_tex = inner_html = ""
    elif tag == "supplementary-material":
        self.parse_supplementary_material(node, **kwargs)
    else:
        # II.1. Add the node text (before the children text)
        if node.text is not None:
            node_text = node.text
            if self.for_tex_file:
                node_text = unicode_to_latex(node_text)
            inner_tex += node_text
            inner_html += escape(node.text)

        # II.2. children
        child_kwargs = {**kwargs, "is_top": False}

        for child in node:
            child_tex, child_html = self.parse_node_with_mixed_content(child, **child_kwargs)

            # Case where an ext-link has been removed in a mixed-citation
            # We may have "title. , (year)"
            # Remove the comma that is now useless
            punctuation_is_orphan = (
                kwargs["is_mixed_citation"]
                and child_html
                and child_html[0] in [",", "."]
                and inner_html[-2:] == ". "
            )
            if punctuation_is_orphan:
                inner_html = inner_html[:-1]
                child_html = child_html[1:]
                inner_tex = inner_tex[:-1]
                child_tex = child_tex[1:]

            inner_tex += child_tex
            inner_html += child_html

        # II.3. wrap the children text with html links
        # (skipped when the node text starts with a newline or a space)
        if kwargs["add_HTML_link"] and node.text:
            if re.match(r"[\n ]+", node.text) is None:
                inner_html = make_links_clickable(node.text, inner_html)

    tex += inner_tex
    html += inner_html

    # III. Add the node's tail for children
    if node.tail and not kwargs["is_top"]:
        tail_tex = node.tail
        if self.for_tex_file:
            tail_tex = unicode_to_latex(tail_tex)
        tex += tail_tex
        html += escape(node.tail)

    return tex, html

1172 

def parse_abstract(self, node, **kwargs):
    """
    Parse an <abstract> and append it to self.abstracts.

    The tag of the abstract is its abstract-type attribute ("abstract" if missing or if
    the type is "author"). The lang defaults to self.lang.

    :param node: XML node (abstract)
    :param kwargs: unused
    :return: None
    """
    abstract_tag = get_normalized_attrib(node, "abstract-type") or "abstract"
    if abstract_tag == "author":
        abstract_tag = "abstract"

    lang = get_normalized_attrib(node, "lang") or self.lang
    tex, html = self.parse_node_with_mixed_content(node)
    xml = get_xml_from_node(node)

    abstract = {
        "tag": abstract_tag,
        "lang": lang,
        "value_xml": xml,
        "value_html": html,
        "value_tex": tex,
    }
    self.abstracts.append(abstract)

1190 

def parse_aff_alternatives(self, node, **kwargs):
    """
    Parse an <aff-alternatives> and add its address to the contributors.

    Only the complete address text of the <aff> children is used. If an <aff> has a
    <label>, the address is the text of the <aff> without the label, and it is only added
    to the contributors whose xrefs contain the id of the node. Otherwise the address is
    the text of the <aff> and it is added to all the contributors.

    :param node: XML node (aff-alternatives)
    :param kwargs: unused
    :return: None
    """
    xref_id = get_normalized_attrib(node, "id") or ""
    address = ""
    for_all_contribs = True

    for child in node:
        tag = normalize(child.tag)

        if tag != "aff":
            location = ".".join(
                (self.__class__.__name__, inspect.currentframe().f_code.co_name)
            )
            self.warnings.append({self.pid: location + " " + tag})
            continue

        # Skip the formatted aff and use only the complete address text
        # TODO support <aff> properly
        for part in child:
            if part.tag == "label" and address == "":
                label = get_text_from_node(part)
                address = get_text_from_node(child)[len(label) :]
                for_all_contribs = False
        if address == "" and child.text:
            address = child.text

    if address == "":
        return

    for contrib in self.contributors:
        if address in contrib["addresses"]:
            continue
        if ("xrefs" in contrib and xref_id in contrib["xrefs"]) or for_all_contribs:
            contrib["addresses"].append(address)
            contrib["contrib_xml"] = get_contrib_xml(contrib)

1227 

def parse_award_group(self, node, **kwargs):
    """
    Parse an <award-group> and append {"abbrev", "award_id"} to self.awards.

    The award is only added if both an <award-id> and a <funding-source> were found.
    A warning is recorded for any other child tag.

    :param node: XML node (award-group)
    :param kwargs: unused
    :return: None
    """
    funding_source = award_id = None

    for child in node:
        tag = normalize(child.tag)

        if tag == "award-id":
            award_id = child.text
        elif tag == "funding-source":
            funding_source = get_text_from_node(child)
        else:
            location = ".".join(
                (self.__class__.__name__, inspect.currentframe().f_code.co_name)
            )
            self.warnings.append({self.pid: location + " " + tag})

    if funding_source is None or award_id is None:
        return

    self.awards.append({"abbrev": funding_source, "award_id": award_id})

1251 

def parse_contrib_group(self, node, **kwargs):
    """
    Parse a <contrib-group>: contributors, affiliations and footnotes.

    The role of the group is its content-type attribute without its final "s"
    (ex: "authors" -> "author"). It is prepended to the role of each contributor.

    :param node: XML node (contrib-group)
    :param kwargs: unused
    :return: None
    """
    group_role = node.get("content-type") or ""
    if group_role.endswith("s"):
        group_role = group_role[:-1]

    for child in node:
        tag = normalize(child.tag)

        if tag == "contrib":
            contrib = self.get_data_from_contrib(child)
            own_role = contrib["role"]
            contrib["role"] = f"{group_role}|{own_role}" if own_role else group_role
            contrib["contrib_xml"] = get_xml_from_node(child)
            self.contributors.append(contrib)
        elif tag == "aff-alternatives":
            self.parse_aff_alternatives(child)
        elif tag == "fn":
            _, fn_html = self.parse_node_with_fn(child, keep_fn=True, keep_fn_label=False)
            fn_xml = get_xml_from_node(child)
            self.footnotes_xml += fn_xml
            self.footnotes_html += fn_html
        else:
            location = ".".join(
                (self.__class__.__name__, inspect.currentframe().f_code.co_name)
            )
            self.warnings.append({self.pid: location + " " + tag})

1282 

def parse_counts(self, node, **kwargs):
    """
    Parse a <counts> node and append (tag, value) tuples to self.counts.

    The value of a child is its count attribute, or its text if the attribute is missing.
    Children without any value are skipped. "book-page-count" is stored as "page-count".

    :param node: XML node (counts)
    :param kwargs: unused
    :return: None
    """
    for child in node:
        value = child.get("count")
        if value is None:
            value = child.text
        if value is None:
            continue

        tag = normalize(child.tag)
        self.counts.append(("page-count" if tag == "book-page-count" else tag, value))

1295 

def parse_ext_link(self, node, **kwargs):
    """
    Parse an <ext-link>: register it as an extid or as an external link.

    The link data is first offered to add_extids_from_node_with_link. If no extid was
    found and the add_ext_link option is set, the data is appended to self.ext_links,
    unless it is already there or its rel is "cover".

    :param node: XML node (ext-link)
    :param kwargs: options; only add_ext_link is used
    :return: True if the link was recognized as an extid
    """
    datas = self.get_data_from_ext_link(node)
    extid_value = self.add_extids_from_node_with_link(datas)

    if kwargs.get("add_ext_link", False):
        is_new_link = (
            extid_value[0] is None
            and datas not in self.ext_links
            and datas["rel"] != "cover"
        )
        if is_new_link:
            self.ext_links.append(datas)

    return extid_value[0] is not None

1310 

def parse_front_matter(self, node, **kwargs):
    """
    Parse a <front-matter>: store its XML and the HTML of its foreword and toc.

    self.frontmatter_foreword_html is reset to "" before the children are parsed;
    self.frontmatter_toc_html is only assigned when a <toc> child is found.

    :param node: XML node (front-matter)
    :param kwargs: unused
    :return: None
    """
    self.frontmatter_xml = get_xml_from_node(node)
    self.frontmatter_foreword_html = ""

    for child in node:
        tag = normalize(child.tag)
        if tag not in ("foreword", "toc"):
            continue

        _, child_html = self.parse_node_with_mixed_content(child)
        if tag == "foreword":
            self.frontmatter_foreword_html = child_html
        else:
            self.frontmatter_toc_html = child_html

1322 

def parse_id(self, node, **kwargs):
    """
    Parse an id node (article-id, book-id, book-part-id...) and store its value.

    The type of the id is read from the pub-id-type, book-id-type or book-part-id-type
    attribute (in that order):
      - pii: stored in self.pii, only if self.pid starts with "CR" and is longer than
        2 characters
      - numdam-id, mathdoc-id: stored in self.pid
      - ark: appended to self.extids
      - doi, eid: appended to self.ids; a doi is also stored in self.doi
    Other types are ignored.

    :param node: XML node
    :param kwargs: unused
    :return: None
    """
    value = node.text

    id_type = ""
    for attr_name in ("pub-id-type", "book-id-type", "book-part-id-type"):
        if attr_name in node.attrib:
            id_type = node.attrib[attr_name]
            break

    if id_type == "pii":
        # Elsevier ids get a special treatment: web scraping to find the date_published
        if self.pid and len(self.pid) > 2 and self.pid.startswith("CR"):
            self.pii = value
    elif id_type in ("numdam-id", "mathdoc-id"):
        self.pid = value
    elif id_type == "ark":
        self.extids.append((id_type, value))
    elif id_type in ("doi", "eid"):
        self.ids.append((id_type, value))
        if id_type == "doi":
            self.doi = value

1346 

def parse_kwd_group(self, node, **kwargs):
    """
    Parse a <kwd-group> and add its keywords to self.kwds.

    Each <kwd> child gives 1 keyword (its text). An <unstructured-kwd-group> child
    replaces the keywords found so far by the result of split_kwds on its TeX text.
    A warning is recorded for any other child tag.

    The type of the keywords is the content-type attribute of the group, or its
    kwd-group-type attribute, or "". The lang defaults to self.lang.

    :param node: XML node (kwd-group)
    :param kwargs: unused
    :return: None
    """
    kwds = []

    for child in node:
        tag = normalize(child.tag)

        if tag == "kwd":
            kwds.append(child.text)
        elif tag == "unstructured-kwd-group":
            value_tex, _ = self.parse_node_with_mixed_content(child)
            kwds = split_kwds(value_tex)
        else:
            location = ".".join(
                (self.__class__.__name__, inspect.currentframe().f_code.co_name)
            )
            self.warnings.append({self.pid: location + " " + tag})

    # NOTE(review): the attribute used to be read as "content-node_type", which looks
    # like the residue of a variable renaming (type -> node_type) and could never match
    # an attribute: "content-type" is assumed to be the intended name.
    content_type = node.get("content-type") or node.get("kwd-group-type") or ""
    lang = get_normalized_attrib(node, "lang") or self.lang

    self.kwds.extend({"type": content_type, "lang": lang, "value": kwd} for kwd in kwds)

1376 

def parse_ref_list(self, node, **kwargs):
    """
    Parse a <ref-list>: create a JatsRef for each <ref>.

    Each reference is appended to self.bibitems and its citation_html to self.bibitem;
    the warnings of the reference are added to self.warnings.
    <p> children are parsed for their side effects only (the returned text is ignored).

    :param node: XML node (ref-list)
    :param kwargs: unused
    :return: None
    """
    for child in node:
        tag = normalize(child.tag)

        if tag == "ref":
            reference = JatsRef(tree=child, lang=self.lang)
            self.warnings.extend(reference.warnings)
            self.bibitems.append(reference)
            self.bibitem.append(reference.citation_html)
            continue

        if tag == "p":
            # Elsevier can store supplementary-material inside ref-list / p
            self.parse_node_with_mixed_content(child)
            continue

        location = ".".join(
            (self.__class__.__name__, inspect.currentframe().f_code.co_name)
        )
        self.warnings.append({self.pid: location + " " + tag})

1399 

def parse_related_article(self, node, **kwargs):
    """
    Parse a <related-article> and append a relation (rel_type, id_value) to
    self.relations.

    If the object has a pii attribute and the id is not a DOI (it does not contain "10.")
    nor "NONE", the id is replaced by the result of scrapping.fetch_article.

    :param node: XML node (related-article)
    :param kwargs: unused
    :return: None
    """
    rel_type = get_normalized_attrib(node, "related-article-type") or ""
    id_value = node.text

    is_pii = (
        hasattr(self, "pii")
        and id_value
        and id_value.find("10.") == -1
        and id_value != "NONE"
    )
    if is_pii:
        # a pii is used instead of a DOI
        # Call Elsevier to get the doi
        id_value = scrapping.fetch_article(self.doi, id_value, pii_doi_equivalence=True)

    relation = Foo()
    relation.rel_type = rel_type
    relation.id_value = id_value

    self.relations.append(relation)

1415 

def parse_related_object(self, node, **kwargs):
    """
    Parse a <related-object>.

    With a document-id-type attribute, the node describes a relation: a "refers to"
    relation to the document-id is appended to self.relations, unless the id is "NONE".
    A non-empty id that is not a DOI (it does not contain "10.") is first replaced by
    the result of scrapping.fetch_article.
    Otherwise, the link data is appended to self.related_objects.

    :param node: XML node (related-object)
    :param kwargs: unused
    :return: None
    """
    data = {
        "rel": node.get("link-type") or "",
        "mimetype": node.get("content-type") or "",
        "location": get_normalized_attrib(node, "href") or "",
        "base": get_normalized_attrib(node, "base") or "",
        "metadata": get_xml_from_node(node),
    }

    if not (node.get("document-id-type") or ""):
        self.related_objects.append(data)
        return

    id_value = node.get("document-id") or ""
    if id_value == "NONE":
        return

    if id_value and id_value.find("10.") == -1:
        # a pii is used instead of a DOI
        # Call Elsevier to get the doi
        id_value = scrapping.fetch_article(self.doi, id_value, pii_doi_equivalence=True)

    relation = Foo()
    relation.rel_type = "refers to"
    relation.id_value = id_value

    self.relations.append(relation)

1448 

def parse_sec(self, node, **kwargs):
    """
    Parse a <sec>: only its <ref-list> children are parsed.

    <title> children are ignored; a warning is recorded for any other child tag.

    :param node: XML node (sec)
    :param kwargs: unused
    :return: None
    """
    for child in node:
        tag = normalize(child.tag)

        if tag == "title":
            continue

        if tag == "ref-list":
            self.parse_ref_list(child)
        else:
            location = ".".join(
                (self.__class__.__name__, inspect.currentframe().f_code.co_name)
            )
            self.warnings.append({self.pid: location + " " + tag})

1467 

def parse_self_uri(self, node, **kwargs):
    """
    Parse a <self-uri> and append a "full-text" stream to self.streams.

    The mimetype is the content-type attribute ("text/html" if missing).
    "application/xml" streams are not stored.

    :param node: XML node (self-uri)
    :param kwargs: unused
    :return: None
    """
    mimetype = node.get("content-type") or "text/html"
    href = get_normalized_attrib(node, "href") or ""
    base = get_normalized_attrib(node, "base") or ""

    # The XML of the Elsevier archive does not declare the PDF location like the other
    # Mathdoc collections. The collection folder is missing: add it back
    if hasattr(self, "pii") and hasattr(self, "issue"):
        collection_dir = self.issue.journal.pid
        if os.path.dirname(href) != collection_dir:
            href = os.path.join(collection_dir, self.issue.pid, href)

    if self.no_bib:
        href = "http://www.numdam.org/item/" + os.path.basename(href)

    text = "" if node.text is None else normalize_space(node.text)

    # Ext-links, Related-objects used metadata instead of text. Strange difference ?
    stream = {
        "rel": "full-text",
        "mimetype": mimetype,
        "location": href,
        "base": base,
        "text": text,
    }

    # xml_cmds ignores "application/xml" in add_objects_with_location:
    # these streams are ignored here.
    if mimetype == "application/xml":
        return

    self.streams.append(stream)

1495 

def parse_sub_article(self, node, **kwargs):
    """
    Parse a <sub-article> (used for translations): create a JatsArticle from the node
    and append it to self.translations.

    :param node: XML node (sub-article)
    :param kwargs: unused
    :return: None
    """
    self.translations.append(JatsArticle(tree=node))

1500 

def parse_subj_group(self, node, **kwargs):
    """
    Parse a <subj-group> and append its subjects to self.subjs.

    Each <subject> child gives 1 entry with the subj-group-type of the group, its lang
    (self.lang by default) and the text of the subject. A warning is recorded for any
    other child tag.

    :param node: XML node (subj-group)
    :param kwargs: unused
    :return: None
    """
    lang = get_normalized_attrib(node, "lang") or self.lang
    group_type = node.get("subj-group-type") or ""

    for child in node:
        tag = normalize(child.tag)

        if tag != "subject":
            location = ".".join(
                (self.__class__.__name__, inspect.currentframe().f_code.co_name)
            )
            self.warnings.append({self.pid: location + " " + tag})
            continue

        subject = {"type": group_type, "lang": lang, "value": get_text_from_node(child)}
        self.subjs.append(subject)

1522 

def parse_supplementary_material(self, node, **kwargs):
    """
    Parse a <supplementary-material> and append it to self.supplementary_materials.

    The location is the href attribute, or the id attribute, or "". The mimetype is the
    mimetype attribute, or the result of resolver.get_mimetype on the location.
    The material is not added if a material with the same file name (basename of the
    location) is already stored.

    :param node: XML node (supplementary-material)
    :param kwargs: unused
    :return: None
    """
    caption = ""
    for child in node:
        if child.tag == "caption":
            _, caption = self.parse_node_with_mixed_content(child)

    location = get_normalized_attrib(node, "href") or get_normalized_attrib(node, "id") or ""
    mimetype = node.attrib.get("mimetype") or resolver.get_mimetype(location)

    material = {
        "rel": node.attrib.get("content-type") or "supplementary-material",
        "mimetype": mimetype,
        "location": location,
        "base": "",
        "metadata": "",
        "caption": caption or "",
    }

    file_name = os.path.basename(location)
    same_file = [
        known
        for known in self.supplementary_materials
        if os.path.basename(known["location"]) == file_name
    ]
    if not same_file:
        self.supplementary_materials.append(material)

1553 

def parse_title(self, node, **kwargs):
    """
    Set self.title_tex and self.title_html from a title node. The xrefs are ignored.

    In xmldata.py, title_xml had the <title_group> tag: self.title_xml can't be set here.

    :param node: XML node of the title
    :param kwargs: unused
    :return: None
    """
    title_tex, title_html = self.parse_node_with_mixed_content(node, ignore_xref=True)
    self.title_tex = title_tex
    self.title_html = title_html

1560 

def parse_title_group(self, node, **kwargs):
    """
    Parse a <title-group>: title, subtitle, translated titles, abbreviated title and
    footnotes.

    A subtitle is appended to the title (after a space). The <fn> of a <fn-group> are
    added to self.footnotes_xml / self.footnotes_html. self.title_xml is the XML of the
    title-group, without its fn-group children if there are any.

    :param node: XML node (title-group)
    :param kwargs: unused
    :return: None
    """
    title_tags = ("title", "journal-title", "article-title", "book-title", "issue-title")
    fn_group_found = False

    for child in node:
        tag = normalize(child.tag)

        if tag in title_tags:
            self.parse_title(child)
        elif tag == "subtitle":
            subtitle_tex, subtitle_html = self.parse_node_with_mixed_content(child)
            self.title_tex += " " + subtitle_tex
            self.title_html += " " + subtitle_html
        elif tag == "trans-title-group":
            self.parse_trans_title_group(child)
        elif tag == "abbrev-title":
            _, self.abbrev = self.parse_node_with_mixed_content(child)
        elif tag == "fn-group":
            fn_group_found = True
            for fn_node in child:
                if fn_node.tag != "fn":
                    continue
                _, fn_html = self.parse_node_with_fn(
                    fn_node, keep_fn=True, keep_fn_label=False
                )
                fn_xml = get_xml_from_node(fn_node)
                self.footnotes_xml += fn_xml
                self.footnotes_html += fn_html
        else:
            location = ".".join(
                (self.__class__.__name__, inspect.currentframe().f_code.co_name)
            )
            self.warnings.append({self.pid: location + " " + tag})

    xml_source = node
    if fn_group_found:
        # fn-group is now a funding statement and will be exported separately in the XML:
        # => remove it from the title-group
        xml_source = etree.Element("title-group")
        for child in node:
            if normalize(child.tag) != "fn-group":
                xml_source.append(copy.deepcopy(child))

    self.title_xml = get_xml_from_node(xml_source)

1609 

def parse_trans_abstract(self, node, **kwargs):
    """
    Parse a <trans-abstract> and append it to self.abstracts.

    The tag of the abstract is its abstract-type attribute ("abstract" if missing or if
    the type is "author"). The lang is "und" if the node does not declare one.

    :param node: XML node (trans-abstract)
    :param kwargs: unused
    :return: None
    """
    abstract_tag = get_normalized_attrib(node, "abstract-type") or "abstract"
    if abstract_tag == "author":
        abstract_tag = "abstract"

    lang = get_normalized_attrib(node, "lang") or "und"
    tex, html = self.parse_node_with_mixed_content(node)
    xml = get_xml_from_node(node)

    abstract = {
        "tag": abstract_tag,
        "lang": lang,
        "value_xml": xml,
        "value_html": html,
        "value_tex": tex,
    }
    self.abstracts.append(abstract)

1626 

def parse_trans_title(self, node, **kwargs):
    """
    Set self.trans_title_tex, self.trans_title_html and self.trans_title_xml from a
    <trans-title> node.

    :param node: XML node (trans-title)
    :param kwargs: unused
    :return: None
    """
    title_tex, title_html = self.parse_node_with_mixed_content(node)
    self.trans_title_tex = title_tex
    self.trans_title_html = title_html
    self.trans_title_xml = get_xml_from_node(node)

1630 

def parse_trans_title_group(self, node, **kwargs):
    """
    Parse a <trans-title-group>: its <trans-title> children and its lang.

    A warning is recorded for any other child tag. self.trans_lang is "und" if the node
    does not declare a lang.

    :param node: XML node (trans-title-group)
    :param kwargs: unused
    :return: None
    """
    for child in node:
        tag = normalize(child.tag)

        if tag != "trans-title":
            location = ".".join(
                (self.__class__.__name__, inspect.currentframe().f_code.co_name)
            )
            self.warnings.append({self.pid: location + " " + tag})
            continue

        self.parse_trans_title(child)

    self.trans_lang = get_normalized_attrib(node, "lang") or "und"

1649 

def get_data_from_contrib(self, node):
    """
    Build the contributor described by a <contrib> node.

    <contrib> creates 1 person, defined in <name>, <string-name> or <name-alternatives>.
    In a <mixed-citation>, each <name> creates 1 person: we can't use the same code.

    :param node: the <contrib> node; its "corresp", "deceased" and "equal-contrib"
                 attributes are read as well as its children
    :return: the dict returned by create_contributor(), filled with the data found
    """

    person = create_contributor()

    for part in node:
        kind = part.tag

        if kind in ("name", "string-name"):
            self.update_data_from_name(part, person)
            # No first/last name was found: keep the raw text of <string-name>
            if (
                kind == "string-name"
                and person["first_name"] == ""
                and person["last_name"] == ""
            ):
                person["string_name"] = part.text or ""
        elif kind == "name-alternatives":
            person["mid"] = self.get_data_from_name_alternatives(part)
        elif kind == "contrib-id":
            id_type = part.get("contrib-id-type") or ""
            if id_type in ("orcid", "idref"):
                person[id_type] = part.text or ""
        elif kind == "address":
            person["addresses"].append(get_text_from_node(part))
        elif kind == "email":
            person["email"] = part.text or ""
        elif kind == "xref":
            # Elsevier uses xref/aff-alternatives to store affiliations
            if (part.get("ref-type") or "") == "aff":
                # Prefer the rid attribute; fall back to the node text
                target = part.get("rid") or get_text_from_node(part)
                if target != "":
                    person.setdefault("xrefs", []).append(target)
        elif kind == "collab":
            person["string_name"] = part.text or ""
        elif kind == "role":
            # Role is used in BJHTUP11 as a textual description of the role (ex "Présidente").
            # The node value can not be assigned to the "role" key as we want a controlled
            # vocabulary (author / editor / organizer...): the value is ignored.
            pass
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + part.tag
                }
            )

    # Addresses are deliberately left unsorted: a sort (introduced on 22/09/2020) caused
    # differences between the HTML and the PDF (discovered in PCJ).

    helper_update_name_params(person)

    if node.get("corresp") == "yes":
        person["corresponding"] = True

    person["deceased_before_publication"] = node.get("deceased") == "yes"
    person["equal_contrib"] = node.get("equal-contrib") == "yes"

    return person

1729 

def get_data_from_custom_meta(self, node):
    """Return the ``(name, value)`` pair stored in a <custom-meta> node.

    The name is the text of the <meta-name> child and the value the text of
    the <meta-value> child. Both default to "" when the child is missing, and
    an empty child (whose ``text`` is None) also yields "" so that callers
    always receive strings. Any other child is reported in ``self.warnings``.

    :param node: the <custom-meta> node
    :return: tuple ``(name, value)`` of strings
    """
    name = ""
    value = ""

    for child in node:
        tag = normalize(child.tag)

        if tag == "meta-name":
            # child.text is None for an empty element: keep the "" default
            name = child.text or ""
        elif tag == "meta-value":
            value = child.text or ""
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    return name, value

1753 

def get_data_from_date(self, node, ignore_month=False):
    """Return the date held by ``node`` as a string.

    The ``iso-8601-date`` attribute is returned as is when present. Otherwise
    the string is built from the <year>, <month> and <day> children as
    "YYYY", "YYYY-MM" or "YYYY-MM-DD". Children other than these three are
    reported in ``self.warnings``.

    :param node: the date node
    :param ignore_month: when True, <month> is silently skipped; the day is
                         then dropped too, since "YYYY-DD" is not a valid date
    :return: the date string ("" when no year is found)
    """
    if "iso-8601-date" in node.attrib:
        return node.attrib["iso-8601-date"]

    year = month = day = ""
    for child in node:
        tag = normalize(child.tag)

        if tag == "year":
            # child.text is None for an empty element
            year = child.text or ""
        elif tag == "month":
            # A month that is deliberately ignored is expected: no warning
            if not ignore_month:
                month = child.text or ""
        elif tag == "day":
            day = child.text or ""
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    date_str = year
    if date_str and month:
        date_str += "-" + month
        # The day is only meaningful once the month is part of the string
        if day:
            date_str += "-" + day

    return date_str

1787 

def get_data_from_ext_link(self, node, **kwargs):
    """Return a link description dict for an <ext-link> node.

    :param node: the <ext-link> node; its ``ext-link-type``, ``href`` and
                 ``base`` attributes are read
    :param kwargs: forwarded to ``parse_inner_node`` with ``add_HTML_link``
                   forced to False
    :return: dict with the keys "rel", "mimetype", "location", "base" and
             "metadata"
    """
    rel = node.get("ext-link-type") or ""
    location = get_normalized_attrib(node, "href") or ""
    base = get_normalized_attrib(node, "base") or ""

    inner_kwargs = {**kwargs, "add_HTML_link": False}
    _, metadata = self.parse_inner_node(node, **inner_kwargs)

    return {
        "rel": rel,
        "mimetype": "",
        "location": location,
        "base": base,
        "metadata": metadata,
    }

1805 

def get_data_from_history(self, node):
    """Return the typed dates found under a <history> node.

    Each child carrying a ``date-type`` attribute yields one
    ``{"type": ..., "date": ...}`` dict, the date string being computed by
    ``get_data_from_date``. Children without that attribute are reported in
    ``self.warnings``.

    :param node: the <history> node
    :return: list of dicts, in document order
    """
    # TODO: transform the returned list in a hash where date-type is the key
    #  => Change database_cmds
    dates = []

    for entry in node:
        kind = entry.attrib.get("date-type")

        if kind is None:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + entry.tag
                }
            )
            continue

        dates.append({"type": kind, "date": self.get_data_from_date(entry)})

    return dates

1827 

def update_data_from_name(self, node, contributor):
    """Copy the name parts found under ``node`` into ``contributor``.

    <given-names>, <surname>, <prefix> and <suffix> children fill the
    "first_name", "last_name", "prefix" and "suffix" keys. Children without
    text are skipped; any other child with text is reported in
    ``self.warnings``.

    :param node: the <name> or <string-name> node
    :param contributor: dict updated in place
    """
    key_by_tag = {
        "given-names": "first_name",
        "surname": "last_name",
        "prefix": "prefix",
        "suffix": "suffix",
    }

    for part in node:
        if part.text is None:
            continue

        key = key_by_tag.get(part.tag)
        if key is not None:
            contributor[key] = part.text
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + part.tag
                }
            )

1849 

def get_data_from_name_alternatives(self, node):
    """Return the index form of a name stored in <name-alternatives>.

    The result is the text of the last <string-name specific-use="index">
    child that has text, or "" when there is none. Children without text are
    skipped; children with text that are not <string-name> are reported in
    ``self.warnings``.

    :param node: the <name-alternatives> node
    :return: the index name, or ""
    """
    index_name = ""

    for alternative in node:
        if alternative.text is None:
            continue

        if alternative.tag != "string-name":
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + alternative.tag
                }
            )
            continue

        if alternative.get("specific-use") == "index":
            index_name = alternative.text

    return index_name

1870 

def get_data_from_uri(self, node, **kwargs):
    """Return a link description dict for a <uri> node.

    :param node: the <uri> node; its ``href`` attribute gives the location
    :param kwargs: forwarded to ``parse_inner_node`` with ``add_HTML_link``
                   forced to False
    :return: dict with the same keys as ``get_data_from_ext_link``; "rel" is
             None and "mimetype"/"base" are empty strings
    """
    location = get_normalized_attrib(node, "href") or ""

    inner_kwargs = {**kwargs, "add_HTML_link": False}
    _, metadata = self.parse_inner_node(node, **inner_kwargs)

    return {
        "rel": None,
        "mimetype": "",
        "location": location,
        "base": "",
        "metadata": metadata,
    }

1880 

def helper_add_link_from_node(self, node, **kwargs):
    """Return the text to emit for a link node.

    The link data is obtained from the ``get_data_from_<tag>`` method that
    matches the node tag (dashes replaced by underscores). When the link has
    no "rel" or its "rel" is "uri", the result is a ``\\href`` command
    (``self.for_tex_file`` is true) or the output of ``make_links_clickable``.
    Otherwise the node text is returned unchanged.

    :param node: the link node
    :param kwargs: forwarded to the ``get_data_from_<tag>`` method
    :return: the text for the link
    """
    label = node.text or ""

    getter_name = "get_data_from_" + normalize(node.tag).replace("-", "_")
    link = getattr(self, getter_name)(node, **kwargs)

    rel = link["rel"]
    if rel and rel != "uri":
        return label

    target = link["location"]
    if not self.for_tex_file:
        return make_links_clickable(target, link["metadata"])

    return "\\href{" + target + "}{" + link["metadata"] + "}"

1894 

def get_list_start_value(self, list_node):
    """Return the number of items that precede ``list_node`` in its chain.

    A list may continue another one through its ``continued-from`` attribute,
    which holds the id of the previous list. The start value is the total
    number of children of all the lists reached by following that chain in
    ``self.tree``.

    The chain stops, without raising, when a referenced id is not found in the
    tree or when an id is met twice (cyclic references).

    :param list_node: the list node
    :return: 0 when the list continues nothing, else the summed child count
    """
    start = 0
    current = list_node

    # Ids already visited, used to break out of cyclic continued-from chains
    seen = set()
    own_id = list_node.get("id")
    if own_id is not None:
        seen.add(own_id)

    while True:
        continued_from = current.get("continued-from")
        if continued_from is None or continued_from in seen:
            break
        seen.add(continued_from)

        # NOTE(review): an id containing a double quote would break this path
        from_node = self.tree.find(f'.//*[@id="{continued_from}"]')
        if from_node is None:
            # Dangling reference: nothing more to add
            break

        start += len(from_node)
        current = from_node

    return start

1905 

1906 

class MathdocPublication(MathdocPublicationData, JatsBase):
    """Publication whose data is read from the children of the ``tree`` keyword argument."""

    def __init__(self, *args, **kwargs):
        """Initialise the base classes, then parse ``kwargs["tree"]``."""
        super().__init__(*args, **kwargs)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """Read ids, titles, ISSNs, links, custom metadata and description.

        Children with an unhandled tag are reported in ``self.warnings``.

        :param tree: the root node of the publication
        """
        super().parse_tree(tree)

        for node in tree:
            tag = normalize(node.tag)

            if tag in ("publication-id", "collection-id"):
                node_type = node.get("publication-id-type")
                # An id without type is accepted as the pid
                if node_type is None or node_type in ("numdam-id", "mathdoc-id"):
                    self.pid = node.text
            elif tag == "title-group":
                self.parse_title_group(node)
            elif tag == "issn":
                node_type = node.get("pub-type")
                if node_type == "ppub":
                    self.issn = node.text
                    self.ids.append(("issn", node.text))
                elif node_type == "epub":
                    self.e_issn = node.text
                    self.ids.append(("e-issn", node.text))
            elif tag == "ext-link":
                self.ext_links.append(self.get_data_from_ext_link(node))
            elif tag == "custom-meta-group":
                self.parse_custom_meta_group(node)
            elif tag == "description":
                self.parse_description(node)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_custom_meta_group(self, node, **kwargs):
        """Read the "serial-type", "wall" and "provider" custom metadata.

        Children other than <custom-meta> are reported in ``self.warnings``.

        :param node: the <custom-meta-group> node
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "custom-meta":
                name, value = self.get_data_from_custom_meta(child)

                if name == "serial-type":
                    self.coltype = value
                elif name == "wall":
                    # Raises if the value is not an integer literal
                    self.wall = int(value)
                elif name == "provider":
                    self.provider = value
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_description(self, node, **kwargs):
        """Append the <description> node to ``self.abstracts``.

        The entry is tagged "description". Its "value_xml" is the XML of the
        node; "value_html" and "value_tex" are that XML without the enclosing
        <description> tags.

        :param node: the <description> node
        """
        tag = "description"
        lang = get_normalized_attrib(node, "lang") or self.lang
        value_xml = get_xml_from_node(node)

        # Strip the opening tag (with any attribute) and the closing tag
        inner = re.sub(r"<description(?:\s[^>]*)?/?>", "", value_xml, count=1)
        value_tex = value_html = inner.replace("</description>", "")

        self.abstracts.append(
            {
                "tag": tag,
                "lang": lang,
                "value_xml": value_xml,
                "value_html": value_html,
                "value_tex": value_tex,
            }
        )

1989 

1990 

class JatsPublisher(PublisherData):
    """Publisher whose name and location are read from a <publisher> node."""

    def __init__(self, *args, **kwargs):
        """Initialise the base class, then parse ``kwargs["tree"]``.

        The warnings collected while parsing are kept in ``self.warnings``.
        """
        super().__init__(*args, **kwargs)
        self.warnings = []
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """Read <publisher-name> and <publisher-loc>.

        Children with another tag are reported in ``self.warnings``.

        :param tree: the <publisher> node
        """
        for node in tree:
            tag = normalize(node.tag)

            if tag == "publisher-name":
                self.name = node.text
            elif tag == "publisher-loc":
                self.loc = node.text
            else:
                # NOTE(review): PublisherData is not visible here; fall back to
                # None in case a publisher has no pid attribute
                self.warnings.append(
                    {
                        getattr(self, "pid", None): self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

2016 

2017 

class JatsJournal(JournalData, JatsBase):
    """Journal whose data is read from the children of the ``tree`` keyword argument."""

    def __init__(self, *args, **kwargs):
        """Initialise the base classes, then parse ``kwargs["tree"]``."""
        super().__init__(*args, **kwargs)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """Read the journal id, titles, publisher and ISSNs.

        Children with an unhandled tag are reported in ``self.warnings``.

        :param tree: the root node of the journal metadata
        """
        super().parse_tree(tree)

        # pub-type -> (attribute holding the ISSN, key used in self.ids)
        issn_targets = {"ppub": ("issn", "issn"), "epub": ("e_issn", "e-issn")}

        for element in tree:
            name = normalize(element.tag)

            if name == "journal-id":
                # A journal-id without type is treated as a numdam-id
                id_kind = element.get("journal-id-type") or "numdam-id"
                if id_kind in ("numdam-id", "mathdoc-id"):
                    self.pid = element.text
                continue

            if name == "journal-title-group":
                self.parse_title_group(element)
                continue

            if name == "publisher":
                self.publisher = JatsPublisher(tree=element)
                continue

            if name == "issn":
                # An issn without pub-type is treated as the print ISSN
                target = issn_targets.get(element.get("pub-type") or "ppub")
                if target is not None:
                    attribute, id_key = target
                    setattr(self, attribute, element.text)
                    self.ids.append((id_key, element.text))
                continue

            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + name
                }
            )

2055 

2056 

class JatsEdito(ArticleData, JatsBase):
    """Editorial whose HTML body is built from the <p> and <h1> children of ``tree``."""

    def __init__(self, *args, **kwargs):
        """Read the optional keyword arguments, then parse ``kwargs["tree"]``.

        Optional keywords: ``pid``, ``issue`` and ``from_folder`` (default
        None); ``add_span_around_tex_formula``, ``for_tex_file`` and
        ``no_bib`` (default False).
        """
        super().__init__(*args, **kwargs)
        self.pid = kwargs.get("pid")
        self.issue = kwargs.get("issue")

        self.add_span_around_tex_formula = kwargs.get("add_span_around_tex_formula", False)
        self.for_tex_file = kwargs.get("for_tex_file", False)
        self.from_folder = kwargs.get("from_folder")
        self.no_bib = kwargs.get("no_bib", False)

        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """Append every non-empty <p> and <h1> child to ``self.body_html``.

        Each child is wrapped in an HTML element of the same name; children
        with another tag, or without text, are ignored.

        :param tree: the root node of the editorial
        :return: the resulting ``self.body_html``
        """
        super().parse_tree(tree)

        for element in tree:
            name = normalize(element.tag)
            if name not in ("p", "h1"):
                continue

            content = get_text_from_node(element)
            if content:
                self.body_html += "<" + name + ">" + content + "</" + name + ">"

        return self.body_html

2090 

2091 

class JatsIssue(IssueData, JatsBase):
    """Issue whose metadata and articles are read from the ``tree`` keyword argument."""

    # Headings starting with one of these prefixes are redundant with article
    # types: they are not kept as subjects
    _REDUNDANT_HEADING_PREFIXES = (
        "editorial",
        "éditorial",
        "avant-propos",
        "book review",
        "comment",
        "concise review paper",
        "answer",
        "commentaire",
        "commentary",
        "reply",
        "foreword",
        "full paper",
        "mémoire",
    )

    def __init__(self, *args, **kwargs):
        """Read the optional keyword arguments, then parse ``kwargs["tree"]``."""
        super().__init__(*args, **kwargs)
        # from_folder is used to change the location of Elsevier graphics to a full path location
        self.from_folder = kwargs.get("from_folder")
        self.no_bib = kwargs.get("no_bib", False)

        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """Parse the journal metadata, the issue metadata and the articles.

        Once the tree is read, editors replicated in the articles are removed
        and, for Elsevier issues, icon locations, article types and subjects
        are fixed. Unhandled tags are reported in ``self.warnings``.

        :param tree: the root node of the issue
        """
        super().parse_tree(tree)

        for node in tree:
            tag = normalize(node.tag)
            if tag == "journal-meta":
                self.journal = JatsJournal(tree=node)
            elif tag == "issue-meta":
                if get_normalized_attrib(node, "issue_type") == "issue_special":
                    self.ctype = "issue_special"
                self.parse_issue_meta(node)
            elif tag == "body":
                for child in node:
                    child_tag = normalize(child.tag)

                    if child_tag == "article":
                        article = JatsArticle(
                            tree=child,
                            issue=self,
                            from_folder=self.from_folder,
                            no_bib=self.no_bib,
                        )
                        self.warnings.extend(article.warnings)
                        self.articles.append(article)
                    else:
                        self.warnings.append(
                            {
                                self.pid: self.__class__.__name__
                                + "."
                                + inspect.currentframe().f_code.co_name
                                + " "
                                + child_tag
                            }
                        )
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

        if self.journal is not None:
            self.publisher = self.journal.publisher

        self._remove_replicated_editors()

        # An article with a "pii" attribute marks the issue as an Elsevier one
        if any(hasattr(xarticle, "pii") for xarticle in self.articles):
            self._fix_icon_locations()
            for xarticle in self.articles:
                self._fix_article_type_and_subjects(xarticle)

    def _remove_replicated_editors(self):
        """Remove the editors of the articles that merely replicate the issue editors."""
        issue_editors = [contrib for contrib in self.contributors if contrib["role"] == "editor"]

        for xarticle in self.articles:
            editors = [contrib for contrib in xarticle.contributors if contrib["role"] == "editor"]

            # Same number of editors, with the same names in the same order
            is_equal = len(editors) == len(issue_editors) and all(
                editor["last_name"] == issue_editor["last_name"]
                and editor["first_name"] == issue_editor["first_name"]
                for editor, issue_editor in zip(editors, issue_editors)
            )
            if is_equal:
                xarticle.contributors = [
                    contrib for contrib in xarticle.contributors if contrib["role"] != "editor"
                ]

    def _fix_icon_locations(self):
        """Prefix the location of the icon links with the journal/issue folders."""
        for link in self.ext_links:
            if link["rel"] in ("icon", "small_icon"):
                base_dir = self.journal.pid
                location = link["location"]
                if os.path.dirname(location) != base_dir:
                    location = os.path.join(base_dir, self.pid, location)
                if self.from_folder:
                    location = os.path.join(self.from_folder, location)
                    location = "file:" + location
                link["location"] = location

    def _fix_article_type_and_subjects(self, xarticle):
        """Set ``xarticle.atype`` and filter ``xarticle.subjs`` from its subjects.

        May also set the title of the article (when stored in a heading) and
        strip the "dossier: " prefix from the issue title.
        """
        article_type = "research-article"
        old_type = ""
        new_subjs = []

        if xarticle.fpage != "":
            try:
                int(xarticle.fpage)
            except ValueError:
                # fpage is not a number: the article is an editorial
                article_type = "editorial"

        if article_type == "research-article":
            for subj in xarticle.subjs:
                if subj["type"] == "type":
                    # Fix article types
                    value = subj["value"].lower()
                    old_type = value
                    if value == "discussion":
                        article_type = "letter"
                    elif value == "editorial":
                        if xarticle.title_tex.lower().startswith("foreword"):
                            article_type = "foreword"
                        else:
                            article_type = "editorial"
                    elif value in ("mini review", "review article", "book review"):
                        article_type = "review"
                    elif value == "research article":
                        article_type = "research-article"
                    elif value == "short communication":
                        article_type = "foreword"
                    elif value == "correspondence":
                        article_type = "letter"
                    elif value.startswith("conference"):
                        article_type = "congress"
                elif subj["type"] == "heading" and not xarticle.title_tex:
                    # The title may be stored in the heading: fix it
                    xarticle.title_tex = xarticle.title_html = subj["value"]
                    xarticle.title_xml = get_title_xml(subj["value"])
                elif subj["type"] == "heading":
                    value = subj["value"].lower().strip()

                    if self.title_tex.lower().startswith("dossier: "):
                        # Drop the 9-character prefix; the XML is rebuilt from
                        # the stripped title so that it keeps the original case
                        self.title_tex = self.title_html = self.title_tex[9:]
                        self.title_xml = (
                            "<issue-title>"
                            + get_single_title_xml(self.title_tex)
                            + "</issue-title>"
                        )

                    # Some heading values are in fact article type
                    if value.startswith("erratum"):
                        article_type = "erratum"
                    elif value.startswith("corrigendum"):
                        article_type = "corrigendum"
                    elif value.startswith("foreword"):
                        article_type = "foreword"
                    elif value.startswith(
                        (
                            "nécrologie",
                            "obituary",
                            "block calendar/éphéméride",
                            "chronique",
                            "histoire",
                            "historic",
                            "tribute/hommage",
                        )
                    ):
                        article_type = "history-of-sciences"
                    elif value.startswith("note historique"):
                        article_type = "historical-commentary"
                    elif value.startswith(
                        ("le point sur", "le point-sur", "review", "revue", "concise review")
                    ):
                        article_type = "review"
                    elif value.startswith("conférence"):
                        article_type = "congress"
                    elif value.startswith(("communication", "preliminary")):
                        article_type = "preliminary-communication"
                    elif value.startswith("perspective") and old_type in (
                        "correspondence",
                        "short communication",
                    ):
                        article_type = "opinion"
                    elif value.startswith("debate"):
                        article_type = "opinion"
                    elif value.startswith(
                        (
                            "index",
                            "keyword",
                            "sommaire",
                            "table auteurs",
                            "table sommaire",
                            "page présentation des index",
                        )
                    ):
                        article_type = "editorial"
                    elif value.startswith("fac-similé"):
                        # crbiol article; PubMed files them as "Classical Article"
                        article_type = "historical-commentary"
                        # Keep the subject in this case to preserve the
                        # "fac-similé" (== copy) mention
                        new_subjs.append(subj)
                    # Ignore the issue titles
                    elif not self.title_tex or not value.startswith(
                        self.title_tex.lower().strip()
                    ):
                        # Exclude headings that are redundant with article types
                        if not value.startswith(self._REDUNDANT_HEADING_PREFIXES):
                            new_subjs.append(subj)
                else:
                    new_subjs.append(subj)

        xarticle.atype = article_type
        xarticle.subjs = new_subjs

    def parse_custom_meta_group(self, node, **kwargs):
        """Read the "provider" and "efirst" custom metadata.

        Children other than <custom-meta> are reported in ``self.warnings``.

        :param node: the <custom-meta-group> node
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "custom-meta":
                name, value = self.get_data_from_custom_meta(child)

                if name == "provider":
                    self.provider = value
                elif name == "efirst":
                    self.with_online_first = value == "yes"
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_issue_meta(self, node, **kwargs):
        """Read the ids, numbering, dates and titles of an <issue-meta> node.

        Tags without a dedicated branch are dispatched to the ``parse_<tag>``
        method when it exists (called with ``add_ext_link=True``); otherwise
        they are reported in ``self.warnings``. When no "last-modified" date
        was found, the current time is used.

        :param node: the <issue-meta> node
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "issue-id":
                self.parse_id(child)
            elif tag == "volume-series":
                self.vseries = child.text
            elif tag == "volume":
                self.volume = child.text
            elif tag == "issue":
                self.number = child.text
            elif tag == "pub-date":
                self.year = self.get_data_from_date(child, ignore_month=True)
            elif tag == "history":
                for date in self.get_data_from_history(child):
                    if date["type"] == "last-modified":
                        self.last_modified_iso_8601_date_str = date["date"]
                    elif date["type"] == "prod-deployed-date":
                        self.prod_deployed_date_iso_8601_date_str = date["date"]
            elif tag == "issue-title":
                content_type = child.get("content-type") or ""
                # Elsevier stores contributors in subtitles. Ignore.
                if content_type not in ("subtitle", "cover-date"):
                    lang = get_normalized_attrib(child, "lang") or "und"
                    if not self.title_tex and (
                        self.lang == "und" or lang == "und" or lang == self.lang
                    ):
                        self.parse_title(child)
                        # In xmldata, title_xml had the <title_group> tag:
                        # self.title_xml can't be set in parse_title
                        self.title_xml += get_xml_from_node(child)
                    else:
                        self.trans_lang = lang
                        (
                            self.trans_title_tex,
                            self.trans_title_html,
                        ) = self.parse_node_with_mixed_content(child)
                        self.title_xml += get_xml_from_node(child)
            elif tag == "issue-title-group":
                self.parse_title_group(child)
            else:
                fct_name = "parse_" + tag.replace("-", "_")
                ftor = getattr(self, fct_name, None)
                if callable(ftor):
                    ftor(child, add_ext_link=True)
                else:
                    self.warnings.append(
                        {
                            self.pid: self.__class__.__name__
                            + "."
                            + inspect.currentframe().f_code.co_name
                            + " "
                            + tag
                        }
                    )

        if self.last_modified_iso_8601_date_str is None:
            self.last_modified_iso_8601_date_str = timezone.now().isoformat()

2412 

2413 

class JatsArticleBase(JatsBase):
    """Base class providing the custom metadata parsing of articles."""

    def parse_custom_meta_group(self, node, **kwargs):
        """Read the custom metadata of an article.

        "article-number", "talk-number" and "provider" are stored in the
        attributes ``article_number``, ``talk_number`` and ``provider``.
        "presented" adds a contributor with the "presenter" role to
        ``self.contributors``. Children other than <custom-meta> are reported
        in ``self.warnings``.

        :param node: the <custom-meta-group> node
        """
        # meta-name -> attribute receiving the meta-value as is
        plain_attributes = {
            "article-number": "article_number",
            "talk-number": "talk_number",
            "provider": "provider",
        }

        for entry in node:
            entry_tag = normalize(entry.tag)

            if entry_tag != "custom-meta":
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + entry_tag
                    }
                )
                continue

            name, value = self.get_data_from_custom_meta(entry)

            if name in plain_attributes:
                setattr(self, plain_attributes[name], value)
            elif name == "presented":
                presenter = create_contributor()
                presenter["role"] = "presenter"
                # Keep only the name of the presenter
                without_english = value.replace("Presented by ", "")
                presenter["string_name"] = without_english.replace("Présenté par ", "")
                presenter["contrib_xml"] = get_contrib_xml(presenter)
                self.contributors.append(presenter)

2447 

2448 

2449class JatsArticle(ArticleData, JatsArticleBase): 

def __init__(self, *args, **kwargs):
    """Read the optional keyword arguments, then parse ``kwargs["tree"]``.

    Optional keywords: ``pid``, ``issue`` and ``from_folder`` (default None);
    ``add_span_around_tex_formula``, ``for_tex_file`` and ``no_bib``
    (default False). The ``tree`` keyword is required.
    """
    super().__init__(*args, **kwargs)
    self.pid = kwargs.get("pid")
    self.issue = kwargs.get("issue")

    self.add_span_around_tex_formula = kwargs.get("add_span_around_tex_formula", False)
    self.for_tex_file = kwargs.get("for_tex_file", False)
    self.from_folder = kwargs.get("from_folder")
    self.no_bib = kwargs.get("no_bib", False)

    self.parse_tree(kwargs["tree"])

2465 

def parse_tree(self, tree):
    """
    Parse the XML tree of an article and set the instance variables.

    The children of ``tree`` are walked twice:

    1. <front>, <front-stub> and <floats-group> are parsed first
       (<front> has to be parsed before <body> so that self.pid is defined).
    2. <back>, <body> and <sub-article> are parsed next.

    Tags that are not handled are recorded in ``self.warnings`` as
    ``{pid: "<ClassName>.parse_tree <tag>"}``.

    After the 2 passes:
    - the footnotes collected in ``self.fns`` are appended to ``self.body_html``
    - ``self.funding_statement_xml`` is wrapped in a <name-content content-type="fn">
      if it does not already contain one
    - the XML of the <floats-group> (if any) is appended to ``self.body_xml``
    - ``self.post_parse_tree()`` is called
    """
    super().parse_tree(tree)

    self.atype = get_normalized_attrib(tree, "article-type") or ""

    # Prefix of every warning message emitted by this function.
    # self.pid is read when the warning is created: it may be set while parsing <front>.
    origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name + " "

    # Tags fully handled during the first pass: the second pass must ignore them silently.
    first_pass_tags = ("front", "front-stub", "floats-group")

    # Children of <back> that are simply forwarded to a parse_<tag> function.
    # The function is looked up only when the tag is met.
    back_parsers = {
        "ack": "parse_ack",
        "sec": "parse_sec",
        "app-group": "parse_app_group",
        "fn-group": "parse_fn_group",
    }

    # First loop to catch float-groups that are inserted inside the body
    for node in tree:
        tag = normalize(node.tag)

        if tag == "front":
            for child in node:
                child_tag = normalize(child.tag)

                if child_tag == "article-meta":
                    self.parse_article_meta(child)
                else:
                    self.warnings.append({self.pid: origin + child_tag})
        elif tag == "front-stub":
            self.parse_article_meta(node)
        elif tag == "floats-group":
            self.parse_floats_group(node)

    for node in tree:
        tag = normalize(node.tag)

        if tag == "back":
            for child in node:
                child_tag = normalize(child.tag)

                if child_tag == "ref-list" and not self.no_bib:
                    self.parse_ref_list(child)
                elif child_tag in back_parsers:
                    getattr(self, back_parsers[child_tag])(child)
                else:
                    self.warnings.append({self.pid: origin + child_tag})
        elif tag == "body":
            self.parse_body(node)
        elif tag == "sub-article":
            self.parse_sub_article(node)
        elif tag in first_pass_tags:
            # Handled above
            pass
        else:
            self.warnings.append({self.pid: origin + tag})

    # Add the footnotes at the end
    if self.fns:
        fn_text = '<div class="footnotes">' + "".join(self.fns) + "</div>"
        self.body_html = fn_text if not self.body_html else self.body_html + fn_text

    if (
        self.funding_statement_xml
        and '<name-content content-type="fn"' not in self.funding_statement_xml
    ):
        self.funding_statement_xml = (
            f'<name-content content-type="fn">{self.funding_statement_xml}</name-content>'
        )

    # Case for XML with <body>, then <back> and <floats_group>
    # The figures/tables of the floats_group are added inside the body_html
    # (close to their first <xref>)
    # It's too complicated to do the same for the body_xml as we use the get_xml_from_node function.
    # Instead, we append the floats_group_xml to the body_xml
    if hasattr(self, "floats_group_xml"):
        self.body_xml += self.floats_group_xml

    # Special treatment for Elsevier articles (web scraping to find the date_published):
    # moved to the import management commands since Elsevier blocks IP after 1000+ requests

    self.post_parse_tree()

2574 

def update_body_content(self, node, **kwargs):
    """
    Parse a node with mixed content and append the result to
    self.body_tex, self.body_html and self.body_xml.

    If ``use_sec`` is set in kwargs, the node is parsed with parse_node_with_sec
    and its XML is wrapped in a <sec> when appended to an existing body_xml.
    Nothing is done for a node without children.
    """
    if len(node) == 0:
        # Most journals do not display the Full text
        # the <body> is then used to store the text for the search engine and has no children
        # Let's not compute body_html in this case.
        # We want the same behavior for journals that display the Full text,
        # but with old articles without Full text.
        return

    # <front> has to be put before <body> so self.pid is defined here
    if hasattr(settings, "SITE_URL_PREFIX"):
        base_url = "/" + settings.SITE_URL_PREFIX + settings.ARTICLE_BASE_URL + self.pid
    else:
        base_url = os.path.join(settings.ARTICLE_BASE_URL, self.pid)
    kwargs["base_url"] = base_url

    nb_supplements_before = len(self.supplementary_materials)
    use_sec = bool(kwargs.get("use_sec"))

    if use_sec:
        # Hack for Elsevier: convert <ack> into <sec> of the <body>
        body_tex, body_html = self.parse_node_with_sec(node, **kwargs)
    else:
        body_tex, body_html = self.parse_node_with_mixed_content(node, **kwargs)

    if len(self.supplementary_materials) == nb_supplements_before:
        append_to_body = True
    else:
        # Elsevier stores supplementary-material in app-group.
        # They are extracted, but ignored in the body_html if the appendix has only supplements
        append_to_body = any(
            gchild.tag != "supplementary-material"
            for child in node
            if child.tag == "p"
            for gchild in child
        )

    if not append_to_body:
        return

    self.body_tex = self.body_tex + body_tex if self.body_tex else body_tex
    self.body_html = self.body_html + body_html if self.body_html else body_html

    body_xml = get_xml_from_node(node)
    if not self.body_xml:
        self.body_xml = body_xml
    elif use_sec:
        # Drop the trailing "</body>" (7 chars) and the 5/6 chars that open/close the node
        self.body_xml = f"{self.body_xml[0:-7]}<sec>{body_xml[5:-6]}</sec></body>"
    else:
        self.body_xml = f"{self.body_xml[0:-7]}{body_xml}</body>"

2625 

def parse_ack(self, node, **kwargs):
    """
    Parse an <ack> node.

    An <ack content-type="COI-statement"> sets self.coi_statement.
    Any other <ack> is added to the body content.
    """
    if node.get("content-type") == "COI-statement":
        self.coi_statement = get_text_from_node(node)
        return

    # Hack for Elsevier: convert <ack> into <sec> of the <body>
    self.update_body_content(node, use_sec=True)

2633 

def parse_app(self, node, **kwargs):
    """
    Parse an <app> node: each <sec> child is added to the body content.
    Other children are recorded in self.warnings.
    """
    for child in node:
        tag = normalize(child.tag)

        if tag != "sec":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        # Elsevier can store all appendixes inside one <app> ?!?
        # One of them can store the supplements and has to be ignored in the body_html
        self.update_body_content(child)

2652 

def parse_app_group(self, node, **kwargs):
    """
    Parse an <app-group> node: each <app> child is forwarded to parse_app.
    Other children are recorded in self.warnings.
    """
    for child in node:
        tag = normalize(child.tag)

        if tag != "app":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        self.parse_app(child)

2669 

def parse_article_categories(self, node, **kwargs):
    """
    Parse an <article-categories> node: each <subj-group> child is forwarded
    to parse_subj_group. Other children are recorded in self.warnings.
    """
    for child in node:
        tag = normalize(child.tag)

        if tag != "subj-group":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        self.parse_subj_group(child)

2686 

def parse_article_meta(self, node, **kwargs):
    """
    Parse an <article-meta> (or <front-stub>) node and set the article metadata.

    Tags without a dedicated branch are dispatched to the parse_<tag> function
    of the instance (called with add_ext_link=True), if any.
    Otherwise the tag is recorded in self.warnings.
    """
    # Tags whose text is copied as is in an instance variable
    text_attributes = {
        "page-range": "page_range",
        "page-count": "size",
        "size": "size",
        "elocation-id": "elocation",
    }
    # TODO: store permissions in XML
    # 2022/11/15 Mersenne meeting. ignore author-notes
    ignored_tags = (
        "volume",
        "issue-id",
        "permissions",
        "pub-date-not-available",
        "author-notes",
    )

    for child in node:
        tag = normalize(child.tag)

        if tag == "article-id":
            self.parse_id(child)
            continue

        if tag == "fpage":
            self.fpage = child.text
            self.page_type = child.get("content-type") or ""
            continue

        if tag == "lpage":
            self.lpage = child.text or ""
            continue

        if tag in text_attributes:
            setattr(self, text_attributes[tag], child.text)
            continue

        if tag == "pub-date":
            date_type = child.get("date-type") or "pub"
            date_str = self.get_data_from_date(child)
            if date_type == "pub":
                self.date_published_iso_8601_date_str = date_str
            else:
                # Any other date-type is stored as an "online" history date
                self.history_dates.append({"type": "online", "date": date_str})
            continue

        if tag == "history":
            self.history_dates += self.get_data_from_history(child)
            for history_date in self.history_dates:
                if history_date["type"] == "prod-deployed-date":
                    self.prod_deployed_date_iso_8601_date_str = history_date["date"]
            continue

        if tag in ignored_tags:
            continue

        # Automatic call of parse_<tag>
        parser = getattr(self, "parse_" + tag.replace("-", "_"), None)
        if callable(parser):
            parser(child, add_ext_link=True)
        else:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})

2738 

def parse_author_notes(self, node, **kwargs):
    """
    Parse an <author-notes> node: the HTML and XML of each <fn> child are appended
    to self.footnotes_html and self.footnotes_xml. Other children are ignored.
    """
    for child in node:
        if normalize(child.tag) != "fn":
            continue

        _, fn_html = self.parse_node_with_fn(child, keep_fn=True, keep_fn_label=False)
        fn_xml = get_xml_from_node(child)

        self.footnotes_xml += fn_xml
        self.footnotes_html += fn_html

2747 

def parse_body(self, node, **kwargs):
    """
    Parse the <body> node.

    self.body gets the text of the node.
    The tex/html/xml versions of the body are computed by update_body_content;
    self.body_xml falls back to the XML of the node if it is still empty afterwards.
    """
    self.body = get_text_from_node(node)

    # self.floats exists only if a <floats-group> has been parsed
    if hasattr(self, "floats"):
        self.floats_to_insert = []

    self.update_body_content(node, **kwargs)

    if self.body_xml:
        return
    self.body_xml = get_xml_from_node(node)

2758 

def parse_boxed_text(self, node, **kwargs):
    """
    Parse a <boxed-text> and store its HTML in the self.floats dictionary,
    using the id attribute of the node as the key.
    A <boxed-text> without id is parsed, but its HTML is not stored.
    """
    box_id = node.attrib.get("id")

    _, box_html = self.parse_node_with_boxed_text(node, **kwargs)

    if box_id is None:
        return
    self.floats[box_id] = box_html

2770 

def parse_floats_group(self, node, **kwargs):
    """
    Parse a <floats-group> node.

    self.floats is reset to an empty dictionary, then the <fig>, <table-wrap>
    and <boxed-text> children are parsed. Other children are recorded in self.warnings.
    The XML of the whole node is stored in self.floats_group_xml.
    """
    if hasattr(settings, "SITE_URL_PREFIX"):
        base_url = "/" + settings.SITE_URL_PREFIX + settings.ARTICLE_BASE_URL + self.pid
    else:
        base_url = os.path.join(settings.ARTICLE_BASE_URL, self.pid)

    self.floats = {}

    for child in node:
        tag = normalize(child.tag)

        if tag == "fig":
            self.parse_node_with_fig(child, append_floats=True, base_url=base_url)
            continue
        if tag == "table-wrap":
            self.parse_node_with_table_wrap(child, append_floats=True, base_url=base_url)
            continue
        if tag == "boxed-text":
            self.parse_boxed_text(child, base_url=base_url)
            continue

        origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
        self.warnings.append({self.pid: origin + " " + tag})

    self.floats_group_xml = get_xml_from_node(node)

2801 

def parse_fn_group(self, node, **kwargs):
    """
    Parse a <fn-group> node: the HTML and XML of each <fn> child are appended
    to self.footnotes_html and self.footnotes_xml.
    Other children are recorded in self.warnings.
    """
    for child in node:
        tag = normalize(child.tag)

        if tag != "fn":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        _, fn_html = self.parse_node_with_fn(child, keep_fn=True)
        fn_xml = get_xml_from_node(child)

        self.footnotes_html += fn_html
        self.footnotes_xml += fn_xml

2822 

def parse_funding_group(self, node, **kwargs):
    """
    Parse a <funding-group> node.

    <award-group> children are forwarded to parse_award_group.
    For a <funding-statement>, only the <name-content> children are used:
    the HTML of their <fn> is appended to self.funding_statement_html and
    the XML of the <name-content> is stored in self.funding_statement_xml.
    Other children are recorded in self.warnings.
    """
    for child in node:
        tag = normalize(child.tag)

        if tag == "award-group":
            self.parse_award_group(child)
            continue

        if tag != "funding-statement":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        for funding_node in child:
            if funding_node.tag != "name-content":
                continue

            for funding_child in funding_node:
                if funding_child.tag == "fn":
                    _, fn_html = self.parse_node_with_fn(funding_child, keep_fn=True)
                    self.funding_statement_html += fn_html
            self.funding_statement_xml = get_xml_from_node(funding_node)

        # TODO: handle funding-statement with simple texts

2849 

def parse_issue(self, node, **kwargs):
    """
    Set self.seq from the seq attribute of the <issue> node ("0" if there is none).
    The attribute is ignored (self.seq is "0") when the instance has a pii.
    """
    if hasattr(self, "pii"):
        # Elsevier stores bs in the seq attribute
        self.seq = "0"
    else:
        self.seq = node.get("seq") or "0"

2853 

2854 

class JatsRef(RefBase, JatsBase):
    """
    Bibliographic reference built from a <ref> node.
    The tree is parsed in the constructor.
    """

    def __init__(self, *args, tree, lang="und", **kwargs):
        super().__init__(*args, lang=lang, **kwargs)
        self.parse_tree(tree)

    def parse_tree(self, tree):
        """
        Parse the <ref> tree: label, then <mixed-citation>/<note> or <element-citation>.
        The XML of every child is appended to self.citation_xml.
        """
        super().parse_tree(tree)

        self.user_id = get_normalized_attrib(tree, "id") or ""

        for node in tree:
            tag = normalize(node.tag)

            if tag == "label":
                self.label = node.text or ""

                # Labels are stored with their brackets: "1" becomes "[1]"
                if self.label and self.label[0] != "[":
                    self.label = "[" + self.label + "]"

            elif tag in ("mixed-citation", "note"):
                self.parse_citation_node(node)

                self.citation_tex, self.citation_html = self.parse_node_with_mixed_content(
                    node,
                    is_citation=True,
                    is_mixed_citation=True,
                    add_ext_link=True,
                    ref_type="misc",
                )

                if self.label:
                    self.citation_html = self.label + " " + self.citation_html
                    self.citation_tex = self.label + " " + self.citation_tex

            elif tag == "element-citation":
                self.parse_citation_node(node)

                self.citation_tex = self.citation_html = get_citation_html(self)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

            # With xmldata, citation_xml does not have '<ref>', but only the text of the children
            self.citation_xml += get_xml_from_node(node)

    def get_data_from_name_in_ref(self, node, role):
        """
        Return the contributor dict (see create_contributor) of a
        <name>, <string-name>, <name-alternatives>, <collab> or <etal> node.
        """
        params = create_contributor()
        params["role"] = role

        if node.tag == "name":
            self.update_data_from_name(node, params)
        elif node.tag == "string-name":
            self.update_data_from_name(node, params)
            # <string-name> without <given-names>/<surname>: keep the raw text
            if params["first_name"] == "" and params["last_name"] == "":
                params["string_name"] = node.text or ""
        elif node.tag == "name-alternatives":
            params["mid"] = self.get_data_from_name_alternatives(node)
        elif node.tag == "collab":
            params["string_name"] = node.text or ""

        use_initials = getattr(settings, "REF_JEP_STYLE", False)
        helper_update_name_params(params, use_initials)
        params["contrib_xml"] = "<etal/>" if node.tag == "etal" else get_xml_from_node(node)

        return params

    def parse_node_with_chapter_title(self, node, **kwargs):
        """
        Return the (tex, html) of a <chapter-title>.
        In a mixed-citation, a span class is added to the HTML.
        """
        tex, html = self.parse_inner_node(node, **kwargs)

        if kwargs.get("is_mixed_citation", False):
            html = add_span_class_to_html_from_chapter_title(html, **kwargs)

        return tex, html

    def parse_node_with_source(self, node, **kwargs):
        """
        Return the (tex, html) of a <source>.
        In a mixed-citation, a span class is added to the HTML.
        """
        tex, html = self.parse_inner_node(node, **kwargs)

        if kwargs.get("is_mixed_citation", False):
            html = add_span_class_to_html_from_source(html, **kwargs)

        return tex, html

    def parse_citation_node(self, node, **kwargs):
        """
        Extract the structured data of a citation node
        (<mixed-citation>, <note> or <element-citation>) and set the instance variables.

        Once a <comment> has been met, tags that would overwrite a value that is
        already set are appended to self.comment instead.
        """
        self.type = get_normalized_attrib(node, "publication-type") or "misc"

        # Elsevier can store data about a translation after comments (<source>...)
        # Append these tags in the comment
        has_comment = False

        for child in node:
            tag = normalize(child.tag)

            if tag in ("page-count", "size"):
                if not self.size:
                    self.size = child.text
            elif tag == "comment":
                has_comment = True
                # comments may have ext-links or uri. HTML <a> links will be added
                _, comment = self.parse_node_with_mixed_content(
                    child, is_citation=True, is_comment=True, add_HTML_link=True
                )
                if self.comment:
                    self.comment += " "
                self.comment += comment
            elif tag == "source":
                # TODO: migration to store source_tex and source_html
                _, source_tex = self.parse_node_with_mixed_content(child, is_citation=True)

                if self.type in ["book", "inproceedings"] and len(self.source_tex) > 0:
                    # Multiple source for a book, store the extra source in series
                    if self.series and has_comment:
                        self.comment += " " + source_tex
                    else:
                        if self.series:
                            self.series += ", "
                        self.series += get_text_from_node(child)
                else:
                    if self.source_tex and has_comment:
                        self.comment += " " + source_tex
                    else:
                        self.source_tex = source_tex
            elif tag == "series":
                series = get_text_from_node(child)
                if self.series and has_comment:
                    self.comment += ", " + series
                else:
                    if self.series:
                        self.series += ", "
                    self.series += series
            elif tag == "annotation":
                if not self.annotation:
                    self.annotation = get_text_from_node(child)
            elif tag == "article-title":
                # TODO: migration to store article_title_tex and article_title_html
                _, article_title_tex = self.parse_node_with_mixed_content(child, is_citation=True)

                if self.type == "book":
                    # Elsevier uses article-title for books !?!
                    if len(self.source_tex) == 0:
                        if has_comment:
                            self.comment += " " + article_title_tex
                        else:
                            self.source_tex = article_title_tex
                    else:
                        if self.series and has_comment:
                            self.comment += ", " + article_title_tex
                        else:
                            self.series += get_text_from_node(child)
                elif self.type == "inproceedings":
                    if self.chapter_title_tex and has_comment:
                        self.comment += " " + article_title_tex
                    else:
                        self.chapter_title_tex = article_title_tex
                else:
                    if self.article_title_tex and has_comment:
                        self.comment += " " + article_title_tex
                    else:
                        self.article_title_tex = article_title_tex
            elif tag == "chapter-title":
                # TODO: migration to store chapter_title_tex and chapter_title_html
                _, chapter_title_tex = self.parse_node_with_mixed_content(child, is_citation=True)
                if self.chapter_title_tex and has_comment:
                    self.comment += " " + chapter_title_tex
                else:
                    self.chapter_title_tex = chapter_title_tex
            elif tag == "conf-name":
                _, conf_tex = self.parse_node_with_mixed_content(child, is_citation=True)
                if self.source_tex and has_comment:
                    self.comment += ", " + conf_tex
                else:
                    self.source_tex = conf_tex
            elif tag in ("name", "string-name", "name-alternatives", "etal", "collab"):
                params = self.get_data_from_name_in_ref(child, "author")
                self.contributors.append(params)
            elif tag == "person-group":
                self.parse_person_group(child)
            elif tag == "ext-link":
                self.parse_ext_link(child, add_ext_link=True)
            elif tag == "pub-id":
                self.parse_pub_id(child)
            elif tag == "date":
                self.year = get_text_from_node(child)
            elif tag == "date-in-citation":
                date_ = child.get("iso-8601-date") or ""
                if date_:
                    if self.comment:
                        self.comment += ", "
                    self.comment += "Accessed " + date_
            elif tag in ("isbn", "issn"):
                # child.text is None for an empty <isbn/> or <issn/>:
                # skip it instead of failing on the string concatenation
                if child.text:
                    if self.annotation:
                        self.annotation += ", "
                    # "ISBN: ..." or "ISSN: ..."
                    self.annotation += tag.upper() + ": " + child.text
            elif child.text is not None:
                # Generic case: the tag gives the name of the instance variable
                variable_name = tag.replace("-", "_")
                if has_comment and hasattr(self, variable_name) and getattr(self, variable_name):
                    if tag == "fpage":
                        self.comment += ", pp. "
                    elif tag == "lpage":
                        self.comment += "-"
                    else:
                        self.comment += ", "
                    self.comment += child.text
                elif not hasattr(self, variable_name) or not getattr(self, variable_name):
                    setattr(self, variable_name, child.text)

    def parse_person_group(self, node, **kwargs):
        """
        Parse a <person-group>: each name becomes a contributor whose role is the
        person-group-type attribute without its trailing "s".
        """
        role = node.get("person-group-type") or ""
        if role.endswith("s"):
            role = role[:-1]

        for child in node:
            tag = normalize(child.tag)

            if tag in ("name", "string-name", "name-alternatives", "etal", "collab"):
                contrib = self.get_data_from_name_in_ref(child, role)
                self.contributors.append(contrib)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_pub_id(self, node, **kwargs):
        """
        Parse a <pub-id>: the pub-id-type attribute and the text of the node
        are passed to add_extids_from_node_with_link.
        """
        node_type = node.get("pub-id-type") or ""

        data: ExtLinkDict = {
            "rel": node_type,
            "mimetype": "",
            "location": "",
            "base": "",
            "metadata": node.text,
        }

        self.add_extids_from_node_with_link(data)

    def split_label(self):
        """
        Used when sorting non-digit bibitems
        """
        label = self.label.lower()
        if len(label) > 1:
            # Remove the first and last characters (the brackets added in parse_tree)
            label = label[1:-1]

        try:
            # The unpacking raises a ValueError unless the label has exactly 1 group of digits
            self.label_prefix, self.label_suffix = re.split(r"[\d]+", label)
        except ValueError:
            # Special case where label is similar as "Sma" instead of "Sma15"
            self.label_prefix, self.label_suffix = [label, ""]

3122 

3123 

class BitsCollection(CollectionData, JatsBase):
    """
    Collection built from a <collection-meta> or an <in-collection> node.
    The tree is parsed in the constructor.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Parse a <collection-meta> node, or an <in-collection> node
        (<collection-meta>, <volume>, <volume-series>, <volume-title> children).
        self.seq is computed if a <collection-meta> was found.
        """
        super().parse_tree(tree)

        if tree is not None:
            tag = normalize(tree.tag)
            collection_meta_node = None

            if tag == "collection-meta":
                self.parse_collection_meta(tree)
                collection_meta_node = tree
            elif tag == "in-collection":
                for node in tree:
                    tag = normalize(node.tag)

                    if tag == "collection-meta":
                        self.parse_collection_meta(node)
                        collection_meta_node = node
                    elif tag == "volume":
                        self.parse_volume(node)
                    elif tag == "volume-series":
                        self.parse_volume_series(node)
                    elif tag == "volume-title":
                        self.parse_volume_title(node)
                    else:
                        self.warnings.append(
                            {
                                self.pid: self.__class__.__name__
                                + "."
                                + inspect.currentframe().f_code.co_name
                                + " "
                                + tag
                            }
                        )

            if collection_meta_node is not None:
                self.set_seq(collection_meta_node)
            else:
                # No <collection-meta>: the warning holds the last tag that was read
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

        self.collection = Foo()
        self.collection.pid = self.pid

    def parse_collection_meta(self, node, **kwargs):
        """
        Parse a <collection-meta> node: collection type, pid, title, issn/e-issn,
        ext-links and volume data. Other children are recorded in self.warnings.
        """
        self.coltype = node.get("collection-type")

        for child in node:
            tag = normalize(child.tag)

            if tag == "collection-id":
                self.pid = child.text
            elif tag == "title-group":
                self.parse_title_group(child)
            elif tag == "issn":
                # <issn> with another pub-type (or without pub-type) are ignored
                node_type = child.get("pub-type")
                if node_type == "ppub":
                    self.issn = child.text
                    self.ids.append(("issn", child.text))
                elif node_type == "epub":
                    self.e_issn = child.text
                    self.ids.append(("e-issn", child.text))
            elif tag == "ext-link":
                data = self.get_data_from_ext_link(child)
                self.ext_links.append(data)
            elif tag == "volume-in-collection":
                self.parse_volume_in_collection(child)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_volume(self, node, **kwargs):
        """Set self.volume with the text of the node."""
        self.volume = node.text

    def parse_volume_in_collection(self, node, **kwargs):
        """
        Parse a <volume-in-collection> node
        (<volume-number>, <volume-series>, <volume-title> children).
        Other children are recorded in self.warnings.
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "volume-number":
                self.parse_volume(child)
            elif tag == "volume-series":
                self.parse_volume_series(child)
            elif tag == "volume-title":
                self.parse_volume_title(child)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_volume_series(self, node, **kwargs):
        """Set self.vseries with the text of the node."""
        self.vseries = node.text

    def parse_volume_title(self, node, **kwargs):
        """Set the tex, html and xml versions of the title from the node."""
        self.title_tex, self.title_html = self.parse_node_with_mixed_content(node)
        self.title_xml = get_xml_from_node(node)

    def set_seq(self, node):
        """
        Compute self.seq (an int).

        The seq attribute of the node is used if it is an integer.
        Otherwise self.seq is based on self.volume, with self.vseries as an offset.
        """
        try:
            # First, use the seq attribute, if any
            self.seq = int(node.get("seq") or "")
        except ValueError:
            # Second, use self.volume (which can be like "158-159")
            if not self.volume:
                self.seq = 0
            else:
                text = self.volume.split("-")[0]
                try:
                    self.seq = int(text)
                except ValueError:
                    self.seq = 0

            # Third, use self.vseries as an offset
            try:
                # no more than 10000 books in a series (gasp)
                self.seq = int(self.vseries) * 10000 + self.seq
            except (TypeError, ValueError):
                # vseries is not a number, or is None (ex: empty <volume-series/>): no offset
                pass

3264 

3265 

3266class BitsBook(BookData, JatsBase): 

def __init__(self, *args, **kwargs):
    """
    Build the book from kwargs["tree"]. The tree is parsed in the constructor.
    kwargs["no_bib"] (False by default) is stored in self.no_bib.
    """
    super().__init__(*args, **kwargs)

    # no_bib has to be set before the tree is parsed: it is forwarded to the book parts
    self.no_bib = kwargs.get("no_bib", False)
    book_tree = kwargs["tree"]
    self.parse_tree(book_tree)

3272 

def parse_tree(self, tree):
    """
    Parse the <book> tree: collections, <book-meta>, <book-body>, <front-matter>
    and the <ref-list> of <book-back>. Other tags are recorded in self.warnings.
    Contributors and title are set at the end, then post_parse_tree is called.
    """
    super().parse_tree(tree)

    self.ctype = "book-" + (get_normalized_attrib(tree, "book-type") or "Book")

    for node in tree:
        # Only the children of the same type as the tree itself are parsed
        if type(tree) != type(node):
            continue

        tag = normalize(node.tag)

        if tag in ("collection-meta", "in-collection"):
            self.incollection.append(BitsCollection(tree=node))
        elif tag == "book-meta":
            self.parse_book_meta(node)
        elif tag == "book-body":
            self.parse_book_body(node)
        elif tag == "front-matter":
            self.parse_front_matter(node)
        elif tag == "book-back":
            for child in node:
                tag = normalize(child.tag)

                if tag == "ref-list":
                    self.parse_ref_list(child)
                    continue

                origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                self.warnings.append({self.pid: origin + " " + tag})
        else:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})

    self.set_contribs()
    self.set_title()
    self.post_parse_tree()

3321 

def parse_book_body(self, node, **kwargs):
    """
    Parse the <book-body> node: a BitsBookPart is created for each <book-part> child
    (its warnings are added to self.warnings). Other tags are recorded in self.warnings.
    Without any part, self.body gets the text of the node.
    """
    for child in node:
        # Only the children of the same type as the node itself are parsed
        if type(child) != type(node):
            continue

        tag = normalize(child.tag)

        if tag != "book-part":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        part = BitsBookPart(tree=child, no_bib=self.no_bib)
        self.warnings.extend(part.warnings)
        self.parts.append(part)

    if not self.parts:
        self.body = get_text_from_node(node)

3344 

def parse_book_meta(self, node, **kwargs):
    """
    Parse the <book-meta> node and set the book metadata.

    Tags without a dedicated branch are dispatched to the parse_<tag> function
    of the instance (called with add_ext_link=True), if any.
    Otherwise the tag is recorded in self.warnings.
    If no last-modified date is set after the loop, the current date is used.
    """
    # <pub-history> date types and the instance variables they set
    history_attributes = {
        "last-modified": "last_modified_iso_8601_date_str",
        "prod-deployed-date": "prod_deployed_date_iso_8601_date_str",
    }

    for child in node:
        tag = normalize(child.tag)

        if tag == "book-id":
            self.parse_id(child)
        elif tag == "pub-date":
            self.year = self.get_data_from_date(child)
        elif tag == "book-volume-number":
            self.volume = child.text
            self.volume_int = child.text
        elif tag == "pub-history":
            for history_date in self.get_data_from_history(child):
                attribute = history_attributes.get(history_date["type"])
                if attribute is not None:
                    setattr(self, attribute, history_date["date"])
        elif tag == "book-title-group":
            self.parse_title_group(child)
        elif tag == "publisher":
            self.publisher = JatsPublisher(tree=child)
        else:
            # Automatic call of parse_<tag>
            parser = getattr(self, "parse_" + tag.replace("-", "_"), None)
            if callable(parser):
                parser(child, add_ext_link=True)
            else:
                origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                self.warnings.append({self.pid: origin + " " + tag})

    if self.last_modified_iso_8601_date_str is None:
        self.last_modified_iso_8601_date_str = timezone.now().isoformat()

3385 

def parse_custom_meta_group(self, node, **kwargs):
    """
    Parse a <custom-meta-group> node.

    Only <custom-meta> children are read; the one named "provider"
    sets self.provider. Everything else is ignored without warning.

    :param node: the custom-meta-group node
    :param kwargs: accepted for signature compatibility, not used here
    :return:
    """

    for child in node:
        if normalize(child.tag) != "custom-meta":
            continue

        name, value = self.get_data_from_custom_meta(child)
        if name == "provider":
            self.provider = value

3395 

def set_contribs(self):
    """
    Fill self.contributors when the book itself declares no author.

    - "book-monograph" with parts: reuse the contributors of the first part
    - "book-edited-book" / "book-lecture-notes" with parts:
        - all parts share the same contributors: reuse them
          (an edited book is then retyped as "book-monograph")
        - otherwise: append a generic "Collectif" author
    Nothing happens in any other case.

    :return:
    """

    roles = [contrib["role"] for contrib in self.contributors]
    if "author" in roles:
        return

    if self.ctype == "book-monograph" and self.parts:
        self.contributors = self.parts[0].contributors
        return

    if self.ctype not in ("book-edited-book", "book-lecture-notes") or not self.parts:
        return

    # Compare every following part with the first one; stop at the first mismatch
    reference = self.parts[0].contributors
    if any(part.contributors != reference for part in self.parts[1:]):
        collective = create_contributor()
        collective["string_name"] = "Collectif"
        collective["role"] = "author"
        collective["contrib_xml"] = get_contrib_xml(collective)
        self.contributors.append(collective)
        return

    if self.ctype == "book-edited-book":
        self.ctype = "book-monograph"
    self.contributors = reference

3433 

def set_title(self):
    """
    Fall back on the title of the first collection when the book has no title_xml.

    :return:
    """

    if self.title_xml != "" or len(self.incollection) == 0:
        return

    first_collection = self.incollection[0]
    self.title_xml = first_collection.title_xml
    self.title_html = first_collection.title_html
    self.title_tex = first_collection.title_tex

3439 

3440 

class BitsBookPart(BookPartData, JatsArticleBase):
    """
    A part of a BITS book (created for <book-part> nodes).

    The xml tree passed as the "tree" keyword argument is parsed in the constructor.
    Book parts can be nested: a <book-part> found in the body creates a new BitsBookPart.
    """

    def __init__(self, *args, **kwargs):
        """
        :param kwargs: "tree" is mandatory (the node to parse);
                       "no_bib" is optional (False by default) and is passed on to nested parts
        """
        super().__init__(*args, **kwargs)
        self.no_bib = kwargs.get("no_bib", False)

        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Parse the book-part node: its attributes, then its children.

        Children that are not book-part-meta, body, front-matter or back,
        and children of <back> that are not ref-list, are reported in self.warnings.

        :param tree: the book-part node
        :return:
        """
        super().parse_tree(tree)

        self.atype = get_normalized_attrib(tree, "book-part-type") or ""
        try:
            self.seq = int(get_normalized_attrib(tree, "seq") or "")
        except ValueError:
            # Missing or non numeric "seq" attribute: self.seq is left untouched
            pass

        # Tags handled by a single method. The method is looked up by name
        # only when the tag is met.
        simple_handlers = {
            "book-part-meta": "parse_book_part_meta",
            "body": "parse_body",
            "front-matter": "parse_front_matter",
        }

        for node in tree:
            tag = normalize(node.tag)

            handler_name = simple_handlers.get(tag)
            if handler_name is not None:
                getattr(self, handler_name)(node)
                continue

            if tag != "back":
                origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                self.warnings.append({self.pid: origin + " " + tag})
                continue

            # <back>: only the reference lists are parsed
            for child in node:
                child_tag = normalize(child.tag)

                if child_tag == "ref-list":
                    self.parse_ref_list(child)
                    continue

                origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                self.warnings.append({self.pid: origin + " " + child_tag})

        # Work around a numdam-plus bug where a book-part can have a trans-title without a title
        # TODO: Fix numdam-plus, the books impacted and remove the hack
        self.set_title()

        self.post_parse_tree()

    def parse_book_part_meta(self, node, **kwargs):
        """
        Parse a <book-part-meta> node: ids and pages are read directly,
        the other tags are dispatched to a parse_<tag> method when the instance has one.
        Tags without a handler are reported in self.warnings.

        :param node: the book-part-meta node
        :param kwargs: accepted for signature compatibility, not used here
        :return:
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "book-part-id":
                self.parse_id(child)
                continue

            if tag == "fpage":
                self.fpage = child.text
                self.page_type = get_normalized_attrib(child, "content-type") or ""
                continue

            if tag == "lpage":
                self.lpage = child.text
                continue

            if tag == "page-range":
                self.page_range = child.text
                continue

            # Generic dispatch: <some-tag> is handled by self.parse_some_tag if it exists
            handler = getattr(self, "parse_" + tag.replace("-", "_"), None)
            if callable(handler):
                handler(child)
                continue

            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})

    def parse_body(self, node, **kwargs):
        """
        Parse a <body> node.

        Nested <book-part> children are parsed recursively and appended to self.parts;
        any other child tag is reported in self.warnings.
        The text of the node is always stored in self.body.

        :param node: the body node
        :param kwargs: accepted for signature compatibility, not used here
        :return:
        """
        for child in node:
            tag = normalize(child.tag)

            if tag != "book-part":
                origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                self.warnings.append({self.pid: origin + " " + tag})
                continue

            nested_part = BitsBookPart(tree=child, no_bib=self.no_bib)
            self.warnings.extend(nested_part.warnings)
            self.parts.append(nested_part)

        self.body = get_text_from_node(node)

    def set_title(self):
        """
        Bug in some books: some chapters may have a trans-title, but no title !
        Hack: copy the translated title (html and tex) into the title.

        :return:
        """

        if not self.trans_title_html or self.title_html:
            return

        self.title_html = self.trans_title_html
        self.title_tex = self.trans_title_tex

3559 

3560 

3561###################################################################################### 

3562# 

3563# Functions used by ptf-tools 

3564# 

3565###################################################################################### 

3566 

3567 

def update_bibitem_xml(bibitem, new_ids):
    """
    Rebuild a reference from bibitem.citation_xml with an updated set of ids.

    In the <element-citation> (or, failing that, <mixed-citation>) node:
    - the existing <ext-link> / <pub-id> children of the known id types are removed
    - one node is appended for each entry of new_ids that is checked
      and not a false positive
    If neither citation node is found, the xml is parsed unchanged.

    :param bibitem: object with a citation_xml string attribute
    :param new_ids: dict {id_type: {"checked": ..., "false_positive": ..., "id_value": ...}}
    :return: a JatsRef built from the (possibly modified) tree
    """
    lenient_parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
    )
    wrapped_xml = "".join(("<ref>", bibitem.citation_xml, "</ref>"))
    tree = etree.fromstring(wrapped_xml, parser=lenient_parser)

    # tag -> (attribute holding the id type, id types that are replaced)
    replaced_ids = {
        "ext-link": (
            "ext-link-type",
            ("zbl-item-id", "mr-item-id", "doi", "numdam-id", "mathdoc-id", "eid"),
        ),
        "pub-id": (
            "pub-id-type",
            ("zbl-item-id", "mr-item-id", "doi", "numdam-id", "mathdoc-id"),
        ),
    }
    # id types written as <pub-id>; every other type is written as <ext-link>
    pub_id_types = ("doi", "arxiv", "tel", "hal", "theses.fr")

    node = tree.find("element-citation")
    if node is None:
        node = tree.find("mixed-citation")

    if node is not None:
        # Collect first, remove afterwards: the node is not modified while iterated
        stale_children = []
        for child in node:
            rule = replaced_ids.get(child.tag)
            if rule is None:
                continue
            type_attribute, id_types = rule
            if child.get(type_attribute) in id_types:
                stale_children.append(child)

        for stale in stale_children:
            node.remove(stale)

        for id_type, value_dict in new_ids.items():
            if not value_dict["checked"] or value_dict["false_positive"]:
                continue

            if id_type in pub_id_types:
                tag_name, type_attribute = "pub-id", "pub-id-type"
            else:
                tag_name, type_attribute = "ext-link", "ext-link-type"

            new_node = etree.Element(tag_name)
            new_node.set(type_attribute, id_type)
            new_node.text = value_dict["id_value"]
            node.append(new_node)

    # TODO Modify the call to update_bibitem_xml and pass the parent's lang
    return JatsRef(tree=tree, lang="und")

3621 

3622 

def check_bibitem_xml(bibitem: RefData):
    """
    Parse bibitem.citation_xml (wrapped in a <ref> node) and return the resulting JatsRef.

    :param bibitem: reference whose citation_xml is parsed
    :return: a JatsRef created with lang="und"
    """
    lenient_parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
    )
    wrapped_xml = "".join(("<ref>", bibitem.citation_xml, "</ref>"))
    ref_tree = etree.fromstring(wrapped_xml, parser=lenient_parser)

    return JatsRef(tree=ref_tree, lang="und")

3632 

3633 

3634# Create XML strings based on internal data 

3635 

3636 

def get_tex_from_xml(xml, tag, **kwargs):
    """
    Parse an xml string as a JatsArticle and return the TeX value(s) for the given tag.

    For "abstract" and "title", the xml is first wrapped in
    <article><front><article-meta>; for any other tag it is parsed as is.

    :param xml: the xml string
    :param tag: "abstract" or "title" (other values give an empty result)
    :param kwargs: passed to the JatsArticle constructor
    :return: - "abstract": the "value_tex" of the first abstract
             - "title": the tuple (title_tex, trans_title_tex)
             - otherwise: ""
    """
    lenient_parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
    )
    etree.register_namespace("mml", "http://www.w3.org/1998/Math/MathML")

    needs_wrapper = tag in ("abstract", "title")
    if needs_wrapper:
        document = f"<article><front><article-meta>{xml}</article-meta></front></article>"
    else:
        document = xml

    article_tree = etree.fromstring(document.encode("utf-8"), parser=lenient_parser)
    xarticle = JatsArticle(tree=article_tree, **kwargs)

    if tag == "abstract":
        return xarticle.abstracts[0]["value_tex"]
    if tag == "title":
        return xarticle.title_tex, xarticle.trans_title_tex
    return ""