Coverage for src/ptf/cmds/xml/ckeditor/ckeditor_parser.py: 45%

425 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-11-05 09:56 +0000

1################################################################################################## 

2# 

3# README 

4# 

5# ckeditor_parser.py parses the HTML strings created by a CKEditor 

6# with tex formulas inside <span class="math-tex"> 

7# It returns the JATS equivalent. 

8# 

9# Ex: <p>Te&lt;st&nbsp;<span class="math-tex">\(x = {-b \pm \sqrt{b^2-4ac} \over 2a}\)</span> done</p> 

10# <ul><li>Item</li></ul><ol><li>Item 1<br />New line</li><li>&nbsp;</li></ol> 

11# 

12################################################################################################## 

13 

if __name__ == "__main__":
    import os
    import sys

    # When run as a script, make the directory five levels above this file's
    # own directory chain importable (file -> 5 x parent directory).
    BASE_DIR = os.path.normpath(
        os.path.join(os.path.abspath(__file__), *([os.pardir] * 5))
    )
    sys.path.append(BASE_DIR)

24 

25import os 

26 

27from lxml import etree 

28 

29from django.conf import settings 

30 

31from ptf.cmds.xml.xml_utils import escape 

32from ptf.cmds.xml.xml_utils import normalize 

33from ptf.cmds.xml.xml_utils import replace_html_entities 

34 

35# from ptf.utils import create_innerlink_for_citation 

36 

37 

class CkeditorParser:
    """Convert the HTML produced by a CKEditor into HTML, TeX and JATS XML strings.

    The element tree is walked once. Every ``parse_*`` method returns a
    ``(html_text, tex_text, xml_text)`` tuple; the result for the whole tree is
    stored in ``value_html``, ``value_tex`` and ``value_xml``.

    Keyword arguments read by the constructor:
        html_value: HTML string to parse (used only when ``tree`` is absent).
        tree: already parsed root element.
        mml_formulas: list of MathML strings. One item is popped from the front
            for every formula span met. Defaults to an empty list.
        ignore_p: when true, ``<p>`` wrappers are dropped from every output.
        pid, issue_pid: used to build image URLs; an ``issue_pid`` containing
            "PCJ" also wraps tables in a ``PCJ-table`` div.
        volume, biblio: stored on the instance; not read by this class.
        check_citation: when true, tail text is not escaped in HTML/TeX.
        for_pcj_display: when true, TeX is wrapped in backslash-parenthesis
            delimiters instead of dollar signs.
    """

    def __init__(self, *args, **kwargs):
        self.warnings = []
        self.value_xml = ""
        self.value_html = ""
        self.value_tex = ""

        if "tree" not in kwargs and "html_value" in kwargs:
            parser = etree.XMLParser(
                huge_tree=True,
                recover=True,
                remove_blank_text=False,
                remove_comments=True,
                resolve_entities=True,
            )
            html_value = kwargs["html_value"].replace("\n\n", "")
            # The fragment is wrapped in a single root so that it parses as XML.
            body = f"<body>{replace_html_entities(html_value)}</body>"
            tree = etree.fromstring(body.encode("utf-8"), parser=parser)
        else:
            tree = kwargs["tree"]

        # mml_formulas used to be mandatory (KeyError when omitted); default to
        # "no MathML available". The caller's list object is kept as-is because
        # parse_formula consumes it in place.
        mml_formulas = kwargs.get("mml_formulas")
        self.mml_formulas = [] if mml_formulas is None else mml_formulas
        self.ignore_p = kwargs.get("ignore_p", False)
        self.pid = kwargs.get("pid", None)
        self.volume = kwargs.get("volume", None)
        self.issue_pid = kwargs.get("issue_pid", None)
        self.check_citation = kwargs.get("check_citation", False)
        self.biblio = kwargs.get("biblio", None)
        self.for_pcj_display = kwargs.get("for_pcj_display", False)

        self.parse_tree(tree)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _attr(node, wanted, default=""):
        """Return the value of the last attribute whose normalized name is *wanted*.

        The value is read with the attribute's real key, so the lookup still
        works when ``normalize`` returns something different from the key.
        """
        value = default
        for key, raw_value in node.attrib.items():
            if normalize(key) == wanted:
                value = raw_value
        return value

    def _parse_classed_container(self, node, html_tag, **kwargs):
        """Render *node* as ``<html_tag class=...>`` (HTML/TeX) and a bare tag (XML)."""
        inner_html, inner_tex, inner_xml = self.parse_node_inner(node, **kwargs)
        classe = self._attr(node, "class")

        # NOTE(review): the class value is emitted unquoted, as before; a class
        # containing spaces yields ambiguous markup. Confirm before quoting it.
        html_text = f"<{html_tag} class={classe}>{inner_html}</{html_tag}>"
        tex_text = f"<{html_tag} class={classe}>{inner_tex}</{html_tag}>"
        xml_text = f'<{html_tag} xml:space="preserve">{inner_xml}</{html_tag}>'
        return html_text, tex_text, xml_text

    def _parse_table_cell(self, node, html_tag, **kwargs):
        """Render a ``td``/``th`` cell, copying its class, rowspan and colspan."""
        inner_html, inner_tex, inner_xml = self.parse_node_inner(node, **kwargs)
        classe = self._attr(node, "class")
        rowspan = self._attr(node, "rowspan")
        colspan = self._attr(node, "colspan")

        spans = f"rowspan='{rowspan}' colspan='{colspan}'"
        opening = f"<{html_tag} class={classe} {spans}>" if classe else f"<{html_tag} {spans}>"

        html_text = f"{opening}{inner_html}</{html_tag}>"
        tex_text = f"{opening}{inner_tex}</{html_tag}>"
        xml_text = f'<{html_tag} xml:space="preserve">{inner_xml}</{html_tag}>'
        return html_text, tex_text, xml_text

    # ------------------------------------------------------------------
    # Formulas and lists
    # ------------------------------------------------------------------

    def parse_formula(self, node, **kwargs):
        """Render a formula span.

        :param node: span element whose text is the TeX formula
        :param kwargs: ``display`` (optional) selects the "display" HTML class
        :return: HTML text, TeX text, XML text
        """
        formula = node.text or ""
        display = kwargs.get("display", None)

        # Strip the \( ... \) delimiters. The closing one is removed only when
        # it is really there, so an unterminated formula keeps its last chars.
        if formula.startswith("\\("):
            formula = formula[2:]
            if formula.endswith("\\)"):
                formula = formula[:-2]

        # One MathML rendering is consumed per formula, in document order.
        mml = self.mml_formulas.pop(0) if len(self.mml_formulas) > 0 else ""

        # A formula is rendered as a display formula when its parent <p> has
        # neither leading text nor tail.
        parent = node.getparent()
        is_inline = not (
            parent is not None and parent.tag == "p" and not parent.text and not parent.tail
        )

        if self.for_pcj_display:
            formula = rf"\({formula}\)"
        else:
            formula = f"${formula}$"

        if mml:
            html_text = f'<span class="mathjax-formula" title="{formula}">{mml}</span>'
        elif display:
            html_text = f'<span class="mathjax-formula display" title="{formula}">{formula}</span>'
        else:
            html_text = f'<span class="mathjax-formula" title="{formula}">{formula}</span>'
        tex_text = formula

        alternatives = f"<alternatives>{mml}<tex-math>{escape(formula)}</tex-math></alternatives>"
        if is_inline:
            xml_text = f"<inline-formula>{alternatives}</inline-formula>"
        else:
            prefix = '<table class="formula mathjax-formula"><tr><td class="formula-inner">'
            suffix = '</td><td class="formula-label"></td></tr></table>'
            html_text = prefix + html_text + suffix
            tex_text = prefix + tex_text + suffix
            xml_text = f'<disp-formula xml:space="preserve">\n{alternatives}</disp-formula>'

        return html_text, tex_text, xml_text

    def parse_list(self, node, **kwargs):
        """Render a ``ul``/``ol`` element; JATS gets a ``simple``/``number`` list."""
        inner_html, inner_tex, inner_xml = self.parse_node_inner(node, **kwargs)

        list_type = "simple" if node.tag == "ul" else "number"
        xml_text = f'<list list-type="{list_type}">{inner_xml}</list>'
        html_text = f"<{node.tag}>{inner_html}</{node.tag}>"
        tex_text = f"<{node.tag}>{inner_tex}</{node.tag}>"

        return html_text, tex_text, xml_text

    # ------------------------------------------------------------------
    # Generic traversal
    # ------------------------------------------------------------------

    def parse_node_inner(self, node, **kwargs):
        """
        Render the content of a node (its text and its children) without the node's own tag.
        Used by the parse_node_with_* methods, which add the tag specific to each output.

        :param node: element to render
        :param kwargs: ``escape`` (default True) tells whether HTML/TeX text is escaped
        :return: inner HTML text, inner TeX text, inner XML text
        """
        kwargs["is_top"] = False
        html_parts, tex_parts, xml_parts = [], [], []

        text = node.text
        if text:
            if text[0] == "\n" and node.tag in ("list", "item"):
                text = text[1:]

            escaped_text = escape(text)
            shown_text = escaped_text if kwargs.get("escape", True) else text
            xml_parts.append(escaped_text)
            html_parts.append(shown_text)
            tex_parts.append(shown_text)

        for child in node:
            child_html, child_tex, child_xml = self.parse_node_with_mixed_content(child, **kwargs)
            html_parts.append(child_html)
            tex_parts.append(child_tex)
            xml_parts.append(child_xml)

        return "".join(html_parts), "".join(tex_parts), "".join(xml_parts)

    def parse_node_with_mixed_content(self, node, **kwargs):
        """
        Parse and return the text of an XML node which mixes text and XML sub-nodes.
        Ex: <node>text1 <a>text_a</a> text2 <b>text_b</b>b_tail</node>
        Some inner nodes are removed, others are kept or replaced.

        The JATS xml string is constructed at the same time because it is used during a PTF export

        :param node: XML node
        :param kwargs: params of the function (``is_top``, ``escape``)
        :return: HTML text, TeX text, XML text
        """
        if node is None:
            return "", "", ""

        kwargs.setdefault("is_top", True)
        # lxml replaces HTML entities in node.text and node.tail (like &lt;);
        # kwargs["escape"] allows to escape the values back.
        kwargs.setdefault("escape", True)

        html_text = tex_text = jats_xml_text = ""

        # I. Render the node itself. A tag "x-y" is handled by the method
        # parse_node_with_x_y when it exists, by parse_node_inner otherwise.
        tag = node.tag
        if isinstance(tag, str):
            fct_name = "parse_node_with_" + tag.replace("-", "_")
            ftor = None
            # A <mixed-content> element must not resolve to this very method:
            # it would recurse forever.
            if fct_name != "parse_node_with_mixed_content":
                ftor = getattr(self, fct_name, None)

            if callable(ftor):
                html_text, tex_text, jats_xml_text = ftor(node, **kwargs)
            else:
                html_text, tex_text, jats_xml_text = self.parse_node_inner(node, **kwargs)
        # else: the node is not an element (its tag is not a string); it has no
        # renderable content of its own, only its tail is kept.

        # II. Add the node's tail (the text following the end of the node).
        tail = node.tail
        if tail:
            if self.check_citation:
                kwargs["escape"] = False
            escaped_tail = escape(tail)
            shown_tail = escaped_tail if kwargs["escape"] else tail
            html_text += shown_tail
            tex_text += shown_tail
            jats_xml_text += escaped_tail

        return html_text, tex_text, jats_xml_text

    def parse_tree(self, tree):
        """Render the whole tree and store the result in the ``value_*`` attributes."""
        self.value_html, self.value_tex, self.value_xml = self.parse_node_with_mixed_content(
            tree, is_top=True
        )

    # ------------------------------------------------------------------
    # Tag handlers (looked up by name from parse_node_with_mixed_content)
    # ------------------------------------------------------------------

    def parse_node_with_a(self, node, **kwargs):
        """Render a link; the JATS output is an ``ext-link`` of type ``uri``."""
        inner_html, inner_tex, inner_xml = self.parse_node_inner(node, **kwargs)

        # Without href, the link text is used as the target.
        href = self._attr(node, "href") or inner_tex

        # NOTE(review): href is inserted unescaped in the XML attribute, as
        # before; a URL containing "&" may need escaping - confirm.
        html_text = f'<a href="{href}">{inner_html}</a>'
        tex_text = f'<a href="{href}">{inner_tex}</a>'
        xml_text = f'<ext-link ext-link-type="uri" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="{href}">{inner_xml}</ext-link>'

        return html_text, tex_text, xml_text

    def parse_node_with_br(self, node, **kwargs):
        """Render a line break (``<br/>`` in HTML/TeX, ``<break/>`` in JATS)."""
        html_text = tex_text = "<br/>"
        xml_text = "<break/>"

        return html_text, tex_text, xml_text

    def parse_node_with_colgroup(self, node, **kwargs):
        """Render a ``colgroup`` element, copying its class."""
        return self._parse_classed_container(node, "colgroup", **kwargs)

    def parse_node_with_col(self, node, **kwargs):
        """Render a ``col`` element, copying its class (if any) and style."""
        inner_html, inner_tex, inner_xml = self.parse_node_inner(node, **kwargs)
        classe = self._attr(node, "class")
        style = self._attr(node, "style")

        opening = f"<col class={classe} style='{style}'>" if classe else f"<col style='{style}'>"
        html_text = f"{opening}{inner_html}</col>"
        tex_text = f"{opening}{inner_tex}</col>"
        xml_text = '<col xml:space="preserve">' + inner_xml + "</col>"
        return html_text, tex_text, xml_text

    def parse_node_with_div(self, node, **kwargs):
        """Render a ``div``; reference sections coming from docx conversions are dropped."""
        inner_html, inner_tex, inner_xml = self.parse_node_inner(node, **kwargs)

        classe = ""
        for key, value in node.attrib.items():
            name = normalize(key)
            if name == "class":
                classe = value
            # data-custom-style is the style identification written by the
            # pandoc library used for the docx --> html conversion.
            elif name == "data-custom-style":
                if value == "PCJ Equation":
                    classe = "mathjax-formula PCJ-Equation"
                else:
                    classe = value.replace(" ", "-")

        # NOTE(review): this check is applied once the class is resolved;
        # confirm it matches the intended position relative to the loop.
        if (classe == "PCJ-Section" and "References" in inner_html) or classe == "PCJ-Reference":
            return "", "", ""

        html_text = f"<div class='{classe}'>{inner_html}</div>"
        tex_text = f"<div class='{classe}'>{inner_tex}</div>"
        xml_text = '<div xml:space="preserve">' + inner_xml + "</div>"
        return html_text, tex_text, xml_text

    def parse_node_with_em(self, node, **kwargs):
        """Render emphasised text (``italic`` in JATS)."""
        inner_html, inner_tex, inner_xml = self.parse_node_inner(node, **kwargs)

        html_text = f'<span class="italique">{inner_html}</span>'
        tex_text = f"<i>{inner_tex}</i>"
        xml_text = f"<italic>{inner_xml}</italic>" if inner_xml else "<italic/>"

        return html_text, tex_text, xml_text

    def parse_node_with_h1(self, node, **kwargs):
        """Render a level 1 heading, copying its class."""
        return self._parse_classed_container(node, "h1", **kwargs)

    def parse_node_with_h2(self, node, **kwargs):
        """Render a level 2 heading, copying its class."""
        return self._parse_classed_container(node, "h2", **kwargs)

    def parse_node_with_h3(self, node, **kwargs):
        """Render a level 3 heading, copying its class."""
        return self._parse_classed_container(node, "h3", **kwargs)

    def parse_node_with_h4(self, node, **kwargs):
        """Render a level 4 heading, copying its class."""
        return self._parse_classed_container(node, "h4", **kwargs)

    def parse_node_with_h5(self, node, **kwargs):
        """Render a level 5 heading, copying its class."""
        return self._parse_classed_container(node, "h5", **kwargs)

    def parse_node_with_h6(self, node, **kwargs):
        """Render a level 6 heading, copying its class."""
        return self._parse_classed_container(node, "h6", **kwargs)

    def parse_node_with_img(self, node, **kwargs):
        """Render an image whose URL is rebuilt from the issue and article pids."""
        inner_html, inner_tex, inner_xml = self.parse_node_inner(node, **kwargs)

        prefix = getattr(settings, "SITE_URL_PREFIX", "")
        src = f"{prefix}/media/img/{self.issue_pid}/{self.pid}/src/media"
        # NOTE(review): href is never filled, so the JATS <graphic> always gets
        # an empty xlink:href (unchanged behaviour) - confirm the intent.
        href = ""
        classe = ""
        for key, value in node.attrib.items():
            name = normalize(key)
            if name == "src":
                img = os.path.basename(value)
                stem, ext = os.path.splitext(img)
                # If an image was converted to jpg, pandoc still wrote the html
                # with the previous extension, '.tiff' for example.
                if ext in (".tiff", ".tif"):
                    img = stem + ".jpg"
                src = f"{src}/{img}"
            elif name == "style":
                classe = "article-body-img"
            elif name == "data-custom-style":
                classe = value.replace(" ", "-")

        html_text = f"<img src={src} class={classe}>{inner_html}</img>"
        # The TeX output is built from the TeX rendering of the content.
        tex_text = f"<img src={src} class={classe}>{inner_tex}</img>"
        xml_text = f'<graphic xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="{href}">{inner_xml}</graphic>'

        return html_text, tex_text, xml_text

    def parse_node_with_li(self, node, **kwargs):
        """Render a list item; items outside a ``ul`` get the ``article-list`` class."""
        inner_html, inner_tex, inner_xml = self.parse_node_inner(node, **kwargs)

        parent_node = node.getparent()
        # A detached item (no parent) is rendered like an item outside a <ul>.
        if parent_node is not None and parent_node.tag == "ul":
            opening = "<li >"
        else:
            opening = "<li class='article-list'>"

        html_text = f"{opening}{inner_html}</li>"
        tex_text = f"{opening}{inner_tex}</li>"
        xml_text = f"<list-item><p>{inner_xml}</p></list-item>"

        return html_text, tex_text, xml_text

    def parse_node_with_ol(self, node, **kwargs):
        """Render an ordered list."""
        return self.parse_list(node, **kwargs)

    def parse_node_with_p(self, node, **kwargs):
        """Render a paragraph, or only its content when ``ignore_p`` is set."""
        inner_html, inner_tex, inner_xml = self.parse_node_inner(node, **kwargs)

        if self.ignore_p:
            return inner_html, inner_tex, inner_xml

        html_text = f"<p>{inner_html}</p>"
        tex_text = f"<p>{inner_tex}</p>"
        if inner_xml:
            xml_text = '<p xml:space="preserve">' + inner_xml + "</p>"
        else:
            xml_text = '<p xml:space="preserve"/>'

        return html_text, tex_text, xml_text

    def parse_node_with_span(self, node, **kwargs):
        """Render a span; formula spans are delegated to parse_formula."""
        # The content is rendered first, even for formulas, to keep the order
        # in which nested nodes are visited.
        inner_html, inner_tex, inner_xml = self.parse_node_inner(node, **kwargs)

        the_class = node.get("class")
        display = the_class == "math display"
        if the_class in ("math inline", "math display"):
            the_class = "mathjax-formula"

        if the_class == "mathjax-formula":
            return self.parse_formula(node, display=display)

        opening = "<span>" if the_class is None else f'<span class="{the_class}">'
        html_text = f"{opening}{inner_html}</span>"
        tex_text = f"{opening}{inner_tex}</span>"
        # JATS has no span: only the content is kept.
        xml_text = inner_xml

        return html_text, tex_text, xml_text

    def parse_node_with_strong(self, node, **kwargs):
        """Render bold text (``bold`` in JATS)."""
        inner_html, inner_tex, inner_xml = self.parse_node_inner(node, **kwargs)

        html_text = f"<strong>{inner_html}</strong>"
        tex_text = f"<strong>{inner_tex}</strong>"
        xml_text = f"<bold>{inner_xml}</bold>" if inner_xml else "<bold/>"

        return html_text, tex_text, xml_text

    def parse_node_with_sub(self, node, **kwargs):
        """Render subscript text."""
        inner_html, inner_tex, inner_xml = self.parse_node_inner(node, **kwargs)

        return f"<sub>{inner_html}</sub>", f"<sub>{inner_tex}</sub>", f"<sub>{inner_xml}</sub>"

    def parse_node_with_sup(self, node, **kwargs):
        """Render superscript text."""
        inner_html, inner_tex, inner_xml = self.parse_node_inner(node, **kwargs)

        return f"<sup>{inner_html}</sup>", f"<sup>{inner_tex}</sup>", f"<sup>{inner_xml}</sup>"

    def parse_node_with_table(self, node, **kwargs):
        """Render a table; PCJ issues get an extra ``PCJ-table`` div around it."""
        inner_html, inner_tex, inner_xml = self.parse_node_inner(node, **kwargs)

        classe = ""
        for key, value in node.attrib.items():
            name = normalize(key)
            if name == "class":
                classe = value
            # data-custom-style is the style identification written by the
            # pandoc library used for the docx --> html conversion.
            elif name == "data-custom-style":
                classe = value.replace(" ", "-")

        html_text = f"<table class={classe}>{inner_html}</table>"
        tex_text = f"<table class={classe}>{inner_tex}</table>"
        # issue_pid defaults to None: only test it when it is set.
        if self.issue_pid and "PCJ" in self.issue_pid:
            html_text = f"<div class='PCJ-table'>{html_text}</div>"
            tex_text = f"<div class='PCJ-table'>{tex_text}</div>"

        xml_text = '<table xml:space="preserve">' + inner_xml + "</table>"
        return html_text, tex_text, xml_text

    def parse_node_with_tbody(self, node, **kwargs):
        """Render a ``tbody`` element, copying its class."""
        return self._parse_classed_container(node, "tbody", **kwargs)

    def parse_node_with_td(self, node, **kwargs):
        """Render a table data cell."""
        return self._parse_table_cell(node, "td", **kwargs)

    def parse_node_with_th(self, node, **kwargs):
        """Render a table header cell."""
        return self._parse_table_cell(node, "th", **kwargs)

    def parse_node_with_tr(self, node, **kwargs):
        """Render a table row; the HTML/TeX class is always empty."""
        inner_html, inner_tex, inner_xml = self.parse_node_inner(node, **kwargs)

        html_text = "<tr class=''>" + inner_html + "</tr>"
        tex_text = "<tr class=''>" + inner_tex + "</tr>"
        xml_text = '<tr xml:space="preserve">' + inner_xml + "</tr>"
        return html_text, tex_text, xml_text

    def parse_node_with_ul(self, node, **kwargs):
        """Render an unordered list."""
        return self.parse_list(node, **kwargs)

697 

698 

if __name__ == "__main__":
    # Small manual check: parse a CKEditor fragment and print its JATS equivalent.
    html_value = r'<p>Te&lt;st&nbsp;<span class="mathjax-formula">\(x = {-b \pm \sqrt{b^2-4ac} \over 2a}\)</span> done</p><ul><li>Item</li></ul><ol><li>Item 1<br />New line</li><li>&nbsp;</li></ol>'
    # No pre-rendered MathML is available for this demo, so an empty list of
    # formulas is passed explicitly: only the TeX alternative is exported.
    parser = CkeditorParser(html_value=html_value, mml_formulas=[])
    result = parser.value_xml
    print(result)