Coverage for src/ptf/cmds/xml/ckeditor/ckeditor_parser.py

1##################################################################################################

3# README

5# ckeditor_parser.py parses the HTML strings created by a CKEditor

6# with tex formulas inside <span class="mathjax-formula">

7# It returns the JATS equivalent.

9# Ex: <p>Te<st <span class="mathjax-formula">$x = {-b \pm \sqrt{b^2-4ac} \over 2a}$</span> done</p>

10# <ul><li>Item</li></ul><ol><li>Item 1<br />New line</li><li> </li></ol>

11#

12##################################################################################################

if __name__ == "__main__":
    # Script mode only: add the directory five levels above this file to
    # sys.path (presumably so that the "ptf" package imports below resolve
    # when the module is run directly - TODO confirm).
    import os
    import sys

    BASE_DIR = os.path.abspath(__file__)
    for _level in range(5):
        BASE_DIR = os.path.dirname(BASE_DIR)
    sys.path.append(BASE_DIR)

25import os

27from lxml import etree

29from django.conf import settings

31from ptf.cmds.xml.xml_utils import escape

32from ptf.cmds.xml.xml_utils import normalize

33from ptf.cmds.xml.xml_utils import replace_html_entities

35# from ptf.utils import create_innerlink_for_citation

class CkeditorParser:
    """
    Parse a CKEditor HTML fragment and build three parallel renderings of it.

    Once the constructor returns, the results are available as:
      - value_html: the HTML rendering,
      - value_tex: the same markup where formulas keep their TeX source,
      - value_xml: the JATS XML rendering.

    A tag is handled by the method named "parse_node_with_<tag>" when such a
    method exists (see parse_node_with_mixed_content); any other tag only
    contributes its content.
    """

    def __init__(self, *args, **kwargs):
        """
        Build the tree (if needed), store the options and parse immediately.

        :param kwargs:
            html_value: HTML string to parse (used only when "tree" is absent)
            tree: already parsed root node (takes precedence over html_value)
            mml_formulas: list of MathML strings, consumed in document order
                (one per formula); optional, defaults to an empty list
            ignore_p: if True, <p> wrappers are not emitted
            pid, volume, issue_pid: identifiers; pid and issue_pid are used
                to build image URLs
            check_citation: if True, node tails are not escaped in HTML/TeX
            biblio, for_pcj_display: stored options
        :raises KeyError: if neither "tree" nor "html_value" is given
        """
        self.warnings = []
        self.value_xml = ""
        self.value_html = ""
        self.value_tex = ""

        if "tree" not in kwargs and "html_value" in kwargs:
            # recover=True lets lxml cope with broken markup such as "Te<st"
            parser = etree.XMLParser(
                huge_tree=True,
                recover=True,
                remove_blank_text=False,
                remove_comments=True,
                resolve_entities=True,
            )
            html_value = kwargs["html_value"].replace("\n\n", "")
            body = f"<body>{replace_html_entities(html_value)}</body>"
            tree = etree.fromstring(body.encode("utf-8"), parser=parser)
        else:
            tree = kwargs["tree"]

        # mml_formulas used to be mandatory (KeyError when missing).
        # The caller's list is kept as is: parse_formula pops from it.
        mml_formulas = kwargs.get("mml_formulas")
        self.mml_formulas = [] if mml_formulas is None else mml_formulas
        self.ignore_p = kwargs.get("ignore_p", False)
        self.pid = kwargs.get("pid", None)
        self.volume = kwargs.get("volume", None)
        self.issue_pid = kwargs.get("issue_pid", None)
        self.check_citation = kwargs.get("check_citation", False)
        self.biblio = kwargs.get("biblio", None)
        self.for_pcj_display = kwargs.get("for_pcj_display", False)

        self.parse_tree(tree)

    @staticmethod
    def _find_attr(node, wanted, default=""):
        """
        Return the value of the attribute whose normalized name is `wanted`.

        The value is read with the real attribute key, not the normalized one,
        so the lookup cannot fail when normalize() alters the name.
        If several attributes match, the last one wins.
        """
        value = default
        for attrib in node.attrib:
            if normalize(attrib) == wanted:
                value = node.attrib[attrib]
        return value

    def _parse_node_with_class(self, tag, node, **kwargs):
        """
        Shared implementation for tags kept as is with their class attribute
        (colgroup, h1..h6, tbody).
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )
        classe = self._find_attr(node, "class")

        # NOTE(review): the class value is emitted unquoted ("<h1 class=>" when
        # there is no class). Kept unchanged so that existing output is stable.
        html_text = f"<{tag} class={classe}>{inner_html_text}</{tag}>"
        tex_text = f"<{tag} class={classe}>{inner_tex_text}</{tag}>"

        xml_text = f'<{tag} xml:space="preserve">' + inner_jats_xml_text + f"</{tag}>"
        return html_text, tex_text, xml_text

    def _parse_table_cell(self, tag, node, **kwargs):
        """
        Shared implementation for table cells (td, th): keeps class, rowspan
        and colspan in HTML/TeX.
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )
        classe = self._find_attr(node, "class")
        rowspan = self._find_attr(node, "rowspan")
        colspan = self._find_attr(node, "colspan")

        spans = f"rowspan='{rowspan}' colspan='{colspan}'"
        start = f"<{tag} class={classe} {spans}>" if classe else f"<{tag} {spans}>"
        html_text = f"{start}{inner_html_text}</{tag}>"
        tex_text = f"{start}{inner_tex_text}</{tag}>"

        xml_text = f'<{tag} xml:space="preserve">' + inner_jats_xml_text + f"</{tag}>"
        return html_text, tex_text, xml_text

    def parse_formula(self, node, **kwargs):
        """
        Convert a formula node into its HTML, TeX and JATS representations.

        The next MathML string of self.mml_formulas (if any) is consumed.

        :param node: node whose text is the TeX formula
        :param kwargs: display: truthy to add the "display" class in HTML
            (only used when there is no MathML)
        :return: HTML text, TeX text, XML text
        """
        formula = node.text or ""
        display = kwargs.get("display", None)
        if formula.startswith("\\("):
            # Drop the 2-character delimiters at both ends
            formula = formula[2:-2]
        # elif len(formula) > 0 and formula.find("\[") == 0:
        #     formula = formula[1:-1]
        mml = ""
        if len(self.mml_formulas) > 0:
            mml = self.mml_formulas.pop(0)

        # A formula is displayed (not inline) when its parent is a <p> without
        # leading text and without tail.
        # NOTE(review): the tail of the formula node itself is not checked.
        parent = node.getparent()
        is_inline = not (
            parent is not None and parent.tag == "p" and not parent.text and not parent.tail
        )

        # NOTE(review): both branches currently produce the same string
        if self.for_pcj_display:
            formula = rf"${formula}$"
        else:
            formula = f"${formula}$"

        if mml:
            html_text = f'<span class="mathjax-formula" title="{formula}">{mml}</span>'
        elif display:
            html_text = f'<span class="mathjax-formula display" title="{formula}">{formula}</span>'
        else:
            html_text = f'<span class="mathjax-formula" title="{formula}">{formula}</span>'
        tex_text = formula

        alternatives = "<alternatives>"
        if len(mml) > 0:
            alternatives += mml
        alternatives += f"<tex-math>{escape(formula)}</tex-math>"
        alternatives += "</alternatives>"

        if is_inline:
            xml_text = "<inline-formula>" + alternatives + "</inline-formula>"
        else:
            prefix = '<table class="formula mathjax-formula"><tr><td class="formula-inner">'
            suffix = '</td><td class="formula-label"></td></tr></table>'
            html_text = prefix + html_text + suffix
            tex_text = prefix + tex_text + suffix

            xml_text = '<disp-formula xml:space="preserve">\n' + alternatives + "</disp-formula>"

        return html_text, tex_text, xml_text

    def parse_list(self, node, **kwargs):
        """
        Convert a <ul> or <ol> node: the tag is kept in HTML/TeX and becomes
        a JATS <list> ("simple" for ul, "number" otherwise).

        :return: HTML text, TeX text, XML text
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        list_type = "simple" if node.tag == "ul" else "number"

        xml_text = f'<list list-type="{list_type}">'
        xml_text += inner_jats_xml_text
        xml_text += "</list>"

        # # JATS requires <list> to be inside <p>
        # parent = node.getparent()
        # if parent is None or parent.tag != "p":
        #     xml_text = f"<p>{xml_text}</p>"
        html_text = f"<{node.tag}>{inner_html_text}</{node.tag}>"
        tex_text = f"<{node.tag}>{inner_tex_text}</{node.tag}>"

        return html_text, tex_text, xml_text

    def parse_node_inner(self, node, **kwargs):
        """
        Used by parse_node_with_mixed_content for nodes that have a different tag in JATS or HTML.
        Returns the content of the node (its text and its children), without the node's own tag.

        :param node: the node to parse
        :param kwargs: escape: if True (default), the text is escaped in HTML/TeX;
            the JATS text is always escaped
        :return: HTML text, TeX text, XML text
        """

        kwargs["is_top"] = False
        # Default added so that the parse_node_with_* methods can be called directly
        escape_text = kwargs.get("escape", True)
        inner_html_text = inner_tex_text = inner_jats_xml_text = ""

        if node.text:
            text = node.text

            if text.startswith("\n") and node.tag in ("list", "item"):
                text = text[1:]

            inner_jats_xml_text += escape(text)
            inner_html_text += escape(text) if escape_text else text
            inner_tex_text += escape(text) if escape_text else text

        # if self.check_citation and node.tag != "a":
        #     inner_html_text = create_innerlink_for_citation(inner_html_text, self.biblio)

        for child in node:
            (
                child_html_text,
                child_tex_text,
                child_jats_xml_text,
            ) = self.parse_node_with_mixed_content(child, **kwargs)
            inner_html_text += child_html_text
            inner_tex_text += child_tex_text
            inner_jats_xml_text += child_jats_xml_text

        return inner_html_text, inner_tex_text, inner_jats_xml_text

    def parse_node_with_a(self, node, **kwargs):
        """
        Convert a link: <a> in HTML/TeX, <ext-link> in JATS.
        Without href attribute, the TeX content of the node is used as href.
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        href = self._find_attr(node, "href")
        if not href:
            href = inner_tex_text

        # NOTE(review): href is inserted without escaping in the 3 outputs
        html_text = f'<a href="{href}">{inner_html_text}</a>'
        tex_text = f'<a href="{href}">{inner_tex_text}</a>'
        xml_text = f'<ext-link ext-link-type="uri" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="{href}">{inner_jats_xml_text}</ext-link>'

        return html_text, tex_text, xml_text

    def parse_node_with_br(self, node, **kwargs):
        """Convert a line break: <br/> in HTML/TeX, <break/> in JATS."""
        html_text = tex_text = "<br/>"
        xml_text = "<break/>"

        return html_text, tex_text, xml_text

    def parse_node_with_colgroup(self, node, **kwargs):
        """Keep <colgroup> with its class attribute."""
        return self._parse_node_with_class("colgroup", node, **kwargs)

    def parse_node_with_col(self, node, **kwargs):
        """Keep <col> with its style (and its class, if any)."""
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )
        classe = self._find_attr(node, "class")
        style = self._find_attr(node, "style")

        if classe:
            start = f"<col class={classe} style='{style}'>"
        else:
            start = f"<col style='{style}'>"
        html_text = f"{start}{inner_html_text}</col>"
        tex_text = f"{start}{inner_tex_text}</col>"

        xml_text = '<col xml:space="preserve">' + inner_jats_xml_text + "</col>"
        return html_text, tex_text, xml_text

    def parse_node_with_div(self, node, **kwargs):
        """
        Keep <div> with a class taken from "class" or "data-custom-style".

        Returns 3 empty strings for a "PCJ-Reference" div and for a
        "PCJ-Section" div whose HTML content contains "References".
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )
        classe = ""
        # Single loop in attribute order: the last matching attribute wins
        for attrib in node.attrib:
            name = normalize(attrib)
            value = node.attrib[attrib]
            if name == "class":
                classe = value
            # Next condition checks style identification with pandoc library used
            # for docx --> html conversion
            elif name == "data-custom-style":
                if value == "PCJ Equation":
                    classe = "mathjax-formula PCJ-Equation"
                else:
                    classe = value.replace(" ", "-")

        if classe == "PCJ-Section" and "References" in inner_html_text:
            return "", "", ""
        if classe == "PCJ-Reference":
            return "", "", ""

        html_text = f"<div class='{classe}'>{inner_html_text}</div>"
        tex_text = f"<div class='{classe}'>{inner_tex_text}</div>"

        xml_text = '<div xml:space="preserve">' + inner_jats_xml_text + "</div>"
        return html_text, tex_text, xml_text

    def parse_node_with_em(self, node, **kwargs):
        """Convert emphasis: span.italique in HTML, <i> in TeX, <italic> in JATS."""
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        html_text = f'<span class="italique">{inner_html_text}</span>'
        tex_text = f"<i>{inner_tex_text}</i>"

        if len(inner_jats_xml_text) > 0:
            xml_text = f"<italic>{inner_jats_xml_text}</italic>"
        else:
            xml_text = "<italic/>"

        return html_text, tex_text, xml_text

    def parse_node_with_h1(self, node, **kwargs):
        """Keep <h1> with its class attribute."""
        return self._parse_node_with_class("h1", node, **kwargs)

    def parse_node_with_h2(self, node, **kwargs):
        """Keep <h2> with its class attribute."""
        return self._parse_node_with_class("h2", node, **kwargs)

    def parse_node_with_h3(self, node, **kwargs):
        """Keep <h3> with its class attribute."""
        return self._parse_node_with_class("h3", node, **kwargs)

    def parse_node_with_h4(self, node, **kwargs):
        """Keep <h4> with its class attribute."""
        return self._parse_node_with_class("h4", node, **kwargs)

    def parse_node_with_h5(self, node, **kwargs):
        """Keep <h5> with its class attribute."""
        return self._parse_node_with_class("h5", node, **kwargs)

    def parse_node_with_h6(self, node, **kwargs):
        """Keep <h6> with its class attribute."""
        return self._parse_node_with_class("h6", node, **kwargs)

    def parse_node_with_img(self, node, **kwargs):
        """
        Convert an image: <img> in HTML/TeX with a src rebuilt from
        settings.SITE_URL_PREFIX (if defined), issue_pid and pid; <graphic> in JATS.
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        # node.attribe["style"] = ""
        prefix = getattr(settings, "SITE_URL_PREFIX", "")

        # src = f"{prefix}/media/img/{self.volume}/{self.pid}/src/media"
        src = f"{prefix}/media/img/{self.issue_pid}/{self.pid}/src/media"
        # NOTE(review): href is never filled, so the JATS xlink:href is always empty
        href = ""
        classe = ""
        for attrib in node.attrib:
            name = normalize(attrib)
            value = node.attrib[attrib]
            if name == "src":
                img = os.path.basename(value)
                stem, ext = os.path.splitext(img)
                # If an image was converted to jpg, pandoc still wrote the html with the previous extension,
                # '.tiff' for example
                if ext in (".tiff", ".tif"):
                    img = stem + ".jpg"
                src = f"{src}/{img}"
            elif name == "style":
                classe = "article-body-img"
            elif name == "data-custom-style":
                classe = value.replace(" ", "-")

        html_text = f"<img src={src} class={classe}>{inner_html_text}</img>"
        # The TeX output now uses the TeX content (it used the HTML content)
        tex_text = f"<img src={src} class={classe}>{inner_tex_text}</img>"
        xml_text = f'<graphic xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="{href}">{inner_jats_xml_text}</graphic>'

        return html_text, tex_text, xml_text

    def parse_node_with_li(self, node, **kwargs):
        """
        Convert a list item: <li> in HTML/TeX (with the "article-list" class
        unless the parent is a <ul>), <list-item><p> in JATS.
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )
        parent_node = node.getparent()
        # An item without parent is handled like an item outside a <ul>
        if parent_node is not None and parent_node.tag == "ul":
            html_text = f"<li >{inner_html_text}</li>"
            tex_text = f"<li >{inner_tex_text}</li>"
        else:
            html_text = f"<li class='article-list'>{inner_html_text}</li>"
            tex_text = f"<li class='article-list'>{inner_tex_text}</li>"

        xml_text = f"<list-item><p>{inner_jats_xml_text}</p></list-item>"

        return html_text, tex_text, xml_text

    def parse_node_with_mixed_content(self, node, **kwargs):
        """
        Parse and return the text of an XML node which mixes text and XML sub-nodes.
        Ex: <node>text1 <a>text_a</a> text2 <b>text_b</b>b_tail</node>
        Some inner nodes are removed, others are kept or replaced.

        The JATS xml string is constructed at the same time because it is used during a PTF export

        :param node: XML Node
        :param kwargs: params of the function (is_top, escape)
        :return: HTML text, TeX text, XML text
        """

        html_text = tex_text = jats_xml_text = ""

        if node is None:
            return html_text, tex_text, jats_xml_text

        # is_top is True for the first call only (parse_node_inner sets it to False)
        kwargs["is_top"] = kwargs["is_top"] if "is_top" in kwargs else True

        # lxml replaces HTML entities in node.text and node.tail (like &lt;)
        # kwargs['escape'] allows to escape back the values
        kwargs["escape"] = kwargs["escape"] if "escape" in kwargs else True

        # I. Add the node's text.
        # Some tags have a corresponding parse_node_with_@tag function to generate the texts.
        fct_name = "parse_node_with_" + node.tag.replace("-", "_")
        ftor = getattr(self, fct_name, None)
        if not callable(ftor):
            # Unknown tag: only its content is kept
            ftor = self.parse_node_inner
        inner_html_text, inner_tex_text, inner_jats_xml_text = ftor(node, **kwargs)

        html_text += inner_html_text
        tex_text += inner_tex_text
        jats_xml_text += inner_jats_xml_text

        # II. Add the node's tail
        # The tail is the text following the end of the node
        # Ex: <node>text1<a>text_a</a>a_tail</node>
        if node.tail:
            # if self.check_citation and node.tag != "a":
            #     html_text = create_innerlink_for_citation(html_text, self.biblio)
            #     node.tail = create_innerlink_for_citation(node.tail, self.biblio)
            if self.check_citation:
                kwargs["escape"] = False
            html_text += escape(node.tail) if kwargs["escape"] else node.tail
            tex_text += escape(node.tail) if kwargs["escape"] else node.tail
            jats_xml_text += escape(node.tail)

        # if self.check_citation and node.tag != "a":
        #     html_text = create_innerlink_for_citation(html_text, self.biblio)

        return html_text, tex_text, jats_xml_text

    def parse_node_with_ol(self, node, **kwargs):
        """Convert an ordered list (see parse_list)."""
        # # JATS requires <list> to be inside <p>
        # parent = node.getparent()
        # if parent is None or parent.tag != "p":
        #     xml_text = f"<p>{xml_text}</p>"

        return self.parse_list(node, **kwargs)

    def parse_node_with_p(self, node, **kwargs):
        """
        Convert a paragraph. With self.ignore_p, only the content is returned.
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        if self.ignore_p:
            return inner_html_text, inner_tex_text, inner_jats_xml_text

        html_text = f"<p>{inner_html_text}</p>"
        tex_text = f"<p>{inner_tex_text}</p>"
        if len(inner_jats_xml_text) > 0:
            xml_text = '<p xml:space="preserve">' + inner_jats_xml_text + "</p>"
        else:
            xml_text = '<p xml:space="preserve"/>'

        return html_text, tex_text, xml_text

    def parse_node_with_span(self, node, **kwargs):
        """
        Convert a <span>. The classes "mathjax-formula", "math inline" and
        "math display" denote a formula (see parse_formula); any other span
        is kept in HTML/TeX and replaced by its content in JATS.
        """
        # The content is parsed first, even for a formula (result then unused),
        # to keep the order in which self.mml_formulas is consumed
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        the_class = node.get("class")
        display = the_class == "math display"
        if the_class in ["math inline", "math display"]:
            the_class = "mathjax-formula"

        if the_class == "mathjax-formula":
            html_text, tex_text, xml_text = self.parse_formula(node, display=display)
        elif the_class is not None:
            html_text = f'<span class="{the_class}">{inner_html_text}</span>'
            tex_text = f'<span class="{the_class}">{inner_tex_text}</span>'
            xml_text = inner_jats_xml_text
        else:
            html_text = f"<span>{inner_html_text}</span>"
            tex_text = f"<span>{inner_tex_text}</span>"
            xml_text = inner_jats_xml_text

        return html_text, tex_text, xml_text

    def parse_node_with_strong(self, node, **kwargs):
        """Convert bold text: <strong> in HTML/TeX, <bold> in JATS."""
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        html_text = f"<strong>{inner_html_text}</strong>"
        tex_text = f"<strong>{inner_tex_text}</strong>"

        if len(inner_jats_xml_text) > 0:
            xml_text = f"<bold>{inner_jats_xml_text}</bold>"
        else:
            xml_text = "<bold/>"

        return html_text, tex_text, xml_text

    def parse_node_with_sub(self, node, **kwargs):
        """Keep <sub> in the 3 outputs."""
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        html_text = f"<sub>{inner_html_text}</sub>"
        tex_text = f"<sub>{inner_tex_text}</sub>"
        xml_text = f"<sub>{inner_jats_xml_text}</sub>"

        return html_text, tex_text, xml_text

    def parse_node_with_sup(self, node, **kwargs):
        """Keep <sup> in the 3 outputs."""
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        html_text = f"<sup>{inner_html_text}</sup>"
        tex_text = f"<sup>{inner_tex_text}</sup>"
        xml_text = f"<sup>{inner_jats_xml_text}</sup>"

        return html_text, tex_text, xml_text

    def parse_node_with_table(self, node, **kwargs):
        """
        Keep <table> with a class taken from "class" or "data-custom-style".
        The HTML/TeX table is wrapped in div.PCJ-table when issue_pid contains "PCJ".
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )
        classe = ""
        # Single loop in attribute order: the last matching attribute wins
        for attrib in node.attrib:
            name = normalize(attrib)
            if name == "class":
                classe = node.attrib[attrib]
            # Next condition checks style identification with pandoc library used
            # for docx --> html conversion
            elif name == "data-custom-style":
                classe = node.attrib[attrib].replace(" ", "-")

        html_text = f"<table class={classe}>{inner_html_text}</table>"
        tex_text = f"<table class={classe}>{inner_tex_text}</table>"
        # issue_pid is None by default: test it before looking for "PCJ"
        if self.issue_pid and "PCJ" in self.issue_pid:
            html_text = f"<div class='PCJ-table'>{html_text}</div>"
            tex_text = f"<div class='PCJ-table'>{tex_text}</div>"

        xml_text = '<table xml:space="preserve">' + inner_jats_xml_text + "</table>"
        return html_text, tex_text, xml_text

    def parse_node_with_tbody(self, node, **kwargs):
        """Keep <tbody> with its class attribute."""
        return self._parse_node_with_class("tbody", node, **kwargs)

    def parse_node_with_td(self, node, **kwargs):
        """Keep <td> with its class, rowspan and colspan attributes."""
        return self._parse_table_cell("td", node, **kwargs)

    def parse_node_with_th(self, node, **kwargs):
        """Keep <th> with its class, rowspan and colspan attributes."""
        return self._parse_table_cell("th", node, **kwargs)

    def parse_node_with_tr(self, node, **kwargs):
        """Keep <tr>; the HTML/TeX class attribute is always empty."""
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )
        classe = ""

        html_text = f"<tr class='{classe}'>{inner_html_text}</tr>"
        tex_text = f"<tr class='{classe}'>{inner_tex_text}</tr>"

        xml_text = '<tr xml:space="preserve">' + inner_jats_xml_text + "</tr>"
        return html_text, tex_text, xml_text

    def parse_node_with_ul(self, node, **kwargs):
        """Convert an unordered list (see parse_list)."""
        return self.parse_list(node, **kwargs)

    def parse_tree(self, tree):
        """Parse the whole tree and store the results in value_html, value_tex and value_xml."""
        self.value_html, self.value_tex, self.value_xml = self.parse_node_with_mixed_content(
            tree, is_top=True
        )
        # if self.check_citation:
        #     self.value_html = create_innerlink_for_citation(self.value_html, self.biblio)

697

698

if __name__ == "__main__":
    # Manual smoke test: parse a small fragment (with deliberately broken
    # markup, "Te<st") and print its JATS rendering.
    html_value = r'<p>Te<st <span class="mathjax-formula">$x = {-b \pm \sqrt{b^2-4ac} \over 2a}$</span> done</p><ul><li>Item</li></ul><ol><li>Item 1<br />New line</li><li> </li></ol>'
    # mml_formulas is read by the constructor; pass an empty list (no MathML
    # available here) so that the demo does not stop on a KeyError.
    parser = CkeditorParser(html_value=html_value, mml_formulas=[])
    result = parser.value_xml
    print(result)

Coverage for src/ptf/cmds/xml/ckeditor/ckeditor_parser.py: 45%

425 statements