Coverage for src/ptf/cmds/xml/ckeditor/ckeditor_parser.py: 45%
425 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-05 09:56 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-05 09:56 +0000
1##################################################################################################
2#
3# README
4#
5# ckeditor_parser.py parses the HTML strings created by a CKEditor
6# with tex formulas inside <span class="math-tex">
7# It returns the JATS equivalent.
8#
9# Ex: <p>Te<st <span class="math-tex">\(x = {-b \pm \sqrt{b^2-4ac} \over 2a}\)</span> done</p>
10# <ul><li>Item</li></ul><ol><li>Item 1<br />New line</li><li> </li></ol>
11#
12##################################################################################################
if __name__ == "__main__":
    # Script mode only: make the project root importable before the
    # project-level imports below are executed.
    import os
    import sys

    # The root is five directory levels above this file.
    BASE_DIR = os.path.abspath(__file__)
    for _level in range(5):
        BASE_DIR = os.path.dirname(BASE_DIR)
    sys.path.append(BASE_DIR)
25import os
27from lxml import etree
29from django.conf import settings
31from ptf.cmds.xml.xml_utils import escape
32from ptf.cmds.xml.xml_utils import normalize
33from ptf.cmds.xml.xml_utils import replace_html_entities
35# from ptf.utils import create_innerlink_for_citation
class CkeditorParser:
    """
    Convert an HTML fragment (as produced by a CKEditor) into three parallel strings.

    Once the constructor returns, the results are available in:
      - value_html: the HTML string
      - value_tex: an HTML-like string where formulas keep their TeX source
      - value_xml: the JATS-like XML string

    Every element is handled by the method named parse_node_with_<tag> when such a method
    exists (dashes in the tag are replaced by underscores). Elements without a dedicated
    method only contribute their text and their children.
    """

    def __init__(self, *args, **kwargs):
        """
        Build the tree (if needed) and parse it immediately.

        Recognized keyword arguments:
        :param html_value: HTML string to parse; used only when "tree" is not given
        :param tree: already parsed root node; takes precedence over html_value
        :param mml_formulas: list of MathML strings consumed (popped) in document order,
            one per formula. Optional: defaults to an empty list.
        :param ignore_p: if True, <p> nodes are replaced by their content (default False)
        :param pid: used to build the image URLs
        :param volume: stored, not used by this class
        :param issue_pid: used to build the image URLs and to detect PCJ tables
        :param check_citation: if True, node tails are not escaped in the HTML/TeX outputs
        :param biblio: stored, not used by this class
        :param for_pcj_display: if True, formulas are wrapped in \\(...\\) instead of $...$
        :raises KeyError: if neither "html_value" nor "tree" is given
        """
        self.warnings = []
        self.value_xml = ""
        self.value_html = ""
        self.value_tex = ""

        if "tree" not in kwargs and "html_value" in kwargs:
            # recover=True lets lxml cope with the invalid markup a user may have typed
            # (ex: a lone "<" inside the text).
            parser = etree.XMLParser(
                huge_tree=True,
                recover=True,
                remove_blank_text=False,
                remove_comments=True,
                resolve_entities=True,
            )
            html_value = kwargs["html_value"].replace("\n\n", "")
            # The fragment may hold several top-level nodes: wrap it in a single root.
            body = f"<body>{replace_html_entities(html_value)}</body>"
            tree = etree.fromstring(body.encode("utf-8"), parser=parser)
        else:
            tree = kwargs["tree"]

        # The list given by the caller is kept as is (not copied): parse_formula pops from it.
        mml_formulas = kwargs.get("mml_formulas")
        self.mml_formulas = [] if mml_formulas is None else mml_formulas
        self.ignore_p = kwargs.get("ignore_p", False)
        self.pid = kwargs.get("pid", None)
        self.volume = kwargs.get("volume", None)
        self.issue_pid = kwargs.get("issue_pid", None)
        self.check_citation = kwargs.get("check_citation", False)
        self.biblio = kwargs.get("biblio", None)
        self.for_pcj_display = kwargs.get("for_pcj_display", False)

        self.parse_tree(tree)

    @staticmethod
    def _iter_attributes(node):
        """
        Yield (normalized name, value) for each attribute of the node, in document order.

        The value is read with the raw attribute key, so it is found even when
        normalize() changes the key.
        """
        for key, value in node.attrib.items():
            yield normalize(key), value

    def _get_class(self, node):
        """Return the value of the "class" attribute of the node ("" if absent)."""
        classe = ""
        for name, value in self._iter_attributes(node):
            if name == "class":
                classe = value
        return classe

    def _parse_node_with_class(self, node, tag, kwargs):
        """
        Shared implementation for the nodes that only keep their class in HTML/TeX
        (colgroup, h1...h6, tbody). The tag is kept unchanged in the XML output.
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )
        classe = self._get_class(node)

        # NOTE(review): the class value is not quoted; a value with a space gives a broken
        # attribute. Kept as is because the output format may be relied upon elsewhere.
        html_text = f"<{tag} class={classe}>{inner_html_text}</{tag}>"
        tex_text = f"<{tag} class={classe}>{inner_tex_text}</{tag}>"
        xml_text = f'<{tag} xml:space="preserve">' + inner_jats_xml_text + f"</{tag}>"

        return html_text, tex_text, xml_text

    def _parse_table_cell(self, node, tag, kwargs):
        """Shared implementation for td and th: keeps class, rowspan and colspan."""
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )
        classe = rowspan = colspan = ""
        for name, value in self._iter_attributes(node):
            if name == "class":
                classe = value
            elif name == "rowspan":
                rowspan = value
            elif name == "colspan":
                colspan = value

        # The class attribute is written only when it is not empty.
        class_attr = f" class={classe}" if classe else ""
        opening = f"<{tag}{class_attr} rowspan='{rowspan}' colspan='{colspan}'>"
        html_text = f"{opening}{inner_html_text}</{tag}>"
        tex_text = f"{opening}{inner_tex_text}</{tag}>"
        xml_text = f'<{tag} xml:space="preserve">' + inner_jats_xml_text + f"</{tag}>"

        return html_text, tex_text, xml_text

    def parse_formula(self, node, **kwargs):
        """
        Convert a formula node (a span whose text is the TeX source).

        The next MathML string of self.mml_formulas, if any, is consumed and used as
        the HTML content and as an XML alternative.

        :param node: the span node
        :param kwargs: "display" (bool) adds the "display" class when no MathML is available
        :return: HTML text, TeX text, XML text
        """
        formula = node.text or ""
        display = kwargs.get("display", None)
        if formula.startswith("\\("):
            # Remove the \( \) delimiters
            formula = formula[2:-2]

        mml = ""
        if self.mml_formulas:
            mml = self.mml_formulas.pop(0) or ""

        # A formula alone in its paragraph becomes a displayed formula.
        # NOTE(review): the test looks at parent.tail, not node.tail, so a formula followed
        # by text inside the same <p> is also displayed - confirm this is intended.
        parent = node.getparent()
        is_inline = not (
            parent is not None and parent.tag == "p" and not parent.text and not parent.tail
        )

        if self.for_pcj_display:
            formula = rf"\({formula}\)"
        else:
            formula = f"${formula}$"

        if mml:
            html_text = f'<span class="mathjax-formula" title="{formula}">{mml}</span>'
        elif display:
            html_text = f'<span class="mathjax-formula display" title="{formula}">{formula}</span>'
        else:
            html_text = f'<span class="mathjax-formula" title="{formula}">{formula}</span>'
        tex_text = formula

        alternatives = f"{mml}<tex-math>{escape(formula)}</tex-math></alternatives>"
        if is_inline:
            xml_text = "<inline-formula><alternatives>" + alternatives + "</inline-formula>"
        else:
            prefix = '<table class="formula mathjax-formula"><tr><td class="formula-inner">'
            suffix = '</td><td class="formula-label"></td></tr></table>'
            html_text = prefix + html_text + suffix
            tex_text = prefix + tex_text + suffix

            xml_text = (
                '<disp-formula xml:space="preserve">\n<alternatives>'
                + alternatives
                + "</disp-formula>"
            )

        return html_text, tex_text, xml_text

    def parse_list(self, node, **kwargs):
        """
        Convert a ul/ol node. The HTML/TeX outputs keep the tag; the XML output is a
        <list> whose list-type is "simple" for ul and "number" otherwise.

        :return: HTML text, TeX text, XML text
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        list_type = "simple" if node.tag == "ul" else "number"
        xml_text = f'<list list-type="{list_type}">{inner_jats_xml_text}</list>'

        # The <list> is not wrapped in a <p>, even when its parent is not a <p>.
        html_text = f"<{node.tag}>{inner_html_text}</{node.tag}>"
        tex_text = f"<{node.tag}>{inner_tex_text}</{node.tag}>"

        return html_text, tex_text, xml_text

    def parse_node_inner(self, node, **kwargs):
        """
        Return the content of a node (its text and its children), without the node itself.
        Used by the parse_node_with_<tag> functions, which add the opening/closing tags.

        :param node: XML node
        :param kwargs: "escape" (default True): escape the text in the HTML/TeX outputs.
            The XML output is always escaped.
        :return: inner HTML text, inner TeX text, inner XML text
        """
        kwargs["is_top"] = False
        do_escape = kwargs.get("escape", True)

        html_parts = []
        tex_parts = []
        xml_parts = []

        text = node.text
        if text:
            if text[0] == "\n" and node.tag in ("list", "item"):
                text = text[1:]

            escaped_text = escape(text)
            displayed_text = escaped_text if do_escape else text
            xml_parts.append(escaped_text)
            html_parts.append(displayed_text)
            tex_parts.append(displayed_text)

        for child in node:
            child_html_text, child_tex_text, child_jats_xml_text = (
                self.parse_node_with_mixed_content(child, **kwargs)
            )
            html_parts.append(child_html_text)
            tex_parts.append(child_tex_text)
            xml_parts.append(child_jats_xml_text)

        return "".join(html_parts), "".join(tex_parts), "".join(xml_parts)

    def parse_node_with_a(self, node, **kwargs):
        """
        Convert a link. The XML output is an <ext-link>.
        Without href attribute, the inner TeX text is used as the href.
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        href = ""
        for name, value in self._iter_attributes(node):
            if name == "href":
                href = value

        if not href:
            href = inner_tex_text

        # NOTE(review): href is inserted as is in the attributes; a value with a bare "&"
        # or a '"' gives invalid markup - confirm whether it should be escaped.
        html_text = f'<a href="{href}">{inner_html_text}</a>'
        tex_text = f'<a href="{href}">{inner_tex_text}</a>'
        xml_text = f'<ext-link ext-link-type="uri" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="{href}">{inner_jats_xml_text}</ext-link>'

        return html_text, tex_text, xml_text

    def parse_node_with_br(self, node, **kwargs):
        """Convert a line break: <br/> in HTML/TeX, <break/> in XML."""
        html_text = tex_text = "<br/>"
        xml_text = "<break/>"

        return html_text, tex_text, xml_text

    def parse_node_with_colgroup(self, node, **kwargs):
        """Convert a colgroup; only its class is kept in HTML/TeX."""
        return self._parse_node_with_class(node, "colgroup", kwargs)

    def parse_node_with_col(self, node, **kwargs):
        """Convert a col; its class (if not empty) and its style are kept in HTML/TeX."""
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )
        classe = ""
        style = ""
        for name, value in self._iter_attributes(node):
            if name == "class":
                classe = value
            elif name == "style":
                style = value

        class_attr = f" class={classe}" if classe else ""
        opening = f"<col{class_attr} style='{style}'>"
        html_text = f"{opening}{inner_html_text}</col>"
        tex_text = f"{opening}{inner_tex_text}</col>"
        xml_text = '<col xml:space="preserve">' + inner_jats_xml_text + "</col>"

        return html_text, tex_text, xml_text

    def parse_node_with_div(self, node, **kwargs):
        """
        Convert a div. Its class comes from the "class" attribute or from
        "data-custom-style" (the last one found wins).
        The divs of class PCJ-Reference, and those of class PCJ-Section whose HTML content
        holds "References", are removed from the 3 outputs.
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )
        classe = ""
        for name, value in self._iter_attributes(node):
            if name == "class":
                classe = value
            # Next condition checks style identification with pandoc library used
            # for docx --> html conversion
            elif name == "data-custom-style":
                if value == "PCJ Equation":
                    classe = "mathjax-formula PCJ-Equation"
                else:
                    classe = value.replace(" ", "-")

        if classe == "PCJ-Reference" or (
            classe == "PCJ-Section" and "References" in inner_html_text
        ):
            return "", "", ""

        html_text = f"<div class='{classe}'>{inner_html_text}</div>"
        tex_text = f"<div class='{classe}'>{inner_tex_text}</div>"
        xml_text = '<div xml:space="preserve">' + inner_jats_xml_text + "</div>"

        return html_text, tex_text, xml_text

    def parse_node_with_em(self, node, **kwargs):
        """Convert an em: span.italique in HTML, <i> in TeX, <italic> in XML."""
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        html_text = f'<span class="italique">{inner_html_text}</span>'
        tex_text = f"<i>{inner_tex_text}</i>"
        xml_text = f"<italic>{inner_jats_xml_text}</italic>" if inner_jats_xml_text else "<italic/>"

        return html_text, tex_text, xml_text

    def parse_node_with_h1(self, node, **kwargs):
        """Convert a h1; only its class is kept in HTML/TeX."""
        return self._parse_node_with_class(node, "h1", kwargs)

    def parse_node_with_h2(self, node, **kwargs):
        """Convert a h2; only its class is kept in HTML/TeX."""
        return self._parse_node_with_class(node, "h2", kwargs)

    def parse_node_with_h3(self, node, **kwargs):
        """Convert a h3; only its class is kept in HTML/TeX."""
        return self._parse_node_with_class(node, "h3", kwargs)

    def parse_node_with_h4(self, node, **kwargs):
        """Convert a h4; only its class is kept in HTML/TeX."""
        return self._parse_node_with_class(node, "h4", kwargs)

    def parse_node_with_h5(self, node, **kwargs):
        """Convert a h5; only its class is kept in HTML/TeX."""
        return self._parse_node_with_class(node, "h5", kwargs)

    def parse_node_with_h6(self, node, **kwargs):
        """Convert a h6; only its class is kept in HTML/TeX."""
        return self._parse_node_with_class(node, "h6", kwargs)

    def parse_node_with_img(self, node, **kwargs):
        """
        Convert an image. The HTML/TeX src is rebuilt from settings.SITE_URL_PREFIX (if
        defined), issue_pid, pid and the basename of the original src.
        The XML output is a <graphic>.
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        prefix = getattr(settings, "SITE_URL_PREFIX", "")

        src = f"{prefix}/media/img/{self.issue_pid}/{self.pid}/src/media"
        # NOTE(review): href is never filled, so the <graphic> always gets an empty
        # xlink:href - confirm what the XML is expected to reference.
        href = ""
        classe = ""
        for name, value in self._iter_attributes(node):
            if name == "src":
                img = os.path.basename(value)
                stem, ext = os.path.splitext(img)
                # If an image was converted to jpg, pandoc still wrote the html with the
                # previous extension, '.tiff' for example
                if ext in (".tiff", ".tif"):
                    img = stem + ".jpg"
                src = f"{src}/{img}"
            elif name == "style":
                classe = "article-body-img"
            elif name == "data-custom-style":
                classe = value.replace(" ", "-")

        html_text = f"<img src={src} class={classe}>{inner_html_text}</img>"
        tex_text = f"<img src={src} class={classe}>{inner_tex_text}</img>"
        xml_text = f'<graphic xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="{href}">{inner_jats_xml_text}</graphic>'

        return html_text, tex_text, xml_text

    def parse_node_with_li(self, node, **kwargs):
        """
        Convert a list item. In HTML/TeX, the items that are not inside a ul get the
        "article-list" class. The XML output is a <list-item> with a single <p>.
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        parent_node = node.getparent()
        if parent_node is not None and parent_node.tag == "ul":
            opening = "<li >"
        else:
            opening = "<li class='article-list'>"

        html_text = f"{opening}{inner_html_text}</li>"
        tex_text = f"{opening}{inner_tex_text}</li>"
        xml_text = f"<list-item><p>{inner_jats_xml_text}</p></list-item>"

        return html_text, tex_text, xml_text

    def parse_node_with_mixed_content(self, node, **kwargs):
        """
        Parse and return the text of an XML node which mixes text and XML sub-nodes.
        Ex: <node>text1 <a>text_a</a> text2 <b>text_b</b>b_tail</node>
        Some inner nodes are removed, others are kept or replaced.

        The HTML, TeX and XML strings are built at the same time.
        The tail of the node (the text following its closing tag) is appended to the result.

        :param node: XML node (None gives 3 empty strings)
        :param kwargs: "is_top" (default True), "escape" (default True)
        :return: HTML text, TeX text, XML text
        """
        html_text = tex_text = jats_xml_text = ""

        if node is None:
            return html_text, tex_text, jats_xml_text

        kwargs.setdefault("is_top", True)

        # lxml replaces the entities in node.text and node.tail (like &lt;)
        # kwargs['escape'] allows to escape back the values
        kwargs.setdefault("escape", True)

        # I. Add the node itself.
        # Some tags have a corresponding parse_node_with_@tag function.
        tag = node.tag
        if isinstance(tag, str):
            fct_name = "parse_node_with_" + tag.replace("-", "_")
            # A <mixed-content> tag must not dispatch to this very function.
            ftor = None
            if fct_name != "parse_node_with_mixed_content":
                ftor = getattr(self, fct_name, None)

            if callable(ftor):
                html_text, tex_text, jats_xml_text = ftor(node, **kwargs)
            else:
                html_text, tex_text, jats_xml_text = self.parse_node_inner(node, **kwargs)
        # else: the tag is not a string (comment or processing instruction of a tree
        # given by the caller): its content is ignored, only its tail is kept.

        # II. Add the node's tail
        if node.tail:
            if self.check_citation:
                kwargs["escape"] = False

            escaped_tail = escape(node.tail)
            displayed_tail = escaped_tail if kwargs["escape"] else node.tail
            html_text += displayed_tail
            tex_text += displayed_tail
            jats_xml_text += escaped_tail

        return html_text, tex_text, jats_xml_text

    def parse_node_with_ol(self, node, **kwargs):
        """Convert an ordered list. See parse_list."""
        return self.parse_list(node, **kwargs)

    def parse_node_with_p(self, node, **kwargs):
        """Convert a paragraph. With ignore_p, only its content is returned."""
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        if self.ignore_p:
            return inner_html_text, inner_tex_text, inner_jats_xml_text

        html_text = f"<p>{inner_html_text}</p>"
        tex_text = f"<p>{inner_tex_text}</p>"
        if inner_jats_xml_text:
            xml_text = '<p xml:space="preserve">' + inner_jats_xml_text + "</p>"
        else:
            xml_text = '<p xml:space="preserve"/>'

        return html_text, tex_text, xml_text

    def parse_node_with_span(self, node, **kwargs):
        """
        Convert a span. The spans of class "mathjax-formula", "math inline" or
        "math display" are formulas (see parse_formula). The other spans are kept in
        HTML/TeX and replaced by their content in XML.
        """
        # The content is parsed first, even for a formula: a nested formula consumes
        # its MathML before the enclosing one.
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        the_class = node.get("class")
        display = the_class == "math display"
        if the_class in ("math inline", "math display"):
            the_class = "mathjax-formula"

        if the_class == "mathjax-formula":
            return self.parse_formula(node, display=display)

        opening = "<span>" if the_class is None else f'<span class="{the_class}">'
        html_text = f"{opening}{inner_html_text}</span>"
        tex_text = f"{opening}{inner_tex_text}</span>"
        xml_text = inner_jats_xml_text

        return html_text, tex_text, xml_text

    def parse_node_with_strong(self, node, **kwargs):
        """Convert a strong: kept in HTML/TeX, <bold> in XML."""
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        html_text = f"<strong>{inner_html_text}</strong>"
        tex_text = f"<strong>{inner_tex_text}</strong>"
        xml_text = f"<bold>{inner_jats_xml_text}</bold>" if inner_jats_xml_text else "<bold/>"

        return html_text, tex_text, xml_text

    def parse_node_with_sub(self, node, **kwargs):
        """Convert a sub: same tag in the 3 outputs."""
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        html_text = f"<sub>{inner_html_text}</sub>"
        tex_text = f"<sub>{inner_tex_text}</sub>"
        xml_text = f"<sub>{inner_jats_xml_text}</sub>"

        return html_text, tex_text, xml_text

    def parse_node_with_sup(self, node, **kwargs):
        """Convert a sup: same tag in the 3 outputs."""
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        html_text = f"<sup>{inner_html_text}</sup>"
        tex_text = f"<sup>{inner_tex_text}</sup>"
        xml_text = f"<sup>{inner_jats_xml_text}</sup>"

        return html_text, tex_text, xml_text

    def parse_node_with_table(self, node, **kwargs):
        """
        Convert a table. Its class comes from the "class" attribute or from
        "data-custom-style" (the last one found wins).
        When issue_pid contains "PCJ", the HTML/TeX table is wrapped in a div.PCJ-table.
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )
        classe = ""
        for name, value in self._iter_attributes(node):
            if name == "class":
                classe = value
            # Next condition checks style identification with pandoc library used
            # for docx --> html conversion
            elif name == "data-custom-style":
                classe = value.replace(" ", "-")

        html_text = f"<table class={classe}>{inner_html_text}</table>"
        tex_text = f"<table class={classe}>{inner_tex_text}</table>"
        # issue_pid is optional (None by default)
        if self.issue_pid and "PCJ" in self.issue_pid:
            html_text = f"<div class='PCJ-table'>{html_text}</div>"
            tex_text = f"<div class='PCJ-table'>{tex_text}</div>"

        xml_text = '<table xml:space="preserve">' + inner_jats_xml_text + "</table>"

        return html_text, tex_text, xml_text

    def parse_node_with_tbody(self, node, **kwargs):
        """Convert a tbody; only its class is kept in HTML/TeX."""
        return self._parse_node_with_class(node, "tbody", kwargs)

    def parse_node_with_td(self, node, **kwargs):
        """Convert a td; its class (if not empty), rowspan and colspan are kept in HTML/TeX."""
        return self._parse_table_cell(node, "td", kwargs)

    def parse_node_with_th(self, node, **kwargs):
        """Convert a th; its class (if not empty), rowspan and colspan are kept in HTML/TeX."""
        return self._parse_table_cell(node, "th", kwargs)

    def parse_node_with_tr(self, node, **kwargs):
        """Convert a tr. The HTML/TeX class is always empty."""
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )
        classe = ""

        html_text = f"<tr class='{classe}'>{inner_html_text}</tr>"
        tex_text = f"<tr class='{classe}'>{inner_tex_text}</tr>"
        xml_text = '<tr xml:space="preserve">' + inner_jats_xml_text + "</tr>"

        return html_text, tex_text, xml_text

    def parse_node_with_ul(self, node, **kwargs):
        """Convert an unordered list. See parse_list."""
        return self.parse_list(node, **kwargs)

    def parse_tree(self, tree):
        """
        Parse the whole tree and store the results in value_html, value_tex and value_xml.

        :param tree: root node
        """
        # The creation of inner links for the citations (check_citation/biblio) is
        # currently disabled.
        self.value_html, self.value_tex, self.value_xml = self.parse_node_with_mixed_content(
            tree, is_top=True
        )
if __name__ == "__main__":
    # Manual smoke test: parse a small CKEditor fragment (with an invalid "<" in the text,
    # a formula and two lists) and print the resulting XML.
    html_value = r'<p>Te<st <span class="mathjax-formula">\(x = {-b \pm \sqrt{b^2-4ac} \over 2a}\)</span> done</p><ul><li>Item</li></ul><ol><li>Item 1<br />New line</li><li> </li></ol>'
    # No MathML is available here: give an explicit empty list, the formula falls back to
    # its TeX source.
    parser = CkeditorParser(html_value=html_value, mml_formulas=[])
    result = parser.value_xml
    print(result)