Coverage for src/ptf/utils.py: 28%
205 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-05 09:56 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-05 09:56 +0000
1import difflib
2import html
3import os
4import re
5import subprocess
6import unicodedata
8from bleach.css_sanitizer import ALLOWED_CSS_PROPERTIES
9from bleach.css_sanitizer import CSSSanitizer
10from bleach.sanitizer import Cleaner
11from PIL import Image
12from PIL import ImageFile
14ImageFile.LOAD_TRUNCATED_IMAGES = True
16from django.conf import settings
17from django.core.exceptions import ImproperlyConfigured
18from django.core.mail import EmailMultiAlternatives
19from django.template import Template
20from django.template import TemplateSyntaxError
21from django.template import engines
22from django.template.loader import render_to_string
23from django.utils.html import strip_tags
24from django.utils.translation import gettext_lazy as _
26from ptf.site_register import SITE_REGISTER
# Matches any tag-like span (``<...>``) or a named / decimal / hexadecimal
# character reference.  Case-insensitive so that entities such as ``&Eacute;``
# or ``&#XA0;`` are stripped too.
_MARKUP_RE = re.compile(
    r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});",
    re.IGNORECASE,
)


def strip_markup(string):
    """
    Strip from the given string:
        - XML markup (MathML, HTML, etc.)
        - HTML entities (``&nbsp;``, ``&#160;``, ``&#xA0;``, etc.)

    :param string: the text to clean
    :return: the text with every tag and character reference removed
    """
    return _MARKUP_RE.sub("", string)
def highlight_diff(ours, theirs):
    """
    Compare two strings (lower-cased, markup stripped) and return the
    normalised text of `theirs` decorated with HTML spans:
        - parts equal to `ours` are wrapped in a ``bg-success`` span
        - parts replacing something in `ours` are wrapped in a ``bg-dark`` span
        - anything else (pure insertions) is emitted as is
    """
    reference = strip_markup(ours.lower())
    candidate = strip_markup(theirs.lower())
    matcher = difflib.SequenceMatcher(None, reference, candidate)

    css_class_by_tag = {"equal": "bg-success", "replace": "bg-dark"}

    fragments = []
    for tag, _i1, _i2, j1, j2 in matcher.get_opcodes():
        chunk = candidate[j1:j2]
        css_class = css_class_by_tag.get(tag)
        if css_class is None:
            fragments.append(chunk)
        else:
            fragments.append(f"<span class='{css_class}'>{chunk}</span>")
    return "".join(fragments)
def volume_display():
    """
    Return the label used for a volume: the literal "Volume" when
    `settings.VOLUME_STRING` is truthy, the translated "Tome" otherwise.
    """
    return "Volume" if settings.VOLUME_STRING else _("Tome")
def execute_cmd(cmd, force_execute=False):
    """
    Execute a shell command.

    The command is only run when `settings.MERSENNE_CREATE_FRONTPAGE` is set
    or when `force_execute` is True. Otherwise (debug mode) it is merely
    appended to `cmd.log` inside `settings.LOG_DIR`.

    @param cmd: str which represents the shell command
    @param force_execute: run the command whatever the settings say
    @return: the output of the command when it is executed,
             the tuple `(0, cmd)` when it is only logged
    """
    dry_run = not (settings.MERSENNE_CREATE_FRONTPAGE or force_execute)
    if dry_run:
        # Debug mode: keep a trace of the command instead of running it
        log_path = os.path.join(settings.LOG_DIR, "cmd.log")
        with open(log_path, "a", encoding="utf-8") as log_file:
            log_file.write(f"cmd : {cmd}\n")
        return 0, cmd

    return subprocess.check_output(cmd, shell=True)
def get_file_content_in_utf8(filename):
    """
    Read a whole text file, decoding it as UTF-8.

    :param filename: path of the file to read
    :return: the body of the file as a string
    """
    with open(filename, encoding="utf-8") as stream:
        return stream.read()
def pdf_to_text(pdf_filename):
    """
    Extract the full text of a PDF with the `pdftotext` command line tool.

    Nothing is extracted (an empty string is returned) when
    `settings.MERSENNE_CREATE_FRONTPAGE` is falsy.

    :param pdf_filename: path of the PDF file
    :return: the extracted text, without any Unicode control character
    :raises RuntimeError: if the text file is missing after the conversion
    """
    # Local import: keeps this block self-contained.
    import shlex

    if not settings.MERSENNE_CREATE_FRONTPAGE:
        return ""

    os.makedirs(settings.MERSENNE_TMP_FOLDER, exist_ok=True)
    txt_filename = os.path.join(settings.MERSENNE_TMP_FOLDER, "fulltext.txt")

    # The command goes through a shell: quote both paths so that spaces or
    # shell metacharacters in a file name cannot break (or hijack) the command.
    cmd_str = (
        "pdftotext -raw -nopgbrk -enc UTF-8 "
        f"{shlex.quote(os.fspath(pdf_filename))} {shlex.quote(txt_filename)}"
    )
    execute_cmd(cmd_str)

    # Check if the output file has been created
    if not os.path.isfile(txt_filename):
        raise RuntimeError("The PDF file was not converted by pdftotext")

    body = get_file_content_in_utf8(txt_filename)
    # Strip control characters (every Unicode category starting with "C")
    return "".join(ch for ch in body if unicodedata.category(ch)[0] != "C")
def linearize_pdf(from_path, to_path):
    """
    Linearize a PDF with the `qpdf` command line tool.

    A failure reported by `qpdf` is ignored as long as the output file
    exists; it is re-raised otherwise.

    :param from_path: path of the PDF to linearize
    :param to_path: path of the linearized PDF to create
    :return: always False (the `do_copy` flag of the original implementation)
    """
    # Local import: keeps this block self-contained.
    import shlex

    # The command goes through a shell: quote both paths so that spaces or
    # shell metacharacters in a file name cannot break (or hijack) the command.
    cmd_str = (
        f"qpdf --linearize {shlex.quote(os.fspath(from_path))} "
        f"{shlex.quote(os.fspath(to_path))}"
    )

    try:
        subprocess.check_output(cmd_str, shell=True)
    except Exception:
        # Only a missing output file is considered a real failure.
        if not os.path.isfile(to_path):
            raise

    return False
def get_display_name(prefix, first_name, last_name, suffix, string_name):
    """
    Build the name to display for a person.

    - With `settings.DISPLAY_FIRST_NAME_FIRST` set: "first_name last_name".
    - Otherwise: "[prefix ]last_name, first_name[ suffix]".
    - When neither a first name nor a last name is usable (None, empty or
      whitespace only), the provided `string_name` is returned unchanged.

    :param prefix: optional prefix put before the name
    :param first_name: first name (may be empty or None)
    :param last_name: last name (may be empty or None)
    :param suffix: optional suffix put after the name
    :param string_name: fallback used when there is no first/last name
    :return: the display name
    """
    display_first_name_first = getattr(settings, "DISPLAY_FIRST_NAME_FIRST", False)

    # `part and ...` guards against None, which has no `.strip()`
    name_parts = [part for part in (last_name, first_name) if part and part.strip()]
    if not name_parts:
        return string_name

    if display_first_name_first:
        return " ".join(reversed(name_parts))

    display_name = f"{prefix} " if prefix else ""
    display_name += ", ".join(name_parts)
    display_name += f" {suffix}" if suffix else ""
    return display_name
def ckeditor_input_sanitizer(html: str, allow_img: bool = False) -> str:
    """
    Sanitizes HTML input from the CKEditor.
    It uses bleach library (https://bleach.readthedocs.io/en/latest/index.html), an allowed-list-based sanitizer.
    JavaScript is removed by allowing only a subset of HTML tags and attributes.
    It does not make use of `lxml.html.Cleaner` because the documentation clearly says that this is not a secure
    approach.

    Tags that are not allowed are stripped (not escaped); their content is kept.

    html: str
        The HTML string to sanitize.
    allow_img : bool
        Whether to preserve img related tags

    Returns the sanitized HTML string.
    """
    # NOTE(review): "sub" is allowed but "sup" is not - confirm this is intended.
    allowed_tags = [
        "a",
        "abbr",
        "acronym",
        "address",
        "aside",
        "b",
        "bdi",
        "bdo",
        "blockquote",
        "br",
        "caption",
        "cite",
        "code",
        "dd",
        "del",
        "dfn",
        "div",
        "dl",
        "dt",
        "em",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "hgroup",
        "hr",
        "i",
        "ins",
        "kbd",
        "li",
        "mark",
        "ol",
        "p",
        "pre",
        "q",
        "s",
        "samp",
        "small",
        "span",
        "strike",
        "strong",
        "sub",
        "table",
        "tbody",
        "td",
        "th",
        "thead",
        "tr",
        "u",
        "ul",
        "var",
    ]
    image_allowed_tags = ["figcaption", "figure", "img", "picture", "source"]
    if allow_img:
        allowed_tags = allowed_tags + image_allowed_tags

    allowed_attributes = {
        "*": ["class", "dir", "style", "id", "name"],
        "a": ["href", "target"],
        "img": ["alt", "height", "src", "width"],
        "source": ["type", "src"],
        # Those table attributes are deprecated but they are still used by CKEditor 4
        # We might consider upgrading/migrating to CKEditor 5 at some point
        "table": ["align", "border", "cellspacing", "cellpadding"],
        "th": ["scope"],
    }

    # CSS properties accepted inside `style` attributes, on top of bleach's defaults
    additional_css_properties = {
        "border",
        "margin",
        "margin-left",
        "margin-right",
        "margin-top",
        "margin-bottom",
        "padding",
        "padding-left",
        "padding-right",
        "padding-top",
        "padding-bottom",
    }
    allowed_css_properties = ALLOWED_CSS_PROPERTIES | additional_css_properties

    css_sanitizer = CSSSanitizer(allowed_css_properties=allowed_css_properties)
    cleaner = Cleaner(
        tags=allowed_tags, attributes=allowed_attributes, css_sanitizer=css_sanitizer, strip=True
    )
    return cleaner.clean(html)
def send_email(
    html_content: str,
    subject: str,
    to: list[str] | tuple[str],
    from_email: str | None = None,
    cc: list[str] | tuple[str] = (),
    from_collection: str = "",
    reply_to: list[str] | tuple[str] = (),
) -> None:
    """
    Sends an e-mail to the provided recipients and copy recipients with the provided html content.
    It sends the e-mail with both a text and a HTML alternative.
    If not provided, the sender's e-mail address default to `settings.DEFAULT_FROM_EMAIL`
    Params:
        - html_content      The HTML content of the e-mail
        - subject           The e-mail's subject
        - from_email        The sender's e-mail address
        - to                The list or tuple of the e-mail recipients
        - cc                The list or tuple of the e-mail CC
        - from_collection:  The collection to send the mail for. If from_email is None
                            or empty, it will get the email from site_register.py
                            (`email_from`).
        - reply_to          The list or tuple of the "Reply-To" addresses
    Raises:
        - ImproperlyConfigured  if `from_collection` is given but has no
                                `email_from` entry in site_register.py
    """
    # `not from_email` covers both the None default and an empty string
    if not from_email:
        try:
            from_email = SITE_REGISTER[from_collection.lower()]["email_from"]
        except (KeyError, ValueError) as exc:
            if from_collection:
                raise ImproperlyConfigured(
                    f"The collection {from_collection.lower()} is missing the "
                    "email_from property in site_register.py"
                ) from exc

    # We additionally unescape HTML characters here to avoid having stuff like
    # &gt; etc. in the output text.
    text_content = html.unescape(strip_tags(html_content))
    # Create the email, and attach the HTML version as well.
    return_path = getattr(settings, "RETURN_PATH", "no-reply@listes.mathdoc.fr")
    msg = EmailMultiAlternatives(
        subject=subject,
        body=text_content,
        from_email=from_email,
        to=to,
        cc=cc,
        headers={"Return-path": return_path},
        reply_to=reply_to,
    )
    msg.attach_alternative(html_content, "text/html")
    msg.send(fail_silently=False)
def send_email_from_template(
    template: str,
    context_data: dict,
    subject: str,
    to: list[str] | tuple[str],
    from_email: str = "",
    cc: list[str] | tuple[str] = (),
    from_collection: str = "",
) -> None:
    """
    Renders the provided template and sends it as an e-mail to the
    provided recipients and copy recipients.
    It sends the e-mail with both a text and a HTML alternative.
    If not provided, the sender's e-mail address default to `settings.DEFAULT_FROM_EMAIL`
    Params:
        - template          The HTML template of the e-mail
        - context_data      The context data used to render the template
        - subject           The e-mail's subject
        - from_email        The sender's e-mail address
        - to                The list or tuple of the e-mail recipients
        - cc                The list or tuple of the e-mail CC
        - from_collection:  The collection to send the mail for. If from_email is
                            empty (or None) it will get the email from
                            site_register.py (`email_from`).
    Raises:
        - ImproperlyConfigured  if `from_collection` is given but has no
                                `email_from` entry in site_register.py
    """
    # `not from_email` covers both the "" default and an explicit None
    if not from_email:
        try:
            from_email = SITE_REGISTER[from_collection.lower()]["email_from"]
        except (KeyError, ValueError) as exc:
            if from_collection:
                raise ImproperlyConfigured(
                    f"The collection {from_collection.lower()} is missing the "
                    "email_from property in site_register.py"
                ) from exc

    html_content = render_to_string(template, context_data)
    send_email(
        html_content, subject, to=to, from_email=from_email, cc=cc, from_collection=from_collection
    )
def template_from_string(template_string, using=None) -> Template:
    """
    Convert a string into a template object using a given template engine
    or using the default backends from `settings.TEMPLATES` if no engine was specified.

    The first engine able to compile the string wins. If none can, a
    `TemplateSyntaxError` carrying the template string is raised, chained to
    the last engine error so that the actual syntax problem stays visible.
    """
    # This function is based on django.template.loader.get_template,
    # but uses Engine.from_string instead of Engine.get_template.
    engine_list = engines.all() if using is None else [engines[using]]

    last_error = None
    for engine in engine_list:
        try:
            return engine.from_string(template_string)
        except TemplateSyntaxError as exc:
            # Remember the failure and give the next engine a chance
            last_error = exc

    raise TemplateSyntaxError(template_string) from last_error
def resize_image(img, max_size=1600):
    """
    Shrink an image whose width exceeds `max_size` so that its width becomes
    `max_size`, keeping the width/height ratio.

    Images that are not wider than `max_size` are returned untouched.
    """
    if img.width <= max_size:
        return img

    aspect_ratio = img.width / img.height
    target_size = (max_size, int(max_size / aspect_ratio))
    return img.resize(target_size)
def convert_tiff_to_jpg(img):
    """
    Save a copy of an opened image as a '.jpg' file, next to the file the
    image was loaded from (same directory, same base name).

    RGBA images are converted to RGB before being saved.
    """
    directory, file_name = os.path.split(img.filename)
    stem = os.path.splitext(file_name)[0]
    jpg_path = os.path.join(directory, stem + ".jpg")

    if img.mode == "RGBA":
        img = img.convert("RGB")

    img.thumbnail(img.size)
    img.save(jpg_path, "JPEG", quality=100)
# Expose the two helpers above as methods of every PIL image,
# so that they can be called as `img.resize_image()` / `img.convert_tiff_to_jpg()`.
setattr(Image.Image, "resize_image", resize_image)
setattr(Image.Image, "convert_tiff_to_jpg", convert_tiff_to_jpg)
def convert_tiff_to_jpg_from_path(image_path):
    """
    Take the path of a '.tiff' image and convert the image to a '.jpg' one.

    The '.jpg' file is written next to the original file, with the same base name.

    :param image_path: path of the image to convert
    """
    # `convert_tiff_to_jpg` already writes `<dir>/<name>.jpg` (converting RGBA
    # to RGB first). Saving the untouched image a second time to that same
    # path was redundant, and failed for RGBA images, which JPEG cannot store.
    # The context manager closes the file even if the conversion fails.
    with Image.open(image_path) as img:
        convert_tiff_to_jpg(img)
def resize_image_from_path(image_path):
    """
    Resize in place the image stored at `image_path` (see `resize_image`).

    The image is always saved back to `image_path`, even when it did not
    need to be resized.

    :param image_path: path of the image to resize
    """
    # The context manager closes the opened file in every case;
    # `resize_image` may return a brand new image that must be closed as well.
    with Image.open(image_path) as img:
        resized = resize_image(img)
        try:
            resized.save(image_path, quality=100)
        finally:
            if resized is not img:
                resized.close()
def convert_image_for_web(image_path):
    """
    Prepare an image for the web.

    - '.tiff' / '.tif' images get a '.jpg' copy (see `convert_tiff_to_jpg`).
    - '.wmf' / '.emf' images are left alone.
    - Any other image (and TIFF ones) goes through `resize_image`.

    :param image_path: path of the image to process
    """
    image_file = os.path.basename(image_path)
    extension = os.path.splitext(image_file)[1]

    # The context manager closes the file on every path, including the
    # early return below.
    with Image.open(image_path) as img:
        # NOTE(review): the extension check is case sensitive (".TIFF" is not
        # converted) - confirm this is intended.
        if extension in (".tiff", ".tif"):
            convert_tiff_to_jpg(img)
        elif extension in (".wmf", ".emf"):
            return

        # NOTE(review): the resized image is never saved, so the file on disk
        # is left untouched - confirm whether a save is missing here.
        resized = resize_image(img)
        if resized is not img:
            resized.close()
def create_citation_link_and_new_html(key, label, tooltip_html=""):
    """
    Build the HTML of an in-text citation: an anchor pointing to the
    reference `#r{label}` (highlighted on click), followed by the optional
    tooltip, both wrapped in a `tooltipPCJ` span.
    """
    anchor = (
        f'<a id="{label}" href="#r{label}" '
        f"onclick=\"highlightReference('r{label}', 3000, 500)\" >{key}</a>"
    )
    return f'<span class="tooltipPCJ">{anchor}{tooltip_html}</span>'
def create_innerlink_for_citation(html_text, biblio):
    """
    Turn the citations found in an HTML text into links to their reference.

    For every entry of `biblio`, the citation key is searched in the text
    under several spellings, tried in this order; only the first spelling
    found is replaced (all its occurrences):
        1. the key itself
        2. "&" written as the HTML entity "&amp;"
        3. "&" written "and"
        4. "&" written "and", commas removed
        5. commas removed
        6. "&" written "&amp;", commas removed

    :param html_text: the HTML text containing the citations
    :param biblio: dict mapping a citation key to a dict with
                   a "label" and a "reference" entry
    :return: the HTML text (new lines replaced by spaces) with the links
    """
    html_text = html_text.replace("\n", " ")

    for key, value in biblio.items():
        label = value["label"].replace("[", "").replace("]", "")
        reference = value["reference"]
        tooltip_html = (
            '<span style="position: absolute; visibility: hidden" class="tooltip tooltiptexthidden">'
            + f"{reference}"
            + "</span>"
        )

        escaped_key = key.replace("&", "&amp;")
        spelled_key = key.replace("&", "and")
        candidates = (
            key,
            escaped_key,
            spelled_key,
            spelled_key.replace(",", ""),
            key.replace(",", ""),
            escaped_key.replace(",", ""),
        )

        # dict.fromkeys drops the duplicated spellings while keeping the order
        for candidate in dict.fromkeys(candidates):
            if candidate in html_text:
                new_html = create_citation_link_and_new_html(
                    candidate, label, tooltip_html=tooltip_html
                )
                html_text = html_text.replace(candidate, new_html)
                break

    return html_text