Coverage for src/ptf/utils.py: 28%

1import difflib

2import html

3import os

4import re

5import subprocess

6import unicodedata

8from bleach.css_sanitizer import ALLOWED_CSS_PROPERTIES

9from bleach.css_sanitizer import CSSSanitizer

10from bleach.sanitizer import Cleaner

11from PIL import Image

12from PIL import ImageFile

14ImageFile.LOAD_TRUNCATED_IMAGES = True

16from django.conf import settings

17from django.core.exceptions import ImproperlyConfigured

18from django.core.mail import EmailMultiAlternatives

19from django.template import Template

20from django.template import TemplateSyntaxError

21from django.template import engines

22from django.template.loader import render_to_string

23from django.utils.html import strip_tags

24from django.utils.translation import gettext_lazy as _

26from ptf.site_register import SITE_REGISTER

def strip_markup(string):
    """
    Remove markup from ``string``.

    Two kinds of fragments are deleted:
    - tags: anything between a ``<`` and the next ``>`` (MathML, HTML, etc.);
      ``.`` does not match a line break, so a tag has to sit on a single line
    - character references: ``&name;``, ``&#123;`` or ``&#x1f;`` (lowercase
      letters and digits only, the trailing ``;`` is required)
    """
    markup_pattern = re.compile("<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
    return markup_pattern.sub("", string)

def highlight_diff(ours, theirs):
    """
    Compare ``ours`` with ``theirs`` and rebuild the text of ``theirs`` opcode by opcode.

    Both inputs are lowercased and passed through ``strip_markup`` before being
    handed to ``difflib.SequenceMatcher``. Only the second sequence (``theirs``)
    is emitted in the result.
    """
    matcher = difflib.SequenceMatcher(
        None, strip_markup(ours.lower()), strip_markup(theirs.lower())
    )

    def render_opcode(tag, i1, i2, j1, j2):
        # Slice of the "theirs" sequence covered by this opcode.
        fragment = matcher.b[j1:j2]
        # NOTE(review): as written, every branch yields the same text. The "equal"
        # and "replace" branches look like they were meant to wrap the fragment in
        # some markup - confirm against the intended output.
        if tag == "equal":
            return f"{fragment}"
        if tag == "replace":
            return f"{fragment}"
        return fragment

    return "".join(render_opcode(*opcode) for opcode in matcher.get_opcodes())

def volume_display():
    """
    Return the word used to designate a volume.

    "Volume" when ``settings.VOLUME_STRING`` is truthy, otherwise the lazily
    translated string "Tome".
    """
    return "Volume" if settings.VOLUME_STRING else _("Tome")

def execute_cmd(cmd, force_execute=False):
    """
    Execute a shell command, or only log it when execution is disabled.

    The command is run when ``settings.MERSENNE_CREATE_FRONTPAGE`` is truthy or
    when ``force_execute`` is set. Otherwise it is appended to ``cmd.log`` in
    ``settings.LOG_DIR`` and nothing is executed.

    @param cmd: str which represents the shell command
    @param force_execute: run the command regardless of the setting
    @return: the output of the command as returned by ``subprocess.check_output``
             when it is executed; the tuple ``(0, cmd)`` when it is only logged
    """
    should_run = settings.MERSENNE_CREATE_FRONTPAGE or force_execute
    if not should_run:
        # Debug mode: keep a trace of the command instead of running it.
        log_path = os.path.join(settings.LOG_DIR, "cmd.log")
        with open(log_path, "a", encoding="utf-8") as log_file:
            log_file.write(f"cmd : {cmd}\n")
        return 0, cmd

    return subprocess.check_output(cmd, shell=True)

def get_file_content_in_utf8(filename):
    """
    Read a whole file as UTF-8 text.

    :param filename: path of the file to read
    :return: the full content of the file as a string
    """
    with open(filename, encoding="utf-8") as utf8_file:
        return utf8_file.read()

def pdf_to_text(pdf_filename):
    """
    Extract the full text of a PDF with the ``pdftotext`` command line tool.

    Nothing is extracted when ``settings.MERSENNE_CREATE_FRONTPAGE`` is falsy.
    Otherwise the text is written to ``fulltext.txt`` inside
    ``settings.MERSENNE_TMP_FOLDER`` (created if needed) and read back.

    :param pdf_filename: path (str) of the PDF file to convert
    :return: the extracted text without any character whose Unicode category
        starts with "C" (this includes line breaks and tabs), or "" when the
        extraction is disabled
    :raises RuntimeError: if the output text file does not exist after the command
    """
    import shlex

    if not settings.MERSENNE_CREATE_FRONTPAGE:
        return ""

    os.makedirs(settings.MERSENNE_TMP_FOLDER, exist_ok=True)
    txt_filename = os.path.join(settings.MERSENNE_TMP_FOLDER, "fulltext.txt")

    # The command string is interpreted by a shell: quote both paths so that
    # spaces or shell metacharacters in a file name can neither split nor alter
    # the command. Paths made of safe characters only are left untouched.
    cmd_str = (
        "pdftotext -raw -nopgbrk -enc UTF-8 "
        + shlex.quote(pdf_filename)
        + " "
        + shlex.quote(txt_filename)
    )
    execute_cmd(cmd_str)

    # Check that the output file has been created
    if not os.path.isfile(txt_filename):
        raise RuntimeError("The PDF file was not converted by pdftotext")

    body = get_file_content_in_utf8(txt_filename)
    # Strip control characters (every category starting with "C")
    return "".join(ch for ch in body if unicodedata.category(ch)[0] != "C")

112

113

def linearize_pdf(from_path, to_path):
    """
    Linearize a PDF with the ``qpdf`` command line tool.

    :param from_path: path (str) of the PDF to linearize
    :param to_path: path (str) of the linearized PDF to write
    :return: always ``False``
    :raises Exception: the error raised while running the command is propagated
        only when ``to_path`` does not exist afterwards
    """
    import shlex

    # The command string is interpreted by a shell: quote both paths so that
    # spaces or shell metacharacters in a file name can neither split nor alter
    # the command.
    cmd_str = "qpdf --linearize " + shlex.quote(from_path) + " " + shlex.quote(to_path)

    try:
        subprocess.check_output(cmd_str, shell=True)
    except Exception:
        # A failure is tolerated as long as the output file exists.
        # NOTE(review): presumably because qpdf can exit with a non-zero status
        # on mere warnings - confirm.
        if not os.path.isfile(to_path):
            raise

    return False

127

128

def get_display_name(prefix, first_name, last_name, suffix, string_name):
    """
    Build the name to display for a person.

    When neither ``last_name`` nor ``first_name`` is set, ``string_name`` is
    returned unchanged. Otherwise the result depends on the optional
    ``DISPLAY_FIRST_NAME_FIRST`` setting (False when absent):
    - truthy: "first_name last_name" (prefix and suffix are not used)
    - falsy: "prefix last_name, first_name suffix"

    Name parts that are empty, ``None`` or made of whitespace only are left out.

    :return: the display name as a string
    """
    display_first_name_first = getattr(settings, "DISPLAY_FIRST_NAME_FIRST", False)

    if not (last_name or first_name):
        return string_name

    # `x and ...` guards against a part being None, which has no strip()
    name_parts = [x for x in (last_name, first_name) if x and x.strip()]

    if display_first_name_first:
        return " ".join(reversed(name_parts))

    string_name = f"{prefix} " if prefix else ""
    string_name += ", ".join(name_parts)
    string_name += f" {suffix}" if suffix else ""
    return string_name

141

142

def ckeditor_input_sanitizer(html: str, allow_img: bool = False) -> str:
    """
    Sanitizes HTML input from the CKEditor.
    It uses bleach library (https://bleach.readthedocs.io/en/latest/index.html), an allowed-list-based sanitizer.
    JavaScript is removed by allowing only a subset of HTML tags and attributes.
    It does not make use of `lxml.html.Cleaner` because the documentation clearly says that this is not a secure
    approach.

    Tags that are not allowed are stripped (`strip=True`) rather than escaped.

    html: str
        The HTML string to sanitize. Inside this function the name shadows the
        `html` module imported at the top of the file.
    allow_img : bool
        Whether to preserve img related tags
    """
    # NOTE(review): "sub" is allowed but "sup" is not - confirm this is intended.
    base_tags = [
        "a",
        "abbr",
        "acronym",
        "address",
        "aside",
        "b",
        "bdi",
        "bdo",
        "blockquote",
        "br",
        "caption",
        "cite",
        "code",
        "dd",
        "del",
        "dfn",
        "div",
        "dl",
        "dt",
        "em",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "hgroup",
        "hr",
        "i",
        "ins",
        "kbd",
        "li",
        "mark",
        "ol",
        "p",
        "pre",
        "q",
        "s",
        "samp",
        "small",
        "span",
        "strike",
        "strong",
        "sub",
        "table",
        "tbody",
        "td",
        "th",
        "thead",
        "tr",
        "u",
        "ul",
        "var",
    ]
    image_allowed_tags = ["figcaption", "figure", "img", "picture", "source"]
    allowed_tags = base_tags + image_allowed_tags if allow_img else base_tags

    allowed_attributes = {
        "*": ["class", "dir", "style", "id", "name"],
        "a": ["href", "target"],
        "img": ["alt", "height", "src", "width"],
        "source": ["type", "src"],
        # Those table attributes are deprecated but they are still used by CKEditor 4
        # We might consider upgrading/migrating to CKEditor 5 at some point
        "table": ["align", "border", "cellspacing", "cellpadding"],
        "th": ["scope"],
    }

    # CSS properties accepted in `style` attributes on top of bleach's defaults
    additional_css_properties = [
        "border",
        "margin",
        "margin-left",
        "margin-right",
        "margin-top",
        "margin-bottom",
        "padding",
        "padding-left",
        "padding-right",
        "padding-top",
        "padding-bottom",
    ]
    allowed_css_properties = ALLOWED_CSS_PROPERTIES | set(additional_css_properties)

    css_sanitizer = CSSSanitizer(allowed_css_properties=allowed_css_properties)
    cleaner = Cleaner(
        tags=allowed_tags, attributes=allowed_attributes, css_sanitizer=css_sanitizer, strip=True
    )
    return cleaner.clean(html)

249

250

def send_email(
    html_content: str,
    subject: str,
    to: list[str] | tuple[str, ...],
    from_email: str | None = None,
    cc: list[str] | tuple[str, ...] | None = None,
    from_collection: str = "",
    reply_to: list[str] | tuple[str, ...] | None = None,
) -> None:
    """
    Sends an e-mail to the provided recipients and copy recipients with the provided html content.
    It sends the e-mail with both a text and a HTML alternative.
    If not provided, the sender's e-mail address default to `settings.DEFAULT_FROM_EMAIL`
    Params:
        - html_content      The HTML content of the e-mail
        - subject           The e-mail's subject
        - from_email        The sender's e-mail address
        - to                The list or tuple of the e-mail recipients
        - cc                The list or tuple of the e-mail CC
        - from_collection:  The collection to send the mail for. If from_email is None
                            or empty, it will get the email from site_register.py
                            (`email_from`).
        - reply_to          The list or tuple of the Reply-To addresses
    Raises:
        - ImproperlyConfigured  when `from_collection` is given, no sender is
                                provided and the collection has no `email_from`
                                entry in site_register.py
    """
    # `from_email` defaults to None: treat None like the empty string so that
    # the documented fallback on the collection's address actually applies.
    if not from_email:
        try:
            from_email = SITE_REGISTER[from_collection.lower()]["email_from"]
        except (KeyError, ValueError) as err:
            # Without a collection the sender is simply left unset.
            if from_collection:
                raise ImproperlyConfigured(
                    f"The collection {from_collection.lower()} is missing the "
                    "email_from property in site_register.py"
                ) from err

    # We additionally unescape HTML entities here to avoid having them
    # verbatim in the output text.
    text_content = html.unescape(strip_tags(html_content))
    # Create the email, and attach the HTML version as well.
    return_path = getattr(settings, "RETURN_PATH", "no-reply@listes.mathdoc.fr")
    msg = EmailMultiAlternatives(
        subject=subject,
        body=text_content,
        from_email=from_email,
        to=to,
        cc=cc if cc is not None else [],
        headers={"Return-path": return_path},
        reply_to=reply_to if reply_to is not None else [],
    )
    msg.attach_alternative(html_content, "text/html")
    msg.send(fail_silently=False)

299

300

def send_email_from_template(
    template: str,
    context_data: dict,
    subject: str,
    to: list[str] | tuple[str],
    from_email: str = "",
    cc: list[str] | tuple[str] = [],
    from_collection: str = "",
) -> None:
    """
    Renders the provided template and sends the result as an e-mail, through `send_email`,
    to the provided recipients and copy recipients.
    Params:
        - template          The HTML template of the e-mail
        - context_data      The context data used to render the template
        - subject           The e-mail's subject
        - from_email        The sender's e-mail address
        - to                The list or tuple of the e-mail recipients
        - cc                The list or tuple of the e-mail CC
        - from_collection:  The collection to send the mail for. If from_email is the
                            empty string, the sender is read from site_register.py
                            (`email_from`).
    Raises:
        - ImproperlyConfigured  when `from_collection` is given, `from_email` is empty
                                and the collection has no `email_from` entry
    """
    if from_email == "":
        collection_key = from_collection.lower()
        try:
            from_email = SITE_REGISTER[collection_key]["email_from"]
        except (KeyError, ValueError):
            # Without a collection the sender is simply left empty.
            if from_collection:
                raise ImproperlyConfigured(
                    f"The collection {collection_key} is missing the "
                    "email_from property in site_register.py"
                )

    send_email(
        render_to_string(template, context_data),
        subject,
        to=to,
        from_email=from_email,
        cc=cc,
        from_collection=from_collection,
    )

339

340

def template_from_string(template_string, using=None) -> Template:
    """
    Build a template object from a string.

    The engine named ``using`` is used when given; otherwise every configured
    engine is tried in turn and the first one able to compile the string wins.
    Raises ``TemplateSyntaxError`` when no engine accepts the string.
    """
    # Modelled on django.template.loader.get_template, with Engine.from_string
    # in place of Engine.get_template.
    candidate_engines = [engines[using]] if using is not None else engines.all()
    for engine in candidate_engines:
        try:
            compiled = engine.from_string(template_string)
        except TemplateSyntaxError:
            continue
        else:
            return compiled
    raise TemplateSyntaxError(template_string)

355

356

def resize_image(img, max_size=1600):
    """
    Shrink an image whose width exceeds ``max_size``, keeping its aspect ratio.

    Returns the result of ``img.resize`` with a width of ``max_size`` when the
    image is too wide, and ``img`` itself otherwise.
    """
    if img.width <= max_size:
        return img

    ratio = img.width / img.height
    target_size = (max_size, int(max_size / ratio))
    return img.resize(target_size)

367

368

def convert_tiff_to_jpg(img):
    """
    Save a JPEG copy of an opened image next to its source file.

    The target is the image's own ``filename`` with its extension replaced by
    '.jpg'. An image in "RGBA" mode is converted to "RGB" before being saved.
    Nothing is returned.
    """
    directory, file_name = os.path.split(img.filename)
    stem = os.path.splitext(file_name)[0]
    jpg_path = os.path.join(directory, stem + ".jpg")

    if img.mode == "RGBA":
        img = img.convert("RGB")
    # Thumbnail requested at the image's current size
    img.thumbnail(img.size)
    img.save(jpg_path, "JPEG", quality=100)

378

379

# Attach both helpers to PIL's Image class so that they can be called as
# methods on any image: `img.resize_image()` and `img.convert_tiff_to_jpg()`.
Image.Image.resize_image = resize_image
Image.Image.convert_tiff_to_jpg = convert_tiff_to_jpg

382

383

def convert_tiff_to_jpg_from_path(image_path):
    """
    Take the path of a '.tiff' image and convert the image to a '.jpg' one.

    The JPEG is written next to the source file, under the same name with a
    '.jpg' extension, by the `convert_tiff_to_jpg` method attached to PIL images
    in this module. The source image is closed even if the conversion fails.
    """
    # The method saves the JPEG itself, after converting RGBA images to RGB.
    # Saving the image a second time from here would write the unconverted
    # image to that same path.
    with Image.open(image_path) as img:
        img.convert_tiff_to_jpg()

398

399

def resize_image_from_path(image_path):
    """
    Resize the image stored at ``image_path`` in place.

    The image is opened, passed through the `resize_image` method attached to
    PIL images in this module, and saved back to the same path with
    ``quality=100``. The file is re-saved even when no resize was needed.
    """
    with Image.open(image_path) as source:
        resized = source.resize_image()
        try:
            resized.save(image_path, quality=100)
        finally:
            # A resize yields a new image object: close it as well, the
            # `with` block only takes care of the source image.
            if resized is not source:
                resized.close()

407

408

def convert_image_for_web(image_path):
    """
    Prepare the image stored at ``image_path`` for the web.

    - '.tiff' / '.tif' files (any letter case) get a JPEG copy written next to
      them by the `convert_tiff_to_jpg` method attached to PIL images in this module
    - '.wmf' / '.emf' files (any letter case) are left alone

    The opened image is always closed, including on the early return and on errors.
    """
    # Compare extensions case-insensitively so that '.TIF' is handled like '.tif'
    extension = os.path.splitext(os.path.basename(image_path))[1].lower()

    with Image.open(image_path) as img:
        if extension in (".tiff", ".tif"):
            img.convert_tiff_to_jpg()
        elif extension in (".wmf", ".emf"):
            return

        # NOTE(review): the resized image is never saved, so this step has no
        # lasting effect - confirm where the result is supposed to be written.
        resized = img.resize_image()
        if resized is not img:
            resized.close()

421

422

def create_citation_link_and_new_html(key, label, tooltip_html=""):
    """
    Build the HTML of an in-text citation link, followed by optional tooltip markup.

    The anchor has ``label`` as id, points to ``#r<label>``, shows ``key`` as its
    text and calls the JavaScript function ``highlightReference`` on click.
    ``key``, ``label`` and ``tooltip_html`` are inserted as is, without escaping.

    :return: the anchor markup immediately followed by ``tooltip_html``
    """
    # Quoted id of the reference element, passed to highlightReference
    js_target = f"'r{label}'"
    anchor = f'<a id="{label}" href="#r{label}" onclick="highlightReference({js_target}, 3000, 500)" >{key}</a>'
    return f'{anchor}{tooltip_html}'

428

429

def create_innerlink_for_citation(html_text, biblio):
    """
    Turn the citation keys found in ``html_text`` into links to their bibliography entry.

    Line breaks are first replaced by spaces. Then, for each key of ``biblio``,
    every occurrence of the key is replaced by the markup returned by
    ``create_citation_link_and_new_html``. When the key itself is not found,
    variants of it are tried one after the other ("&" replaced by "and", commas
    removed, ...) and the search stops at the first variant present in the text.

    :param html_text: the HTML text in which the citations are looked up
    :param biblio: mapping of citation key -> dict holding at least the "label"
        and "reference" entries
    :return: the HTML text with the citation links inserted
    """
    html_text = html_text.replace("\n", " ")
    for key, value in biblio.items():
        # The label is used without its square brackets
        label = value["label"].replace("[", "").replace("]", "")
        reference = value["reference"]
        # NOTE(review): the strings around the reference are empty as written;
        # they look like placeholders for a wrapping tooltip element - confirm.
        tooltip_html = (
            ''
            + f"{reference}"
            + ""
        )
        # 1. The key as is. The inserted markup contains the key itself, so the
        #    key being present afterwards means it was found and linked.
        new_html = create_citation_link_and_new_html(key, label, tooltip_html=tooltip_html)
        html_text = html_text.replace(f"{key}", new_html)
        if key in html_text:
            continue

        # 2. NOTE(review): as written this replaces "&" by itself, so key2 == key;
        #    presumably an HTML-escaped ampersand was intended - confirm.
        key2 = key.replace("&", "&")
        new_html = create_citation_link_and_new_html(key2, label, tooltip_html=tooltip_html)
        html_text = html_text.replace(f"{key2}", new_html)
        if key2 in html_text:
            continue

        # 3. "&" spelled out as "and"
        key3 = key.replace("&", "and")
        new_html = create_citation_link_and_new_html(key3, label, tooltip_html=tooltip_html)
        html_text = html_text.replace(f"{key3}", new_html)
        if key3 in html_text:
            continue

        # 4. "and" variant without commas
        key4 = key3.replace(",", "")
        new_html = create_citation_link_and_new_html(key4, label, tooltip_html=tooltip_html)
        html_text = html_text.replace(f"{key4}", new_html)
        if key4 in html_text:
            continue

        # 5. Original key without commas
        key5 = key.replace(",", "")
        new_html = create_citation_link_and_new_html(key5, label, tooltip_html=tooltip_html)
        html_text = html_text.replace(f"{key5}", new_html)
        if key5 in html_text:
            continue

        # 6. Variant 2 without commas
        key6 = key2.replace(",", "")
        new_html = create_citation_link_and_new_html(key6, label, tooltip_html=tooltip_html)
        html_text = html_text.replace(f"{key6}", new_html)
        if key6 in html_text:
            continue
        pass

    return html_text