Coverage for src/ptf/utils.py: 28%

205 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-11-05 09:56 +0000

1import difflib 

2import html 

3import os 

4import re 

5import subprocess 

6import unicodedata 

7 

8from bleach.css_sanitizer import ALLOWED_CSS_PROPERTIES 

9from bleach.css_sanitizer import CSSSanitizer 

10from bleach.sanitizer import Cleaner 

11from PIL import Image 

12from PIL import ImageFile 

13 

14ImageFile.LOAD_TRUNCATED_IMAGES = True 

15 

16from django.conf import settings 

17from django.core.exceptions import ImproperlyConfigured 

18from django.core.mail import EmailMultiAlternatives 

19from django.template import Template 

20from django.template import TemplateSyntaxError 

21from django.template import engines 

22from django.template.loader import render_to_string 

23from django.utils.html import strip_tags 

24from django.utils.translation import gettext_lazy as _ 

25 

26from ptf.site_register import SITE_REGISTER 

27 

28 

def strip_markup(string):
    """
    Remove markup from a string.

    The following are stripped:
        - XML-like tags (MathML, HTML, etc.): anything between ``<`` and the
          next ``>`` on the same line
        - HTML entities: named (``&nbsp;``, ``&Eacute;``), decimal (``&#233;``)
          and hexadecimal (``&#xE9;``)

    Entities are matched case-insensitively: entity names such as ``&Eacute;``
    and hexadecimal digits such as ``&#xE9;`` legitimately contain upper-case
    letters and would otherwise be left in the text.

    :param string: the text to clean
    :return: the text without tags and entities
    """
    cleanr = re.compile(
        r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});",
        re.IGNORECASE,
    )
    return cleanr.sub("", string)

37 

38 

def highlight_diff(ours, theirs):
    """
    Compare two strings and return an HTML rendering of the second one.

    Both strings are lower-cased and stripped of markup before being compared
    with ``difflib.SequenceMatcher``. The result is built from the cleaned
    version of ``theirs``:
        - fragments equal to ``ours`` are wrapped in a ``bg-success`` span
        - fragments replacing a part of ``ours`` are wrapped in a ``bg-dark`` span
        - any other fragment is emitted as is
    """
    reference = strip_markup(ours.lower())
    candidate = strip_markup(theirs.lower())
    matcher = difflib.SequenceMatcher(None, reference, candidate)

    css_class_by_tag = {"equal": "bg-success", "replace": "bg-dark"}

    pieces = []
    for tag, _i1, _i2, j1, j2 in matcher.get_opcodes():
        fragment = candidate[j1:j2]
        css_class = css_class_by_tag.get(tag)
        if css_class is None:
            pieces.append(fragment)
        else:
            pieces.append(f"<span class='{css_class}'>{fragment}</span>")

    return "".join(pieces)

53 

54 

def volume_display():
    """
    Return the word used to display a volume.

    "Volume" when ``settings.VOLUME_STRING`` is truthy, otherwise the
    translation of "Tome".
    """
    return "Volume" if settings.VOLUME_STRING else _("Tome")

60 

61 

def execute_cmd(cmd, force_execute=False):
    """
    Execute a shell command, or only log it when execution is disabled.

    @param cmd: str which represents the shell command
    @param force_execute: run the command even if
        ``settings.MERSENNE_CREATE_FRONTPAGE`` is falsy
    @return: the output of the command (as returned by
        ``subprocess.check_output``) when it is executed; otherwise the tuple
        ``(0, cmd)``. Note that the two cases do not return the same shape.
    """
    if not (settings.MERSENNE_CREATE_FRONTPAGE or force_execute):
        # Debug mode: the command is not run, it is only appended to cmd.log
        log_path = os.path.join(settings.LOG_DIR, "cmd.log")
        with open(log_path, "a", encoding="utf-8") as log_file:
            log_file.write(f"cmd : {cmd}\n")
        return 0, cmd

    return subprocess.check_output(cmd, shell=True)

79 

80 

def get_file_content_in_utf8(filename):
    """
    Read a whole text file decoded as UTF-8.

    :param filename: path of the file to read
    :return: the body of the file as a str
    """
    with open(filename, encoding="utf-8") as handle:
        return handle.read()

90 

91 

def pdf_to_text(pdf_filename):
    """
    Extract the full text of a PDF with the ``pdftotext`` command line tool.

    :param pdf_filename: path of the PDF file
    :return: the extracted text without control characters, or "" when
        ``settings.MERSENNE_CREATE_FRONTPAGE`` is falsy
    :raise RuntimeError: if the text file is missing after the conversion
    """
    # Local import so that this function does not depend on the module header
    import shlex

    if not settings.MERSENNE_CREATE_FRONTPAGE:
        return ""

    os.makedirs(settings.MERSENNE_TMP_FOLDER, exist_ok=True)

    # NOTE(review): the output file name is fixed, so concurrent calls sharing
    # the same MERSENNE_TMP_FOLDER would overwrite each other - to be confirmed.
    txt_filename = os.path.join(settings.MERSENNE_TMP_FOLDER, "fulltext.txt")

    # The command goes through a shell: quote both paths so that spaces or
    # shell metacharacters in a file name cannot break or alter the command.
    cmd_str = (
        "pdftotext -raw -nopgbrk -enc UTF-8 "
        f"{shlex.quote(pdf_filename)} {shlex.quote(txt_filename)}"
    )
    execute_cmd(cmd_str)

    # Check if the output file has been created
    if not os.path.isfile(txt_filename):
        raise RuntimeError("The PDF file was not converted by pdftotext")

    body = get_file_content_in_utf8(txt_filename)
    # Strip control characters (every Unicode category starting with "C")
    return "".join(ch for ch in body if unicodedata.category(ch)[0] != "C")

112 

113 

def linearize_pdf(from_path, to_path):
    """
    Linearize a PDF with the ``qpdf`` command line tool.

    A failure of the command is tolerated as long as ``to_path`` exists
    afterwards; otherwise the error is re-raised.

    :param from_path: path of the PDF to linearize
    :param to_path: path of the linearized PDF to create
    :return: always False
    :raise subprocess.CalledProcessError: qpdf failed and ``to_path`` does not exist
    :raise OSError: qpdf could not be started and ``to_path`` does not exist
    """
    # The paths are passed as separate arguments, without a shell, so that
    # spaces or shell metacharacters in a file name cannot alter the command.
    try:
        subprocess.check_output(["qpdf", "--linearize", from_path, to_path])
    except (subprocess.CalledProcessError, OSError):
        if not os.path.isfile(to_path):
            raise

    return False

127 

128 

def get_display_name(prefix, first_name, last_name, suffix, string_name):
    """
    Build the name to display for a person.

    - When neither ``first_name`` nor ``last_name`` has visible content
      (None, empty or whitespace only), ``string_name`` is returned unchanged.
    - When ``settings.DISPLAY_FIRST_NAME_FIRST`` is truthy, the result is
      "first_name last_name" (prefix and suffix are not used).
    - Otherwise the result is "prefix last_name, first_name suffix", where the
      prefix and the suffix are only added when they are not empty.

    :param prefix: optional prefix put before the name
    :param first_name: first name, may be empty or None
    :param last_name: last name, may be empty or None
    :param suffix: optional suffix put after the name
    :param string_name: fallback used when no first/last name is available
    :return: the display name
    """
    display_first_name_first = getattr(settings, "DISPLAY_FIRST_NAME_FIRST", False)

    # Keep only the parts with visible content. None is treated like an empty
    # string instead of raising on ".strip()".
    list_name = [x for x in (last_name, first_name) if x and x.strip()]
    if not list_name:
        return string_name

    if display_first_name_first:
        return " ".join(list_name[::-1])

    display_name = f"{prefix} " if prefix else ""
    display_name += ", ".join(list_name)
    display_name += f" {suffix}" if suffix else ""
    return display_name

141 

142 

def ckeditor_input_sanitizer(html: str, allow_img: bool = False) -> str:
    """
    Sanitize the HTML produced by the CKEditor.

    The cleaning relies on bleach (https://bleach.readthedocs.io/en/latest/index.html),
    an allowed-list-based sanitizer: only the tags, attributes and CSS properties
    listed below are kept. Disallowed tags are stripped (not escaped).
    `lxml.html.Cleaner` is deliberately not used because its documentation
    clearly says that it is not a secure approach.

    html: str
        The HTML string to sanitize.
    allow_img : bool
        Whether to preserve img related tags
    """
    # NOTE(review): "sub" is allowed but "sup" is not - confirm whether this is intended.
    tags = [
        "a", "abbr", "acronym", "address", "aside",
        "b", "bdi", "bdo", "blockquote", "br",
        "caption", "cite", "code",
        "dd", "del", "dfn", "div", "dl", "dt",
        "em",
        "h1", "h2", "h3", "h4", "h5", "h6", "hgroup", "hr",
        "i", "ins",
        "kbd",
        "li",
        "mark",
        "ol",
        "p", "pre",
        "q",
        "s", "samp", "small", "span", "strike", "strong", "sub",
        "table", "tbody", "td", "th", "thead", "tr",
        "u", "ul",
        "var",
    ]
    if allow_img:
        tags.extend(["figcaption", "figure", "img", "picture", "source"])

    attributes = {
        "*": ["class", "dir", "style", "id", "name"],
        "a": ["href", "target"],
        "img": ["alt", "height", "src", "width"],
        "source": ["type", "src"],
        # Those table attributes are deprecated but they are still used by CKEditor 4
        # We might consider upgrading/migrating to CKEditor 5 at some point
        "table": ["align", "border", "cellspacing", "cellpadding"],
        "th": ["scope"],
    }

    # Box-model properties allowed in addition to the bleach defaults
    css_properties = ALLOWED_CSS_PROPERTIES | {
        "border",
        "margin",
        "margin-left",
        "margin-right",
        "margin-top",
        "margin-bottom",
        "padding",
        "padding-left",
        "padding-right",
        "padding-top",
        "padding-bottom",
    }

    return Cleaner(
        tags=tags,
        attributes=attributes,
        css_sanitizer=CSSSanitizer(allowed_css_properties=css_properties),
        strip=True,
    ).clean(html)

249 

250 

def send_email(
    html_content: str,
    subject: str,
    to: list[str] | tuple[str, ...],
    from_email: str | None = None,
    cc: list[str] | tuple[str, ...] | None = None,
    from_collection: str = "",
    reply_to: list[str] | tuple[str, ...] | None = None,
) -> None:
    """
    Sends an e-mail to the provided recipients and copy recipients with the provided html content.
    It sends the e-mail with both a text and a HTML alternative.

    The sender is resolved in this order: `from_email`, then the `email_from`
    property of `from_collection` in site_register.py. If neither gives a value,
    `from_email` is passed empty to Django, which is expected to fall back to
    `settings.DEFAULT_FROM_EMAIL`.

    Params:
        - html_content      The HTML content of the e-mail
        - subject           The e-mail's subject
        - to                The list or tuple of the e-mail recipients
        - from_email        The sender's e-mail address
        - cc                The list or tuple of the e-mail CC
        - from_collection   The collection to send the mail for. If from_email is
                            None or empty, the sender is read from site_register.py
                            (`email_from`).
        - reply_to          The list or tuple of the Reply-To addresses

    Raises:
        ImproperlyConfigured if `from_collection` is given, no `from_email` is
        given, and the collection has no `email_from` in site_register.py.
    """
    # "not from_email" covers both the default None and an empty string
    if not from_email:
        try:
            from_email = SITE_REGISTER[from_collection.lower()]["email_from"]
        except (KeyError, ValueError) as err:
            # An unknown collection is only an error when one was requested
            if from_collection:
                raise ImproperlyConfigured(
                    f"The collection {from_collection.lower()} is missing the "
                    "email_from property in site_register.py"
                ) from err

    # We additionally unescape HTML characters here to avoid having stuff like
    # &nbsp; &gt; etc. in the output text.
    text_content = html.unescape(strip_tags(html_content))

    # Create the email, and attach the HTML version as well.
    return_path = getattr(settings, "RETURN_PATH", "no-reply@listes.mathdoc.fr")
    msg = EmailMultiAlternatives(
        subject=subject,
        body=text_content,
        from_email=from_email,
        to=to,
        cc=cc or [],
        headers={"Return-path": return_path},
        reply_to=reply_to or [],
    )
    msg.attach_alternative(html_content, "text/html")
    msg.send(fail_silently=False)

299 

300 

def send_email_from_template(
    template: str,
    context_data: dict,
    subject: str,
    to: list[str] | tuple[str, ...],
    from_email: str = "",
    cc: list[str] | tuple[str, ...] | None = None,
    from_collection: str = "",
) -> None:
    """
    Renders the provided template and sends it as an e-mail to the
    provided recipients and copy recipients.
    It sends the e-mail with both a text and a HTML alternative.

    The sender is resolved before the template is rendered, so a missing
    configuration is reported without rendering anything.

    Params:
        - template          The HTML template of the e-mail
        - context_data      The context data used to render the template
        - subject           The e-mail's subject
        - to                The list or tuple of the e-mail recipients
        - from_email        The sender's e-mail address
        - cc                The list or tuple of the e-mail CC
        - from_collection   The collection to send the mail for. If from_email is
                            empty, the sender is read from site_register.py
                            (`email_from`).

    Raises:
        ImproperlyConfigured if `from_collection` is given, no `from_email` is
        given, and the collection has no `email_from` in site_register.py.
    """
    if not from_email:
        try:
            from_email = SITE_REGISTER[from_collection.lower()]["email_from"]
        except (KeyError, ValueError) as err:
            # An unknown collection is only an error when one was requested
            if from_collection:
                raise ImproperlyConfigured(
                    f"The collection {from_collection.lower()} is missing the "
                    "email_from property in site_register.py"
                ) from err

    html_content = render_to_string(template, context_data)
    send_email(
        html_content,
        subject,
        to=to,
        from_email=from_email,
        cc=cc or [],
        from_collection=from_collection,
    )

339 

340 

def template_from_string(template_string, using=None) -> Template:
    """
    Compile a string into a template object.

    The engine named ``using`` is used when given; otherwise every engine
    returned by ``engines.all()`` is tried in order and the first one that
    accepts the string wins. A ``TemplateSyntaxError`` carrying the template
    string is raised when no engine could compile it.
    """
    # This function is based on django.template.loader.get_template,
    # but uses Engine.from_string instead of Engine.get_template.
    candidates = engines.all() if using is None else [engines[using]]

    for backend in candidates:
        try:
            compiled = backend.from_string(template_string)
        except TemplateSyntaxError:
            # Give the next engine a chance
            continue
        else:
            return compiled

    raise TemplateSyntaxError(template_string)

355 

356 

def resize_image(img, max_size=1600):
    """
    Shrink an image to a width of ``max_size`` pixels, keeping its aspect ratio.

    :param img: an object exposing ``width``, ``height`` and ``resize((w, h))``
    :param max_size: maximum width, in pixels
    :return: the resized image, or ``img`` itself when it is not wider than
        ``max_size``
    """
    if img.width <= max_size:
        return img

    ratio = img.width / img.height
    # For very wide images the height would truncate to 0, which is not a
    # valid image size: keep at least one pixel.
    new_height = max(1, int(max_size / ratio))
    return img.resize((max_size, new_height))

367 

368 

def convert_tiff_to_jpg(img):
    """
    Save an opened image as a '.jpg' file next to its source file.

    The target is the path in ``img.filename`` with its extension replaced by
    '.jpg'. Images whose mode cannot be written as JPEG (RGBA, P, LA, ...) are
    converted to RGB first.

    :param img: an opened image exposing ``filename``, ``mode``, ``convert``
        and ``save``
    """
    # Read the path first: the image returned by convert() is a new object
    image_directory, image_file = os.path.split(img.filename)
    image_name = os.path.splitext(image_file)[0]

    # Modes the JPEG writer accepts; anything else must be converted first
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if img.mode not in jpeg_modes:
        img = img.convert("RGB")

    img.save(os.path.join(image_directory, image_name + ".jpg"), "JPEG", quality=100)

378 

379 

# Attach the two helpers defined above to PIL's Image class so that they can be
# called as methods on any image instance: img.resize_image(), img.convert_tiff_to_jpg()
Image.Image.resize_image = resize_image
Image.Image.convert_tiff_to_jpg = convert_tiff_to_jpg

382 

383 

def convert_tiff_to_jpg_from_path(image_path):
    """
    Take the path of a '.tiff' image and convert the image to a '.jpg' one.

    The '.jpg' file is written by ``convert_tiff_to_jpg`` next to the source
    file, with the same base name.

    :param image_path: path of the image to convert
    """
    # The context manager closes the file even if the conversion fails.
    # convert_tiff_to_jpg() already writes the '.jpg' file: saving the opened
    # image a second time is redundant and fails for modes JPEG cannot store.
    with Image.open(image_path) as img:
        img.convert_tiff_to_jpg()

397 img.close() 

398 

399 

def resize_image_from_path(image_path):
    """
    Resize the image stored at ``image_path`` and save it in place.

    The file is re-saved with ``quality=100`` even when ``resize_image``
    returns the image unchanged.

    :param image_path: path of the image to resize; the file is overwritten
    """
    # The context manager closes the opened file even if resize or save fails
    with Image.open(image_path) as source:
        resized = source.resize_image()
        try:
            resized.save(image_path, quality=100)
        finally:
            # resize_image() may return a new image object: release it as well
            if resized is not source:
                resized.close()

407 

408 

def convert_image_for_web(image_path):
    """
    Prepare an image file for the web.

    - '.wmf' and '.emf' files are ignored
    - '.tif' and '.tiff' files (any letter case) are converted to a '.jpg'
      file next to the source file

    :param image_path: path of the image file
    """
    extension = os.path.splitext(os.path.basename(image_path))[1].lower()

    # Skip metafiles before opening anything, so that no file handle is left open
    if extension in (".wmf", ".emf"):
        return

    with Image.open(image_path) as img:
        if extension in (".tiff", ".tif"):
            img.convert_tiff_to_jpg()

        # NOTE(review): the resized image is never saved, so this call has no
        # lasting effect on the file - confirm whether it should be written back.
        resized = img.resize_image()
        if resized is not img:
            resized.close()

421 

422 

def create_citation_link_and_new_html(key, label, tooltip_html=""):
    """
    Build the HTML of a citation link.

    The link displays ``key``, has ``label`` as id, points to the anchor
    ``#r<label>`` and calls ``highlightReference`` on click. It is wrapped,
    followed by ``tooltip_html``, in a ``tooltipPCJ`` span.

    None of the arguments is escaped.
    """
    anchor = (
        f'<a id="{label}" href="#r{label}" '
        f"onclick=\"highlightReference('r{label}', 3000, 500)\" >{key}</a>"
    )
    return f'<span class="tooltipPCJ">{anchor}{tooltip_html}</span>'

428 

429 

def create_innerlink_for_citation(html_text, biblio):
    """
    Replace the citations found in a text with links to their reference.

    Line breaks are first replaced with spaces. Then, for each entry of
    ``biblio``, several spellings of its key are searched in turn: every
    occurrence of the first spelling found is replaced with a citation link
    followed by a hidden tooltip containing the reference, and the remaining
    spellings are skipped.

    :param html_text: the text to process
    :param biblio: dict mapping a citation key to a dict with the keys
        "label" and "reference"
    :return: the text with the citation links
    """
    html_text = html_text.replace("\n", " ")

    for key, value in biblio.items():
        label = value["label"].replace("[", "").replace("]", "")
        reference = value["reference"]
        tooltip_html = (
            '<span style="position: absolute; visibility: hidden" class="tooltip tooltiptexthidden">'
            f"{reference}</span>"
        )

        amp_escaped = key.replace("&", "&amp;")
        amp_as_and = key.replace("&", "and")
        # Spellings of the key, in the order in which they are tried
        spellings = (
            key,
            amp_escaped,
            amp_as_and,
            amp_as_and.replace(",", ""),
            key.replace(",", ""),
            amp_escaped.replace(",", ""),
        )

        for spelling in spellings:
            link_html = create_citation_link_and_new_html(
                spelling, label, tooltip_html=tooltip_html
            )
            html_text = html_text.replace(spelling, link_html)
            # The link contains the spelling: it is in the text only if it was found
            if spelling in html_text:
                break

    return html_text