Coverage for src/ptf/display/resolver.py: 36%
343 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-05 09:56 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-05 09:56 +0000
1import os
2import shutil
3import time
5from django.conf import settings
7from ptf.cmds.xml import xml_utils
# Current year, captured once at import time; used by embargo() below.
NOW = time.gmtime()[0]
# URL templates for external identifiers; the id value is substituted
# for {0} via str.format in resolve_id().
extids_hrefs = {
    "doi": "https://doi.org/{0}",
    "zbl-item-id": "https://zbmath.org/?q=an:{0}",
    "jfm-item-id": "https://zbmath.org/?q=an:{0}",
    "mr-item-id": "https://mathscinet.ams.org/mathscinet-getitem?mr={0}",
    "nmid": "/item/{0}",
    "numdam-id": "/item/{0}",
    "mathdoc-id": "/item/{0}",
    "eudml-item-id": "https://eudml.org/doc/{0}",
    "sps-id": "http://sites.mathdoc.fr/cgi-bin/spitem?id={0}",
    "arxiv": "https://arxiv.org/abs/{0}",
    "hal": "https://hal.archives-ouvertes.fr/{0}",
    "tel": "https://tel.archives-ouvertes.fr/{0}",
    "theses.fr": "https://theses.fr/{0}",
    "orcid": "https://orcid.org/{0}",
    "idref": "https://www.idref.fr/{0}",
    "semantic-scholar": "https://www.semanticscholar.org/paper/{0}",
    "pmid": "https://pubmed.ncbi.nlm.nih.gov/{0}",
    "ark": "http://ark.bnf.fr/{0}",
}


def resolve_id(id_type: str, id_value: str, force_numdam=False):
    """Build the href for an external identifier.

    Returns the URL obtained from the extids_hrefs template for *id_type*,
    "" for unknown types, or a numdam.org URL when *force_numdam* is set
    for an "nmid" identifier.
    """
    if id_type == "nmid" and force_numdam:
        return f"http://www.numdam.org/item/{id_value}"
    if id_type == "mr-item-id" and "#" in id_value:
        id_value = id_value.replace(" #", ":")
    elif id_type == "eudml-item-id":
        # Keep only the last ':'-separated component (e.g. "urn:eudml:doc:123").
        id_value = id_value.split(":")[-1]

    template = extids_hrefs.get(id_type)
    return template.format(id_value) if template is not None else ""
def find_id_type(id):
    """Guess the external-id type ("doi", "hal" or "arxiv") from a raw id string.

    Returns None when the prefix matches none of the known schemes.
    """
    if id.startswith("10."):
        return "doi"
    if id.startswith("hal-"):
        return "hal"
    if id.lower().startswith("arxiv:"):
        return "arxiv"
    # Bare arXiv ids of the form YYMM.NNNN are not detected (a heuristic
    # for them was sketched in an earlier revision but never enabled).
    return None
def get_mimetype(filename):
    """Return the MIME type of *filename* based on its extension ("" if unknown)."""
    mimetype_by_extension = {
        "pdf": "application/pdf",
        "djvu": "image/x.djvu",
        "tex": "application/x-tex",
        "png": "image/png",
        "jpg": "image/jpeg",
    }
    # Extension is taken from the lower-cased basename, without the dot.
    extension = os.path.splitext(os.path.basename(filename).lower())[1][1:]
    return mimetype_by_extension.get(extension, "")
def get_article_base_url():
    """Return settings.ARTICLE_BASE_URL, the base URL for article pages."""
    return settings.ARTICLE_BASE_URL
def get_issue_base_url():
    """Return settings.ISSUE_BASE_URL, the base URL for issue pages."""
    return settings.ISSUE_BASE_URL
def get_icon_base_url():
    """Return settings.ICON_BASE_URL, the base URL for icon files."""
    return settings.ICON_BASE_URL
def get_icon_url(id_, filename):
    """Return the public URL of an icon: ICON_BASE_URL followed by *filename*.

    *id_* is currently unused; it is kept for interface compatibility with
    callers.
    """
    return get_icon_base_url() + filename
def get_doi_url(doi):
    """Return the resolver URL of *doi* (DOI_BASE_URL + doi)."""
    return settings.DOI_BASE_URL + doi
def get_relative_folder(collection_id, container_id=None, article_id=None):
    """Return the relative path "colid[/container_id][/article_id]".

    Falsy container/article ids are skipped independently of each other.
    """
    parts = [collection_id]
    if container_id:
        parts.append(container_id)
    if article_id:
        parts.append(article_id)
    return "/".join(parts)
def embargo(wall, year):
    """Tell whether a publication *year* falls inside a moving wall.

    @param wall: embargo duration in years; falsy means no embargo
    @param year: publication date string, e.g. "2008" or "2008-03-01"
    @return: True if the item's year is within the last *wall* years
    """
    if not wall:
        return False
    # Default to the current year: an unparsable date is treated as embargoed.
    y = NOW
    try:
        # Keep only the leading year of dates such as "2008-03-01".
        y = int(year.split("-")[0])
    except Exception:
        # Was `except BaseException: pass`, which also swallowed
        # KeyboardInterrupt/SystemExit; narrowed to Exception.
        pass
    return NOW - y <= wall
133# Iterate a folder with a collection
134# The folder must look like @COL/@ISSUE/@ISSUE.XML
def iterate_collection_folder(folder, pid, first_issue=""):
    """Yield (issue_id, xml_path) pairs found under *folder*/*pid*.

    The expected layout is @COL/@ISSUE/@ISSUE.xml and/or
    @COL/@ISSUE/@ISSUE-cdrxml.xml.  When *first_issue* is non-empty,
    entries are skipped until that issue id is reached.
    """
    root_folder = os.path.join(folder, pid)
    started = len(first_issue) == 0
    for entry in sorted(os.listdir(root_folder)):
        if not started and entry == first_issue:
            started = True
        if not started:
            continue
        issue_dir = os.path.join(root_folder, entry)
        if not os.path.isdir(issue_dir):
            continue
        # Both XML flavours may exist; yield each one that is present.
        for suffix in (".xml", "-cdrxml.xml"):
            xml_file = os.path.join(issue_dir, entry + suffix)
            if os.path.isfile(xml_file):
                yield entry, xml_file
def create_folder(folder):
    """Create *folder* (including parents) if it does not already exist.

    @raise RuntimeError: when the directory cannot be created, e.g. the
        path already exists as a regular file.
    """
    try:
        os.makedirs(folder, exist_ok=True)
    except OSError:
        # Was `except BaseException: pass` (also caught KeyboardInterrupt).
        # Deliberate best-effort: failure is reported below only if the
        # directory is still missing.
        pass
    if not os.path.isdir(folder):
        raise RuntimeError("Unable to create " + folder)
def copy_folder(from_dir, to_dir):
    """Recursively copy the contents of *from_dir* into *to_dir*.

    Does nothing when *from_dir* is not a directory.  *to_dir* is created
    if needed.
    """
    if not os.path.isdir(from_dir):
        return
    create_folder(to_dir)
    for entry in os.listdir(from_dir):
        src = os.path.join(from_dir, entry)
        if os.path.isfile(src):
            copy_file(src, to_dir)
        elif os.path.isdir(src):
            copy_folder(src, os.path.join(to_dir, entry))
def copy_file(from_path, to_path):
    """Copy *from_path* to *to_path*, preserving timestamps.

    Silently does nothing when *from_path* is not a regular file.  When
    *to_path* is a directory, the source basename is appended to it.
    """
    if not os.path.isfile(from_path):
        return
    if os.path.isdir(to_path):
        to_path = os.path.join(to_path, os.path.basename(from_path))
    if to_path.startswith(settings.MATHDOC_ARCHIVE_FOLDER):
        # copy2 attempts to preserve all file metadata.
        # On the archive we don't want to preserve the mode, just the dates.
        shutil.copyfile(from_path, to_path)
        shutil.copystat(from_path, to_path)
    else:
        shutil.copy2(from_path, to_path)
def copy_html_images_in_folder(colid, issue, article_to_copy, from_folder, to_folder, folder_name):
    """Replace one image folder of an article with its counterpart from *from_folder*.

    The destination <to_folder>/<col/issue/article>/<folder_name> is removed
    first (when present), then the matching source folder is copied over
    when it exists.

    @raise RuntimeError: when the existing destination cannot be removed.
    """
    relative = get_relative_folder(colid, issue.pid, article_to_copy.pid)
    dest_folder = os.path.join(to_folder, relative, folder_name)
    if os.path.isdir(dest_folder):
        try:
            shutil.rmtree(dest_folder)
        except OSError:
            message = "Unable to remove " + dest_folder
            raise RuntimeError(message)
    src_folder = os.path.join(from_folder, relative, folder_name)
    if os.path.isdir(src_folder):
        copy_folder(src_folder, dest_folder)
def copy_html_images(resource, to_folder, from_folder):
    """
    Copy the figures associated with the HTML body of an article.

    if from_archive:
        Images are in settings.MATHDOC/@colid/@issue_id/@a_id/src/tex/figures/
    if from_cedram:
        Images are in settings.CEDRAM_TEX_FOLDER/@colid/@issue_id/@tex_aid/Fulltext/figures/

    @param resource: the object whose images are copied; ignored unless its
        classname is "Article"
    @param to_folder: destination root folder
    @param from_folder: source root folder; CEDRAM_XML_FOLDER triggers the
        TeX-based copy, anything else the archive-based copy
    @return: nothing
    """

    if resource.classname != "Article":
        return

    article_to_copy = resource
    issue = article_to_copy.my_container
    colid = article_to_copy.get_collection().pid

    if from_folder == settings.CEDRAM_XML_FOLDER:
        tex_src_folder = get_cedram_issue_tex_folder(colid, issue.pid)
        tex_folders, _ = get_cedram_tex_folders(colid, issue.pid)

        if len(tex_folders) > 0:
            # NOTE(review): i walks tex_folders in lockstep with
            # issue.article_set.all(); this assumes both sequences have the
            # same length and order — confirm against the issue XML import.
            i = 0
            for article in issue.article_set.all():
                if article_to_copy.pid == article.pid:
                    # The order in which articles are stored in the database
                    # matters: the TeX order is SUPPOSED to match the issue XML.
                    dest_folder = os.path.join(
                        to_folder,
                        get_relative_folder(colid, issue.pid, article.pid),
                        "src/tex/figures",
                    )

                    # Start from a clean destination folder.
                    if os.path.isdir(dest_folder):
                        try:
                            shutil.rmtree(dest_folder)
                        except OSError:
                            message = "Unable to remove " + dest_folder
                            raise RuntimeError(message)

                    src_folder = os.path.join(
                        tex_src_folder, tex_folders[i], "FullText", "figures"
                    )
                    # Only create the destination if the article references
                    # at least one html-image related object.
                    qs = article.relatedobject_set.filter(rel="html-image")
                    if qs.count() > 0:
                        create_folder(dest_folder)

                    for related_obj in qs:
                        img_file = os.path.join(src_folder, os.path.basename(related_obj.location))
                        copy_file(img_file, dest_folder)

                i += 1
    else:
        # Copy from the archive: transfer the whole image folders directly.
        copy_html_images_in_folder(
            colid, issue, article_to_copy, from_folder, to_folder, "src/tex/figures"
        )
        copy_html_images_in_folder(
            colid, issue, article_to_copy, from_folder, to_folder, "src/media"
        )
def copy_file_obj_to_article_folder(
    file_obj, colid, issue_pid, article_pid, is_image=False, article_container_pid=None, path=""
):
    """Save an uploaded file object into the article's folder on disk.

    Non-images are stored as <article_pid><ext>; images keep their original
    basename and go under src/media (or *path* when given).
    Returns the file path relative to RESOURCES_ROOT.

    @param file_obj: uploaded file (presumably a Django UploadedFile — it
        must provide .name and .chunks())
    @param article_container_pid: currently unused; kept for compatibility
    """
    if not is_image:
        name, extension = os.path.splitext(file_obj.name)
        relative_folder = get_relative_folder(colid, issue_pid, article_pid)
        folder = os.path.join(settings.RESOURCES_ROOT, relative_folder)
        create_folder(folder)
        full_file_name = os.path.join(folder, article_pid + extension)
        relative_file_name = os.path.join(relative_folder, article_pid + extension)

        with open(full_file_name, "wb+") as destination:
            for chunk in file_obj.chunks():
                destination.write(chunk)

    else:
        name, extension = os.path.splitext(file_obj.name)
        relative_folder = get_relative_folder(colid, issue_pid, article_pid)
        if path:
            folder = os.path.join(settings.RESOURCES_ROOT, relative_folder, path)
        else:
            folder = os.path.join(settings.RESOURCES_ROOT, relative_folder + "/src/media")
        create_folder(folder)
        # Images keep their original basename.
        full_file_name = os.path.join(folder, name + extension)
        with open(full_file_name, "wb+") as destination:
            for chunk in file_obj.chunks():
                destination.write(chunk)

        # NOTE(review): the file is written as <name><ext> above, but the
        # returned relative name uses <article_pid><ext> and omits the
        # src/media (or *path*) segment — looks inconsistent; confirm what
        # callers expect before changing.
        relative_file_name = os.path.join(relative_folder, article_pid + extension)

    return relative_file_name
def copy_binary_files(resource, from_folder, to_folder, binary_files=None):
    """Copy a resource's binary files from *from_folder* to *to_folder*.

    Nothing happens when *to_folder* is None or equals *from_folder*.
    When *binary_files* is None, the HTML images are copied first and the
    file list is taken from resource.get_binary_files_location().
    Entries containing "http" (external links) get their destination folder
    created but are not copied.
    """
    if to_folder is None or from_folder == to_folder:
        return
    if binary_files is None:
        copy_html_images(resource, to_folder, from_folder)
        binary_files = resource.get_binary_files_location()

    for location in binary_files:
        to_path = os.path.join(to_folder, location)
        os.makedirs(os.path.dirname(to_path), exist_ok=True)
        if "http" in location:
            # External URL: nothing to copy from disk.
            continue
        from_path = os.path.join(from_folder, location)
        if os.path.isfile(from_path):
            copy_file(from_path, to_path)
def delete_object_folder(object_folder, to_folder):
    """Remove the directory *to_folder*/*object_folder* and everything in it.

    No check is made on the folder content: mirroring the database CASCADE
    mode, the directory is deleted regardless of what it still contains.
    A few guards refuse to delete top-level production/archive data roots.

    @raise Exception: when the resolved folder is one of the protected roots.
    """
    folder = os.path.normpath(os.path.join(to_folder, object_folder))

    # Safety guards: never wipe an entire data root.
    protected_roots = [
        "/mersenne_prod_data",
        "/mersenne_test_data",
        "/mathdoc_archive",
    ]
    if folder in protected_roots or folder.startswith("/cedram_dev"):
        raise Exception("Attention, pb avec la suppression de " + folder)

    if os.path.isdir(folder):
        shutil.rmtree(folder)
def delete_file(path):
    """Remove *path* when it exists as a regular file; otherwise do nothing."""
    if not os.path.isfile(path):
        return
    os.remove(path)
def get_disk_location(
    root_folder, collection_id, ext, container_id=None, article_id=None, do_create_folder=False
):
    """Return the on-disk filename of a resource.

    Layout: root/colid[/container_id[/article_id]]/<last_id>.<ext> where
    last_id is the deepest id provided.  With do_create_folder=True the
    intermediate directories are created as well.
    """
    if do_create_folder:
        create_folder(os.path.join(root_folder, collection_id))
        if container_id:
            create_folder(os.path.join(root_folder, collection_id, container_id))
        if article_id:
            create_folder(os.path.join(root_folder, collection_id, container_id, article_id))

    parts = [root_folder, collection_id]
    last_id = collection_id
    if container_id:
        parts.append(container_id)
        last_id = container_id
    if article_id:
        parts.append(article_id)
        last_id = article_id
    parts.append(last_id + "." + ext)
    return os.path.join(*parts)
def get_body(filename):
    """Read *filename* as UTF-8 text and return its entire content."""
    with open(filename, encoding="utf-8") as stream:
        return stream.read()
def get_archive_filename(root_folder, colid, pid, ext, do_create_folder=False, article_pid=None):
    """Return the path of an archived file.

    :param root_folder: root folder of the archive, e.g. /mathdoc_archive
    :param colid: collection id
    :param pid: issue id (falsy means the collection-level file)
    :param ext: filename extension ("xml" or "json")
    :param do_create_folder: recursively create the intermediate folders
    :param article_pid: optional article id (adds one more level)
    :return: the full filename
    """
    # TODO: call get_disk_location(root_folder, colid, ext, pid, None, do_create_folder)
    if do_create_folder:
        folder = os.path.join(root_folder, colid)
        create_folder(folder)
        if pid:
            folder = os.path.join(root_folder, colid, pid)
            create_folder(folder)
        if article_pid:
            folder = os.path.join(folder, article_pid)
            create_folder(folder)

    if pid and article_pid:
        return os.path.join(root_folder, colid, pid, article_pid, article_pid + "." + ext)
    if pid:
        return os.path.join(root_folder, colid, pid, pid + "." + ext)
    return os.path.join(root_folder, colid, colid + "." + ext)
439# Read the XML of an issue/collection within an archive folder
440# The folder must look like @COL/@ISSUE/@ISSUE.XML
441# @COL/@COL.XML
def get_archive_body(root_folder, colid, pid):
    """Read the archived XML of an issue (or of the collection when *pid* is falsy)."""
    return get_body(get_archive_filename(root_folder, colid, pid, "xml"))
def is_tex_comment(text, i):
    """Tell whether position *i* in *text* sits after a TeX comment marker.

    Spaces before *i* are skipped; a '%' (or a '~' directly preceded by
    '%') right before means the rest of the line is commented out.
    """
    while i > 0 and text[i] == " ":
        i -= 1
    if i >= 0 and text[i] == "%":
        return True
    if i > 0 and text[i] == "~" and text[i - 1] == "%":
        return True
    return False
def is_tex_def(text, i):
    """Tell whether the TeX command whose name starts at *i* is preceded by ``\\def``."""
    return text[i - 5 : i - 1] == "\\def"
def is_tex_newcommand(text, i):
    """Tell whether the TeX command whose name starts at *i* is preceded by ``\\newcommand``."""
    return text[i - 12 : i - 1] == "\\newcommand"
def get_cedram_issue_tex_folder(colid, issue_id):
    """Return the folder of an issue's TeX sources under settings.CEDRAM_TEX_FOLDER."""
    return os.path.join(settings.CEDRAM_TEX_FOLDER, colid, issue_id)
def get_cedram_tex_folders(colid, issue_id):
    """
    Return article filenames in the cedram TeX issue folder and the
    corresponding DOIs (None when absent), extracted from the issue TeX file.

    The issue file <issue_id>.tex is scanned for \\includearticle,
    \\includeprearticle and \\includepreface commands; the {…} argument is
    the article filename and an optional doi=… option is captured.

    @param colid: collection id
    @param issue_id: issue id
    @return: list of filenames, list of DOIs (parallel lists)
    """
    filenames = []
    dois = []

    body = ""
    issue_filename = os.path.join(get_cedram_issue_tex_folder(colid, issue_id), issue_id + ".tex")
    if os.path.isfile(issue_filename):
        try:
            with open(issue_filename, encoding="utf-8") as f:
                body = f.read()
        except UnicodeDecodeError:
            # Some legacy files are latin-1 encoded.
            with open(issue_filename, encoding="iso-8859-1") as f:
                body = f.read()

        # Lower-cased copy used only to find \includepreface case-insensitively.
        lower_body = body.lower()

        # Position of the first include command of any of the three kinds.
        li = []
        j = body.find("includearticle")
        if j >= 0:
            li.append(j)
        j = body.find("includeprearticle")
        if j >= 0:
            li.append(j)
        j = lower_body.find("includepreface")
        if j >= 0:
            li.append(j)
        i = min(li) if len(li) > 0 else -1

        while i >= 0:
            # Skip occurrences that are commented out or that are part of a
            # \def / \newcommand definition rather than a real inclusion.
            if (
                i > 1
                and not is_tex_comment(body, i - 2)
                and not is_tex_def(body, i)
                and not is_tex_newcommand(body, i)
            ):
                doi = None
                # Scan the optional [...] options up to the opening brace,
                # capturing a doi=... value if present.
                while body[i] != "{":
                    if len(body) > i + 4 and body[i : i + 4] == "doi=":
                        j = i + 4
                        while body[i] != "," and body[i] != "]":
                            i += 1
                        doi = xml_utils.normalize_space(body[j:i])
                        i += 1
                    i += 1
                # Capture the {filename} argument.
                filename = ""
                while body[i] != "}":
                    filename += body[i]
                    i += 1
                if len(filename) > 0:
                    filenames.append(filename)
                    dois.append(doi)
            else:
                i += 1

            # Find the next include command after the current position.
            li = []
            j = body.find("includearticle", i)
            if j >= 0:
                li.append(j)
            j = body.find("includeprearticle", i)
            if j >= 0:
                li.append(j)
            j = lower_body.find("includepreface", i)
            if j >= 0:
                li.append(j)
            i = min(li) if len(li) > 0 else -1

    return filenames, dois
def get_bibtex_from_tex(tex_filename):
    """Extract the \\bibliography{...} argument(s) from a TeX file.

    Reads the file (UTF-8, falling back to iso-8859-1) and returns the
    bibtex base name referenced by non-commented \\bibliography commands
    (concatenated if several), or "" when none is found.
    """
    bibtex_filename = ""

    body = ""
    if os.path.isfile(tex_filename):
        try:
            with open(tex_filename, encoding="utf-8") as f:
                body = f.read()
        except UnicodeDecodeError:
            # Some legacy files are latin-1 encoded.
            with open(tex_filename, encoding="iso-8859-1") as f:
                body = f.read()

        i = body.find("\\bibliography")
        while i >= 0:
            after = i + len("\\bibliography")
            if after < len(body) and body[after].isalpha():
                # Longer command such as \bibliographystyle: the previous
                # code matched it too and wrongly captured its argument.
                i = after
            elif i > 1 and not is_tex_comment(body, i - 2):
                # Skip to the opening brace, then capture the argument.
                while body[i] != "{":
                    i += 1
                i += 1
                while body[i] != "}":
                    bibtex_filename += body[i]
                    i += 1
            else:
                i += 1

            i = body.find("\\bibliography", i)

    return bibtex_filename
# PCJ (Peer Community Journal) section codes -> human-readable section names.
PCJ_SECTIONS = {
    "animsci": "Animal Science",
    "archaeo": "Archaeology",
    "ecology": "Ecology",
    "ecotoxenvchem": "Ecotoxicology & Environmental Chemistry",
    "evolbiol": "Evolutionary Biology",
    "forestwoodsci": "Forest & Wood Sciences",
    "genomics": "Genomics",
    "healthmovsci": "Health & Movement Sciences",
    "infections": "Infections",
    "mcb": "Mathematical & Computational Biology",
    "microbiol": "Microbiology",
    "networksci": "Network Science",
    "neuro": "Neuroscience",
    "orgstudies": "Organization Studies",
    "paleo": "Paleontology",
    "rr": "Registered Reports",
    "zool": "Zoology",
}

PCJ_UGA_SECTION = ["healthmovsci", "rr", "orgstudies"]
PCJ_CONFERENCES = ["Euring 2023"]
# Sections for which a topic is mandatory, with the associated topic name.
PCJ_MANDATORY_TOPICS = {
    "ecology": "Ecology",
    "evolbiol": "Evolution",
    "genomics": "Genetics/genomics",
    "paleo": "Paleontology",
    "archaeo": "Archaeology",
    "microbiol": "Microbiology",
    "neuro": "Neuroscience",
}


def get_pci(value):
    """Return the PCJ section name for a section code, or "" when unknown."""
    return PCJ_SECTIONS.get(value, "")
# Article type codes -> French display labels.
ARTICLE_TYPES = {
    "biographical-note": "Notice biographique",
    "book-review": "Recension d’ouvrage",
    "clarification": "Mise au point",
    "congress": "Intervention en colloque",
    "corrigendum": "Corrigendum",
    "editorial": "Éditorial",
    "erratum": "Erratum",
    "expression-of-concern": "Avertissement des éditeurs",
    "foreword": "Avant-propos",
    "guest-editors": "Rédacteurs invités",
    "historical-commentary": "Commentaire historique",
    "history-of-sciences": "Histoire des sciences et des idées",
    "letter": "Commentaire et réponse",
    "news": "C'est apparu dans la presse",
    "opinion": "Opinion / Perspective",
    "preliminary-communication": "Communication préliminaire",
    "research-article": "Article de recherche",
    "retraction": "Rétractation",
    "review": "Article de synthèse",
    "software-tool": "Outil logiciel",
}
# External-id type -> short label displayed in templates.
# From : apps/ptf/templates/common/externalid_detail.html
extids_formats: dict[str, str] = {
    "arxiv": "arXiv",
    "tel": "TEL",
    "hal": "HAL",
    "theses.fr": "theses.fr",
    "doi": "DOI",
    "numdam-id": "Numdam",
    "sps-id": "SPS",
    "mr-item-id": "MR",
    "zbl-item-id": "Zbl",
    "jfm-item-id": "JFM",
    "eudml-item-id": "EuDML",
    "pmid": "PMID",
    "ark": "Gallica",
}