Coverage for src/ptf/citedby.py: 68%

484 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-11-05 09:56 +0000

1import concurrent.futures 

2import html 

3import re 

4from collections import defaultdict 

5from datetime import timedelta 

6from difflib import SequenceMatcher 

7 

8import xmltodict 

9from bs4 import BeautifulSoup 

10from pylatexenc.latex2text import LatexNodes2Text 

11from requests.exceptions import ConnectionError 

12from requests.exceptions import Timeout 

13from requests_cache import CachedSession 

14from requests_cache import FileCache 

15 

16from django.conf import settings 

17 

18from ptf.bibtex import parse_bibtex 

19from ptf.cmds.xml.xml_utils import normalise_span 

20from ptf.model_data import RefData 

21from ptf.model_data import create_contributor 

22from ptf.model_data_converter import update_ref_data_for_jats 

23from ptf.model_helpers import get_extid 

24from ptf.models import BibItemId 

25from ptf.models import get_names 

26from ptf.utils import get_display_name 

27 

# Endpoints of the external services queried by the functions below.
ADS_URL = "https://api.adsabs.harvard.edu/v1/search"
ARXIV_URL = "https://export.arxiv.org/api/query"
CROSSREF_URL = "https://doi.crossref.org/servlet/getForwardLinks"
SEMANTIC_URL = "https://api.semanticscholar.org/v1/paper/"
ZBMATH_URL = "https://zbmath.org"

# Provider labels: stored on each reference as `provider` and used as the
# keys of PRIORITY.
ADS = "NASA ADS"
CROSSREF = "Crossref"
SEMANTIC = "Semantic Scholar"
ZBMATH = "zbMATH"

# Base value for the `timeout` argument of the HTTP calls; the zbMATH, arXiv
# and ADS requests use half of it.
TIMEOUT = 4.0

# Ranking used by built_citations() when two providers return a reference
# with the same DOI or arXiv id: the higher value wins. Providers that are
# not listed get 0 (defaultdict(int)).
PRIORITY = defaultdict(int, {ZBMATH: 10, ADS: 9, CROSSREF: 8, SEMANTIC: 7})

# Shared LaTeX-to-text converter, configured with math_mode="verbatim".
LATEX_PARSER = LatexNodes2Text(math_mode="verbatim")

45 

# Module-wide HTTP session shared by every provider call in this file.
# Responses are cached through a FileCache backend and expire after 30 days.
# The cache location and the identifying headers are read from Django
# settings when they are set to a non-empty value; otherwise the literal
# fallbacks below are used.
session = CachedSession(
    backend=FileCache(
        getattr(settings, "REQUESTS_CACHE_LOCATION", None) or "/tmp/ptf_requests_cache",
        decode_content=False,
    ),
    headers={
        "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", None) or "Mathdoc/1.0.0",
        "From": getattr(settings, "REQUESTS_EMAIL", None) or "accueil@listes.mathdoc.fr",
    },
    expire_after=timedelta(days=30),
)

57 

58 

def create_refdata(lang="und"):
    """Return a fresh RefData of type "misc" whose external ids are all None.

    The ids initialised here (doi, arxiv, zbl, semantic) are the ones the
    rest of this module reads when de-duplicating and linking references.
    """
    ref = RefData(lang=lang)
    ref.type = "misc"
    for identifier in ("doi", "arxiv", "zbl", "semantic"):
        setattr(ref, identifier, None)
    return ref

67 

68 

def is_same_title(compare, titles, tol=0.90):
    """Tell whether `compare` closely matches at least one entry of `titles`.

    Both sides are reduced to their word characters and lower-cased before
    being compared; a match needs a SequenceMatcher ratio strictly above
    `tol`.
    """

    def squash(text):
        return re.sub(r"\W", "", text).lower()

    reference = squash(compare)
    return any(
        SequenceMatcher(None, reference, squash(candidate)).ratio() > tol
        for candidate in titles
    )

76 

77 

def get_zbmath_bibtex(params):
    """Run a zbMATH search and return the BibTeX of the documents it links to.

    The result page is searched for a `div` with class "citations"; every
    anchor with an href inside it is fetched again through the
    `/bibtexoutput` endpoint and the responses are concatenated. An empty
    string is returned when the page has no such `div`.
    """
    page = session.get(
        ZBMATH_URL,
        params=params,
        headers={"Content-Type": "text/html"},
        timeout=0.5 * TIMEOUT,
    )
    citations_div = BeautifulSoup(page.text, "html.parser").find("div", {"class": "citations"})
    if not citations_div:
        return ""

    chunks = []
    for anchor in citations_div.find_all("a", href=True):
        bibtex_url = ZBMATH_URL + "/bibtexoutput" + anchor.get("href", "")
        reply = session.get(
            bibtex_url,
            headers={"Content-Type": "text/x-bibtex"},
            timeout=0.5 * TIMEOUT,
        )
        reply.encoding = "utf-8"
        chunks.append(reply.text)
    return "".join(chunks)

92 

93 

def citedby_zbmath(metadata):
    """Query zbMATH for the documents citing the article described by `metadata`.

    The query targets the zbMATH id (`an:`) when `metadata` has a "zbl_id",
    otherwise the DOI (`en:`), OR-ed with a title + authors clause. The
    BibTeX returned by zbMATH is handed to parse_bibtex and its result is
    returned unchanged.
    """
    if "zbl_id" in metadata:
        identifier = "an:" + metadata["zbl_id"]
    else:
        identifier = "en:" + metadata["doi"]

    title_tex = normalise_span(metadata["title"]).replace("\xa0", " ")
    authors = "&au:".join(metadata["authors"])
    query = f'{identifier}|(ti:"{title_tex}"&au:{authors})'

    bibtex = get_zbmath_bibtex({"q": query})
    return parse_bibtex(bibtex)

105 

106 

def citedby_crossref(metadata):
    """Fetch the Crossref forward links (cited-by) of `metadata["doi"]`.

    Credentials come from settings.CROSSREF_USER / settings.CROSSREF_PWD.
    Returns a list of forward-link entries; the list is empty when the
    request does not answer 200 or when the response carries no link.
    """
    # Let requests build the query string so that reserved characters in the
    # DOI or in the credentials ("&", "#", "<", ">", ...) are percent-encoded
    # instead of corrupting the URL.
    params = {
        "usr": settings.CROSSREF_USER,
        "pwd": settings.CROSSREF_PWD,
        "doi": metadata["doi"],
    }
    response = session.get(CROSSREF_URL, params=params, timeout=TIMEOUT)
    response.encoding = "utf-8"

    citations = []
    if response.status_code == 200:
        data = xmltodict.parse(response.text)
        body = data["crossref_result"]["query_result"]["body"]
        if body:
            citations = body.get("forward_link") or []

    # xmltodict yields a single mapping (not a list) when there is one link.
    if not isinstance(citations, list):
        citations = [citations]
    return citations

123 

124 

def get_arxiv_id(metadata):
    """Look up the arXiv id of the article described by `metadata`.

    arXiv is searched by DOI or exact title, limited to one result. The id
    is returned without its version suffix, and only when the title of the
    returned entry matches the article title (see is_same_title);
    otherwise None is returned.
    """
    title_tex = normalise_span(metadata["title"]).replace("\xa0", " ")
    headers = {"Content-Type": "application/atom+xml"}
    query = "doi:" + metadata["doi"] + " OR (ti:" + f'"{title_tex}"' + ")"
    params = {"search_query": query, "max_results": 1}
    response = session.get(ARXIV_URL, params=params, headers=headers, timeout=0.5 * TIMEOUT)
    if response.status_code != 200:
        return None

    data = xmltodict.parse(response.text)
    if "entry" not in data["feed"]:
        return None
    entry = data["feed"]["entry"]
    if not is_same_title(title_tex, [entry["title"]]):
        return None

    identifier = entry["id"].split("arxiv.org/abs/")[-1]
    # Strip only a trailing version marker ("v2"). Splitting on the first "v"
    # would truncate old-style ids such as "solv-int/9901001v1".
    return re.sub(r"v\d+$", "", identifier)

140 

141 

def citedby_ads(metadata, by_doi=True, citedby=True):
    """Return the NASA ADS records citing (or cited by) an article.

    The article is identified by its arXiv id: looked up through
    get_arxiv_id when `by_doi` is true, read from `metadata["arxiv_id"]`
    otherwise. `citedby` selects the ADS "citation" field, its negation the
    "reference" field. A first query collects the bibcodes, a second
    (bigquery) fetches their details. An empty list is returned when the id
    is unknown or a request fails.
    """
    arxiv_id = get_arxiv_id(metadata) if by_doi else metadata["arxiv_id"]
    if not arxiv_id:
        return []

    field = "citation" if citedby else "reference"
    auth = {"Authorization": f"Bearer:{settings.ADS_TOKEN}"}

    lookup = session.get(
        ADS_URL + "/query",
        headers=auth,
        params={"q": "identifier:" + arxiv_id, "fl": field},
        timeout=0.5 * TIMEOUT,
    )
    if lookup.status_code != 200:
        return []

    docs = lookup.json().get("response", {}).get("docs")
    if not (docs and isinstance(docs, list) and field in docs[0]):
        return []

    payload = "bibcode\n" + "\n".join(docs[0][field])
    wanted = ",".join(
        (
            "abstract,author,bibcode,comment,doi,doctype",
            "eid,identifier,issue,keyword,orcid_pub",
            "page,page_count,page_range,pub,pub_raw,title,volume,year",
        )
    )
    details = session.post(
        ADS_URL + "/bigquery",
        params={"q": "*:*", "fl": wanted, "rows": 200},
        headers=auth,
        data=payload,
        timeout=0.5 * TIMEOUT,
    )
    details.encoding = "utf-8"
    if details.status_code != 200:
        return []
    return details.json().get("response", {}).get("docs")

172 

173 

def citedby_semantic(metadata, citedby=True):
    """Return the Semantic Scholar citations (or references) of a DOI.

    Nothing is requested, and an empty list is returned, when
    settings.SITE_ID is 36 (PCJ). A non-200 answer also yields an empty
    list.
    """
    key = "citations" if citedby else "references"
    found = []
    if settings.SITE_ID == 36:  # every site but PCJ queries the service
        return found

    reply = session.get(SEMANTIC_URL + metadata["doi"], timeout=TIMEOUT)
    reply.encoding = "utf-8"
    if reply.status_code == 200:
        found.extend(reply.json()[key])
    return found

183 

184 

def set_contributors(ref, api_contributors, orcids=None):
    """Convert provider-specific author data and store it on `ref.contributors`.

    `api_contributors` may be a single item or a list. How a name is split
    depends on `ref.provider`: Crossref gives a mapping with "given_name" /
    "surname", ADS and zbMATH give "Last, First" strings, Semantic Scholar
    gives a mapping whose "name" is "First ... Last". Any other provider
    yields empty names. `orcids` is applied only when it has exactly one
    entry per contributor; the placeholder "-" is stored as "".
    """
    if not isinstance(api_contributors, list):
        api_contributors = [api_contributors]

    def split_name(raw):
        provider = ref.provider
        if provider == CROSSREF:
            return raw.get("given_name"), raw.get("surname")
        if provider in [ADS, ZBMATH]:
            parts = raw.split(", ")
            return (parts[1] if len(parts) > 1 else ""), parts[0]
        if provider == SEMANTIC:
            parts = raw["name"].split(" ")
            return " ".join(parts[0:-1]), parts[-1]
        return "", ""

    contributors = []
    for raw in api_contributors:
        given, family = split_name(raw)
        entry = create_contributor()
        entry["first_name"] = given.strip() if given else ""
        entry["last_name"] = family.strip() if family else ""
        entry["role"] = "author"
        contributors.append(entry)

    if orcids and len(contributors) == len(orcids):
        for entry, orcid in zip(contributors, orcids):
            entry["orcid"] = "" if orcid == "-" else orcid
    ref.contributors = contributors

215 

216 

def ads_to_bibtex_type(doc_type):
    """Map an ADS doctype to a BibTeX entry type.

    "article" and "eprint" become "article"; a handful of doctypes that are
    already BibTeX types are returned as is; everything else is "misc".
    """
    if doc_type in ("article", "eprint"):
        return "article"
    same_in_bibtex = (
        "book",
        "inbook",
        "inproceedings",
        "mastersthesis",
        "phdthesis",
        "proceedings",
        "techreport",
    )
    return doc_type if doc_type in same_in_bibtex else "misc"

233 

234 

def crossref_to_bibtex_type(doc_type, item):
    """Map a Crossref forward-link element name to a BibTeX entry type.

    `item` is the content of that element; the presence of "paper_title" or
    "chapter_title" distinguishes a part (inproceedings / inbook) from the
    whole volume (proceedings / book). Unknown element names give "misc".
    """
    if doc_type == "journal_cite":
        return "article"
    if doc_type == "conf_cite":
        return "inproceedings" if "paper_title" in item else "proceedings"
    if doc_type == "book_cite":
        return "inbook" if "chapter_title" in item else "book"
    return "misc"

251 

252 

def citedby_crossref_refs(citations):
    """Turn Crossref forward-link entries into RefData objects.

    Each entry is consumed in place: its "@doi" key is popped, then the one
    remaining (element name, fields) pair is popped and converted. Entries
    left empty after removing "@doi" are skipped.
    """
    # (Crossref field, RefData attribute, skip when the value is empty).
    # Applied in order, so a later row overrides an earlier one that targets
    # the same attribute.
    field_map = (
        ("journal_title", "source_tex", True),
        ("article_title", "article_title_tex", True),
        ("volume_title", "source_tex", False),  # book or proceedings title
        ("paper_title", "article_title_tex", True),  # inproceedings title
        ("chapter_title", "chapter_title_tex", True),  # incollection or inbook
        ("first_page", "fpage", False),
        ("last_page", "lpage", False),
        ("volume", "volume", False),
        ("issue", "issue", False),
        ("year", "year", True),
    )

    refs = []
    for link in citations:
        link.pop("@doi")  # only the typed citation element should remain
        if not link:
            continue
        doc_type, fields = link.popitem()

        ref = create_refdata()
        ref.provider = CROSSREF
        ref.type = crossref_to_bibtex_type(doc_type, fields)

        for key, attribute, needs_value in field_map:
            if key in fields and (fields[key] or not needs_value):
                setattr(ref, attribute, fields[key])

        if "contributors" in fields and "contributor" in fields["contributors"]:
            set_contributors(ref, fields["contributors"]["contributor"])
        if "doi" in fields and fields["doi"]:
            ref.doi = fields["doi"]["#text"].lower()
        refs.append(ref)
    return refs

289 

290 

def citedby_zbmath_refs(citations):
    """Convert zbMATH citations (parsed BibTeX items) through bibtex_to_refs."""
    refs = bibtex_to_refs(citations)
    return refs

293 

294 

def is_misc(doctype):
    """Return True when `doctype` is not one of the BibTeX types kept as is.

    "booklet", "manual" and "mastersthesis" are deliberately absent from the
    list, so they count as misc.
    """
    kept_types = (
        "article",
        "book",
        "conference",
        "inbook",
        "incollection",
        "inproceedings",
        "phdthesis",
        "proceedings",
        "techreport",
    )
    return doctype not in kept_types

312 

313 

def bibtex_to_refs(bibitems):
    """Convert parsed BibTeX items (dicts) into RefData objects tagged zbMATH.

    Each item must have a "doctype"; types rejected by is_misc are rewritten
    to "misc" in the item itself. All other fields are optional and copied
    when present.
    """
    refdata = []
    for item in bibitems:
        ref = create_refdata()
        ref.provider = ZBMATH
        # The normalised type is written back into the caller's dict.
        item["doctype"] = "misc" if is_misc(item["doctype"]) else item["doctype"]
        ref.type = item["doctype"]

        # Container title: the first available field wins.
        if "fjournal" in item:
            ref.source_tex = item["fjournal"]
        elif "journal" in item:
            ref.source_tex = item["journal"]
        elif "booktitle" in item:
            ref.source_tex = item["booktitle"]
        elif "howpublished" in item:
            # Drop a trailing " (YYYY)" / " (YYYY)." year marker.
            ref.source_tex = re.sub(r" \([0-9]{4}\)\.?", "", item["howpublished"])

        if "fseries" in item:
            ref.series = item["fseries"]
        elif "series" in item:
            ref.series = item["series"]

        if "title" in item:
            if item["doctype"] in ["article", "misc"]:
                ref.article_title_tex = item["title"]
            elif item["doctype"] in ["incollection", "inproceedings", "inbook"]:
                ref.chapter_title_tex = item["title"]
            else:
                ref.source_tex = item["title"]
        if "url" in item and not ref.source_tex:
            ref.source_tex = item["url"]

        if "pages" in item and item["pages"]:
            # Split on runs of separators so that the usual BibTeX range
            # "12--34" gives exactly two parts; splitting on single
            # characters produced ["12", "", "34"] and lost the last page.
            pages = [part for part in re.split(r"\W+", item["pages"]) if part]
            if pages:
                ref.fpage = pages[0]
                if len(pages) == 2:
                    ref.lpage = pages[1]

        if "volume" in item:
            ref.volume = item["volume"]
        if "number" in item:
            ref.issue = item["number"]
        if "issue" in item:  # takes precedence over "number"
            ref.issue = item["issue"]
        if "note" in item:
            ref.comment = item["note"]
        if "year" in item:
            ref.year = item["year"]
        if "author" in item:
            set_contributors(ref, item["author"].split(" and "))

        if "publisher" in item:
            ref.publisher_name = item["publisher"]
        elif "school" in item:
            ref.publisher_name = item["school"]
        elif "institution" in item:
            ref.publisher_name = item["institution"]
        if "address" in item:
            ref.publisher_loc = item["address"]

        if "doi" in item and item["doi"]:
            ref.doi = item["doi"].lower()
        if "zbmath" in item:
            ref.zbl = item["zbmath"]
        if "zbl" in item:  # takes precedence over "zbmath"
            ref.zbl = item["zbl"]
        refdata.append(ref)
    return refdata

376 

377 

def citedby_ads_refs(citations):
    """Convert NASA ADS documents (dicts) into RefData objects tagged ADS.

    "bibcode" and "doctype" are required on every document; the other
    fields are optional and copied when present.
    """
    refdata: list[RefData] = []
    for item in citations:
        ref = create_refdata()
        ref.provider = ADS
        ref.bibcode = item["bibcode"]
        ref.type = ads_to_bibtex_type(item["doctype"])

        if "title" in item and item["title"]:
            ref.article_title_tex = item["title"][0]

        if "page_range" in item:
            bounds = item["page_range"].split("-")
            if len(bounds) == 2:
                ref.fpage = bounds[0]
                ref.lpage = bounds[1]
        elif "page" in item and item["page"] and item["page"][0].isdigit():
            first_page = item["page"][0]
            ref.fpage = first_page
            if "page_count" in item and item["page_count"]:
                # Last page = first page + number of pages - 1. The first
                # page used to be left out, giving e.g. "9" for a ten-page
                # article starting on page 100.
                ref.lpage = str(int(first_page) + int(item["page_count"]) - 1)

        if "year" in item and item["year"]:
            ref.year = item["year"]
        if "author" in item and item["author"]:
            set_contributors(ref, item["author"], item.get("orcid_pub", []))
        if "issue" in item:
            ref.issue = item["issue"]
        if "volume" in item:
            ref.volume = item["volume"]
        if "doi" in item and item["doi"]:
            ref.doi = item["doi"][0].lower()

        if "eid" in item and item["eid"]:
            arxiv = item["eid"].split("arXiv:")
            if "pub" in item and "arXiv" in item["pub"]:
                ref.arxiv = arxiv[-1]
                ref.source_tex = "arXiv"

        if "pub_raw" in item and item["pub_raw"] and ref.doi and not ref.arxiv:
            # Keep what precedes the volume marker in the raw publication.
            found = re.match(r"(^.+)?[,.]( vol. | Volume )", item["pub_raw"])
            if found:
                ref.source_tex = found.group(1)
        elif "pub" in item and not ref.arxiv:
            ref.source_tex = item["pub"]

        if "abstract" in item and item["abstract"]:
            ref.abstract = [item["abstract"]]
        refdata.append(ref)
    return refdata

421 

422 

def citedby_semantic_refs(citations):
    """Convert Semantic Scholar papers (dicts) into RefData objects.

    Titles written entirely in upper case are capitalised. A non-empty
    "venue" overrides the "arXiv" source set from "arxivId".
    """
    refs = []
    for paper in citations:
        ref = create_refdata()
        ref.provider = SEMANTIC

        if "title" in paper:
            raw_title = paper["title"]
            ref.article_title_tex = raw_title.capitalize() if raw_title.isupper() else raw_title
        if paper.get("year"):
            ref.year = str(paper["year"])
        if paper.get("authors"):
            set_contributors(ref, paper["authors"])
        if paper.get("doi"):
            ref.doi = paper["doi"].lower()
        if paper.get("arxivId"):
            ref.arxiv = paper["arxivId"]
            ref.source_tex = "arXiv"
        if paper.get("venue"):
            ref.source_tex = paper["venue"]
        if "paperId" in paper:
            ref.semantic = paper["paperId"]
        refs.append(ref)
    return refs

447 

448 

def get_extlinks(extids):
    """Build the HTML link snippets for a list of (id_type, id_value) pairs.

    Each known id type gives ' | <a href="...">Label:value</a>', with the
    href taken from BibItemId.get_href(). Unknown id types are ignored.
    """
    prefixes = {
        "doi": "DOI:",
        "arxiv": "arXiv:",
        "zbl-item-id": "Zbl:",
        "semantic-scholar": "Semantic-scholar:",
    }
    links = []
    for extid in extids:
        bib_id = BibItemId()
        bib_id.id_type, bib_id.id_value = extid
        prefix = prefixes.get(bib_id.id_type)
        if prefix is None:
            continue
        label = prefix + bib_id.id_value
        links.append(f' | <a href="{bib_id.get_href()}">{label}</a>')
    return links

467 

468 

def built_extlinks(ref):
    """Store on `ref.extids` the (id_type, id_value) pairs of its known ids.

    DOI, arXiv and zbMATH ids are listed in that order. The Semantic Scholar
    id is used only as a fallback, when none of the three is set.
    """
    candidates = (
        ("doi", ref.doi),
        ("arxiv", ref.arxiv),
        ("zbl-item-id", ref.zbl),
    )
    extids = [(id_type, value) for id_type, value in candidates if value]
    if not extids and getattr(ref, "semantic", False):
        extids.append(("semantic-scholar", ref.semantic))
    ref.extids = extids

480 

481 

def get_values_for_stats(refs):
    """
    Summarise each reference as a plain dict for statistics.
    @param refs: dict of RefData.__dict__ (extended with "extlinks")
    @return: list of dict with keys authors, title, publication_title,
             year, url and source
    """
    stats = []
    for entry in refs.values():
        authors = [
            {
                "author": get_display_name(
                    person["prefix"],
                    person["first_name"],
                    person["last_name"],
                    person["suffix"],
                    person["string_name"],
                )
            }
            for person in entry.get("contributors")
            if person["role"] == "author"
        ]

        title = entry[get_publication_title(entry, "title")]
        publication_title = entry[get_publication_title(entry, "publication_title")]

        # The URL is the href of the first external link, when there is one.
        href = re.search(r'href="(.+)">', entry["extlinks"][0]) if entry["extlinks"] else None
        url = href.group(1) if href else ""

        stats.append(
            {
                "authors": authors,
                "title": title,
                "publication_title": publication_title,
                "year": entry["year"],
                "url": url,
                "source": entry["provider"],
            }
        )
    return stats

523 

524 

def get_publication_title(ref_item, category="title"):
    """Return the name of the field of `ref_item` holding the wanted title.

    `category` is "title" (title of the work) or "publication_title" (title
    of its container); any other value gives None. The reference type is
    first reduced to "thesis" (when it contains that word) or "misc".
    """
    kind = "thesis" if "thesis" in ref_item.get("type") else "misc"

    # NOTE(review): because of the reduction above only the "thesis" and
    # "misc" rows can be selected; the other rows are kept as they were.
    fields_by_type = {
        "incollection": {"title": "source_tex", "publication_title": "series"},
        "thesis": {"title": "source_tex", "publication_title": "series"},
        "article": {"title": "article_title_tex", "publication_title": "source_tex"},
        "book": {"title": "source_tex", "publication_title": "series"},
        "inbook": {"title": "chapter_title_tex", "publication_title": "series"},
        "misc": {"title": "article_title_tex", "publication_title": "source_tex"},
    }
    return fields_by_type.get(kind).get(category)

542 

543 

def built_citations(data):
    """De-duplicate citing references and render them.

    `data` is a list of RefData objects coming from several providers.
    Returns a tuple (citations_html, sources, citedby_for_stats): the
    rendered citations, the comma-separated sorted names of the providers
    that contributed one, and the output of get_values_for_stats.

    References are keyed by DOI, else zbMATH id, else arXiv id. For DOI and
    arXiv keys the provider with the highest PRIORITY wins. A reference
    known only by its Semantic Scholar id is kept when its title does not
    match the title of a reference that has one of the other ids.
    """
    # to match citations and add these ids when missing
    doi_arxiv = {ref.doi: ref.arxiv for ref in data if ref.doi and ref.arxiv}
    arxiv_doi = {v: k for k, v in doi_arxiv.items()}

    results = []
    for n, ref in enumerate(data):
        if ref.arxiv and not ref.doi:
            ref.doi = arxiv_doi.get(ref.arxiv)
        elif not ref.arxiv and ref.doi:
            ref.arxiv = doi_arxiv.get(ref.doi)
        built_extlinks(ref)
        # Expected to fill ref.citation_html, which is read just below.
        update_ref_data_for_jats(ref, n, with_label=False)
        ref.citation_html = html.unescape(ref.citation_html)
        results.append(vars(ref))

    # Most recent first, then by source, volume, issue and first page.
    results.sort(
        key=lambda k: (
            -int(k["year"]) if k["year"] else 0,
            k["source_tex"],
            k["volume"],
            k["issue"],
            k["fpage"],
        ),
    )

    refs = {}
    # Titles of the references that carry a strong identifier. Empty titles
    # are left out so that they cannot match anything.
    titles = {
        item[get_publication_title(item)]
        for item in results
        if any((item["arxiv"], item["doi"], item["zbl"])) and item[get_publication_title(item)]
    }

    for item in results:
        links = get_extlinks(item["extids"])
        level = PRIORITY[item["provider"]]
        citation = LATEX_PARSER.latex_to_text(item["citation_html"].replace("$$", "$"))
        ref = {"html": citation + "".join(links)}
        ref.update({"priority": level, "extlinks": links})
        ref.update(item)

        if item["doi"]:
            if item["doi"] not in refs or refs[item["doi"]]["priority"] < level:
                refs[item["doi"]] = ref
        elif item["zbl"]:
            refs[item["zbl"]] = ref
        elif item["arxiv"]:
            if item["arxiv"] not in refs or refs[item["arxiv"]]["priority"] < level:
                refs[item["arxiv"]] = ref
        elif item.get("semantic"):
            # Reached only when there is no DOI, zbMATH or arXiv id. The
            # former test additionally required a DOI or arXiv id here, which
            # could never hold, so these references were always dropped.
            title = item[get_publication_title(item)] or ""
            if not is_same_title(title, titles):
                refs[item["semantic"]] = ref

    sources = list({ref["provider"] for ref in refs.values()})
    sources = ", ".join(sorted(sources))
    citations_html = [citation["html"] for citation in refs.values()]
    citedby_for_stats = get_values_for_stats(refs)
    return citations_html, sources, citedby_for_stats

602 

603 

def citations_to_refs(provider, citations):
    """Convert raw `citations` with the converter matching `provider`.

    Returns None when `provider` is not one of the four known providers.
    """
    converters = {
        CROSSREF: citedby_crossref_refs,
        ZBMATH: citedby_zbmath_refs,
        ADS: citedby_ads_refs,
        SEMANTIC: citedby_semantic_refs,
    }
    convert = converters.get(provider)
    if convert is None:
        return None
    return convert(citations)

613 

614 

def get_citations(resource):
    """Return the documents citing `resource` and the providers that found them.

    Crossref, zbMATH and NASA ADS are queried in parallel threads. A
    provider whose request times out or cannot connect is skipped, as is
    one that returns nothing. The collected citations are converted to
    references and passed to built_citations, whose result is returned.
    """
    authors = get_names(resource, "author")
    zbl = get_extid(resource, "zbl-item-id")
    preprint = get_extid(resource, "preprint")

    metadata = {
        "authors": authors,
        "doi": resource.doi,
        "preprint_id": preprint.id_value if preprint else "",
        "title": resource.title_tex,
    }
    if zbl and zbl.id_value:
        metadata["zbl_id"] = zbl.id_value

    fetchers = (
        (citedby_crossref, CROSSREF),
        (citedby_zbmath, ZBMATH),
        (citedby_ads, ADS),
    )
    raw = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as pool:
        pending = {pool.submit(fetch, metadata): name for fetch, name in fetchers}
        for done in concurrent.futures.as_completed(pending):
            try:
                found = done.result()
            except (Timeout, ConnectionError):
                continue
            if found:
                raw[pending[done]] = found

    refs = []
    for name, cites in raw.items():
        refs.extend(citations_to_refs(name, cites))

    return built_citations(refs)

652 return built_citations(citations)