Coverage for src/ptf/external/arxiv.py: 0%
32 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-05 09:56 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-05 09:56 +0000
1from datetime import timedelta
3import feedparser
4from requests_cache import CachedSession
5from requests_cache import FileCache
7from django.conf import settings
9from ptf import model_data
10from ptf.model_data import AbstractDict
11from ptf.model_data import create_contributor
# Module-level HTTP session with an on-disk response cache, shared by all
# requests made from this module. Responses are cached for 2 days.
session = CachedSession(
    # File-backed cache; location comes from Django settings when configured,
    # otherwise falls back to /tmp.
    backend=FileCache(
        getattr(settings, "REQUESTS_CACHE_LOCATION", None) or "/tmp/ptf_requests_cache",
        # Store responses as received (no transparent decompression on write).
        decode_content=False,
    ),
    # Identify ourselves to remote APIs (arXiv asks clients to set User-Agent/From).
    headers={
        "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", None) or "Mathdoc/1.0.0",
        "From": getattr(settings, "REQUESTS_EMAIL", None) or "accueil@listes.mathdoc.fr",
    },
    expire_after=timedelta(days=2),
)
def get_arxiv_url(id):
    """Build the arXiv export-API query URL for the given arXiv identifier."""
    return "http://export.arxiv.org/api/query?id_list={}".format(id)
def get_arxiv_article(id):
    """Fetch an article's metadata from the arXiv export API.

    Queries the (cached) arXiv Atom feed for the given arXiv identifier and
    converts the first entry into a ptf ``ArticleData``.

    :param id: arXiv identifier (e.g. ``"1234.5678"`` or ``"math/0211159"``).
    :return: a populated ``model_data`` article, or ``None`` when the feed
        contains no entry for this identifier.
    """
    url = get_arxiv_url(id)

    # HTTP header field names use hyphens; the previous "accept_encoding" key
    # was not a valid header and was ignored by servers.
    # NOTE(review): "utf-8" is a charset, not a content-coding — presumably the
    # intent was to avoid compressed responses; confirm against the API's behavior.
    headers = {"Accept-Encoding": "utf-8"}

    # For SSL errors, use the verify=False kwarg.
    response = session.get(url=url, headers=headers)

    # Parse the Atom response; an empty entry list means the id is unknown.
    feed = feedparser.parse(response.text)
    if len(feed.entries) == 0:
        return None

    entry = feed.entries[0]

    article_data = model_data.create_articledata()

    # TITLE
    article_data.title_tex = entry.title

    # AUTHORS — entries without an <author> element have no ``authors`` attribute.
    for author_entry in getattr(entry, "authors", []):
        author = create_contributor()
        author["role"] = "author"
        author["string_name"] = author_entry.name

        article_data.contributors.append(author)

    # ABSTRACT — arXiv abstracts are plain text/TeX; only value_tex is filled.
    xabstract: AbstractDict = {
        "tag": "abstract",
        "value_html": "",
        "value_tex": entry.summary,
        "value_xml": "",
        "lang": "en",
    }
    article_data.abstracts.append(xabstract)

    # PDF — keep the last link advertised with a PDF content type.
    for link in entry.links:
        if link["type"] == "application/pdf":
            article_data.pdf_url = link["href"]

    return article_data