Coverage for src/ptf/external/arxiv.py: 0%

32 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-11-05 09:56 +0000

1from datetime import timedelta 

2 

3import feedparser 

4from requests_cache import CachedSession 

5from requests_cache import FileCache 

6 

7from django.conf import settings 

8 

9from ptf import model_data 

10from ptf.model_data import AbstractDict 

11from ptf.model_data import create_contributor 

12 

# Shared HTTP session for outgoing requests made by this module.
# Responses are cached on disk (location configurable through the
# REQUESTS_CACHE_LOCATION Django setting) and expire after two days.
session = CachedSession(
    backend=FileCache(
        getattr(settings, "REQUESTS_CACHE_LOCATION", None) or "/tmp/ptf_requests_cache",
        decode_content=False,
    ),
    expire_after=timedelta(days=2),
)

# Default headers identifying the client. They are set on the session object
# itself rather than passed as a constructor keyword: requests.Session does
# not take a ``headers`` argument, so a constructor keyword may be silently
# dropped, whereas ``session.headers`` is the documented place for defaults.
session.headers.update(
    {
        "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", None) or "Mathdoc/1.0.0",
        "From": getattr(settings, "REQUESTS_EMAIL", None) or "accueil@listes.mathdoc.fr",
    }
)

24 

25 

def get_arxiv_url(id):
    """Return the arXiv export API query URL for the given identifier(s).

    Args:
        id: An arXiv identifier (e.g. ``"2101.00001"`` or
            ``"hep-th/9901001v2"``), or several joined by commas. Non-string
            values are converted with ``str()``.

    Returns:
        The query URL as a string, with the identifier percent-encoded so
        that characters such as ``&``, ``#`` or spaces cannot alter the
        query string.
    """
    from urllib.parse import quote

    # "/" appears in old-style identifiers and "," separates several ids;
    # both are legal inside a query value, so keep them readable.
    encoded_id = quote(str(id), safe="/,")
    return f"http://export.arxiv.org/api/query?id_list={encoded_id}"

28 

29 

def get_arxiv_article(id):
    """Fetch an arXiv record and convert it into an article-data object.

    The arXiv export API is queried through the module-level cached
    ``session`` and the Atom response is parsed with feedparser. Only the
    first entry of the feed is used.

    Args:
        id: The arXiv identifier passed on to ``get_arxiv_url``.

    Returns:
        The object produced by ``model_data.create_articledata()`` with its
        title, contributors, abstract and (when present) PDF URL filled in,
        or ``None`` when the feed has no entry or the API answered with an
        error entry.
    """
    url = get_arxiv_url(id)

    # NOTE(review): "accept_encoding" is not the standard "Accept-Encoding"
    # header name and "utf-8" is a charset, not a content coding. Kept as-is
    # to avoid changing what is sent; confirm whether it is needed at all.
    headers = {"accept_encoding": "utf-8"}

    # For SSL Errors, use verify=False kwarg
    response = session.get(url=url, headers=headers)

    feed = feedparser.parse(response.text)
    if not feed.entries:
        return None

    entry = feed.entries[0]

    # The arXiv API reports failures (e.g. a malformed id) as a normal feed
    # holding a single "Error" entry whose id lives under /api/errors.
    # Such an entry is not an article.
    if "/api/errors" in entry.get("id", ""):
        return None

    article_data = model_data.create_articledata()

    # TITLE
    article_data.title_tex = entry.title

    # AUTHORS
    # Tolerate an entry without an author list instead of raising.
    for author_entry in entry.get("authors", []):
        author = create_contributor()
        author["role"] = "author"
        author["string_name"] = author_entry.name
        article_data.contributors.append(author)

    # ABSTRACT
    xabstract: AbstractDict = {
        "tag": "abstract",
        "value_html": "",
        "value_tex": entry.summary,
        "value_xml": "",
        "lang": "en",
    }
    article_data.abstracts.append(xabstract)

    # PDF
    # If several PDF links are present the last one wins, as before.
    for link in entry.get("links", []):
        if link.get("type") == "application/pdf":
            article_data.pdf_url = link["href"]

    return article_data