Coverage for src/ptf/model_data_converter.py: 58%
367 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-05 09:56 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-05 09:56 +0000
1##################################################################################################
2#
3# README
4#
5# Operations on the xml data objects
6# Django DB -> Data objects
7#
8##################################################################################################
10import types
12from django.db.models import Q
14from ptf.cmds.xml.citation_html import get_citation_html
15from ptf.cmds.xml.jats.builder.issue import get_single_title_xml
16from ptf.cmds.xml.jats.builder.issue import get_title_xml
17from ptf.cmds.xml.xml_base import RefBase
18from ptf.cmds.xml.xml_utils import escape
19from ptf.cmds.xml.xml_utils import get_contrib_xml
20from ptf.model_data import ArticleData
21from ptf.model_data import BookData
22from ptf.model_data import BookPartData
23from ptf.model_data import Foo
24from ptf.model_data import IssueData
25from ptf.model_data import JournalData
26from ptf.model_data import MathdocPublicationData
27from ptf.model_data import PublisherData
28from ptf.model_data import RefData
29from ptf.model_data import create_contributor
def db_append_obj_with_location_to_list(resource_qs, data_list):
    """Append one plain dict per object of ``resource_qs`` to ``data_list``.

    Every dict holds ``rel``, ``mimetype``, ``location`` and ``base`` (the
    ``base`` attribute of the object's own ``base``, or ``""`` when the object
    has no base).  ``metadata``, ``text`` and ``caption`` are added only for
    objects that expose those attributes.

    ``data_list`` is extended in place; nothing is returned.
    """
    optional_attrs = ("metadata", "text", "caption")

    for located in resource_qs:
        entry = {
            "rel": located.rel,
            "mimetype": located.mimetype,
            "location": located.location,
            "base": located.base.base if located.base else "",
        }
        # The object's "seq" is not exported.

        for name in optional_attrs:
            if hasattr(located, name):
                entry[name] = getattr(located, name)

        data_list.append(entry)
def db_to_contributors(qs):
    """Return a list of contributor dicts built from the contributions in ``qs``.

    ``qs`` must provide ``all()``.  Each contribution is copied into a fresh
    contributor obtained from ``create_contributor()``; a falsy ``orcid`` or
    ``mid`` is stored as ``""`` and ``addresses`` is the list of ``address``
    values of the contribution's ``contribaddress_set``.
    """
    result = []

    for contribution in qs.all():
        entry = create_contributor()

        for name in ("first_name", "last_name", "prefix", "suffix"):
            entry[name] = getattr(contribution, name)
        entry["orcid"] = contribution.orcid or ""
        for name in ("email", "string_name"):
            entry[name] = getattr(contribution, name)
        entry["mid"] = contribution.mid or ""
        entry["addresses"] = [item.address for item in contribution.contribaddress_set.all()]
        for name in (
            "role",
            "deceased_before_publication",
            "equal_contrib",
            "corresponding",
            "contrib_xml",
        ):
            entry[name] = getattr(contribution, name)
        # The contribution's database id is not exported.

        result.append(entry)

    return result
def db_to_resource_data_common(resource, data_resource):
    """Copy the fields shared by all resources from a DB object to a data object.

    ``resource`` is read and ``data_resource`` is filled in place; nothing is
    returned.  ``ext_links``, ``streams``, ``related_objects``,
    ``supplementary_materials`` and ``relations`` are appended to, so they must
    already exist on ``data_resource``; the other attributes are overwritten.
    """
    plain_fields = (
        "pid",
        "doi",
        "lang",
        "title_xml",
        "title_tex",
        "title_html",
        "abbrev",
        "trans_lang",
        "trans_title_tex",
        "trans_title_html",
        "funding_statement_xml",
        "funding_statement_html",
        "footnotes_xml",
        "footnotes_html",
    )
    for field in plain_fields:
        setattr(data_resource, field, getattr(resource, field))

    data_resource.ids = [
        (res_id.id_type, res_id.id_value) for res_id in resource.resourceid_set.all()
    ]
    data_resource.extids = [
        (ext_id.id_type, ext_id.id_value) for ext_id in resource.extid_set.all()
    ]

    db_append_obj_with_location_to_list(resource.extlink_set.all(), data_resource.ext_links)
    db_append_obj_with_location_to_list(resource.datastream_set.all(), data_resource.streams)
    # Every related object is exported; data_resource.figures is not filled here.
    db_append_obj_with_location_to_list(
        resource.relatedobject_set.all(), data_resource.related_objects
    )
    # Supplementary materials and reviews are additionally listed on their own.
    supplementary_qs = resource.relatedobject_set.filter(
        Q(rel="supplementary-material") | Q(rel="review")
    )
    db_append_obj_with_location_to_list(supplementary_qs, data_resource.supplementary_materials)

    data_resource.counts = [(item.name, item.value) for item in resource.resourcecount_set.all()]

    data_resource.contributors = db_to_contributors(resource.contributions)

    data_resource.kwds = [
        {"type": item.type, "lang": item.lang, "value": item.value}
        for item in resource.kwd_set.all()
    ]
    data_resource.subjs = [
        {"type": item.type, "lang": item.lang, "value": item.value}
        for item in resource.subj_set.all()
    ]

    data_resource.abstracts = [
        {
            "tag": item.tag,
            "lang": item.lang,
            "value_xml": item.value_xml,
            "value_tex": item.value_tex,
            "value_html": item.value_html,
        }
        for item in resource.abstract_set.all()
    ]

    data_resource.awards = [
        {"abbrev": item.abbrev, "award_id": item.award_id} for item in resource.award_set.all()
    ]

    # Relations where the resource is the subject: keep the "left" relation
    # name and the pid of the object.
    for relation in resource.subject_of.all():
        link = Foo()
        link.rel_type = relation.rel_info.left
        link.id_value = relation.object_pid
        data_resource.relations.append(link)

    # Relations where the resource is the object: keep the "right" relation
    # name and the pid of the subject.
    for relation in resource.object_of.all():
        link = Foo()
        link.rel_type = relation.rel_info.right
        link.id_value = relation.subject_pid
        data_resource.relations.append(link)

    # ISSNs are copied only for resources that have them.
    for field in ("issn", "e_issn"):
        if hasattr(resource, field):
            setattr(data_resource, field, getattr(resource, field))
def db_to_publisher_data(publisher):
    """Return a PublisherData holding the name and location of ``publisher``.

    ``ext_links`` is always set to an empty list.
    """
    result = PublisherData()
    result.name, result.loc = publisher.pub_name, publisher.pub_loc

    # TODO: ext_links ?
    result.ext_links = []

    return result
def db_to_publication_data(collection):
    """Return a MathdocPublicationData built from ``collection``.

    The common resource fields are copied first, then ``coltype``, ``wall``,
    ``issn`` and ``e_issn``.
    """
    publication = MathdocPublicationData()
    db_to_resource_data_common(collection, publication)

    for field in ("coltype", "wall", "issn", "e_issn"):
        setattr(publication, field, getattr(collection, field))

    return publication
def db_to_journal_data(collection):
    """Return a JournalData filled with the common resource fields of ``collection``."""
    # A JournalData has no coltype ?

    # A JournalData has a publisher but it does not seem to be used anywhere ?
    # The publisher seems to belong to the issue/article and not to the Journal.
    journal = JournalData()
    db_to_resource_data_common(collection, journal)
    return journal
def db_to_collection_data(collection):
    """Return a MathdocPublicationData describing ``collection``.

    Besides the common resource fields, ``coltype``, ``issn`` and ``e_issn``
    are always copied.  ``vseries``, ``volume`` and ``seq`` are copied only
    when ``collection`` has them.
    """
    result = MathdocPublicationData()
    db_to_resource_data_common(collection, result)

    for field in ("coltype", "issn", "e_issn"):
        setattr(result, field, getattr(collection, field))

    # attributes used for CollectionMembership
    for field in ("vseries", "volume", "seq"):
        if hasattr(collection, field):
            setattr(result, field, getattr(collection, field))

    return result
def db_to_issue_data(container, articles=None):
    """Return an IssueData built from an issue container and its articles.

    Args:
        container: the DB container of the issue.
        articles: optional iterable of articles to export (they may have been
            prefetched / filtered by the caller).  When omitted (``None``), all
            the articles of ``container.article_set`` are exported.  An empty
            iterable is honoured and produces an issue without articles.

    Returns:
        The IssueData, including its journal, publisher, provider name and one
        ArticleData per exported article.
    """
    data_issue = IssueData()

    db_to_resource_data_common(container, data_issue)

    data_issue.ctype = container.ctype

    data_issue.year = container.year
    data_issue.vseries = container.vseries
    data_issue.volume = container.volume
    data_issue.number = container.number

    data_issue.last_modified_iso_8601_date_str = (
        container.last_modified.isoformat() if container.last_modified else ""
    )
    # deployed_date() is a method call: evaluate it once and reuse the result.
    deployed_date = container.deployed_date()
    data_issue.prod_deployed_date_iso_8601_date_str = (
        deployed_date.isoformat() if deployed_date else ""
    )

    data_issue.journal = db_to_journal_data(container.my_collection)
    data_issue.publisher = db_to_publisher_data(container.my_publisher)
    data_issue.provider = container.provider.name

    # a Container has a seq, but it is used only for the books collections

    # Only fall back to the full article set when the caller gave no articles
    # at all: a truthiness test would replace a filter that matched nothing
    # with every article of the issue.
    if articles is None:
        articles = container.article_set.all()

    for article in articles:
        data_issue.articles.append(db_to_article_data(article))

    return data_issue
def db_to_book_data(container):
    """Return a BookData built from a book container.

    The result holds the common resource fields, the publisher, the
    collections the book belongs to (main collection first, then the other
    collections), the front matter when there is one, the body, the book parts
    and the bibliography.

    Args:
        container: the DB container of the book.
    """
    data_book = BookData()

    db_to_resource_data_common(container, data_book)

    data_book.ctype = container.ctype
    data_book.year = container.year

    data_book.publisher = db_to_publisher_data(container.my_publisher)
    # NOTE(review): db_to_issue_data stores container.provider.name, whereas the
    # provider object itself is stored here - confirm the difference is intended.
    data_book.provider = container.provider

    data_col = db_to_collection_data(container.my_collection)
    # These attributes are required when adding a container to solr
    if not hasattr(data_col, "vseries"):
        data_col.vseries = 0
    if not hasattr(data_col, "volume"):
        data_col.volume = 0
    data_book.incollection.append(data_col)

    # Each other collection is converted from its own record (not from the
    # main collection, which has already been added above).
    for collection in container.my_other_collections.all():
        data_book.incollection.append(db_to_collection_data(collection))

    frontmatter = getattr(container, "frontmatter", None)
    if frontmatter is not None:
        data_book.frontmatter_xml = frontmatter.value_xml
        data_book.frontmatter_toc_html = frontmatter.value_html
        data_book.frontmatter_foreword_html = frontmatter.foreword_html
    data_book.body = container.get_body()

    data_book.last_modified_iso_8601_date_str = (
        container.last_modified.isoformat() if container.last_modified else ""
    )
    # deployed_date() is a method call: evaluate it once and reuse the result.
    deployed_date = container.deployed_date()
    data_book.prod_deployed_date_iso_8601_date_str = (
        deployed_date.isoformat() if deployed_date else ""
    )

    for bookpart in container.article_set.all():
        data_book.parts.append(db_to_bookpart_data(bookpart))

    # bibitems holds the full reference data, bibitem only the HTML citations.
    for bibitem in container.bibitem_set.all():
        data_ref = db_to_ref_data(bibitem, data_book.lang)
        data_book.bibitems.append(data_ref)
        data_book.bibitem.append(data_ref.citation_html)

    return data_book
def db_to_article_data(article):
    """Return an ArticleData built from a DB article.

    Copies the common resource fields, the paging / numbering information, the
    publication and history dates (as ISO 8601 strings), the body in its
    different formats, the bibliography and, recursively, the translations of
    the article.
    """
    result = ArticleData()
    db_to_resource_data_common(article, result)

    result.atype = article.atype
    result.seq = str(article.seq)

    for field in (
        "fpage",
        "lpage",
        "page_range",
        "page_type",
        "article_number",
        "talk_number",
        "elocation",
    ):
        setattr(result, field, getattr(article, field))
    result.coi_statement = article.coi_statement or ""

    published = ""
    if article.date_published:
        published = article.date_published.isoformat()
    result.date_published_iso_8601_date_str = published

    # The deployed date is only looked up for articles attached to a container.
    deployed = ""
    if article.my_container and article.deployed_date():
        deployed = article.deployed_date().isoformat()
    result.prod_deployed_date_iso_8601_date_str = deployed

    # Only the history dates that are set are exported.
    candidates = (
        ("received", article.date_received),
        ("revised", article.date_revised),
        ("accepted", article.date_accepted),
        ("online", article.date_online_first),
    )
    history = []
    for kind, when in candidates:
        if when:
            history.append({"type": kind, "date": when.isoformat()})
    result.history_dates = history

    result.body = article.get_body()
    result.body_html = article.body_html
    result.body_tex = article.body_tex
    result.body_xml = article.body_xml

    # bibitems holds the full reference data, bibitem only the HTML citations.
    for bibitem in article.bibitem_set.all():
        reference = db_to_ref_data(bibitem, "und")
        result.bibitems.append(reference)
        result.bibitem.append(reference.citation_html)

    for translation in article.translations.all():
        result.translations.append(db_to_article_data(translation))

    return result
def db_to_bookpart_data(article):
    """Return a BookPartData built from a DB article that is part of a book.

    Copies the common resource fields, the paging information, the front
    matter when the article has one, the body and the bibliography (created
    with the language of the book part).
    """
    part = BookPartData()
    db_to_resource_data_common(article, part)

    for field in ("atype", "fpage", "lpage", "page_range", "page_type"):
        setattr(part, field, getattr(article, field))

    if hasattr(article, "frontmatter") and article.frontmatter is not None:
        part.frontmatter_xml = article.frontmatter.value_xml
        part.frontmatter_toc_html = article.frontmatter.value_html
        part.frontmatter_foreword_html = article.frontmatter.foreword_html
    part.body = article.get_body()

    # bibitems holds the full reference data, bibitem only the HTML citations.
    for bibitem in article.bibitem_set.all():
        reference = db_to_ref_data(bibitem, part.lang)
        part.bibitems.append(reference)
        part.bibitem.append(reference.citation_html)

    return part
def db_to_ref_data(bibitem, lang):
    """Return a RefData (created with ``lang``) filled from a DB bibitem.

    The bibliographic fields are copied as they are; ``extids`` becomes a list
    of ``(id_type, id_value)`` tuples taken from ``bibitem.bibitemid_set`` and
    ``contributors`` is built from ``bibitem.contributions``.
    """
    copied_fields = (
        "type",
        "user_id",
        "label",
        "citation_xml",
        "citation_tex",
        "citation_html",
        "publisher_name",
        "publisher_loc",
        "article_title_tex",
        "chapter_title_tex",
        "institution",
        "series",
        "volume",
        "issue",
        "month",
        "year",
        "comment",
        "annotation",
        "fpage",
        "lpage",
        "page_range",
        "size",
        "source_tex",
    )

    reference = RefData(lang=lang)
    for field in copied_fields:
        setattr(reference, field, getattr(bibitem, field))

    reference.extids = [(item.id_type, item.id_value) for item in bibitem.bibitemid_set.all()]

    reference.contributors = db_to_contributors(bibitem.contributions)

    return reference
def jats_from_ref_comment(ref):
    """Return the JATS ``<comment>`` element of a reference, or ``""`` without comment.

    The first ``http://`` URL of the comment (or, when there is none, the first
    ``https://`` URL) is wrapped in an ``<ext-link>``.  The URL runs up to the
    next space; that space itself is not copied to the output.  All the text
    is escaped.
    """
    comment = ref.comment
    if comment is None:
        return ""

    url_start = comment.find("http://")
    if url_start == -1:
        url_start = comment.find("https://")

    if url_start == -1:
        # No URL: the whole comment is plain text.
        inner = escape(comment)
    else:
        url_end = comment.find(" ", url_start)
        url_is_last = url_end == -1
        url = escape(comment[url_start:] if url_is_last else comment[url_start:url_end])

        pieces = [
            escape(comment[0:url_start]),
            f'<ext-link xlink:href="{url}">{url}</ext-link>',
        ]
        if not url_is_last:
            pieces.append(escape(comment[url_end + 1 :]))
        inner = "".join(pieces)

    return f'<comment xml:space="preserve">{inner}</comment>'
def jats_from_ref_attr(
    ref,
    attr_name,
    jats_tag="",
    preserve=False,
    attr_type=None,
    attr_type_value="",
    convert_html_tag=False,
):
    """Return the JATS element for one attribute of a reference.

    Args:
        ref: the reference holding the attribute.
        attr_name: name of the attribute to export.
        jats_tag: name of the JATS element; defaults to ``attr_name``.
        preserve: when True, ``xml:space="preserve"`` is added to the element.
        attr_type: optional name of an XML attribute to add to the element.
        attr_type_value: value of the ``attr_type`` XML attribute.
        convert_html_tag: when True, the value is converted with
            ``get_single_title_xml`` instead of being escaped.

    Returns:
        The XML string, or ``""`` when ``ref`` has no such attribute or when
        its value is empty.
    """
    if not hasattr(ref, attr_name):
        return ""

    attr = getattr(ref, attr_name)
    if not attr:
        return ""

    tag = jats_tag or attr_name

    # The converted value is used whatever the combination of preserve and
    # attr_type, so that convert_html_tag is never silently ignored.
    value = get_single_title_xml(attr) if convert_html_tag else escape(attr)

    xml_attributes = ""
    if attr_type is not None:
        xml_attributes += f' {attr_type}="{attr_type_value}"'
    if preserve:
        xml_attributes += ' xml:space="preserve"'

    return f"<{tag}{xml_attributes}>{value}</{tag}>"
def jats_from_abstract(abstract_lang, article_lang, abstract):
    """Return the JATS element wrapping ``abstract.value_xml``.

    An abstract written in the language of the article gives an ``<abstract>``
    element, any other language a ``<trans-abstract>`` element.
    """
    content = abstract.value_xml
    if abstract_lang != article_lang:
        return f'<trans-abstract xml:lang="{abstract_lang}">{content}</trans-abstract>'
    return f'<abstract xml:lang="{article_lang}">{content}</abstract>'
def jats_from_ref(ref):
    """Return the inner XML of the ``<element-citation>`` of a reference.

    The pieces are concatenated in this order: authors, article / chapter /
    source titles, editors, series, volume, publisher, institution, year,
    issue, DOI, pages, size and finally the comment.  ``ref`` must provide
    ``get_authors()`` and ``get_editors()``.
    """
    pieces = []

    authors = ref.get_authors()
    if authors is not None:
        pieces.extend(author["contrib_xml"] for author in authors)

    pieces.append(
        jats_from_ref_attr(
            ref, "article_title_tex", "article-title", preserve=True, convert_html_tag=True
        )
    )
    pieces.append(
        jats_from_ref_attr(ref, "chapter_title_tex", "chapter-title", convert_html_tag=True)
    )
    pieces.append(
        jats_from_ref_attr(ref, "source_tex", "source", preserve=True, convert_html_tag=True)
    )

    editors = ref.get_editors()
    if editors is not None:
        pieces.extend(editor["contrib_xml"] for editor in editors)

    pieces.append(jats_from_ref_attr(ref, "series", preserve=True))
    pieces.append(jats_from_ref_attr(ref, "volume"))
    pieces.append(jats_from_ref_attr(ref, "publisher_name", "publisher-name"))
    pieces.append(jats_from_ref_attr(ref, "publisher_loc", "publisher-loc"))
    pieces.append(jats_from_ref_attr(ref, "institution"))
    pieces.append(jats_from_ref_attr(ref, "year"))
    pieces.append(jats_from_ref_attr(ref, "issue"))
    pieces.append(
        jats_from_ref_attr(ref, "doi", "pub-id", attr_type="pub-id-type", attr_type_value="doi")
    )
    pieces.append(jats_from_ref_attr(ref, "fpage"))
    pieces.append(jats_from_ref_attr(ref, "lpage"))
    pieces.append(jats_from_ref_attr(ref, "size", "size"))
    pieces.append(jats_from_ref_comment(ref))

    return "".join(pieces)
def update_ref_data_for_jats(ref, i, with_label=True):
    """Update the label, citation_html and citation_xml of a reference for a JATS export.

    Set with_label=False if you do not want a label in the citation_html (for example in the citedby)

    Args:
        ref: the reference (a Munch dictionary or a RefData object); it is
            modified in place.
        i: number used to build a ``[i]`` label when the reference has no
            label and ``with_label`` is True.
        with_label: whether the citation carries a label.
    """

    # A non-empty ref.eid replaces the first "eid" entry of ref.extids.
    if hasattr(ref, "eid") and ref.eid is not None and ref.eid != "":
        eids = [item for item in ref.extids if item[0] == "eid"]
        if len(eids) > 0:
            ref.extids.remove(eids[0])
        ref.extids.append(("eid", ref.eid))

    label = ref.label
    if not label and with_label:
        label = f"[{i}]"
        ref.label = label

    if ref.type == "unknown":
        # Unstructured reference: the citations are derived from citation_tex,
        # and only when they are missing.
        if not ref.citation_html:
            if with_label and not ref.citation_tex.startswith(label):
                ref.citation_html = f"{label} {ref.citation_tex}"
            else:
                ref.citation_html = ref.citation_tex

        if not ref.citation_xml:
            ref.citation_xml = f'<label>{escape(ref.label)}</label><mixed-citation xml:space="preserve">{ref.citation_tex}</mixed-citation>'
    else:
        ref.label = f"{label}" if with_label else ""
        # ref can be a Munch dictionary, or a RefData object.
        # Add RefBase member functions, like get_authors
        ref.get_authors = types.MethodType(RefBase.get_authors, ref)
        ref.get_editors = types.MethodType(RefBase.get_editors, ref)
        text = get_citation_html(ref)
        ref.citation_html = ref.citation_tex = text

    for contrib in ref.contributors:
        contrib["contrib_xml"] = get_contrib_xml(contrib, is_ref=True)

    # Structured reference: citation_xml is always rebuilt, once the
    # contributors' XML has been refreshed above.
    if ref.type != "unknown":
        element_citation = jats_from_ref(ref)
        ref.citation_xml = f'<label>{escape(ref.label)}</label><element-citation publication-type="{ref.type}">{element_citation}</element-citation>'
def update_data_for_jats(data_article, create_author_if_empty=False, with_label=True):
    """Complete ``data_article`` in place with what a JATS export needs.

    Missing HTML titles fall back to the TeX titles and a missing ``title_xml``
    is generated from them.  The ``contrib_xml`` of every contributor is
    rebuilt, the DOI is added to ``ids`` when it is not there yet, the
    references are updated and the translations are processed recursively.

    With ``create_author_if_empty``, an article without contributors gets a
    single empty contributor with the "author" role.  ``with_label`` is passed
    on to ``update_ref_data_for_jats``.
    """
    if not data_article.title_html:
        data_article.title_html = data_article.title_tex
    if not data_article.trans_title_html:
        data_article.trans_title_html = data_article.trans_title_tex
    if not data_article.title_xml:
        data_article.title_xml = get_title_xml(
            data_article.title_tex, data_article.trans_title_tex, data_article.trans_lang
        )

    for contributor in data_article.contributors:
        contributor["contrib_xml"] = get_contrib_xml(contributor)

    doi = data_article.doi
    if doi is not None and ("doi", doi) not in data_article.ids:
        data_article.ids.append(("doi", doi))

    if create_author_if_empty and len(data_article.contributors) == 0:
        placeholder = create_contributor()
        placeholder["role"] = "author"
        placeholder["contrib_xml"] = get_contrib_xml(placeholder)
        data_article.contributors = [placeholder]

    # References are numbered from 1.
    for position, reference in enumerate(data_article.bibitems, start=1):
        update_ref_data_for_jats(reference, position, with_label=with_label)

    for translation in data_article.translations:
        update_data_for_jats(translation, create_author_if_empty, with_label)
def convert_refdata_for_editor(ref):
    """Add to ``ref`` the flat fields set here for the editor (modified in place).

    * ``contribs_text``: one "last_name, first_name" line per contributor;
    * ``type``: forced to "unknown" when the reference has no article title,
      chapter title or source;
    * ``doi``: value of the last "doi" entry of ``extids``, ``""`` without one;
    * ``eid``: value of the last "eid" entry of ``extids`` (left untouched
      when there is none).
    """
    lines = [f"{contrib['last_name']}, {contrib['first_name']}" for contrib in ref.contributors]
    ref.contribs_text = "\n".join(lines)

    has_title = ref.article_title_tex or ref.chapter_title_tex or ref.source_tex
    if not has_title:
        ref.type = "unknown"

    ref.doi = ""
    for extid in ref.extids:
        id_type = extid[0]
        if id_type == "doi":
            ref.doi = extid[1]
        elif id_type == "eid":
            ref.eid = extid[1]
    # URLs are in <comment>: no url field is extracted from the ext_links.