Coverage for src/ptf/model_data_comparator.py: 74%
231 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-05 09:56 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-05 09:56 +0000
1##################################################################################################
2#
3# README
4#
5#
6##################################################################################################
8from collections import OrderedDict
10from ptf import model_helpers
11from ptf.model_data_converter import db_to_journal_data
# Strict-mode switch read by the comparison code in this module.
# While it is False, create_diff_obj() skips a few labels (contrib_xml,
# first_initials, label_prefix, value_html, value_tex), and the "ids",
# "extids", "ext_links" and citation_* attributes are not compared.
TEST_STRICT = False
def create_diff_obj(a, b, label):
    """Recursively compare two values and describe where they differ.

    Values exposing ``__dict__`` are compared through their attribute
    dictionaries, lists element by element, dictionaries key by key and
    everything else with ``==`` (strings are normalised first).

    Args:
        a: First value to compare.
        b: Second value to compare.
        label: Name of the field being compared. It tags the entries of the
            returned diff and selects the fields that are ignored.

    Returns:
        A ``(result, diff_obj)`` tuple. ``result`` is ``True`` when no
        difference was found; ``diff_obj`` is a (possibly nested) list
        describing the differences.
    """
    diff_obj = []
    result = True

    # contrib_xml is updated if you edit an article: other fields will be different
    # first_initials is not stored in the database
    ignored_labels = (
        "contrib_xml",
        "first_initials",
        "label_prefix",
        "value_html",
        "value_tex",
    )
    if label in ignored_labels and not TEST_STRICT:
        return result, diff_obj

    if hasattr(a, "__dict__"):
        a = a.__dict__
    if hasattr(b, "__dict__"):
        b = b.__dict__

    # Exact type checks (not isinstance) are kept on purpose: subclasses of
    # list/dict fall through to the plain ``==`` comparison, as before.
    if type(a) is list and type(b) is list:
        if len(a) != len(b) and label != "streams":  # TeX can be added in streams
            result = False
            diff_obj.append(["len", len(a), len(b)])
        elif label not in ("streams", "supplementary_materials"):
            # Cedrics import change the location: ignore the values
            for obj1, obj2 in zip(a, b):
                result_obj, list_diff_obj = create_diff_obj(obj1, obj2, label)
                if not result_obj:
                    result = False

                    if label == "contrib_groups":
                        child_label = obj1["content_type"]
                    elif label == "contribs":
                        child_label = obj1["string_name"]
                    else:
                        child_label = label

                    diff_obj.append([child_label, list_diff_obj])
    elif type(a) is dict and type(b) is dict:
        # Sort by key so that the diff entries come out in a stable order.
        a = OrderedDict(sorted(a.items()))
        b = OrderedDict(sorted(b.items()))
        for key, value in a.items():
            if key in b:
                result_obj, values_diff_obj = create_diff_obj(value, b[key], key)
                if not result_obj:
                    result = False
                    diff_obj.append(values_diff_obj)
            else:
                result = False
                diff_obj.append([key, value, ""])
        for key, value in b.items():
            if key not in a:
                result = False
                diff_obj.append([key, "", value])
    else:
        if type(a) is not type(b):
            result = False
        elif type(a) is str:
            xlink_ns = ' xmlns:xlink="http://www.w3.org/1999/xlink"'
            a = a.replace(xlink_ns, "")
            b = b.replace(xlink_ns, "")

            # The Cedrics -> JATS changes the order
            a = a.replace('close="" open="{" separators="', "")
            b = b.replace('separators="" open="{" close="', "")
            a = a.replace('separators="" open="|" close="', "")
            b = b.replace('close="" open="|" separators="', "")

            # Best effort: when both strings parse as dates, compare the parsed
            # values; otherwise keep comparing the normalised strings.
            try:
                a_date = model_helpers.parse_date_str(a)
                b_date = model_helpers.parse_date_str(b)
                a = a_date
                b = b_date
            except Exception:
                pass
            result = a == b
        else:
            result = a == b

        if not result:
            diff_obj = [label, a, b]

            # Debug aid: print both strings from their first differing character.
            # Only done when both sides are strings; indexing a None or an int
            # on the other side would raise TypeError.
            if type(a) is str and type(b) is str:
                for i, (char_a, char_b) in enumerate(zip(a, b)):
                    if char_a != char_b:
                        print(label, i, a[i : i + 30])
                        print(label, i, b[i : i + 30])
                        break

    return result, diff_obj
class BaseComparator:
    """Base class holding the attribute-comparison helpers used by the comparators."""

    def compare(self, obj1, obj2, diff_dict):
        """Compare two objects; the base implementation reports no difference.

        Subclasses override this and fill ``diff_dict`` with what differs.
        """
        return True

    def compare_list_of_simple_types(self, obj1, obj2, diff_dict, list_of_simple_types):
        """Compare the given attributes of two objects with ``!=``.

        Args:
            obj1: First object.
            obj2: Second object.
            diff_dict: Dict updated in place; each differing attribute is
                stored as ``diff_dict[attr] = [value1, value2]``.
            list_of_simple_types: Names of the attributes to compare.

        Returns:
            ``True`` if every compared attribute is equal, ``False`` otherwise.
        """
        xlink_ns = ' xmlns:xlink="http://www.w3.org/1999/xlink"'
        result = True

        for attr in list_of_simple_types:
            # Matching info were reported in /cedram_dev/exploitation, not in
            # /cedram_dev/production_tex. We have to ignore the DOIs
            if attr in ("doi", "citation_xml") and hasattr(obj1, "citation_xml"):
                continue
            if (
                attr == "doi"
                and hasattr(obj2, "pid")
                and obj2.pid is not None
                and obj2.pid.startswith("AIF_")
            ):
                continue

            obj1_attr = getattr(obj1, attr)
            obj2_attr = getattr(obj2, attr)

            # "numdam" on the second object is treated as equal to "mathdoc".
            if attr == "provider" and obj2_attr == "numdam":
                obj2_attr = "mathdoc"

            # The xlink namespace declaration is not significant: drop it on both sides.
            if type(obj1_attr) is str and type(obj2_attr) is str:
                obj1_attr = obj1_attr.replace(xlink_ns, "")
                obj2_attr = obj2_attr.replace(xlink_ns, "")

            if obj1_attr != obj2_attr:
                result = False
                diff_dict[attr] = [obj1_attr, obj2_attr]

        return result

    def compare_list_of_list(self, obj1, obj2, diff_dict, list_of_list):
        """Compare the given attributes of two objects with ``create_diff_obj``.

        Args:
            obj1: First object.
            obj2: Second object.
            diff_dict: Dict updated in place; each differing attribute is
                stored as ``diff_dict[attr] = <diff from create_diff_obj>``.
            list_of_list: Names of the attributes to compare.

        Returns:
            ``True`` if every compared attribute is equal, ``False`` otherwise.
        """
        result = True

        for attr in list_of_list:
            obj1_attr = getattr(obj1, attr)
            obj2_attr = getattr(obj2, attr)

            # Matching info were reported in /cedram_dev/exploitation, not in
            # /cedram_dev/production_tex. We have to ignore the DOIs
            # (these exemptions are bypassed when TEST_STRICT is set).
            if TEST_STRICT or (
                not (attr == "extids" and hasattr(obj1, "citation_xml"))
                and not (
                    attr == "ids"
                    and hasattr(obj2, "pid")
                    and obj2.pid is not None
                    and obj2.pid.startswith("AIF_")
                )
            ):
                result_obj, diff_obj = create_diff_obj(obj1_attr, obj2_attr, attr)
                if not result_obj:
                    result = False
                    diff_dict[attr] = diff_obj

        return result

    def compare_list_of_objs(self, obj1, obj2, diff_dict, attr, attr_id_list, comparator):
        """Compare two lists of objects pairwise with ``comparator``.

        Args:
            obj1: First object; ``getattr(obj1, attr)`` is the first list.
            obj2: Second object; ``getattr(obj2, attr)`` is the second list.
            diff_dict: Dict updated in place. On a length mismatch it receives
                ``diff_dict[attr] = [{"len": [len1, len2]}]`` (except for
                ``"articles"``); otherwise it receives one entry per differing
                pair, holding the pair's diff under ``"diff"`` plus the
                identifying attributes listed in ``attr_id_list``.
            attr: Name of the list attribute to compare.
            attr_id_list: Attribute names copied from the first object of a
                differing pair to identify it in the diff.
            comparator: Object whose ``compare(o1, o2, diff)`` is called for
                each pair.

        Returns:
            ``True`` if no difference was recorded, ``False`` otherwise.
        """
        result = True

        list1 = getattr(obj1, attr)
        list2 = getattr(obj2, attr)
        if len(list1) != len(list2):
            # A length mismatch on "articles" is not reported and the pairs are
            # not compared. For every other attribute the mismatch is recorded
            # and, since a diff entry is written, reported as a difference.
            if attr != "articles":
                result = False
                diff_dict[attr] = [{"len": [len(list1), len(list2)]}]
        else:
            attr_diffs = []

            for list1_obj, list2_obj in zip(list1, list2):
                attr_diff_dict = {}
                if not comparator.compare(list1_obj, list2_obj, attr_diff_dict):
                    result = False
                    params = {"diff": attr_diff_dict}
                    for attr_id in attr_id_list:
                        params[attr_id] = getattr(list1_obj, attr_id)
                    attr_diffs.append(params)

            if not result:
                diff_dict[attr] = attr_diffs

        return result
class ResourceDataComparator(BaseComparator):
    """Comparator for the attributes that the resource comparators have in common."""

    def compare(self, obj1, obj2, diff_dict):
        """Compare the common resource attributes of ``obj1`` and ``obj2``.

        Every group of attributes is always compared, even after a difference
        has been found, so that ``diff_dict`` lists all the differences.

        Args:
            obj1: First resource data object.
            obj2: Second resource data object.
            diff_dict: Dict updated in place with the differences found.

        Returns:
            ``True`` if no difference was found, ``False`` otherwise.
        """
        result = super().compare(obj1, obj2, diff_dict)

        # Ignore trans_title_xml: it is not used in the Django DB
        result = (
            self.compare_list_of_simple_types(
                obj1,
                obj2,
                diff_dict,
                [
                    "lang",
                    "pid",
                    "doi",
                    "title_xml",
                    "title_tex",
                    "title_html",
                    "trans_lang",
                    "trans_title_html",
                    "trans_title_tex",
                    "funding_statement_html",
                    "footnotes_html",
                ],
            )
            and result
        )

        # "ids", "extids" and "ext_links" are left out here: they are only
        # compared in strict mode (see below).
        result = (
            self.compare_list_of_list(
                obj1,
                obj2,
                diff_dict,
                [
                    "abstracts",
                    "awards",
                    "relations",
                    "streams",
                    "related_objects",
                    "counts",
                    "contributors",
                    "kwds",
                    "kwd_groups",
                    "figures",
                    "supplementary_materials",
                ],
            )
            and result
        )

        if TEST_STRICT:
            result = (
                self.compare_list_of_list(obj1, obj2, diff_dict, ["ids", "extids", "ext_links"])
                and result
            )

        result = (
            self.compare_list_of_objs(
                obj1, obj2, diff_dict, "bibitems", ["label"], RefDataComparator()
            )
            and result
        )

        return result
class MathdocPublicationDataComparator(ResourceDataComparator):
    """Resource comparator that also checks coltype, e_issn, wall and provider."""

    def compare(self, obj1, obj2, diff_dict):
        """Return True when the resource fields and the publication fields all match.

        Differences are written into ``diff_dict``.
        """
        outcome = super().compare(obj1, obj2, diff_dict)

        publication_attrs = ["coltype", "e_issn", "wall", "provider"]
        if not self.compare_list_of_simple_types(obj1, obj2, diff_dict, publication_attrs):
            outcome = False

        return outcome
class PublisherDataComparator(BaseComparator):
    """Comparator for publisher data: compares ``name`` and ``loc``."""

    def compare(self, obj1, obj2, diff_dict):
        """Compare two publishers, either of which may be ``None``.

        Args:
            obj1: First publisher data object, or ``None``.
            obj2: Second publisher data object, or ``None``.
            diff_dict: Dict updated in place with ``[value1, value2]`` for
                each differing attribute.

        Returns:
            ``True`` if both are ``None`` or have equal ``name`` and ``loc``,
            ``False`` otherwise (including when only one of them is ``None``).
        """
        if obj1 is None and obj2 is None:
            return True

        if obj1 is None or obj2 is None:
            # Only one side has a publisher: report it as a difference rather
            # than failing with AttributeError on the missing object.
            for attr in ("name", "loc"):
                diff_dict[attr] = [getattr(obj1, attr, None), getattr(obj2, attr, None)]
            return False

        result = super().compare(obj1, obj2, diff_dict)

        result = (
            self.compare_list_of_simple_types(obj1, obj2, diff_dict, ["name", "loc"]) and result
        )

        return result
class JournalDataComparator(ResourceDataComparator):
    """Resource comparator that also compares the journal's publisher."""

    def compare(self, obj1, obj2, diff_dict):
        """Return True when the resource fields and the publishers match.

        A publisher difference is stored under ``diff_dict["publisher"]``.
        """
        outcome = super().compare(obj1, obj2, diff_dict)

        publisher_diff = {}
        publishers_match = PublisherDataComparator().compare(
            obj1.publisher, obj2.publisher, publisher_diff
        )
        if not publishers_match:
            diff_dict["publisher"] = publisher_diff
            outcome = False

        return outcome
class IssueDataComparator(ResourceDataComparator):
    """Resource comparator for issues: journal, publisher, issue fields and articles."""

    def compare(self, obj1, obj2, diff_dict):
        """Return True when both issues match; differences go into ``diff_dict``.

        The journal and the publisher are compared with their own comparators
        and their differences are stored under ``"journal"`` / ``"publisher"``.
        """
        outcome = super().compare(obj1, obj2, diff_dict)

        # Nested objects, in the order they are compared.
        nested_comparators = (
            ("journal", JournalDataComparator),
            ("publisher", PublisherDataComparator),
        )
        for attr, comparator_cls in nested_comparators:
            nested_diff = {}
            matches = comparator_cls().compare(
                getattr(obj1, attr), getattr(obj2, attr), nested_diff
            )
            if not matches:
                outcome = False
                diff_dict[attr] = nested_diff

        issue_attrs = ["provider", "ctype", "year", "vseries", "volume", "number"]
        if not self.compare_list_of_simple_types(obj1, obj2, diff_dict, issue_attrs):
            outcome = False

        # Ignore last_modified (it's OK if it has been edited in ptf-tools)
        # Ignore prod_deployed_date (set during import by database cmds: Django DB is different from the XML)

        articles_match = self.compare_list_of_objs(
            obj1, obj2, diff_dict, "articles", ["pid", "doi"], ArticleDataComparator()
        )
        if not articles_match:
            outcome = False

        return outcome
class ArticleDataComparator(ResourceDataComparator):
    """Resource comparator for articles: article fields and history dates."""

    def compare(self, obj1, obj2, diff_dict):
        """Return True when both articles match; differences go into ``diff_dict``.

        History dates are matched by their ``"type"``; entries of type
        ``"online"`` are skipped. A date difference is stored under the date
        type as ``[date1, date2]``, with ``""`` standing for a missing date.
        """
        outcome = super().compare(obj1, obj2, diff_dict)

        article_attrs = [
            "pid",
            "atype",
            "seq",
            "article_number",
            "talk_number",
            "fpage",
            "lpage",
            "page_range",
            "size",
            "page_type",
            "elocation",
            "coi_statement",
        ]
        if not self.compare_list_of_simple_types(obj1, obj2, diff_dict, article_attrs):
            outcome = False

        first_dates = list(obj1.history_dates)
        second_dates = list(obj2.history_dates)
        # Ignore date_published as it is OK if they are different between DjangoDB and the XML

        # Pass 1: every date of obj1 must exist in obj2 with the same parsed value.
        for first in first_dates:
            date_type = first["type"]
            if date_type == "online":
                continue

            same_type = [entry for entry in second_dates if entry["type"] == date_type]
            if not same_type:
                outcome = False
                diff_dict[date_type] = [first["date"], ""]
                continue

            second = same_type[0]
            parsed_first = model_helpers.parse_date_str(first["date"])
            parsed_second = model_helpers.parse_date_str(second["date"])
            if parsed_first != parsed_second:
                outcome = False
                diff_dict[date_type] = [first["date"], second["date"]]

        # Pass 2: report the dates of obj2 whose type is missing from obj1.
        for second in second_dates:
            date_type = second["type"]
            if date_type == "online":
                continue

            same_type = [entry for entry in first_dates if entry["type"] == date_type]
            if not same_type:
                outcome = False
                diff_dict[date_type] = ["", second["date"]]

        return outcome
class RefDataComparator(ResourceDataComparator):
    """Resource comparator for bibliographic references."""

    def compare(self, obj1, obj2, diff_dict):
        """Return True when both references match; differences go into ``diff_dict``.

        The ``citation_*`` attributes are only compared when ``TEST_STRICT``
        is set.
        """
        outcome = super().compare(obj1, obj2, diff_dict)

        # Ignore citation_*. The difference will be flagged with the ext_ids
        # (label_prefix and label_suffix are not in the list either.)
        reference_attrs = [
            "lang",
            "user_id",
            "label",
            "type",
            "publisher_name",
            "publisher_loc",
            "institution",
            "series",
            "volume",
            "issue",
            "month",
            "year",
            "comment",
            "annotation",
            "fpage",
            "lpage",
            "page_range",
            "size",
            "source_tex",
            "article_title_tex",
            "chapter_title_tex",
        ]
        if not self.compare_list_of_simple_types(obj1, obj2, diff_dict, reference_attrs):
            outcome = False

        if TEST_STRICT:
            citation_attrs = ["citation_xml", "citation_html", "citation_tex"]
            if not self.compare_list_of_simple_types(obj1, obj2, diff_dict, citation_attrs):
                outcome = False

        if not self.compare_list_of_list(obj1, obj2, diff_dict, ["contributors"]):
            outcome = False

        return outcome
class CollectionDataComparator(ResourceDataComparator):
    """Resource comparator that also checks the collection-specific fields."""

    def compare(self, obj1, obj2, diff_dict):
        """Return True when the resource fields and the collection fields all match.

        Differences are written into ``diff_dict``.
        """
        outcome = super().compare(obj1, obj2, diff_dict)

        collection_attrs = ["coltype", "issn", "e_issn", "volume", "vseries", "seq"]
        if not self.compare_list_of_simple_types(obj1, obj2, diff_dict, collection_attrs):
            outcome = False

        return outcome
class BookDataComparator(ResourceDataComparator):
    """Resource comparator for books: book fields, publisher, collections and parts."""

    def compare(self, obj1, obj2, diff_dict):
        """Return True when both books match; differences go into ``diff_dict``.

        A publisher difference is stored under ``diff_dict["publisher"]``.
        """
        outcome = super().compare(obj1, obj2, diff_dict)

        book_attrs = ["ctype", "provider", "frontmatter", "body"]
        if not self.compare_list_of_simple_types(obj1, obj2, diff_dict, book_attrs):
            outcome = False

        publisher_diff = {}
        publishers_match = PublisherDataComparator().compare(
            obj1.publisher, obj2.publisher, publisher_diff
        )
        if not publishers_match:
            diff_dict["publisher"] = publisher_diff
            outcome = False

        collections_match = self.compare_list_of_objs(
            obj1, obj2, diff_dict, "incollection", ["pid"], CollectionDataComparator()
        )
        if not collections_match:
            outcome = False

        parts_match = self.compare_list_of_objs(
            obj1, obj2, diff_dict, "parts", ["pid"], BookPartDataComparator()
        )
        if not parts_match:
            outcome = False

        return outcome
class BookPartDataComparator(ArticleDataComparator):
    """Article comparator that also checks the ``frontmatter`` attribute."""

    def compare(self, obj1, obj2, diff_dict):
        """Return True when the article fields and ``frontmatter`` all match.

        Differences are written into ``diff_dict``.
        """
        outcome = super().compare(obj1, obj2, diff_dict)

        if not self.compare_list_of_simple_types(obj1, obj2, diff_dict, ["frontmatter"]):
            outcome = False

        return outcome
def prepare_issue_for_comparison(xml_issue):
    """Adjust ``xml_issue`` in place before it is compared.

    The issue's journal is replaced by journal data built from the collection
    looked up with the journal pid, and the ``doi`` of every bibitem of every
    article is reset to ``None``.
    """
    # xml_cmds does not use the jats_parser collection but retrieve it from the database
    db_collection = model_helpers.get_collection(xml_issue.journal.pid)
    xml_issue.journal = db_to_journal_data(db_collection)

    # the Django DB ignores the ref doi although it is set by the XML parser.
    all_refs = (ref for article in xml_issue.articles for ref in article.bibitems)
    for ref in all_refs:
        ref.doi = None