Coverage for src/ptf/model_data_comparator.py: 74%

231 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-11-05 09:56 +0000

1################################################################################################## 

2# 

3# README 

4# 

5# 

6################################################################################################## 

7 

8from collections import OrderedDict 

9 

10from ptf import model_helpers 

11from ptf.model_data_converter import db_to_journal_data 

12 

13TEST_STRICT = False 

14 

15 

def create_diff_obj(a, b, label):
    """Recursively compare ``a`` and ``b`` and describe where they differ.

    Objects exposing ``__dict__`` are compared through that dict. Two lists
    are compared element by element, two dicts key by key, and anything else
    with ``==`` after some string normalisation (see below).

    Args:
        a: First value (object, list, dict or scalar).
        b: Second value, compared against ``a``.
        label: Name of the attribute/key being compared. It selects the
            ignore rules and is stored in the produced diff entries.

    Returns:
        A ``(result, diff_obj)`` tuple. ``result`` is ``True`` when no
        difference was found. ``diff_obj`` is a list describing the
        differences (empty when ``result`` is ``True``).
    """
    diff_obj = []
    result = True

    # contrib_xml is updated if you edit an article: other fields will be different
    # first_initials is not stored in the database
    ignored_labels = ("contrib_xml", "first_initials", "label_prefix", "value_html", "value_tex")
    if label in ignored_labels and not TEST_STRICT:
        return result, diff_obj

    if hasattr(a, "__dict__"):
        a = a.__dict__
    if hasattr(b, "__dict__"):
        b = b.__dict__

    # Exact type checks are kept on purpose: subclasses of list/dict/str fall
    # through to the plain ``==`` comparison, as they always did.
    if type(a) is list and type(b) is list:
        if len(a) != len(b) and label != "streams":  # TeX can be added in streams
            result = False
            diff_obj.append(["len", len(a), len(b)])
        elif label not in [
            "streams",
            "supplementary_materials",
        ]:  # Cedrics import change the location: ignore the values
            for obj1, obj2 in zip(a, b):
                result_obj, list_diff_obj = create_diff_obj(obj1, obj2, label)
                if not result_obj:
                    result = False

                    # Use a more telling label for contributor structures
                    if label == "contrib_groups":
                        child_label = obj1["content_type"]
                    elif label == "contribs":
                        child_label = obj1["string_name"]
                    else:
                        child_label = label

                    diff_obj.append([child_label, list_diff_obj])
    elif type(a) is dict and type(b) is dict:
        a = OrderedDict(sorted(a.items()))
        b = OrderedDict(sorted(b.items()))
        for key, value in a.items():
            if key in b:
                result_obj, values_diff_obj = create_diff_obj(value, b[key], key)
                if not result_obj:
                    result = False
                    diff_obj.append(values_diff_obj)
            else:
                # Key only present in a
                result = False
                diff_obj.append([key, value, ""])
        for key, value in b.items():
            if key not in a:
                # Key only present in b
                result = False
                diff_obj.append([key, "", value])
    else:
        if type(a) != type(b):
            result = False
        elif type(a) is str:
            xlink_ns = ' xmlns:xlink="http://www.w3.org/1999/xlink"'
            a = a.replace(xlink_ns, "")
            b = b.replace(xlink_ns, "")

            # The Cedrics -> JATS changes the order
            a = a.replace('close="" open="{" separators="', "")
            b = b.replace('separators="" open="{" close="', "")
            a = a.replace('separators="" open="|" close="', "")
            b = b.replace('close="" open="|" separators="', "")

            # Best effort: if both strings parse as dates, compare the parsed
            # values. Any parsing failure leaves the strings untouched.
            # ``Exception`` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            try:
                a_date = model_helpers.parse_date_str(a)
                b_date = model_helpers.parse_date_str(b)
                a = a_date
                b = b_date
            except Exception:
                pass
            result = a == b
        else:
            result = a == b

        if not result:
            diff_obj = [label, a, b]

            # Debug help: print where two strings start to differ.
            # Both sides must be strings: a str compared with None (or any
            # other type) used to raise TypeError here.
            if type(a) is str and type(b) is str:
                for i, (char_a, char_b) in enumerate(zip(a, b)):
                    if char_a != char_b:
                        print(label, i, a[i : i + 30])
                        print(label, i, b[i : i + 30])
                        break

    return result, diff_obj

110 

111 

class BaseComparator:
    """Helpers shared by the data comparators.

    Every helper returns ``True`` when the compared values match and stores
    the differences it finds in the ``diff_dict`` given by the caller.
    """

    def compare(self, obj1, obj2, diff_dict):
        """Base comparison: nothing to check, always ``True``."""
        return True

    def compare_list_of_simple_types(self, obj1, obj2, diff_dict, list_of_simple_types):
        """Compare the attributes named in ``list_of_simple_types`` with ``!=``.

        Differences are stored as ``diff_dict[attr] = [value1, value2]``.
        """
        xlink_ns = ' xmlns:xlink="http://www.w3.org/1999/xlink"'
        all_equal = True

        for attr in list_of_simple_types:
            # Matching info were reported in /cedram_dev/exploitation, not in /cedram_dev/production_tex
            # We have to ignore the DOIs
            if attr in ["doi", "citation_xml"] and hasattr(obj1, "citation_xml"):
                continue
            if (
                attr == "doi"
                and hasattr(obj2, "pid")
                and obj2.pid is not None
                and obj2.pid.startswith("AIF_")
            ):
                continue

            left = getattr(obj1, attr)
            right = getattr(obj2, attr)

            # A "numdam" provider on the second object is compared as "mathdoc"
            if attr == "provider" and right == "numdam":
                right = "mathdoc"

            # The xlink namespace declaration is not significant in plain strings
            if type(left) is str and type(right) is str:
                left = left.replace(xlink_ns, "")
                right = right.replace(xlink_ns, "")

            if left != right:
                all_equal = False
                diff_dict[attr] = [left, right]

        return all_equal

    def compare_list_of_list(self, obj1, obj2, diff_dict, list_of_list):
        """Compare structured attributes with ``create_diff_obj``.

        Differences are stored as ``diff_dict[attr] = <diff from create_diff_obj>``.
        """
        all_equal = True

        for attr in list_of_list:
            left = getattr(obj1, attr)
            right = getattr(obj2, attr)

            # Matching info were reported in /cedram_dev/exploitation, not in /cedram_dev/production_tex
            # We have to ignore the DOIs
            ignored = not TEST_STRICT and (
                (attr == "extids" and hasattr(obj1, "citation_xml"))
                or (
                    attr == "ids"
                    and hasattr(obj2, "pid")
                    and obj2.pid is not None
                    and obj2.pid.startswith("AIF_")
                )
            )
            if ignored:
                continue

            same, attr_diff = create_diff_obj(left, right, attr)
            if not same:
                all_equal = False
                diff_dict[attr] = attr_diff

        return all_equal

    def compare_list_of_objs(self, obj1, obj2, diff_dict, attr, attr_id_list, comparator):
        """Compare two lists of objects pairwise with ``comparator.compare``.

        For each differing pair, an entry holding the pair's diff (key
        ``"diff"``) and the first object's values for ``attr_id_list`` is
        collected; the entries are stored in ``diff_dict[attr]``.
        """
        all_equal = True

        items1 = getattr(obj1, attr)
        items2 = getattr(obj2, attr)

        if len(items1) != len(items2):
            # NOTE(review): a length mismatch is recorded (except for
            # "articles") but the method still returns True - confirm this
            # is intended.
            if attr != "articles":
                diff_dict[attr] = [{"len": [len(items1), len(items2)]}]
            return all_equal

        differences = []
        for item1, item2 in zip(items1, items2):
            item_diff = {}
            if comparator.compare(item1, item2, item_diff):
                continue

            all_equal = False
            entry = {"diff": item_diff}
            for attr_id in attr_id_list:
                entry[attr_id] = getattr(item1, attr_id)
            differences.append(entry)

        if not all_equal:
            diff_dict[attr] = differences

        return all_equal

202 

203 

class ResourceDataComparator(BaseComparator):
    """Comparator for the attributes checked on every resource-like object."""

    def compare(self, obj1, obj2, diff_dict):
        """Compare ``obj1`` and ``obj2`` and fill ``diff_dict`` with the differences.

        Checks simple attributes, structured attributes (extended with
        ``ids``, ``extids`` and ``ext_links`` when ``TEST_STRICT`` is set)
        and the ``bibitems`` lists.

        Returns:
            ``True`` when every check passed, ``False`` otherwise.
        """
        result = super().compare(obj1, obj2, diff_dict)

        # Ignore trans_title_xml: it is not used in the Django DB
        simple_attrs = [
            "lang",
            "pid",
            "doi",
            "title_xml",
            "title_tex",
            "title_html",
            "trans_lang",
            "trans_title_html",
            "trans_title_tex",
            "funding_statement_html",
            "footnotes_html",
        ]
        # Each helper is called before being and-ed with ``result`` so that a
        # previous failure never prevents later differences from being recorded.
        result = (
            self.compare_list_of_simple_types(obj1, obj2, diff_dict, simple_attrs) and result
        )

        # 'ids', 'extids' and 'ext_links' are only compared in strict mode (below)
        structured_attrs = [
            "abstracts",
            "awards",
            "relations",
            "streams",
            "related_objects",
            "counts",
            "contributors",
            "kwds",
            "kwd_groups",
            "figures",
            "supplementary_materials",
        ]
        result = self.compare_list_of_list(obj1, obj2, diff_dict, structured_attrs) and result

        if TEST_STRICT:
            result = (
                self.compare_list_of_list(obj1, obj2, diff_dict, ["ids", "extids", "ext_links"])
                and result
            )

        result = (
            self.compare_list_of_objs(
                obj1, obj2, diff_dict, "bibitems", ["label"], RefDataComparator()
            )
            and result
        )

        return result

284 

285 

class MathdocPublicationDataComparator(ResourceDataComparator):
    """Resource comparison plus ``coltype``, ``e_issn``, ``wall`` and ``provider``."""

    def compare(self, obj1, obj2, diff_dict):
        """Return ``True`` when both objects match; differences go to ``diff_dict``."""
        parent_ok = super().compare(obj1, obj2, diff_dict)

        own_attrs = ["coltype", "e_issn", "wall", "provider"]
        own_ok = self.compare_list_of_simple_types(obj1, obj2, diff_dict, own_attrs)

        return own_ok and parent_ok

298 

299 

class PublisherDataComparator(BaseComparator):
    """Comparator for publisher objects (``name`` and ``loc``)."""

    def compare(self, obj1, obj2, diff_dict):
        """Compare two publishers, either of which may be ``None``.

        Returns:
            ``True`` when both are ``None`` or when ``name`` and ``loc``
            match. ``False`` otherwise, with the differing values stored in
            ``diff_dict`` as ``[value1, value2]``.
        """
        if obj1 is None and obj2 is None:
            return True

        attrs = ["name", "loc"]

        # Exactly one publisher is missing: report it as a difference instead
        # of failing with AttributeError on getattr(None, ...).
        if obj1 is None or obj2 is None:
            for attr in attrs:
                diff_dict[attr] = [getattr(obj1, attr, None), getattr(obj2, attr, None)]
            return False

        result = super().compare(obj1, obj2, diff_dict)

        result = self.compare_list_of_simple_types(obj1, obj2, diff_dict, attrs) and result

        return result

312 

313 

class JournalDataComparator(ResourceDataComparator):
    """Resource comparison plus a comparison of the ``publisher`` attribute."""

    def compare(self, obj1, obj2, diff_dict):
        """Return ``True`` when both journals match; differences go to ``diff_dict``."""
        result = super().compare(obj1, obj2, diff_dict)

        publisher_diff = {}
        publishers_match = PublisherDataComparator().compare(
            obj1.publisher, obj2.publisher, publisher_diff
        )
        if not publishers_match:
            diff_dict["publisher"] = publisher_diff
            result = False

        return result

325 

326 

class IssueDataComparator(ResourceDataComparator):
    """Resource comparison plus journal, publisher, issue fields and articles."""

    def compare(self, obj1, obj2, diff_dict):
        """Return ``True`` when both issues match; differences go to ``diff_dict``."""
        result = super().compare(obj1, obj2, diff_dict)

        journal_diff = {}
        if not JournalDataComparator().compare(obj1.journal, obj2.journal, journal_diff):
            diff_dict["journal"] = journal_diff
            result = False

        publisher_diff = {}
        if not PublisherDataComparator().compare(obj1.publisher, obj2.publisher, publisher_diff):
            diff_dict["publisher"] = publisher_diff
            result = False

        issue_attrs = ["provider", "ctype", "year", "vseries", "volume", "number"]
        if not self.compare_list_of_simple_types(obj1, obj2, diff_dict, issue_attrs):
            result = False

        # Ignore last_modified (it's OK if it has been edited in ptf-tools)
        # Ignore prod_deployed_date (set during import by database cmds: Django DB is different from the XML)

        articles_match = self.compare_list_of_objs(
            obj1, obj2, diff_dict, "articles", ["pid", "doi"], ArticleDataComparator()
        )
        if not articles_match:
            result = False

        return result

361 

362 

class ArticleDataComparator(ResourceDataComparator):
    """Resource comparison plus article fields and history dates."""

    def compare(self, obj1, obj2, diff_dict):
        """Return ``True`` when both articles match; differences go to ``diff_dict``.

        History dates are matched by their ``"type"``; entries of type
        ``"online"`` are skipped. A date difference is stored under the date
        type as ``[date1, date2]``, with ``""`` for a missing side.
        """
        result = super().compare(obj1, obj2, diff_dict)

        article_attrs = [
            "pid",
            "atype",
            "seq",
            "article_number",
            "talk_number",
            "fpage",
            "lpage",
            "page_range",
            "size",
            "page_type",
            "elocation",
            "coi_statement",
        ]
        if not self.compare_list_of_simple_types(obj1, obj2, diff_dict, article_attrs):
            result = False

        history1 = list(obj1.history_dates)
        history2 = list(obj2.history_dates)
        # Ignore date_published as it is OK if they are different between DjangoDB and the XML

        # Dates of obj1: each must exist in obj2 with the same parsed value
        for entry1 in history1:
            kind = entry1["type"]
            if kind == "online":
                continue

            matches = [entry for entry in history2 if entry["type"] == kind]
            if not matches:
                result = False
                diff_dict[kind] = [entry1["date"], ""]
                continue

            entry2 = matches[0]
            parsed1 = model_helpers.parse_date_str(entry1["date"])
            parsed2 = model_helpers.parse_date_str(entry2["date"])
            if parsed1 != parsed2:
                result = False
                diff_dict[kind] = [entry1["date"], entry2["date"]]

        # Dates of obj2 that have no counterpart in obj1
        for entry2 in history2:
            kind = entry2["type"]
            if kind == "online":
                continue

            matches = [entry for entry in history1 if entry["type"] == kind]
            if not matches:
                result = False
                diff_dict[kind] = ["", entry2["date"]]

        return result

417 

418 

class RefDataComparator(ResourceDataComparator):
    """Resource comparison plus the fields checked on bibliographic references."""

    def compare(self, obj1, obj2, diff_dict):
        """Return ``True`` when both references match; differences go to ``diff_dict``.

        The ``citation_*`` attributes are only compared when ``TEST_STRICT``
        is set.
        """
        result = super().compare(obj1, obj2, diff_dict)

        # Ignore citation_*. The difference will be flagged with the ext_ids
        # 'label_prefix' and 'label_suffix' are not compared
        ref_attrs = [
            "lang",
            "user_id",
            "label",
            "type",
            "publisher_name",
            "publisher_loc",
            "institution",
            "series",
            "volume",
            "issue",
            "month",
            "year",
            "comment",
            "annotation",
            "fpage",
            "lpage",
            "page_range",
            "size",
            "source_tex",
            "article_title_tex",
            "chapter_title_tex",
        ]
        if not self.compare_list_of_simple_types(obj1, obj2, diff_dict, ref_attrs):
            result = False

        if TEST_STRICT:
            citation_attrs = ["citation_xml", "citation_html", "citation_tex"]
            if not self.compare_list_of_simple_types(obj1, obj2, diff_dict, citation_attrs):
                result = False

        if not self.compare_list_of_list(obj1, obj2, diff_dict, ["contributors"]):
            result = False

        return result

468 

469 

class CollectionDataComparator(ResourceDataComparator):
    """Resource comparison plus the collection-specific simple attributes."""

    def compare(self, obj1, obj2, diff_dict):
        """Return ``True`` when both collections match; differences go to ``diff_dict``."""
        parent_ok = super().compare(obj1, obj2, diff_dict)

        collection_attrs = ["coltype", "issn", "e_issn", "volume", "vseries", "seq"]
        own_ok = self.compare_list_of_simple_types(obj1, obj2, diff_dict, collection_attrs)

        return own_ok and parent_ok

482 

483 

class BookDataComparator(ResourceDataComparator):
    """Resource comparison plus book fields, publisher, collections and parts."""

    def compare(self, obj1, obj2, diff_dict):
        """Return ``True`` when both books match; differences go to ``diff_dict``."""
        result = super().compare(obj1, obj2, diff_dict)

        book_attrs = ["ctype", "provider", "frontmatter", "body"]
        if not self.compare_list_of_simple_types(obj1, obj2, diff_dict, book_attrs):
            result = False

        publisher_diff = {}
        if not PublisherDataComparator().compare(obj1.publisher, obj2.publisher, publisher_diff):
            diff_dict["publisher"] = publisher_diff
            result = False

        collections_match = self.compare_list_of_objs(
            obj1, obj2, diff_dict, "incollection", ["pid"], CollectionDataComparator()
        )
        if not collections_match:
            result = False

        parts_match = self.compare_list_of_objs(
            obj1, obj2, diff_dict, "parts", ["pid"], BookPartDataComparator()
        )
        if not parts_match:
            result = False

        return result

516 

517 

class BookPartDataComparator(ArticleDataComparator):
    """Article comparison plus the ``frontmatter`` attribute."""

    def compare(self, obj1, obj2, diff_dict):
        """Return ``True`` when both book parts match; differences go to ``diff_dict``."""
        parent_ok = super().compare(obj1, obj2, diff_dict)

        own_ok = self.compare_list_of_simple_types(obj1, obj2, diff_dict, ["frontmatter"])

        return own_ok and parent_ok

527 

528 

529def prepare_issue_for_comparison(xml_issue): 

530 # xml_cmds does not use the jats_parser collection but retrieve it from the database 

531 journal_pid = xml_issue.journal.pid 

532 collection = model_helpers.get_collection(journal_pid) 

533 

534 xml_issue.journal = db_to_journal_data(collection) 

535 

536 for article in xml_issue.articles: 

537 for ref in article.bibitems: 

538 # the Django DB ignores the ref doi although it is set by the XML parser. 

539 ref.doi = None