Coverage for src/ptf/model_data

1##################################################################################################

3# README

6##################################################################################################

8from collections import OrderedDict

10from ptf import model_helpers

11from ptf.model_data_converter import db_to_journal_data

# When False (the default), the comparators in this module skip values that are
# expected to differ between the XML data and the database data (some labels,
# ids/extids/ext_links, citation_* fields). Set it to True to compare them too.
TEST_STRICT = False

_XLINK_NS_DECL = ' xmlns:xlink="http://www.w3.org/1999/xlink"'

# Labels that are not compared unless TEST_STRICT is set.
# contrib_xml is updated if you edit an article: other fields will be different
# first_initials is not stored in the database
_LENIENT_LABELS = ("contrib_xml", "first_initials", "label_prefix", "value_html", "value_tex")

# Lists whose elements are never compared one by one.
# Cedrics import change the location: ignore the values
_UNCOMPARED_LIST_LABELS = ("streams", "supplementary_materials")


def _print_first_mismatch(label, a, b):
    """Print a 30-character excerpt of both strings at their first differing index."""
    for i, (char_a, char_b) in enumerate(zip(a, b)):
        if char_a != char_b:
            print(label, i, a[i : i + 30])
            print(label, i, b[i : i + 30])
            break


def _diff_lists(a, b, label):
    """Compare two lists element by element; return ``(result, diff_obj)``."""
    if len(a) != len(b) and label != "streams":  # TeX can be added in streams
        return False, [["len", len(a), len(b)]]

    result = True
    diff_obj = []
    if label not in _UNCOMPARED_LIST_LABELS:
        for obj1, obj2 in zip(a, b):
            same, child_diff = create_diff_obj(obj1, obj2, label)
            if same:
                continue
            result = False

            # Name the differing entry after a field of the first element
            # when the list is a list of contrib groups / contribs.
            if label == "contrib_groups":
                child_label = obj1["content_type"]
            elif label == "contribs":
                child_label = obj1["string_name"]
            else:
                child_label = label

            diff_obj.append([child_label, child_diff])
    return result, diff_obj


def _diff_dicts(a, b):
    """Compare two dicts key by key, in sorted key order; return ``(result, diff_obj)``."""
    result = True
    diff_obj = []

    for key, value in sorted(a.items()):
        if key in b:
            same, child_diff = create_diff_obj(value, b[key], key)
            if not same:
                result = False
                diff_obj.append(child_diff)
        else:
            # Key only present on the first side.
            result = False
            diff_obj.append([key, value, ""])

    for key, value in sorted(b.items()):
        if key not in a:
            # Key only present on the second side.
            result = False
            diff_obj.append([key, "", value])

    return result, diff_obj


def _diff_values(a, b, label):
    """Compare two non-container values; return ``(result, diff_obj)``."""
    if type(a) != type(b):
        result = False
    elif type(a) is str:
        a = a.replace(_XLINK_NS_DECL, "")
        b = b.replace(_XLINK_NS_DECL, "")

        # The Cedrics -> JATS changes the order
        a = a.replace('close="" open="{" separators="', "")
        b = b.replace('separators="" open="{" close="', "")
        a = a.replace('separators="" open="|" close="', "")
        b = b.replace('close="" open="|" separators="', "")

        # Compare as dates when both strings can be parsed as dates;
        # otherwise keep comparing the (normalized) strings.
        try:
            a_date = model_helpers.parse_date_str(a)
            b_date = model_helpers.parse_date_str(b)
        except Exception:
            pass
        else:
            a, b = a_date, b_date
        result = a == b
    else:
        result = a == b

    if result:
        return result, []

    # Only look for the first differing character when both sides are strings:
    # `b` may be None or any other type when the types differ.
    if isinstance(a, str) and isinstance(b, str):
        _print_first_mismatch(label, a, b)

    return result, [label, a, b]


def create_diff_obj(a, b, label):
    """Recursively compare two values and describe their differences.

    Objects exposing ``__dict__`` are compared through that dict. Lists are
    compared element by element, dicts key by key, and everything else with
    ``==`` (strings are normalized first, see ``_diff_values``).

    Args:
        a: first value.
        b: second value.
        label: name of the value being compared. Some labels are skipped
            unless ``TEST_STRICT`` is set.

    Returns:
        A ``(result, diff_obj)`` tuple. ``result`` is truthy when no difference
        was found. ``diff_obj`` is a list describing the differences: empty
        when equal, ``[label, a, b]`` for plain values, nested lists for
        containers.
    """
    if label in _LENIENT_LABELS and not TEST_STRICT:
        return True, []

    if hasattr(a, "__dict__"):
        a = a.__dict__
    if hasattr(b, "__dict__"):
        b = b.__dict__

    # Exact type checks (not isinstance): subclasses of list/dict are compared
    # as plain values.
    if type(a) is list and type(b) is list:
        return _diff_lists(a, b, label)
    if type(a) is dict and type(b) is dict:
        return _diff_dicts(a, b)
    return _diff_values(a, b, label)

110

111

class BaseComparator:
    """Helpers shared by the data comparators.

    Every helper compares attributes of ``obj1`` and ``obj2``, records the
    differences it finds in ``diff_dict`` and returns a boolean result.
    """

    def compare(self, obj1, obj2, diff_dict):
        """Base comparison: nothing is compared, always return True."""
        return True

    def compare_list_of_simple_types(self, obj1, obj2, diff_dict, list_of_simple_types):
        """Compare the attributes named in ``list_of_simple_types`` with ``!=``.

        Each differing attribute is stored as
        ``diff_dict[attr] = [obj1 value, obj2 value]``.

        Returns:
            True if no compared attribute differs, False otherwise.
        """
        result = True
        xlink_ns_decl = ' xmlns:xlink="http://www.w3.org/1999/xlink"'

        for attr in list_of_simple_types:
            # Matching info were reported in /cedram_dev/exploitation, not in /cedram_dev/production_tex
            # We have to ignore the DOIs
            if attr in ["doi", "citation_xml"] and hasattr(obj1, "citation_xml"):
                continue
            if (
                attr == "doi"
                and hasattr(obj2, "pid")
                and obj2.pid is not None
                and obj2.pid.startswith("AIF_")
            ):
                continue

            obj1_attr = getattr(obj1, attr)
            obj2_attr = getattr(obj2, attr)

            # A "numdam" provider on obj2 is compared as "mathdoc".
            if attr == "provider" and obj2_attr == "numdam":
                obj2_attr = "mathdoc"

            # The xlink namespace declaration is not significant.
            if isinstance(obj1_attr, str) and isinstance(obj2_attr, str):
                obj1_attr = obj1_attr.replace(xlink_ns_decl, "")
                obj2_attr = obj2_attr.replace(xlink_ns_decl, "")

            if obj1_attr != obj2_attr:
                result = False
                diff_dict[attr] = [obj1_attr, obj2_attr]

        return result

    def compare_list_of_list(self, obj1, obj2, diff_dict, list_of_list):
        """Compare the attributes named in ``list_of_list`` with ``create_diff_obj``.

        Each differing attribute is stored as ``diff_dict[attr] = diff_obj``.

        Returns:
            True if no compared attribute differs, False otherwise.
        """
        result = True

        for attr in list_of_list:
            obj1_attr = getattr(obj1, attr)
            obj2_attr = getattr(obj2, attr)

            if not TEST_STRICT:
                # Matching info were reported in /cedram_dev/exploitation, not in /cedram_dev/production_tex
                # We have to ignore the DOIs
                if attr == "extids" and hasattr(obj1, "citation_xml"):
                    continue
                if (
                    attr == "ids"
                    and hasattr(obj2, "pid")
                    and obj2.pid is not None
                    and obj2.pid.startswith("AIF_")
                ):
                    continue

            result_obj, diff_obj = create_diff_obj(obj1_attr, obj2_attr, attr)
            if not result_obj:
                result = False
                diff_dict[attr] = diff_obj

        return result

    def compare_list_of_objs(self, obj1, obj2, diff_dict, attr, attr_id_list, comparator):
        """Compare two lists of objects pairwise with ``comparator.compare``.

        Args:
            obj1, obj2: objects holding the lists in their ``attr`` attribute.
            diff_dict: dict receiving the differences under the ``attr`` key.
            attr: name of the list attribute.
            attr_id_list: attribute names copied from the obj1-side element
                into each difference entry, to identify it.
            comparator: object whose ``compare(o1, o2, diff)`` is called for
                each pair.

        Returns:
            False if at least one pair differs, True otherwise.
        """
        list1 = getattr(obj1, attr)
        list2 = getattr(obj2, attr)

        if len(list1) != len(list2):
            # NOTE(review): a length mismatch is recorded (except for
            # "articles") but the result stays True - confirm this is intended.
            if attr != "articles":
                diff_dict[attr] = [{"len": [len(list1), len(list2)]}]
            return True

        attr_diffs = []
        for list1_obj, list2_obj in zip(list1, list2):
            attr_diff_dict = {}
            if comparator.compare(list1_obj, list2_obj, attr_diff_dict):
                continue
            params = {"diff": attr_diff_dict}
            for attr_id in attr_id_list:
                params[attr_id] = getattr(list1_obj, attr_id)
            attr_diffs.append(params)

        if attr_diffs:
            diff_dict[attr] = attr_diffs

        return not attr_diffs

202

203

class ResourceDataComparator(BaseComparator):
    """Compare the attributes checked for every kind of resource data."""

    def compare(self, obj1, obj2, diff_dict):
        """Compare titles, language, ids, list attributes and bibitems.

        Differences are recorded in ``diff_dict``.

        Returns:
            True if no difference was found, False otherwise.
        """
        result = super().compare(obj1, obj2, diff_dict)

        # Ignore trans_title_xml: it is not used in the Django DB
        if not self.compare_list_of_simple_types(
            obj1,
            obj2,
            diff_dict,
            [
                "lang",
                "pid",
                "doi",
                "title_xml",
                "title_tex",
                "title_html",
                "trans_lang",
                "trans_title_html",
                "trans_title_tex",
                "funding_statement_html",
                "footnotes_html",
            ],
        ):
            result = False

        # ids, extids and ext_links are only compared in strict mode (below).
        if not self.compare_list_of_list(
            obj1,
            obj2,
            diff_dict,
            [
                "abstracts",
                "awards",
                "relations",
                "streams",
                "related_objects",
                "counts",
                "contributors",
                "kwds",
                "kwd_groups",
                "figures",
                "supplementary_materials",
            ],
        ):
            result = False

        if TEST_STRICT:
            if not self.compare_list_of_list(
                obj1, obj2, diff_dict, ["ids", "extids", "ext_links"]
            ):
                result = False

        if not self.compare_list_of_objs(
            obj1, obj2, diff_dict, "bibitems", ["label"], RefDataComparator()
        ):
            result = False

        return result

284

285

class MathdocPublicationDataComparator(ResourceDataComparator):
    """Resource comparison extended with coltype, e_issn, wall and provider."""

    def compare(self, obj1, obj2, diff_dict):
        """Run the resource comparison, then compare the publication attributes.

        Returns:
            True if no difference was found, False otherwise.
        """
        result = super().compare(obj1, obj2, diff_dict)

        if not self.compare_list_of_simple_types(
            obj1, obj2, diff_dict, ["coltype", "e_issn", "wall", "provider"]
        ):
            result = False

        return result

298

299

class PublisherDataComparator(BaseComparator):
    """Compare the name and loc of two publishers; either may be None."""

    def compare(self, obj1, obj2, diff_dict):
        """Compare two publishers and record the differences in ``diff_dict``.

        Returns:
            True if both publishers are None or have the same name and loc,
            False otherwise (including when only one of them is None).
        """
        if obj1 is None and obj2 is None:
            return True

        if obj1 is None or obj2 is None:
            # Only one side has a publisher: report it as a difference instead
            # of failing on an attribute lookup on None.
            for attr in ("name", "loc"):
                diff_dict[attr] = [getattr(obj1, attr, None), getattr(obj2, attr, None)]
            return False

        result = super().compare(obj1, obj2, diff_dict)

        if not self.compare_list_of_simple_types(obj1, obj2, diff_dict, ["name", "loc"]):
            result = False

        return result

312

313

class JournalDataComparator(ResourceDataComparator):
    """Resource comparison extended with the journal publisher."""

    def compare(self, obj1, obj2, diff_dict):
        """Run the resource comparison, then compare the publishers.

        Publisher differences are stored under ``diff_dict["publisher"]``.

        Returns:
            True if no difference was found, False otherwise.
        """
        result = super().compare(obj1, obj2, diff_dict)

        pub_diff = {}
        if not PublisherDataComparator().compare(obj1.publisher, obj2.publisher, pub_diff):
            result = False
            diff_dict["publisher"] = pub_diff

        return result

325

326

class IssueDataComparator(ResourceDataComparator):
    """Resource comparison extended with the issue journal, publisher and articles."""

    def compare(self, obj1, obj2, diff_dict):
        """Compare two issues and record the differences in ``diff_dict``.

        Journal and publisher differences are stored under the "journal" and
        "publisher" keys, article differences under "articles".

        Returns:
            True if no difference was found, False otherwise.
        """
        result = super().compare(obj1, obj2, diff_dict)

        journal_diff = {}
        if not JournalDataComparator().compare(obj1.journal, obj2.journal, journal_diff):
            result = False
            diff_dict["journal"] = journal_diff

        pub_diff = {}
        if not PublisherDataComparator().compare(obj1.publisher, obj2.publisher, pub_diff):
            result = False
            diff_dict["publisher"] = pub_diff

        # Ignore last_modified (it's OK if it has been edited in ptf-tools)
        # Ignore prod_deployed_date (set during import by database cmds: Django DB is different from the XML)
        if not self.compare_list_of_simple_types(
            obj1, obj2, diff_dict, ["provider", "ctype", "year", "vseries", "volume", "number"]
        ):
            result = False

        if not self.compare_list_of_objs(
            obj1, obj2, diff_dict, "articles", ["pid", "doi"], ArticleDataComparator()
        ):
            result = False

        return result

361

362

class ArticleDataComparator(ResourceDataComparator):
    """Resource comparison extended with article attributes and history dates."""

    def compare(self, obj1, obj2, diff_dict):
        """Compare two articles and record the differences in ``diff_dict``.

        History dates are matched by their "type"; a differing or missing date
        is stored as ``diff_dict[type] = [obj1 date, obj2 date]`` with ``""``
        for the missing side.

        Returns:
            True if no difference was found, False otherwise.
        """
        result = super().compare(obj1, obj2, diff_dict)

        if not self.compare_list_of_simple_types(
            obj1,
            obj2,
            diff_dict,
            [
                "pid",
                "atype",
                "seq",
                "article_number",
                "talk_number",
                "fpage",
                "lpage",
                "page_range",
                "size",
                "page_type",
                "elocation",
                "coi_statement",
            ],
        ):
            result = False

        dates1 = list(obj1.history_dates)
        dates2 = list(obj2.history_dates)
        # Ignore date_published as it is OK if they are different between DjangoDB and the XML

        # Dates of obj1: each one must exist in obj2 with the same parsed value.
        for date1 in dates1:
            type1 = date1["type"]
            if type1 == "online":
                continue

            dates = [date for date in dates2 if date["type"] == type1]
            if not dates:
                result = False
                diff_dict[type1] = [date1["date"], ""]
                continue

            date2 = dates[0]
            d1 = model_helpers.parse_date_str(date1["date"])
            d2 = model_helpers.parse_date_str(date2["date"])
            if d1 != d2:
                result = False
                diff_dict[type1] = [date1["date"], date2["date"]]

        # Dates of obj2 that have no counterpart in obj1.
        for date2 in dates2:
            type2 = date2["type"]
            if type2 == "online":
                continue
            if not any(date["type"] == type2 for date in dates1):
                result = False
                diff_dict[type2] = ["", date2["date"]]

        return result

417

418

class RefDataComparator(ResourceDataComparator):
    """Resource comparison extended with bibliographic reference attributes."""

    def compare(self, obj1, obj2, diff_dict):
        """Compare two references and record the differences in ``diff_dict``.

        The citation_xml/html/tex attributes are only compared when
        ``TEST_STRICT`` is set.

        Returns:
            True if no difference was found, False otherwise.
        """
        result = super().compare(obj1, obj2, diff_dict)

        # Ignore citation_*. The difference will be flagged with the ext_ids
        # label_prefix and label_suffix are not compared either.
        if not self.compare_list_of_simple_types(
            obj1,
            obj2,
            diff_dict,
            [
                "lang",
                "user_id",
                "label",
                "type",
                "publisher_name",
                "publisher_loc",
                "institution",
                "series",
                "volume",
                "issue",
                "month",
                "year",
                "comment",
                "annotation",
                "fpage",
                "lpage",
                "page_range",
                "size",
                "source_tex",
                "article_title_tex",
                "chapter_title_tex",
            ],
        ):
            result = False

        if TEST_STRICT:
            if not self.compare_list_of_simple_types(
                obj1, obj2, diff_dict, ["citation_xml", "citation_html", "citation_tex"]
            ):
                result = False

        if not self.compare_list_of_list(obj1, obj2, diff_dict, ["contributors"]):
            result = False

        return result

468

469

class CollectionDataComparator(ResourceDataComparator):
    """Resource comparison extended with collection attributes."""

    def compare(self, obj1, obj2, diff_dict):
        """Run the resource comparison, then compare the collection attributes.

        Returns:
            True if no difference was found, False otherwise.
        """
        result = super().compare(obj1, obj2, diff_dict)

        if not self.compare_list_of_simple_types(
            obj1, obj2, diff_dict, ["coltype", "issn", "e_issn", "volume", "vseries", "seq"]
        ):
            result = False

        return result

482

483

class BookDataComparator(ResourceDataComparator):
    """Resource comparison extended with book attributes, publisher, collections and parts."""

    def compare(self, obj1, obj2, diff_dict):
        """Compare two books and record the differences in ``diff_dict``.

        Publisher differences are stored under "publisher", collection
        differences under "incollection" and part differences under "parts".

        Returns:
            True if no difference was found, False otherwise.
        """
        result = super().compare(obj1, obj2, diff_dict)

        if not self.compare_list_of_simple_types(
            obj1, obj2, diff_dict, ["ctype", "provider", "frontmatter", "body"]
        ):
            result = False

        pub_diff = {}
        if not PublisherDataComparator().compare(obj1.publisher, obj2.publisher, pub_diff):
            result = False
            diff_dict["publisher"] = pub_diff

        if not self.compare_list_of_objs(
            obj1, obj2, diff_dict, "incollection", ["pid"], CollectionDataComparator()
        ):
            result = False

        if not self.compare_list_of_objs(
            obj1, obj2, diff_dict, "parts", ["pid"], BookPartDataComparator()
        ):
            result = False

        return result

516

517

class BookPartDataComparator(ArticleDataComparator):
    """Article comparison extended with the frontmatter attribute."""

    def compare(self, obj1, obj2, diff_dict):
        """Run the article comparison, then compare the frontmatter.

        Returns:
            True if no difference was found, False otherwise.
        """
        result = super().compare(obj1, obj2, diff_dict)

        if not self.compare_list_of_simple_types(obj1, obj2, diff_dict, ["frontmatter"]):
            result = False

        return result

527

528

def prepare_issue_for_comparison(xml_issue):
    """Adjust, in place, an issue parsed from XML before comparing it.

    The issue's journal is replaced by journal data built from the collection
    found with the journal pid, and the doi of every article bibitem is reset
    to None.

    Args:
        xml_issue: issue data to modify; it must expose ``journal.pid`` and
            ``articles``, each article exposing ``bibitems``.
    """
    # xml_cmds does not use the jats_parser collection but retrieve it from the database
    collection = model_helpers.get_collection(xml_issue.journal.pid)
    xml_issue.journal = db_to_journal_data(collection)

    for article in xml_issue.articles:
        for ref in article.bibitems:
            # the Django DB ignores the ref doi although it is set by the XML parser.
            ref.doi = None

Coverage for src/ptf/model_data_comparator.py: 74%

231 statements