diff --git a/.gitattributes b/.gitattributes
index fdcf6b7f..4e19dc6e 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1 +1,2 @@
 virtuoso-opensource/ filter=lfs diff=lfs merge=lfs -text
+duplicated_br_from_files.csv filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
index 36c3da52..d50d76de 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,4 +24,5 @@ zenodo.yaml
 internet_archive.yaml
 storage
 virtuoso-opensource
-ts_upload_cache.json
\ No newline at end of file
+ts_upload_cache.json
+duplicated_brs_from_files/
\ No newline at end of file
diff --git a/duplicated_br_from_files.csv b/duplicated_br_from_files.csv
new file mode 100644
index 00000000..62c90a82
--- /dev/null
+++ b/duplicated_br_from_files.csv
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b1c35ec18eac67c03bc39d1f36843e852ffff67178f68af6df0d374ac5897602
+size 186474735
diff --git a/oc_meta/run/merge/check_results.py b/oc_meta/run/merge/check_results.py
index 069bff73..78a1077d 100644
--- a/oc_meta/run/merge/check_results.py
+++ b/oc_meta/run/merge/check_results.py
@@ -88,6 +88,19 @@ def check_entity_sparql(sparql_endpoint, entity_uri, is_surviving):
         tqdm.write(f"Error in SPARQL: Surviving entity {entity_uri} does not exist")
         return
 
+    if not is_surviving:
+        referenced_query = f"""
+        ASK {{
+            ?s ?p <{entity_uri}> .
+        }}
+        """
+        sparql.setQuery(referenced_query)
+        sparql.setReturnFormat(JSON)
+        referenced_results = sparql_query_with_retry(sparql)
+
+        if referenced_results['boolean']:
+            tqdm.write(f"Error in SPARQL: Merged entity {entity_uri} is still referenced by other entities")
+
     # Query to get entity types
     types_query = f"""
     SELECT ?type WHERE {{
diff --git a/oc_meta/run/merge/find_diff_between_sparql_and_files_merge_lists.py b/oc_meta/run/merge/find_diff_between_sparql_and_files_merge_lists.py
new file mode 100644
index 00000000..770c7a68
--- /dev/null
+++ b/oc_meta/run/merge/find_diff_between_sparql_and_files_merge_lists.py
@@ -0,0 +1,43 @@
+import argparse
+import csv
+from typing import List, Set, Tuple
+
+def read_csv(file_path: str) -> Set[str]:
+    entities = set()
+    with open(file_path, 'r') as csvfile:
+        reader = csv.reader(csvfile)
+        next(reader)  # Skip header
+        for row in reader:
+            surviving_entity = row[0]
+            merged_entities = row[1].split('; ')
+            entities.add(surviving_entity)
+            entities.update(merged_entities)
+    return entities
+
+def find_differences(file_a: str, file_b: str) -> Tuple[List[str], List[str]]:
+    entities_a = read_csv(file_a)
+    entities_b = read_csv(file_b)
+
+    in_a_not_b = list(entities_a - entities_b)
+    in_b_not_a = list(entities_b - entities_a)
+
+    return in_a_not_b, in_b_not_a
+
+def main():
+    parser = argparse.ArgumentParser(description="Find differences between two CSV files.")
+    parser.add_argument("file_a", help="Path to the first CSV file")
+    parser.add_argument("file_b", help="Path to the second CSV file")
+    args = parser.parse_args()
+
+    in_a_not_b, in_b_not_a = find_differences(args.file_a, args.file_b)
+
+    print("Entities in A but not in B:")
+    for entity in in_a_not_b:
+        print(entity)
+
+    print("\nEntities in B but not in A:")
+    for entity in in_b_not_a:
+        print(entity)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/oc_meta/run/merge/group_entities_to_be_merged.py b/oc_meta/run/merge/group_entities_to_be_merged.py
index cefe1293..0874a69a 100644
--- a/oc_meta/run/merge/group_entities_to_be_merged.py
+++ b/oc_meta/run/merge/group_entities_to_be_merged.py
@@ -1,9 +1,10 @@
-import pandas as pd
-import os
 import argparse
-from SPARQLWrapper import SPARQLWrapper, JSON
-from tqdm import tqdm
+import os
+
+import pandas as pd
 from retrying import retry
+from SPARQLWrapper import JSON, SPARQLWrapper
+from tqdm import tqdm
 
 
 class UnionFind:
@@ -11,11 +12,16 @@ def __init__(self):
         self.parent = {}
 
     def find(self, item):
-        if item not in self.parent:
-            self.parent[item] = item
-        if self.parent[item] != item:
-            self.parent[item] = self.find(self.parent[item])
-        return self.parent[item]
+        path = [item]
+        while item in self.parent and self.parent[item] != item:
+            item = self.parent[item]
+            path.append(item)
+            if len(path) > 1000:  # Arbitrary limit to avoid infinite loops
+                print(f"Warning: Long path detected: {' -> '.join(path)}")
+                break
+        for p in path:
+            self.parent[p] = item
+        return item
 
     def union(self, item1, item2):
         root1 = self.find(item1)
@@ -23,30 +29,43 @@ def union(self, item1, item2):
         if root1 != root2:
             self.parent[root2] = root1
 
-
 def load_csv(file_path):
     return pd.read_csv(file_path)
 
 @retry(stop_max_attempt_number=3, wait_exponential_multiplier=1000, wait_exponential_max=10000)
-def query_sparql(endpoint, uri):
+def query_sparql(endpoint, uri, query_type):
     sparql = SPARQLWrapper(endpoint)
-    query = f"""
-    SELECT ?subject WHERE {{
-        ?subject ?predicate <{uri}> .
-    }}
-    """
+
+    if query_type == 'subjects':
+        query = f"""
+        SELECT ?subject WHERE {{
+            ?subject ?predicate <{uri}> .
+        }}
+        """
+    elif query_type == 'objects':
+        query = f"""
+        SELECT ?object WHERE {{
+            <{uri}> ?predicate ?object .
+            ?object ?p ?o .
+        }}
+        """
+
     sparql.setQuery(query)
     sparql.setReturnFormat(JSON)
     results = sparql.query().convert()
-    subjects = [result['subject']['value'] for result in results['results']['bindings']]
-    return subjects
+    if query_type == 'subjects':
+        return [result['subject']['value'] for result in results['results']['bindings']]
+    elif query_type == 'objects':
+        return [result['object']['value'] for result in results['results']['bindings']]
 
 
 def get_all_related_entities(endpoint, uris):
     related_entities = set(uris)
     for uri in uris:
-        subjects = query_sparql(endpoint, uri)
+        subjects = query_sparql(endpoint, uri, 'subjects')
+        objects = query_sparql(endpoint, uri, 'objects')
         related_entities.update(subjects)
+        related_entities.update(objects)
     return related_entities
 
 def group_entities(df, endpoint):
@@ -101,4 +120,4 @@ def main():
     save_grouped_entities(grouped_entities, args.output_dir)
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/poetry.lock b/poetry.lock
index e178b365..c8928496 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1385,13 +1385,13 @@ zstandard = ">=0.21.0,<0.22.0"
 
 [[package]]
 name = "oc-ocdm"
-version = "8.2.2"
+version = "8.3.0"
 description = "Object mapping library for manipulating RDF graphs that are compliant with the OpenCitations datamodel."
 optional = false
 python-versions = "<4.0,>=3.8"
 files = [
-    {file = "oc_ocdm-8.2.2-py3-none-any.whl", hash = "sha256:cac4b06a5397a241119462d3241b715f1e0814fcc794c05944d782005babcf01"},
-    {file = "oc_ocdm-8.2.2.tar.gz", hash = "sha256:813203dfef79f33fcd1dd4e593f56c3c6f6337c224082ba00601d3a59ed45c5c"},
+    {file = "oc_ocdm-8.3.0-py3-none-any.whl", hash = "sha256:a512cf8afd3de3c270099887a19b73c750217354b5572e0a5e7be1aa00ebb473"},
+    {file = "oc_ocdm-8.3.0.tar.gz", hash = "sha256:23911150fb44b8d1b011b149977518cb64fdaf04a9a316880dee7fe4b487b922"},
 ]
 
 [package.dependencies]
@@ -2709,4 +2709,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.8"
-content-hash = "8d3266e0383e0fed6dd09cf79f720501c1dfd6b6e66b37b770c45faeb4ee1f61"
+content-hash = "69ad480dc06d04051b5c1f57eabf6144df83f6b79172b9ac367f8d176cd696db"
diff --git a/pyproject.toml b/pyproject.toml
index 5a52260b..3ffd519a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,7 +28,7 @@ oc-ds-converter = "1.0.3"
 ijson = "^3.2.3"
 internetarchive = "^3.7.0"
 zenodopy = "^0.3.0"
-oc-ocdm = "8.2.2"
+oc-ocdm = "8.3.0"
 retrying = "^1.3.4"
 orjson = "^3.10.7"
 
diff --git a/virtuoso-opensource/database/virtuoso-temp.db b/virtuoso-opensource/database/virtuoso-temp.db
index e63ce37a..7d8d8f98 100644
Binary files a/virtuoso-opensource/database/virtuoso-temp.db and b/virtuoso-opensource/database/virtuoso-temp.db differ
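
Side note on the check_results.py hunk above: the new ASK query verifies that a merged (non-surviving) entity is no longer the object of any triple. A minimal standalone sketch of the same check follows; the endpoint URL and entity URI are hypothetical placeholders, and a plain SPARQLWrapper call stands in for the repository's sparql_query_with_retry helper.

from SPARQLWrapper import JSON, SPARQLWrapper

def is_still_referenced(endpoint: str, entity_uri: str) -> bool:
    # ASK returns a single boolean: does any triple still point at entity_uri?
    sparql = SPARQLWrapper(endpoint)
    sparql.setQuery(f"ASK {{ ?s ?p <{entity_uri}> . }}")
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()["boolean"]

# Hypothetical endpoint and entity URI, for illustration only:
if is_still_referenced("http://localhost:8890/sparql", "https://w3id.org/oc/meta/br/061234"):
    print("Merged entity is still referenced by other entities")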
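
Likewise, the UnionFind.find rewrite in group_entities_to_be_merged.py replaces a recursive lookup (which could exhaust Python's recursion stack on deep chains) with an iterative walk, capped at 1000 steps to guard against cyclic parent links, followed by path compression. A short usage sketch with made-up identifiers, assuming the class is imported from the module changed above:

from oc_meta.run.merge.group_entities_to_be_merged import UnionFind

uf = UnionFind()
uf.union('br/061', 'br/062')  # 'br/061' becomes the root of 'br/062'
uf.union('br/062', 'br/063')  # 'br/063' joins the same group transitively
# All three identifiers now resolve to a single representative:
assert uf.find('br/061') == uf.find('br/062') == uf.find('br/063')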