Skip to content

Commit

Permalink
group entities by referenced entities
Browse files Browse the repository at this point in the history
  • Loading branch information
arcangelo7 committed Sep 25, 2024
1 parent e9844b9 commit 020509f
Show file tree
Hide file tree
Showing 9 changed files with 106 additions and 26 deletions.
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
virtuoso-opensource/ filter=lfs diff=lfs merge=lfs -text
duplicated_br_from_files.csv filter=lfs diff=lfs merge=lfs -text
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,5 @@ zenodo.yaml
internet_archive.yaml
storage
virtuoso-opensource
ts_upload_cache.json
ts_upload_cache.json
duplicated_brs_from_files/
3 changes: 3 additions & 0 deletions duplicated_br_from_files.csv
Git LFS file not shown
13 changes: 13 additions & 0 deletions oc_meta/run/merge/check_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,19 @@ def check_entity_sparql(sparql_endpoint, entity_uri, is_surviving):
tqdm.write(f"Error in SPARQL: Surviving entity {entity_uri} does not exist")
return

if not is_surviving:
referenced_query = f"""
ASK {{
?s ?p <{entity_uri}> .
}}
"""
sparql.setQuery(referenced_query)
sparql.setReturnFormat(JSON)
referenced_results = sparql_query_with_retry(sparql)

if referenced_results['boolean']:
tqdm.write(f"Error in SPARQL: Merged entity {entity_uri} is still referenced by other entities")

# Query to get entity types
types_query = f"""
SELECT ?type WHERE {{
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import argparse
import csv
from typing import Set, List

def read_csv(file_path: str) -> Set[str]:
    """Collect every entity URI mentioned in a merge CSV.

    The file is expected to have a header row followed by rows whose
    first column is the surviving entity and whose second column lists
    the merged entities separated by '; '.

    Returns:
        The union of surviving and merged entity identifiers.
    """
    entities: Set[str] = set()
    # newline='' is the documented way to open files for the csv module
    with open(file_path, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip header; default tolerates an empty file
        for row in reader:
            if len(row) < 2:
                continue  # Skip blank or malformed rows
            entities.add(row[0])
            # Filter out empty fragments produced by an empty merged column
            entities.update(e for e in row[1].split('; ') if e)
    return entities

def find_differences(file_a: str, file_b: str) -> tuple[List[str], List[str]]:
    """Return the entities unique to each of two merge CSV files.

    The first list holds entities found in *file_a* but not in *file_b*;
    the second holds those found in *file_b* but not in *file_a*.
    """
    set_a = read_csv(file_a)
    set_b = read_csv(file_b)
    return list(set_a.difference(set_b)), list(set_b.difference(set_a))

def main():
    """CLI entry point: report entities unique to each of two CSV files."""
    parser = argparse.ArgumentParser(
        description="Find differences between two CSV files.")
    parser.add_argument("file_a", help="Path to the first CSV file")
    parser.add_argument("file_b", help="Path to the second CSV file")
    args = parser.parse_args()

    only_a, only_b = find_differences(args.file_a, args.file_b)

    print("Entities in A but not in B:")
    for entity in only_a:
        print(entity)

    print("\nEntities in B but not in A:")
    for entity in only_b:
        print(entity)


if __name__ == "__main__":
    main()
59 changes: 39 additions & 20 deletions oc_meta/run/merge/group_entities_to_be_merged.py
Original file line number Diff line number Diff line change
@@ -1,52 +1,71 @@
import pandas as pd
import os
import argparse
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm import tqdm
import os

import pandas as pd
from retrying import retry
from SPARQLWrapper import JSON, SPARQLWrapper
from tqdm import tqdm


class UnionFind:
    """Disjoint-set (union-find) structure over arbitrary hashable items.

    Items are registered lazily on first lookup. `find` uses iterative
    traversal with full path compression, so repeated lookups stay
    near-constant time and deep chains cannot blow the recursion limit.
    """

    def __init__(self):
        # Maps each item to its parent; roots map to themselves.
        self.parent = {}

    def find(self, item):
        """Return the representative (root) of *item*'s set.

        Walks the parent chain iteratively, then compresses the whole
        visited path so every node points directly at the root.
        """
        if item not in self.parent:
            self.parent[item] = item
            return item
        path = [item]
        while self.parent[item] != item:
            item = self.parent[item]
            path.append(item)
            if len(path) > 1000:  # Arbitrary cap to avoid infinite loops
                print(f"Warning: Long path detected: {' -> '.join(path)}")
                break
        for node in path:
            self.parent[node] = item
        return item

    def union(self, item1, item2):
        """Merge the sets containing *item1* and *item2*.

        The root of *item1*'s set becomes the root of the merged set.
        """
        root1 = self.find(item1)
        root2 = self.find(item2)
        if root1 != root2:
            self.parent[root2] = root1


def load_csv(file_path):
    """Load the CSV at *file_path* into a pandas DataFrame."""
    dataframe = pd.read_csv(file_path)
    return dataframe

@retry(stop_max_attempt_number=3, wait_exponential_multiplier=1000, wait_exponential_max=10000)
def query_sparql(endpoint, uri, query_type):
    """Query *endpoint* for entities directly related to *uri*.

    Args:
        endpoint: SPARQL endpoint URL.
        uri: entity URI to query around.
        query_type: 'subjects' for entities that reference *uri*,
            'objects' for entities *uri* references (only objects that
            themselves appear as subjects, excluding literals).

    Returns:
        A list of related URIs. Retries up to 3 times with exponential
        backoff on failure.

    Raises:
        ValueError: if *query_type* is neither 'subjects' nor 'objects'.
    """
    sparql = SPARQLWrapper(endpoint)

    if query_type == 'subjects':
        query = f"""
        SELECT ?subject WHERE {{
            ?subject ?predicate <{uri}> .
        }}
        """
    elif query_type == 'objects':
        # The extra `?object ?p ?o` pattern keeps only objects that have
        # outgoing triples, i.e. real entities rather than literals.
        query = f"""
        SELECT ?object WHERE {{
            <{uri}> ?predicate ?object .
            ?object ?p ?o .
        }}
        """
    else:
        # Fail loudly instead of sending an undefined query / returning None
        raise ValueError(f"Unknown query_type: {query_type!r}")

    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    var = 'subject' if query_type == 'subjects' else 'object'
    return [binding[var]['value'] for binding in results['results']['bindings']]

def get_all_related_entities(endpoint, uris):
    """Return *uris* plus every entity one hop away from them.

    For each URI, gathers both the subjects that reference it and the
    objects it references, via `query_sparql` against *endpoint*.
    """
    related_entities = set(uris)
    for uri in uris:
        # Removed a stale call using query_sparql's old 2-argument
        # signature; the function now requires an explicit query_type.
        related_entities.update(query_sparql(endpoint, uri, 'subjects'))
        related_entities.update(query_sparql(endpoint, uri, 'objects'))
    return related_entities

def group_entities(df, endpoint):
Expand Down Expand Up @@ -101,4 +120,4 @@ def main():
save_grouped_entities(grouped_entities, args.output_dir)

if __name__ == "__main__":
main()
main()
8 changes: 4 additions & 4 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ oc-ds-converter = "1.0.3"
ijson = "^3.2.3"
internetarchive = "^3.7.0"
zenodopy = "^0.3.0"
oc-ocdm = "8.2.2"
oc-ocdm = "8.3.0"
retrying = "^1.3.4"
orjson = "^3.10.7"

Expand Down
Binary file modified virtuoso-opensource/database/virtuoso-temp.db
Binary file not shown.

0 comments on commit 020509f

Please sign in to comment.