Commit: check_br_constraints

arcangelo7 committed Nov 29, 2024
1 parent 8d568c9 · commit 58b53e3
Showing 4 changed files with 83 additions and 36 deletions.
103 changes: 75 additions & 28 deletions oc_meta/run/merge/check_merged_brs_results.py
@@ -9,14 +9,20 @@
from multiprocessing import Pool, cpu_count

import yaml
from oc_meta.plugins.editor import MetaEditor
from filelock import FileLock
from rdflib import RDF, ConjunctiveGraph, Literal, Namespace, URIRef
from SPARQLWrapper import JSON, SPARQLWrapper
from tqdm import tqdm

from oc_meta.plugins.editor import MetaEditor

DATACITE = "http://purl.org/spar/datacite/"
FABIO = "http://purl.org/spar/fabio/"
PROV = Namespace("http://www.w3.org/ns/prov#")
PRO = Namespace("http://purl.org/spar/pro/")
DCTERMS = Namespace("http://purl.org/dc/terms/")
FRBR = Namespace("http://purl.org/vocab/frbr/core#")
PRISM = Namespace("http://prismstandard.org/namespaces/basic/2.1/")

def read_csv(csv_file):
with open(csv_file, 'r') as f:
@@ -33,34 +39,75 @@ def sparql_query_with_retry(sparql, max_retries=3, initial_delay=1, backoff_fact…
delay = initial_delay * (backoff_factor ** attempt)
time.sleep(delay + random.uniform(0, 1))
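
# NOTE (sketch, not part of this commit): the hunk above shows only the
# backoff tail of sparql_query_with_retry. A minimal version of the full
# helper, assuming SPARQLWrapper's queryAndConvert() and a catch-all retry
# policy; it relies on the module's existing time and random imports, and
# the committed implementation may differ.
def _sparql_query_with_retry_sketch(sparql, max_retries=3, initial_delay=1,
                                    backoff_factor=2):  # default assumed; truncated above
    for attempt in range(max_retries):
        try:
            return sparql.queryAndConvert()  # execute and parse the query
        except Exception:
            if attempt == max_retries - 1:
                raise  # retries exhausted: surface the last error
            delay = initial_delay * (backoff_factor ** attempt)  # exponential backoff
            time.sleep(delay + random.uniform(0, 1))  # jitter spreads out retries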

def check_br_constraints(g: ConjunctiveGraph, entity):
issues = []

# Check types
types = list(g.objects(entity, RDF.type, unique=True))
if not types:
issues.append(f"Entity {entity} has no type")
elif len(types) > 2:
issues.append(f"Entity {entity} has more than two types")
elif URIRef(FABIO + "Expression") not in types:
issues.append(f"Entity {entity} is not a fabio:Expression")

# Check if entity is a journal issue or volume
is_journal_issue = URIRef(FABIO + "JournalIssue") in types
is_journal_volume = URIRef(FABIO + "JournalVolume") in types

# Check identifiers
identifiers = list(g.objects(entity, URIRef(DATACITE + "hasIdentifier"), unique=True))
if not identifiers:
issues.append(f"Entity {entity} has no datacite:hasIdentifier")

# Check title (zero or one)
titles = list(g.objects(entity, DCTERMS.title, unique=True))
if len(titles) > 1:
issues.append(f"Entity {entity} has multiple titles")

# Check part of (zero or one)
part_of = list(g.objects(entity, FRBR.partOf, unique=True))
if len(part_of) > 1:
issues.append(f"Entity {entity} has multiple partOf relations")

# Check publication date (zero or one)
pub_dates = list(g.objects(entity, PRISM.hasPublicationDate, unique=True))
if len(pub_dates) > 1:
issues.append(f"Entity {entity} has multiple publication dates")

# Check sequence identifier (zero or one)
seq_ids = list(g.objects(entity, URIRef(FABIO + "hasSequenceIdentifier"), unique=True))
if len(seq_ids) > 1:
issues.append(f"Entity {entity} has multiple sequence identifiers")
elif seq_ids and not (is_journal_issue or is_journal_volume):
issues.append(f"Entity {entity} has sequence identifier but is not a journal issue or volume")

return issues

def check_entity_file(file_path: str, entity_uri, is_surviving):
with zipfile.ZipFile(file_path, 'r') as zip_ref:
for filename in zip_ref.namelist():
with zip_ref.open(filename) as file:
g = ConjunctiveGraph()
g.parse(file, format='json-ld')
entity = URIRef(entity_uri)

if (entity, None, None) not in g:
if is_surviving:
tqdm.write(f"Error in file {file_path}: Surviving entity {entity_uri} does not exist")
return

if not is_surviving:
tqdm.write(f"Error in file {file_path}: Merged entity {entity_uri} still exists")
return

types = list(g.objects(entity, RDF.type))
if not types:
tqdm.write(f"Error in file {file_path}: Entity {entity_uri} has no type")
elif len(types) > 2:
tqdm.write(f"Error in file {file_path}: Entity {entity_uri} has more than two types")
elif URIRef(FABIO + "Expression") not in types:
tqdm.write(f"Error in file {file_path}: Entity {entity_uri} is not a fabio:Expression")

identifiers = list(g.objects(entity, URIRef(DATACITE + "hasIdentifier")))
if not identifiers:
tqdm.write(f"Error in file {file_path}: Entity {entity_uri} has no datacite:hasIdentifier")
lock_path = f"{file_path}.lock"
lock = FileLock(lock_path)

with lock:
with zipfile.ZipFile(file_path, 'r') as zip_ref:
for filename in zip_ref.namelist():
with zip_ref.open(filename) as file:
g = ConjunctiveGraph()
g.parse(file, format='json-ld')
entity = URIRef(entity_uri)

if (entity, None, None) not in g:
if is_surviving:
tqdm.write(f"Error in file {file_path}: Surviving entity {entity_uri} does not exist")
return

if not is_surviving:
tqdm.write(f"Error in file {file_path}: Merged entity {entity_uri} still exists")
return

br_issues = check_br_constraints(g, entity)
for issue in br_issues:
tqdm.write(f"Error in file {file_path}: {issue}")

# Check provenance
prov_file_path = file_path.replace('.zip', '') + '/prov/se.zip'
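A minimal standalone sketch of how the new check_br_constraints helper can be exercised on an in-memory graph. The bibliographic-resource URI below is hypothetical, and the import assumes the script guards its CLI entry point behind a main check:

from rdflib import RDF, ConjunctiveGraph, URIRef

from oc_meta.run.merge.check_merged_brs_results import FABIO, check_br_constraints

g = ConjunctiveGraph()
br = URIRef("https://w3id.org/oc/meta/br/061")  # hypothetical entity URI
g.add((br, RDF.type, URIRef(FABIO + "Expression")))
g.add((br, RDF.type, URIRef(FABIO + "JournalArticle")))
# datacite:hasIdentifier is deliberately omitted, so the checker flags it

for issue in check_br_constraints(g, br):
    print(issue)
# -> Entity https://w3id.org/oc/meta/br/061 has no datacite:hasIdentifier
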
6 changes: 3 additions & 3 deletions oc_meta/run/merge/entities.py
@@ -261,9 +261,9 @@ def process_folder(self, csv_folder: str):
                     if file.endswith('.csv')]

        # Filter CSV files based on number of rows and workers
        # if self.workers > 4:
        #     csv_files = [file for file in csv_files
        #                 if self.count_csv_rows(file) <= 10000]
        if self.workers > 4:
            csv_files = [file for file in csv_files
                        if self.count_csv_rows(file) <= 10000]

        with concurrent.futures.ProcessPoolExecutor(max_workers=self.workers) as executor:
            futures = {
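The re-enabled filter calls self.count_csv_rows, which lies outside this hunk. A plausible sketch of such a counter, assuming one header row per CSV (the actual method may differ):

import csv

def count_csv_rows(self, csv_file: str) -> int:
    # Count data rows, excluding the header line
    with open(csv_file, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header, if any
        return sum(1 for _ in reader)
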
8 changes: 4 additions & 4 deletions poetry.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -27,7 +27,7 @@ oc-ds-converter = "^1.0.4"
ijson = "^3.2.3"
internetarchive = "^3.7.0"
zenodopy = "^0.3.0"
oc-ocdm = "9.2.0"
oc-ocdm = "9.2.1"
retrying = "^1.3.4"
orjson = "^3.10.7"
rdflib-ocdm = "0.3.11"
