Skip to content

Commit

Permalink
VIAF: add sources
Browse files Browse the repository at this point in the history
* Adds WorldCat, Wikipedia and source URLs

Co-Authored-by: Peter Weber <[email protected]>
  • Loading branch information
rerowep committed Feb 25, 2023
1 parent aa0dbdf commit 79711d2
Show file tree
Hide file tree
Showing 23 changed files with 38,401 additions and 3,535 deletions.
7,305 changes: 6,260 additions & 1,045 deletions data/aggnd.json

Large diffs are not rendered by default.

459 changes: 420 additions & 39 deletions data/aggnd_metadata.csv

Large diffs are not rendered by default.

459 changes: 420 additions & 39 deletions data/aggnd_pidstore.csv

Large diffs are not rendered by default.

3,964 changes: 3,710 additions & 254 deletions data/agrero.json

Large diffs are not rendered by default.

8,383 changes: 7,339 additions & 1,044 deletions data/aidref.json

Large diffs are not rendered by default.

566 changes: 493 additions & 73 deletions data/aidref_metadata.csv

Large diffs are not rendered by default.

566 changes: 493 additions & 73 deletions data/aidref_pidstore.csv

Large diffs are not rendered by default.

7,821 changes: 7,469 additions & 352 deletions data/mef.json

Large diffs are not rendered by default.

909 changes: 909 additions & 0 deletions data/mef_id.csv

Large diffs are not rendered by default.

1,071 changes: 990 additions & 81 deletions data/mef_metadata.csv

Large diffs are not rendered by default.

1,071 changes: 990 additions & 81 deletions data/mef_pidstore.csv

Large diffs are not rendered by default.

6,818 changes: 6,597 additions & 221 deletions data/viaf.json

Large diffs are not rendered by default.

1,082 changes: 1,000 additions & 82 deletions data/viaf_metadata.csv

Large diffs are not rendered by default.

1,082 changes: 1,000 additions & 82 deletions data/viaf_pidstore.csv

Large diffs are not rendered by default.

5 changes: 2 additions & 3 deletions rero_mef/agents/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,7 @@ def agents():
@agents.command()
@click.option('-k', '--enqueue', 'enqueue', is_flag=True, default=False,
help="Enqueue record creation.")
@click.option('-o', '--online', 'online', multiple=True,
default=['aidref', 'aggnd', 'agrero'])
@click.option('-o', '--online', 'online', multiple=True, default=[])
@click.option('-v', '--verbose', 'verbose', is_flag=True, default=False)
@click.option('-V', '--online_verbose', 'online_verbose', is_flag=True,
default=False)
Expand Down Expand Up @@ -155,7 +154,7 @@ def create_csv_viaf(viaf_file, output_directory, verbose):
err=True)

count = create_viaf_files(
viaf_input_file=viaf_file,
viaf_input_file_name=viaf_file,
viaf_pidstore_file_name=pidstore,
viaf_metadata_file_name=metadata,
verbose=verbose
Expand Down
16 changes: 14 additions & 2 deletions rero_mef/agents/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ def create_viaf_files(
) as viaf_pidstore,
open(
viaf_metadata_file_name, 'w', encoding='utf-8'
) as viaf_metadata,\
) as viaf_metadata,
open(
viaf_input_file_name, 'r', encoding='utf-8'
) as viaf_in_file):
Expand Down Expand Up @@ -233,7 +233,19 @@ def create_viaf_files(
previous_viaf_pid = viaf_pid
corresponding = fields[1].split('|')
if len(corresponding) == 2:
corresponding_data[corresponding[0]] = corresponding[1]
corresponding_data.setdefault(
corresponding[0], {'pid': corresponding[1]})
corresponding = fields[1].split('@')
if len(corresponding) == 2:
if corresponding[0] == 'Wikipedia':
# multiple wikipedia
corresponding_data.setdefault(corresponding[0], {})
corresponding_data[corresponding[0]].setdefault('url', [])
corresponding_data[corresponding[0]]['url'].append(
corresponding[1])
else:
corresponding_data.setdefault(
corresponding[0], {'url': corresponding[1]})
# save the last record
agent_pid += 1
if write_link_json(
Expand Down
117 changes: 102 additions & 15 deletions rero_mef/agents/viaf/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from ..api import Action, ReroIndexer, ReroMefRecord
from ..mef.api import AgentMefRecord
from ..utils import get_entity_class
from ...filter import exists_filter
from ...utils import get_entity_class, progressbar


Expand All @@ -56,6 +57,83 @@ class AgentViafRecord(ReroMefRecord):
name = 'viaf'
model_cls = ViafMetadata
search = AgentViafSearch
# VIAF source codes (see https://viaf.org/) mapped to the short source names
# used to build record keys: '<name>_pid' for the source identifier and
# '<name>' for the source URL. Commented-out entries are inactive.
sources = {
    'DNB': 'gnd',  # German National Library
    'SUDOC': 'idref',  # Sudoc [ABES], France
    'RERO': 'rero',  # RERO - Library Network of Western Switzerland
    'SZ': 'sz',  # Swiss National Library
    'BNE': 'bne',  # National Library of Spain
    'BNF': 'bnf',  # National Library of France
    'ICCU': 'iccu',  # Central Institute for the Union Catalogue of the Italian libraries  # noqa
    'ISNI': 'isni',  # ISNI
    # Trailing comma kept so that enabling an entry below stays valid syntax.
    'WKP': 'wiki',  # Wikidata
    # 'LC': 'loc',  # Library of Congress
    # 'SELIBR': 'selibr',  # National Library of Sweden
    # 'NLA': 'nla',  # National Library of Australia
    # 'PTBNP': 'ptbnp',  # National Library of Portugal
    # 'BLBNB': 'BLBNB',  # National Library of Brazil
    # 'NKC': 'nkc',  # National Library of the Czech Republic
    # 'J9U': 'j9u',  # National Library of Israel
    # 'EGAXA': 'egaxa',  # Library of Alexandria, Egypt
    # 'BAV': 'bav',  # Vatican Library
    # 'CAOONL': 'caoonl',  # Library and Archives Canada/PFAN
    # 'JPG': 'jpg',  # Union List of Artist Names [Getty Research Institute]  # noqa
    # 'NUKAT': 'nukat',  # NUKAT Center of Warsaw University Library
    # 'NSZL': 'NSZL',  # National Széchényi Library, Hungary
    # 'VLACC': 'vlacc',  # Flemish Public Libraries National Library of Russia  # noqa
    # 'NTA': 'nta',  # National Library of Netherlands
    # 'BIBSYS': 'bibsys',  # BIBSYS
    # 'GRATEVE': 'grateve',  # National Library of Greece
    # 'ARBABN': 'arbabn',  # National Library of Argentina
    # 'W2Z': 'w2z',  # National Library of Norway
    # 'DBC': 'dbc',  # DBC (Danish Bibliographic Center)
    # 'NDL': 'ndl',  # National Diet Library, Japan
    # 'NII': 'nii',  # NII (Japan)
    # 'NLB': 'nlb',  # National Library Board, Singapore
    # 'LNB': 'lnb',  # National Library of Latvia
    # 'PLWABN': 'plwabn',  # National Library of Poland
    # 'BNC': 'BNC',  # National Library of Catalonia
    # 'LNL': 'lnl',  # Lebanese National Library
    # 'PERSEUS': 'perseus',  # Perseus Digital Library
    # 'SRP': 'srp',  # Syriac Reference Portal
    # 'N6I': 'n6i',  # National Library of Ireland
    # 'NSK': 'nsk',  # National and University Library in Zagreb
    # 'CYT': 'cyt',  # National Central Library, Taiwan
    # 'B2Q': 'b2q',  # National Library and Archives of Québec
    # 'KRLNK': 'krlnk',  # National Library of Korea
    # 'BNL': 'BNL',  # National Library of Luxembourg
    # 'BNCHL': 'bnchl',  # National Library of Chile
    # 'MRBNR': 'mrbnr',  # National Library of Morocco
    # 'XA': 'xa',  # xA Extended Authorities
    # 'XR': 'xr',  # xR Extended Relationships
    # 'FAST': 'fast',  # FAST Subjects
    # 'ERRR': 'errr',  # National Library of Estonia
    # 'UIY': 'uiy',  # National and University Library of Iceland (NULI)
    # 'NYNYRILM': 'nynyrilm',  # Repertoire International de Litterature Musicale, Inc. (RILM)  # noqa
    # 'DE663': 'de663',  # International Inventory of Musical Sources (RISM)  # noqa
    # 'SIMACOB': 'simacob',  # NUK/COBISS.SI, Slovenia
    # 'LIH': 'lih',  # National Library of Lithuania
    # 'SKMASNL': 'skmasnl',  # Slovak National Library
    # 'UAE': 'uae',  # United Arab Emirates University
}
# NOTE(review): presumably the subset of source codes for which agent records
# are actually handled - confirm against the callers.
sources_used = ('DNB', 'SUDOC', 'RERO')

@classmethod
def filters(cls):
    """Build one exists-filter per source name.

    :returns: dict keyed by each value of ``cls.sources``; every value is
        ``exists_filter`` applied to the matching ``<name>_pid`` field.
    """
    source_filters = {}
    for name in cls.sources.values():
        source_filters[name] = exists_filter(f'{name}_pid')
    return source_filters

@classmethod
def aggregations(cls):
    """Build one exists-aggregation per source name.

    :returns: dict keyed by each value of ``cls.sources``; every value is a
        ``filter``/``exists`` definition on the matching ``<name>_pid``
        field.
    """
    source_aggs = {}
    for name in cls.sources.values():
        source_aggs[name] = {'filter': {'exists': {'field': f'{name}_pid'}}}
    return source_aggs

def create_mef_and_agents(self, dbcommit=False, reindex=False,
online=None, verbose=False,
Expand Down Expand Up @@ -195,11 +273,6 @@ def get_online_record(cls, viaf_source_code, pid, format=None):
link = get the VIAF link record
:returns: VIAF record as json
"""
source_code = {
'DNB': 'gnd_pid',
'SUDOC': 'idref_pid',
'RERO': 'rero_pid'
}
viaf_format = '/viaf.json'
if format == 'link':
viaf_format = '/justlinks.json'
Expand All @@ -217,19 +290,34 @@ def get_online_record(cls, viaf_source_code, pid, format=None):
if isinstance(sources, dict):
sources = [sources]
for source in sources:
text = source.get('#text', '|').split('|')
if text[0] in source_code:
result[source_code[text[0]]] = text[1]
# get pid
text = source.get('#text', '|')
text = text.split('|')
if bib_source := cls.sources.get(text[0]):
result[f'{bib_source}_pid'] = text[1]
# get URL
if nsid := source.get('@nsid'):
if nsid.startswith('http'):
result[bib_source] = nsid
# get Wikipedia and WorldCat URLs
if result.get('wiki_pid'):
x_links = data_json.get('xLinks', {}).get('xLink', [])
for x_link in x_links:
if 'worldcat' in x_link:
result['worldcat'] = x_link
elif isinstance(x_link, dict):
text = x_link.get('#text')
if text and 'wikipedia' in text:
result.setdefault('wiki', []).append(text)
# make sure we got a VIAF with the same pid for source
if result.get(source_code.get(viaf_source_code)) == pid:
if result.get(f'{cls.sources.get(viaf_source_code)}_pid') == pid:
return result

@classmethod
def get_viaf(cls, agent):
"""Get VIAF record by agent.
:param agent: Agent to get the corresponding VIAF record for.
:param online: Try to get VIAF record online if not exist.
"""
if isinstance(agent, AgentMefRecord):
return [cls.get_record_by_pid(agent.get('viaf_pid'))]
Expand Down Expand Up @@ -349,14 +437,13 @@ def get_missing_agent_pids(cls, agent, verbose=False):
def get_pids_with_multiple_viaf(cls, verbose=False):
"""Get agent pids with multiple MEF records.
:params record_types: Record types (pid_types).
:param verbose: Verbose.
:returns: pids, multiple pids, missing pids.
:returns: pids.
"""
multiple_pids = {
'gnd_pid': {},
'idref_pid': {},
'rero_pid': {}
'DNB': 'gnd_pid',
'SUDOC': 'idref_pid',
'RERO': 'rero_pid'
}
progress = progressbar(
items=AgentViafSearch()
Expand Down
117 changes: 111 additions & 6 deletions rero_mef/agents/viaf/jsonschemas/viaf/viaf-v0.0.1.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"$schema": "http://json-schema.org/draft-04/schema#",
"title": "Schema for VIAF authority",
"title": "Schema for VIAF contributor",
"type": "object",
"required": [
"$schema",
Expand All @@ -10,7 +10,7 @@
"properties": {
"$schema": {
"title": "Schema",
"description": "Schema to VIAF authority record.",
"description": "Schema to VIAF contributor record.",
"type": "string",
"minLength": 7
},
Expand All @@ -20,20 +20,125 @@
"minLength": 1
},
"gnd_pid": {
"title": "GND authority ID",
"title": "GND contributor ID",
"type": "string",
"minLength": 1
},
"gnd": {
"title": "GND contributor URL",
"type": "string",
"format": "uri",
"pattern": "^(https?)://.*$",
"minLength": 7
},
"idref_pid": {
"title": "IDREF contributor ID",
"type": "string",
"minLength": 1
},
"idref": {
  "title": "IDREF contributor URL",
  "type": "string",
  "format": "uri",
  "pattern": "^(https?)://.*$",
  "minLength": 7
},
"rero_pid": {
"title": "RERO authority ID",
"title": "RERO contributor ID",
"type": "string",
"minLength": 1
},
"idref_pid": {
"title": "IDREF authority ID",
"rero": {
"title": "RERO contributor URL",
"type": "string",
"format": "uri",
"pattern": "^(https?)://.*$",
"minLength": 7
},
"sz_pid": {
"title": "SZ contributor ID",
"type": "string",
"minLength": 1
},
"sz": {
"title": "SZ contributor URL",
"type": "string",
"format": "uri",
"pattern": "^(https?)://.*$",
"minLength": 7
},
"bne_pid": {
"title": "BNE contributor ID",
"type": "string",
"minLength": 1
},
"bne": {
"title": "BNE contributor URL",
"type": "string",
"format": "uri",
"pattern": "^(https?)://.*$",
"minLength": 7
},
"bnf_pid": {
"title": "BNF contributor ID",
"type": "string",
"minLength": 1
},
"bnf": {
"title": "BNF contributor URL",
"type": "string",
"format": "uri",
"pattern": "^(https?)://.*$",
"minLength": 7
},
"iccu_pid": {
"title": "ICCU contributor ID",
"type": "string",
"minLength": 1
},
"iccu": {
"title": "ICCU contributor URL",
"type": "string",
"format": "uri",
"pattern": "^(https?)://.*$",
"minLength": 7
},
"isni_pid": {
"title": "ISNI contributor ID",
"type": "string",
"minLength": 1
},
"isni": {
"title": "ISNI contributor URL",
"type": "string",
"format": "uri",
"pattern": "^(https?)://.*$",
"minLength": 7
},
"wiki_pid": {
"title": "Wikidata contributor ID",
"type": "string",
"minLength": 1
},
"wiki": {
"title": "Wikipedia links",
"type": "array",
"minItems": 1,
"items": {
"title": "Wikipedia link",
"type": "string",
"format": "uri",
"pattern": "^(https?)://.*$",
"minLength": 7
}
},
"worldcat": {
"title": "WorldCat contributor URL",
"type": "string",
"format": "uri",
"pattern": "^(https?)://.*$",
"minLength": 7
},
"md5": {
"title": "MD5",
"type": "string",
Expand Down
Loading

0 comments on commit 79711d2

Please sign in to comment.