Skip to content

Commit

Permalink
agents VIAF: add sources
Browse files Browse the repository at this point in the history
* Adds WorldCat, Wikipedia and source URLs

Co-Authored-by: Peter Weber <[email protected]>
  • Loading branch information
rerowep committed Apr 27, 2023
1 parent 3bd26d5 commit 6d12cfc
Show file tree
Hide file tree
Showing 25 changed files with 38,451 additions and 3,584 deletions.
7,305 changes: 6,260 additions & 1,045 deletions data/aggnd.json

Large diffs are not rendered by default.

459 changes: 420 additions & 39 deletions data/aggnd_metadata.csv

Large diffs are not rendered by default.

459 changes: 420 additions & 39 deletions data/aggnd_pidstore.csv

Large diffs are not rendered by default.

3,964 changes: 3,710 additions & 254 deletions data/agrero.json

Large diffs are not rendered by default.

8,383 changes: 7,339 additions & 1,044 deletions data/aidref.json

Large diffs are not rendered by default.

566 changes: 493 additions & 73 deletions data/aidref_metadata.csv

Large diffs are not rendered by default.

566 changes: 493 additions & 73 deletions data/aidref_pidstore.csv

Large diffs are not rendered by default.

7,821 changes: 7,469 additions & 352 deletions data/mef.json

Large diffs are not rendered by default.

909 changes: 909 additions & 0 deletions data/mef_id.csv

Large diffs are not rendered by default.

1,071 changes: 990 additions & 81 deletions data/mef_metadata.csv

Large diffs are not rendered by default.

1,071 changes: 990 additions & 81 deletions data/mef_pidstore.csv

Large diffs are not rendered by default.

6,818 changes: 6,597 additions & 221 deletions data/viaf.json

Large diffs are not rendered by default.

1,082 changes: 1,000 additions & 82 deletions data/viaf_metadata.csv

Large diffs are not rendered by default.

1,082 changes: 1,000 additions & 82 deletions data/viaf_pidstore.csv

Large diffs are not rendered by default.

3 changes: 1 addition & 2 deletions rero_mef/agents/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,7 @@ def agents():
@agents.command()
@click.option('-k', '--enqueue', 'enqueue', is_flag=True, default=False,
help="Enqueue record creation.")
@click.option('-o', '--online', 'online', multiple=True,
default=['aidref', 'aggnd', 'agrero'])
@click.option('-o', '--online', 'online', multiple=True, default=[])
@click.option('-v', '--verbose', 'verbose', is_flag=True, default=False)
@click.option('-V', '--online_verbose', 'online_verbose', is_flag=True,
default=False)
Expand Down
62 changes: 37 additions & 25 deletions rero_mef/agents/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from flask import current_app

from ..utils import get_entity_class, metadata_csv_line, \
number_records_in_file, pidstore_csv_line, progressbar, write_link_json
number_records_in_file, pidstore_csv_line, progressbar, write_viaf_json


def write_mef_files(pid, data, pidstore, metadata, ids):
Expand Down Expand Up @@ -191,12 +191,12 @@ def create_viaf_files(
:param verbose: Verbose.
:returns: count of processed VIAF records.
"""
from rero_mef.agents import AgentViafRecord
if verbose:
click.echo(' Start ...')

agent_pid = 0
corresponding_data = {}
count = 0
corresponding_data = {}
use = False
with (
open(
viaf_pidstore_file_name, 'w', encoding='utf-8'
Expand All @@ -212,39 +212,51 @@ def create_viaf_files(
fields = row.rstrip().split('\t')
assert len(fields) == 2
previous_viaf_pid = fields[0].split('/')[-1]
# go back to first line in file
viaf_in_file.seek(0)
for row in viaf_in_file:
fields = row.rstrip().split('\t')
assert len(fields) == 2
viaf_pid = fields[0].split('/')[-1]
if viaf_pid != previous_viaf_pid:
agent_pid += 1
if write_link_json(
agent='viaf',
pidstore_file=viaf_pidstore,
metadata_file=viaf_metadata,
viaf_pid=previous_viaf_pid,
corresponding_data=corresponding_data,
agent_pid=str(agent_pid),
verbose=verbose
):
if use:
write_viaf_json(
pidstore_file=viaf_pidstore,
metadata_file=viaf_metadata,
viaf_pid=previous_viaf_pid,
corresponding_data=corresponding_data,
verbose=verbose
)
count += 1
use = False
corresponding_data = {}
previous_viaf_pid = viaf_pid
corresponding = fields[1].split('|')
if len(corresponding) == 2:
corresponding_data[corresponding[0]] = corresponding[1]
corresponding_data.setdefault(
corresponding[0], {'pid': corresponding[1]})
if corresponding[0] in AgentViafRecord.sources_used:
use = True
corresponding = fields[1].split('@')
if len(corresponding) == 2:
if corresponding[0] == 'Wikipedia':
# multiple wikipedia
corresponding_data.setdefault(corresponding[0], {})
corresponding_data[corresponding[0]].setdefault('url', [])
corresponding_data[corresponding[0]]['url'].append(
corresponding[1])
else:
corresponding_data.setdefault(
corresponding[0], {'url': corresponding[1]})
# save the last record
agent_pid += 1
if write_link_json(
agent='viaf',
pidstore_file=viaf_pidstore,
metadata_file=viaf_metadata,
viaf_pid=previous_viaf_pid,
corresponding_data=corresponding_data,
agent_pid=str(agent_pid),
verbose=verbose
):
if use:
write_viaf_json(
pidstore_file=viaf_pidstore,
metadata_file=viaf_metadata,
viaf_pid=previous_viaf_pid,
corresponding_data=corresponding_data,
verbose=verbose
)
count += 1
if verbose:
click.echo(f' VIAF records created: {count}')
Expand Down
136 changes: 110 additions & 26 deletions rero_mef/agents/viaf/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

"""API for manipulating VIAF record."""

from copy import deepcopy

import click
import requests
from elasticsearch_dsl.query import Q
Expand All @@ -30,6 +32,7 @@
from ..api import Action, ReroIndexer, ReroMefRecord
from ..mef.api import AgentMefRecord
from ..utils import get_entity_class
from ...filter import exists_filter
from ...utils import get_entity_class, progressbar, requests_retry_session


Expand All @@ -56,6 +59,83 @@ class AgentViafRecord(ReroMefRecord):
name = 'viaf'
model_cls = ViafMetadata
search = AgentViafSearch
# https://viaf.org/
sources = {
'DNB': 'gnd', # German National Library
'SUDOC': 'idref', # Sudoc [ABES], France
'RERO': 'rero', # RERO - Library Network of Western Switzerland
'SZ': 'sz', # Swiss National Library
'BNE': 'bne', # National Library of Spain
'BNF': 'bnf', # National Library of France
'ICCU': 'iccu', # Central Institute for the Union Catalogue of the Italian libraries # noqa
'ISNI': 'isni', # ISNI
'WKP': 'wiki' # Wikidata
# 'LC': 'loc', # Library of Congress
# 'SELIBR': 'selibr', # National Library of Sweden
# 'NLA': 'nla', # National Library of Australia
# 'PTBNP': 'ptbnp', # National Library of Portugal
# 'BLBNB': 'BLBNB', # National Library of Brazil
# 'NKC': 'nkc', # National Library of the Czech Republic
# 'J9U': 'j9u', # National Library of Israel
# 'EGAXA': 'egaxa', # Library of Alexandria, Egypt
# 'BAV': 'bav', # Vatican Library
# 'CAOONL': 'caoonl', # Library and Archives Canada/PFAN
# 'JPG': 'jpg', # Union List of Artist Names [Getty Research Institute] # noqa
# 'NUKAT': 'nukat', # NUKAT Center of Warsaw University Library
# 'NSZL': 'NSZL', # National Széchényi Library, Hungary
# 'VLACC': 'vlacc', # Flemish Public Libraries National Library of Russia # noqa
# 'NTA': 'nta', # National Library of Netherlands
# 'BIBSYS': 'bibsys', # BIBSYS
# 'GRATEVE': 'grateve', # National Library of Greece
# 'ARBABN': 'arbabn', # National Library of Argentina
# 'W2Z': 'w2z', # National Library of Norway
# 'DBC': 'dbc', # DBC (Danish Bibliographic Center)
# 'NDL': 'ndl', # National Diet Library, Japan
# 'NII': 'nii', # NII (Japan)
# 'NLB': 'nlb', # National Library Board, Singapore
# 'LNB': 'lnb', # National Library of Latvia
# 'PLWABN': 'plwabn', # National Library of Poland
# 'BNC': 'BNC', # National Library of Catalonia
# 'LNL': 'lnl', # Lebanese National Library
# 'PERSEUS': 'perseus', # Perseus Digital Library
# 'SRP': 'srp', # Syriac Reference Portal
# 'N6I': 'n6i', # National Library of Ireland
# 'NSK': 'nsk', # National and University Library in Zagreb
# 'CYT': 'cyt', # National Central Library, Taiwan
# 'B2Q': 'b2q', # National Library and Archives of Québec
# 'KRLNK': 'krlnk', # National Library of Korea
# 'BNL': 'BNL', # National Library of Luxembourg
# 'BNCHL': 'bnchl', # National Library of Chile
# 'MRBNR': 'mrbnr', # National Library of Morocco
# 'XA': 'xa', # xA Extended Authorities
# 'XR': 'xr', # xR Extended Relationships
# 'FAST': 'fast', # FAST Subjects
# 'ERRR': 'errr', # National Library of Estonia
# 'UIY': 'uiy', # National and University Library of Iceland (NULI)
# 'NYNYRILM': 'nynyrilm', # Repertoire International de Litterature Musicale, Inc. (RILM) # noqa
# 'DE663': 'de663', # International Inventory of Musical Sources (RISM) # noqa
# 'SIMACOB': 'simacob', # NUK/COBISS.SI, Slovenia
# 'LIH': 'lih', # National Library of Lithuania
# 'SKMASNL': 'skmasnl', # Slovak National Library
# 'UAE': 'uae', # United Arab Emirates University
}
sources_used = ('DNB', 'SUDOC', 'RERO')

@classmethod
def filters(cls):
    """Build one filter per configured VIAF source.

    :returns: dict keyed by the short source name (the values of
        ``cls.sources``); each value is the result of ``exists_filter``
        called with the matching ``<source>_pid`` field name.
    """
    source_filters = {}
    for name in cls.sources.values():
        # every source stores its identifier in a '<name>_pid' field
        source_filters[name] = exists_filter(f'{name}_pid')
    return source_filters

@classmethod
def aggregations(cls):
    """Build one ``exists`` filter aggregation per configured VIAF source.

    :returns: dict keyed by the short source name (the values of
        ``cls.sources``); each value is a nested dict of the form
        ``{'filter': {'exists': {'field': '<source>_pid'}}}``.
    """
    source_aggs = {}
    for name in cls.sources.values():
        # every source stores its identifier in a '<name>_pid' field
        pid_field = f'{name}_pid'
        source_aggs[name] = {'filter': {'exists': {'field': pid_field}}}
    return source_aggs

def create_mef_and_agents(self, dbcommit=False, reindex=False,
online=None, verbose=False,
Expand Down Expand Up @@ -205,11 +285,6 @@ def get_online_record(cls, viaf_source_code, pid, format=None):
link = get the VIAF link record
:returns: VIAF record as json
"""
source_code = {
'DNB': 'gnd_pid',
'SUDOC': 'idref_pid',
'RERO': 'rero_pid'
}
viaf_format = '/viaf.json'
if format == 'link':
viaf_format = '/justlinks.json'
Expand All @@ -229,11 +304,27 @@ def get_online_record(cls, viaf_source_code, pid, format=None):
if isinstance(sources, dict):
sources = [sources]
for source in sources:
text = source.get('#text', '|').split('|')
if text[0] in source_code:
result[source_code[text[0]]] = text[1]
# get pid
text = source.get('#text', '|')
text = text.split('|')
if bib_source := cls.sources.get(text[0]):
result[f'{bib_source}_pid'] = text[1]
# get URL
if nsid := source.get('@nsid'):
if nsid.startswith('http'):
result[bib_source] = nsid
# get Wikipedia and WorldCat URLs
if result.get('wiki_pid'):
x_links = data_json.get('xLinks', {}).get('xLink', [])
for x_link in x_links:
if 'worldcat' in x_link:
result['worldcat'] = x_link
elif isinstance(x_link, dict):
text = x_link.get('#text')
if text and 'wikipedia' in text:
result.setdefault('wiki', []).append(text)
# make sure we got a VIAF with the same pid for source
if result.get(source_code.get(viaf_source_code)) == pid:
if result.get(f'{cls.sources.get(viaf_source_code)}_pid') == pid:
return result, msg
return {}, f'VIAF get: {pid:<15} {url} | NO RECORD'

Expand All @@ -242,7 +333,6 @@ def get_viaf(cls, agent):
"""Get VIAF record by agent.
:param agent: Agency do get corresponding VIAF record.
:param online: Try to get VIAF record online if not exist.
"""
if isinstance(agent, AgentMefRecord):
return [cls.get_record_by_pid(agent.get('viaf_pid'))]
Expand Down Expand Up @@ -362,15 +452,14 @@ def get_missing_agent_pids(cls, agent, verbose=False):
def get_pids_with_multiple_viaf(cls, verbose=False):
"""Get agent pids with multiple MEF records.
:params record_types: Record types (pid_types).
:param verbose: Verbose.
:returns: pids, multiple pids, missing pids.
:returns: pids.
"""
multiple_pids = {
'gnd_pid': {},
'idref_pid': {},
'rero_pid': {}
f'{cls.sources.get(source)}_pid': {}
for source in cls.sources_used
}
cleaned_pids = deepcopy(multiple_pids)
progress = progressbar(
items=AgentViafSearch()
.params(preserve_order=True)
Expand All @@ -382,19 +471,14 @@ def get_pids_with_multiple_viaf(cls, verbose=False):
for hit in progress:
viaf_pid = hit.pid
data = hit.to_dict()
for agent in multiple_pids:
if pid := data.get(agent):
multiple_pids[agent].setdefault(pid, [])
multiple_pids[agent][pid].append(viaf_pid)
cleaned_pids = {
'gnd_pid': {},
'idref_pid': {},
'rero_pid': {}
}
for agent, pids in multiple_pids.items():
for source in multiple_pids:
if pid := data.get(source):
multiple_pids[source].setdefault(pid, [])
multiple_pids[source][pid].append(viaf_pid)
for source, pids in multiple_pids.items():
for pid, viaf_pids in pids.items():
if len(viaf_pids) > 1:
cleaned_pids[agent][pid] = viaf_pids
cleaned_pids[source][pid] = viaf_pids
return cleaned_pids


Expand Down
Loading

0 comments on commit 6d12cfc

Please sign in to comment.