Skip to content

Commit

Permalink
Merge pull request #220 from ncbo/multilingual_preflabels
Browse files Browse the repository at this point in the history
Multilingual prefLabels
  • Loading branch information
alexskr authored Oct 15, 2024
2 parents 812dd78 + fbd7548 commit 93e1bd3
Show file tree
Hide file tree
Showing 7 changed files with 26,183 additions and 256 deletions.
14 changes: 8 additions & 6 deletions Gemfile.lock
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
GIT
remote: https://github.com/ncbo/goo.git
revision: 74a012eebb9433d031eb00df5abbe488cb8b4512
revision: d4da86d07a449e91dbbd6b72763f42d3ba3f20f3
branch: develop
specs:
goo (0.0.2)
Expand Down Expand Up @@ -87,9 +87,10 @@ GEM
net-pop
net-smtp
method_source (1.1.0)
mime-types (3.5.2)
mime-types (3.6.0)
logger
mime-types-data (~> 3.2015)
mime-types-data (3.2024.0903)
mime-types-data (3.2024.1001)
mini_mime (1.1.5)
minitest (4.7.5)
minitest-reporters (0.14.24)
Expand All @@ -101,7 +102,7 @@ GEM
net-http (0.4.1)
uri
net-http-persistent (2.9.4)
net-imap (0.4.16)
net-imap (0.4.17)
date
net-protocol
net-pop (0.1.2)
Expand Down Expand Up @@ -130,7 +131,7 @@ GEM
method_source (~> 1.0)
public_suffix (6.0.1)
racc (1.8.1)
rack (2.2.9)
rack (2.2.10)
rack-test (0.8.3)
rack (>= 1.0, < 3)
rainbow (3.1.1)
Expand All @@ -153,7 +154,7 @@ GEM
rsolr (2.6.0)
builder (>= 2.1.2)
faraday (>= 0.9, < 3, != 2.0.0)
rubocop (1.66.1)
rubocop (1.67.0)
json (~> 2.3)
language_server-protocol (>= 3.17.0)
parallel (~> 1.10)
Expand Down Expand Up @@ -192,6 +193,7 @@ GEM
PLATFORMS
aarch64-linux
arm64-darwin-22
arm64-darwin-23
x86_64-linux

DEPENDENCIES
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ def process(logger, options = {})

def handle_missing_labels(file_path, logger)
callbacks = {
include_languages: true,
missing_labels: {
op_name: 'Missing Labels Generation',
required: true,
Expand Down Expand Up @@ -60,6 +61,10 @@ def loop_classes(logger, raw_paging, submission, callbacks)
size = 2500
count_classes = 0
acr = submission.id.to_s.split("/")[-1]

# include all languages in attributes of classes if asked for
incl_lang = callbacks.delete(:include_languages)
RequestStore.store[:requested_lang] = :ALL if incl_lang
operations = callbacks.values.map { |v| v[:op_name] }.join(", ")

time = Benchmark.realtime do
Expand Down Expand Up @@ -161,6 +166,7 @@ def loop_classes(logger, raw_paging, submission, callbacks)
@submission.save
end
end
RequestStore.store[:requested_lang] = nil if incl_lang
end

def generate_missing_labels_pre(artifacts = {}, logger, paging)
Expand All @@ -185,26 +191,35 @@ def generate_missing_labels_each(artifacts = {}, logger, paging, page_classes, p
prefLabel = nil

if c.prefLabel.nil?
rdfs_labels = c.label
lang_rdfs_labels = c.label(include_languages: true)
lang_rdfs_labels = {none: []} if lang_rdfs_labels.empty?

if rdfs_labels && rdfs_labels.length > 1 && c.synonym.length > 0
rdfs_labels = (Set.new(c.label) - Set.new(c.synonym)).to_a.first
lang_rdfs_labels&.each do |lang, rdfs_labels|
if rdfs_labels && rdfs_labels.length > 1 && c.synonym.length > 0
rdfs_labels = (Set.new(c.label) - Set.new(c.synonym)).to_a.first

rdfs_labels = c.label if rdfs_labels.nil? || rdfs_labels.length == 0
end
rdfs_labels = c.label if rdfs_labels.nil? || rdfs_labels.length == 0
end

rdfs_labels = [rdfs_labels] if rdfs_labels and not (rdfs_labels.instance_of? Array)
label = nil
rdfs_labels = [rdfs_labels] if rdfs_labels and not (rdfs_labels.instance_of? Array)
label = nil

if rdfs_labels && rdfs_labels.length > 0
# this sort is needed for a predictable label selection
label = rdfs_labels.sort[0]
else
label = LinkedData::Utils::Triples.last_iri_fragment c.id.to_s
if rdfs_labels && rdfs_labels.length > 0
# this sort is needed for a predictable label selection
label = rdfs_labels.sort[0]
else
label = LinkedData::Utils::Triples.last_iri_fragment c.id.to_s
end

if lang === :none
lang = nil
prefLabel = label
end
prefLabel = label if !prefLabel && lang === Goo.portal_language
prefLabel = label unless prefLabel
artifacts[:label_triples] << LinkedData::Utils::Triples.label_for_class_triple(
c.id, Goo.vocabulary(:metadata_def)[:prefLabel], label, lang)
end
artifacts[:label_triples] << LinkedData::Utils::Triples.label_for_class_triple(
c.id, Goo.vocabulary(:metadata_def)[:prefLabel], label)
prefLabel = label
else
prefLabel = c.prefLabel
end
Expand Down Expand Up @@ -381,238 +396,6 @@ def delete_and_append(triples_file_path, logger, mime_type = nil)
logger.flush
end

# Runs a single lifecycle action for every registered callback and prunes the
# callbacks that failed.
#
# For each entry of +callbacks+ whose hash has a value under +action_name+,
# that value is resolved to a method of @submission and yielded to the given
# block together with the callback hash. Entries without that key are left
# untouched.
#
# When the block raises a StandardError the error is logged, the callback's
# error status (if it has a :status) is recorded and saved on @submission, and:
# * the error is re-raised when the callback is marked :required;
# * otherwise the callback is removed from +callbacks+ so that later actions
#   skip it.
#
# Only StandardError is handled. Process-level exceptions (Interrupt,
# SystemExit, NoMemoryError, ...) are deliberately left to propagate instead of
# being swallowed for optional callbacks.
#
# @param logger [#error, #flush] destination for failure reports
# @param callbacks [Hash] callback name => callback hash; mutated in place
# @param action_name [Symbol] key of the hook to run, e.g. :caller_on_pre
# @yield [callable, callback] the bound method and the callback hash it belongs to
# @return [Hash] the +callbacks+ hash that was passed in
# @raise [StandardError] whatever a :required callback raised
def process_callbacks(logger, callbacks, action_name, &block)
  callbacks.delete_if do |_, callback|
    begin
      if callback[action_name]
        callable = @submission.method(callback[action_name])
        yield(callable, callback)
      end
      # keep the callback: it either succeeded or does not define this action
      false
    rescue StandardError => e
      logger.error("#{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}")
      logger.flush

      if callback[:status]
        add_submission_status(callback[:status].get_error_status)
        @submission.save
      end

      # halt the entire processing if :required is set to true
      raise if callback[:required]
      # continue processing of other callbacks, but not this one
      true
    end
  end
end

# Pages through the classes selected by +raw_paging+ (2500 per page) and drives
# every registered callback through its lifecycle hooks.
#
# Hooks are looked up by key in each callback hash and dispatched through
# process_callbacks in this order: :caller_on_pre (once), then for every page
# :caller_on_pre_page, :caller_on_each (once per class, only when at least one
# callback defines it) and :caller_on_post_page, and finally :caller_on_post
# (once). Every callback gets an :artifacts hash (created here when absent)
# that is handed to each of its hooks; :count_classes is stored in it right
# before the post hook runs.
#
# A page that comes back empty while more records are expected is fetched
# again, up to LinkedData.settings.num_retries_4store times with a 2 second
# pause before each attempt.
#
# After the loop, each callback still present in +callbacks+ that has a
# :status gets that status added to the submission, which is then saved.
#
# @param logger [#info, #error, #flush] progress and error log
# @param raw_paging paging object; page(page, size) is called on it
# @param callbacks [Hash] callback name => callback hash (:op_name, hook keys,
#   optional :artifacts, :status)
# @raise [RuntimeError] when a page is still empty after all retries
def loop_classes(logger, raw_paging, callbacks)
page = 1
size = 2500
count_classes = 0
# last path segment of the submission id; used only in log and error messages
acr = @submission.id.to_s.split("/")[-1]
operations = callbacks.values.map { |v| v[:op_name] }.join(", ")

time = Benchmark.realtime do
paging = raw_paging.page(page, size)
cls_count_set = false
cls_count = class_count(logger)

if cls_count > -1
# prevent a COUNT SPARQL query if possible
paging.page_count_set(cls_count)
cls_count_set = true
else
# count unknown up front: accumulate it page by page below
cls_count = 0
end

iterate_classes = false
# 1. init artifacts hash if not explicitly passed in the callback
# 2. determine if class-level iteration is required
callbacks.each { |_, callback| callback[:artifacts] ||= {}; iterate_classes = true if callback[:caller_on_each] }

process_callbacks(logger, callbacks, :caller_on_pre) {
|callable, callback| callable.call(callback[:artifacts], logger, paging) }

page_len = -1
prev_page_len = -1

# do-while: the body runs at least once and repeats until `page` becomes nil
begin
t0 = Time.now
page_classes = paging.page(page, size).all
total_pages = page_classes.total_pages
page_len = page_classes.length

# nothing retrieved even though we're expecting more records
if total_pages > 0 && page_classes.empty? && (prev_page_len == -1 || prev_page_len == size)
j = 0
num_calls = LinkedData.settings.num_retries_4store

while page_classes.empty? && j < num_calls do
j += 1
logger.error("Empty page encountered. Retrying #{j} times...")
sleep(2)
page_classes = paging.page(page, size).all
logger.info("Success retrieving a page of #{page_classes.length} classes after retrying #{j} times...") unless page_classes.empty?
end

if page_classes.empty?
msg = "Empty page #{page} of #{total_pages} persisted after retrying #{j} times. #{operations} of #{acr} aborted..."
logger.error(msg)
raise msg
end
end

# an empty page that did not qualify for a retry ends the iteration
if page_classes.empty?
if total_pages > 0
logger.info("The number of pages reported for #{acr} - #{total_pages} is higher than expected #{page - 1}. Completing #{operations}...")
else
logger.info("Ontology #{acr} contains #{total_pages} pages...")
end
break
end

prev_page_len = page_len
logger.info("#{acr}: page #{page} of #{total_pages} - #{page_len} ontology terms retrieved in #{Time.now - t0} sec.")
logger.flush
count_classes += page_classes.length

process_callbacks(logger, callbacks, :caller_on_pre_page) {
|callable, callback| callable.call(callback[:artifacts], logger, paging, page_classes, page) }

page_classes.each { |c|
process_callbacks(logger, callbacks, :caller_on_each) {
|callable, callback| callable.call(callback[:artifacts], logger, paging, page_classes, page, c) }
} if iterate_classes

process_callbacks(logger, callbacks, :caller_on_post_page) {
|callable, callback| callable.call(callback[:artifacts], logger, paging, page_classes, page) }
cls_count += page_classes.length unless cls_count_set

# nil terminates the do-while below
page = page_classes.next? ? page + 1 : nil
end while !page.nil?

callbacks.each { |_, callback| callback[:artifacts][:count_classes] = cls_count }
process_callbacks(logger, callbacks, :caller_on_post) {
|callable, callback| callable.call(callback[:artifacts], logger, paging) }
end

logger.info("Completed #{operations}: #{acr} in #{time} sec. #{count_classes} classes.")
logger.flush

# set the status on actions that have completed successfully
callbacks.each do |_, callback|
if callback[:status]
add_submission_status(callback[:status])
@submission.save
end
end
end

# Pre hook of the missing-labels operation.
#
# Derives the "labels.ttl" and "mappings.ttl" paths from the directory of
# artifacts[:file_path], appends the submission's custom-property triples to
# the triple store, and opens both files for writing. The custom-property
# triples are also written to the labels file.
#
# Adds :save_in_file, :save_in_file_mappings, :fsave and :fsave_mappings to
# +artifacts+. The two IO handles are left open on purpose; this method does
# not close them.
#
# @param artifacts [Hash] must contain :file_path
# @param logger unused here
# @param paging unused here
def generate_missing_labels_pre(artifacts = {}, logger, paging)
  output_dir = File.dirname(artifacts[:file_path])
  artifacts[:save_in_file] = File.join(output_dir, "labels.ttl")
  artifacts[:save_in_file_mappings] = File.join(output_dir, "mappings.ttl")

  custom_property_rdf = LinkedData::Utils::Triples.rdf_for_custom_properties(@submission)
  Goo.sparql_data_client.append_triples(@submission.id, custom_property_rdf, "application/x-turtle")

  labels_io = File.open(artifacts[:save_in_file], "w")
  labels_io.write(custom_property_rdf)
  mappings_io = File.open(artifacts[:save_in_file_mappings], "w")

  artifacts[:fsave] = labels_io
  artifacts[:fsave_mappings] = mappings_io
end

# Pre-page hook of the missing-labels operation: gives the page two fresh,
# empty buffers in +artifacts+, one for label triples and one for mapping
# triples. The remaining parameters are unused here.
def generate_missing_labels_pre_page(artifacts = {}, logger, paging, page_classes, page)
  artifacts.store(:label_triples, [])
  artifacts.store(:mapping_triples, [])
end

# Per-class hook of the missing-labels operation.
#
# When the class +c+ has no prefLabel, one is generated and a prefLabel triple
# is appended to artifacts[:label_triples]. The label is chosen as follows:
# 1. start from the class's labels (c.label);
# 2. when there are several labels and the class has synonyms, drop the labels
#    that are also synonyms, unless that would leave nothing;
# 3. take the smallest remaining label in sort order, so the choice does not
#    depend on the order in which the labels were returned;
# 4. with no label at all, fall back to the last fragment of the class IRI.
#
# Unless the ontology is a view (@submission.ontology.viewOf is set), mapping
# triples are appended to artifacts[:mapping_triples]: a loom mapping when the
# loom-transformed prefLabel is longer than 2 characters, and always a
# same-URI mapping.
#
# @param artifacts [Hash] must hold the :label_triples and :mapping_triples arrays
# @param logger unused here
# @param paging unused here
# @param page_classes unused here
# @param page unused here
# @param c the class being processed; id, prefLabel, label and synonym are read
def generate_missing_labels_each(artifacts = {}, logger, paging, page_classes, page, c)
  pref_label = c.prefLabel

  if pref_label.nil?
    # normalise to an array so a nil or scalar label cannot break the logic below
    candidates = c.label
    candidates = [candidates] unless candidates.nil? || candidates.is_a?(Array)
    candidates ||= []

    if candidates.length > 1 && c.synonym.length > 0
      non_synonyms = candidates - c.synonym.to_a
      # if every label is also a synonym, keep the full list
      candidates = non_synonyms unless non_synonyms.empty?
    end

    # min (not first) is needed for a predictable label selection
    pref_label = candidates.min || LinkedData::Utils::Triples.last_iri_fragment(c.id.to_s)
    artifacts[:label_triples] << LinkedData::Utils::Triples.label_for_class_triple(
      c.id, Goo.vocabulary(:metadata_def)[:prefLabel], pref_label)
  end

  # views get no mapping triples
  return unless @submission.ontology.viewOf.nil?

  loom_label = OntologySubmission.loom_transform_literal(pref_label.to_s)

  if loom_label.length > 2
    artifacts[:mapping_triples] << LinkedData::Utils::Triples.loom_mapping_triple(
      c.id, Goo.vocabulary(:metadata_def)[:mappingLoom], loom_label)
  end
  artifacts[:mapping_triples] << LinkedData::Utils::Triples.uri_mapping_triple(
    c.id, Goo.vocabulary(:metadata_def)[:mappingSameURI], c.id)
end

# Post-page hook of the missing-labels operation: flushes the triples collected
# for the current page.
#
# The REST mappings returned by LinkedData::Mappings.migrate_rest_mappings for
# the ontology acronym are first appended to artifacts[:mapping_triples]. Each
# non-empty buffer (labels, then mappings) is then joined with newlines,
# written to its file handle (artifacts[:fsave] / artifacts[:fsave_mappings])
# and appended to the triple store as Turtle, with the elapsed time logged.
#
# Note that a flushed buffer is replaced in +artifacts+ by its joined String.
#
# @param artifacts [Hash] holds the buffers and the open file handles
# @param logger [#info, #flush]
# @param paging unused here
# @param page_classes unused here
# @param page page number, used in the "no labels" log message
def generate_missing_labels_post_page(artifacts = {}, logger, paging, page_classes, page)
  migrated = LinkedData::Mappings.migrate_rest_mappings(@submission.ontology.acronym)
  artifacts[:mapping_triples].concat(migrated)

  if artifacts[:label_triples].empty?
    logger.info("No labels generated in page #{page}.")
    logger.flush
  else
    logger.info("Asserting #{artifacts[:label_triples].length} labels in " +
                "#{@submission.id.to_ntriples}")
    logger.flush
    label_payload = artifacts[:label_triples].join("\n")
    artifacts[:label_triples] = label_payload
    artifacts[:fsave].write(label_payload)
    started_at = Time.now
    Goo.sparql_data_client.append_triples(@submission.id, label_payload, "application/x-turtle")
    finished_at = Time.now
    logger.info("Labels asserted in #{finished_at - started_at} sec.")
    logger.flush
  end

  return if artifacts[:mapping_triples].empty?

  logger.info("Asserting #{artifacts[:mapping_triples].length} mappings in " +
              "#{@submission.id.to_ntriples}")
  logger.flush
  mapping_payload = artifacts[:mapping_triples].join("\n")
  artifacts[:mapping_triples] = mapping_payload
  artifacts[:fsave_mappings].write(mapping_payload)

  started_at = Time.now
  Goo.sparql_data_client.append_triples(@submission.id, mapping_payload, "application/x-turtle")
  finished_at = Time.now
  logger.info("Mapping labels asserted in #{finished_at - started_at} sec.")
  logger.flush
end

# Post hook of the missing-labels operation: logs how many classes were
# traversed and where the generated labels were saved, then closes the two
# file handles stored in +artifacts+ (:fsave and :fsave_mappings).
#
# @param artifacts [Hash] reads :count_classes, :save_in_file, :fsave, :fsave_mappings
# @param logger [#info, #flush]
# @param paging unused here
def generate_missing_labels_post(artifacts = {}, logger, paging)
  traversed = artifacts[:count_classes]
  labels_file = artifacts[:save_in_file]

  logger.info("end generate_missing_labels traversed #{traversed} classes")
  logger.info("Saved generated labels in #{labels_file}")

  %i[fsave fsave_mappings].each { |handle| artifacts[handle].close }
  logger.flush
end

def generate_obsolete_classes(logger, file_path)
@submission.bring(:obsoleteProperty) if @submission.bring?(:obsoleteProperty)
@submission.bring(:obsoleteParent) if @submission.bring?(:obsoleteParent)
Expand Down
11 changes: 9 additions & 2 deletions lib/ontologies_linked_data/utils/triples.rb
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,17 @@ def self.rdf_for_custom_properties(ont_sub)
return (triples.join "\n")
end

def self.label_for_class_triple(class_id,property,label)
def self.label_for_class_triple(class_id, property, label, language=nil)
label = label.to_s.gsub('\\','\\\\\\\\')
label = label.gsub('"','\"')
return triple(class_id,property,RDF::Literal.new(label, :datatype => RDF::XSD.string))
params = { datatype: RDF::XSD.string }
lang = language.to_s.downcase

if !lang.empty? && lang.to_sym != :none
params[:datatype] = RDF.langString
params[:language] = lang.to_sym
end
return triple(class_id, property, RDF::Literal.new(label, params))
end

def self.generated_label(class_id, existing_label)
Expand Down
Loading

0 comments on commit 93e1bd3

Please sign in to comment.