Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Multilingual prefLabels #220

Merged
merged 3 commits into from
Oct 15, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 11 additions & 15 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,10 @@ GEM
launchy (>= 2.1, < 4.0)
mail (~> 2.7)
eventmachine (1.2.7)
faraday (2.12.0)
faraday-net_http (>= 2.0, < 3.4)
json
logger
faraday-net_http (3.3.0)
net-http
ffi (1.17.0-aarch64-linux-gnu)
ffi (1.17.0-arm64-darwin)
ffi (1.17.0-x86_64-linux-gnu)
faraday (1.2.0)
multipart-post (>= 1.2, < 3)
ruby2_keywords
ffi (1.17.0)
hashie (5.0.0)
htmlentities (4.3.4)
http-accept (1.7.0)
Expand All @@ -87,9 +82,10 @@ GEM
net-pop
net-smtp
method_source (1.1.0)
mime-types (3.5.2)
mime-types (3.6.0)
logger
mime-types-data (~> 3.2015)
mime-types-data (3.2024.0903)
mime-types-data (3.2024.1001)
mini_mime (1.1.5)
minitest (4.7.5)
minitest-reporters (0.14.24)
Expand All @@ -98,8 +94,7 @@ GEM
minitest (>= 2.12, < 5.0)
powerbar
multi_json (1.15.0)
net-http (0.4.1)
uri
multipart-post (2.4.1)
net-http-persistent (2.9.4)
net-imap (0.4.16)
date
Expand Down Expand Up @@ -128,7 +123,7 @@ GEM
pry (0.14.2)
coderay (~> 1.1)
method_source (~> 1.0)
public_suffix (6.0.1)
public_suffix (5.1.1)
racc (1.8.1)
rack (2.2.9)
rack-test (0.8.3)
Expand Down Expand Up @@ -166,6 +161,7 @@ GEM
rubocop-ast (1.32.3)
parser (>= 3.3.1.0)
ruby-progressbar (1.13.0)
ruby2_keywords (0.0.5)
rubyzip (1.3.0)
simplecov (0.22.0)
docile (~> 1.1)
Expand All @@ -185,13 +181,13 @@ GEM
timeout (0.4.1)
tzinfo (0.3.62)
unicode-display_width (2.6.0)
uri (0.13.1)
uuid (2.3.9)
macaddr (~> 1.0)

PLATFORMS
aarch64-linux
arm64-darwin-22
arm64-darwin-23
x86_64-linux

DEPENDENCIES
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ def process(logger, options = {})

def handle_missing_labels(file_path, logger)
callbacks = {
include_languages: true,
missing_labels: {
op_name: 'Missing Labels Generation',
required: true,
Expand Down Expand Up @@ -60,6 +61,10 @@ def loop_classes(logger, raw_paging, submission, callbacks)
size = 2500
count_classes = 0
acr = submission.id.to_s.split("/")[-1]

# include all languages in attributes of classes if asked for
incl_lang = callbacks.delete(:include_languages)
RequestStore.store[:requested_lang] = :ALL if incl_lang
operations = callbacks.values.map { |v| v[:op_name] }.join(", ")

time = Benchmark.realtime do
Expand Down Expand Up @@ -161,6 +166,7 @@ def loop_classes(logger, raw_paging, submission, callbacks)
@submission.save
end
end
RequestStore.store[:requested_lang] = nil if incl_lang
end

def generate_missing_labels_pre(artifacts = {}, logger, paging)
Expand All @@ -185,26 +191,35 @@ def generate_missing_labels_each(artifacts = {}, logger, paging, page_classes, p
prefLabel = nil

if c.prefLabel.nil?
rdfs_labels = c.label
lang_rdfs_labels = c.label(include_languages: true)
lang_rdfs_labels = {none: []} if lang_rdfs_labels.empty?

if rdfs_labels && rdfs_labels.length > 1 && c.synonym.length > 0
rdfs_labels = (Set.new(c.label) - Set.new(c.synonym)).to_a.first
lang_rdfs_labels&.each do |lang, rdfs_labels|
if rdfs_labels && rdfs_labels.length > 1 && c.synonym.length > 0
rdfs_labels = (Set.new(c.label) - Set.new(c.synonym)).to_a.first

rdfs_labels = c.label if rdfs_labels.nil? || rdfs_labels.length == 0
end
rdfs_labels = c.label if rdfs_labels.nil? || rdfs_labels.length == 0
end

rdfs_labels = [rdfs_labels] if rdfs_labels and not (rdfs_labels.instance_of? Array)
label = nil
rdfs_labels = [rdfs_labels] if rdfs_labels and not (rdfs_labels.instance_of? Array)
label = nil

if rdfs_labels && rdfs_labels.length > 0
# this sort is needed for a predictable label selection
label = rdfs_labels.sort[0]
else
label = LinkedData::Utils::Triples.last_iri_fragment c.id.to_s
if rdfs_labels && rdfs_labels.length > 0
# this sort is needed for a predictable label selection
label = rdfs_labels.sort[0]
else
label = LinkedData::Utils::Triples.last_iri_fragment c.id.to_s
end

if lang === :none
lang = nil
prefLabel = label
end
prefLabel = label if !prefLabel && lang === Goo.portal_language

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can be simplified too:

prefLabel ||= label if lang == Goo.portal_language
prefLabel ||= label

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, Goo.portal_language does not exist; use Goo.main_languages.first instead.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Goo.portal_language is a change that I implemented in GOO. I plan to create a PR for that shortly.

prefLabel = label unless prefLabel
artifacts[:label_triples] << LinkedData::Utils::Triples.label_for_class_triple(
c.id, Goo.vocabulary(:metadata_def)[:prefLabel], label, lang)
end
artifacts[:label_triples] << LinkedData::Utils::Triples.label_for_class_triple(
c.id, Goo.vocabulary(:metadata_def)[:prefLabel], label)
prefLabel = label
else
prefLabel = c.prefLabel
end
Expand Down Expand Up @@ -381,238 +396,6 @@ def delete_and_append(triples_file_path, logger, mime_type = nil)
logger.flush
end

# Runs the +action_name+ hook of every callback and prunes callbacks that fail.
#
# For each entry of +callbacks+ whose hash defines +action_name+, the method of
# that name is looked up on @submission and yielded to the block together with
# the callback hash. Callbacks without that hook are left untouched.
#
# When a hook raises:
# * the error is logged and the logger flushed;
# * if the callback carries a :status, its error status is added to the
#   submission and the submission is saved;
# * the exception is re-raised when the callback is marked :required, or when
#   it is not a StandardError (Interrupt, SystemExit, NoMemoryError, ...) --
#   those must never be swallowed silently;
# * otherwise the callback is removed from +callbacks+ so later stages skip it.
#
# logger      - object responding to #error and #flush.
# callbacks   - Hash of name => callback Hash; mutated in place.
# action_name - key of the hook to run (e.g. :caller_on_pre).
#
# Yields the bound Method and the callback Hash.
# Returns +callbacks+ (the result of Hash#delete_if).
def process_callbacks(logger, callbacks, action_name, &block)
  callbacks.delete_if do |_, callback|
    begin
      if callback[action_name]
        callable = @submission.method(callback[action_name])
        yield(callable, callback)
      end
      # hook succeeded (or was not defined): keep the callback
      false
    rescue Exception => e
      # Exception (not just StandardError) is rescued on purpose so the failure
      # is always logged and reflected in the submission status; fatal ones are
      # re-raised below.
      logger.error("#{e.class}: #{e.message}\n#{Array(e.backtrace).join("\n\t")}")
      logger.flush

      if callback[:status]
        add_submission_status(callback[:status].get_error_status)
        @submission.save
      end

      # halt the entire processing if :required is set to true, and never
      # swallow signals / exit requests / other non-StandardError failures
      raise e if callback[:required] || !e.is_a?(StandardError)
      # continue processing of other callbacks, but not this one
      true
    end
  end
end

# Walks the classes of @submission page by page (2500 per page) and drives the
# supplied callbacks through their pre / pre_page / each / post_page / post
# hooks via process_callbacks.
#
# logger     - receives info/error messages; flushed after each page and at the end.
# raw_paging - object whose page(page, size) returns the paging used for retrieval.
# callbacks  - Hash of callback Hashes. Keys read directly here are :op_name,
#              :artifacts (created when absent), :caller_on_each and :status;
#              the :caller_on_* hooks are dispatched through process_callbacks.
#
# Raises a RuntimeError (via `raise msg`) when a page is still empty after the
# configured number of retries.
def loop_classes(logger, raw_paging, callbacks)
page = 1
size = 2500
count_classes = 0
# last path segment of the submission id; only used in log messages below
acr = @submission.id.to_s.split("/")[-1]
operations = callbacks.values.map { |v| v[:op_name] }.join(", ")

# the whole traversal is timed; the elapsed seconds are logged after the block
time = Benchmark.realtime do
paging = raw_paging.page(page, size)
cls_count_set = false
cls_count = class_count(logger)

if cls_count > -1
# prevent a COUNT SPARQL query if possible
paging.page_count_set(cls_count)
cls_count_set = true
else
# count unknown up front: accumulate it page by page further down
cls_count = 0
end

iterate_classes = false
# 1. init artifacts hash if not explicitly passed in the callback
# 2. determine if class-level iteration is required
callbacks.each { |_, callback| callback[:artifacts] ||= {}; iterate_classes = true if callback[:caller_on_each] }

process_callbacks(logger, callbacks, :caller_on_pre) {
|callable, callback| callable.call(callback[:artifacts], logger, paging) }

page_len = -1
prev_page_len = -1

# do-while style loop: the body runs at least once and repeats until page is nil
begin
t0 = Time.now
page_classes = paging.page(page, size).all
total_pages = page_classes.total_pages
page_len = page_classes.length

# nothing retrieved even though we're expecting more records
if total_pages > 0 && page_classes.empty? && (prev_page_len == -1 || prev_page_len == size)
j = 0
num_calls = LinkedData.settings.num_retries_4store

# re-query the same page, pausing 2 seconds before each attempt, up to num_calls times
while page_classes.empty? && j < num_calls do
j += 1
logger.error("Empty page encountered. Retrying #{j} times...")
sleep(2)
page_classes = paging.page(page, size).all
logger.info("Success retrieving a page of #{page_classes.length} classes after retrying #{j} times...") unless page_classes.empty?
end

if page_classes.empty?
msg = "Empty page #{page} of #{total_pages} persisted after retrying #{j} times. #{operations} of #{acr} aborted..."
logger.error(msg)
raise msg
end
end

# still empty here means the retry branch above was not taken: stop iterating
if page_classes.empty?
if total_pages > 0
logger.info("The number of pages reported for #{acr} - #{total_pages} is higher than expected #{page - 1}. Completing #{operations}...")
else
logger.info("Ontology #{acr} contains #{total_pages} pages...")
end
break
end

prev_page_len = page_len
logger.info("#{acr}: page #{page} of #{total_pages} - #{page_len} ontology terms retrieved in #{Time.now - t0} sec.")
logger.flush
count_classes += page_classes.length

process_callbacks(logger, callbacks, :caller_on_pre_page) {
|callable, callback| callable.call(callback[:artifacts], logger, paging, page_classes, page) }

# per-class hooks are only run when at least one callback defined :caller_on_each
page_classes.each { |c|
process_callbacks(logger, callbacks, :caller_on_each) {
|callable, callback| callable.call(callback[:artifacts], logger, paging, page_classes, page, c) }
} if iterate_classes

process_callbacks(logger, callbacks, :caller_on_post_page) {
|callable, callback| callable.call(callback[:artifacts], logger, paging, page_classes, page) }
cls_count += page_classes.length unless cls_count_set

# advance to the next page, or set page to nil to end the loop
page = page_classes.next? ? page + 1 : nil
end while !page.nil?

# expose the class count to the post hooks through each callback's artifacts
callbacks.each { |_, callback| callback[:artifacts][:count_classes] = cls_count }
process_callbacks(logger, callbacks, :caller_on_post) {
|callable, callback| callable.call(callback[:artifacts], logger, paging) }
end

logger.info("Completed #{operations}: #{acr} in #{time} sec. #{count_classes} classes.")
logger.flush

# set the status on actions that have completed successfully
# (process_callbacks removes callbacks whose hook raised, so only survivors remain here)
callbacks.each do |_, callback|
if callback[:status]
add_submission_status(callback[:status])
@submission.save
end
end
end

# "pre" hook of the missing-labels generation.
#
# Derives the labels.ttl / mappings.ttl paths next to artifacts[:file_path],
# appends the custom-property triples of @submission to the triple store and
# opens both output files for writing. The labels file is seeded with the
# property triples. The open handles are stored under :fsave and
# :fsave_mappings and are deliberately left open; they are closed elsewhere
# (see generate_missing_labels_post).
#
# artifacts - Hash; must contain :file_path. Gains :save_in_file,
#             :save_in_file_mappings, :fsave and :fsave_mappings.
# logger, paging - not used here.
def generate_missing_labels_pre(artifacts = {}, logger, paging)
  out_dir = File.dirname(artifacts[:file_path])
  artifacts[:save_in_file] = File.join(out_dir, "labels.ttl")
  artifacts[:save_in_file_mappings] = File.join(out_dir, "mappings.ttl")

  custom_props = LinkedData::Utils::Triples.rdf_for_custom_properties(@submission)
  Goo.sparql_data_client.append_triples(@submission.id, custom_props, "application/x-turtle")

  labels_io = File.open(artifacts[:save_in_file], "w")
  labels_io.write(custom_props)
  mappings_io = File.open(artifacts[:save_in_file_mappings], "w")

  artifacts[:fsave] = labels_io
  artifacts[:fsave_mappings] = mappings_io
end

# "pre_page" hook: resets the per-page accumulators so every page starts with
# fresh, independent :label_triples and :mapping_triples arrays.
#
# artifacts - Hash mutated in place.
# logger, paging, page_classes, page - not used here.
def generate_missing_labels_pre_page(artifacts = {}, logger, paging, page_classes, page)
  artifacts.store(:label_triples, [])
  artifacts.store(:mapping_triples, [])
end

# "each" hook: makes sure class +c+ has a prefLabel triple queued and, when the
# ontology is not a view, queues its loom and same-URI mapping triples.
#
# artifacts - Hash; its :label_triples and :mapping_triples arrays are appended to.
# c         - the class being processed; prefLabel, label, synonym and id are read.
# logger, paging, page_classes, page - not used here.
def generate_missing_labels_each(artifacts = {}, logger, paging, page_classes, page, c)
prefLabel = nil

if c.prefLabel.nil?
rdfs_labels = c.label

# several labels plus synonyms: prefer a label that is not also a synonym
if rdfs_labels && rdfs_labels.length > 1 && c.synonym.length > 0
# NOTE: .to_a.first yields a single element (or nil), not an array
rdfs_labels = (Set.new(c.label) - Set.new(c.synonym)).to_a.first

# nothing usable left after removing synonyms: fall back to the full label list
# NOTE(review): length here is called on the single element picked above
if rdfs_labels.nil? || rdfs_labels.length == 0
rdfs_labels = c.label
end
end

# normalise a single value into a one-element array
if rdfs_labels and not (rdfs_labels.instance_of? Array)
rdfs_labels = [rdfs_labels]
end
label = nil

if rdfs_labels && rdfs_labels.length > 0
label = rdfs_labels[0]
else
# no label at all: derive one from the last fragment of the class IRI
label = LinkedData::Utils::Triples.last_iri_fragment c.id.to_s
end
artifacts[:label_triples] << LinkedData::Utils::Triples.label_for_class_triple(
c.id, Goo.vocabulary(:metadata_def)[:prefLabel], label)
prefLabel = label
else
prefLabel = c.prefLabel
end

# mapping triples are only produced when the ontology's viewOf is nil
if @submission.ontology.viewOf.nil?
loomLabel = OntologySubmission.loom_transform_literal(prefLabel.to_s)

# transformed labels of 2 characters or fewer get no loom mapping
if loomLabel.length > 2
artifacts[:mapping_triples] << LinkedData::Utils::Triples.loom_mapping_triple(
c.id, Goo.vocabulary(:metadata_def)[:mappingLoom], loomLabel)
end
artifacts[:mapping_triples] << LinkedData::Utils::Triples.uri_mapping_triple(
c.id, Goo.vocabulary(:metadata_def)[:mappingSameURI], c.id)
end
end

# "post_page" hook: flushes the triples collected for the current page.
#
# The migrated REST mappings of the ontology are appended to
# artifacts[:mapping_triples] first. Then, for labels and for mappings in turn,
# the collected triples are joined with newlines, stored back into artifacts
# (replacing the array with the joined string), written to the matching open
# file (:fsave / :fsave_mappings) and appended to the triple store, with the
# elapsed time logged.
#
# artifacts - Hash holding :label_triples, :mapping_triples, :fsave and
#             :fsave_mappings.
# logger    - receives progress messages and is flushed after each step.
# page      - page number, only used in the "No labels generated" message.
# paging, page_classes - not used here.
def generate_missing_labels_post_page(artifacts = {}, logger, paging, page_classes, page)
  artifacts[:mapping_triples].concat(
    LinkedData::Mappings.migrate_rest_mappings(@submission.ontology.acronym)
  )

  label_rows = artifacts[:label_triples]
  if label_rows.empty?
    logger.info("No labels generated in page #{page}.")
    logger.flush
  else
    logger.info("Asserting #{label_rows.length} labels in #{@submission.id.to_ntriples}")
    logger.flush
    label_payload = label_rows.join("\n")
    artifacts[:label_triples] = label_payload
    artifacts[:fsave].write(label_payload)

    started = Time.now
    Goo.sparql_data_client.append_triples(@submission.id, label_payload, "application/x-turtle")
    finished = Time.now
    logger.info("Labels asserted in #{finished - started} sec.")
    logger.flush
  end

  mapping_rows = artifacts[:mapping_triples]
  return if mapping_rows.empty?

  logger.info("Asserting #{mapping_rows.length} mappings in #{@submission.id.to_ntriples}")
  logger.flush
  mapping_payload = mapping_rows.join("\n")
  artifacts[:mapping_triples] = mapping_payload
  artifacts[:fsave_mappings].write(mapping_payload)

  started = Time.now
  Goo.sparql_data_client.append_triples(@submission.id, mapping_payload, "application/x-turtle")
  finished = Time.now
  logger.info("Mapping labels asserted in #{finished - started} sec.")
  logger.flush
end

# "post" hook: logs a summary of the run and closes the two output files that
# were stored in artifacts under :fsave and :fsave_mappings.
#
# artifacts - Hash; :count_classes and :save_in_file are reported in the log.
# logger    - receives the two summary messages and is flushed at the end.
# paging    - not used here.
#
# Returns whatever logger.flush returns.
def generate_missing_labels_post(artifacts = {}, logger, paging)
  traversed, labels_path = artifacts.values_at(:count_classes, :save_in_file)
  logger.info("end generate_missing_labels traversed #{traversed} classes")
  logger.info("Saved generated labels in #{labels_path}")
  artifacts.values_at(:fsave, :fsave_mappings).each(&:close)
  logger.flush
end

def generate_obsolete_classes(logger, file_path)
@submission.bring(:obsoleteProperty) if @submission.bring?(:obsoleteProperty)
@submission.bring(:obsoleteParent) if @submission.bring?(:obsoleteParent)
Expand Down
Loading
Loading