From 17ab65e83806959e94f64b7e1097270598b40170 Mon Sep 17 00:00:00 2001 From: Benjamin Kiah Stroud <32469930+bkiahstroud@users.noreply.github.com> Date: Tue, 19 Mar 2024 16:27:45 -0700 Subject: [PATCH 1/8] add service to record instantiation identifiers that could apply to multiple assets --- .../ams/missing_instantiations_locator.rb | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 app/services/ams/missing_instantiations_locator.rb diff --git a/app/services/ams/missing_instantiations_locator.rb b/app/services/ams/missing_instantiations_locator.rb new file mode 100644 index 00000000..e80bf3d9 --- /dev/null +++ b/app/services/ams/missing_instantiations_locator.rb @@ -0,0 +1,68 @@ +# frozen_string_literal: true +require 'ruby-progressbar' + +module AMS + class MissingInstantiationsLocator + WORKING_DIR = Rails.root.join('tmp', 'imports') + + attr_reader :search_dir, :results_path, :results + + # TODO: take array of directory paths? + def initialize(search_dir:) + @search_dir = WORKING_DIR.join(search_dir) + @results_path = WORKING_DIR.join("i16-#{search_dir}.json") + @results = initialize_results + end + + # TODO: better method name + def locate_within_dir + xml_files = Dir.glob(search_dir.join('*.xml')) + + xml_files.each do |f| + locate(f) + end + + write_results + end + + # TODO: better method name + def locate(xml_file) + xml = File.read(xml_file) + + pbcore_id = xml.scan(/(cpb-aacip\/.+?)<\//).flatten.first + asset_id = pbcore_id.tr('/', '-') + instantiation_identifiers = xml.scan(/(.+?)<\/instantiationIdentifier>/mi).flatten + + instantiation_identifiers.each do |inst_id| + instantiation_class = xml.match?('instantiationPhysical') ? PhysicalInstantiation : DigitalInstantiation + af_instantiations = instantiation_class.where(local_instantiation_identifier: inst_id) + + broken = af_instantiations.any? do |af_inst| + normalize_date(af_inst.date_uploaded) != normalize_date(af_inst.date_modified) + end + next unless broken + + results[inst_id] ||= [] + results[inst_id] |= Array.wrap(asset_id) + end + end + + def normalize_date(date) + date.to_datetime.strftime('%Y-%m-%d %H:%M') + end + + def initialize_results + if File.exist?(results_path) + JSON.parse(File.read(results_path)) + else + {} + end + end + + def write_results + File.open(results_path, 'w') do |f| + f.puts results.to_json + end + end + end +end From a5148bc6812d2317beffee18fbef11257725454a Mon Sep 17 00:00:00 2001 From: Benjamin Kiah Stroud <32469930+bkiahstroud@users.noreply.github.com> Date: Wed, 20 Mar 2024 08:47:29 -0700 Subject: [PATCH 2/8] include origin dir of asset xml This will make it easier to find the file later instead of having to search through every directory --- app/services/ams/missing_instantiations_locator.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/app/services/ams/missing_instantiations_locator.rb b/app/services/ams/missing_instantiations_locator.rb index e80bf3d9..4dcc3756 100644 --- a/app/services/ams/missing_instantiations_locator.rb +++ b/app/services/ams/missing_instantiations_locator.rb @@ -42,8 +42,9 @@ def locate(xml_file) end next unless broken + truncated_search_dir = search_dir.to_s.split('/').last results[inst_id] ||= [] - results[inst_id] |= Array.wrap(asset_id) + results[inst_id] |= Array.wrap("#{truncated_search_dir}/#{asset_id}") end end From 4918b93d683aae613d8758f8fa8df4eb3910fa82 Mon Sep 17 00:00:00 2001 From: Benjamin Kiah Stroud <32469930+bkiahstroud@users.noreply.github.com> Date: Wed, 20 Mar 2024 09:31:25 -0700 Subject: [PATCH 3/8] search through an array of dirs instead of one at a time --- .../ams/missing_instantiations_locator.rb | 34 +++++++++++-------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/app/services/ams/missing_instantiations_locator.rb b/app/services/ams/missing_instantiations_locator.rb index 4dcc3756..5ca6e05c 100644 --- a/app/services/ams/missing_instantiations_locator.rb +++ b/app/services/ams/missing_instantiations_locator.rb @@ -5,24 +5,27 @@ module AMS class MissingInstantiationsLocator WORKING_DIR = Rails.root.join('tmp', 'imports') - attr_reader :search_dir, :results_path, :results + attr_reader :search_dirs, :current_dir, :results_path, :results - # TODO: take array of directory paths? - def initialize(search_dir:) - @search_dir = WORKING_DIR.join(search_dir) - @results_path = WORKING_DIR.join("i16-#{search_dir}.json") - @results = initialize_results + # @param [Array] search_dirs + def initialize(search_dirs) + @search_dirs = search_dirs.map { |dir| WORKING_DIR.join(dir) } end # TODO: better method name - def locate_within_dir - xml_files = Dir.glob(search_dir.join('*.xml')) + def locate_within_dirs + search_dirs.each do |current_dir| + @current_dir = current_dir + @results_path = WORKING_DIR.join("i16-#{truncated_dir_name(current_dir)}.json") + @results = initialize_results + xml_files = Dir.glob(current_dir.join('*.xml')) - xml_files.each do |f| - locate(f) - end + xml_files.each do |f| + locate(f) + end - write_results + write_results + end end # TODO: better method name @@ -42,9 +45,8 @@ def locate(xml_file) end next unless broken - truncated_search_dir = search_dir.to_s.split('/').last results[inst_id] ||= [] - results[inst_id] |= Array.wrap("#{truncated_search_dir}/#{asset_id}") + results[inst_id] |= Array.wrap("#{truncated_dir_name(current_dir)}/#{asset_id}") end end @@ -52,6 +54,10 @@ def normalize_date(date) date.to_datetime.strftime('%Y-%m-%d %H:%M') end + def truncated_dir_name(dir) + dir.to_s.split('/').last + end + def initialize_results if File.exist?(results_path) JSON.parse(File.read(results_path)) From 4095c862235931834524803722c67bd6fb4ced81 Mon Sep 17 00:00:00 2001 From: Benjamin Kiah Stroud <32469930+bkiahstroud@users.noreply.github.com> Date: Wed, 20 Mar 2024 09:48:21 -0700 Subject: [PATCH 4/8] add progressbar --- app/services/ams/missing_instantiations_locator.rb | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/app/services/ams/missing_instantiations_locator.rb b/app/services/ams/missing_instantiations_locator.rb index 5ca6e05c..0c0dfca6 100644 --- a/app/services/ams/missing_instantiations_locator.rb +++ b/app/services/ams/missing_instantiations_locator.rb @@ -5,7 +5,7 @@ module AMS class MissingInstantiationsLocator WORKING_DIR = Rails.root.join('tmp', 'imports') - attr_reader :search_dirs, :current_dir, :results_path, :results + attr_reader :search_dirs, :current_dir, :results_path, :results, :progressbar # @param [Array] search_dirs def initialize(search_dirs) @@ -19,9 +19,12 @@ def locate_within_dirs @results_path = WORKING_DIR.join("i16-#{truncated_dir_name(current_dir)}.json") @results = initialize_results xml_files = Dir.glob(current_dir.join('*.xml')) + progressbar_format = "#{truncated_dir_name(current_dir)} -- %a %e %P% Processed: %c from %C" + @progressbar = ProgressBar.create(total: xml_files.size, format: progressbar_format) xml_files.each do |f| locate(f) + progressbar.increment end write_results From f6741ce1d65b519e86b0860a0b209d495a25c991 Mon Sep 17 00:00:00 2001 From: Benjamin Kiah Stroud <32469930+bkiahstroud@users.noreply.github.com> Date: Wed, 20 Mar 2024 10:02:12 -0700 Subject: [PATCH 5/8] refactor truncated_dir_name logic --- app/services/ams/missing_instantiations_locator.rb | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/app/services/ams/missing_instantiations_locator.rb b/app/services/ams/missing_instantiations_locator.rb index 0c0dfca6..22df5f51 100644 --- a/app/services/ams/missing_instantiations_locator.rb +++ b/app/services/ams/missing_instantiations_locator.rb @@ -5,7 +5,7 @@ module AMS class MissingInstantiationsLocator WORKING_DIR = Rails.root.join('tmp', 'imports') - attr_reader :search_dirs, :current_dir, :results_path, :results, :progressbar + attr_reader :search_dirs, :current_dir, :truncated_dir_name, :results_path, :results, :progressbar # @param [Array] search_dirs def initialize(search_dirs) @@ -16,10 +16,11 @@ def initialize(search_dirs) def locate_within_dirs search_dirs.each do |current_dir| @current_dir = current_dir - @results_path = WORKING_DIR.join("i16-#{truncated_dir_name(current_dir)}.json") + @truncated_dir_name = current_dir.to_s.split('/').last + @results_path = WORKING_DIR.join("i16-#{truncated_dir_name}.json") @results = initialize_results xml_files = Dir.glob(current_dir.join('*.xml')) - progressbar_format = "#{truncated_dir_name(current_dir)} -- %a %e %P% Processed: %c from %C" + progressbar_format = "#{truncated_dir_name} -- %a %e %P% Processed: %c from %C" @progressbar = ProgressBar.create(total: xml_files.size, format: progressbar_format) xml_files.each do |f| @@ -49,7 +50,7 @@ def locate(xml_file) next unless broken results[inst_id] ||= [] - results[inst_id] |= Array.wrap("#{truncated_dir_name(current_dir)}/#{asset_id}") + results[inst_id] |= Array.wrap("#{truncated_dir_name}/#{asset_id}") end end @@ -57,10 +58,6 @@ def normalize_date(date) date.to_datetime.strftime('%Y-%m-%d %H:%M') end - def truncated_dir_name(dir) - dir.to_s.split('/').last - end - def initialize_results if File.exist?(results_path) JSON.parse(File.read(results_path)) From 8fb255f2c590c938f67e9eb59fcd1fb509a934c6 Mon Sep 17 00:00:00 2001 From: Benjamin Kiah Stroud <32469930+bkiahstroud@users.noreply.github.com> Date: Wed, 20 Mar 2024 16:08:35 -0700 Subject: [PATCH 6/8] add logging and exception handling --- .../ams/missing_instantiations_locator.rb | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/app/services/ams/missing_instantiations_locator.rb b/app/services/ams/missing_instantiations_locator.rb index 22df5f51..5d35c1e4 100644 --- a/app/services/ams/missing_instantiations_locator.rb +++ b/app/services/ams/missing_instantiations_locator.rb @@ -5,40 +5,56 @@ module AMS class MissingInstantiationsLocator WORKING_DIR = Rails.root.join('tmp', 'imports') - attr_reader :search_dirs, :current_dir, :truncated_dir_name, :results_path, :results, :progressbar + attr_reader :search_dirs, :current_dir, :truncated_dir_name, :results_path, :results, :progressbar, :logger # @param [Array] search_dirs def initialize(search_dirs) @search_dirs = search_dirs.map { |dir| WORKING_DIR.join(dir) } + @logger = ActiveSupport::Logger.new( + WORKING_DIR.join('i16-missing-instantiations-locator.log') + ) end # TODO: better method name def locate_within_dirs search_dirs.each do |current_dir| @current_dir = current_dir - @truncated_dir_name = current_dir.to_s.split('/').last + @truncated_dir_name = truncate_path(current_dir) @results_path = WORKING_DIR.join("i16-#{truncated_dir_name}.json") @results = initialize_results xml_files = Dir.glob(current_dir.join('*.xml')) progressbar_format = "#{truncated_dir_name} -- %a %e %P% Processed: %c from %C" @progressbar = ProgressBar.create(total: xml_files.size, format: progressbar_format) + logger.info("Starting #{truncated_dir_name}") + xml_files.each do |f| locate(f) progressbar.increment end write_results + rescue => e + logger.error("#{e.class} (#{truncated_dir_name}) - #{e.message}") end end # TODO: better method name def locate(xml_file) xml = File.read(xml_file) + current_file_path = "#{truncated_dir_name}/#{truncate_path(xml_file)}" pbcore_id = xml.scan(/(cpb-aacip\/.+?)<\//).flatten.first + if pbcore_id.blank? + logger.debug("No pbcore_id found within #{current_file_path}") + return + end asset_id = pbcore_id.tr('/', '-') instantiation_identifiers = xml.scan(/(.+?)<\/instantiationIdentifier>/mi).flatten + if instantiation_identifiers.blank? + logger.debug("No instantiation identifier(s) found within #{current_file_path}") + return + end instantiation_identifiers.each do |inst_id| instantiation_class = xml.match?('instantiationPhysical') ? PhysicalInstantiation : DigitalInstantiation @@ -51,13 +67,21 @@ def locate(xml_file) results[inst_id] ||= [] results[inst_id] |= Array.wrap("#{truncated_dir_name}/#{asset_id}") + rescue => e + logger.error("#{e.class} (#{current_file_path}) (Inst: #{inst_id}) - #{e.message}") end + rescue => e + logger.error("#{e.class} (#{current_file_path}) - #{e.message}") end def normalize_date(date) date.to_datetime.strftime('%Y-%m-%d %H:%M') end + def truncate_path(path) + path.to_s.split('/').last + end + def initialize_results if File.exist?(results_path) JSON.parse(File.read(results_path)) From dfc2facf2761734bfcbea58df4663168360d3e4b Mon Sep 17 00:00:00 2001 From: Benjamin Kiah Stroud <32469930+bkiahstroud@users.noreply.github.com> Date: Wed, 20 Mar 2024 16:16:04 -0700 Subject: [PATCH 7/8] more accurate method names --- app/services/ams/missing_instantiations_locator.rb | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/app/services/ams/missing_instantiations_locator.rb b/app/services/ams/missing_instantiations_locator.rb index 5d35c1e4..dfc5388f 100644 --- a/app/services/ams/missing_instantiations_locator.rb +++ b/app/services/ams/missing_instantiations_locator.rb @@ -2,6 +2,7 @@ require 'ruby-progressbar' module AMS + # @see https://github.com/scientist-softserv/ams/issues/16 class MissingInstantiationsLocator WORKING_DIR = Rails.root.join('tmp', 'imports') @@ -15,8 +16,7 @@ def initialize(search_dirs) ) end - # TODO: better method name - def locate_within_dirs + def map_all_instantiation_identifiers search_dirs.each do |current_dir| @current_dir = current_dir @truncated_dir_name = truncate_path(current_dir) @@ -29,7 +29,7 @@ def locate_within_dirs logger.info("Starting #{truncated_dir_name}") xml_files.each do |f| - locate(f) + map_asset_id_to_inst_ids(f) progressbar.increment end @@ -39,8 +39,9 @@ def locate_within_dirs end end - # TODO: better method name - def locate(xml_file) + private + + def map_asset_id_to_inst_ids(xml_file) xml = File.read(xml_file) current_file_path = "#{truncated_dir_name}/#{truncate_path(xml_file)}" From e3be535638a1d8e82db2adecce97a9eb9cae660e Mon Sep 17 00:00:00 2001 From: Benjamin Kiah Stroud <32469930+bkiahstroud@users.noreply.github.com> Date: Wed, 20 Mar 2024 21:41:50 -0700 Subject: [PATCH 8/8] don't reinvent the wheel --- app/services/ams/missing_instantiations_locator.rb | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/app/services/ams/missing_instantiations_locator.rb b/app/services/ams/missing_instantiations_locator.rb index dfc5388f..1479030d 100644 --- a/app/services/ams/missing_instantiations_locator.rb +++ b/app/services/ams/missing_instantiations_locator.rb @@ -19,7 +19,7 @@ def initialize(search_dirs) def map_all_instantiation_identifiers search_dirs.each do |current_dir| @current_dir = current_dir - @truncated_dir_name = truncate_path(current_dir) + @truncated_dir_name = File.basename(current_dir) @results_path = WORKING_DIR.join("i16-#{truncated_dir_name}.json") @results = initialize_results xml_files = Dir.glob(current_dir.join('*.xml')) @@ -43,7 +43,7 @@ def map_all_instantiation_identifiers def map_asset_id_to_inst_ids(xml_file) xml = File.read(xml_file) - current_file_path = "#{truncated_dir_name}/#{truncate_path(xml_file)}" + current_file_path = "#{truncated_dir_name}/#{File.basename(xml_file)}" pbcore_id = xml.scan(/(cpb-aacip\/.+?)<\//).flatten.first if pbcore_id.blank? @@ -79,10 +79,6 @@ def normalize_date(date) date.to_datetime.strftime('%Y-%m-%d %H:%M') end - def truncate_path(path) - path.to_s.split('/').last - end - def initialize_results if File.exist?(results_path) JSON.parse(File.read(results_path))