From ac5c66b2cde4ba09f2168624a661de903ef737a4 Mon Sep 17 00:00:00 2001 From: "Luis M. Rodriguez-R" Date: Tue, 6 Feb 2024 13:39:35 +0100 Subject: [PATCH] Efficiency improvement for large projects Projects with a large number of datasets should now work much faster with two improvements: - Using name Set instead of Hash - Adding names to Set instead of re-creating it when a new dataset is added --- lib/miga/cli/action/download/ncbi.rb | 43 ++++++++++++++++++++++------ lib/miga/dataset.rb | 3 +- lib/miga/json.rb | 19 ++++++++---- lib/miga/project.rb | 1 + lib/miga/project/dataset.rb | 19 ++++++++---- lib/miga/remote_dataset.rb | 19 ++++++++++-- lib/miga/version.rb | 4 +-- 7 files changed, 84 insertions(+), 24 deletions(-) diff --git a/lib/miga/cli/action/download/ncbi.rb b/lib/miga/cli/action/download/ncbi.rb index 36f9e2d..587a5b3 100644 --- a/lib/miga/cli/action/download/ncbi.rb +++ b/lib/miga/cli/action/download/ncbi.rb @@ -57,14 +57,14 @@ def sanitize_cli def remote_list if cli[:ncbi_taxonomy_dump] cli.say "Reading NCBI Taxonomy dump: #{cli[:ncbi_taxonomy_dump]}" - MiGA::RemoteDataset.use_ncbi_taxonomy_dump(cli[:ncbi_taxonomy_dump]) + MiGA::RemoteDataset.use_ncbi_taxonomy_dump(cli[:ncbi_taxonomy_dump], cli) end if cli[:ncbi_list_json] && File.size?(cli[:ncbi_list_json]) - cli.say "Reusing remote list: #{cli[:ncbi_list_json]}" - return MiGA::Json.parse(cli[:ncbi_list_json]) + return read_ncbi_list_json(cli[:ncbi_list_json]) end + cli.say "Obtaining remote list of datasets" list = {} query = remote_list_query loop do @@ -79,18 +79,45 @@ def remote_list list.merge!(parse_reports_as_datasets(page[:reports])) # Next page + cli.advance('Datasets:', list.size, page[:total_count]) break unless page[:next_page_token] query[:page_token] = page[:next_page_token] end + cli.say - if cli[:ncbi_list_json] - cli.say "Saving remote list: #{cli[:ncbi_list_json]}" - MiGA::Json.generate_fast(list, cli[:ncbi_list_json]) + write_ncbi_list_json(cli[:ncbi_list_json], list) if 
cli[:ncbi_list_json] + list + end + + def read_ncbi_list_json(file) + cli.say "Reusing remote list: #{file}" + list = {} + n_tot = nil + File.open(file, 'r') do |fh| + n_tot = fh.gets.chomp.sub(/^# /, '').to_i + fh.each_with_index do |ln, k| + row = ln.chomp.split("\t", 2) + list[row[0]] = MiGA::Json.parse(row[1], contents: true) + cli.advance('Lines:', k, n_tot) + end + cli.say end + return list + end - list + def write_ncbi_list_json(file, list) + cli.say "Saving remote list: #{file}" + File.open(file, 'w') do |fh| + fh.puts('# %i' % list.size) + kk = 0 + list.each do |k, v| + fh.puts([k, MiGA::Json.generate_fast(v)].join("\t")) + cli.advance('Datasets:', kk += 1, list.size) + end + cli.say + end end - + def parse_reports_as_datasets(reports) ds = {} reports.each do |r| diff --git a/lib/miga/dataset.rb b/lib/miga/dataset.rb index 8402ccd..abf6628 100644 --- a/lib/miga/dataset.rb +++ b/lib/miga/dataset.rb @@ -3,6 +3,7 @@ # @package MiGA # @license Artistic-2.0 +require'set' require 'miga/metadata' require 'miga/dataset/result' require 'miga/dataset/status' @@ -27,7 +28,7 @@ class << self ## # Does the +project+ already have a dataset with that +name+? def exist?(project, name) - !project.dataset_names_hash[name].nil? + project.dataset_names_set.include? name end ## diff --git a/lib/miga/json.rb b/lib/miga/json.rb index ee3f0e9..74648d4 100644 --- a/lib/miga/json.rb +++ b/lib/miga/json.rb @@ -15,6 +15,8 @@ class << self # - +:symbolize+: If names should be symbolized. By default it's true if # additions is false, or false otherwise. They can both be false, but an # exception will be raised if both are true + # - +:large_file+: If passed, the file is treated as a file with very long + # lines (possibly a single long line) def default_opts(opts = {}) opts[:contents] ||= false opts[:additions] ||= false @@ -36,11 +38,18 @@ def parse(path, opts = {}) # Read JSON cont = path - 12.times do - cont = File.read(path) - break unless cont.empty? 
- sleep 1 # Wait up to 12 seconds for racing processes (iff empty file) - end unless opts[:contents] + if opts[:large_file] + cont = '' + File.open(path, 'r') do |fh| + cont += fh.read(2 ** 16) until fh.eof? + end + elsif !opts[:contents] + 12.times do + cont = File.read(path) + break unless cont.empty? + sleep 1 # Wait up to 12 seconds for racing processes (iff empty file) + end + end raise "Empty descriptor: #{opts[:contents] ? "''" : path}" if cont.empty? # Parse JSON diff --git a/lib/miga/project.rb b/lib/miga/project.rb index 1a8d929..7a7deb0 100644 --- a/lib/miga/project.rb +++ b/lib/miga/project.rb @@ -77,6 +77,7 @@ def save! def load @datasets = {} @dataset_names_hash = nil + @dataset_names_set = nil @metadata = MiGA::Metadata.load "#{path}/miga.project.json" raise "Couldn't find project metadata at #{path}" if metadata.nil? diff --git a/lib/miga/project/dataset.rb b/lib/miga/project/dataset.rb index eb42e7f..bb10e2d 100644 --- a/lib/miga/project/dataset.rb +++ b/lib/miga/project/dataset.rb @@ -5,24 +5,32 @@ # Helper module including specific functions handle datasets. module MiGA::Project::Dataset ## - # Returns Array of MiGA::Dataset. + # Returns Array of MiGA::Dataset def datasets metadata[:datasets].map { |name| dataset(name) } end ## - # Returns Array of String (without evaluating dataset objects). + # Returns Array of String (without evaluating dataset objects) def dataset_names metadata[:datasets] end ## - # Returns Hash of Strings => true. Similar to +dataset_names+ but as - # Hash for efficiency. + # Returns Hash of +{ String => true }+. Similar to +dataset_names+ but as + # Hash for efficiency def dataset_names_hash + warn 'The Project#dataset_names_hash method will be deprecated soon' @dataset_names_hash ||= Hash[dataset_names.map { |i| [i, true] }] end + ## + # Returns Set of Strings. 
Similar to +dataset_names+ but as Set for + # efficiency + def dataset_names_set + @dataset_names_set ||= Set.new(dataset_names) + end + ## # Returns MiGA::Dataset def dataset(name) @@ -50,7 +58,8 @@ def add_dataset(name) unless metadata[:datasets].include? name d = MiGA::Dataset.new(self, name) @metadata[:datasets] << name - @dataset_names_hash = nil # Ensure loading even if +do_not_save+ is true + @dataset_names_hash[name] = true if @dataset_names_hash + @dataset_names_set << name if @dataset_names_set save if d.ref? && d.active? recalculate_tasks("Reference dataset added: #{d.name}") diff --git a/lib/miga/remote_dataset.rb b/lib/miga/remote_dataset.rb index dc352f6..f7f757e 100644 --- a/lib/miga/remote_dataset.rb +++ b/lib/miga/remote_dataset.rb @@ -16,7 +16,12 @@ class << self # Path to a directory with a recent NCBI Taxonomy dump to use instead of # making API calls to NCBI servers, which can be obtained at: # https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz - def use_ncbi_taxonomy_dump(path) + # + # The +cli+ parameter, if passed, should be a MiGA::Cli object that will + # be used to report advance in the reading. 
Other objects can be passed, + # minimally supporting the MiGA::Cli#say and MiGA::Cli#advance method + # interfaces + def use_ncbi_taxonomy_dump(path, cli = nil) raise "Directory doesn't exist: #{path}" unless File.directory?(path) # Structure: { TaxID => ["name", "rank", parent TaxID] } @@ -24,23 +29,31 @@ def use_ncbi_taxonomy_dump(path) @ncbi_taxonomy_names = {} # Read names.dmp - File.open(File.join(path, 'names.dmp')) do |fh| + File.open(file = File.join(path, 'names.dmp')) do |fh| + read = 0 + size = File.size(file) fh.each do |ln| + cli&.advance('- names.dmp:', read += ln.size, size) row = ln.split(/\t\|\t?/) next unless row[3] == 'scientific name' @ncbi_taxonomy_names[row[0].to_i] = [row[1].strip] end + cli&.say end # Read nodes.dmp - File.open(File.join(path, 'nodes.dmp')) do |fh| + File.open(file = File.join(path, 'nodes.dmp')) do |fh| + read = 0 + size = File.size(file) fh.each do |ln| + cli&.advance('- nodes.dmp:', read += ln.size, size) row = ln.split(/\t\|\t?/) child = row[0].to_i parent = row[1].to_i @ncbi_taxonomy_names[child][1] = row[2] @ncbi_taxonomy_names[child][2] = parent unless parent == child end + cli&.say end end diff --git a/lib/miga/version.rb b/lib/miga/version.rb index 3ecfae1..411bebc 100644 --- a/lib/miga/version.rb +++ b/lib/miga/version.rb @@ -12,7 +12,7 @@ module MiGA # - String indicating release status: # - rc* release candidate, not released as gem # - [0-9]+ stable release, released as gem - VERSION = [1.3, 10, 1].freeze + VERSION = [1.3, 10, 2].freeze ## # Nickname for the current major.minor version. @@ -20,7 +20,7 @@ module MiGA ## # Date of the current gem relese. - VERSION_DATE = Date.new(2024, 1, 31) + VERSION_DATE = Date.new(2024, 2, 6) ## # References of MiGA