From 0ec7adcad9bdb3a5f06cbc4db2e8cea7c14b880d Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Mon, 13 May 2024 12:50:42 +0200 Subject: [PATCH 01/18] feat: add ref and annotation data to Auspice tree types --- packages/nextclade/src/graph/graph.rs | 2 + packages/nextclade/src/tree/tree.rs | 65 +++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/packages/nextclade/src/graph/graph.rs b/packages/nextclade/src/graph/graph.rs index 2c4aa3743..4078ea5f1 100644 --- a/packages/nextclade/src/graph/graph.rs +++ b/packages/nextclade/src/graph/graph.rs @@ -13,6 +13,7 @@ use num_traits::Float; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use std::collections::HashMap; +use maplit::btreemap; #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] #[allow(clippy::partial_pub_fields)] @@ -556,6 +557,7 @@ pub fn convert_graph_to_auspice_tree(graph: &AuspiceGraph) -> Result }, +} + +#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] +pub struct AuspiceGenomeAnnotationCds { + #[serde(rename = "type", default, skip_serializing_if = "Option::is_none")] + pub r#type: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub gene: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub color: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub display_name: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub description: Option, + + #[serde(default)] + pub strand: GeneStrand, + + #[serde(flatten)] + pub segments: Segments, + + #[serde(flatten)] + pub other: serde_json::Value, +} + +#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] +pub struct AuspiceGenomeAnnotations { + pub nuc: AuspiceGenomeAnnotationNuc, + + #[serde(flatten)] + pub cdses: BTreeMap, + + #[serde(flatten)] + pub other: serde_json::Value, +} + #[derive(Clone, Serialize, Deserialize, schemars::JsonSchema, Validate, Debug)] pub struct AuspiceTreeMeta { + #[serde(skip_serializing_if = "Option::is_none")] + pub genome_annotations: Option, + #[serde(default, skip_serializing_if = "AuspiceMetaExtensions::is_empty")] pub extensions: AuspiceMetaExtensions, @@ -472,6 +534,9 @@ pub struct AuspiceTree { pub tree: AuspiceTreeNode, + #[serde(skip_serializing_if = "BTreeMap::is_empty")] + pub root_sequence: BTreeMap, + #[serde(flatten)] pub other: serde_json::Value, } From 1043b98ab97bac898be0d91cce678468d046e3a0 Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Thu, 16 May 2024 14:35:08 +0200 Subject: [PATCH 02/18] refactor: add pathogen nextclade extension to auspice tree type I had to derive a bunch of Eq and PartialEq traits to satisfy parent type requirements --- .../src/dataset/dataset_download.rs | 36 +------------ packages/nextclade/src/align/params.rs | 22 ++++---- packages/nextclade/src/align/seed_match.rs | 6 +-- packages/nextclade/src/analyze/phenotype.rs | 2 +- .../nextclade/src/analyze/virus_properties.rs | 30 ++++++----- packages/nextclade/src/io/dataset.rs | 2 +- packages/nextclade/src/qc/qc_config.rs | 51 ++++++++++--------- .../nextclade/src/qc/qc_rule_frame_shifts.rs | 2 +- .../nextclade/src/qc/qc_rule_missing_data.rs | 4 +- .../src/qc/qc_rule_private_mutations.rs | 12 ++--- .../nextclade/src/qc/qc_rule_snp_clusters.rs | 2 +- .../nextclade/src/qc/qc_rule_stop_codons.rs | 2 +- packages/nextclade/src/run/params_general.rs | 2 +- packages/nextclade/src/tree/params.rs | 7 +-- packages/nextclade/src/tree/tree.rs | 6 ++- packages/nextclade/src/utils/any.rs | 48 +++++++++++++++-- 16 files changed, 126 insertions(+), 108 deletions(-) diff --git a/packages/nextclade-cli/src/dataset/dataset_download.rs b/packages/nextclade-cli/src/dataset/dataset_download.rs index 380afc318..8736e0c02 100644 --- a/packages/nextclade-cli/src/dataset/dataset_download.rs +++ b/packages/nextclade-cli/src/dataset/dataset_download.rs @@ -297,41 +297,7 @@ pub fn dataset_individual_files_load( .and_then(|input_pathogen_json| read_file_to_string(input_pathogen_json).ok()) .map_ref_fallible(VirusProperties::from_str) .wrap_err("When reading pathogen JSON")? - .unwrap_or_else(|| { - // The only case where we allow pathogen.json to be missing is when there's no dataset and files are provided - // explicitly through args. Let's create a dummy value to avoid making the field optional, - // and avoid adding `Default` trait. - VirusProperties { - schema_version: "".to_owned(), - attributes: BTreeMap::default(), - shortcuts: vec![], - meta: DatasetMeta::default(), - files: DatasetFiles { - reference: "".to_owned(), - pathogen_json: "".to_owned(), - genome_annotation: None, - tree_json: None, - examples: None, - readme: None, - changelog: None, - rest_files: BTreeMap::default(), - other: serde_json::Value::default(), - }, - default_cds: None, - cds_order_preference: vec![], - mut_labels: LabelledMutationsConfig::default(), - qc: None, - general_params: None, - alignment_params: None, - tree_builder_params: None, - phenotype_data: None, - aa_motifs: vec![], - versions: vec![], - version: None, - compatibility: None, - other: serde_json::Value::default(), - } - }); + .unwrap_or_default(); let ref_record = read_one_fasta(input_ref).wrap_err("When reading reference sequence")?; diff --git a/packages/nextclade/src/align/params.rs b/packages/nextclade/src/align/params.rs index eea5ed85f..1a0d3b6ba 100644 --- a/packages/nextclade/src/align/params.rs +++ b/packages/nextclade/src/align/params.rs @@ -1,12 +1,14 @@ +use crate::utils::any::AnyType; use crate::{make_error, o}; use clap::{Parser, ValueEnum}; use eyre::Report; use itertools::Itertools; use optfield::optfield; +use ordered_float::OrderedFloat; use serde::{Deserialize, Serialize}; use std::collections::BTreeMap; -#[derive(ValueEnum, Copy, Clone, Debug, Deserialize, Serialize, schemars::JsonSchema)] +#[derive(ValueEnum, Copy, Clone, Debug, Eq, PartialEq, Deserialize, Serialize, schemars::JsonSchema)] #[serde(rename_all = "kebab-case")] pub enum GapAlignmentSide { Left, @@ -25,7 +27,7 @@ impl Default for GapAlignmentSide { #[allow(clippy::struct_excessive_bools)] #[optfield(pub AlignPairwiseParamsOptional, attrs, doc, field_attrs, field_doc, merge_fn = pub)] -#[derive(Parser, Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)] +#[derive(Parser, Debug, Clone, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] #[serde(rename_all = "camelCase")] pub struct AlignPairwiseParams { /// Minimum length of nucleotide sequence to consider for alignment. @@ -116,7 +118,7 @@ pub struct AlignPairwiseParams { /// Fraction of the query sequence that has to be covered by extended seeds /// to proceed with the banded alignment. #[clap(long)] - pub min_seed_cover: f64, + pub min_seed_cover: OrderedFloat, /// Number of times Nextclade will retry alignment with more relaxed results if alignment band boundaries are hit #[clap(long)] @@ -125,27 +127,27 @@ pub struct AlignPairwiseParams { // The following args are deprecated and are kept for backwards compatibility (to emit errors if they are set) /// REMOVED #[clap(long, hide_long_help = true, hide_short_help = true)] - pub max_indel: Option, + pub max_indel: Option, /// REMOVED #[clap(long, hide_long_help = true, hide_short_help = true)] - pub seed_length: Option, + pub seed_length: Option, /// REMOVED #[clap(long, hide_long_help = true, hide_short_help = true)] - pub mismatches_allowed: Option, + pub mismatches_allowed: Option, /// REMOVED #[clap(long, hide_long_help = true, hide_short_help = true)] - pub min_seeds: Option, + pub min_seeds: Option, /// REMOVED #[clap(long, hide_long_help = true, hide_short_help = true)] - pub min_match_rate: Option, + pub min_match_rate: Option, /// REMOVED #[clap(long, hide_long_help = true, hide_short_help = true)] - pub seed_spacing: Option, + pub seed_spacing: Option, } impl Default for AlignPairwiseParams { @@ -166,7 +168,7 @@ impl Default for AlignPairwiseParams { gap_alignment_side: GapAlignmentSide::default(), excess_bandwidth: 9, terminal_bandwidth: 50, - min_seed_cover: 0.33, + min_seed_cover: OrderedFloat(0.33), kmer_length: 10, // Should not be much larger than 1/divergence of amino acids kmer_distance: 50, // Distance between successive k-mers min_match_length: 40, // Experimentally determined, to keep off-target matches reasonably low diff --git a/packages/nextclade/src/align/seed_match.rs b/packages/nextclade/src/align/seed_match.rs index 1516f7795..c5c7abd2a 100644 --- a/packages/nextclade/src/align/seed_match.rs +++ b/packages/nextclade/src/align/seed_match.rs @@ -481,15 +481,15 @@ pub fn get_seed_matches2( // write_matches_to_file(&seed_matches, "chained_matches.csv"); let sum_of_seed_length: usize = seed_matches.iter().map(|sm| sm.length).sum(); - if (sum_of_seed_length as f64 / qry_seq.len() as f64) < params.min_seed_cover { + if (sum_of_seed_length as f64 / qry_seq.len() as f64) < *params.min_seed_cover { let query_knowns = qry_seq.iter().filter(|n| n.is_acgt()).count(); - if (sum_of_seed_length as f64 / query_knowns as f64) < params.min_seed_cover { + if (sum_of_seed_length as f64 / query_knowns as f64) < *params.min_seed_cover { return make_error!( "Unable to align: seed alignment covers {:.2}% of the query sequence, which is less than expected {:.2}% \ (configurable using 'min seed cover' CLI flag or dataset property). This is likely due to low quality of the \ provided sequence, or due to using incorrect reference sequence.", 100.0 * (sum_of_seed_length as f64) / (query_knowns as f64), - 100.0 * params.min_seed_cover + 100.0 * *params.min_seed_cover ); } } diff --git a/packages/nextclade/src/analyze/phenotype.rs b/packages/nextclade/src/analyze/phenotype.rs index aafe8f7c0..fc32da44e 100644 --- a/packages/nextclade/src/analyze/phenotype.rs +++ b/packages/nextclade/src/analyze/phenotype.rs @@ -16,7 +16,7 @@ pub fn calculate_phenotype(phenotype_data: &PhenotypeData, aa_substitutions: &[A .iter() .map(|AaSub { pos, qry_aa: qry, .. }| phenotype_data.get_coeff(*pos, *qry)) .sum(); - phenotype_data.weight * (-phenotype_for_antibody).exp() + *phenotype_data.weight * (-phenotype_for_antibody).exp() }) .sum(); diff --git a/packages/nextclade/src/analyze/virus_properties.rs b/packages/nextclade/src/analyze/virus_properties.rs index 845c3f1f2..896705865 100644 --- a/packages/nextclade/src/analyze/virus_properties.rs +++ b/packages/nextclade/src/analyze/virus_properties.rs @@ -14,6 +14,7 @@ use crate::run::params_general::NextcladeGeneralParamsOptional; use crate::tree::params::TreeBuilderParamsOptional; use crate::utils::any::AnyType; use eyre::{Report, WrapErr}; +use ordered_float::OrderedFloat; use semver::Version; use serde::{Deserialize, Serialize}; use std::collections::BTreeMap; @@ -24,7 +25,7 @@ const PATHOGEN_JSON_SCHEMA_VERSION_FROM: &str = "3.0.0"; const PATHOGEN_JSON_SCHEMA_VERSION_TO: &str = "3.0.0"; /// Contains external configuration and data specific for a particular pathogen -#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema, Validate)] +#[derive(Clone, Default, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] #[serde(rename_all = "camelCase")] pub struct VirusProperties { pub schema_version: String, @@ -78,7 +79,7 @@ pub struct VirusProperties { pub type LabelMap = BTreeMap, Vec>; pub type NucLabelMap = LabelMap; -#[derive(Debug, Default, Clone, Serialize, Deserialize, schemars::JsonSchema, Validate)] +#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema, Validate)] #[serde(rename_all = "camelCase")] pub struct LabelledMutationsConfig { #[serde(default, skip_serializing_if = "BTreeMap::is_empty")] @@ -87,41 +88,42 @@ pub struct LabelledMutationsConfig { pub other: serde_json::Value, } -#[derive(Debug, Default, Clone, Serialize, Deserialize, schemars::JsonSchema, Validate)] +#[derive(Clone, Default, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] #[serde(rename_all = "camelCase")] pub struct PhenotypeDataIgnore { #[serde(default)] pub clades: Vec, } -#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)] +#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] #[serde(rename_all = "camelCase")] #[serde(untagged)] pub enum PhenotypeCoeff { - ByPosition(f64), - ByPositionAndAa(BTreeMap), + ByPosition(OrderedFloat), + ByPositionAndAa(BTreeMap>), Other(serde_json::Value), } impl PhenotypeCoeff { pub fn get_coeff(&self, aa: Aa) -> f64 { match self { - PhenotypeCoeff::ByPosition(coeff) => Some(coeff), + PhenotypeCoeff::ByPosition(coeff) => Some(coeff.0), PhenotypeCoeff::ByPositionAndAa(aa_coeff_map) => aa_coeff_map .get(&aa.to_string()) - .or_else(|| aa_coeff_map.get("default")), + .or_else(|| aa_coeff_map.get("default")) + .map(|c| c.0), PhenotypeCoeff::Other(_) => None, } - .unwrap_or(&0.0) + .unwrap_or(0.0) .to_owned() } } -#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema, Validate)] +#[derive(Clone, Default, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] #[serde(rename_all = "camelCase")] pub struct PhenotypeDataEntry { pub name: String, - pub weight: f64, + pub weight: OrderedFloat, pub locations: BTreeMap, } @@ -131,7 +133,7 @@ impl PhenotypeDataEntry { } } -#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema, Validate)] +#[derive(Clone, Default, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] #[serde(rename_all = "camelCase")] pub struct PhenotypeData { pub name: String, @@ -152,7 +154,7 @@ pub struct PhenotypeAttrDesc { pub description: String, } -#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)] +#[derive(Clone, Default, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] #[serde(rename_all = "camelCase")] pub struct AaMotifsDesc { pub name: String, @@ -165,7 +167,7 @@ pub struct AaMotifsDesc { pub include_cdses: Vec, } -#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema, Validate)] +#[derive(Clone, Default, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] #[serde(rename_all = "camelCase")] pub struct CountAaMotifsCdsDesc { pub cds: String, diff --git a/packages/nextclade/src/io/dataset.rs b/packages/nextclade/src/io/dataset.rs index 96f03a1af..1838a53f9 100644 --- a/packages/nextclade/src/io/dataset.rs +++ b/packages/nextclade/src/io/dataset.rs @@ -328,7 +328,7 @@ impl DatasetMeta { } } -#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] +#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, JsonSchema)] #[serde(rename_all = "camelCase")] pub struct DatasetFiles { pub reference: String, diff --git a/packages/nextclade/src/qc/qc_config.rs b/packages/nextclade/src/qc/qc_config.rs index b30b7727a..f043c6aeb 100644 --- a/packages/nextclade/src/qc/qc_config.rs +++ b/packages/nextclade/src/qc/qc_config.rs @@ -2,21 +2,22 @@ use crate::coord::range::AaRefRange; use crate::io::fs::read_file_to_string; use crate::io::json::json_parse; use eyre::{Report, WrapErr}; +use ordered_float::OrderedFloat; use serde::{Deserialize, Serialize}; use std::path::Path; use std::str::FromStr; use validator::Validate; -#[derive(Debug, Default, Clone, Serialize, Deserialize, schemars::JsonSchema, Validate)] +#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema, Validate)] #[serde(rename_all = "camelCase")] #[serde(default)] pub struct QcRulesConfigMissingData { pub enabled: bool, - pub missing_data_threshold: f64, - pub score_bias: f64, + pub missing_data_threshold: OrderedFloat, + pub score_bias: OrderedFloat, } -#[derive(Debug, Default, Clone, Serialize, Deserialize, schemars::JsonSchema, Validate)] +#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema, Validate)] #[serde(rename_all = "camelCase")] #[serde(default)] pub struct QcRulesConfigMixedSites { @@ -24,49 +25,49 @@ pub struct QcRulesConfigMixedSites { pub mixed_sites_threshold: usize, } -#[derive(Debug, Default, Clone, Serialize, Deserialize, schemars::JsonSchema, Validate)] +#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema, Validate)] #[serde(rename_all = "camelCase")] #[serde(default)] pub struct QcRulesConfigPrivateMutations { pub enabled: bool, #[serde(default = "one")] - pub weight_reversion_substitutions: f64, + pub weight_reversion_substitutions: OrderedFloat, #[serde(default = "one")] - pub weight_reversion_deletions: f64, + pub weight_reversion_deletions: OrderedFloat, #[serde(default = "one")] - pub weight_labeled_substitutions: f64, + pub weight_labeled_substitutions: OrderedFloat, #[serde(default = "one")] - pub weight_labeled_deletions: f64, + pub weight_labeled_deletions: OrderedFloat, #[serde(default = "one")] - pub weight_unlabeled_substitutions: f64, + pub weight_unlabeled_substitutions: OrderedFloat, #[serde(default = "one")] - pub weight_unlabeled_deletions: f64, + pub weight_unlabeled_deletions: OrderedFloat, - pub typical: f64, - pub cutoff: f64, + pub typical: OrderedFloat, + pub cutoff: OrderedFloat, } -const fn one() -> f64 { - 1.0 +const fn one() -> OrderedFloat { + OrderedFloat(1.0) } -#[derive(Debug, Default, Clone, Serialize, Deserialize, schemars::JsonSchema, Validate)] +#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema, Validate)] #[serde(rename_all = "camelCase")] #[serde(default)] pub struct QcRulesConfigSnpClusters { pub enabled: bool, pub window_size: usize, pub cluster_cut_off: usize, - pub score_weight: f64, + pub score_weight: OrderedFloat, } -#[derive(Debug, Default, Clone, Serialize, Deserialize, schemars::JsonSchema, Validate)] +#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema, Validate)] #[serde(rename_all = "camelCase")] #[serde(default)] pub struct FrameShiftLocation { @@ -74,14 +75,14 @@ pub struct FrameShiftLocation { pub codon_range: AaRefRange, } -#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema, Validate)] +#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema, Validate)] #[serde(rename_all = "camelCase")] #[serde(default)] pub struct QcRulesConfigFrameShifts { pub enabled: bool, #[serde(default, skip_serializing_if = "Vec::is_empty")] pub ignored_frame_shifts: Vec, - pub score_weight: f64, + pub score_weight: OrderedFloat, } impl Default for QcRulesConfigFrameShifts { @@ -89,7 +90,7 @@ impl Default for QcRulesConfigFrameShifts { Self { enabled: false, ignored_frame_shifts: vec![], - score_weight: 75.0, + score_weight: OrderedFloat(75.0), } } } @@ -101,14 +102,14 @@ pub struct StopCodonLocation { pub codon: usize, } -#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema, Validate)] +#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema, Validate)] #[serde(rename_all = "camelCase")] #[serde(default)] pub struct QcRulesConfigStopCodons { pub enabled: bool, #[serde(default, skip_serializing_if = "Vec::is_empty")] pub ignored_stop_codons: Vec, - pub score_weight: f64, + pub score_weight: OrderedFloat, } impl Default for QcRulesConfigStopCodons { @@ -116,12 +117,12 @@ impl Default for QcRulesConfigStopCodons { Self { enabled: false, ignored_stop_codons: vec![], - score_weight: 75.0, + score_weight: OrderedFloat(75.0), } } } -#[derive(Debug, Default, Clone, Serialize, Deserialize, schemars::JsonSchema, Validate)] +#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema, Validate)] #[serde(rename_all = "camelCase")] #[serde(default)] pub struct QcConfig { diff --git a/packages/nextclade/src/qc/qc_rule_frame_shifts.rs b/packages/nextclade/src/qc/qc_rule_frame_shifts.rs index b89c4ac5b..262b8b653 100644 --- a/packages/nextclade/src/qc/qc_rule_frame_shifts.rs +++ b/packages/nextclade/src/qc/qc_rule_frame_shifts.rs @@ -45,7 +45,7 @@ pub fn rule_frame_shifts( let total_frame_shifts = frame_shifts.len(); let total_frame_shifts_ignored = frame_shifts_ignored.len(); - let score = total_frame_shifts as f64 * config.score_weight; + let score = total_frame_shifts as f64 * *config.score_weight; let status = QcStatus::from_score(score); Some(QcResultFrameShifts { diff --git a/packages/nextclade/src/qc/qc_rule_missing_data.rs b/packages/nextclade/src/qc/qc_rule_missing_data.rs index 0b4e0af5b..3759dc4c0 100644 --- a/packages/nextclade/src/qc/qc_rule_missing_data.rs +++ b/packages/nextclade/src/qc/qc_rule_missing_data.rs @@ -24,7 +24,7 @@ pub fn rule_missing_data(total_missing: usize, config: &QcRulesConfigMissingData } let score = clamp_min( - ((total_missing as f64 - config.score_bias) * 100.0) / config.missing_data_threshold, + ((total_missing as f64 - *config.score_bias) * 100.0) / *config.missing_data_threshold, 0.0, ); let status = QcStatus::from_score(score); @@ -33,6 +33,6 @@ pub fn rule_missing_data(total_missing: usize, config: &QcRulesConfigMissingData score, status, total_missing, - missing_data_threshold: config.missing_data_threshold + config.score_bias, + missing_data_threshold: *config.missing_data_threshold + *config.score_bias, }) } diff --git a/packages/nextclade/src/qc/qc_rule_private_mutations.rs b/packages/nextclade/src/qc/qc_rule_private_mutations.rs index b8f31c30c..22ac5aee6 100644 --- a/packages/nextclade/src/qc/qc_rule_private_mutations.rs +++ b/packages/nextclade/src/qc/qc_rule_private_mutations.rs @@ -46,13 +46,13 @@ pub fn rule_private_mutations( let total_deletion_ranges = deletion_ranges.len(); let weighted_total = 0.0 - + config.weight_reversion_substitutions * num_reversion_substitutions as f64 - + config.weight_labeled_substitutions * num_labeled_substitutions as f64 - + config.weight_unlabeled_substitutions * num_unlabeled_substitutions as f64 + + *config.weight_reversion_substitutions * num_reversion_substitutions as f64 + + *config.weight_labeled_substitutions * num_labeled_substitutions as f64 + + *config.weight_unlabeled_substitutions * num_unlabeled_substitutions as f64 + total_deletion_ranges as f64; // the score hits 100 if the excess mutations equals the cutoff value - let score = (clamp_min(weighted_total - config.typical, 0.0) * 100.0) / config.cutoff; + let score = (clamp_min(weighted_total - *config.typical, 0.0) * 100.0) / *config.cutoff; let status = QcStatus::from_score(score); Some(QcResultPrivateMutations { @@ -63,8 +63,8 @@ pub fn rule_private_mutations( num_unlabeled_substitutions, total_deletion_ranges, weighted_total, - excess: weighted_total - config.typical, - cutoff: config.cutoff, + excess: weighted_total - *config.typical, + cutoff: *config.cutoff, }) } diff --git a/packages/nextclade/src/qc/qc_rule_snp_clusters.rs b/packages/nextclade/src/qc/qc_rule_snp_clusters.rs index ca5adb6e7..f174c0131 100644 --- a/packages/nextclade/src/qc/qc_rule_snp_clusters.rs +++ b/packages/nextclade/src/qc/qc_rule_snp_clusters.rs @@ -52,7 +52,7 @@ pub fn rule_snp_clusters( let clustered_snps = process_snp_clusters(snp_clusters); let total_snps = clustered_snps.iter().map(|cluster| cluster.number_of_snps).sum(); - let score = clamp_min(total_clusters as f64 * config.score_weight, 0.0); + let score = clamp_min(total_clusters as f64 * *config.score_weight, 0.0); let status = QcStatus::from_score(score); Some(QcResultSnpClusters { diff --git a/packages/nextclade/src/qc/qc_rule_stop_codons.rs b/packages/nextclade/src/qc/qc_rule_stop_codons.rs index 0299596c4..32e14d0e6 100644 --- a/packages/nextclade/src/qc/qc_rule_stop_codons.rs +++ b/packages/nextclade/src/qc/qc_rule_stop_codons.rs @@ -50,7 +50,7 @@ pub fn rule_stop_codons(translation: &Translation, config: &QcRulesConfigStopCod let total_stop_codons = stop_codons.len(); let total_stop_codons_ignored = stop_codons_ignored.len(); - let score = total_stop_codons as f64 * config.score_weight; + let score = total_stop_codons as f64 * *config.score_weight; let status = QcStatus::from_score(score); Some(QcResultStopCodons { diff --git a/packages/nextclade/src/run/params_general.rs b/packages/nextclade/src/run/params_general.rs index fb3d19805..65685d9fa 100644 --- a/packages/nextclade/src/run/params_general.rs +++ b/packages/nextclade/src/run/params_general.rs @@ -4,7 +4,7 @@ use serde::{Deserialize, Serialize}; #[allow(clippy::struct_excessive_bools)] #[optfield(pub NextcladeGeneralParamsOptional, attrs, doc, field_attrs, field_doc, merge_fn = pub)] -#[derive(Parser, Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)] +#[derive(Parser, Debug, Clone, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] #[serde(rename_all = "camelCase")] pub struct NextcladeGeneralParams { /// Whether to include aligned reference nucleotide sequence into output nucleotide sequence FASTA file and reference peptides into output peptide FASTA files. diff --git a/packages/nextclade/src/tree/params.rs b/packages/nextclade/src/tree/params.rs index d751d2324..e797aac7b 100644 --- a/packages/nextclade/src/tree/params.rs +++ b/packages/nextclade/src/tree/params.rs @@ -1,5 +1,6 @@ use clap::Parser; use optfield::optfield; +use ordered_float::OrderedFloat; use serde::{Deserialize, Serialize}; // NOTE: The `optfield` attribute creates a struct that have the same fields, but which are wrapped into `Option`, @@ -7,7 +8,7 @@ use serde::{Deserialize, Serialize}; // into self (mutably). #[allow(clippy::struct_excessive_bools)] #[optfield(pub TreeBuilderParamsOptional, attrs, doc, field_attrs, field_doc, merge_fn = pub)] -#[derive(Parser, Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)] +#[derive(Parser, Clone, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)] #[serde(rename_all = "camelCase")] pub struct TreeBuilderParams { /// Disable greedy tree builder algorithm @@ -16,7 +17,7 @@ pub struct TreeBuilderParams { pub without_greedy_tree_builder: bool, #[clap(long)] - pub masked_muts_weight: f64, + pub masked_muts_weight: OrderedFloat, } #[allow(clippy::derivable_impls)] @@ -24,7 +25,7 @@ impl Default for TreeBuilderParams { fn default() -> Self { Self { without_greedy_tree_builder: false, - masked_muts_weight: 0.05, + masked_muts_weight: OrderedFloat(0.05), } } } diff --git a/packages/nextclade/src/tree/tree.rs b/packages/nextclade/src/tree/tree.rs index 5a68e6003..bdbd824ea 100644 --- a/packages/nextclade/src/tree/tree.rs +++ b/packages/nextclade/src/tree/tree.rs @@ -1,6 +1,7 @@ use crate::alphabet::aa::Aa; use crate::alphabet::nuc::Nuc; use crate::analyze::find_private_nuc_mutations::BranchMutations; +use crate::analyze::virus_properties::VirusProperties; use crate::coord::position::{AaRefPosition, NucRefGlobalPosition}; use crate::coord::range::NucRefGlobalRange; use crate::gene::gene::GeneStrand; @@ -321,7 +322,7 @@ pub struct CladeNodeAttrKeyDesc { pub other: serde_json::Value, } -#[derive(Clone, Default, Serialize, Deserialize, Eq, PartialEq, schemars::JsonSchema, Validate, Debug)] +#[derive(Clone, Default, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema, Validate, Debug)] pub struct AuspiceMetaExtensionsNextclade { #[serde(default, skip_serializing_if = "Vec::is_empty")] pub clade_node_attrs: Vec, @@ -329,6 +330,9 @@ pub struct AuspiceMetaExtensionsNextclade { #[serde(default, skip_serializing_if = "Vec::is_empty")] pub placement_mask_ranges: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + pub pathogen: Option, + #[serde(flatten)] pub other: serde_json::Value, } diff --git a/packages/nextclade/src/utils/any.rs b/packages/nextclade/src/utils/any.rs index 8ed141b27..b96ec432c 100644 --- a/packages/nextclade/src/utils/any.rs +++ b/packages/nextclade/src/utils/any.rs @@ -1,17 +1,21 @@ use crate::io::json::{json_stringify, JsonPretty}; +use crate::make_error; use eyre::{eyre, Report}; +use ordered_float::OrderedFloat; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +use serde_json::Value; use std::collections::BTreeMap; use std::fmt::{Display, Formatter}; +use std::str::FromStr; -#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] +#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, JsonSchema)] #[serde(rename_all = "camelCase")] #[serde(untagged)] pub enum AnyType { String(String), Int(isize), - Float(f64), + Float(OrderedFloat), Bool(bool), Array(Vec), Object(BTreeMap), @@ -51,7 +55,7 @@ impl AnyType { pub const fn as_float_maybe(&self) -> Option { match &self { - AnyType::Float(x) => Some(*x), + AnyType::Float(x) => Some(x.0), _ => None, } } @@ -79,3 +83,41 @@ impl AnyType { self.as_bool_maybe().ok_or(eyre!("Cannot parse value as bool")) } } + +impl FromStr for AnyType { + type Err = Report; + + fn from_str(s: &str) -> Result { + let value: Value = match serde_json::from_str(s) { + Ok(v) => v, + Err(err) => return make_error!("Failed to parse JSON: {err}"), + }; + + match value { + Value::String(s) => Ok(AnyType::String(s)), + Value::Number(n) => { + if let Some(int_val) = n.as_i64() { + Ok(AnyType::Int(int_val as isize)) + } else { + Ok(AnyType::Float(OrderedFloat(n.as_f64().unwrap()))) + } + } + Value::Bool(b) => Ok(AnyType::Bool(b)), + Value::Array(arr) => { + let mut parsed_array = Vec::new(); + for val in arr { + parsed_array.push(AnyType::from_str(&val.to_string())?); + } + Ok(AnyType::Array(parsed_array)) + } + Value::Object(obj) => { + let mut parsed_object = BTreeMap::new(); + for (key, val) in obj { + parsed_object.insert(key, AnyType::from_str(&val.to_string())?); + } + Ok(AnyType::Object(parsed_object)) + } + Value::Null => Ok(AnyType::Null), + } + } +} From 4334f32ba4f8bdb675addf0a27b1dc93a1f44fae Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Thu, 16 May 2024 14:38:20 +0200 Subject: [PATCH 03/18] feat: use Auspice JSON as dataset This allows to pass a path to Auspice JSON v2 to `--input-dataset` CLI argument. In this case we attempt to read not only tree, but also ref sequence, genome annotation and pathogen properties from that file, rather than from a conventional dataset. --- .../src/dataset/dataset_download.rs | 74 ++++++++++++++-- .../nextclade/src/gene/auspice_annotations.rs | 84 +++++++++++++++++++ packages/nextclade/src/gene/gene_map.rs | 8 +- packages/nextclade/src/gene/mod.rs | 1 + packages/nextclade/src/tree/tree.rs | 4 +- 5 files changed, 163 insertions(+), 8 deletions(-) create mode 100644 packages/nextclade/src/gene/auspice_annotations.rs diff --git a/packages/nextclade-cli/src/dataset/dataset_download.rs b/packages/nextclade-cli/src/dataset/dataset_download.rs index 8736e0c02..e5a099a99 100644 --- a/packages/nextclade-cli/src/dataset/dataset_download.rs +++ b/packages/nextclade-cli/src/dataset/dataset_download.rs @@ -5,19 +5,20 @@ use color_eyre::{Section, SectionExt}; use eyre::{eyre, ContextCompat, Report, WrapErr}; use itertools::Itertools; use log::{warn, LevelFilter}; -use nextclade::analyze::virus_properties::{LabelledMutationsConfig, VirusProperties}; +use nextclade::analyze::virus_properties::VirusProperties; use nextclade::gene::gene_map::{filter_gene_map, GeneMap}; -use nextclade::io::dataset::{Dataset, DatasetFiles, DatasetMeta, DatasetsIndexJson}; -use nextclade::io::fasta::{read_one_fasta, read_one_fasta_str}; +use nextclade::io::dataset::{Dataset, DatasetsIndexJson}; +use nextclade::io::fasta::{read_one_fasta, read_one_fasta_str, FastaRecord}; use nextclade::io::file::create_file_or_stdout; use nextclade::io::fs::{ensure_dir, has_extension, read_file_to_string}; use nextclade::run::nextclade_wasm::NextcladeParams; use nextclade::tree::tree::AuspiceTree; +use nextclade::utils::any::AnyType; use nextclade::utils::fs::list_files_recursive; use nextclade::utils::option::OptionMapRefFallible; use nextclade::utils::string::{format_list, surround_with_quotes, Indent}; use nextclade::{make_error, make_internal_error, o}; -use std::collections::{BTreeMap, BTreeSet}; +use std::collections::BTreeSet; use std::fs::File; use std::io::{BufReader, Cursor, Read, Seek, Write}; use std::ops::Deref; @@ -35,13 +36,16 @@ pub fn nextclade_get_inputs( if input_dataset.is_file() && has_extension(input_dataset, "zip") { dataset_zip_load(run_args, input_dataset, cdses) .wrap_err_with(|| format!("When loading dataset from {input_dataset:#?}")) + } else if input_dataset.is_file() && has_extension(input_dataset, "json") { + dataset_json_load(run_args, input_dataset, cdses) + .wrap_err_with(|| format!("When loading dataset from {input_dataset:#?}")) } else if input_dataset.is_dir() { dataset_dir_load(run_args, input_dataset, cdses) .wrap_err_with(|| format!("When loading dataset from {input_dataset:#?}")) } else { make_error!( "--input-dataset: path is invalid. \ - Expected a directory path or a zip archive file path, but got: '{input_dataset:#?}'" + Expected a directory path, a zip file path or json file path, but got: '{input_dataset:#?}'" ) } } else { @@ -283,6 +287,66 @@ pub fn dataset_dir_load( }) } +pub fn dataset_json_load( + run_args: &NextcladeRunArgs, + dataset_json: impl AsRef, + cdses: &Option>, +) -> Result { + let dataset_json = dataset_json.as_ref(); + + // let NextcladeRunInputArgs { + // input_ref, + // input_tree, + // input_pathogen_json, + // input_annotation, + // .. + // } = &run_args.inputs; + + let auspice_json = AuspiceTree::from_path(dataset_json).wrap_err("When reading Auspice JSON v2")?; + + let virus_properties = auspice_json + .meta + .extensions + .nextclade + .pathogen + .as_ref() + .cloned() + .unwrap_or_default(); + + let ref_record = { + let ref_name = virus_properties + .attributes + .get("reference name") + .cloned() + .unwrap_or_else(|| AnyType::String("reference".to_owned())) + .as_str() + .wrap_err("When reading Auspice JSON v2 `.meta.extensions.nextclade.pathogen.attributes[\"reference name\"]`")? + .to_owned(); + + let ref_seq = auspice_json.root_sequence.get("nuc") + .ok_or_else(|| eyre!("Auspice JSON v2 is used as input dataset, but does not contain required reference sequence field (.root_sequence.nuc)"))?.to_owned(); + + FastaRecord { + index: 0, + seq_name: ref_name, + seq: ref_seq, + } + }; + + let gene_map = auspice_json + .meta + .genome_annotations + .map_ref_fallible(GeneMap::from_auspice_annotations)? + .unwrap_or_default(); + + Ok(NextcladeParams { + ref_record, + gene_map, + tree: Some(auspice_json), + virus_properties, + }) +} + pub fn dataset_individual_files_load( run_args: &NextcladeRunArgs, cdses: &Option>, diff --git a/packages/nextclade/src/gene/auspice_annotations.rs b/packages/nextclade/src/gene/auspice_annotations.rs new file mode 100644 index 000000000..9c45e830c --- /dev/null +++ b/packages/nextclade/src/gene/auspice_annotations.rs @@ -0,0 +1,84 @@ +use crate::coord::range::{NucRefGlobalRange, NucRefLocalRange}; +use crate::gene::cds::Cds; +use crate::gene::cds_segment::{CdsSegment, WrappingPart}; +use crate::gene::frame::Frame; +use crate::gene::gene::Gene; +use crate::gene::phase::Phase; +use crate::tree::tree::{AuspiceGenomeAnnotations, Segments, StartEnd}; +use eyre::Report; +use std::collections::HashMap; + +pub fn convert_auspice_annotations_to_genes(anns: &AuspiceGenomeAnnotations) -> Result, Report> { + anns + .cdses + .iter() + .enumerate() + .map(|(index, (cds_name, ann))| { + let gene_name = ann.gene.as_ref().cloned().unwrap_or_else(|| format!("gene_{index}")); + + let segments = match &ann.segments { + Segments::OneSegment(StartEnd { start, end }) => vec![CdsSegment { + index, + id: cds_name.to_owned(), + name: cds_name.to_owned(), + range: NucRefGlobalRange::from_isize(*start, *end), + range_local: NucRefLocalRange::from_isize(0, *end - *start), + landmark: None, + wrapping_part: WrappingPart::NonWrapping, + strand: ann.strand, + frame: Frame::_0, + phase: Phase::_0, + exceptions: vec![], + attributes: HashMap::default(), + source_record: None, + compat_is_gene: false, + color: None, + }], + Segments::MultipleSegments { segments } => segments + .iter() + .map(|StartEnd { start, end }| CdsSegment { + index, + id: cds_name.to_owned(), + name: cds_name.to_owned(), + range: NucRefGlobalRange::from_isize(*start, *end), + range_local: NucRefLocalRange::from_isize(0, *end - *start), + landmark: None, + wrapping_part: WrappingPart::NonWrapping, + strand: ann.strand, + frame: Frame::_0, + phase: Phase::_0, + exceptions: vec![], + attributes: HashMap::default(), + source_record: None, + compat_is_gene: false, + color: None, + }) + .collect(), + }; + + let cds = Cds { + id: cds_name.to_owned(), + name: cds_name.to_owned(), + product: cds_name.to_owned(), + segments, + proteins: vec![], + exceptions: vec![], + attributes: HashMap::default(), + compat_is_gene: true, + color: ann.color.clone(), + }; + + Ok(Gene { + index, + id: gene_name.clone(), + name: gene_name, + cdses: vec![cds], + exceptions: vec![], + attributes: HashMap::default(), + source_record: None, + compat_is_cds: true, + color: ann.color.clone(), + }) + }) + .collect() +} diff --git a/packages/nextclade/src/gene/gene_map.rs b/packages/nextclade/src/gene/gene_map.rs index ad6dbea74..a39493a64 100644 --- a/packages/nextclade/src/gene/gene_map.rs +++ b/packages/nextclade/src/gene/gene_map.rs @@ -1,11 +1,13 @@ use crate::features::feature_group::FeatureGroup; use crate::features::feature_tree::FeatureTree; use crate::features::sequence_region::SequenceRegion; +use crate::gene::auspice_annotations::convert_auspice_annotations_to_genes; use crate::gene::cds::Cds; use crate::gene::cds_segment::CdsSegment; use crate::gene::gene::{find_cdses, Gene}; use crate::io::file::open_file_or_stdin; use crate::io::yaml::yaml_parse; +use crate::tree::tree::AuspiceGenomeAnnotations; use crate::utils::collections::take_exactly_one; use crate::utils::error::report_to_string; use crate::{make_error, make_internal_report}; @@ -15,7 +17,6 @@ use log::warn; use num::Integer; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; - use std::path::Path; #[derive(Clone, Debug, Default, Deserialize, Serialize, JsonSchema)] @@ -37,6 +38,11 @@ impl GeneMap { convert_feature_tree_to_gene_map(feature_tree) } + pub fn from_auspice_annotations(anns: &AuspiceGenomeAnnotations) -> Result { + let genes = convert_auspice_annotations_to_genes(anns)?; + Ok(GeneMap::from_genes(genes)) + } + pub fn from_path>(filename: P) -> Result { let filename = filename.as_ref(); let mut file = open_file_or_stdin(&Some(filename))?; diff --git a/packages/nextclade/src/gene/mod.rs b/packages/nextclade/src/gene/mod.rs index f0692e515..f77b1a201 100644 --- a/packages/nextclade/src/gene/mod.rs +++ b/packages/nextclade/src/gene/mod.rs @@ -1,3 +1,4 @@ +pub mod auspice_annotations; pub mod cds; pub mod cds_segment; pub mod frame; diff --git a/packages/nextclade/src/tree/tree.rs b/packages/nextclade/src/tree/tree.rs index bdbd824ea..20ce67d02 100644 --- a/packages/nextclade/src/tree/tree.rs +++ b/packages/nextclade/src/tree/tree.rs @@ -404,8 +404,8 @@ pub struct AuspiceGenomeAnnotationNuc { #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] pub struct StartEnd { - pub start: usize, - pub end: usize, + pub start: isize, + pub end: isize, } #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] From b843ada325e5f2d516d69756f090e3c78b0ea0ad Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Thu, 16 May 2024 17:03:11 +0200 Subject: [PATCH 04/18] fix: parsing auspice genome annotations --- .../nextclade/src/gene/auspice_annotations.rs | 109 +++++++++++------- packages/nextclade/src/gene/cds.rs | 3 +- packages/nextclade/src/tree/tree.rs | 10 ++ 3 files changed, 76 insertions(+), 46 deletions(-) diff --git a/packages/nextclade/src/gene/auspice_annotations.rs b/packages/nextclade/src/gene/auspice_annotations.rs index 9c45e830c..4176bd938 100644 --- a/packages/nextclade/src/gene/auspice_annotations.rs +++ b/packages/nextclade/src/gene/auspice_annotations.rs @@ -1,60 +1,37 @@ -use crate::coord::range::{NucRefGlobalRange, NucRefLocalRange}; -use crate::gene::cds::Cds; +use crate::coord::range::{NucRefGlobalRange, Range}; +use crate::features::feature::Landmark; +use crate::gene::cds::{split_circular_cds_segments, Cds}; use crate::gene::cds_segment::{CdsSegment, WrappingPart}; use crate::gene::frame::Frame; use crate::gene::gene::Gene; use crate::gene::phase::Phase; -use crate::tree::tree::{AuspiceGenomeAnnotations, Segments, StartEnd}; +use crate::io::json::{json_stringify, JsonPretty}; +use crate::tree::tree::{AuspiceGenomeAnnotationCds, AuspiceGenomeAnnotations, Segments, StartEnd}; use eyre::Report; +use maplit::hashmap; use std::collections::HashMap; pub fn convert_auspice_annotations_to_genes(anns: &AuspiceGenomeAnnotations) -> Result, Report> { + let landmark = Landmark { + index: 0, + id: "landmark".to_owned(), + name: "landmark".to_owned(), + range: NucRefGlobalRange::from_isize(anns.nuc.start, anns.nuc.end), + strand: anns.nuc.strand, + is_circular: true, + }; + anns .cdses .iter() .enumerate() .map(|(index, (cds_name, ann))| { - let gene_name = ann.gene.as_ref().cloned().unwrap_or_else(|| format!("gene_{index}")); + let gene_name = ann.gene.as_ref().unwrap_or(cds_name); let segments = match &ann.segments { - Segments::OneSegment(StartEnd { start, end }) => vec![CdsSegment { - index, - id: cds_name.to_owned(), - name: cds_name.to_owned(), - range: NucRefGlobalRange::from_isize(*start, *end), - range_local: NucRefLocalRange::from_isize(0, *end - *start), - landmark: None, - wrapping_part: WrappingPart::NonWrapping, - strand: ann.strand, - frame: Frame::_0, - phase: Phase::_0, - exceptions: vec![], - attributes: HashMap::default(), - source_record: None, - compat_is_gene: false, - color: None, - }], - Segments::MultipleSegments { segments } => segments - .iter() - .map(|StartEnd { start, end }| CdsSegment { - index, - id: cds_name.to_owned(), - name: cds_name.to_owned(), - range: NucRefGlobalRange::from_isize(*start, *end), - range_local: NucRefLocalRange::from_isize(0, *end - *start), - landmark: None, - wrapping_part: WrappingPart::NonWrapping, - strand: ann.strand, - frame: Frame::_0, - phase: Phase::_0, - exceptions: vec![], - attributes: HashMap::default(), - source_record: None, - compat_is_gene: false, - color: None, - }) - .collect(), - }; + Segments::OneSegment(segment) => convert_cds_segments(ann, &landmark, cds_name, &[segment.to_owned()]), + Segments::MultipleSegments { segments } => convert_cds_segments(ann, &landmark, cds_name, segments), + }?; let cds = Cds { id: cds_name.to_owned(), @@ -70,8 +47,8 @@ pub fn convert_auspice_annotations_to_genes(anns: &AuspiceGenomeAnnotations) -> Ok(Gene { index, - id: gene_name.clone(), - name: gene_name, + id: gene_name.to_owned(), + name: gene_name.to_owned(), cdses: vec![cds], exceptions: vec![], attributes: HashMap::default(), @@ -82,3 +59,47 @@ pub fn convert_auspice_annotations_to_genes(anns: &AuspiceGenomeAnnotations) -> }) .collect() } + +fn convert_cds_segments( + ann: &AuspiceGenomeAnnotationCds, + landmark: &Landmark, + cds_name: &str, + ann_segments: &[StartEnd], +) -> Result, Report> { + let mut begin = 0; + let mut segments = vec![]; + + for (index, &StartEnd { start, end }) in ann_segments.iter().enumerate() { + let name = format!("{cds_name}_fragment_{index}"); + + let start = start.saturating_sub(1); + let range = NucRefGlobalRange::from_isize(start, end); + let range_local = Range::from_usize(begin, begin + range.len()); + let phase = Phase::from_begin(range_local.begin)?; + let frame = Frame::from_begin(range.begin)?; + + segments.push(CdsSegment { + index, + id: name.clone(), + name, + range: range.clone(), + range_local, + landmark: Some(landmark.to_owned()), + wrapping_part: WrappingPart::NonWrapping, + strand: ann.strand, + frame, + phase, + exceptions: vec![], + attributes: hashmap! {}, + source_record: Some(json_stringify(ann, JsonPretty(true))?), + compat_is_gene: false, + color: ann.color.clone(), + }); + + begin += range.len(); + } + + let segments = split_circular_cds_segments(&segments)?; + + Ok(segments) +} diff --git a/packages/nextclade/src/gene/cds.rs b/packages/nextclade/src/gene/cds.rs index 62713df65..e10654145 100644 --- a/packages/nextclade/src/gene/cds.rs +++ b/packages/nextclade/src/gene/cds.rs @@ -14,7 +14,6 @@ use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use std::collections::HashMap; - #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] #[serde(rename_all = "camelCase")] pub struct Cds { @@ -199,7 +198,7 @@ impl Cds { /// - the part from segment start to landmark end, before the wrap around /// - (optionally) the middle parts spanning the entire sequence /// - the last part from landmark start to segment end -fn split_circular_cds_segments(segments: &[CdsSegment]) -> Result, Report> { +pub fn split_circular_cds_segments(segments: &[CdsSegment]) -> Result, Report> { let mut linear_segments = vec![]; for segment in segments { if let Some(landmark) = &segment.landmark { diff --git a/packages/nextclade/src/tree/tree.rs b/packages/nextclade/src/tree/tree.rs index 20ce67d02..2189d2a32 100644 --- a/packages/nextclade/src/tree/tree.rs +++ b/packages/nextclade/src/tree/tree.rs @@ -398,6 +398,16 @@ impl AuspiceDisplayDefaults { #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] pub struct AuspiceGenomeAnnotationNuc { + pub start: isize, + + pub end: isize, + + #[serde(default)] + pub strand: GeneStrand, + + #[serde(rename = "type", default, skip_serializing_if = "Option::is_none")] + pub r#type: Option, + #[serde(flatten)] pub other: serde_json::Value, } From ff7e887cfb1849bfe7f891947373aa127b3e923d Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Fri, 17 May 2024 09:35:40 +0200 Subject: [PATCH 05/18] fix: off-by-one in landmark range --- packages/nextclade/src/gene/auspice_annotations.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/packages/nextclade/src/gene/auspice_annotations.rs b/packages/nextclade/src/gene/auspice_annotations.rs index 4176bd938..6dc2ddb32 100644 --- a/packages/nextclade/src/gene/auspice_annotations.rs +++ b/packages/nextclade/src/gene/auspice_annotations.rs @@ -16,7 +16,7 @@ pub fn convert_auspice_annotations_to_genes(anns: &AuspiceGenomeAnnotations) -> index: 0, id: "landmark".to_owned(), name: "landmark".to_owned(), - range: NucRefGlobalRange::from_isize(anns.nuc.start, anns.nuc.end), + range: NucRefGlobalRange::from_isize(anns.nuc.start.saturating_sub(1), anns.nuc.end), strand: anns.nuc.strand, is_circular: true, }; @@ -72,8 +72,7 @@ fn convert_cds_segments( for (index, &StartEnd { start, end }) in ann_segments.iter().enumerate() { let name = format!("{cds_name}_fragment_{index}"); - let start = start.saturating_sub(1); - let range = NucRefGlobalRange::from_isize(start, end); + let range = NucRefGlobalRange::from_isize(start.saturating_sub(1), end); let range_local = Range::from_usize(begin, begin + range.len()); let phase = Phase::from_begin(range_local.begin)?; let frame = Frame::from_begin(range.begin)?; From 9b952bf1a7a0bf369ac4f3adb069e1c41735ca8d Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Fri, 17 May 2024 10:42:31 +0200 Subject: [PATCH 06/18] fix: duplicated start and end fields in the annotation of output tree --- packages/nextclade/src/gene/auspice_annotations.rs | 4 ++-- packages/nextclade/src/tree/tree.rs | 13 +++++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/packages/nextclade/src/gene/auspice_annotations.rs b/packages/nextclade/src/gene/auspice_annotations.rs index 6dc2ddb32..6fb601f85 100644 --- a/packages/nextclade/src/gene/auspice_annotations.rs +++ b/packages/nextclade/src/gene/auspice_annotations.rs @@ -30,7 +30,7 @@ pub fn convert_auspice_annotations_to_genes(anns: &AuspiceGenomeAnnotations) -> let segments = match &ann.segments { Segments::OneSegment(segment) => convert_cds_segments(ann, &landmark, cds_name, &[segment.to_owned()]), - Segments::MultipleSegments { segments } => convert_cds_segments(ann, &landmark, cds_name, segments), + Segments::MultipleSegments { segments, .. } => convert_cds_segments(ann, &landmark, cds_name, segments), }?; let cds = Cds { @@ -69,7 +69,7 @@ fn convert_cds_segments( let mut begin = 0; let mut segments = vec![]; - for (index, &StartEnd { start, end }) in ann_segments.iter().enumerate() { + for (index, &StartEnd { start, end, .. }) in ann_segments.iter().enumerate() { let name = format!("{cds_name}_fragment_{index}"); let range = NucRefGlobalRange::from_isize(start.saturating_sub(1), end); diff --git a/packages/nextclade/src/tree/tree.rs b/packages/nextclade/src/tree/tree.rs index 2189d2a32..6f5ce5e36 100644 --- a/packages/nextclade/src/tree/tree.rs +++ b/packages/nextclade/src/tree/tree.rs @@ -416,13 +416,21 @@ pub struct AuspiceGenomeAnnotationNuc { pub struct StartEnd { pub start: isize, pub end: isize, + + #[serde(flatten)] + pub other: serde_json::Value, } #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] #[serde(untagged)] pub enum Segments { OneSegment(StartEnd), - MultipleSegments { segments: Vec }, + MultipleSegments { + segments: Vec, + + #[serde(flatten)] + other: serde_json::Value, + }, } #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] @@ -447,9 +455,6 @@ pub struct AuspiceGenomeAnnotationCds { #[serde(flatten)] pub segments: Segments, - - #[serde(flatten)] - pub other: serde_json::Value, } #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] From 48d163c01ec8af5602624b4681d79b2843aba5ea Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Fri, 17 May 2024 12:32:13 +0200 Subject: [PATCH 07/18] feat: accept Auspice JSON genome annotation in read-annotation command --- packages/nextclade/src/gene/gene_map.rs | 59 ++++++++++++++++++------- packages/nextclade/src/tree/tree.rs | 13 +++++- 2 files changed, 55 insertions(+), 17 deletions(-) diff --git a/packages/nextclade/src/gene/gene_map.rs b/packages/nextclade/src/gene/gene_map.rs index a39493a64..f98077a37 100644 --- a/packages/nextclade/src/gene/gene_map.rs +++ b/packages/nextclade/src/gene/gene_map.rs @@ -10,6 +10,7 @@ use crate::io::yaml::yaml_parse; use crate::tree::tree::AuspiceGenomeAnnotations; use crate::utils::collections::take_exactly_one; use crate::utils::error::report_to_string; +use crate::utils::string::{format_list, Indent}; use crate::{make_error, make_internal_report}; use eyre::{eyre, Report, WrapErr}; use itertools::Itertools; @@ -19,6 +20,8 @@ use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use std::path::Path; +type GeneMapParserFn = Box Result>; + #[derive(Clone, Debug, Default, Deserialize, Serialize, JsonSchema)] #[must_use] pub struct GeneMap { @@ -51,25 +54,44 @@ impl GeneMap { Self::from_str(String::from_utf8(buf)?).wrap_err_with(|| eyre!("When reading file: {filename:?}")) } - // TODO: rename this function, because it handles more than GFF3 pub fn from_str(content: impl AsRef) -> Result { let content = content.as_ref(); - let gene_map_yaml: Result = Self::from_yaml_str(content); - let gene_map_gff: Result = Self::from_gff3_str(content); - - let gene_map = match (gene_map_yaml, gene_map_gff) { - (Err(json_err), Err(gff_err)) => { - return make_error!("Attempted to parse the genome annotation as JSON and as GFF, but both attempts failed:\nJSON error: {}\n\nGFF3 error: {}\n", - report_to_string(&json_err), - report_to_string(&gff_err), - ) - }, - (Ok(gene_map), _) => gene_map, - (_, Ok(gene_map)) => gene_map, - }; - gene_map.validate()?; - Ok(gene_map) + let parsers: Vec<(&str, GeneMapParserFn)> = vec![ + ( + "Genome annotation in GFF3 format", + Box::new(|content| Self::from_gff3_str(content)), + ), + ( + "Genome annotation in external JSON format", + Box::new(|content| Self::from_yaml_str(content)), + ), + ( + "Genome annotation extracted from Auspice JSON", + Box::new(|content| Self::from_tree_json_str(content)), + ), + ]; + + let mut errors = Vec::new(); + for (name, parser) in &parsers { + match parser(content) { + Ok(map) => { + map.validate()?; + return Ok(map); + } + Err(err) => { + errors.push(format!( + "When attempted to parse as {name}: {}\n", + report_to_string(&err) + )); + } + } + } + + make_error!( + "Attempted to parse the genome annotation but failed. Tried multiple formats:\n\n{}\n", + format_list(Indent::default(), errors.into_iter()) + ) } fn from_yaml_str(content: impl AsRef) -> Result { @@ -80,6 +102,11 @@ impl GeneMap { Self::from_feature_tree(&FeatureTree::from_gff3_str(content.as_ref())?) } + fn from_tree_json_str(content: impl AsRef) -> Result { + let anns = AuspiceGenomeAnnotations::from_tree_json_str(content)?; + Self::from_auspice_annotations(&anns) + } + #[must_use] pub fn is_empty(&self) -> bool { self.genes.is_empty() diff --git a/packages/nextclade/src/tree/tree.rs b/packages/nextclade/src/tree/tree.rs index 6f5ce5e36..b3cfe0f31 100644 --- a/packages/nextclade/src/tree/tree.rs +++ b/packages/nextclade/src/tree/tree.rs @@ -11,7 +11,7 @@ use crate::graph::node::{GraphNode, Node}; use crate::graph::traits::{HasDivergence, HasName}; use crate::io::fs::read_file_to_string; use crate::io::json::json_parse; -use eyre::{Report, WrapErr}; +use eyre::{eyre, Report, WrapErr}; use schemars::JsonSchema; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use std::collections::BTreeMap; @@ -468,6 +468,17 @@ pub struct AuspiceGenomeAnnotations { pub other: serde_json::Value, } +impl AuspiceGenomeAnnotations { + pub fn from_tree_json_str(content: impl AsRef) -> Result { + let content = content.as_ref(); + let tree = AuspiceTree::from_str(content)?; + tree + .meta + .genome_annotations + .ok_or_else(|| eyre!("Auspice JSON does not contain `.genome_annotations` field, but required")) + } +} + #[derive(Clone, Serialize, Deserialize, schemars::JsonSchema, Validate, Debug)] pub struct AuspiceTreeMeta { #[serde(skip_serializing_if = "Option::is_none")] From 1fc493686abf3ace869dac8fa4dc9be38e7022ce Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Thu, 23 May 2024 12:37:10 +0200 Subject: [PATCH 08/18] refactor: aggregate inputs loading --- packages/nextclade-web/src/io/loadInputs.ts | 18 +++++++++++++++++ packages/nextclade-web/src/pages/_app.tsx | 22 ++++++++++----------- 2 files changed, 28 insertions(+), 12 deletions(-) create mode 100644 packages/nextclade-web/src/io/loadInputs.ts diff --git a/packages/nextclade-web/src/io/loadInputs.ts b/packages/nextclade-web/src/io/loadInputs.ts new file mode 100644 index 000000000..41072815b --- /dev/null +++ b/packages/nextclade-web/src/io/loadInputs.ts @@ -0,0 +1,18 @@ +import type { ParsedUrlQuery } from 'querystring' +import type { Dataset } from 'src/types' +import { createInputFastasFromUrlParam, createInputFromUrlParamMaybe } from 'src/io/createInputFromUrlParamMaybe' + +export async function loadInputs(urlQuery: ParsedUrlQuery, dataset?: Dataset) { + const inputFastas = await createInputFastasFromUrlParam(urlQuery, dataset) + const refSeq = await createInputFromUrlParamMaybe(urlQuery, 'input-ref') + const geneMap = await createInputFromUrlParamMaybe(urlQuery, 'input-annotation') + const refTree = await createInputFromUrlParamMaybe(urlQuery, 'input-tree') + const virusProperties = await createInputFromUrlParamMaybe(urlQuery, 'input-pathogen-json') + return { + inputFastas, + refSeq, + geneMap, + refTree, + virusProperties, + } +} diff --git a/packages/nextclade-web/src/pages/_app.tsx b/packages/nextclade-web/src/pages/_app.tsx index 7ca35a9fb..62407c086 100644 --- a/packages/nextclade-web/src/pages/_app.tsx +++ b/packages/nextclade-web/src/pages/_app.tsx @@ -11,7 +11,7 @@ import dynamic from 'next/dynamic' import { sanitizeError } from 'src/helpers/sanitizeError' import { useRunAnalysis } from 'src/hooks/useRunAnalysis' import i18nAuspice, { changeAuspiceLocale } from 'src/i18n/i18n.auspice' -import { createInputFastasFromUrlParam, createInputFromUrlParamMaybe } from 'src/io/createInputFromUrlParamMaybe' +import { loadInputs } from 'src/io/loadInputs' import { mdxComponents } from 'src/mdx-components' import LoadingPage from 'src/pages/loading' import { globalErrorAtom } from 'src/state/error.state' @@ -37,7 +37,6 @@ import { I18nextProvider } from 'react-i18next' import { MDXProvider } from '@mdx-js/react' import { QueryClient, QueryClientConfig, QueryClientProvider } from 'react-query' import { ReactQueryDevtools } from 'react-query/devtools' - import { DOMAIN_STRIPPED } from 'src/constants' import { parseUrl } from 'src/helpers/parseUrl' import { getDatasetServerUrl, initializeDatasets } from 'src/io/fetchDatasets' @@ -122,19 +121,18 @@ export function RecoilStateInitializer() { return dataset }) .then(async (dataset) => { - const inputFastas = await createInputFastasFromUrlParam(urlQuery, dataset) + const { inputFastas, refSeq, geneMap, refTree, virusProperties } = await loadInputs(urlQuery, dataset) + + set(refSeqInputAtom, refSeq) + set(geneMapInputAtom, geneMap) + set(refTreeInputAtom, refTree) + set(virusPropertiesInputAtom, virusProperties) if (!isEmpty(inputFastas)) { set(qrySeqInputsStorageAtom, inputFastas) - } - - set(refSeqInputAtom, await createInputFromUrlParamMaybe(urlQuery, 'input-ref')) - set(geneMapInputAtom, await createInputFromUrlParamMaybe(urlQuery, 'input-annotation')) - set(refTreeInputAtom, await createInputFromUrlParamMaybe(urlQuery, 'input-tree')) - set(virusPropertiesInputAtom, await createInputFromUrlParamMaybe(urlQuery, 'input-pathogen-json')) - - if (!isEmpty(inputFastas) && !isEmpty(dataset)) { - run() + if (!isEmpty(dataset)) { + run() + } } return undefined From a27ee661d4fceebf01be5db6ff238e1fbf4f3e9e Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Thu, 23 May 2024 15:45:50 +0200 Subject: [PATCH 09/18] feat(web): add url parameter`dataset-json-url` This allows to input Auspice JSON as Nextclade dataset to the web app. --- .../src/dataset/dataset_download.rs | 43 +------- .../src/components/Error/ErrorContent.tsx | 3 +- .../error-types/NextcladeV2ErrorContent.tsx | 2 +- .../nextclade-web/src/hooks/useRunAnalysis.ts | 56 ++++++++++- .../src/io/fetchSingleDataset.ts | 35 +++++-- .../src/io/fetchSingleDatasetAuspice.ts | 36 +++++++ ...mUrl.ts => fetchSingleDatasetDirectory.ts} | 71 ++++++++------ packages/nextclade-web/src/pages/_app.tsx | 8 +- .../nextclade-web/src/state/inputs.state.ts | 8 +- .../src/workers/launchAnalysis.ts | 43 +------- packages/nextclade/src/run/nextclade_wasm.rs | 98 ++++++++++++++++--- 11 files changed, 256 insertions(+), 147 deletions(-) create mode 100644 packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts rename packages/nextclade-web/src/io/{fetchSingleDatasetFromUrl.ts => fetchSingleDatasetDirectory.ts} (94%) diff --git a/packages/nextclade-cli/src/dataset/dataset_download.rs b/packages/nextclade-cli/src/dataset/dataset_download.rs index e5a099a99..64184a0c4 100644 --- a/packages/nextclade-cli/src/dataset/dataset_download.rs +++ b/packages/nextclade-cli/src/dataset/dataset_download.rs @@ -303,48 +303,7 @@ pub fn dataset_json_load( // } = &run_args.inputs; let auspice_json = AuspiceTree::from_path(dataset_json).wrap_err("When reading Auspice JSON v2")?; - - let virus_properties = auspice_json - .meta - .extensions - .nextclade - .pathogen - .as_ref() - .cloned() - .unwrap_or_default(); - - let ref_record = { - let ref_name = virus_properties - .attributes - .get("reference name") - .cloned() - .unwrap_or_else(|| AnyType::String("reference".to_owned())) - .as_str() - .wrap_err("When reading Auspice JSON v2 `.meta.extensions.nextclade.pathogen.attributes[\"reference name\"]`")? - .to_owned(); - - let ref_seq = auspice_json.root_sequence.get("nuc") - .ok_or_else(|| eyre!("Auspice JSON v2 is used as input dataset, but does not contain required reference sequence field (.root_sequence.nuc)"))?.to_owned(); - - FastaRecord { - index: 0, - seq_name: ref_name, - seq: ref_seq, - } - }; - - let gene_map = auspice_json - .meta - .genome_annotations - .map_ref_fallible(GeneMap::from_auspice_annotations)? - .unwrap_or_default(); - - Ok(NextcladeParams { - ref_record, - gene_map, - tree: Some(auspice_json), - virus_properties, - }) + NextcladeParams::from_auspice(&auspice_json) } pub fn dataset_individual_files_load( diff --git a/packages/nextclade-web/src/components/Error/ErrorContent.tsx b/packages/nextclade-web/src/components/Error/ErrorContent.tsx index 0ab256631..e93a10e33 100644 --- a/packages/nextclade-web/src/components/Error/ErrorContent.tsx +++ b/packages/nextclade-web/src/components/Error/ErrorContent.tsx @@ -1,17 +1,16 @@ import React, { useCallback, useMemo, useState } from 'react' import { Button, Col, Row } from 'reactstrap' import { useTranslationSafe } from 'src/helpers/useTranslationSafe' -import { NextcladeV2Error } from 'src/io/fetchSingleDatasetFromUrl' import styled from 'styled-components' import { CopyToClipboard } from 'react-copy-to-clipboard' import { FaClipboardCheck, FaClipboardList } from 'react-icons/fa' - import { ErrorGeneric } from 'src/components/Error/error-types/ErrorGeneric' import { ErrorNetworkConnectionFailure } from 'src/components/Error/error-types/ErrorNetworkConnectionFailure' import { ErrorNetworkRequestFailure } from 'src/components/Error/error-types/ErrorNetworkRequestFailure' import { NextcladeV2ErrorContent } from 'src/components/Error/error-types/NextcladeV2ErrorContent' import { ErrorContentExplanation, getErrorReportText } from 'src/components/Error/ErrorContentExplanation' import { sanitizeError } from 'src/helpers/sanitizeError' +import { NextcladeV2Error } from 'src/io/fetchSingleDatasetDirectory' import { HttpRequestError } from 'src/io/axiosFetch' import { ErrorMessageMonospace } from './ErrorStyles' diff --git a/packages/nextclade-web/src/components/Error/error-types/NextcladeV2ErrorContent.tsx b/packages/nextclade-web/src/components/Error/error-types/NextcladeV2ErrorContent.tsx index c4c199054..bc177d9ba 100644 --- a/packages/nextclade-web/src/components/Error/error-types/NextcladeV2ErrorContent.tsx +++ b/packages/nextclade-web/src/components/Error/error-types/NextcladeV2ErrorContent.tsx @@ -3,8 +3,8 @@ import React, { useMemo } from 'react' import { ErrorContainer, ErrorMessage } from 'src/components/Error/ErrorStyles' import { LinkExternal } from 'src/components/Link/LinkExternal' import { PROJECT_NAME, RELEASE_OLD_URL } from 'src/constants' +import { NextcladeV2Error } from 'src/io/fetchSingleDatasetDirectory' import { useTranslationSafe } from 'src/helpers/useTranslationSafe' -import { NextcladeV2Error } from 'src/io/fetchSingleDatasetFromUrl' import urljoin from 'url-join' export interface Props { diff --git a/packages/nextclade-web/src/hooks/useRunAnalysis.ts b/packages/nextclade-web/src/hooks/useRunAnalysis.ts index 6d9746fa8..fae78841b 100644 --- a/packages/nextclade-web/src/hooks/useRunAnalysis.ts +++ b/packages/nextclade-web/src/hooks/useRunAnalysis.ts @@ -1,18 +1,20 @@ import type { AuspiceJsonV2, CladeNodeAttrDesc } from 'auspice' - import { changeColorBy } from 'auspice/src/actions/colors' +import { concurrent } from 'fasy' import { useRouter } from 'next/router' import { useDispatch } from 'react-redux' import { useRecoilCallback } from 'recoil' +import { ErrorInternal } from 'src/helpers/ErrorInternal' import { clearAllFiltersAtom } from 'src/state/resultFilters.state' import { viewedCdsAtom } from 'src/state/seqViewSettings.state' -import { AlgorithmGlobalStatus } from 'src/types' +import { AlgorithmGlobalStatus, AlgorithmInput, Dataset, NextcladeParamsRaw, NextcladeParamsRawDir } from 'src/types' import { sanitizeError } from 'src/helpers/sanitizeError' import { auspiceStartClean, treeFilterByNodeType } from 'src/state/auspice/auspice.actions' import { createAuspiceState } from 'src/state/auspice/createAuspiceState' import { datasetCurrentAtom, cdsOrderPreferenceAtom } from 'src/state/dataset.state' import { globalErrorAtom } from 'src/state/error.state' import { + datasetJsonAtom, geneMapInputAtom, qrySeqInputsStorageAtom, refSeqInputAtom, @@ -35,6 +37,7 @@ import { } from 'src/state/results.state' import { numThreadsAtom, showNewRunPopupAtom } from 'src/state/settings.state' import { launchAnalysis, LaunchAnalysisCallbacks, LaunchAnalysisInputs } from 'src/workers/launchAnalysis' +import { axiosFetchRaw } from 'src/io/axiosFetch' export function useRunAnalysis() { const router = useRouter() @@ -60,6 +63,8 @@ export function useRunAnalysis() { const qryInputs = getPromise(qrySeqInputsStorageAtom) const csvColumnConfig = getPromise(csvColumnConfigAtom) + const datasetJsonPromise = getPromise(datasetJsonAtom) + const inputs: LaunchAnalysisInputs = { refSeq: getPromise(refSeqInputAtom), geneMap: getPromise(geneMapInputAtom), @@ -130,7 +135,22 @@ export function useRunAnalysis() { .push('/results', '/results') .then(async () => { set(analysisStatusGlobalAtom, AlgorithmGlobalStatus.initWorkers) - return launchAnalysis(qryInputs, inputs, callbacks, datasetCurrent, numThreads, csvColumnConfig) + + const tree = await datasetJsonPromise + + let params: NextcladeParamsRaw + if (tree) { + params = { Auspice: { tree: JSON.stringify(tree) } } + } else { + const dataset = await datasetCurrent + if (!dataset) { + throw new ErrorInternal('Dataset is required but not found') + } + const data = await getParams(inputs, dataset) + params = { Dir: data } + } + + return launchAnalysis(qryInputs, params, callbacks, numThreads, csvColumnConfig) }) .catch((error) => { set(analysisStatusGlobalAtom, AlgorithmGlobalStatus.failed) @@ -140,3 +160,33 @@ export function useRunAnalysis() { [router, dispatch], ) } + +/** Resolves all param inputs into strings */ +async function getParams(paramInputs: LaunchAnalysisInputs, dataset: Dataset): Promise { + const entries = [ + { key: 'geneMap', input: paramInputs.geneMap, datasetFileUrl: dataset.files.genomeAnnotation }, + { key: 'refSeq', input: paramInputs.refSeq, datasetFileUrl: dataset.files.reference }, + { key: 'tree', input: paramInputs.tree, datasetFileUrl: dataset.files.treeJson }, + { key: 'virusProperties', input: paramInputs.virusProperties, datasetFileUrl: dataset.files.pathogenJson }, + ] + + return Object.fromEntries( + await concurrent.map(async ({ key, input, datasetFileUrl }) => { + return [key, await resolveInput(await input, datasetFileUrl)] + }, entries), + ) as unknown as NextcladeParamsRawDir +} + +async function resolveInput(input: AlgorithmInput | undefined, datasetFileUrl: string | undefined) { + // If data is provided explicitly, load it + if (input) { + return input.getContent() + } + + // Otherwise fetch corresponding file from the dataset + if (datasetFileUrl) { + return axiosFetchRaw(datasetFileUrl) + } + + return undefined +} diff --git a/packages/nextclade-web/src/io/fetchSingleDataset.ts b/packages/nextclade-web/src/io/fetchSingleDataset.ts index c6258dedd..e376bc66b 100644 --- a/packages/nextclade-web/src/io/fetchSingleDataset.ts +++ b/packages/nextclade-web/src/io/fetchSingleDataset.ts @@ -1,16 +1,39 @@ import type { ParsedUrlQuery } from 'querystring' +import { ErrorFatal } from 'src/helpers/ErrorFatal' +import { fetchSingleDatasetAuspice } from 'src/io/fetchSingleDatasetAuspice' +import { fetchSingleDatasetDirectory } from 'src/io/fetchSingleDatasetDirectory' import { getQueryParamMaybe } from 'src/io/getQueryParamMaybe' -import { fetchSingleDatasetFromUrl } from 'src/io/fetchSingleDatasetFromUrl' import { isGithubUrlOrShortcut, parseGitHubRepoUrlOrShortcut } from 'src/io/fetchSingleDatasetFromGithub' export async function fetchSingleDataset(urlQuery: ParsedUrlQuery) { const datasetUrl = getQueryParamMaybe(urlQuery, 'dataset-url') - if (!datasetUrl) { + const datasetUrlJson = getQueryParamMaybe(urlQuery, 'dataset-json-url') + + if (datasetUrl && datasetUrlJson) { + throw new ErrorFatal( + "URL parameters 'dataset-url' and 'dataset-url-json' are mutually exclusive, but both provided. Please remove one or the other.", + ) + } + + let finalUrl + let options + let fetchFunction + + if (datasetUrl) { + finalUrl = datasetUrl + fetchFunction = fetchSingleDatasetDirectory + } else if (datasetUrlJson) { + finalUrl = datasetUrlJson + fetchFunction = fetchSingleDatasetAuspice + } else { return undefined } - if (isGithubUrlOrShortcut(datasetUrl)) { - const { directUrl } = await parseGitHubRepoUrlOrShortcut(datasetUrl) - return fetchSingleDatasetFromUrl(directUrl, { datasetOriginalUrl: datasetUrl }) + + if (isGithubUrlOrShortcut(finalUrl)) { + const { directUrl } = await parseGitHubRepoUrlOrShortcut(finalUrl) + options = { datasetOriginalUrl: finalUrl } + finalUrl = directUrl } - return fetchSingleDatasetFromUrl(datasetUrl) + + return fetchFunction(finalUrl, options) } diff --git a/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts b/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts new file mode 100644 index 000000000..d0d106c5d --- /dev/null +++ b/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts @@ -0,0 +1,36 @@ +import { isEmpty } from 'lodash' +import { FatalError } from 'next/dist/lib/fatal-error' +import { attrStrMaybe, AuspiceTree, Dataset, DatasetFiles } from 'src/types' +import { removeTrailingSlash } from 'src/io/url' +import { axiosFetch } from 'src/io/axiosFetch' + +export async function fetchSingleDatasetAuspice(datasetJsonUrl_: string) { + const datasetJsonUrl = removeTrailingSlash(datasetJsonUrl_) + + const auspiceJson = await axiosFetch(datasetJsonUrl) + const pathogen = auspiceJson.meta?.extensions?.nextclade?.pathogen + + if (isEmpty(auspiceJson.root_sequence.nuc)) { + throw new FatalError(`Auspice JSON does not contain required field '.root_sequence.nuc': ${datasetJsonUrl_}`) + } + + const currentDataset: Dataset = { + path: datasetJsonUrl, + capabilities: { + primers: false, + qc: [], + }, + ...pathogen, + + // HACK: there is no files if dataset comes from Auspice JSON, neither they are needed. What to do? + files: {} as unknown as DatasetFiles, + } + + const datasets = [currentDataset] + const defaultDataset = currentDataset + const currentDatasetName = currentDataset.path + const defaultDatasetName = currentDatasetName + const defaultDatasetNameFriendly = attrStrMaybe(currentDataset.attributes, 'name') ?? currentDatasetName + + return { datasets, defaultDataset, defaultDatasetName, defaultDatasetNameFriendly, currentDataset, auspiceJson } +} diff --git a/packages/nextclade-web/src/io/fetchSingleDatasetFromUrl.ts b/packages/nextclade-web/src/io/fetchSingleDatasetDirectory.ts similarity index 94% rename from packages/nextclade-web/src/io/fetchSingleDatasetFromUrl.ts rename to packages/nextclade-web/src/io/fetchSingleDatasetDirectory.ts index a2045ce81..1b59c48c3 100644 --- a/packages/nextclade-web/src/io/fetchSingleDatasetFromUrl.ts +++ b/packages/nextclade-web/src/io/fetchSingleDatasetDirectory.ts @@ -7,37 +7,7 @@ import { removeTrailingSlash } from 'src/io/url' import { axiosFetch, axiosHead, axiosHeadOrUndefined } from 'src/io/axiosFetch' import { sanitizeError } from 'src/helpers/sanitizeError' -export class NextcladeV2Error extends Error { - public readonly datasetRootUrl: string - - public constructor(datasetRootUrl: string) { - super() - this.datasetRootUrl = datasetRootUrl - } -} - -function checkDatasetV2FilesExist(datasetRootUrl: string) { - return Promise.all([ - ['genemap.gff', 'primers.csv', 'qc.json', 'tag.json', 'virus_properties.json'].map((file) => - axiosHeadOrUndefined(urljoin(datasetRootUrl, file)), - ), - ]) -} - -async function fetchPathogenJson(datasetRootUrl: string) { - let pathogen - try { - pathogen = await axiosFetch(urljoin(datasetRootUrl, 'pathogen.json')) - } catch (error: unknown) { - if (axios.isAxiosError(error) && error.status === '404' && (await checkDatasetV2FilesExist(datasetRootUrl))) { - throw new NextcladeV2Error(datasetRootUrl) - } - throw error - } - return pathogen -} - -export async function fetchSingleDatasetFromUrl( +export async function fetchSingleDatasetDirectory( datasetRootUrl_: string, meta?: { datasetOriginalUrl?: string; datasetGithubRepo?: string }, ) { @@ -79,7 +49,44 @@ export async function fetchSingleDatasetFromUrl( Object.entries(currentDataset.files).filter(([filename, _]) => !['sequences.fasta'].includes(filename)), ) - return { datasets, defaultDataset, defaultDatasetName, defaultDatasetNameFriendly, currentDataset } + return { + datasets, + defaultDataset, + defaultDatasetName, + defaultDatasetNameFriendly, + currentDataset, + auspiceJson: undefined, + } +} + +async function fetchPathogenJson(datasetRootUrl: string) { + let pathogen + try { + pathogen = await axiosFetch(urljoin(datasetRootUrl, 'pathogen.json')) + } catch (error: unknown) { + if (axios.isAxiosError(error) && error.status === '404' && (await checkDatasetV2FilesExist(datasetRootUrl))) { + throw new NextcladeV2Error(datasetRootUrl) + } + throw error + } + return pathogen +} + +export class NextcladeV2Error extends Error { + public readonly datasetRootUrl: string + + public constructor(datasetRootUrl: string) { + super() + this.datasetRootUrl = datasetRootUrl + } +} + +function checkDatasetV2FilesExist(datasetRootUrl: string) { + return Promise.all([ + ['genemap.gff', 'primers.csv', 'qc.json', 'tag.json', 'virus_properties.json'].map((file) => + axiosHeadOrUndefined(urljoin(datasetRootUrl, file)), + ), + ]) } export class ErrorDatasetFileMissing extends Error { diff --git a/packages/nextclade-web/src/pages/_app.tsx b/packages/nextclade-web/src/pages/_app.tsx index 62407c086..e0d6e91f4 100644 --- a/packages/nextclade-web/src/pages/_app.tsx +++ b/packages/nextclade-web/src/pages/_app.tsx @@ -16,6 +16,7 @@ import { mdxComponents } from 'src/mdx-components' import LoadingPage from 'src/pages/loading' import { globalErrorAtom } from 'src/state/error.state' import { + datasetJsonAtom, geneMapInputAtom, qrySeqInputsStorageAtom, refSeqInputAtom, @@ -101,8 +102,8 @@ export function RecoilStateInitializer() { const datasetInfo = await fetchSingleDataset(urlQuery) if (!isNil(datasetInfo)) { - const { datasets, currentDataset } = datasetInfo - return { datasets, currentDataset, minimizerIndexVersion: undefined } + const { datasets, currentDataset, auspiceJson } = datasetInfo + return { datasets, currentDataset, minimizerIndexVersion: undefined, auspiceJson } } return { datasets, currentDataset, minimizerIndexVersion } }) @@ -112,12 +113,13 @@ export function RecoilStateInitializer() { set(globalErrorAtom, sanitizeError(error)) throw error }) - .then(async ({ datasets, currentDataset, minimizerIndexVersion }) => { + .then(async ({ datasets, currentDataset, minimizerIndexVersion, auspiceJson }) => { set(datasetsAtom, { datasets }) const previousDataset = await getPromise(datasetCurrentAtom) const dataset = currentDataset ?? previousDataset set(datasetCurrentAtom, dataset) set(minimizerIndexVersionAtom, minimizerIndexVersion) + set(datasetJsonAtom, auspiceJson) return dataset }) .then(async (dataset) => { diff --git a/packages/nextclade-web/src/state/inputs.state.ts b/packages/nextclade-web/src/state/inputs.state.ts index d3d91a4d1..a2df1b92e 100644 --- a/packages/nextclade-web/src/state/inputs.state.ts +++ b/packages/nextclade-web/src/state/inputs.state.ts @@ -1,11 +1,11 @@ import { isEmpty } from 'lodash' import { useCallback, useEffect } from 'react' import { atom, selector, useRecoilState, useResetRecoilState } from 'recoil' +import type { AlgorithmInput, AuspiceTree } from 'src/types' import { cdsOrderPreferenceAtom } from 'src/state/dataset.state' import { clearAllFiltersAtom } from 'src/state/resultFilters.state' import { analysisResultsAtom, analysisStatusGlobalAtom, treeAtom } from 'src/state/results.state' import { viewedCdsAtom } from 'src/state/seqViewSettings.state' -import { AlgorithmInput } from 'src/types' import { notUndefinedOrNull } from 'src/helpers/notUndefined' import { useResetSuggestions } from 'src/hooks/useResetSuggestions' @@ -101,6 +101,11 @@ export const hasRequiredInputsAtom = selector({ }, }) +export const datasetJsonAtom = atom({ + key: 'datasetJson', + default: undefined, +}) + /** Counts how many custom inputs are set */ export const inputCustomizationCounterAtom = selector({ key: 'inputCustomizationCounterAtom', @@ -130,5 +135,6 @@ export const datasetFilesResetAtom = selector({ reset(geneMapInputAtom) reset(refTreeInputAtom) reset(virusPropertiesInputAtom) + reset(datasetJsonAtom) }, }) diff --git a/packages/nextclade-web/src/workers/launchAnalysis.ts b/packages/nextclade-web/src/workers/launchAnalysis.ts index f7af20736..73947337c 100644 --- a/packages/nextclade-web/src/workers/launchAnalysis.ts +++ b/packages/nextclade-web/src/workers/launchAnalysis.ts @@ -2,7 +2,6 @@ import { concurrent } from 'fasy' import { isEmpty, merge } from 'lodash' import type { AlgorithmInput, - Dataset, FastaRecordId, NextcladeResult, CsvColumnConfig, @@ -11,10 +10,8 @@ import type { OutputTrees, } from 'src/types' import { AlgorithmGlobalStatus } from 'src/types' -import { ErrorInternal } from 'src/helpers/ErrorInternal' import type { LauncherThread } from 'src/workers/launcher.worker' import { spawn } from 'src/workers/spawn' -import { axiosFetchRaw } from 'src/io/axiosFetch' export interface LaunchAnalysisInputs { refSeq: Promise @@ -35,9 +32,8 @@ export interface LaunchAnalysisCallbacks { export async function launchAnalysis( qryFastaInputs: Promise, - paramInputs: LaunchAnalysisInputs, + params: NextcladeParamsRaw, callbacks: LaunchAnalysisCallbacks, - datasetPromise: Promise, numThreads: Promise, csvColumnConfigPromise: Promise, ) { @@ -46,13 +42,6 @@ export async function launchAnalysis( // Resolve inputs into the actual strings const qryFastaStr = await getQueryFasta(await qryFastaInputs) - const [dataset] = await Promise.all([datasetPromise]) - if (!dataset) { - throw new ErrorInternal('Dataset is required but not found') - } - - const params = await getParams(paramInputs, dataset) - const csvColumnConfig = await csvColumnConfigPromise const launcherWorker = await spawn( @@ -96,33 +85,3 @@ export async function getQueryFasta(inputs: AlgorithmInput[]) { const contents = await concurrent.map(async (input) => input.getContent(), inputs) return contents.join('\n') } - -/** Resolves all param inputs into strings */ -async function getParams(paramInputs: LaunchAnalysisInputs, dataset: Dataset): Promise { - const entries = [ - { key: 'geneMap', input: paramInputs.geneMap, datasetFileUrl: dataset.files.genomeAnnotation }, - { key: 'refSeq', input: paramInputs.refSeq, datasetFileUrl: dataset.files.reference }, - { key: 'tree', input: paramInputs.tree, datasetFileUrl: dataset.files.treeJson }, - { key: 'virusProperties', input: paramInputs.virusProperties, datasetFileUrl: dataset.files.pathogenJson }, - ] - - return Object.fromEntries( - await concurrent.map(async ({ key, input, datasetFileUrl }) => { - return [key, await resolveInput(await input, datasetFileUrl)] - }, entries), - ) as unknown as NextcladeParamsRaw -} - -async function resolveInput(input: AlgorithmInput | undefined, datasetFileUrl: string | undefined) { - // If data is provided explicitly, load it - if (input) { - return input.getContent() - } - - // Otherwise fetch corresponding file from the dataset - if (datasetFileUrl) { - return axiosFetchRaw(datasetFileUrl) - } - - return undefined -} diff --git a/packages/nextclade/src/run/nextclade_wasm.rs b/packages/nextclade/src/run/nextclade_wasm.rs index b32835731..06c6a5eb1 100644 --- a/packages/nextclade/src/run/nextclade_wasm.rs +++ b/packages/nextclade/src/run/nextclade_wasm.rs @@ -20,7 +20,9 @@ use crate::tree::tree::{AuspiceGraph, AuspiceTree, CladeNodeAttrKeyDesc}; use crate::tree::tree_builder::graph_attach_new_nodes_in_place; use crate::tree::tree_preprocess::graph_preprocess_in_place; use crate::types::outputs::NextcladeOutputs; -use eyre::{Report, WrapErr}; +use crate::utils::any::AnyType; +use crate::utils::option::OptionMapRefFallible; +use eyre::{eyre, Report, WrapErr}; use itertools::Itertools; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -37,33 +39,93 @@ pub struct NextcladeParams { } impl NextcladeParams { - pub fn from_raw(raw: NextcladeParamsRaw) -> Result { - let virus_properties = VirusProperties::from_str(&raw.virus_properties).wrap_err("When parsing pathogen JSON")?; - - let ref_record = read_one_fasta_str(&raw.ref_seq).wrap_err("When parsing reference sequence")?; + pub fn from_auspice(auspice_json: &AuspiceTree) -> Result { + let virus_properties = auspice_json + .meta + .extensions + .nextclade + .pathogen + .as_ref() + .cloned() + .unwrap_or_default(); - let tree = raw - .tree - .map(|tree| AuspiceTree::from_str(tree).wrap_err("When parsing reference tree Auspice JSON v2")) - .transpose()?; + let ref_record = { + let ref_name = virus_properties + .attributes + .get("reference name") + .cloned() + .unwrap_or_else(|| AnyType::String("reference".to_owned())) + .as_str() + .wrap_err("When reading Auspice JSON v2 `.meta.extensions.nextclade.pathogen.attributes[\"reference name\"]`")? + .to_owned(); + + let ref_seq = auspice_json.root_sequence.get("nuc") + .ok_or_else(|| eyre!("Auspice JSON v2 is used as input dataset, but does not contain required reference sequence field (.root_sequence.nuc)"))?.to_owned(); + + FastaRecord { + index: 0, + seq_name: ref_name, + seq: ref_seq, + } + }; - let gene_map = raw.gene_map.map_or_else( - || Ok(GeneMap::new()), // If genome annotation is not provided, use an empty one - |gene_map| GeneMap::from_str(gene_map).wrap_err("When parsing genome annotation"), - )?; + let gene_map = auspice_json + .meta + .genome_annotations + .map_ref_fallible(GeneMap::from_auspice_annotations)? + .unwrap_or_default(); Ok(Self { ref_record, gene_map, - tree, + tree: Some(auspice_json.to_owned()), virus_properties, }) } + + pub fn from_raw(raw: NextcladeParamsRaw) -> Result { + match raw { + NextcladeParamsRaw::Auspice(raw) => { + let auspice_json = AuspiceTree::from_str(raw.tree)?; + Self::from_auspice(&auspice_json) + } + NextcladeParamsRaw::Dir(raw) => { + let virus_properties = + VirusProperties::from_str(&raw.virus_properties).wrap_err("When parsing pathogen JSON")?; + + let ref_record = read_one_fasta_str(&raw.ref_seq).wrap_err("When parsing reference sequence")?; + + let tree = raw + .tree + .map(|tree| AuspiceTree::from_str(tree).wrap_err("When parsing reference tree Auspice JSON v2")) + .transpose()?; + + let gene_map = raw + .gene_map + .map(|gene_map| GeneMap::from_str(gene_map).wrap_err("When parsing genome annotation")) + .transpose()? + .unwrap_or_default(); + + Ok(Self { + ref_record, + gene_map, + tree, + virus_properties, + }) + } + } + } } #[derive(Clone, Debug, Serialize, Deserialize, schemars::JsonSchema)] #[serde(rename_all = "camelCase")] -pub struct NextcladeParamsRaw { +pub struct NextcladeParamsRawAuspice { + pub tree: String, +} + +#[derive(Clone, Debug, Serialize, Deserialize, schemars::JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct NextcladeParamsRawDir { #[schemars(with = "String")] pub ref_seq: String, pub gene_map: Option, @@ -71,6 +133,12 @@ pub struct NextcladeParamsRaw { pub virus_properties: String, } +#[derive(Clone, Debug, Serialize, Deserialize, schemars::JsonSchema)] +pub enum NextcladeParamsRaw { + Auspice(NextcladeParamsRawAuspice), + Dir(NextcladeParamsRawDir), +} + #[derive(Clone, Debug, Serialize, Deserialize, schemars::JsonSchema)] #[serde(rename_all = "camelCase")] pub struct AnalysisInput { From b1b3f5f94d41c9f193d70ae56d298ea87d640346 Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Thu, 23 May 2024 15:59:14 +0200 Subject: [PATCH 10/18] fix(web): prevent crash when an auspice dataset was used in prev session --- packages/nextclade-web/src/io/fetchDatasets.ts | 8 ++++++-- .../src/io/fetchSingleDatasetAuspice.ts | 6 ++++-- .../src/io/fetchSingleDatasetDirectory.ts | 13 +++---------- packages/nextclade-web/src/pages/_app.tsx | 12 +++++++----- 4 files changed, 20 insertions(+), 19 deletions(-) diff --git a/packages/nextclade-web/src/io/fetchDatasets.ts b/packages/nextclade-web/src/io/fetchDatasets.ts index 5aacc77f8..f798be141 100644 --- a/packages/nextclade-web/src/io/fetchDatasets.ts +++ b/packages/nextclade-web/src/io/fetchDatasets.ts @@ -9,7 +9,7 @@ import { parseGithubRepoUrl, } from 'src/io/fetchSingleDatasetFromGithub' -import { Dataset } from 'src/types' +import { type AuspiceTree, Dataset } from 'src/types' import { fetchDatasetsIndex, filterDatasets, @@ -128,7 +128,11 @@ export async function initializeDatasets(datasetServerUrl: string, urlQuery: Par const minimizerIndexVersion = await getCompatibleMinimizerIndexVersion(datasetServerUrl, datasetsIndexJson) // Check if URL params specify dataset params and try to find the corresponding dataset - const currentDataset = await getDatasetFromUrlParams(urlQuery, datasets) + const currentDataset: + | (Dataset & { + auspiceJson?: AuspiceTree + }) + | undefined = await getDatasetFromUrlParams(urlQuery, datasets) return { datasets, currentDataset, minimizerIndexVersion } } diff --git a/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts b/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts index d0d106c5d..373e469a0 100644 --- a/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts +++ b/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts @@ -14,7 +14,7 @@ export async function fetchSingleDatasetAuspice(datasetJsonUrl_: string) { throw new FatalError(`Auspice JSON does not contain required field '.root_sequence.nuc': ${datasetJsonUrl_}`) } - const currentDataset: Dataset = { + const currentDataset: Dataset & { auspiceJson?: AuspiceTree } = { path: datasetJsonUrl, capabilities: { primers: false, @@ -24,6 +24,8 @@ export async function fetchSingleDatasetAuspice(datasetJsonUrl_: string) { // HACK: there is no files if dataset comes from Auspice JSON, neither they are needed. What to do? files: {} as unknown as DatasetFiles, + + auspiceJson, } const datasets = [currentDataset] @@ -32,5 +34,5 @@ export async function fetchSingleDatasetAuspice(datasetJsonUrl_: string) { const defaultDatasetName = currentDatasetName const defaultDatasetNameFriendly = attrStrMaybe(currentDataset.attributes, 'name') ?? currentDatasetName - return { datasets, defaultDataset, defaultDatasetName, defaultDatasetNameFriendly, currentDataset, auspiceJson } + return { datasets, defaultDataset, defaultDatasetName, defaultDatasetNameFriendly, currentDataset } } diff --git a/packages/nextclade-web/src/io/fetchSingleDatasetDirectory.ts b/packages/nextclade-web/src/io/fetchSingleDatasetDirectory.ts index 1b59c48c3..a735de622 100644 --- a/packages/nextclade-web/src/io/fetchSingleDatasetDirectory.ts +++ b/packages/nextclade-web/src/io/fetchSingleDatasetDirectory.ts @@ -2,7 +2,7 @@ import axios from 'axios' import urljoin from 'url-join' import { mapValues } from 'lodash' import { concurrent } from 'fasy' -import { attrStrMaybe, Dataset, DatasetFiles, VirusProperties } from 'src/types' +import { attrStrMaybe, AuspiceTree, Dataset, DatasetFiles, VirusProperties } from 'src/types' import { removeTrailingSlash } from 'src/io/url' import { axiosFetch, axiosHead, axiosHeadOrUndefined } from 'src/io/axiosFetch' import { sanitizeError } from 'src/helpers/sanitizeError' @@ -15,7 +15,7 @@ export async function fetchSingleDatasetDirectory( const pathogen = await fetchPathogenJson(datasetRootUrl) - const currentDataset: Dataset = { + const currentDataset: Dataset & { auspiceJson?: AuspiceTree } = { path: datasetRootUrl, capabilities: { primers: false, @@ -49,14 +49,7 @@ export async function fetchSingleDatasetDirectory( Object.entries(currentDataset.files).filter(([filename, _]) => !['sequences.fasta'].includes(filename)), ) - return { - datasets, - defaultDataset, - defaultDatasetName, - defaultDatasetNameFriendly, - currentDataset, - auspiceJson: undefined, - } + return { datasets, defaultDataset, defaultDatasetName, defaultDatasetNameFriendly, currentDataset } } async function fetchPathogenJson(datasetRootUrl: string) { diff --git a/packages/nextclade-web/src/pages/_app.tsx b/packages/nextclade-web/src/pages/_app.tsx index e0d6e91f4..b0eb962c0 100644 --- a/packages/nextclade-web/src/pages/_app.tsx +++ b/packages/nextclade-web/src/pages/_app.tsx @@ -56,6 +56,8 @@ import { import { ErrorBoundary } from 'src/components/Error/ErrorBoundary' import 'src/styles/global.scss' +import { Dataset } from '../types' +import { AuspiceTree } from '../types' RecoilEnv.RECOIL_DUPLICATE_ATOM_KEY_CHECKING_ENABLED = false @@ -102,8 +104,8 @@ export function RecoilStateInitializer() { const datasetInfo = await fetchSingleDataset(urlQuery) if (!isNil(datasetInfo)) { - const { datasets, currentDataset, auspiceJson } = datasetInfo - return { datasets, currentDataset, minimizerIndexVersion: undefined, auspiceJson } + const { datasets, currentDataset } = datasetInfo + return { datasets, currentDataset, minimizerIndexVersion: undefined } } return { datasets, currentDataset, minimizerIndexVersion } }) @@ -113,13 +115,13 @@ export function RecoilStateInitializer() { set(globalErrorAtom, sanitizeError(error)) throw error }) - .then(async ({ datasets, currentDataset, minimizerIndexVersion, auspiceJson }) => { + .then(async ({ datasets, currentDataset, minimizerIndexVersion }) => { set(datasetsAtom, { datasets }) const previousDataset = await getPromise(datasetCurrentAtom) - const dataset = currentDataset ?? previousDataset + const dataset: (Dataset & { auspiceJson?: AuspiceTree }) | undefined = currentDataset ?? previousDataset set(datasetCurrentAtom, dataset) set(minimizerIndexVersionAtom, minimizerIndexVersion) - set(datasetJsonAtom, auspiceJson) + set(datasetJsonAtom, dataset?.auspiceJson) return dataset }) .then(async (dataset) => { From e5ee0688c22b9fe8203c99900a2cf42c9cc7e925 Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Thu, 23 May 2024 16:22:45 +0200 Subject: [PATCH 11/18] fix(web): prevent crash when auspice json has no `.root_sequence` --- packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts | 2 +- packages/nextclade/src/graph/graph.rs | 3 +-- packages/nextclade/src/run/nextclade_wasm.rs | 2 +- packages/nextclade/src/tree/tree.rs | 4 ++-- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts b/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts index 373e469a0..8b2becd86 100644 --- a/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts +++ b/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts @@ -10,7 +10,7 @@ export async function fetchSingleDatasetAuspice(datasetJsonUrl_: string) { const auspiceJson = await axiosFetch(datasetJsonUrl) const pathogen = auspiceJson.meta?.extensions?.nextclade?.pathogen - if (isEmpty(auspiceJson.root_sequence.nuc)) { + if (isEmpty(auspiceJson.root_sequence?.nuc)) { throw new FatalError(`Auspice JSON does not contain required field '.root_sequence.nuc': ${datasetJsonUrl_}`) } diff --git a/packages/nextclade/src/graph/graph.rs b/packages/nextclade/src/graph/graph.rs index 4078ea5f1..5fe2be45f 100644 --- a/packages/nextclade/src/graph/graph.rs +++ b/packages/nextclade/src/graph/graph.rs @@ -13,7 +13,6 @@ use num_traits::Float; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use std::collections::HashMap; -use maplit::btreemap; #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] #[allow(clippy::partial_pub_fields)] @@ -557,7 +556,7 @@ pub fn convert_graph_to_auspice_tree(graph: &AuspiceGraph) -> Result, + #[serde(skip_serializing_if = "Option::is_none")] + pub root_sequence: Option>, #[serde(flatten)] pub other: serde_json::Value, From 883a0d6a74bf95450c9ddf4f6dc12de996af36a1 Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Thu, 23 May 2024 16:32:23 +0200 Subject: [PATCH 12/18] refactor: lint --- packages/nextclade-cli/src/dataset/dataset_download.rs | 7 +++---- packages/nextclade-web/src/pages/_app.tsx | 3 +-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/packages/nextclade-cli/src/dataset/dataset_download.rs b/packages/nextclade-cli/src/dataset/dataset_download.rs index 64184a0c4..e8e3f01b7 100644 --- a/packages/nextclade-cli/src/dataset/dataset_download.rs +++ b/packages/nextclade-cli/src/dataset/dataset_download.rs @@ -8,12 +8,11 @@ use log::{warn, LevelFilter}; use nextclade::analyze::virus_properties::VirusProperties; use nextclade::gene::gene_map::{filter_gene_map, GeneMap}; use nextclade::io::dataset::{Dataset, DatasetsIndexJson}; -use nextclade::io::fasta::{read_one_fasta, read_one_fasta_str, FastaRecord}; +use nextclade::io::fasta::{read_one_fasta, read_one_fasta_str}; use nextclade::io::file::create_file_or_stdout; use nextclade::io::fs::{ensure_dir, has_extension, read_file_to_string}; use nextclade::run::nextclade_wasm::NextcladeParams; use nextclade::tree::tree::AuspiceTree; -use nextclade::utils::any::AnyType; use nextclade::utils::fs::list_files_recursive; use nextclade::utils::option::OptionMapRefFallible; use nextclade::utils::string::{format_list, surround_with_quotes, Indent}; @@ -288,9 +287,9 @@ pub fn dataset_dir_load( } pub fn dataset_json_load( - run_args: &NextcladeRunArgs, + _run_args: &NextcladeRunArgs, dataset_json: impl AsRef, - cdses: &Option>, + _cdses: &Option>, ) -> Result { let dataset_json = dataset_json.as_ref(); diff --git a/packages/nextclade-web/src/pages/_app.tsx b/packages/nextclade-web/src/pages/_app.tsx index b0eb962c0..92a2970c3 100644 --- a/packages/nextclade-web/src/pages/_app.tsx +++ b/packages/nextclade-web/src/pages/_app.tsx @@ -8,6 +8,7 @@ import { RecoilEnv, RecoilRoot, useRecoilCallback, useRecoilState, useRecoilValu import { AppProps } from 'next/app' import { useRouter } from 'next/router' import dynamic from 'next/dynamic' +import type { Dataset, AuspiceTree } from 'src/types' import { sanitizeError } from 'src/helpers/sanitizeError' import { useRunAnalysis } from 'src/hooks/useRunAnalysis' import i18nAuspice, { changeAuspiceLocale } from 'src/i18n/i18n.auspice' @@ -56,8 +57,6 @@ import { import { ErrorBoundary } from 'src/components/Error/ErrorBoundary' import 'src/styles/global.scss' -import { Dataset } from '../types' -import { AuspiceTree } from '../types' RecoilEnv.RECOIL_DUPLICATE_ATOM_KEY_CHECKING_ENABLED = false From 9f3c1e0a7667c72ed9ba0e9701aedbc1e711defb Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Fri, 24 May 2024 08:14:12 +0200 Subject: [PATCH 13/18] fix(web): specifically accept json Let's add an explicit `Accept` HTTP header when fetching Auspice JSON. This is required for nextstrain.org links to work - the server sends different content depending on `Accept` header. --- packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts b/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts index 8b2becd86..80d85a2d5 100644 --- a/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts +++ b/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts @@ -7,7 +7,9 @@ import { axiosFetch } from 'src/io/axiosFetch' export async function fetchSingleDatasetAuspice(datasetJsonUrl_: string) { const datasetJsonUrl = removeTrailingSlash(datasetJsonUrl_) - const auspiceJson = await axiosFetch(datasetJsonUrl) + const auspiceJson = await axiosFetch(datasetJsonUrl, { + headers: { Accept: 'application/json, text/plain, */*' }, + }) const pathogen = auspiceJson.meta?.extensions?.nextclade?.pathogen if (isEmpty(auspiceJson.root_sequence?.nuc)) { From fc7b8bd63cf49947c18fcba2cacdb9796c201563 Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Fri, 24 May 2024 08:32:32 +0200 Subject: [PATCH 14/18] fix(web): hide "Load examples" button when examples are not in dataset --- .../nextclade-web/src/components/Main/ButtonLoadExample.tsx | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/packages/nextclade-web/src/components/Main/ButtonLoadExample.tsx b/packages/nextclade-web/src/components/Main/ButtonLoadExample.tsx index 4d2bbb827..c784eb77c 100644 --- a/packages/nextclade-web/src/components/Main/ButtonLoadExample.tsx +++ b/packages/nextclade-web/src/components/Main/ButtonLoadExample.tsx @@ -1,4 +1,5 @@ import { Dataset } from '_SchemaRoot' +import { isEmpty } from 'lodash' import React, { useCallback } from 'react' import { Button } from 'reactstrap' import { useRecoilValue } from 'recoil' @@ -44,6 +45,10 @@ export function ButtonLoadExample({ ...rest }) { setExampleSequences(datasetCurrent) }, [datasetCurrent, setExampleSequences]) + if (isEmpty(datasetCurrent?.files.examples)) { + return null + } + return (