From 0ec7adcad9bdb3a5f06cbc4db2e8cea7c14b880d Mon Sep 17 00:00:00 2001
From: ivan-aksamentov <ivan.aksamentov@gmail.com>
Date: Mon, 13 May 2024 12:50:42 +0200
Subject: [PATCH 01/18] feat: add ref and annotation data to Auspice tree types

---
 packages/nextclade/src/graph/graph.rs |  2 +
 packages/nextclade/src/tree/tree.rs   | 65 +++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)
diff --git a/packages/nextclade/src/graph/graph.rs b/packages/nextclade/src/graph/graph.rs
index 2c4aa3743..4078ea5f1 100644
--- a/packages/nextclade/src/graph/graph.rs
+++ b/packages/nextclade/src/graph/graph.rs
@@ -13,6 +13,7 @@ use num_traits::Float;
 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
+use maplit::btreemap;
 
 #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)]
 #[allow(clippy::partial_pub_fields)]
@@ -556,6 +557,7 @@ pub fn convert_graph_to_auspice_tree(graph: &AuspiceGraph) -> Result<AuspiceTree
     version: graph.data.auspice_tree_version.clone(),
     meta: graph.data.meta.clone(),
     tree,
+    root_sequence: btreemap! {},
     other: graph.data.other.clone(),
   })
 }
diff --git a/packages/nextclade/src/tree/tree.rs b/packages/nextclade/src/tree/tree.rs
index dd9ed05f3..5a68e6003 100644
--- a/packages/nextclade/src/tree/tree.rs
+++ b/packages/nextclade/src/tree/tree.rs
@@ -3,6 +3,7 @@ use crate::alphabet::nuc::Nuc;
 use crate::analyze::find_private_nuc_mutations::BranchMutations;
 use crate::coord::position::{AaRefPosition, NucRefGlobalPosition};
 use crate::coord::range::NucRefGlobalRange;
+use crate::gene::gene::GeneStrand;
 use crate::graph::edge::{Edge, GraphEdge};
 use crate::graph::graph::Graph;
 use crate::graph::node::{GraphNode, Node};
@@ -10,6 +11,7 @@ use crate::graph::traits::{HasDivergence, HasName};
 use crate::io::fs::read_file_to_string;
 use crate::io::json::json_parse;
 use eyre::{Report, WrapErr};
+use schemars::JsonSchema;
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
 use std::collections::BTreeMap;
 use std::path::Path;
@@ -390,8 +392,68 @@ impl AuspiceDisplayDefaults {
   }
 }
 
+#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)]
+pub struct AuspiceGenomeAnnotationNuc { // value type of the `nuc` field of `AuspiceGenomeAnnotations`
+  #[serde(flatten)]
+  pub other: serde_json::Value, // no typed fields: every key of the JSON object is collected here
+}
+
+#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)]
+pub struct StartEnd { // one `start`/`end` pair, used by `Segments` to describe a single segment
+  pub start: usize, // NOTE(review): index base (0 or 1) is not visible here — confirm against the Auspice schema
+  pub end: usize, // NOTE(review): inclusive vs exclusive end is not visible here — confirm
+}
+
+#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)]
+#[serde(untagged)] // no variant tag in JSON: variants are tried in declaration order
+pub enum Segments {
+  OneSegment(StartEnd), // JSON shape: `start` and `end` keys
+  MultipleSegments { segments: Vec<StartEnd> }, // JSON shape: a `segments` array of `start`/`end` objects
+}
+
+#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)]
+pub struct AuspiceGenomeAnnotationCds { // value type of the entries in `AuspiceGenomeAnnotations::cdses`
+  #[serde(rename = "type", default, skip_serializing_if = "Option::is_none")]
+  pub r#type: Option<String>, // JSON key is `type`; raw identifier because `type` is a Rust keyword
+
+  #[serde(default, skip_serializing_if = "Option::is_none")]
+  pub gene: Option<String>, // optional: `None` when absent, omitted from output when `None`
+
+  #[serde(default, skip_serializing_if = "Option::is_none")]
+  pub color: Option<String>, // optional: `None` when absent, omitted from output when `None`
+
+  #[serde(default, skip_serializing_if = "Option::is_none")]
+  pub display_name: Option<String>, // optional: `None` when absent, omitted from output when `None`
+
+  #[serde(default, skip_serializing_if = "Option::is_none")]
+  pub description: Option<String>, // optional: `None` when absent, omitted from output when `None`
+
+  #[serde(default)]
+  pub strand: GeneStrand, // falls back to `GeneStrand::default()` when the key is missing
+
+  #[serde(flatten)]
+  pub segments: Segments, // the `start`/`end` or `segments` keys live directly in this object
+
+  #[serde(flatten)]
+  pub other: serde_json::Value, // NOTE(review): may also capture the keys read by `segments` — verify
+}
+
+#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)]
+pub struct AuspiceGenomeAnnotations { // value type of `AuspiceTreeMeta::genome_annotations`
+  pub nuc: AuspiceGenomeAnnotationNuc, // required `nuc` key (no serde default, not an `Option`)
+
+  #[serde(flatten)]
+  pub cdses: BTreeMap<String, AuspiceGenomeAnnotationCds>, // remaining keys, read as name -> CDS entry
+
+  #[serde(flatten)]
+  pub other: serde_json::Value, // NOTE(review): second flattened catch-all next to `cdses` — verify no duplicated keys
+}
+
 #[derive(Clone, Serialize, Deserialize, schemars::JsonSchema, Validate, Debug)]
 pub struct AuspiceTreeMeta {
+  #[serde(skip_serializing_if = "Option::is_none")]
+  pub genome_annotations: Option<AuspiceGenomeAnnotations>,
+
   #[serde(default, skip_serializing_if = "AuspiceMetaExtensions::is_empty")]
   pub extensions: AuspiceMetaExtensions,
 
@@ -472,6 +534,9 @@ pub struct AuspiceTree {
 
   pub tree: AuspiceTreeNode,
 
+  #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
+  pub root_sequence: BTreeMap<String, String>, // key is optional: absent on input => empty map, empty => omitted
+
   #[serde(flatten)]
   pub other: serde_json::Value,
 }

From 1043b98ab97bac898be0d91cce678468d046e3a0 Mon Sep 17 00:00:00 2001
From: ivan-aksamentov <ivan.aksamentov@gmail.com>
Date: Thu, 16 May 2024 14:35:08 +0200
Subject: [PATCH 02/18] refactor: add pathogen nextclade extension to auspice
 tree type

I had to derive a bunch of Eq and PartialEq traits to satisfy parent type requirements
---
 .../src/dataset/dataset_download.rs           | 36 +------------
 packages/nextclade/src/align/params.rs        | 22 ++++----
 packages/nextclade/src/align/seed_match.rs    |  6 +--
 packages/nextclade/src/analyze/phenotype.rs   |  2 +-
 .../nextclade/src/analyze/virus_properties.rs | 30 ++++++-----
 packages/nextclade/src/io/dataset.rs          |  2 +-
 packages/nextclade/src/qc/qc_config.rs        | 51 ++++++++++---------
 .../nextclade/src/qc/qc_rule_frame_shifts.rs  |  2 +-
 .../nextclade/src/qc/qc_rule_missing_data.rs  |  4 +-
 .../src/qc/qc_rule_private_mutations.rs       | 12 ++---
 .../nextclade/src/qc/qc_rule_snp_clusters.rs  |  2 +-
 .../nextclade/src/qc/qc_rule_stop_codons.rs   |  2 +-
 packages/nextclade/src/run/params_general.rs  |  2 +-
 packages/nextclade/src/tree/params.rs         |  7 +--
 packages/nextclade/src/tree/tree.rs           |  6 ++-
 packages/nextclade/src/utils/any.rs           | 48 +++++++++++++++--
 16 files changed, 126 insertions(+), 108 deletions(-)

diff --git a/packages/nextclade-cli/src/dataset/dataset_download.rs b/packages/nextclade-cli/src/dataset/dataset_download.rs
index 380afc318..8736e0c02 100644
--- a/packages/nextclade-cli/src/dataset/dataset_download.rs
+++ b/packages/nextclade-cli/src/dataset/dataset_download.rs
@@ -297,41 +297,7 @@ pub fn dataset_individual_files_load(
         .and_then(|input_pathogen_json| read_file_to_string(input_pathogen_json).ok())
         .map_ref_fallible(VirusProperties::from_str)
         .wrap_err("When reading pathogen JSON")?
-        .unwrap_or_else(|| {
-          // The only case where we allow pathogen.json to be missing is when there's no dataset and files are provided
-          // explicitly through args. Let's create a dummy value to avoid making the field optional,
-          // and avoid adding `Default` trait.
-          VirusProperties {
-            schema_version: "".to_owned(),
-            attributes: BTreeMap::default(),
-            shortcuts: vec![],
-            meta: DatasetMeta::default(),
-            files: DatasetFiles {
-              reference: "".to_owned(),
-              pathogen_json: "".to_owned(),
-              genome_annotation: None,
-              tree_json: None,
-              examples: None,
-              readme: None,
-              changelog: None,
-              rest_files: BTreeMap::default(),
-              other: serde_json::Value::default(),
-            },
-            default_cds: None,
-            cds_order_preference: vec![],
-            mut_labels: LabelledMutationsConfig::default(),
-            qc: None,
-            general_params: None,
-            alignment_params: None,
-            tree_builder_params: None,
-            phenotype_data: None,
-            aa_motifs: vec![],
-            versions: vec![],
-            version: None,
-            compatibility: None,
-            other: serde_json::Value::default(),
-          }
-        });
+        .unwrap_or_default();
 
       let ref_record = read_one_fasta(input_ref).wrap_err("When reading reference sequence")?;
 
diff --git a/packages/nextclade/src/align/params.rs b/packages/nextclade/src/align/params.rs
index eea5ed85f..1a0d3b6ba 100644
--- a/packages/nextclade/src/align/params.rs
+++ b/packages/nextclade/src/align/params.rs
@@ -1,12 +1,14 @@
+use crate::utils::any::AnyType;
 use crate::{make_error, o};
 use clap::{Parser, ValueEnum};
 use eyre::Report;
 use itertools::Itertools;
 use optfield::optfield;
+use ordered_float::OrderedFloat;
 use serde::{Deserialize, Serialize};
 use std::collections::BTreeMap;
 
-#[derive(ValueEnum, Copy, Clone, Debug, Deserialize, Serialize, schemars::JsonSchema)]
+#[derive(ValueEnum, Copy, Clone, Debug, Eq, PartialEq, Deserialize, Serialize, schemars::JsonSchema)]
 #[serde(rename_all = "kebab-case")]
 pub enum GapAlignmentSide {
   Left,
@@ -25,7 +27,7 @@ impl Default for GapAlignmentSide {
 
 #[allow(clippy::struct_excessive_bools)]
 #[optfield(pub AlignPairwiseParamsOptional, attrs, doc, field_attrs, field_doc, merge_fn = pub)]
-#[derive(Parser, Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)]
+#[derive(Parser, Debug, Clone, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)]
 #[serde(rename_all = "camelCase")]
 pub struct AlignPairwiseParams {
   /// Minimum length of nucleotide sequence to consider for alignment.
@@ -116,7 +118,7 @@ pub struct AlignPairwiseParams {
   /// Fraction of the query sequence that has to be covered by extended seeds
   /// to proceed with the banded alignment.
   #[clap(long)]
-  pub min_seed_cover: f64,
+  pub min_seed_cover: OrderedFloat<f64>,
 
   /// Number of times Nextclade will retry alignment with more relaxed results if alignment band boundaries are hit
   #[clap(long)]
@@ -125,27 +127,27 @@ pub struct AlignPairwiseParams {
   // The following args are deprecated and are kept for backwards compatibility (to emit errors if they are set)
   /// REMOVED
   #[clap(long, hide_long_help = true, hide_short_help = true)]
-  pub max_indel: Option<f64>,
+  pub max_indel: Option<AnyType>,
 
   /// REMOVED
   #[clap(long, hide_long_help = true, hide_short_help = true)]
-  pub seed_length: Option<f64>,
+  pub seed_length: Option<AnyType>,
 
   /// REMOVED
   #[clap(long, hide_long_help = true, hide_short_help = true)]
-  pub mismatches_allowed: Option<f64>,
+  pub mismatches_allowed: Option<AnyType>,
 
   /// REMOVED
   #[clap(long, hide_long_help = true, hide_short_help = true)]
-  pub min_seeds: Option<f64>,
+  pub min_seeds: Option<AnyType>,
 
   /// REMOVED
   #[clap(long, hide_long_help = true, hide_short_help = true)]
-  pub min_match_rate: Option<f64>,
+  pub min_match_rate: Option<AnyType>,
 
   /// REMOVED
   #[clap(long, hide_long_help = true, hide_short_help = true)]
-  pub seed_spacing: Option<f64>,
+  pub seed_spacing: Option<AnyType>,
 }
 
 impl Default for AlignPairwiseParams {
@@ -166,7 +168,7 @@ impl Default for AlignPairwiseParams {
       gap_alignment_side: GapAlignmentSide::default(),
       excess_bandwidth: 9,
       terminal_bandwidth: 50,
-      min_seed_cover: 0.33,
+      min_seed_cover: OrderedFloat(0.33),
       kmer_length: 10,       // Should not be much larger than 1/divergence of amino acids
       kmer_distance: 50,     // Distance between successive k-mers
       min_match_length: 40,  // Experimentally determined, to keep off-target matches reasonably low
diff --git a/packages/nextclade/src/align/seed_match.rs b/packages/nextclade/src/align/seed_match.rs
index 1516f7795..c5c7abd2a 100644
--- a/packages/nextclade/src/align/seed_match.rs
+++ b/packages/nextclade/src/align/seed_match.rs
@@ -481,15 +481,15 @@ pub fn get_seed_matches2(
   // write_matches_to_file(&seed_matches, "chained_matches.csv");
 
   let sum_of_seed_length: usize = seed_matches.iter().map(|sm| sm.length).sum();
-  if (sum_of_seed_length as f64 / qry_seq.len() as f64) < params.min_seed_cover {
+  if (sum_of_seed_length as f64 / qry_seq.len() as f64) < *params.min_seed_cover {
     let query_knowns = qry_seq.iter().filter(|n| n.is_acgt()).count();
-    if (sum_of_seed_length as f64 / query_knowns as f64) < params.min_seed_cover {
+    if (sum_of_seed_length as f64 / query_knowns as f64) < *params.min_seed_cover {
       return make_error!(
         "Unable to align: seed alignment covers {:.2}% of the query sequence, which is less than expected {:.2}% \
         (configurable using 'min seed cover' CLI flag or dataset property). This is likely due to low quality of the \
         provided sequence, or due to using incorrect reference sequence.",
         100.0 * (sum_of_seed_length as f64) / (query_knowns as f64),
-        100.0 * params.min_seed_cover
+        100.0 * *params.min_seed_cover
       );
     }
   }
diff --git a/packages/nextclade/src/analyze/phenotype.rs b/packages/nextclade/src/analyze/phenotype.rs
index aafe8f7c0..fc32da44e 100644
--- a/packages/nextclade/src/analyze/phenotype.rs
+++ b/packages/nextclade/src/analyze/phenotype.rs
@@ -16,7 +16,7 @@ pub fn calculate_phenotype(phenotype_data: &PhenotypeData, aa_substitutions: &[A
         .iter()
         .map(|AaSub { pos, qry_aa: qry, .. }| phenotype_data.get_coeff(*pos, *qry))
         .sum();
-      phenotype_data.weight * (-phenotype_for_antibody).exp()
+      *phenotype_data.weight * (-phenotype_for_antibody).exp()
     })
     .sum();
 
diff --git a/packages/nextclade/src/analyze/virus_properties.rs b/packages/nextclade/src/analyze/virus_properties.rs
index 845c3f1f2..896705865 100644
--- a/packages/nextclade/src/analyze/virus_properties.rs
+++ b/packages/nextclade/src/analyze/virus_properties.rs
@@ -14,6 +14,7 @@ use crate::run::params_general::NextcladeGeneralParamsOptional;
 use crate::tree::params::TreeBuilderParamsOptional;
 use crate::utils::any::AnyType;
 use eyre::{Report, WrapErr};
+use ordered_float::OrderedFloat;
 use semver::Version;
 use serde::{Deserialize, Serialize};
 use std::collections::BTreeMap;
@@ -24,7 +25,7 @@ const PATHOGEN_JSON_SCHEMA_VERSION_FROM: &str = "3.0.0";
 const PATHOGEN_JSON_SCHEMA_VERSION_TO: &str = "3.0.0";
 
 /// Contains external configuration and data specific for a particular pathogen
-#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema, Validate)]
+#[derive(Clone, Default, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)]
 #[serde(rename_all = "camelCase")]
 pub struct VirusProperties {
   pub schema_version: String,
@@ -78,7 +79,7 @@ pub struct VirusProperties {
 pub type LabelMap<L> = BTreeMap<Genotype<L>, Vec<String>>;
 pub type NucLabelMap = LabelMap<Nuc>;
 
-#[derive(Debug, Default, Clone, Serialize, Deserialize, schemars::JsonSchema, Validate)]
+#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema, Validate)]
 #[serde(rename_all = "camelCase")]
 pub struct LabelledMutationsConfig {
   #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
@@ -87,41 +88,42 @@ pub struct LabelledMutationsConfig {
   pub other: serde_json::Value,
 }
 
-#[derive(Debug, Default, Clone, Serialize, Deserialize, schemars::JsonSchema, Validate)]
+#[derive(Clone, Default, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)]
 #[serde(rename_all = "camelCase")]
 pub struct PhenotypeDataIgnore {
   #[serde(default)]
   pub clades: Vec<String>,
 }
 
-#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)]
+#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)]
 #[serde(rename_all = "camelCase")]
 #[serde(untagged)]
 pub enum PhenotypeCoeff {
-  ByPosition(f64),
-  ByPositionAndAa(BTreeMap<String, f64>),
+  ByPosition(OrderedFloat<f64>),
+  ByPositionAndAa(BTreeMap<String, OrderedFloat<f64>>),
   Other(serde_json::Value),
 }
 
 impl PhenotypeCoeff {
   pub fn get_coeff(&self, aa: Aa) -> f64 {
     match self {
-      PhenotypeCoeff::ByPosition(coeff) => Some(coeff),
+      PhenotypeCoeff::ByPosition(coeff) => Some(coeff.0),
       PhenotypeCoeff::ByPositionAndAa(aa_coeff_map) => aa_coeff_map
         .get(&aa.to_string())
-        .or_else(|| aa_coeff_map.get("default")),
+        .or_else(|| aa_coeff_map.get("default"))
+        .map(|c| c.0),
       PhenotypeCoeff::Other(_) => None,
     }
-    .unwrap_or(&0.0)
+    .unwrap_or(0.0)
     .to_owned()
   }
 }
 
-#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema, Validate)]
+#[derive(Clone, Default, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)]
 #[serde(rename_all = "camelCase")]
 pub struct PhenotypeDataEntry {
   pub name: String,
-  pub weight: f64,
+  pub weight: OrderedFloat<f64>,
   pub locations: BTreeMap<AaRefPosition, PhenotypeCoeff>,
 }
 
@@ -131,7 +133,7 @@ impl PhenotypeDataEntry {
   }
 }
 
-#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema, Validate)]
+#[derive(Clone, Default, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)]
 #[serde(rename_all = "camelCase")]
 pub struct PhenotypeData {
   pub name: String,
@@ -152,7 +154,7 @@ pub struct PhenotypeAttrDesc {
   pub description: String,
 }
 
-#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)]
+#[derive(Clone, Default, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)]
 #[serde(rename_all = "camelCase")]
 pub struct AaMotifsDesc {
   pub name: String,
@@ -165,7 +167,7 @@ pub struct AaMotifsDesc {
   pub include_cdses: Vec<CountAaMotifsCdsDesc>,
 }
 
-#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema, Validate)]
+#[derive(Clone, Default, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)]
 #[serde(rename_all = "camelCase")]
 pub struct CountAaMotifsCdsDesc {
   pub cds: String,
diff --git a/packages/nextclade/src/io/dataset.rs b/packages/nextclade/src/io/dataset.rs
index 96f03a1af..1838a53f9 100644
--- a/packages/nextclade/src/io/dataset.rs
+++ b/packages/nextclade/src/io/dataset.rs
@@ -328,7 +328,7 @@ impl DatasetMeta {
   }
 }
 
-#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)]
+#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, JsonSchema)]
 #[serde(rename_all = "camelCase")]
 pub struct DatasetFiles {
   pub reference: String,
diff --git a/packages/nextclade/src/qc/qc_config.rs b/packages/nextclade/src/qc/qc_config.rs
index b30b7727a..f043c6aeb 100644
--- a/packages/nextclade/src/qc/qc_config.rs
+++ b/packages/nextclade/src/qc/qc_config.rs
@@ -2,21 +2,22 @@ use crate::coord::range::AaRefRange;
 use crate::io::fs::read_file_to_string;
 use crate::io::json::json_parse;
 use eyre::{Report, WrapErr};
+use ordered_float::OrderedFloat;
 use serde::{Deserialize, Serialize};
 use std::path::Path;
 use std::str::FromStr;
 use validator::Validate;
 
-#[derive(Debug, Default, Clone, Serialize, Deserialize, schemars::JsonSchema, Validate)]
+#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema, Validate)]
 #[serde(rename_all = "camelCase")]
 #[serde(default)]
 pub struct QcRulesConfigMissingData {
   pub enabled: bool,
-  pub missing_data_threshold: f64,
-  pub score_bias: f64,
+  pub missing_data_threshold: OrderedFloat<f64>,
+  pub score_bias: OrderedFloat<f64>,
 }
 
-#[derive(Debug, Default, Clone, Serialize, Deserialize, schemars::JsonSchema, Validate)]
+#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema, Validate)]
 #[serde(rename_all = "camelCase")]
 #[serde(default)]
 pub struct QcRulesConfigMixedSites {
@@ -24,49 +25,49 @@ pub struct QcRulesConfigMixedSites {
   pub mixed_sites_threshold: usize,
 }
 
-#[derive(Debug, Default, Clone, Serialize, Deserialize, schemars::JsonSchema, Validate)]
+#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema, Validate)]
 #[serde(rename_all = "camelCase")]
 #[serde(default)]
 pub struct QcRulesConfigPrivateMutations {
   pub enabled: bool,
 
   #[serde(default = "one")]
-  pub weight_reversion_substitutions: f64,
+  pub weight_reversion_substitutions: OrderedFloat<f64>,
 
   #[serde(default = "one")]
-  pub weight_reversion_deletions: f64,
+  pub weight_reversion_deletions: OrderedFloat<f64>,
 
   #[serde(default = "one")]
-  pub weight_labeled_substitutions: f64,
+  pub weight_labeled_substitutions: OrderedFloat<f64>,
 
   #[serde(default = "one")]
-  pub weight_labeled_deletions: f64,
+  pub weight_labeled_deletions: OrderedFloat<f64>,
 
   #[serde(default = "one")]
-  pub weight_unlabeled_substitutions: f64,
+  pub weight_unlabeled_substitutions: OrderedFloat<f64>,
 
   #[serde(default = "one")]
-  pub weight_unlabeled_deletions: f64,
+  pub weight_unlabeled_deletions: OrderedFloat<f64>,
 
-  pub typical: f64,
-  pub cutoff: f64,
+  pub typical: OrderedFloat<f64>,
+  pub cutoff: OrderedFloat<f64>,
 }
 
-const fn one() -> f64 {
-  1.0
+const fn one() -> OrderedFloat<f64> {
+  OrderedFloat(1.0)
 }
 
-#[derive(Debug, Default, Clone, Serialize, Deserialize, schemars::JsonSchema, Validate)]
+#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema, Validate)]
 #[serde(rename_all = "camelCase")]
 #[serde(default)]
 pub struct QcRulesConfigSnpClusters {
   pub enabled: bool,
   pub window_size: usize,
   pub cluster_cut_off: usize,
-  pub score_weight: f64,
+  pub score_weight: OrderedFloat<f64>,
 }
 
-#[derive(Debug, Default, Clone, Serialize, Deserialize, schemars::JsonSchema, Validate)]
+#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema, Validate)]
 #[serde(rename_all = "camelCase")]
 #[serde(default)]
 pub struct FrameShiftLocation {
@@ -74,14 +75,14 @@ pub struct FrameShiftLocation {
   pub codon_range: AaRefRange,
 }
 
-#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema, Validate)]
+#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema, Validate)]
 #[serde(rename_all = "camelCase")]
 #[serde(default)]
 pub struct QcRulesConfigFrameShifts {
   pub enabled: bool,
   #[serde(default, skip_serializing_if = "Vec::is_empty")]
   pub ignored_frame_shifts: Vec<FrameShiftLocation>,
-  pub score_weight: f64,
+  pub score_weight: OrderedFloat<f64>,
 }
 
 impl Default for QcRulesConfigFrameShifts {
@@ -89,7 +90,7 @@ impl Default for QcRulesConfigFrameShifts {
     Self {
       enabled: false,
       ignored_frame_shifts: vec![],
-      score_weight: 75.0,
+      score_weight: OrderedFloat(75.0),
     }
   }
 }
@@ -101,14 +102,14 @@ pub struct StopCodonLocation {
   pub codon: usize,
 }
 
-#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema, Validate)]
+#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema, Validate)]
 #[serde(rename_all = "camelCase")]
 #[serde(default)]
 pub struct QcRulesConfigStopCodons {
   pub enabled: bool,
   #[serde(default, skip_serializing_if = "Vec::is_empty")]
   pub ignored_stop_codons: Vec<StopCodonLocation>,
-  pub score_weight: f64,
+  pub score_weight: OrderedFloat<f64>,
 }
 
 impl Default for QcRulesConfigStopCodons {
@@ -116,12 +117,12 @@ impl Default for QcRulesConfigStopCodons {
     Self {
       enabled: false,
       ignored_stop_codons: vec![],
-      score_weight: 75.0,
+      score_weight: OrderedFloat(75.0),
     }
   }
 }
 
-#[derive(Debug, Default, Clone, Serialize, Deserialize, schemars::JsonSchema, Validate)]
+#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema, Validate)]
 #[serde(rename_all = "camelCase")]
 #[serde(default)]
 pub struct QcConfig {
diff --git a/packages/nextclade/src/qc/qc_rule_frame_shifts.rs b/packages/nextclade/src/qc/qc_rule_frame_shifts.rs
index b89c4ac5b..262b8b653 100644
--- a/packages/nextclade/src/qc/qc_rule_frame_shifts.rs
+++ b/packages/nextclade/src/qc/qc_rule_frame_shifts.rs
@@ -45,7 +45,7 @@ pub fn rule_frame_shifts(
   let total_frame_shifts = frame_shifts.len();
   let total_frame_shifts_ignored = frame_shifts_ignored.len();
 
-  let score = total_frame_shifts as f64 * config.score_weight;
+  let score = total_frame_shifts as f64 * *config.score_weight;
   let status = QcStatus::from_score(score);
 
   Some(QcResultFrameShifts {
diff --git a/packages/nextclade/src/qc/qc_rule_missing_data.rs b/packages/nextclade/src/qc/qc_rule_missing_data.rs
index 0b4e0af5b..3759dc4c0 100644
--- a/packages/nextclade/src/qc/qc_rule_missing_data.rs
+++ b/packages/nextclade/src/qc/qc_rule_missing_data.rs
@@ -24,7 +24,7 @@ pub fn rule_missing_data(total_missing: usize, config: &QcRulesConfigMissingData
   }
 
   let score = clamp_min(
-    ((total_missing as f64 - config.score_bias) * 100.0) / config.missing_data_threshold,
+    ((total_missing as f64 - *config.score_bias) * 100.0) / *config.missing_data_threshold,
     0.0,
   );
   let status = QcStatus::from_score(score);
@@ -33,6 +33,6 @@ pub fn rule_missing_data(total_missing: usize, config: &QcRulesConfigMissingData
     score,
     status,
     total_missing,
-    missing_data_threshold: config.missing_data_threshold + config.score_bias,
+    missing_data_threshold: *config.missing_data_threshold + *config.score_bias,
   })
 }
diff --git a/packages/nextclade/src/qc/qc_rule_private_mutations.rs b/packages/nextclade/src/qc/qc_rule_private_mutations.rs
index b8f31c30c..22ac5aee6 100644
--- a/packages/nextclade/src/qc/qc_rule_private_mutations.rs
+++ b/packages/nextclade/src/qc/qc_rule_private_mutations.rs
@@ -46,13 +46,13 @@ pub fn rule_private_mutations(
   let total_deletion_ranges = deletion_ranges.len();
 
   let weighted_total = 0.0
-    + config.weight_reversion_substitutions * num_reversion_substitutions as f64
-    + config.weight_labeled_substitutions * num_labeled_substitutions as f64
-    + config.weight_unlabeled_substitutions * num_unlabeled_substitutions as f64
+    + *config.weight_reversion_substitutions * num_reversion_substitutions as f64
+    + *config.weight_labeled_substitutions * num_labeled_substitutions as f64
+    + *config.weight_unlabeled_substitutions * num_unlabeled_substitutions as f64
     + total_deletion_ranges as f64;
 
   // the score hits 100 if the excess mutations equals the cutoff value
-  let score = (clamp_min(weighted_total - config.typical, 0.0) * 100.0) / config.cutoff;
+  let score = (clamp_min(weighted_total - *config.typical, 0.0) * 100.0) / *config.cutoff;
   let status = QcStatus::from_score(score);
 
   Some(QcResultPrivateMutations {
@@ -63,8 +63,8 @@ pub fn rule_private_mutations(
     num_unlabeled_substitutions,
     total_deletion_ranges,
     weighted_total,
-    excess: weighted_total - config.typical,
-    cutoff: config.cutoff,
+    excess: weighted_total - *config.typical,
+    cutoff: *config.cutoff,
   })
 }
 
diff --git a/packages/nextclade/src/qc/qc_rule_snp_clusters.rs b/packages/nextclade/src/qc/qc_rule_snp_clusters.rs
index ca5adb6e7..f174c0131 100644
--- a/packages/nextclade/src/qc/qc_rule_snp_clusters.rs
+++ b/packages/nextclade/src/qc/qc_rule_snp_clusters.rs
@@ -52,7 +52,7 @@ pub fn rule_snp_clusters(
   let clustered_snps = process_snp_clusters(snp_clusters);
   let total_snps = clustered_snps.iter().map(|cluster| cluster.number_of_snps).sum();
 
-  let score = clamp_min(total_clusters as f64 * config.score_weight, 0.0);
+  let score = clamp_min(total_clusters as f64 * *config.score_weight, 0.0);
   let status = QcStatus::from_score(score);
 
   Some(QcResultSnpClusters {
diff --git a/packages/nextclade/src/qc/qc_rule_stop_codons.rs b/packages/nextclade/src/qc/qc_rule_stop_codons.rs
index 0299596c4..32e14d0e6 100644
--- a/packages/nextclade/src/qc/qc_rule_stop_codons.rs
+++ b/packages/nextclade/src/qc/qc_rule_stop_codons.rs
@@ -50,7 +50,7 @@ pub fn rule_stop_codons(translation: &Translation, config: &QcRulesConfigStopCod
   let total_stop_codons = stop_codons.len();
   let total_stop_codons_ignored = stop_codons_ignored.len();
 
-  let score = total_stop_codons as f64 * config.score_weight;
+  let score = total_stop_codons as f64 * *config.score_weight;
   let status = QcStatus::from_score(score);
 
   Some(QcResultStopCodons {
diff --git a/packages/nextclade/src/run/params_general.rs b/packages/nextclade/src/run/params_general.rs
index fb3d19805..65685d9fa 100644
--- a/packages/nextclade/src/run/params_general.rs
+++ b/packages/nextclade/src/run/params_general.rs
@@ -4,7 +4,7 @@ use serde::{Deserialize, Serialize};
 
 #[allow(clippy::struct_excessive_bools)]
 #[optfield(pub NextcladeGeneralParamsOptional, attrs, doc, field_attrs, field_doc, merge_fn = pub)]
-#[derive(Parser, Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)]
+#[derive(Parser, Debug, Clone, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)]
 #[serde(rename_all = "camelCase")]
 pub struct NextcladeGeneralParams {
   /// Whether to include aligned reference nucleotide sequence into output nucleotide sequence FASTA file and reference peptides into output peptide FASTA files.
diff --git a/packages/nextclade/src/tree/params.rs b/packages/nextclade/src/tree/params.rs
index d751d2324..e797aac7b 100644
--- a/packages/nextclade/src/tree/params.rs
+++ b/packages/nextclade/src/tree/params.rs
@@ -1,5 +1,6 @@
 use clap::Parser;
 use optfield::optfield;
+use ordered_float::OrderedFloat;
 use serde::{Deserialize, Serialize};
 
 // NOTE: The `optfield` attribute creates a struct that have the same fields, but which are wrapped into `Option`,
@@ -7,7 +8,7 @@ use serde::{Deserialize, Serialize};
 // into self (mutably).
 #[allow(clippy::struct_excessive_bools)]
 #[optfield(pub TreeBuilderParamsOptional, attrs, doc, field_attrs, field_doc, merge_fn = pub)]
-#[derive(Parser, Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)]
+#[derive(Parser, Clone, Debug, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema)]
 #[serde(rename_all = "camelCase")]
 pub struct TreeBuilderParams {
   /// Disable greedy tree builder algorithm
@@ -16,7 +17,7 @@ pub struct TreeBuilderParams {
   pub without_greedy_tree_builder: bool,
 
   #[clap(long)]
-  pub masked_muts_weight: f64,
+  pub masked_muts_weight: OrderedFloat<f64>,
 }
 
 #[allow(clippy::derivable_impls)]
@@ -24,7 +25,7 @@ impl Default for TreeBuilderParams {
   fn default() -> Self {
     Self {
       without_greedy_tree_builder: false,
-      masked_muts_weight: 0.05,
+      masked_muts_weight: OrderedFloat(0.05),
     }
   }
 }
diff --git a/packages/nextclade/src/tree/tree.rs b/packages/nextclade/src/tree/tree.rs
index 5a68e6003..bdbd824ea 100644
--- a/packages/nextclade/src/tree/tree.rs
+++ b/packages/nextclade/src/tree/tree.rs
@@ -1,6 +1,7 @@
 use crate::alphabet::aa::Aa;
 use crate::alphabet::nuc::Nuc;
 use crate::analyze::find_private_nuc_mutations::BranchMutations;
+use crate::analyze::virus_properties::VirusProperties;
 use crate::coord::position::{AaRefPosition, NucRefGlobalPosition};
 use crate::coord::range::NucRefGlobalRange;
 use crate::gene::gene::GeneStrand;
@@ -321,7 +322,7 @@ pub struct CladeNodeAttrKeyDesc {
   pub other: serde_json::Value,
 }
 
-#[derive(Clone, Default, Serialize, Deserialize, Eq, PartialEq, schemars::JsonSchema, Validate, Debug)]
+#[derive(Clone, Default, Eq, PartialEq, Serialize, Deserialize, schemars::JsonSchema, Validate, Debug)]
 pub struct AuspiceMetaExtensionsNextclade {
   #[serde(default, skip_serializing_if = "Vec::is_empty")]
   pub clade_node_attrs: Vec<CladeNodeAttrKeyDesc>,
@@ -329,6 +330,9 @@ pub struct AuspiceMetaExtensionsNextclade {
   #[serde(default, skip_serializing_if = "Vec::is_empty")]
   pub placement_mask_ranges: Vec<NucRefGlobalRange>,
 
+  #[serde(skip_serializing_if = "Option::is_none")]
+  pub pathogen: Option<VirusProperties>,
+
   #[serde(flatten)]
   pub other: serde_json::Value,
 }
diff --git a/packages/nextclade/src/utils/any.rs b/packages/nextclade/src/utils/any.rs
index 8ed141b27..b96ec432c 100644
--- a/packages/nextclade/src/utils/any.rs
+++ b/packages/nextclade/src/utils/any.rs
@@ -1,17 +1,21 @@
 use crate::io::json::{json_stringify, JsonPretty};
+use crate::make_error;
 use eyre::{eyre, Report};
+use ordered_float::OrderedFloat;
 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
+use serde_json::Value;
 use std::collections::BTreeMap;
 use std::fmt::{Display, Formatter};
+use std::str::FromStr;
 
-#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
+#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, JsonSchema)]
 #[serde(rename_all = "camelCase")]
 #[serde(untagged)]
 pub enum AnyType {
   String(String),
   Int(isize),
-  Float(f64),
+  Float(OrderedFloat<f64>),
   Bool(bool),
   Array(Vec<AnyType>),
   Object(BTreeMap<String, AnyType>),
@@ -51,7 +55,7 @@ impl AnyType {
 
   pub const fn as_float_maybe(&self) -> Option<f64> {
     match &self {
-      AnyType::Float(x) => Some(*x),
+      AnyType::Float(x) => Some(x.0),
       _ => None,
     }
   }
@@ -79,3 +83,41 @@ impl AnyType {
     self.as_bool_maybe().ok_or(eyre!("Cannot parse value as bool"))
   }
 }
+
+impl FromStr for AnyType {
+  type Err = Report;
+
+  fn from_str(s: &str) -> Result<Self, Self::Err> {
+    let value: Value = match serde_json::from_str(s) {
+      Ok(v) => v,
+      Err(err) => return make_error!("Failed to parse JSON: {err}"),
+    };
+
+    match value {
+      Value::String(s) => Ok(AnyType::String(s)),
+      Value::Number(n) => {
+        if let Some(int_val) = n.as_i64() {
+          Ok(AnyType::Int(int_val as isize))
+        } else {
+          Ok(AnyType::Float(OrderedFloat(n.as_f64().unwrap())))
+        }
+      }
+      Value::Bool(b) => Ok(AnyType::Bool(b)),
+      Value::Array(arr) => {
+        let mut parsed_array = Vec::new();
+        for val in arr {
+          parsed_array.push(AnyType::from_str(&val.to_string())?);
+        }
+        Ok(AnyType::Array(parsed_array))
+      }
+      Value::Object(obj) => {
+        let mut parsed_object = BTreeMap::new();
+        for (key, val) in obj {
+          parsed_object.insert(key, AnyType::from_str(&val.to_string())?);
+        }
+        Ok(AnyType::Object(parsed_object))
+      }
+      Value::Null => Ok(AnyType::Null),
+    }
+  }
+}

From 4334f32ba4f8bdb675addf0a27b1dc93a1f44fae Mon Sep 17 00:00:00 2001
From: ivan-aksamentov <ivan.aksamentov@gmail.com>
Date: Thu, 16 May 2024 14:38:20 +0200
Subject: [PATCH 03/18] feat: use Auspice JSON as dataset

This allows passing a path to an Auspice JSON v2 file to the `--input-dataset` CLI argument. In this case we attempt to read not only the tree, but also the reference sequence, genome annotation and pathogen properties from that file, rather than from a conventional dataset.
---
 .../src/dataset/dataset_download.rs           | 74 ++++++++++++++--
 .../nextclade/src/gene/auspice_annotations.rs | 84 +++++++++++++++++++
 packages/nextclade/src/gene/gene_map.rs       |  8 +-
 packages/nextclade/src/gene/mod.rs            |  1 +
 packages/nextclade/src/tree/tree.rs           |  4 +-
 5 files changed, 163 insertions(+), 8 deletions(-)
 create mode 100644 packages/nextclade/src/gene/auspice_annotations.rs

diff --git a/packages/nextclade-cli/src/dataset/dataset_download.rs b/packages/nextclade-cli/src/dataset/dataset_download.rs
index 8736e0c02..e5a099a99 100644
--- a/packages/nextclade-cli/src/dataset/dataset_download.rs
+++ b/packages/nextclade-cli/src/dataset/dataset_download.rs
@@ -5,19 +5,20 @@ use color_eyre::{Section, SectionExt};
 use eyre::{eyre, ContextCompat, Report, WrapErr};
 use itertools::Itertools;
 use log::{warn, LevelFilter};
-use nextclade::analyze::virus_properties::{LabelledMutationsConfig, VirusProperties};
+use nextclade::analyze::virus_properties::VirusProperties;
 use nextclade::gene::gene_map::{filter_gene_map, GeneMap};
-use nextclade::io::dataset::{Dataset, DatasetFiles, DatasetMeta, DatasetsIndexJson};
-use nextclade::io::fasta::{read_one_fasta, read_one_fasta_str};
+use nextclade::io::dataset::{Dataset, DatasetsIndexJson};
+use nextclade::io::fasta::{read_one_fasta, read_one_fasta_str, FastaRecord};
 use nextclade::io::file::create_file_or_stdout;
 use nextclade::io::fs::{ensure_dir, has_extension, read_file_to_string};
 use nextclade::run::nextclade_wasm::NextcladeParams;
 use nextclade::tree::tree::AuspiceTree;
+use nextclade::utils::any::AnyType;
 use nextclade::utils::fs::list_files_recursive;
 use nextclade::utils::option::OptionMapRefFallible;
 use nextclade::utils::string::{format_list, surround_with_quotes, Indent};
 use nextclade::{make_error, make_internal_error, o};
-use std::collections::{BTreeMap, BTreeSet};
+use std::collections::BTreeSet;
 use std::fs::File;
 use std::io::{BufReader, Cursor, Read, Seek, Write};
 use std::ops::Deref;
@@ -35,13 +36,16 @@ pub fn nextclade_get_inputs(
     if input_dataset.is_file() && has_extension(input_dataset, "zip") {
       dataset_zip_load(run_args, input_dataset, cdses)
         .wrap_err_with(|| format!("When loading dataset from {input_dataset:#?}"))
+    } else if input_dataset.is_file() && has_extension(input_dataset, "json") {
+      dataset_json_load(run_args, input_dataset, cdses)
+        .wrap_err_with(|| format!("When loading dataset from {input_dataset:#?}"))
     } else if input_dataset.is_dir() {
       dataset_dir_load(run_args, input_dataset, cdses)
         .wrap_err_with(|| format!("When loading dataset from {input_dataset:#?}"))
     } else {
       make_error!(
         "--input-dataset: path is invalid. \
-        Expected a directory path or a zip archive file path, but got: '{input_dataset:#?}'"
+        Expected a directory path, a zip file path or json file path, but got: '{input_dataset:#?}'"
       )
     }
   } else {
@@ -283,6 +287,66 @@ pub fn dataset_dir_load(
   })
 }
 
+pub fn dataset_json_load(
+  run_args: &NextcladeRunArgs,
+  dataset_json: impl AsRef<Path>,
+  cdses: &Option<Vec<String>>,
+) -> Result<NextcladeParams, Report> {
+  let dataset_json = dataset_json.as_ref();
+
+  // let NextcladeRunInputArgs {
+  //   input_ref,
+  //   input_tree,
+  //   input_pathogen_json,
+  //   input_annotation,
+  //   ..
+  // } = &run_args.inputs;
+
+  let auspice_json = AuspiceTree::from_path(dataset_json).wrap_err("When reading Auspice JSON v2")?;
+
+  let virus_properties = auspice_json
+    .meta
+    .extensions
+    .nextclade
+    .pathogen
+    .as_ref()
+    .cloned()
+    .unwrap_or_default();
+
+  let ref_record = {
+    let ref_name = virus_properties
+      .attributes
+      .get("reference name")
+      .cloned()
+      .unwrap_or_else(|| AnyType::String("reference".to_owned()))
+      .as_str()
+      .wrap_err("When reading Auspice JSON v2 `.meta.extensions.nextclade.pathogen.attributes[\"reference name\"]`")?
+      .to_owned();
+
+    let ref_seq = auspice_json.root_sequence.get("nuc")
+    .ok_or_else(|| eyre!("Auspice JSON v2 is used as input dataset, but does not contain required reference sequence field (.root_sequence.nuc)"))?.to_owned();
+
+    FastaRecord {
+      index: 0,
+      seq_name: ref_name,
+      seq: ref_seq,
+    }
+  };
+
+  let gene_map = auspice_json
+    .meta
+    .genome_annotations
+    .map_ref_fallible(GeneMap::from_auspice_annotations)?
+    .unwrap_or_default();
+
+  Ok(NextcladeParams {
+    ref_record,
+    gene_map,
+    tree: Some(auspice_json),
+    virus_properties,
+  })
+}
+
 pub fn dataset_individual_files_load(
   run_args: &NextcladeRunArgs,
   cdses: &Option<Vec<String>>,
diff --git a/packages/nextclade/src/gene/auspice_annotations.rs b/packages/nextclade/src/gene/auspice_annotations.rs
new file mode 100644
index 000000000..9c45e830c
--- /dev/null
+++ b/packages/nextclade/src/gene/auspice_annotations.rs
@@ -0,0 +1,84 @@
+use crate::coord::range::{NucRefGlobalRange, NucRefLocalRange};
+use crate::gene::cds::Cds;
+use crate::gene::cds_segment::{CdsSegment, WrappingPart};
+use crate::gene::frame::Frame;
+use crate::gene::gene::Gene;
+use crate::gene::phase::Phase;
+use crate::tree::tree::{AuspiceGenomeAnnotations, Segments, StartEnd};
+use eyre::Report;
+use std::collections::HashMap;
+
+pub fn convert_auspice_annotations_to_genes(anns: &AuspiceGenomeAnnotations) -> Result<Vec<Gene>, Report> {
+  anns
+    .cdses
+    .iter()
+    .enumerate()
+    .map(|(index, (cds_name, ann))| {
+      let gene_name = ann.gene.as_ref().cloned().unwrap_or_else(|| format!("gene_{index}"));
+
+      let segments = match &ann.segments {
+        Segments::OneSegment(StartEnd { start, end }) => vec![CdsSegment {
+          index,
+          id: cds_name.to_owned(),
+          name: cds_name.to_owned(),
+          range: NucRefGlobalRange::from_isize(*start, *end),
+          range_local: NucRefLocalRange::from_isize(0, *end - *start),
+          landmark: None,
+          wrapping_part: WrappingPart::NonWrapping,
+          strand: ann.strand,
+          frame: Frame::_0,
+          phase: Phase::_0,
+          exceptions: vec![],
+          attributes: HashMap::default(),
+          source_record: None,
+          compat_is_gene: false,
+          color: None,
+        }],
+        Segments::MultipleSegments { segments } => segments
+          .iter()
+          .map(|StartEnd { start, end }| CdsSegment {
+            index,
+            id: cds_name.to_owned(),
+            name: cds_name.to_owned(),
+            range: NucRefGlobalRange::from_isize(*start, *end),
+            range_local: NucRefLocalRange::from_isize(0, *end - *start),
+            landmark: None,
+            wrapping_part: WrappingPart::NonWrapping,
+            strand: ann.strand,
+            frame: Frame::_0,
+            phase: Phase::_0,
+            exceptions: vec![],
+            attributes: HashMap::default(),
+            source_record: None,
+            compat_is_gene: false,
+            color: None,
+          })
+          .collect(),
+      };
+
+      let cds = Cds {
+        id: cds_name.to_owned(),
+        name: cds_name.to_owned(),
+        product: cds_name.to_owned(),
+        segments,
+        proteins: vec![],
+        exceptions: vec![],
+        attributes: HashMap::default(),
+        compat_is_gene: true,
+        color: ann.color.clone(),
+      };
+
+      Ok(Gene {
+        index,
+        id: gene_name.clone(),
+        name: gene_name,
+        cdses: vec![cds],
+        exceptions: vec![],
+        attributes: HashMap::default(),
+        source_record: None,
+        compat_is_cds: true,
+        color: ann.color.clone(),
+      })
+    })
+    .collect()
+}
diff --git a/packages/nextclade/src/gene/gene_map.rs b/packages/nextclade/src/gene/gene_map.rs
index ad6dbea74..a39493a64 100644
--- a/packages/nextclade/src/gene/gene_map.rs
+++ b/packages/nextclade/src/gene/gene_map.rs
@@ -1,11 +1,13 @@
 use crate::features::feature_group::FeatureGroup;
 use crate::features::feature_tree::FeatureTree;
 use crate::features::sequence_region::SequenceRegion;
+use crate::gene::auspice_annotations::convert_auspice_annotations_to_genes;
 use crate::gene::cds::Cds;
 use crate::gene::cds_segment::CdsSegment;
 use crate::gene::gene::{find_cdses, Gene};
 use crate::io::file::open_file_or_stdin;
 use crate::io::yaml::yaml_parse;
+use crate::tree::tree::AuspiceGenomeAnnotations;
 use crate::utils::collections::take_exactly_one;
 use crate::utils::error::report_to_string;
 use crate::{make_error, make_internal_report};
@@ -15,7 +17,6 @@ use log::warn;
 use num::Integer;
 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
-
 use std::path::Path;
 
 #[derive(Clone, Debug, Default, Deserialize, Serialize, JsonSchema)]
@@ -37,6 +38,11 @@ impl GeneMap {
     convert_feature_tree_to_gene_map(feature_tree)
   }
 
+  pub fn from_auspice_annotations(anns: &AuspiceGenomeAnnotations) -> Result<Self, Report> {
+    let genes = convert_auspice_annotations_to_genes(anns)?;
+    Ok(GeneMap::from_genes(genes))
+  }
+
   pub fn from_path<P: AsRef<Path>>(filename: P) -> Result<Self, Report> {
     let filename = filename.as_ref();
     let mut file = open_file_or_stdin(&Some(filename))?;
diff --git a/packages/nextclade/src/gene/mod.rs b/packages/nextclade/src/gene/mod.rs
index f0692e515..f77b1a201 100644
--- a/packages/nextclade/src/gene/mod.rs
+++ b/packages/nextclade/src/gene/mod.rs
@@ -1,3 +1,4 @@
+pub mod auspice_annotations;
 pub mod cds;
 pub mod cds_segment;
 pub mod frame;
diff --git a/packages/nextclade/src/tree/tree.rs b/packages/nextclade/src/tree/tree.rs
index bdbd824ea..20ce67d02 100644
--- a/packages/nextclade/src/tree/tree.rs
+++ b/packages/nextclade/src/tree/tree.rs
@@ -404,8 +404,8 @@ pub struct AuspiceGenomeAnnotationNuc {
 
 #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)]
 pub struct StartEnd {
-  pub start: usize,
-  pub end: usize,
+  pub start: isize,
+  pub end: isize,
 }
 
 #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)]

From b843ada325e5f2d516d69756f090e3c78b0ea0ad Mon Sep 17 00:00:00 2001
From: ivan-aksamentov <ivan.aksamentov@gmail.com>
Date: Thu, 16 May 2024 17:03:11 +0200
Subject: [PATCH 04/18] fix: parsing auspice genome annotations

---
 .../nextclade/src/gene/auspice_annotations.rs | 109 +++++++++++-------
 packages/nextclade/src/gene/cds.rs            |   3 +-
 packages/nextclade/src/tree/tree.rs           |  10 ++
 3 files changed, 76 insertions(+), 46 deletions(-)

diff --git a/packages/nextclade/src/gene/auspice_annotations.rs b/packages/nextclade/src/gene/auspice_annotations.rs
index 9c45e830c..4176bd938 100644
--- a/packages/nextclade/src/gene/auspice_annotations.rs
+++ b/packages/nextclade/src/gene/auspice_annotations.rs
@@ -1,60 +1,37 @@
-use crate::coord::range::{NucRefGlobalRange, NucRefLocalRange};
-use crate::gene::cds::Cds;
+use crate::coord::range::{NucRefGlobalRange, Range};
+use crate::features::feature::Landmark;
+use crate::gene::cds::{split_circular_cds_segments, Cds};
 use crate::gene::cds_segment::{CdsSegment, WrappingPart};
 use crate::gene::frame::Frame;
 use crate::gene::gene::Gene;
 use crate::gene::phase::Phase;
-use crate::tree::tree::{AuspiceGenomeAnnotations, Segments, StartEnd};
+use crate::io::json::{json_stringify, JsonPretty};
+use crate::tree::tree::{AuspiceGenomeAnnotationCds, AuspiceGenomeAnnotations, Segments, StartEnd};
 use eyre::Report;
+use maplit::hashmap;
 use std::collections::HashMap;
 
 pub fn convert_auspice_annotations_to_genes(anns: &AuspiceGenomeAnnotations) -> Result<Vec<Gene>, Report> {
+  let landmark = Landmark {
+    index: 0,
+    id: "landmark".to_owned(),
+    name: "landmark".to_owned(),
+    range: NucRefGlobalRange::from_isize(anns.nuc.start, anns.nuc.end),
+    strand: anns.nuc.strand,
+    is_circular: true,
+  };
+
   anns
     .cdses
     .iter()
     .enumerate()
     .map(|(index, (cds_name, ann))| {
-      let gene_name = ann.gene.as_ref().cloned().unwrap_or_else(|| format!("gene_{index}"));
+      let gene_name = ann.gene.as_ref().unwrap_or(cds_name);
 
       let segments = match &ann.segments {
-        Segments::OneSegment(StartEnd { start, end }) => vec![CdsSegment {
-          index,
-          id: cds_name.to_owned(),
-          name: cds_name.to_owned(),
-          range: NucRefGlobalRange::from_isize(*start, *end),
-          range_local: NucRefLocalRange::from_isize(0, *end - *start),
-          landmark: None,
-          wrapping_part: WrappingPart::NonWrapping,
-          strand: ann.strand,
-          frame: Frame::_0,
-          phase: Phase::_0,
-          exceptions: vec![],
-          attributes: HashMap::default(),
-          source_record: None,
-          compat_is_gene: false,
-          color: None,
-        }],
-        Segments::MultipleSegments { segments } => segments
-          .iter()
-          .map(|StartEnd { start, end }| CdsSegment {
-            index,
-            id: cds_name.to_owned(),
-            name: cds_name.to_owned(),
-            range: NucRefGlobalRange::from_isize(*start, *end),
-            range_local: NucRefLocalRange::from_isize(0, *end - *start),
-            landmark: None,
-            wrapping_part: WrappingPart::NonWrapping,
-            strand: ann.strand,
-            frame: Frame::_0,
-            phase: Phase::_0,
-            exceptions: vec![],
-            attributes: HashMap::default(),
-            source_record: None,
-            compat_is_gene: false,
-            color: None,
-          })
-          .collect(),
-      };
+        Segments::OneSegment(segment) => convert_cds_segments(ann, &landmark, cds_name, &[segment.to_owned()]),
+        Segments::MultipleSegments { segments } => convert_cds_segments(ann, &landmark, cds_name, segments),
+      }?;
 
       let cds = Cds {
         id: cds_name.to_owned(),
@@ -70,8 +47,8 @@ pub fn convert_auspice_annotations_to_genes(anns: &AuspiceGenomeAnnotations) ->
 
       Ok(Gene {
         index,
-        id: gene_name.clone(),
-        name: gene_name,
+        id: gene_name.to_owned(),
+        name: gene_name.to_owned(),
         cdses: vec![cds],
         exceptions: vec![],
         attributes: HashMap::default(),
@@ -82,3 +59,47 @@ pub fn convert_auspice_annotations_to_genes(anns: &AuspiceGenomeAnnotations) ->
     })
     .collect()
 }
+
+fn convert_cds_segments(
+  ann: &AuspiceGenomeAnnotationCds,
+  landmark: &Landmark,
+  cds_name: &str,
+  ann_segments: &[StartEnd],
+) -> Result<Vec<CdsSegment>, Report> {
+  let mut begin = 0;
+  let mut segments = vec![];
+
+  for (index, &StartEnd { start, end }) in ann_segments.iter().enumerate() {
+    let name = format!("{cds_name}_fragment_{index}");
+
+    let start = start.saturating_sub(1);
+    let range = NucRefGlobalRange::from_isize(start, end);
+    let range_local = Range::from_usize(begin, begin + range.len());
+    let phase = Phase::from_begin(range_local.begin)?;
+    let frame = Frame::from_begin(range.begin)?;
+
+    segments.push(CdsSegment {
+      index,
+      id: name.clone(),
+      name,
+      range: range.clone(),
+      range_local,
+      landmark: Some(landmark.to_owned()),
+      wrapping_part: WrappingPart::NonWrapping,
+      strand: ann.strand,
+      frame,
+      phase,
+      exceptions: vec![],
+      attributes: hashmap! {},
+      source_record: Some(json_stringify(ann, JsonPretty(true))?),
+      compat_is_gene: false,
+      color: ann.color.clone(),
+    });
+
+    begin += range.len();
+  }
+
+  let segments = split_circular_cds_segments(&segments)?;
+
+  Ok(segments)
+}
diff --git a/packages/nextclade/src/gene/cds.rs b/packages/nextclade/src/gene/cds.rs
index 62713df65..e10654145 100644
--- a/packages/nextclade/src/gene/cds.rs
+++ b/packages/nextclade/src/gene/cds.rs
@@ -14,7 +14,6 @@ use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
 
-
 #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
 #[serde(rename_all = "camelCase")]
 pub struct Cds {
@@ -199,7 +198,7 @@ impl Cds {
 ///   - the part from segment start to landmark end, before the wrap around
 ///   - (optionally) the middle parts spanning the entire sequence
 ///   - the last part from landmark start to segment end
-fn split_circular_cds_segments(segments: &[CdsSegment]) -> Result<Vec<CdsSegment>, Report> {
+pub fn split_circular_cds_segments(segments: &[CdsSegment]) -> Result<Vec<CdsSegment>, Report> {
   let mut linear_segments = vec![];
   for segment in segments {
     if let Some(landmark) = &segment.landmark {
diff --git a/packages/nextclade/src/tree/tree.rs b/packages/nextclade/src/tree/tree.rs
index 20ce67d02..2189d2a32 100644
--- a/packages/nextclade/src/tree/tree.rs
+++ b/packages/nextclade/src/tree/tree.rs
@@ -398,6 +398,16 @@ impl AuspiceDisplayDefaults {
 
 #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)]
 pub struct AuspiceGenomeAnnotationNuc {
+  pub start: isize,
+
+  pub end: isize,
+
+  #[serde(default)]
+  pub strand: GeneStrand,
+
+  #[serde(rename = "type", default, skip_serializing_if = "Option::is_none")]
+  pub r#type: Option<String>,
+
   #[serde(flatten)]
   pub other: serde_json::Value,
 }

From ff7e887cfb1849bfe7f891947373aa127b3e923d Mon Sep 17 00:00:00 2001
From: ivan-aksamentov <ivan.aksamentov@gmail.com>
Date: Fri, 17 May 2024 09:35:40 +0200
Subject: [PATCH 05/18] fix: off-by-one in landmark range

---
 packages/nextclade/src/gene/auspice_annotations.rs | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/packages/nextclade/src/gene/auspice_annotations.rs b/packages/nextclade/src/gene/auspice_annotations.rs
index 4176bd938..6dc2ddb32 100644
--- a/packages/nextclade/src/gene/auspice_annotations.rs
+++ b/packages/nextclade/src/gene/auspice_annotations.rs
@@ -16,7 +16,7 @@ pub fn convert_auspice_annotations_to_genes(anns: &AuspiceGenomeAnnotations) ->
     index: 0,
     id: "landmark".to_owned(),
     name: "landmark".to_owned(),
-    range: NucRefGlobalRange::from_isize(anns.nuc.start, anns.nuc.end),
+    range: NucRefGlobalRange::from_isize(anns.nuc.start.saturating_sub(1), anns.nuc.end),
     strand: anns.nuc.strand,
     is_circular: true,
   };
@@ -72,8 +72,7 @@ fn convert_cds_segments(
   for (index, &StartEnd { start, end }) in ann_segments.iter().enumerate() {
     let name = format!("{cds_name}_fragment_{index}");
 
-    let start = start.saturating_sub(1);
-    let range = NucRefGlobalRange::from_isize(start, end);
+    let range = NucRefGlobalRange::from_isize(start.saturating_sub(1), end);
     let range_local = Range::from_usize(begin, begin + range.len());
     let phase = Phase::from_begin(range_local.begin)?;
     let frame = Frame::from_begin(range.begin)?;

From 9b952bf1a7a0bf369ac4f3adb069e1c41735ca8d Mon Sep 17 00:00:00 2001
From: ivan-aksamentov <ivan.aksamentov@gmail.com>
Date: Fri, 17 May 2024 10:42:31 +0200
Subject: [PATCH 06/18] fix: duplicated start and end fields in the annotation
 of output tree

---
 packages/nextclade/src/gene/auspice_annotations.rs |  4 ++--
 packages/nextclade/src/tree/tree.rs                | 13 +++++++++----
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/packages/nextclade/src/gene/auspice_annotations.rs b/packages/nextclade/src/gene/auspice_annotations.rs
index 6dc2ddb32..6fb601f85 100644
--- a/packages/nextclade/src/gene/auspice_annotations.rs
+++ b/packages/nextclade/src/gene/auspice_annotations.rs
@@ -30,7 +30,7 @@ pub fn convert_auspice_annotations_to_genes(anns: &AuspiceGenomeAnnotations) ->
 
       let segments = match &ann.segments {
         Segments::OneSegment(segment) => convert_cds_segments(ann, &landmark, cds_name, &[segment.to_owned()]),
-        Segments::MultipleSegments { segments } => convert_cds_segments(ann, &landmark, cds_name, segments),
+        Segments::MultipleSegments { segments, .. } => convert_cds_segments(ann, &landmark, cds_name, segments),
       }?;
 
       let cds = Cds {
@@ -69,7 +69,7 @@ fn convert_cds_segments(
   let mut begin = 0;
   let mut segments = vec![];
 
-  for (index, &StartEnd { start, end }) in ann_segments.iter().enumerate() {
+  for (index, &StartEnd { start, end, .. }) in ann_segments.iter().enumerate() {
     let name = format!("{cds_name}_fragment_{index}");
 
     let range = NucRefGlobalRange::from_isize(start.saturating_sub(1), end);
diff --git a/packages/nextclade/src/tree/tree.rs b/packages/nextclade/src/tree/tree.rs
index 2189d2a32..6f5ce5e36 100644
--- a/packages/nextclade/src/tree/tree.rs
+++ b/packages/nextclade/src/tree/tree.rs
@@ -416,13 +416,21 @@ pub struct AuspiceGenomeAnnotationNuc {
 pub struct StartEnd {
   pub start: isize,
   pub end: isize,
+
+  #[serde(flatten)]
+  pub other: serde_json::Value,
 }
 
 #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)]
 #[serde(untagged)]
 pub enum Segments {
   OneSegment(StartEnd),
-  MultipleSegments { segments: Vec<StartEnd> },
+  MultipleSegments {
+    segments: Vec<StartEnd>,
+
+    #[serde(flatten)]
+    other: serde_json::Value,
+  },
 }
 
 #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)]
@@ -447,9 +455,6 @@ pub struct AuspiceGenomeAnnotationCds {
 
   #[serde(flatten)]
   pub segments: Segments,
-
-  #[serde(flatten)]
-  pub other: serde_json::Value,
 }
 
 #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)]

From 48d163c01ec8af5602624b4681d79b2843aba5ea Mon Sep 17 00:00:00 2001
From: ivan-aksamentov <ivan.aksamentov@gmail.com>
Date: Fri, 17 May 2024 12:32:13 +0200
Subject: [PATCH 07/18] feat: accept Auspice JSON genome annotation in
 read-annotation command

---
 packages/nextclade/src/gene/gene_map.rs | 59 ++++++++++++++++++-------
 packages/nextclade/src/tree/tree.rs     | 13 +++++-
 2 files changed, 55 insertions(+), 17 deletions(-)

diff --git a/packages/nextclade/src/gene/gene_map.rs b/packages/nextclade/src/gene/gene_map.rs
index a39493a64..f98077a37 100644
--- a/packages/nextclade/src/gene/gene_map.rs
+++ b/packages/nextclade/src/gene/gene_map.rs
@@ -10,6 +10,7 @@ use crate::io::yaml::yaml_parse;
 use crate::tree::tree::AuspiceGenomeAnnotations;
 use crate::utils::collections::take_exactly_one;
 use crate::utils::error::report_to_string;
+use crate::utils::string::{format_list, Indent};
 use crate::{make_error, make_internal_report};
 use eyre::{eyre, Report, WrapErr};
 use itertools::Itertools;
@@ -19,6 +20,8 @@ use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
 use std::path::Path;
 
+type GeneMapParserFn = Box<dyn Fn(&str) -> Result<GeneMap, Report>>;
+
 #[derive(Clone, Debug, Default, Deserialize, Serialize, JsonSchema)]
 #[must_use]
 pub struct GeneMap {
@@ -51,25 +54,44 @@ impl GeneMap {
     Self::from_str(String::from_utf8(buf)?).wrap_err_with(|| eyre!("When reading file: {filename:?}"))
   }
 
-  // TODO: rename this function, because it handles more than GFF3
   pub fn from_str(content: impl AsRef<str>) -> Result<Self, Report> {
     let content = content.as_ref();
-    let gene_map_yaml: Result<GeneMap, Report> = Self::from_yaml_str(content);
-    let gene_map_gff: Result<GeneMap, Report> = Self::from_gff3_str(content);
-
-    let gene_map = match (gene_map_yaml, gene_map_gff) {
-      (Err(json_err), Err(gff_err)) => {
-        return make_error!("Attempted to parse the genome annotation as JSON and as GFF, but both attempts failed:\nJSON error: {}\n\nGFF3 error: {}\n",
-          report_to_string(&json_err),
-          report_to_string(&gff_err),
-        )
-      },
-      (Ok(gene_map), _) => gene_map,
-      (_, Ok(gene_map)) => gene_map,
-    };
 
-    gene_map.validate()?;
-    Ok(gene_map)
+    let parsers: Vec<(&str, GeneMapParserFn)> = vec![
+      (
+        "Genome annotation in GFF3 format",
+        Box::new(|content| Self::from_gff3_str(content)),
+      ),
+      (
+        "Genome annotation in external JSON format",
+        Box::new(|content| Self::from_yaml_str(content)),
+      ),
+      (
+        "Genome annotation extracted from Auspice JSON",
+        Box::new(|content| Self::from_tree_json_str(content)),
+      ),
+    ];
+
+    let mut errors = Vec::new();
+    for (name, parser) in &parsers {
+      match parser(content) {
+        Ok(map) => {
+          map.validate()?;
+          return Ok(map);
+        }
+        Err(err) => {
+          errors.push(format!(
+            "When attempted to parse as {name}: {}\n",
+            report_to_string(&err)
+          ));
+        }
+      }
+    }
+
+    make_error!(
+      "Attempted to parse the genome annotation but failed. Tried multiple formats:\n\n{}\n",
+      format_list(Indent::default(), errors.into_iter())
+    )
   }
 
   fn from_yaml_str(content: impl AsRef<str>) -> Result<Self, Report> {
@@ -80,6 +102,11 @@ impl GeneMap {
     Self::from_feature_tree(&FeatureTree::from_gff3_str(content.as_ref())?)
   }
 
+  fn from_tree_json_str(content: impl AsRef<str>) -> Result<Self, Report> {
+    let anns = AuspiceGenomeAnnotations::from_tree_json_str(content)?;
+    Self::from_auspice_annotations(&anns)
+  }
+
   #[must_use]
   pub fn is_empty(&self) -> bool {
     self.genes.is_empty()
diff --git a/packages/nextclade/src/tree/tree.rs b/packages/nextclade/src/tree/tree.rs
index 6f5ce5e36..b3cfe0f31 100644
--- a/packages/nextclade/src/tree/tree.rs
+++ b/packages/nextclade/src/tree/tree.rs
@@ -11,7 +11,7 @@ use crate::graph::node::{GraphNode, Node};
 use crate::graph::traits::{HasDivergence, HasName};
 use crate::io::fs::read_file_to_string;
 use crate::io::json::json_parse;
-use eyre::{Report, WrapErr};
+use eyre::{eyre, Report, WrapErr};
 use schemars::JsonSchema;
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
 use std::collections::BTreeMap;
@@ -468,6 +468,17 @@ pub struct AuspiceGenomeAnnotations {
   pub other: serde_json::Value,
 }
 
+impl AuspiceGenomeAnnotations {
+  pub fn from_tree_json_str(content: impl AsRef<str>) -> Result<Self, Report> {
+    let content = content.as_ref();
+    let tree = AuspiceTree::from_str(content)?;
+    tree
+      .meta
+      .genome_annotations
+      .ok_or_else(|| eyre!("Auspice JSON does not contain `.genome_annotations` field, but required"))
+  }
+}
+
 #[derive(Clone, Serialize, Deserialize, schemars::JsonSchema, Validate, Debug)]
 pub struct AuspiceTreeMeta {
   #[serde(skip_serializing_if = "Option::is_none")]

From 1fc493686abf3ace869dac8fa4dc9be38e7022ce Mon Sep 17 00:00:00 2001
From: ivan-aksamentov <ivan.aksamentov@gmail.com>
Date: Thu, 23 May 2024 12:37:10 +0200
Subject: [PATCH 08/18] refactor: aggregate inputs loading

---
 packages/nextclade-web/src/io/loadInputs.ts | 18 +++++++++++++++++
 packages/nextclade-web/src/pages/_app.tsx   | 22 ++++++++++-----------
 2 files changed, 28 insertions(+), 12 deletions(-)
 create mode 100644 packages/nextclade-web/src/io/loadInputs.ts

diff --git a/packages/nextclade-web/src/io/loadInputs.ts b/packages/nextclade-web/src/io/loadInputs.ts
new file mode 100644
index 000000000..41072815b
--- /dev/null
+++ b/packages/nextclade-web/src/io/loadInputs.ts
@@ -0,0 +1,18 @@
+import type { ParsedUrlQuery } from 'querystring'
+import type { Dataset } from 'src/types'
+import { createInputFastasFromUrlParam, createInputFromUrlParamMaybe } from 'src/io/createInputFromUrlParamMaybe'
+
+export async function loadInputs(urlQuery: ParsedUrlQuery, dataset?: Dataset) {
+  const inputFastas = await createInputFastasFromUrlParam(urlQuery, dataset)
+  const refSeq = await createInputFromUrlParamMaybe(urlQuery, 'input-ref')
+  const geneMap = await createInputFromUrlParamMaybe(urlQuery, 'input-annotation')
+  const refTree = await createInputFromUrlParamMaybe(urlQuery, 'input-tree')
+  const virusProperties = await createInputFromUrlParamMaybe(urlQuery, 'input-pathogen-json')
+  return {
+    inputFastas,
+    refSeq,
+    geneMap,
+    refTree,
+    virusProperties,
+  }
+}
diff --git a/packages/nextclade-web/src/pages/_app.tsx b/packages/nextclade-web/src/pages/_app.tsx
index 7ca35a9fb..62407c086 100644
--- a/packages/nextclade-web/src/pages/_app.tsx
+++ b/packages/nextclade-web/src/pages/_app.tsx
@@ -11,7 +11,7 @@ import dynamic from 'next/dynamic'
 import { sanitizeError } from 'src/helpers/sanitizeError'
 import { useRunAnalysis } from 'src/hooks/useRunAnalysis'
 import i18nAuspice, { changeAuspiceLocale } from 'src/i18n/i18n.auspice'
-import { createInputFastasFromUrlParam, createInputFromUrlParamMaybe } from 'src/io/createInputFromUrlParamMaybe'
+import { loadInputs } from 'src/io/loadInputs'
 import { mdxComponents } from 'src/mdx-components'
 import LoadingPage from 'src/pages/loading'
 import { globalErrorAtom } from 'src/state/error.state'
@@ -37,7 +37,6 @@ import { I18nextProvider } from 'react-i18next'
 import { MDXProvider } from '@mdx-js/react'
 import { QueryClient, QueryClientConfig, QueryClientProvider } from 'react-query'
 import { ReactQueryDevtools } from 'react-query/devtools'
-
 import { DOMAIN_STRIPPED } from 'src/constants'
 import { parseUrl } from 'src/helpers/parseUrl'
 import { getDatasetServerUrl, initializeDatasets } from 'src/io/fetchDatasets'
@@ -122,19 +121,18 @@ export function RecoilStateInitializer() {
         return dataset
       })
       .then(async (dataset) => {
-        const inputFastas = await createInputFastasFromUrlParam(urlQuery, dataset)
+        const { inputFastas, refSeq, geneMap, refTree, virusProperties } = await loadInputs(urlQuery, dataset)
+
+        set(refSeqInputAtom, refSeq)
+        set(geneMapInputAtom, geneMap)
+        set(refTreeInputAtom, refTree)
+        set(virusPropertiesInputAtom, virusProperties)
 
         if (!isEmpty(inputFastas)) {
           set(qrySeqInputsStorageAtom, inputFastas)
-        }
-
-        set(refSeqInputAtom, await createInputFromUrlParamMaybe(urlQuery, 'input-ref'))
-        set(geneMapInputAtom, await createInputFromUrlParamMaybe(urlQuery, 'input-annotation'))
-        set(refTreeInputAtom, await createInputFromUrlParamMaybe(urlQuery, 'input-tree'))
-        set(virusPropertiesInputAtom, await createInputFromUrlParamMaybe(urlQuery, 'input-pathogen-json'))
-
-        if (!isEmpty(inputFastas) && !isEmpty(dataset)) {
-          run()
+          if (!isEmpty(dataset)) {
+            run()
+          }
         }
 
         return undefined

From a27ee661d4fceebf01be5db6ff238e1fbf4f3e9e Mon Sep 17 00:00:00 2001
From: ivan-aksamentov <ivan.aksamentov@gmail.com>
Date: Thu, 23 May 2024 15:45:50 +0200
Subject: [PATCH 09/18] feat(web): add url parameter `dataset-json-url`

This allows an Auspice JSON file to be used as a Nextclade dataset in the web app.
---
 .../src/dataset/dataset_download.rs           | 43 +-------
 .../src/components/Error/ErrorContent.tsx     |  3 +-
 .../error-types/NextcladeV2ErrorContent.tsx   |  2 +-
 .../nextclade-web/src/hooks/useRunAnalysis.ts | 56 ++++++++++-
 .../src/io/fetchSingleDataset.ts              | 35 +++++--
 .../src/io/fetchSingleDatasetAuspice.ts       | 36 +++++++
 ...mUrl.ts => fetchSingleDatasetDirectory.ts} | 71 ++++++++------
 packages/nextclade-web/src/pages/_app.tsx     |  8 +-
 .../nextclade-web/src/state/inputs.state.ts   |  8 +-
 .../src/workers/launchAnalysis.ts             | 43 +-------
 packages/nextclade/src/run/nextclade_wasm.rs  | 98 ++++++++++++++++---
 11 files changed, 256 insertions(+), 147 deletions(-)
 create mode 100644 packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts
 rename packages/nextclade-web/src/io/{fetchSingleDatasetFromUrl.ts => fetchSingleDatasetDirectory.ts} (94%)

diff --git a/packages/nextclade-cli/src/dataset/dataset_download.rs b/packages/nextclade-cli/src/dataset/dataset_download.rs
index e5a099a99..64184a0c4 100644
--- a/packages/nextclade-cli/src/dataset/dataset_download.rs
+++ b/packages/nextclade-cli/src/dataset/dataset_download.rs
@@ -303,48 +303,7 @@ pub fn dataset_json_load(
   // } = &run_args.inputs;
 
   let auspice_json = AuspiceTree::from_path(dataset_json).wrap_err("When reading Auspice JSON v2")?;
-
-  let virus_properties = auspice_json
-    .meta
-    .extensions
-    .nextclade
-    .pathogen
-    .as_ref()
-    .cloned()
-    .unwrap_or_default();
-
-  let ref_record = {
-    let ref_name = virus_properties
-      .attributes
-      .get("reference name")
-      .cloned()
-      .unwrap_or_else(|| AnyType::String("reference".to_owned()))
-      .as_str()
-      .wrap_err("When reading Auspice JSON v2 `.meta.extensions.nextclade.pathogen.attributes[\"reference name\"]`")?
-      .to_owned();
-
-    let ref_seq = auspice_json.root_sequence.get("nuc")
-    .ok_or_else(|| eyre!("Auspice JSON v2 is used as input dataset, but does not contain required reference sequence field (.root_sequence.nuc)"))?.to_owned();
-
-    FastaRecord {
-      index: 0,
-      seq_name: ref_name,
-      seq: ref_seq,
-    }
-  };
-
-  let gene_map = auspice_json
-    .meta
-    .genome_annotations
-    .map_ref_fallible(GeneMap::from_auspice_annotations)?
-    .unwrap_or_default();
-
-  Ok(NextcladeParams {
-    ref_record,
-    gene_map,
-    tree: Some(auspice_json),
-    virus_properties,
-  })
+  NextcladeParams::from_auspice(&auspice_json)
 }
 
 pub fn dataset_individual_files_load(
diff --git a/packages/nextclade-web/src/components/Error/ErrorContent.tsx b/packages/nextclade-web/src/components/Error/ErrorContent.tsx
index 0ab256631..e93a10e33 100644
--- a/packages/nextclade-web/src/components/Error/ErrorContent.tsx
+++ b/packages/nextclade-web/src/components/Error/ErrorContent.tsx
@@ -1,17 +1,16 @@
 import React, { useCallback, useMemo, useState } from 'react'
 import { Button, Col, Row } from 'reactstrap'
 import { useTranslationSafe } from 'src/helpers/useTranslationSafe'
-import { NextcladeV2Error } from 'src/io/fetchSingleDatasetFromUrl'
 import styled from 'styled-components'
 import { CopyToClipboard } from 'react-copy-to-clipboard'
 import { FaClipboardCheck, FaClipboardList } from 'react-icons/fa'
-
 import { ErrorGeneric } from 'src/components/Error/error-types/ErrorGeneric'
 import { ErrorNetworkConnectionFailure } from 'src/components/Error/error-types/ErrorNetworkConnectionFailure'
 import { ErrorNetworkRequestFailure } from 'src/components/Error/error-types/ErrorNetworkRequestFailure'
 import { NextcladeV2ErrorContent } from 'src/components/Error/error-types/NextcladeV2ErrorContent'
 import { ErrorContentExplanation, getErrorReportText } from 'src/components/Error/ErrorContentExplanation'
 import { sanitizeError } from 'src/helpers/sanitizeError'
+import { NextcladeV2Error } from 'src/io/fetchSingleDatasetDirectory'
 import { HttpRequestError } from 'src/io/axiosFetch'
 import { ErrorMessageMonospace } from './ErrorStyles'
 
diff --git a/packages/nextclade-web/src/components/Error/error-types/NextcladeV2ErrorContent.tsx b/packages/nextclade-web/src/components/Error/error-types/NextcladeV2ErrorContent.tsx
index c4c199054..bc177d9ba 100644
--- a/packages/nextclade-web/src/components/Error/error-types/NextcladeV2ErrorContent.tsx
+++ b/packages/nextclade-web/src/components/Error/error-types/NextcladeV2ErrorContent.tsx
@@ -3,8 +3,8 @@ import React, { useMemo } from 'react'
 import { ErrorContainer, ErrorMessage } from 'src/components/Error/ErrorStyles'
 import { LinkExternal } from 'src/components/Link/LinkExternal'
 import { PROJECT_NAME, RELEASE_OLD_URL } from 'src/constants'
+import { NextcladeV2Error } from 'src/io/fetchSingleDatasetDirectory'
 import { useTranslationSafe } from 'src/helpers/useTranslationSafe'
-import { NextcladeV2Error } from 'src/io/fetchSingleDatasetFromUrl'
 import urljoin from 'url-join'
 
 export interface Props {
diff --git a/packages/nextclade-web/src/hooks/useRunAnalysis.ts b/packages/nextclade-web/src/hooks/useRunAnalysis.ts
index 6d9746fa8..fae78841b 100644
--- a/packages/nextclade-web/src/hooks/useRunAnalysis.ts
+++ b/packages/nextclade-web/src/hooks/useRunAnalysis.ts
@@ -1,18 +1,20 @@
 import type { AuspiceJsonV2, CladeNodeAttrDesc } from 'auspice'
-
 import { changeColorBy } from 'auspice/src/actions/colors'
+import { concurrent } from 'fasy'
 import { useRouter } from 'next/router'
 import { useDispatch } from 'react-redux'
 import { useRecoilCallback } from 'recoil'
+import { ErrorInternal } from 'src/helpers/ErrorInternal'
 import { clearAllFiltersAtom } from 'src/state/resultFilters.state'
 import { viewedCdsAtom } from 'src/state/seqViewSettings.state'
-import { AlgorithmGlobalStatus } from 'src/types'
+import { AlgorithmGlobalStatus, AlgorithmInput, Dataset, NextcladeParamsRaw, NextcladeParamsRawDir } from 'src/types'
 import { sanitizeError } from 'src/helpers/sanitizeError'
 import { auspiceStartClean, treeFilterByNodeType } from 'src/state/auspice/auspice.actions'
 import { createAuspiceState } from 'src/state/auspice/createAuspiceState'
 import { datasetCurrentAtom, cdsOrderPreferenceAtom } from 'src/state/dataset.state'
 import { globalErrorAtom } from 'src/state/error.state'
 import {
+  datasetJsonAtom,
   geneMapInputAtom,
   qrySeqInputsStorageAtom,
   refSeqInputAtom,
@@ -35,6 +37,7 @@ import {
 } from 'src/state/results.state'
 import { numThreadsAtom, showNewRunPopupAtom } from 'src/state/settings.state'
 import { launchAnalysis, LaunchAnalysisCallbacks, LaunchAnalysisInputs } from 'src/workers/launchAnalysis'
+import { axiosFetchRaw } from 'src/io/axiosFetch'
 
 export function useRunAnalysis() {
   const router = useRouter()
@@ -60,6 +63,8 @@ export function useRunAnalysis() {
         const qryInputs = getPromise(qrySeqInputsStorageAtom)
         const csvColumnConfig = getPromise(csvColumnConfigAtom)
 
+        const datasetJsonPromise = getPromise(datasetJsonAtom)
+
         const inputs: LaunchAnalysisInputs = {
           refSeq: getPromise(refSeqInputAtom),
           geneMap: getPromise(geneMapInputAtom),
@@ -130,7 +135,22 @@ export function useRunAnalysis() {
           .push('/results', '/results')
           .then(async () => {
             set(analysisStatusGlobalAtom, AlgorithmGlobalStatus.initWorkers)
-            return launchAnalysis(qryInputs, inputs, callbacks, datasetCurrent, numThreads, csvColumnConfig)
+
+            const tree = await datasetJsonPromise
+
+            let params: NextcladeParamsRaw
+            if (tree) {
+              params = { Auspice: { tree: JSON.stringify(tree) } }
+            } else {
+              const dataset = await datasetCurrent
+              if (!dataset) {
+                throw new ErrorInternal('Dataset is required but not found')
+              }
+              const data = await getParams(inputs, dataset)
+              params = { Dir: data }
+            }
+
+            return launchAnalysis(qryInputs, params, callbacks, numThreads, csvColumnConfig)
           })
           .catch((error) => {
             set(analysisStatusGlobalAtom, AlgorithmGlobalStatus.failed)
@@ -140,3 +160,33 @@ export function useRunAnalysis() {
     [router, dispatch],
   )
 }
+
+/** Resolves all param inputs into strings */
+async function getParams(paramInputs: LaunchAnalysisInputs, dataset: Dataset): Promise<NextcladeParamsRawDir> {
+  const entries = [
+    { key: 'geneMap', input: paramInputs.geneMap, datasetFileUrl: dataset.files.genomeAnnotation },
+    { key: 'refSeq', input: paramInputs.refSeq, datasetFileUrl: dataset.files.reference },
+    { key: 'tree', input: paramInputs.tree, datasetFileUrl: dataset.files.treeJson },
+    { key: 'virusProperties', input: paramInputs.virusProperties, datasetFileUrl: dataset.files.pathogenJson },
+  ]
+
+  return Object.fromEntries(
+    await concurrent.map(async ({ key, input, datasetFileUrl }) => {
+      return [key, await resolveInput(await input, datasetFileUrl)]
+    }, entries),
+  ) as unknown as NextcladeParamsRawDir
+}
+
+async function resolveInput(input: AlgorithmInput | undefined, datasetFileUrl: string | undefined) {
+  // If data is provided explicitly, load it
+  if (input) {
+    return input.getContent()
+  }
+
+  // Otherwise fetch corresponding file from the dataset
+  if (datasetFileUrl) {
+    return axiosFetchRaw(datasetFileUrl)
+  }
+
+  return undefined
+}
diff --git a/packages/nextclade-web/src/io/fetchSingleDataset.ts b/packages/nextclade-web/src/io/fetchSingleDataset.ts
index c6258dedd..e376bc66b 100644
--- a/packages/nextclade-web/src/io/fetchSingleDataset.ts
+++ b/packages/nextclade-web/src/io/fetchSingleDataset.ts
@@ -1,16 +1,39 @@
 import type { ParsedUrlQuery } from 'querystring'
+import { ErrorFatal } from 'src/helpers/ErrorFatal'
+import { fetchSingleDatasetAuspice } from 'src/io/fetchSingleDatasetAuspice'
+import { fetchSingleDatasetDirectory } from 'src/io/fetchSingleDatasetDirectory'
 import { getQueryParamMaybe } from 'src/io/getQueryParamMaybe'
-import { fetchSingleDatasetFromUrl } from 'src/io/fetchSingleDatasetFromUrl'
 import { isGithubUrlOrShortcut, parseGitHubRepoUrlOrShortcut } from 'src/io/fetchSingleDatasetFromGithub'
 
 export async function fetchSingleDataset(urlQuery: ParsedUrlQuery) {
   const datasetUrl = getQueryParamMaybe(urlQuery, 'dataset-url')
-  if (!datasetUrl) {
+  const datasetUrlJson = getQueryParamMaybe(urlQuery, 'dataset-json-url')
+
+  if (datasetUrl && datasetUrlJson) {
+    throw new ErrorFatal(
+      "URL parameters 'dataset-url' and 'dataset-json-url' are mutually exclusive, but both provided. Please remove one or the other.",
+    )
+  }
+
+  let finalUrl
+  let options
+  let fetchFunction
+
+  if (datasetUrl) {
+    finalUrl = datasetUrl
+    fetchFunction = fetchSingleDatasetDirectory
+  } else if (datasetUrlJson) {
+    finalUrl = datasetUrlJson
+    fetchFunction = fetchSingleDatasetAuspice
+  } else {
     return undefined
   }
-  if (isGithubUrlOrShortcut(datasetUrl)) {
-    const { directUrl } = await parseGitHubRepoUrlOrShortcut(datasetUrl)
-    return fetchSingleDatasetFromUrl(directUrl, { datasetOriginalUrl: datasetUrl })
+
+  if (isGithubUrlOrShortcut(finalUrl)) {
+    const { directUrl } = await parseGitHubRepoUrlOrShortcut(finalUrl)
+    options = { datasetOriginalUrl: finalUrl }
+    finalUrl = directUrl
   }
-  return fetchSingleDatasetFromUrl(datasetUrl)
+
+  return fetchFunction(finalUrl, options)
 }
diff --git a/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts b/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts
new file mode 100644
index 000000000..d0d106c5d
--- /dev/null
+++ b/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts
@@ -0,0 +1,36 @@
+import { isEmpty } from 'lodash'
+import { FatalError } from 'next/dist/lib/fatal-error'
+import { attrStrMaybe, AuspiceTree, Dataset, DatasetFiles } from 'src/types'
+import { removeTrailingSlash } from 'src/io/url'
+import { axiosFetch } from 'src/io/axiosFetch'
+
+export async function fetchSingleDatasetAuspice(datasetJsonUrl_: string) {
+  const datasetJsonUrl = removeTrailingSlash(datasetJsonUrl_)
+
+  const auspiceJson = await axiosFetch<AuspiceTree>(datasetJsonUrl)
+  const pathogen = auspiceJson.meta?.extensions?.nextclade?.pathogen
+
+  if (isEmpty(auspiceJson.root_sequence.nuc)) {
+    throw new FatalError(`Auspice JSON does not contain required field '.root_sequence.nuc': ${datasetJsonUrl_}`)
+  }
+
+  const currentDataset: Dataset = {
+    path: datasetJsonUrl,
+    capabilities: {
+      primers: false,
+      qc: [],
+    },
+    ...pathogen,
+
+    // HACK: there is no files if dataset comes from Auspice JSON, neither they are needed. What to do?
+    files: {} as unknown as DatasetFiles,
+  }
+
+  const datasets = [currentDataset]
+  const defaultDataset = currentDataset
+  const currentDatasetName = currentDataset.path
+  const defaultDatasetName = currentDatasetName
+  const defaultDatasetNameFriendly = attrStrMaybe(currentDataset.attributes, 'name') ?? currentDatasetName
+
+  return { datasets, defaultDataset, defaultDatasetName, defaultDatasetNameFriendly, currentDataset, auspiceJson }
+}
diff --git a/packages/nextclade-web/src/io/fetchSingleDatasetFromUrl.ts b/packages/nextclade-web/src/io/fetchSingleDatasetDirectory.ts
similarity index 94%
rename from packages/nextclade-web/src/io/fetchSingleDatasetFromUrl.ts
rename to packages/nextclade-web/src/io/fetchSingleDatasetDirectory.ts
index a2045ce81..1b59c48c3 100644
--- a/packages/nextclade-web/src/io/fetchSingleDatasetFromUrl.ts
+++ b/packages/nextclade-web/src/io/fetchSingleDatasetDirectory.ts
@@ -7,37 +7,7 @@ import { removeTrailingSlash } from 'src/io/url'
 import { axiosFetch, axiosHead, axiosHeadOrUndefined } from 'src/io/axiosFetch'
 import { sanitizeError } from 'src/helpers/sanitizeError'
 
-export class NextcladeV2Error extends Error {
-  public readonly datasetRootUrl: string
-
-  public constructor(datasetRootUrl: string) {
-    super()
-    this.datasetRootUrl = datasetRootUrl
-  }
-}
-
-function checkDatasetV2FilesExist(datasetRootUrl: string) {
-  return Promise.all([
-    ['genemap.gff', 'primers.csv', 'qc.json', 'tag.json', 'virus_properties.json'].map((file) =>
-      axiosHeadOrUndefined(urljoin(datasetRootUrl, file)),
-    ),
-  ])
-}
-
-async function fetchPathogenJson(datasetRootUrl: string) {
-  let pathogen
-  try {
-    pathogen = await axiosFetch<VirusProperties>(urljoin(datasetRootUrl, 'pathogen.json'))
-  } catch (error: unknown) {
-    if (axios.isAxiosError(error) && error.status === '404' && (await checkDatasetV2FilesExist(datasetRootUrl))) {
-      throw new NextcladeV2Error(datasetRootUrl)
-    }
-    throw error
-  }
-  return pathogen
-}
-
-export async function fetchSingleDatasetFromUrl(
+export async function fetchSingleDatasetDirectory(
   datasetRootUrl_: string,
   meta?: { datasetOriginalUrl?: string; datasetGithubRepo?: string },
 ) {
@@ -79,7 +49,44 @@ export async function fetchSingleDatasetFromUrl(
     Object.entries(currentDataset.files).filter(([filename, _]) => !['sequences.fasta'].includes(filename)),
   )
 
-  return { datasets, defaultDataset, defaultDatasetName, defaultDatasetNameFriendly, currentDataset }
+  return {
+    datasets,
+    defaultDataset,
+    defaultDatasetName,
+    defaultDatasetNameFriendly,
+    currentDataset,
+    auspiceJson: undefined,
+  }
+}
+
+async function fetchPathogenJson(datasetRootUrl: string) {
+  let pathogen
+  try {
+    pathogen = await axiosFetch<VirusProperties>(urljoin(datasetRootUrl, 'pathogen.json'))
+  } catch (error: unknown) {
+    if (axios.isAxiosError(error) && error.status === '404' && (await checkDatasetV2FilesExist(datasetRootUrl))) {
+      throw new NextcladeV2Error(datasetRootUrl)
+    }
+    throw error
+  }
+  return pathogen
+}
+
+export class NextcladeV2Error extends Error {
+  public readonly datasetRootUrl: string
+
+  public constructor(datasetRootUrl: string) {
+    super()
+    this.datasetRootUrl = datasetRootUrl
+  }
+}
+
+function checkDatasetV2FilesExist(datasetRootUrl: string) {
+  return Promise.all([
+    ['genemap.gff', 'primers.csv', 'qc.json', 'tag.json', 'virus_properties.json'].map((file) =>
+      axiosHeadOrUndefined(urljoin(datasetRootUrl, file)),
+    ),
+  ])
 }
 
 export class ErrorDatasetFileMissing extends Error {
diff --git a/packages/nextclade-web/src/pages/_app.tsx b/packages/nextclade-web/src/pages/_app.tsx
index 62407c086..e0d6e91f4 100644
--- a/packages/nextclade-web/src/pages/_app.tsx
+++ b/packages/nextclade-web/src/pages/_app.tsx
@@ -16,6 +16,7 @@ import { mdxComponents } from 'src/mdx-components'
 import LoadingPage from 'src/pages/loading'
 import { globalErrorAtom } from 'src/state/error.state'
 import {
+  datasetJsonAtom,
   geneMapInputAtom,
   qrySeqInputsStorageAtom,
   refSeqInputAtom,
@@ -101,8 +102,8 @@ export function RecoilStateInitializer() {
 
         const datasetInfo = await fetchSingleDataset(urlQuery)
         if (!isNil(datasetInfo)) {
-          const { datasets, currentDataset } = datasetInfo
-          return { datasets, currentDataset, minimizerIndexVersion: undefined }
+          const { datasets, currentDataset, auspiceJson } = datasetInfo
+          return { datasets, currentDataset, minimizerIndexVersion: undefined, auspiceJson }
         }
         return { datasets, currentDataset, minimizerIndexVersion }
       })
@@ -112,12 +113,13 @@ export function RecoilStateInitializer() {
         set(globalErrorAtom, sanitizeError(error))
         throw error
       })
-      .then(async ({ datasets, currentDataset, minimizerIndexVersion }) => {
+      .then(async ({ datasets, currentDataset, minimizerIndexVersion, auspiceJson }) => {
         set(datasetsAtom, { datasets })
         const previousDataset = await getPromise(datasetCurrentAtom)
         const dataset = currentDataset ?? previousDataset
         set(datasetCurrentAtom, dataset)
         set(minimizerIndexVersionAtom, minimizerIndexVersion)
+        set(datasetJsonAtom, auspiceJson)
         return dataset
       })
       .then(async (dataset) => {
diff --git a/packages/nextclade-web/src/state/inputs.state.ts b/packages/nextclade-web/src/state/inputs.state.ts
index d3d91a4d1..a2df1b92e 100644
--- a/packages/nextclade-web/src/state/inputs.state.ts
+++ b/packages/nextclade-web/src/state/inputs.state.ts
@@ -1,11 +1,11 @@
 import { isEmpty } from 'lodash'
 import { useCallback, useEffect } from 'react'
 import { atom, selector, useRecoilState, useResetRecoilState } from 'recoil'
+import type { AlgorithmInput, AuspiceTree } from 'src/types'
 import { cdsOrderPreferenceAtom } from 'src/state/dataset.state'
 import { clearAllFiltersAtom } from 'src/state/resultFilters.state'
 import { analysisResultsAtom, analysisStatusGlobalAtom, treeAtom } from 'src/state/results.state'
 import { viewedCdsAtom } from 'src/state/seqViewSettings.state'
-import { AlgorithmInput } from 'src/types'
 import { notUndefinedOrNull } from 'src/helpers/notUndefined'
 import { useResetSuggestions } from 'src/hooks/useResetSuggestions'
 
@@ -101,6 +101,11 @@ export const hasRequiredInputsAtom = selector({
   },
 })
 
+export const datasetJsonAtom = atom<AuspiceTree | undefined>({
+  key: 'datasetJson',
+  default: undefined,
+})
+
 /** Counts how many custom inputs are set */
 export const inputCustomizationCounterAtom = selector<number>({
   key: 'inputCustomizationCounterAtom',
@@ -130,5 +135,6 @@ export const datasetFilesResetAtom = selector<undefined>({
     reset(geneMapInputAtom)
     reset(refTreeInputAtom)
     reset(virusPropertiesInputAtom)
+    reset(datasetJsonAtom)
   },
 })
diff --git a/packages/nextclade-web/src/workers/launchAnalysis.ts b/packages/nextclade-web/src/workers/launchAnalysis.ts
index f7af20736..73947337c 100644
--- a/packages/nextclade-web/src/workers/launchAnalysis.ts
+++ b/packages/nextclade-web/src/workers/launchAnalysis.ts
@@ -2,7 +2,6 @@ import { concurrent } from 'fasy'
 import { isEmpty, merge } from 'lodash'
 import type {
   AlgorithmInput,
-  Dataset,
   FastaRecordId,
   NextcladeResult,
   CsvColumnConfig,
@@ -11,10 +10,8 @@ import type {
   OutputTrees,
 } from 'src/types'
 import { AlgorithmGlobalStatus } from 'src/types'
-import { ErrorInternal } from 'src/helpers/ErrorInternal'
 import type { LauncherThread } from 'src/workers/launcher.worker'
 import { spawn } from 'src/workers/spawn'
-import { axiosFetchRaw } from 'src/io/axiosFetch'
 
 export interface LaunchAnalysisInputs {
   refSeq: Promise<AlgorithmInput | undefined>
@@ -35,9 +32,8 @@ export interface LaunchAnalysisCallbacks {
 
 export async function launchAnalysis(
   qryFastaInputs: Promise<AlgorithmInput[]>,
-  paramInputs: LaunchAnalysisInputs,
+  params: NextcladeParamsRaw,
   callbacks: LaunchAnalysisCallbacks,
-  datasetPromise: Promise<Dataset | undefined>,
   numThreads: Promise<number>,
   csvColumnConfigPromise: Promise<CsvColumnConfig | undefined>,
 ) {
@@ -46,13 +42,6 @@ export async function launchAnalysis(
   // Resolve inputs into the actual strings
   const qryFastaStr = await getQueryFasta(await qryFastaInputs)
 
-  const [dataset] = await Promise.all([datasetPromise])
-  if (!dataset) {
-    throw new ErrorInternal('Dataset is required but not found')
-  }
-
-  const params = await getParams(paramInputs, dataset)
-
   const csvColumnConfig = await csvColumnConfigPromise
 
   const launcherWorker = await spawn<LauncherThread>(
@@ -96,33 +85,3 @@ export async function getQueryFasta(inputs: AlgorithmInput[]) {
   const contents = await concurrent.map(async (input) => input.getContent(), inputs)
   return contents.join('\n')
 }
-
-/** Resolves all param inputs into strings */
-async function getParams(paramInputs: LaunchAnalysisInputs, dataset: Dataset): Promise<NextcladeParamsRaw> {
-  const entries = [
-    { key: 'geneMap', input: paramInputs.geneMap, datasetFileUrl: dataset.files.genomeAnnotation },
-    { key: 'refSeq', input: paramInputs.refSeq, datasetFileUrl: dataset.files.reference },
-    { key: 'tree', input: paramInputs.tree, datasetFileUrl: dataset.files.treeJson },
-    { key: 'virusProperties', input: paramInputs.virusProperties, datasetFileUrl: dataset.files.pathogenJson },
-  ]
-
-  return Object.fromEntries(
-    await concurrent.map(async ({ key, input, datasetFileUrl }) => {
-      return [key, await resolveInput(await input, datasetFileUrl)]
-    }, entries),
-  ) as unknown as NextcladeParamsRaw
-}
-
-async function resolveInput(input: AlgorithmInput | undefined, datasetFileUrl: string | undefined) {
-  // If data is provided explicitly, load it
-  if (input) {
-    return input.getContent()
-  }
-
-  // Otherwise fetch corresponding file from the dataset
-  if (datasetFileUrl) {
-    return axiosFetchRaw(datasetFileUrl)
-  }
-
-  return undefined
-}
diff --git a/packages/nextclade/src/run/nextclade_wasm.rs b/packages/nextclade/src/run/nextclade_wasm.rs
index b32835731..06c6a5eb1 100644
--- a/packages/nextclade/src/run/nextclade_wasm.rs
+++ b/packages/nextclade/src/run/nextclade_wasm.rs
@@ -20,7 +20,9 @@ use crate::tree::tree::{AuspiceGraph, AuspiceTree, CladeNodeAttrKeyDesc};
 use crate::tree::tree_builder::graph_attach_new_nodes_in_place;
 use crate::tree::tree_preprocess::graph_preprocess_in_place;
 use crate::types::outputs::NextcladeOutputs;
-use eyre::{Report, WrapErr};
+use crate::utils::any::AnyType;
+use crate::utils::option::OptionMapRefFallible;
+use eyre::{eyre, Report, WrapErr};
 use itertools::Itertools;
 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
@@ -37,33 +39,93 @@ pub struct NextcladeParams {
 }
 
 impl NextcladeParams {
-  pub fn from_raw(raw: NextcladeParamsRaw) -> Result<Self, Report> {
-    let virus_properties = VirusProperties::from_str(&raw.virus_properties).wrap_err("When parsing pathogen JSON")?;
-
-    let ref_record = read_one_fasta_str(&raw.ref_seq).wrap_err("When parsing reference sequence")?;
+  pub fn from_auspice(auspice_json: &AuspiceTree) -> Result<Self, Report> {
+    let virus_properties = auspice_json
+      .meta
+      .extensions
+      .nextclade
+      .pathogen
+      .as_ref()
+      .cloned()
+      .unwrap_or_default();
 
-    let tree = raw
-      .tree
-      .map(|tree| AuspiceTree::from_str(tree).wrap_err("When parsing reference tree Auspice JSON v2"))
-      .transpose()?;
+    let ref_record = {
+      let ref_name = virus_properties
+        .attributes
+        .get("reference name")
+        .cloned()
+        .unwrap_or_else(|| AnyType::String("reference".to_owned()))
+        .as_str()
+        .wrap_err("When reading Auspice JSON v2 `.meta.extensions.nextclade.pathogen.attributes[\"reference name\"]`")?
+        .to_owned();
+
+      let ref_seq = auspice_json.root_sequence.get("nuc")
+      .ok_or_else(|| eyre!("Auspice JSON v2 is used as input dataset, but does not contain required reference sequence field (.root_sequence.nuc)"))?.to_owned();
+
+      FastaRecord {
+        index: 0,
+        seq_name: ref_name,
+        seq: ref_seq,
+      }
+    };
 
-    let gene_map = raw.gene_map.map_or_else(
-      || Ok(GeneMap::new()), // If genome annotation is not provided, use an empty one
-      |gene_map| GeneMap::from_str(gene_map).wrap_err("When parsing genome annotation"),
-    )?;
+    let gene_map = auspice_json
+      .meta
+      .genome_annotations
+      .map_ref_fallible(GeneMap::from_auspice_annotations)?
+      .unwrap_or_default();
 
     Ok(Self {
       ref_record,
       gene_map,
-      tree,
+      tree: Some(auspice_json.to_owned()),
       virus_properties,
     })
   }
+
+  pub fn from_raw(raw: NextcladeParamsRaw) -> Result<Self, Report> {
+    match raw {
+      NextcladeParamsRaw::Auspice(raw) => {
+        let auspice_json = AuspiceTree::from_str(raw.tree)?;
+        Self::from_auspice(&auspice_json)
+      }
+      NextcladeParamsRaw::Dir(raw) => {
+        let virus_properties =
+          VirusProperties::from_str(&raw.virus_properties).wrap_err("When parsing pathogen JSON")?;
+
+        let ref_record = read_one_fasta_str(&raw.ref_seq).wrap_err("When parsing reference sequence")?;
+
+        let tree = raw
+          .tree
+          .map(|tree| AuspiceTree::from_str(tree).wrap_err("When parsing reference tree Auspice JSON v2"))
+          .transpose()?;
+
+        let gene_map = raw
+          .gene_map
+          .map(|gene_map| GeneMap::from_str(gene_map).wrap_err("When parsing genome annotation"))
+          .transpose()?
+          .unwrap_or_default();
+
+        Ok(Self {
+          ref_record,
+          gene_map,
+          tree,
+          virus_properties,
+        })
+      }
+    }
+  }
 }
 
 #[derive(Clone, Debug, Serialize, Deserialize, schemars::JsonSchema)]
 #[serde(rename_all = "camelCase")]
-pub struct NextcladeParamsRaw {
+pub struct NextcladeParamsRawAuspice {
+  pub tree: String,
+}
+
+#[derive(Clone, Debug, Serialize, Deserialize, schemars::JsonSchema)]
+#[serde(rename_all = "camelCase")]
+pub struct NextcladeParamsRawDir {
   #[schemars(with = "String")]
   pub ref_seq: String,
   pub gene_map: Option<String>,
@@ -71,6 +133,12 @@ pub struct NextcladeParamsRaw {
   pub virus_properties: String,
 }
 
+#[derive(Clone, Debug, Serialize, Deserialize, schemars::JsonSchema)]
+pub enum NextcladeParamsRaw {
+  Auspice(NextcladeParamsRawAuspice),
+  Dir(NextcladeParamsRawDir),
+}
+
 #[derive(Clone, Debug, Serialize, Deserialize, schemars::JsonSchema)]
 #[serde(rename_all = "camelCase")]
 pub struct AnalysisInput {

From b1b3f5f94d41c9f193d70ae56d298ea87d640346 Mon Sep 17 00:00:00 2001
From: ivan-aksamentov <ivan.aksamentov@gmail.com>
Date: Thu, 23 May 2024 15:59:14 +0200
Subject: [PATCH 10/18] fix(web): prevent crash when an auspice dataset was
 used in prev session

---
 packages/nextclade-web/src/io/fetchDatasets.ts      |  8 ++++++--
 .../src/io/fetchSingleDatasetAuspice.ts             |  6 ++++--
 .../src/io/fetchSingleDatasetDirectory.ts           | 13 +++----------
 packages/nextclade-web/src/pages/_app.tsx           | 12 +++++++-----
 4 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/packages/nextclade-web/src/io/fetchDatasets.ts b/packages/nextclade-web/src/io/fetchDatasets.ts
index 5aacc77f8..f798be141 100644
--- a/packages/nextclade-web/src/io/fetchDatasets.ts
+++ b/packages/nextclade-web/src/io/fetchDatasets.ts
@@ -9,7 +9,7 @@ import {
   parseGithubRepoUrl,
 } from 'src/io/fetchSingleDatasetFromGithub'
 
-import { Dataset } from 'src/types'
+import { type AuspiceTree, Dataset } from 'src/types'
 import {
   fetchDatasetsIndex,
   filterDatasets,
@@ -128,7 +128,11 @@ export async function initializeDatasets(datasetServerUrl: string, urlQuery: Par
   const minimizerIndexVersion = await getCompatibleMinimizerIndexVersion(datasetServerUrl, datasetsIndexJson)
 
   // Check if URL params specify dataset params and try to find the corresponding dataset
-  const currentDataset = await getDatasetFromUrlParams(urlQuery, datasets)
+  const currentDataset:
+    | (Dataset & {
+        auspiceJson?: AuspiceTree
+      })
+    | undefined = await getDatasetFromUrlParams(urlQuery, datasets)
 
   return { datasets, currentDataset, minimizerIndexVersion }
 }
diff --git a/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts b/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts
index d0d106c5d..373e469a0 100644
--- a/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts
+++ b/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts
@@ -14,7 +14,7 @@ export async function fetchSingleDatasetAuspice(datasetJsonUrl_: string) {
     throw new FatalError(`Auspice JSON does not contain required field '.root_sequence.nuc': ${datasetJsonUrl_}`)
   }
 
-  const currentDataset: Dataset = {
+  const currentDataset: Dataset & { auspiceJson?: AuspiceTree } = {
     path: datasetJsonUrl,
     capabilities: {
       primers: false,
@@ -24,6 +24,8 @@ export async function fetchSingleDatasetAuspice(datasetJsonUrl_: string) {
 
     // HACK: there is no files if dataset comes from Auspice JSON, neither they are needed. What to do?
     files: {} as unknown as DatasetFiles,
+
+    auspiceJson,
   }
 
   const datasets = [currentDataset]
@@ -32,5 +34,5 @@ export async function fetchSingleDatasetAuspice(datasetJsonUrl_: string) {
   const defaultDatasetName = currentDatasetName
   const defaultDatasetNameFriendly = attrStrMaybe(currentDataset.attributes, 'name') ?? currentDatasetName
 
-  return { datasets, defaultDataset, defaultDatasetName, defaultDatasetNameFriendly, currentDataset, auspiceJson }
+  return { datasets, defaultDataset, defaultDatasetName, defaultDatasetNameFriendly, currentDataset }
 }
diff --git a/packages/nextclade-web/src/io/fetchSingleDatasetDirectory.ts b/packages/nextclade-web/src/io/fetchSingleDatasetDirectory.ts
index 1b59c48c3..a735de622 100644
--- a/packages/nextclade-web/src/io/fetchSingleDatasetDirectory.ts
+++ b/packages/nextclade-web/src/io/fetchSingleDatasetDirectory.ts
@@ -2,7 +2,7 @@ import axios from 'axios'
 import urljoin from 'url-join'
 import { mapValues } from 'lodash'
 import { concurrent } from 'fasy'
-import { attrStrMaybe, Dataset, DatasetFiles, VirusProperties } from 'src/types'
+import { attrStrMaybe, AuspiceTree, Dataset, DatasetFiles, VirusProperties } from 'src/types'
 import { removeTrailingSlash } from 'src/io/url'
 import { axiosFetch, axiosHead, axiosHeadOrUndefined } from 'src/io/axiosFetch'
 import { sanitizeError } from 'src/helpers/sanitizeError'
@@ -15,7 +15,7 @@ export async function fetchSingleDatasetDirectory(
 
   const pathogen = await fetchPathogenJson(datasetRootUrl)
 
-  const currentDataset: Dataset = {
+  const currentDataset: Dataset & { auspiceJson?: AuspiceTree } = {
     path: datasetRootUrl,
     capabilities: {
       primers: false,
@@ -49,14 +49,7 @@ export async function fetchSingleDatasetDirectory(
     Object.entries(currentDataset.files).filter(([filename, _]) => !['sequences.fasta'].includes(filename)),
   )
 
-  return {
-    datasets,
-    defaultDataset,
-    defaultDatasetName,
-    defaultDatasetNameFriendly,
-    currentDataset,
-    auspiceJson: undefined,
-  }
+  return { datasets, defaultDataset, defaultDatasetName, defaultDatasetNameFriendly, currentDataset }
 }
 
 async function fetchPathogenJson(datasetRootUrl: string) {
diff --git a/packages/nextclade-web/src/pages/_app.tsx b/packages/nextclade-web/src/pages/_app.tsx
index e0d6e91f4..b0eb962c0 100644
--- a/packages/nextclade-web/src/pages/_app.tsx
+++ b/packages/nextclade-web/src/pages/_app.tsx
@@ -56,6 +56,8 @@ import {
 import { ErrorBoundary } from 'src/components/Error/ErrorBoundary'
 
 import 'src/styles/global.scss'
+import { Dataset } from '../types'
+import { AuspiceTree } from '../types'
 
 RecoilEnv.RECOIL_DUPLICATE_ATOM_KEY_CHECKING_ENABLED = false
 
@@ -102,8 +104,8 @@ export function RecoilStateInitializer() {
 
         const datasetInfo = await fetchSingleDataset(urlQuery)
         if (!isNil(datasetInfo)) {
-          const { datasets, currentDataset, auspiceJson } = datasetInfo
-          return { datasets, currentDataset, minimizerIndexVersion: undefined, auspiceJson }
+          const { datasets, currentDataset } = datasetInfo
+          return { datasets, currentDataset, minimizerIndexVersion: undefined }
         }
         return { datasets, currentDataset, minimizerIndexVersion }
       })
@@ -113,13 +115,13 @@ export function RecoilStateInitializer() {
         set(globalErrorAtom, sanitizeError(error))
         throw error
       })
-      .then(async ({ datasets, currentDataset, minimizerIndexVersion, auspiceJson }) => {
+      .then(async ({ datasets, currentDataset, minimizerIndexVersion }) => {
         set(datasetsAtom, { datasets })
         const previousDataset = await getPromise(datasetCurrentAtom)
-        const dataset = currentDataset ?? previousDataset
+        const dataset: (Dataset & { auspiceJson?: AuspiceTree }) | undefined = currentDataset ?? previousDataset
         set(datasetCurrentAtom, dataset)
         set(minimizerIndexVersionAtom, minimizerIndexVersion)
-        set(datasetJsonAtom, auspiceJson)
+        set(datasetJsonAtom, dataset?.auspiceJson)
         return dataset
       })
       .then(async (dataset) => {

From e5ee0688c22b9fe8203c99900a2cf42c9cc7e925 Mon Sep 17 00:00:00 2001
From: ivan-aksamentov <ivan.aksamentov@gmail.com>
Date: Thu, 23 May 2024 16:22:45 +0200
Subject: [PATCH 11/18] fix(web): prevent crash when auspice json has no
 `.root_sequence`

---
 packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts | 2 +-
 packages/nextclade/src/graph/graph.rs                      | 3 +--
 packages/nextclade/src/run/nextclade_wasm.rs               | 2 +-
 packages/nextclade/src/tree/tree.rs                        | 4 ++--
 4 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts b/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts
index 373e469a0..8b2becd86 100644
--- a/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts
+++ b/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts
@@ -10,7 +10,7 @@ export async function fetchSingleDatasetAuspice(datasetJsonUrl_: string) {
   const auspiceJson = await axiosFetch<AuspiceTree>(datasetJsonUrl)
   const pathogen = auspiceJson.meta?.extensions?.nextclade?.pathogen
 
-  if (isEmpty(auspiceJson.root_sequence.nuc)) {
+  if (isEmpty(auspiceJson.root_sequence?.nuc)) {
     throw new FatalError(`Auspice JSON does not contain required field '.root_sequence.nuc': ${datasetJsonUrl_}`)
   }
 
diff --git a/packages/nextclade/src/graph/graph.rs b/packages/nextclade/src/graph/graph.rs
index 4078ea5f1..5fe2be45f 100644
--- a/packages/nextclade/src/graph/graph.rs
+++ b/packages/nextclade/src/graph/graph.rs
@@ -13,7 +13,6 @@ use num_traits::Float;
 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
-use maplit::btreemap;
 
 #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)]
 #[allow(clippy::partial_pub_fields)]
@@ -557,7 +556,7 @@ pub fn convert_graph_to_auspice_tree(graph: &AuspiceGraph) -> Result<AuspiceTree
     version: graph.data.auspice_tree_version.clone(),
     meta: graph.data.meta.clone(),
     tree,
-    root_sequence: btreemap! {},
+    root_sequence: None,
     other: graph.data.other.clone(),
   })
 }
diff --git a/packages/nextclade/src/run/nextclade_wasm.rs b/packages/nextclade/src/run/nextclade_wasm.rs
index 06c6a5eb1..799aba328 100644
--- a/packages/nextclade/src/run/nextclade_wasm.rs
+++ b/packages/nextclade/src/run/nextclade_wasm.rs
@@ -59,7 +59,7 @@ impl NextcladeParams {
         .wrap_err("When reading Auspice JSON v2 `.meta.extensions.nextclade.pathogen.attributes[\"reference name\"]`")?
         .to_owned();
 
-      let ref_seq = auspice_json.root_sequence.get("nuc")
+      let ref_seq = auspice_json.root_sequence.as_ref().and_then(|root_sequence| root_sequence.get("nuc"))
       .ok_or_else(|| eyre!("Auspice JSON v2 is used as input dataset, but does not contain required reference sequence field (.root_sequence.nuc)"))?.to_owned();
 
       FastaRecord {
diff --git a/packages/nextclade/src/tree/tree.rs b/packages/nextclade/src/tree/tree.rs
index b3cfe0f31..88c62c57e 100644
--- a/packages/nextclade/src/tree/tree.rs
+++ b/packages/nextclade/src/tree/tree.rs
@@ -564,8 +564,8 @@ pub struct AuspiceTree {
 
   pub tree: AuspiceTreeNode,
 
-  #[serde(skip_serializing_if = "BTreeMap::is_empty")]
-  pub root_sequence: BTreeMap<String, String>,
+  #[serde(skip_serializing_if = "Option::is_none")]
+  pub root_sequence: Option<BTreeMap<String, String>>,
 
   #[serde(flatten)]
   pub other: serde_json::Value,

From 883a0d6a74bf95450c9ddf4f6dc12de996af36a1 Mon Sep 17 00:00:00 2001
From: ivan-aksamentov <ivan.aksamentov@gmail.com>
Date: Thu, 23 May 2024 16:32:23 +0200
Subject: [PATCH 12/18] refactor: lint

---
 packages/nextclade-cli/src/dataset/dataset_download.rs | 7 +++----
 packages/nextclade-web/src/pages/_app.tsx              | 3 +--
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/packages/nextclade-cli/src/dataset/dataset_download.rs b/packages/nextclade-cli/src/dataset/dataset_download.rs
index 64184a0c4..e8e3f01b7 100644
--- a/packages/nextclade-cli/src/dataset/dataset_download.rs
+++ b/packages/nextclade-cli/src/dataset/dataset_download.rs
@@ -8,12 +8,11 @@ use log::{warn, LevelFilter};
 use nextclade::analyze::virus_properties::VirusProperties;
 use nextclade::gene::gene_map::{filter_gene_map, GeneMap};
 use nextclade::io::dataset::{Dataset, DatasetsIndexJson};
-use nextclade::io::fasta::{read_one_fasta, read_one_fasta_str, FastaRecord};
+use nextclade::io::fasta::{read_one_fasta, read_one_fasta_str};
 use nextclade::io::file::create_file_or_stdout;
 use nextclade::io::fs::{ensure_dir, has_extension, read_file_to_string};
 use nextclade::run::nextclade_wasm::NextcladeParams;
 use nextclade::tree::tree::AuspiceTree;
-use nextclade::utils::any::AnyType;
 use nextclade::utils::fs::list_files_recursive;
 use nextclade::utils::option::OptionMapRefFallible;
 use nextclade::utils::string::{format_list, surround_with_quotes, Indent};
@@ -288,9 +287,9 @@ pub fn dataset_dir_load(
 }
 
 pub fn dataset_json_load(
-  run_args: &NextcladeRunArgs,
+  _run_args: &NextcladeRunArgs,
   dataset_json: impl AsRef<Path>,
-  cdses: &Option<Vec<String>>,
+  _cdses: &Option<Vec<String>>,
 ) -> Result<NextcladeParams, Report> {
   let dataset_json = dataset_json.as_ref();
 
diff --git a/packages/nextclade-web/src/pages/_app.tsx b/packages/nextclade-web/src/pages/_app.tsx
index b0eb962c0..92a2970c3 100644
--- a/packages/nextclade-web/src/pages/_app.tsx
+++ b/packages/nextclade-web/src/pages/_app.tsx
@@ -8,6 +8,7 @@ import { RecoilEnv, RecoilRoot, useRecoilCallback, useRecoilState, useRecoilValu
 import { AppProps } from 'next/app'
 import { useRouter } from 'next/router'
 import dynamic from 'next/dynamic'
+import type { Dataset, AuspiceTree } from 'src/types'
 import { sanitizeError } from 'src/helpers/sanitizeError'
 import { useRunAnalysis } from 'src/hooks/useRunAnalysis'
 import i18nAuspice, { changeAuspiceLocale } from 'src/i18n/i18n.auspice'
@@ -56,8 +57,6 @@ import {
 import { ErrorBoundary } from 'src/components/Error/ErrorBoundary'
 
 import 'src/styles/global.scss'
-import { Dataset } from '../types'
-import { AuspiceTree } from '../types'
 
 RecoilEnv.RECOIL_DUPLICATE_ATOM_KEY_CHECKING_ENABLED = false
 

From 9f3c1e0a7667c72ed9ba0e9701aedbc1e711defb Mon Sep 17 00:00:00 2001
From: ivan-aksamentov <ivan.aksamentov@gmail.com>
Date: Fri, 24 May 2024 08:14:12 +0200
Subject: [PATCH 13/18] fix(web): specifically accept json

Let's add an explicit `Accept` HTTP header when fetching Auspice JSON. This is required for nextstrain.org links to work: the server sends different content depending on the `Accept` header.
---
 packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts b/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts
index 8b2becd86..80d85a2d5 100644
--- a/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts
+++ b/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts
@@ -7,7 +7,9 @@ import { axiosFetch } from 'src/io/axiosFetch'
 export async function fetchSingleDatasetAuspice(datasetJsonUrl_: string) {
   const datasetJsonUrl = removeTrailingSlash(datasetJsonUrl_)
 
-  const auspiceJson = await axiosFetch<AuspiceTree>(datasetJsonUrl)
+  const auspiceJson = await axiosFetch<AuspiceTree>(datasetJsonUrl, {
+    headers: { Accept: 'application/json, text/plain, */*' },
+  })
   const pathogen = auspiceJson.meta?.extensions?.nextclade?.pathogen
 
   if (isEmpty(auspiceJson.root_sequence?.nuc)) {

From fc7b8bd63cf49947c18fcba2cacdb9796c201563 Mon Sep 17 00:00:00 2001
From: ivan-aksamentov <ivan.aksamentov@gmail.com>
Date: Fri, 24 May 2024 08:32:32 +0200
Subject: [PATCH 14/18] fix(web): hide "Load examples" button when examples are
 not in dataset

---
 .../nextclade-web/src/components/Main/ButtonLoadExample.tsx  | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/packages/nextclade-web/src/components/Main/ButtonLoadExample.tsx b/packages/nextclade-web/src/components/Main/ButtonLoadExample.tsx
index 4d2bbb827..c784eb77c 100644
--- a/packages/nextclade-web/src/components/Main/ButtonLoadExample.tsx
+++ b/packages/nextclade-web/src/components/Main/ButtonLoadExample.tsx
@@ -1,4 +1,5 @@
 import { Dataset } from '_SchemaRoot'
+import { isEmpty } from 'lodash'
 import React, { useCallback } from 'react'
 import { Button } from 'reactstrap'
 import { useRecoilValue } from 'recoil'
@@ -44,6 +45,10 @@ export function ButtonLoadExample({ ...rest }) {
     setExampleSequences(datasetCurrent)
   }, [datasetCurrent, setExampleSequences])
 
+  if (isEmpty(datasetCurrent?.files.examples)) {
+    return null
+  }
+
   return (
     <Button {...rest} color="link" onClick={onClick} disabled={hasInputErrors || !datasetCurrent}>
       {t('Load example')}

From ddd9925848796f4b33d174b8a42aefd40cd09660 Mon Sep 17 00:00:00 2001
From: ivan-aksamentov <ivan.aksamentov@gmail.com>
Date: Fri, 24 May 2024 09:12:03 +0200
Subject: [PATCH 15/18] fix: make dataset files optional

Let's make all fields in the `.files` object, as well as the `.files` field itself, optional. The list of files does not make sense when using Auspice JSON as a dataset input.

This affects `pathogen.json` as well as `.meta.extensions.nextclade.pathogen.files`.

I had to add manual null checks in a few places for the reference sequence and changelog files, which were previously considered required.

If the reference sequence is not available from any of the possible sources (dataset file, Auspice JSON, `--input-ref` CLI arg or `&input-ref` URL param), there is no way for Nextclade to proceed, so an error is emitted in this case.
---
 .../src/dataset/dataset_download.rs           | 40 +++++++++----------
 .../src/components/Main/ButtonLoadExample.tsx |  2 +-
 .../components/Main/DatasetContentSection.tsx |  8 ++--
 .../components/Main/ExampleSequencePicker.tsx |  2 +-
 .../nextclade-web/src/hooks/useRunAnalysis.ts |  8 ++--
 .../nextclade-web/src/io/AlgorithmInput.ts    |  9 ++++-
 .../src/io/fetchSingleDatasetAuspice.ts       |  6 +--
 .../src/io/fetchSingleDatasetDirectory.ts     | 10 +++--
 .../nextclade/src/analyze/virus_properties.rs |  1 +
 packages/nextclade/src/io/dataset.rs          | 14 ++++++-
 10 files changed, 58 insertions(+), 42 deletions(-)

diff --git a/packages/nextclade-cli/src/dataset/dataset_download.rs b/packages/nextclade-cli/src/dataset/dataset_download.rs
index e8e3f01b7..1b21394ec 100644
--- a/packages/nextclade-cli/src/dataset/dataset_download.rs
+++ b/packages/nextclade-cli/src/dataset/dataset_download.rs
@@ -122,14 +122,10 @@ pub fn dataset_zip_load(
     .wrap_err("When reading pathogen JSON from dataset")?
     .ok_or_else(|| eyre!("Pathogen JSON must always be present in the dataset but not found."))?;
 
-  let ref_record = read_from_path_or_zip(
-    &run_args.inputs.input_ref,
-    &mut zip,
-    &Some(&virus_properties.files.reference),
-  )?
-  .map_ref_fallible(read_one_fasta_str)
-  .wrap_err("When reading reference sequence from dataset")?
-  .ok_or_else(|| eyre!("Reference sequence must always be present in the dataset but not found."))?;
+  let ref_record = read_from_path_or_zip(&run_args.inputs.input_ref, &mut zip, &virus_properties.files.reference)?
+    .map_ref_fallible(read_one_fasta_str)
+    .wrap_err("When reading reference sequence from dataset")?
+    .ok_or_else(|| eyre!("Reference sequence must always be present in the dataset but not found."))?;
 
   let gene_map = read_from_path_or_zip(
     &run_args.inputs.input_annotation,
@@ -160,8 +156,8 @@ fn verify_dataset_files<'a, T: AsRef<str> + 'a + ?Sized>(
   files_present: impl Iterator<Item = &'a T> + 'a,
 ) {
   let declared: BTreeSet<&str> = [
-    Some(virus_properties.files.reference.as_str()),
-    Some(virus_properties.files.pathogen_json.as_str()),
+    virus_properties.files.reference.as_deref(),
+    virus_properties.files.pathogen_json.as_deref(),
     virus_properties.files.genome_annotation.as_deref(),
     virus_properties.files.tree_json.as_deref(),
     virus_properties.files.examples.as_deref(),
@@ -241,8 +237,17 @@ pub fn dataset_dir_load(
   let virus_properties = VirusProperties::from_path(input_pathogen_json)?;
 
   let input_ref = input_ref
-    .clone()
-    .unwrap_or_else(|| dataset_dir.join(&virus_properties.files.reference));
+    .as_ref()
+    .cloned()
+    .or_else(|| {
+      virus_properties
+        .files
+        .reference
+        .as_ref()
+        .map(|reference| dataset_dir.join(reference))
+    })
+    .expect("Reference sequence is required but it is neither declared in the dataset's pathogen.json `.files` section, nor provided as a separate file");
+
   let ref_record = read_one_fasta(input_ref).wrap_err("When reading reference sequence")?;
 
   let gene_map = input_annotation
@@ -389,14 +394,9 @@ pub fn dataset_str_download_and_load(
   .wrap_err("When reading pathogen JSON from dataset")?
   .ok_or_else(|| eyre!("Required file not found in dataset: 'pathogen.json'. Please report it to dataset authors."))?;
 
-  let ref_record = read_from_path_or_url(
-    &http,
-    &dataset,
-    &run_args.inputs.input_ref,
-    &Some(dataset.files.reference.clone()),
-  )?
-  .map_ref_fallible(read_one_fasta_str)?
-  .wrap_err("When reading reference sequence from dataset")?;
+  let ref_record = read_from_path_or_url(&http, &dataset, &run_args.inputs.input_ref, &dataset.files.reference)?
+    .map_ref_fallible(read_one_fasta_str)?
+    .wrap_err("When reading reference sequence from dataset")?;
 
   let gene_map = read_from_path_or_url(
     &http,
diff --git a/packages/nextclade-web/src/components/Main/ButtonLoadExample.tsx b/packages/nextclade-web/src/components/Main/ButtonLoadExample.tsx
index c784eb77c..745eab0cc 100644
--- a/packages/nextclade-web/src/components/Main/ButtonLoadExample.tsx
+++ b/packages/nextclade-web/src/components/Main/ButtonLoadExample.tsx
@@ -45,7 +45,7 @@ export function ButtonLoadExample({ ...rest }) {
     setExampleSequences(datasetCurrent)
   }, [datasetCurrent, setExampleSequences])
 
-  if (isEmpty(datasetCurrent?.files.examples)) {
+  if (isEmpty(datasetCurrent?.files?.examples)) {
     return null
   }
 
diff --git a/packages/nextclade-web/src/components/Main/DatasetContentSection.tsx b/packages/nextclade-web/src/components/Main/DatasetContentSection.tsx
index 9b1346080..7f6175906 100644
--- a/packages/nextclade-web/src/components/Main/DatasetContentSection.tsx
+++ b/packages/nextclade-web/src/components/Main/DatasetContentSection.tsx
@@ -21,12 +21,12 @@ export function DatasetContentSection() {
   return (
     <ContentSection>
       <Nav tabs>
-        {currentDataset?.files.readme && (
+        {currentDataset?.files?.readme && (
           <TabLabel tabId={0} activeTabId={activeTabId} setActiveTabId={setActiveTabId}>
             {'Summary'}
           </TabLabel>
         )}
-        {currentDataset?.files.changelog && (
+        {currentDataset?.files?.changelog && (
           <TabLabel tabId={1} activeTabId={activeTabId} setActiveTabId={setActiveTabId}>
             {'History'}
           </TabLabel>
@@ -40,10 +40,10 @@ export function DatasetContentSection() {
       </Nav>
       <TabContent activeTab={activeTabId}>
         <TabPane tabId={0}>
-          {currentDataset?.files.readme && <MarkdownRemote url={currentDataset?.files.readme} />}
+          {currentDataset?.files?.readme && <MarkdownRemote url={currentDataset?.files.readme} />}
         </TabPane>
         <TabPane tabId={1}>
-          {currentDataset?.files.changelog && <MarkdownRemote url={currentDataset?.files.changelog} />}
+          {currentDataset?.files?.changelog && <MarkdownRemote url={currentDataset?.files.changelog} />}
         </TabPane>
         <TabPane tabId={2}>{currentDataset && <DatasetContentTabAdvanced />}</TabPane>
       </TabContent>
diff --git a/packages/nextclade-web/src/components/Main/ExampleSequencePicker.tsx b/packages/nextclade-web/src/components/Main/ExampleSequencePicker.tsx
index f3a1f24c7..1c84f8d87 100644
--- a/packages/nextclade-web/src/components/Main/ExampleSequencePicker.tsx
+++ b/packages/nextclade-web/src/components/Main/ExampleSequencePicker.tsx
@@ -29,7 +29,7 @@ export function ExampleSequencePicker({ ...restProps }: LanguageSwitcherProps) {
   const { datasets: allDatasets } = useRecoilValue(datasetsAtom)
 
   const filtered = useMemo(() => {
-    const datasets = allDatasets.filter((dataset) => !isNil(dataset.files.examples))
+    const datasets = allDatasets.filter((dataset) => !isNil(dataset?.files?.examples))
     if (searchTerm.trim().length === 0) {
       return datasets
     }
diff --git a/packages/nextclade-web/src/hooks/useRunAnalysis.ts b/packages/nextclade-web/src/hooks/useRunAnalysis.ts
index fae78841b..e1c10a1fa 100644
--- a/packages/nextclade-web/src/hooks/useRunAnalysis.ts
+++ b/packages/nextclade-web/src/hooks/useRunAnalysis.ts
@@ -164,10 +164,10 @@ export function useRunAnalysis() {
 /** Resolves all param inputs into strings */
 async function getParams(paramInputs: LaunchAnalysisInputs, dataset: Dataset): Promise<NextcladeParamsRawDir> {
   const entries = [
-    { key: 'geneMap', input: paramInputs.geneMap, datasetFileUrl: dataset.files.genomeAnnotation },
-    { key: 'refSeq', input: paramInputs.refSeq, datasetFileUrl: dataset.files.reference },
-    { key: 'tree', input: paramInputs.tree, datasetFileUrl: dataset.files.treeJson },
-    { key: 'virusProperties', input: paramInputs.virusProperties, datasetFileUrl: dataset.files.pathogenJson },
+    { key: 'geneMap', input: paramInputs.geneMap, datasetFileUrl: dataset?.files?.genomeAnnotation },
+    { key: 'refSeq', input: paramInputs.refSeq, datasetFileUrl: dataset?.files?.reference },
+    { key: 'tree', input: paramInputs.tree, datasetFileUrl: dataset?.files?.treeJson },
+    { key: 'virusProperties', input: paramInputs.virusProperties, datasetFileUrl: dataset?.files?.pathogenJson },
   ]
 
   return Object.fromEntries(
diff --git a/packages/nextclade-web/src/io/AlgorithmInput.ts b/packages/nextclade-web/src/io/AlgorithmInput.ts
index bde99335f..e80681929 100644
--- a/packages/nextclade-web/src/io/AlgorithmInput.ts
+++ b/packages/nextclade-web/src/io/AlgorithmInput.ts
@@ -1,3 +1,6 @@
+import { isEmpty } from 'lodash'
+import { FatalError } from 'next/dist/lib/fatal-error'
+import serializeJavascript from 'serialize-javascript'
 import { uniqueId } from 'src/helpers/uniqueId'
 import { AlgorithmInput, AlgorithmInputType, Dataset } from 'src/types'
 import { axiosFetchRaw } from 'src/io/axiosFetch'
@@ -115,6 +118,10 @@ export class AlgorithmInputDefault implements AlgorithmInput {
   }
 
   public async getContent(): Promise<string> {
-    return axiosFetchRaw(this.dataset.files.examples)
+    if (isEmpty(this.dataset.files?.examples)) {
+      const url = serializeJavascript(this.dataset.files?.examples)
+      throw new FatalError(`Attempting to fetch dataset example sequences from an invalid URL: '${url}'`)
+    }
+    return axiosFetchRaw(this.dataset.files?.examples)
   }
 }
diff --git a/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts b/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts
index 80d85a2d5..c80232129 100644
--- a/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts
+++ b/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts
@@ -1,6 +1,6 @@
 import { isEmpty } from 'lodash'
 import { FatalError } from 'next/dist/lib/fatal-error'
-import { attrStrMaybe, AuspiceTree, Dataset, DatasetFiles } from 'src/types'
+import { attrStrMaybe, AuspiceTree, Dataset } from 'src/types'
 import { removeTrailingSlash } from 'src/io/url'
 import { axiosFetch } from 'src/io/axiosFetch'
 
@@ -23,10 +23,6 @@ export async function fetchSingleDatasetAuspice(datasetJsonUrl_: string) {
       qc: [],
     },
     ...pathogen,
-
-    // HACK: there is no files if dataset comes from Auspice JSON, neither they are needed. What to do?
-    files: {} as unknown as DatasetFiles,
-
     auspiceJson,
   }
 
diff --git a/packages/nextclade-web/src/io/fetchSingleDatasetDirectory.ts b/packages/nextclade-web/src/io/fetchSingleDatasetDirectory.ts
index a735de622..c51248f46 100644
--- a/packages/nextclade-web/src/io/fetchSingleDatasetDirectory.ts
+++ b/packages/nextclade-web/src/io/fetchSingleDatasetDirectory.ts
@@ -2,7 +2,7 @@ import axios from 'axios'
 import urljoin from 'url-join'
 import { mapValues } from 'lodash'
 import { concurrent } from 'fasy'
-import { attrStrMaybe, AuspiceTree, Dataset, DatasetFiles, VirusProperties } from 'src/types'
+import { attrStrMaybe, AuspiceTree, Dataset, VirusProperties } from 'src/types'
 import { removeTrailingSlash } from 'src/io/url'
 import { axiosFetch, axiosHead, axiosHeadOrUndefined } from 'src/io/axiosFetch'
 import { sanitizeError } from 'src/helpers/sanitizeError'
@@ -15,6 +15,8 @@ export async function fetchSingleDatasetDirectory(
 
   const pathogen = await fetchPathogenJson(datasetRootUrl)
 
+  const files = mapValues(pathogen.files, (file) => (file ? urljoin(datasetRootUrl, file) : file))
+
   const currentDataset: Dataset & { auspiceJson?: AuspiceTree } = {
     path: datasetRootUrl,
     capabilities: {
@@ -22,7 +24,7 @@ export async function fetchSingleDatasetDirectory(
       qc: [],
     },
     ...pathogen,
-    files: mapValues(pathogen.files, (file) => (file ? urljoin(datasetRootUrl, file) : file)) as DatasetFiles,
+    files,
   }
 
   const datasets = [currentDataset]
@@ -32,7 +34,7 @@ export async function fetchSingleDatasetDirectory(
   const defaultDatasetNameFriendly = attrStrMaybe(currentDataset.attributes, 'name') ?? currentDatasetName
 
   await concurrent.forEach(
-    async ([filename, fileUrl]) => {
+    async ([filename, fileUrl]: [string, string]) => {
       try {
         await axiosHead(fileUrl)
       } catch (error_: unknown) {
@@ -46,7 +48,7 @@ export async function fetchSingleDatasetDirectory(
         })
       }
     },
-    Object.entries(currentDataset.files).filter(([filename, _]) => !['sequences.fasta'].includes(filename)),
+    Object.entries(files).filter(([_, key]) => !['examples', 'readme'].includes(key)),
   )
 
   return { datasets, defaultDataset, defaultDatasetName, defaultDatasetNameFriendly, currentDataset }
diff --git a/packages/nextclade/src/analyze/virus_properties.rs b/packages/nextclade/src/analyze/virus_properties.rs
index 896705865..cab9dccc0 100644
--- a/packages/nextclade/src/analyze/virus_properties.rs
+++ b/packages/nextclade/src/analyze/virus_properties.rs
@@ -39,6 +39,7 @@ pub struct VirusProperties {
   #[serde(default, skip_serializing_if = "DatasetMeta::is_default")]
   pub meta: DatasetMeta,
 
+  #[serde(default, skip_serializing_if = "DatasetFiles::is_default")]
   pub files: DatasetFiles,
 
   pub default_cds: Option<String>,
diff --git a/packages/nextclade/src/io/dataset.rs b/packages/nextclade/src/io/dataset.rs
index 1838a53f9..ff5c15f23 100644
--- a/packages/nextclade/src/io/dataset.rs
+++ b/packages/nextclade/src/io/dataset.rs
@@ -66,6 +66,7 @@ pub struct Dataset {
   #[serde(default, skip_serializing_if = "DatasetMeta::is_default")]
   pub meta: DatasetMeta,
 
+  #[serde(default, skip_serializing_if = "DatasetFiles::is_default")]
   pub files: DatasetFiles,
 
   #[serde(default, skip_serializing_if = "DatasetCapabilities::is_default")]
@@ -331,9 +332,11 @@ impl DatasetMeta {
 #[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, JsonSchema)]
 #[serde(rename_all = "camelCase")]
 pub struct DatasetFiles {
-  pub reference: String,
+  #[serde(default, skip_serializing_if = "Option::is_none")]
+  pub reference: Option<String>,
 
-  pub pathogen_json: String,
+  #[serde(default, skip_serializing_if = "Option::is_none")]
+  pub pathogen_json: Option<String>,
 
   #[serde(default, skip_serializing_if = "Option::is_none")]
   pub genome_annotation: Option<String>,
@@ -357,6 +360,13 @@ pub struct DatasetFiles {
   pub other: serde_json::Value,
 }
 
+impl DatasetFiles {
+  #[inline]
+  pub fn is_default(&self) -> bool {
+    self == &Self::default()
+  }
+}
+
 #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)]
 #[serde(rename_all = "camelCase")]
 pub struct DatasetCollectionUrl {

From fe260c6be7495b940af36dbc0ba6b6b9fa3b23de Mon Sep 17 00:00:00 2001
From: ivan-aksamentov <ivan.aksamentov@gmail.com>
Date: Fri, 24 May 2024 14:56:33 +0200
Subject: [PATCH 16/18] feat: allow to override dataset components when Auspice
 dataset

This allows using individual input overrides for the reference sequence, genome annotation, pathogen info and even the tree, by providing individual files via the `--input-*` args in the CLI and the `?input-*` URL params in the Web app.

The mechanism works the same as for the normal directory- and zip-based datasets.
---
 .../src/dataset/dataset_download.rs           |  48 +++++--
 .../nextclade-web/src/hooks/useRunAnalysis.ts |  18 ++-
 packages/nextclade/src/run/nextclade_wasm.rs  | 130 +++++++++++++-----
 packages/nextclade/src/utils/option.rs        |   5 +
 4 files changed, 153 insertions(+), 48 deletions(-)

diff --git a/packages/nextclade-cli/src/dataset/dataset_download.rs b/packages/nextclade-cli/src/dataset/dataset_download.rs
index 1b21394ec..9fb3916be 100644
--- a/packages/nextclade-cli/src/dataset/dataset_download.rs
+++ b/packages/nextclade-cli/src/dataset/dataset_download.rs
@@ -11,7 +11,7 @@ use nextclade::io::dataset::{Dataset, DatasetsIndexJson};
 use nextclade::io::fasta::{read_one_fasta, read_one_fasta_str};
 use nextclade::io::file::create_file_or_stdout;
 use nextclade::io::fs::{ensure_dir, has_extension, read_file_to_string};
-use nextclade::run::nextclade_wasm::NextcladeParams;
+use nextclade::run::nextclade_wasm::{NextcladeParams, NextcladeParamsOptional};
 use nextclade::tree::tree::AuspiceTree;
 use nextclade::utils::fs::list_files_recursive;
 use nextclade::utils::option::OptionMapRefFallible;
@@ -292,22 +292,48 @@ pub fn dataset_dir_load(
 }
 
 pub fn dataset_json_load(
-  _run_args: &NextcladeRunArgs,
+  run_args: &NextcladeRunArgs,
   dataset_json: impl AsRef<Path>,
-  _cdses: &Option<Vec<String>>,
+  cdses: &Option<Vec<String>>,
 ) -> Result<NextcladeParams, Report> {
   let dataset_json = dataset_json.as_ref();
 
-  // let NextcladeRunInputArgs {
-  //   input_ref,
-  //   input_tree,
-  //   input_pathogen_json,
-  //   input_annotation,
-  //   ..
-  // } = &run_args.inputs;
+  let NextcladeRunInputArgs {
+    input_ref,
+    input_tree,
+    input_pathogen_json,
+    input_annotation,
+    ..
+  } = &run_args.inputs;
 
   let auspice_json = AuspiceTree::from_path(dataset_json).wrap_err("When reading Auspice JSON v2")?;
-  NextcladeParams::from_auspice(&auspice_json)
+
+  let overrides = {
+    let virus_properties = input_pathogen_json
+      .map_ref_fallible(VirusProperties::from_path)
+      .wrap_err("When parsing pathogen JSON")?;
+
+    let ref_record = input_ref
+      .map_ref_fallible(read_one_fasta)
+      .wrap_err("When parsing reference sequence")?;
+
+    let tree = input_tree
+      .map_ref_fallible(AuspiceTree::from_path)
+      .wrap_err("When parsing reference tree Auspice JSON v2")?;
+
+    let gene_map = input_annotation
+      .map_ref_fallible(GeneMap::from_path)
+      .wrap_err("When parsing genome annotation")?;
+
+    NextcladeParamsOptional {
+      ref_record,
+      gene_map,
+      tree,
+      virus_properties,
+    }
+  };
+
+  NextcladeParams::from_auspice(&auspice_json, &overrides, cdses)
 }
 
 pub fn dataset_individual_files_load(
diff --git a/packages/nextclade-web/src/hooks/useRunAnalysis.ts b/packages/nextclade-web/src/hooks/useRunAnalysis.ts
index e1c10a1fa..f6cf8a01c 100644
--- a/packages/nextclade-web/src/hooks/useRunAnalysis.ts
+++ b/packages/nextclade-web/src/hooks/useRunAnalysis.ts
@@ -1,10 +1,12 @@
 import type { AuspiceJsonV2, CladeNodeAttrDesc } from 'auspice'
 import { changeColorBy } from 'auspice/src/actions/colors'
 import { concurrent } from 'fasy'
+import { isNil } from 'lodash'
 import { useRouter } from 'next/router'
 import { useDispatch } from 'react-redux'
 import { useRecoilCallback } from 'recoil'
 import { ErrorInternal } from 'src/helpers/ErrorInternal'
+import { notUndefinedOrNull } from 'src/helpers/notUndefined'
 import { clearAllFiltersAtom } from 'src/state/resultFilters.state'
 import { viewedCdsAtom } from 'src/state/seqViewSettings.state'
 import { AlgorithmGlobalStatus, AlgorithmInput, Dataset, NextcladeParamsRaw, NextcladeParamsRawDir } from 'src/types'
@@ -140,7 +142,21 @@ export function useRunAnalysis() {
 
             let params: NextcladeParamsRaw
             if (tree) {
-              params = { Auspice: { tree: JSON.stringify(tree) } }
+              const overridesEntries = [
+                { key: 'geneMap', input: inputs.geneMap },
+                { key: 'refSeq', input: inputs.refSeq },
+                { key: 'tree', input: inputs.tree },
+                { key: 'virusProperties', input: inputs.virusProperties },
+              ]
+              const overrides = await concurrent.map(async ({ key, input }) => {
+                const awaitedInput = await input
+                if (isNil(awaitedInput)) {
+                  return undefined
+                }
+                return [key, await awaitedInput.getContent()]
+              }, overridesEntries)
+              const overridesPresent = overrides.filter(notUndefinedOrNull)
+              params = { Auspice: { auspiceJson: JSON.stringify(tree), ...Object.fromEntries(overridesPresent) } }
             } else {
               const dataset = await datasetCurrent
               if (!dataset) {
diff --git a/packages/nextclade/src/run/nextclade_wasm.rs b/packages/nextclade/src/run/nextclade_wasm.rs
index 799aba328..eb7c91cdd 100644
--- a/packages/nextclade/src/run/nextclade_wasm.rs
+++ b/packages/nextclade/src/run/nextclade_wasm.rs
@@ -7,7 +7,7 @@ use crate::analyze::find_aa_motifs_changes::AaMotifsMap;
 use crate::analyze::pcr_primers::PcrPrimer;
 use crate::analyze::phenotype::get_phenotype_attr_descs;
 use crate::analyze::virus_properties::{AaMotifsDesc, PhenotypeAttrDesc, VirusProperties};
-use crate::gene::gene_map::GeneMap;
+use crate::gene::gene_map::{filter_gene_map, GeneMap};
 use crate::graph::graph::{convert_auspice_tree_to_graph, convert_graph_to_auspice_tree};
 use crate::io::fasta::{read_one_fasta_str, FastaRecord};
 use crate::io::nextclade_csv::CsvColumnConfig;
@@ -21,14 +21,16 @@ use crate::tree::tree_builder::graph_attach_new_nodes_in_place;
 use crate::tree::tree_preprocess::graph_preprocess_in_place;
 use crate::types::outputs::NextcladeOutputs;
 use crate::utils::any::AnyType;
-use crate::utils::option::OptionMapRefFallible;
+use crate::utils::option::{find_some, OptionMapRefFallible};
 use eyre::{eyre, Report, WrapErr};
 use itertools::Itertools;
+use optfield::optfield;
 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
 use std::collections::BTreeMap;
 
 #[derive(Clone, Debug, Serialize, Deserialize, schemars::JsonSchema)]
+#[optfield(pub NextcladeParamsOptional, attrs, doc, field_attrs, field_doc, merge_fn = pub)]
 #[serde(rename_all = "camelCase")]
 pub struct NextcladeParams {
   #[schemars(with = "String")]
@@ -39,46 +41,68 @@ pub struct NextcladeParams {
 }
 
 impl NextcladeParams {
-  pub fn from_auspice(auspice_json: &AuspiceTree) -> Result<Self, Report> {
-    let virus_properties = auspice_json
-      .meta
-      .extensions
-      .nextclade
-      .pathogen
-      .as_ref()
-      .cloned()
-      .unwrap_or_default();
+  pub fn from_auspice(
+    auspice_json: &AuspiceTree,
+    overrides: &NextcladeParamsOptional,
+    cdses: &Option<Vec<String>>,
+  ) -> Result<Self, Report> {
+    let virus_properties = find_some(&[
+      &overrides.virus_properties,
+      &auspice_json.meta.extensions.nextclade.pathogen,
+    ])
+    .cloned()
+    .unwrap_or_default();
 
     let ref_record = {
-      let ref_name = virus_properties
-        .attributes
-        .get("reference name")
-        .cloned()
-        .unwrap_or_else(|| AnyType::String("reference".to_owned()))
-        .as_str()
-        .wrap_err("When reading Auspice JSON v2 `.meta.extensions.nextclade.pathogen.attributes[\"reference name\"]`")?
-        .to_owned();
-
-      let ref_seq = auspice_json.root_sequence.as_ref().and_then(|root_sequence| root_sequence.get("nuc"))
-      .ok_or_else(|| eyre!("Auspice JSON v2 is used as input dataset, but does not contain required reference sequence field (.root_sequence.nuc)"))?.to_owned();
-
-      FastaRecord {
-        index: 0,
-        seq_name: ref_name,
-        seq: ref_seq,
+      match &overrides.ref_record {
+        Some(ref_record) => ref_record.clone(),
+        None => {
+          let ref_name = virus_properties
+            .attributes
+            .get("reference name")
+            .cloned()
+            .unwrap_or_else(|| AnyType::String("reference".to_owned()))
+            .as_str()
+            .wrap_err(
+              "When reading Auspice JSON v2 `.meta.extensions.nextclade.pathogen.attributes[\"reference name\"]`",
+            )?
+            .to_owned();
+
+          let ref_seq = auspice_json.root_sequence.as_ref().and_then(|root_sequence| root_sequence.get("nuc"))
+          .ok_or_else(|| eyre!("Auspice JSON v2 is used as input dataset, but does not contain required reference sequence field (.root_sequence.nuc)"))?.to_owned();
+
+          FastaRecord {
+            index: 0,
+            seq_name: ref_name,
+            seq: ref_seq,
+          }
+        }
       }
     };
 
-    let gene_map = auspice_json
-      .meta
-      .genome_annotations
-      .map_ref_fallible(GeneMap::from_auspice_annotations)?
-      .unwrap_or_default();
+    // The annotation is taken from the override when there is one, otherwise from the Auspice JSON,
+    // otherwise the default gene map is used. The requested CDS selection (`cdses`) is applied
+    // wherever the annotation comes from, so that it is not silently ignored for overrides.
+    let gene_map = {
+      match &overrides.gene_map {
+        Some(gene_map) => filter_gene_map(gene_map.clone(), cdses),
+        None => auspice_json
+          .meta
+          .genome_annotations
+          .map_ref_fallible(GeneMap::from_auspice_annotations)?
+          .map(|gene_map| filter_gene_map(gene_map, cdses))
+          .unwrap_or_default(),
+      }
+    };
+
+    // The tree override, when present, replaces the tree of the Auspice JSON dataset itself.
+    // Either way there is always a tree, hence the unconditional `Some`.
+    let tree = Some(overrides.tree.as_ref().unwrap_or(auspice_json).to_owned());
 
     Ok(Self {
       ref_record,
       gene_map,
-      tree: Some(auspice_json.to_owned()),
+      tree,
       virus_properties,
     })
   }
@@ -86,8 +110,38 @@ impl NextcladeParams {
   pub fn from_raw(raw: NextcladeParamsRaw) -> Result<Self, Report> {
     match raw {
       NextcladeParamsRaw::Auspice(raw) => {
-        let auspice_json = AuspiceTree::from_str(raw.tree)?;
-        Self::from_auspice(&auspice_json)
+        let auspice_json = AuspiceTree::from_str(raw.auspice_json)?;
+
+        let overrides = {
+          let virus_properties = raw
+            .virus_properties
+            .map_ref_fallible(VirusProperties::from_str)
+            .wrap_err("When parsing pathogen JSON")?;
+
+          let ref_record = raw
+            .ref_seq
+            .map_ref_fallible(read_one_fasta_str)
+            .wrap_err("When parsing reference sequence")?;
+
+          let tree = raw
+            .tree
+            .map_ref_fallible(AuspiceTree::from_str)
+            .wrap_err("When parsing reference tree Auspice JSON v2")?;
+
+          let gene_map = raw
+            .gene_map
+            .map_ref_fallible(GeneMap::from_str)
+            .wrap_err("When parsing genome annotation")?;
+
+          NextcladeParamsOptional {
+            ref_record,
+            gene_map,
+            tree,
+            virus_properties,
+          }
+        };
+
+        Self::from_auspice(&auspice_json, &overrides, &None)
       }
       NextcladeParamsRaw::Dir(raw) => {
         let virus_properties =
@@ -120,7 +174,11 @@ impl NextcladeParams {
 #[derive(Clone, Debug, Serialize, Deserialize, schemars::JsonSchema)]
 #[serde(rename_all = "camelCase")]
 pub struct NextcladeParamsRawAuspice {
-  pub tree: String,
+  pub auspice_json: String, // Content of the Auspice JSON v2 dataset
+  pub ref_seq: Option<String>, // Optional override: reference sequence (FASTA content)
+  pub gene_map: Option<String>, // Optional override: genome annotation content
+  pub tree: Option<String>, // Optional override: reference tree (Auspice JSON v2 content)
+  pub virus_properties: Option<String>, // Optional override: pathogen JSON content
 }
 
 #[derive(Clone, Debug, Serialize, Deserialize, schemars::JsonSchema)]
diff --git a/packages/nextclade/src/utils/option.rs b/packages/nextclade/src/utils/option.rs
index 5ff800ad2..4868dd349 100644
--- a/packages/nextclade/src/utils/option.rs
+++ b/packages/nextclade/src/utils/option.rs
@@ -59,3 +59,8 @@ impl<'o, T: 'o> OptionMapMutFallible<'o, T> for Option<T> {
     (*self).as_mut().map(f).transpose()
   }
 }
+
+/// Returns a reference to the value inside the first `Some` among `options`, or `None` if all are `None`
+pub fn find_some<'a, T>(options: &'a [&'a Option<T>]) -> Option<&'a T> {
+  options.iter().copied().find_map(Option::as_ref)
+}

From 82e69a1c835a0a97a4ef0331627f6864b999b3bd Mon Sep 17 00:00:00 2001
From: ivan-aksamentov <ivan.aksamentov@gmail.com>
Date: Fri, 24 May 2024 16:07:54 +0200
Subject: [PATCH 17/18] fix(web): don't error when ref missing from auspice
 json but is provided otherwise

---
 packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts | 6 ------
 packages/nextclade/src/run/nextclade_wasm.rs               | 2 +-
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts b/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts
index c80232129..fe02ff7f6 100644
--- a/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts
+++ b/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts
@@ -1,5 +1,3 @@
-import { isEmpty } from 'lodash'
-import { FatalError } from 'next/dist/lib/fatal-error'
 import { attrStrMaybe, AuspiceTree, Dataset } from 'src/types'
 import { removeTrailingSlash } from 'src/io/url'
 import { axiosFetch } from 'src/io/axiosFetch'
@@ -12,10 +10,6 @@ export async function fetchSingleDatasetAuspice(datasetJsonUrl_: string) {
   })
   const pathogen = auspiceJson.meta?.extensions?.nextclade?.pathogen
 
-  if (isEmpty(auspiceJson.root_sequence?.nuc)) {
-    throw new FatalError(`Auspice JSON does not contain required field '.root_sequence.nuc': ${datasetJsonUrl_}`)
-  }
-
   const currentDataset: Dataset & { auspiceJson?: AuspiceTree } = {
     path: datasetJsonUrl,
     capabilities: {
diff --git a/packages/nextclade/src/run/nextclade_wasm.rs b/packages/nextclade/src/run/nextclade_wasm.rs
index eb7c91cdd..cf0100562 100644
--- a/packages/nextclade/src/run/nextclade_wasm.rs
+++ b/packages/nextclade/src/run/nextclade_wasm.rs
@@ -69,7 +69,7 @@ impl NextcladeParams {
             .to_owned();
 
           let ref_seq = auspice_json.root_sequence.as_ref().and_then(|root_sequence| root_sequence.get("nuc"))
-          .ok_or_else(|| eyre!("Auspice JSON v2 is used as input dataset, but does not contain required reference sequence field (.root_sequence.nuc)"))?.to_owned();
+          .ok_or_else(|| eyre!("Auspice JSON v2 is used as input dataset, but does not contain required reference sequence field (.root_sequence.nuc) and a reference sequence is not provided any other way."))?.to_owned();
 
           FastaRecord {
             index: 0,

From 44fb8a5dcaef2d7d77220d5d54fa067d4f3de032 Mon Sep 17 00:00:00 2001
From: ivan-aksamentov <ivan.aksamentov@gmail.com>
Date: Fri, 24 May 2024 16:34:43 +0200
Subject: [PATCH 18/18] feat(web): take title, description and update date from
 Auspice JSON

When using Auspice JSON as an input dataset, if pathogen info is not present, let's also attempt to read `.meta.title` or `.meta.description` and use as a dataset name. And let's try to read `.meta.updated` as the updated date time of the dataset.

This allows for a prettier and more informative dataset info section when using Auspice JSON as an input dataset.
---
 .../src/components/Main/DatasetInfo.tsx       |  2 +-
 .../nextclade-web/src/helpers/formatDate.ts   |  8 ++++++-
 .../src/io/fetchSingleDatasetAuspice.ts       | 22 ++++++++++++++++++-
 packages/nextclade/src/tree/tree.rs           |  9 ++++++++
 4 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/packages/nextclade-web/src/components/Main/DatasetInfo.tsx b/packages/nextclade-web/src/components/Main/DatasetInfo.tsx
index 386cb11e5..15640b922 100644
--- a/packages/nextclade-web/src/components/Main/DatasetInfo.tsx
+++ b/packages/nextclade-web/src/components/Main/DatasetInfo.tsx
@@ -79,7 +79,7 @@ export function DatasetInfo({ dataset, showSuggestions, ...restProps }: DatasetI
     if (version?.tag === 'unreleased') {
       updatedAt = `${updatedAt} (${t('unreleased')})`
     }
-    return updatedAt
+    return updatedAt ?? t('unknown')
   }, [t, version?.tag, version?.updatedAt])
 
   const datasetName = attrStrMaybe(attributes, 'name') ?? path
diff --git a/packages/nextclade-web/src/helpers/formatDate.ts b/packages/nextclade-web/src/helpers/formatDate.ts
index b645cc828..efada713a 100644
--- a/packages/nextclade-web/src/helpers/formatDate.ts
+++ b/packages/nextclade-web/src/helpers/formatDate.ts
@@ -1,15 +1,21 @@
+import { isEmpty } from 'lodash'
 import { DateTime } from 'luxon'
+import { notUndefinedOrNull } from 'src/helpers/notUndefined'
 
 export function formatDateIsoUtcSimple(dateTimeStr: string) {
   const utc = DateTime.fromISO(dateTimeStr, { zone: 'UTC' })
 
   const date = utc.toISODate()
 
+  if (isEmpty(date)) {
+    return undefined
+  }
+
   const time = utc.toISOTime({
     suppressMilliseconds: true,
     suppressSeconds: true,
     includeOffset: false,
   })
 
-  return [date, time, `(${utc.zoneName})`].join(' ')
+  return [date, time, `(${utc.zoneName})`].filter(notUndefinedOrNull).filter((part) => !isEmpty(part)).join(' ')
 }
diff --git a/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts b/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts
index fe02ff7f6..70f6e7b9e 100644
--- a/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts
+++ b/packages/nextclade-web/src/io/fetchSingleDatasetAuspice.ts
@@ -8,7 +8,22 @@ export async function fetchSingleDatasetAuspice(datasetJsonUrl_: string) {
   const auspiceJson = await axiosFetch<AuspiceTree>(datasetJsonUrl, {
     headers: { Accept: 'application/json, text/plain, */*' },
   })
-  const pathogen = auspiceJson.meta?.extensions?.nextclade?.pathogen
+  const pathogen = auspiceJson.meta.extensions?.nextclade?.pathogen
+
+  const name =
+    auspiceJson.meta.title ??
+    auspiceJson.meta.description ??
+    attrStrMaybe(pathogen?.attributes, 'name') ??
+    datasetJsonUrl
+
+  let version = pathogen?.version
+  if (!version) {
+    const updatedAt = pathogen?.version?.updatedAt ?? auspiceJson.meta.updated
+    version = {
+      tag: updatedAt ?? '',
+      updatedAt,
+    }
+  }
 
   const currentDataset: Dataset & { auspiceJson?: AuspiceTree } = {
     path: datasetJsonUrl,
@@ -17,6 +32,11 @@ export async function fetchSingleDatasetAuspice(datasetJsonUrl_: string) {
       qc: [],
     },
     ...pathogen,
+    attributes: {
+      name,
+      ...pathogen?.attributes,
+    },
+    version,
     auspiceJson,
   }
 
diff --git a/packages/nextclade/src/tree/tree.rs b/packages/nextclade/src/tree/tree.rs
index fca2f5ee1..2c7b25663 100644
--- a/packages/nextclade/src/tree/tree.rs
+++ b/packages/nextclade/src/tree/tree.rs
@@ -486,6 +486,15 @@ impl AuspiceGenomeAnnotations {
 
 #[derive(Clone, Serialize, Deserialize, schemars::JsonSchema, Validate, Debug)]
 pub struct AuspiceTreeMeta {
+  #[serde(default, skip_serializing_if = "Option::is_none")]
+  pub title: Option<String>,
+
+  #[serde(default, skip_serializing_if = "Option::is_none")]
+  pub description: Option<String>,
+
+  #[serde(default, skip_serializing_if = "Option::is_none")]
+  pub updated: Option<String>,
+
   #[serde(skip_serializing_if = "Option::is_none")]
   pub genome_annotations: Option<AuspiceGenomeAnnotations>,