Merge branch 'dev'

lczech · Jun 6, 2024 · ed91b00 · ed91b00
2 parents 1088cd1 + 44c6d2d
commit ed91b00
Show file tree

Hide file tree

Showing 5 changed files with 32 additions and 19 deletions.
diff --git a/lib/genesis/population/filter/variant_filter.cpp b/lib/genesis/population/filter/variant_filter.cpp
@@ -285,7 +285,7 @@ std::ostream& print_variant_filter_category_stats(
         os << "Empty:     " << stats[VariantFilterTagCategory::kSamplesFailed] << "\n";
     }
     if( stats[VariantFilterTagCategory::kNumeric] > 0 || verbose ) {
-        os << "Numric:    " << stats[VariantFilterTagCategory::kNumeric] << "\n";
+        os << "Numeric:   " << stats[VariantFilterTagCategory::kNumeric] << "\n";
     }
     if( stats[VariantFilterTagCategory::kInvariant] > 0 || verbose ) {
         os << "Invariant: " << stats[VariantFilterTagCategory::kInvariant] << "\n";

diff --git a/lib/genesis/population/function/diversity_pool_calculator.hpp b/lib/genesis/population/function/diversity_pool_calculator.hpp
@@ -261,14 +261,14 @@ class DiversityPoolCalculator
             if( std::isfinite( tp )) {
                 theta_pi_sum_ += tp;
             }
-            assert( std::isfinite( tp ) );
+            // assert( std::isfinite( tp ) );
         }
         if( enable_theta_watterson_ || enable_tajima_d_ ) {
             tw = theta_watterson_pool( settings_, pool_size_, sample );
             if( std::isfinite( tw )) {
                 theta_watterson_sum_ += tw;
             }
-            assert( std::isfinite( tw ) );
+            // assert( std::isfinite( tw ) );
         }
 
         // We want to keep track of the minimum read depth of the data that we are processing.

diff --git a/lib/genesis/population/function/diversity_pool_functions.cpp b/lib/genesis/population/function/diversity_pool_functions.cpp
@@ -184,13 +184,15 @@ double theta_pi_pool_denominator(
         size_t min_count, size_t poolsize, size_t nucleotide_count
     ){
         // Boundary: if not held, we'd return zero, and that would not be a useful denominator.
-        if( 2 * min_count > nucleotide_count ) {
-            throw std::invalid_argument(
-                "Cannot compute theta_pi_pool_denominator with min_count = " +
-                std::to_string( min_count ) + " and nucleotide_count = " +
-                std::to_string( nucleotide_count )
-            );
-        }
+        // Update: It's okay, we can return 0 here. The position will just not contribute
+        // to the overall diversity sum then, but still considered for the sum of valid positions.
+        // if( 2 * min_count > nucleotide_count ) {
+        //     throw std::invalid_argument(
+        //         "Cannot compute theta_pi_pool_denominator with min_count = " +
+        //         std::to_string( min_count ) + " and nucleotide_count = " +
+        //         std::to_string( nucleotide_count )
+        //     );
+        // }
 
         // Iterate all allele frequencies in between the min and max-min boundaries.
         double denom = 0.0;
@@ -240,13 +242,15 @@ double theta_watterson_pool_denominator(
         size_t min_count, size_t poolsize, size_t nucleotide_count
     ){
         // Boundary: if not held, we'd return zero, and that would not be a useful denominator.
-        if( 2 * min_count > nucleotide_count ) {
-            throw std::invalid_argument(
-                "Cannot compute theta_watterson_pool_denominator with min_count = " +
-                std::to_string( min_count ) + " and nucleotide_count = " +
-                std::to_string( nucleotide_count )
-            );
-        }
+        // Update: It's okay, we can return 0 here. The position will just not contribute
+        // to the overall diversity sum then, but still considered for the sum of valid positions.
+        // if( 2 * min_count > nucleotide_count ) {
+        //     throw std::invalid_argument(
+        //         "Cannot compute theta_watterson_pool_denominator with min_count = " +
+        //         std::to_string( min_count ) + " and nucleotide_count = " +
+        //         std::to_string( nucleotide_count )
+        //     );
+        // }
 
         // Iterate all allele frequencies in between the min and max-min boundaries.
         double denom = 0.0;

diff --git a/lib/genesis/population/function/fst_pool_processor.hpp b/lib/genesis/population/function/fst_pool_processor.hpp
@@ -190,6 +190,7 @@ class FstPoolProcessor
         // Also reset the pi vectors to nan.
         // If they are not allocated, nothing happens.
         auto const res_sz = results_.size();
+        (void) res_sz;
         assert( std::get<0>( results_pi_ ).size() == 0 || std::get<0>( results_pi_ ).size() == res_sz );
         assert( std::get<1>( results_pi_ ).size() == 0 || std::get<1>( results_pi_ ).size() == res_sz );
         assert( std::get<2>( results_pi_ ).size() == 0 || std::get<2>( results_pi_ ).size() == res_sz );

diff --git a/lib/genesis/population/stream/variant_input_stream_sources.cpp b/lib/genesis/population/stream/variant_input_stream_sources.cpp
@@ -219,8 +219,16 @@ VariantInputStream make_variant_input_stream_from_sam_file(
         // Take this into account, and create as many empty (unnamed) samples as needed.
         // This cannot be more than one though, as it can be the unaccounted or none,
         // or, if we do not split by RG at all, just the one sample were every read ends up in.
-        data.sample_names = make_sample_name_list_( data.source_name, cur.sample_size() );
-        assert( data.sample_names.size() <= 1 );
+        // data.sample_names = make_sample_name_list_( data.source_name, cur.sample_size() );
+        // assert( data.sample_names.size() <= 1 );
+
+        // Scratch that. If we treat the file as a single sample anyway, we just use the file name
+        // as the sample name. Way more intuitive. Unfortunately, there is then the inconsistency
+        // in naming, but it's more in line with what e.g. the sync does if a header is provided.
+        assert( cur.sample_size() <= 1 );
+        if( cur.sample_size() == 1 ) {
+            data.sample_names = std::vector<std::string>{ data.source_name };
+        }
     } else {
         assert( reader.split_by_rg() == true );
     }