Merge pull request #1017 from ComparativeGenomicsToolkit/oneshot

prep release 2.5.2
ComparativeGenomicsToolkit · May 15, 2023 · a33d3ea · a33d3ea
2 parents 59c6e81 + 3251637
commit a33d3ea
Show file tree

Hide file tree

Showing 7 changed files with 82 additions and 7 deletions.
diff --git a/ReleaseNotes.md b/ReleaseNotes.md
@@ -1,3 +1,18 @@
+# Relase 2.5.2 2023-05-15
+
+This Release patches some bugs in the pangenome pipeline and makes it a bit more user-friendly
+
+- Fix support for multiple referenes via `--reference` and `--vcfReference`
+- Fix bug where certain combinations of options (ie returning filtered but not clipped index) could lead to crash
+- Fix crash when handling non-ascii characters in vg crash reports
+- Fix the `--chrom-vg` option in `cactus-pangenome`
+- New option `--mgCores` to specify number of cores for minigraph construction (rather than lumping in with `--mapCores` which is also used for mapping)
+- Better defaults for number of cores used in pangenome pipeline on singlemachine.
+- Fix bug where small contigs in the reference sample could lead to crashes if they couldn't map to themselves (and `--refContigs` was not used to specify chromosomes). `--refContigs` is now automatically set if not specifed. 
+- Update to vg 1.48.0
+- Update pangenome paper citation from preprint to published version.
+
+
 # Release 2.5.1 2023-04-19
 
 This Release mostly patches some bugs in the pangenome pipeline

diff --git a/doc/pangenome.md b/doc/pangenome.md
@@ -134,7 +134,7 @@ Reducing `--consCores` will allow more chromosomes to be aligned at once, requir
 
 The application and impact of this option is demonstrated in the explanation of the Yeast pangenome example below.
 
-**Important** The reference genome assembly must be chromosome scale. If your reference assembly also consists of many small fragments (ex GRCh38) then you must use the `--refContigs` option to specify the chromosomes.  Ex for GRCh38 `--refContigs $(for i in `seq 22`; do printf "chr$i "; done ; echo "chrX chrY chrM")`.  If you want to include the remaining reference contig fragments in your graph, add the `--otherContig chrOther` option.
+**Important** The reference genome assembly must be chromosome scale. If your reference assembly also consists of many small fragments (ex GRCh38) then you can use the `--refContigs` option to specify the chromosomes.  Ex for GRCh38 `--refContigs $(for i in `seq 22`; do printf "chr$i "; done ; echo "chrX chrY chrM")`.  If you want to include the remaining reference contig fragments in your graph, add the `--otherContig chrOther` option.  If you do not specify `--refContigs`, they will be determined automatically and small contigs will be included. 
 
 **Also Important** We do not yet automatically support the *alternate* loci from GRCh38, ex the various HLA contigs.  They must be excluded from the input fasta file to get sane results. They can be included in the graph by providing a separate sample / fasta pair in the input for each contig.  Please [here](#grch38-alts-graph) for an example of how to do so.
 

diff --git a/doc/progressive.md b/doc/progressive.md
@@ -164,12 +164,12 @@ Conservation scores can be computed using [phast](http://compgen.cshl.edu/phast/
 The Cactus Docker image contains everything you need to run Cactus (python environment, all binaries, system dependencies). For example, to run the test data:
 
 ```
-docker run -v $(pwd):/data --rm -it quay.io/comparative-genomics-toolkit/cactus:v2.5.1 cactus /data/jobStore /data/evolverMammals.txt /data/evolverMammals.hal
+docker run -v $(pwd):/data --rm -it quay.io/comparative-genomics-toolkit/cactus:v2.5.2 cactus /data/jobStore /data/evolverMammals.txt /data/evolverMammals.hal
 ```
 
 Or you can proceed interactively by running
 ```
-docker run -v $(pwd):/data --rm -it quay.io/comparative-genomics-toolkit/cactus:v2.5.1 bash
+docker run -v $(pwd):/data --rm -it quay.io/comparative-genomics-toolkit/cactus:v2.5.2 bash
 cactus /data/jobStore /data/evolverMammals.txt /data/evolverMammals.hal
 
 ```

diff --git a/setup.py b/setup.py
@@ -24,7 +24,7 @@ def run(self):
 
 setup(
     name = "Cactus",
-    version = "2.5.1",
+    version = "2.5.2",
     author = "Benedict Paten",
     package_dir = {'': 'src'},
     packages = find_packages(where='src'),

diff --git a/src/cactus/cactus_progressive_config.xml b/src/cactus/cactus_progressive_config.xml
@@ -326,12 +326,16 @@
 	<!-- minQueryUniqueness: The ratio of the number of query bases aligned to the chosen ref contig vs the next best ref contig must exceed this threshold to not be considered ambigious. -->
 	<!-- ambiguousName: Contigs deemed ambiguous using the above filters get added to a "contig" with this name, and are preserved in output. -->
 	<!-- remapSplitOptions: extra rgfa-split options to apply when splitting after remapping (use -u 3000000 to enable autoclipping between contigs, add -s to allow within contig)-->
+	<!-- maxRefContigs: The maximum number of auto-detect refContigs -->
+	<!-- refContigDropoff: When detecting refContigs, sort in reverse order by size, and pick refContigs up until contig N is this many times smaller than contig 1 (or maxRefContigs) is reached.  The general idea is to have a refContig for each chromosome then lump all the tiny ones into their own graphs. -->
   	<graphmap_split
 		 minQueryCoverages="0.75 0.5 0.25"
-       minQueryCoverageThresholds="100000 1000000"
+		 minQueryCoverageThresholds="100000 1000000"
 		 minQueryUniqueness="2"
 		 ambiguousName="_AMBIGUOUS_"
 		 remapSplitOptions=""
+		 maxRefContigs="128"
+		 refContigDropoff="10.0"
 		 />
 	<!-- cactus-graphmap-join options -->
 	<!-- gfaffix: toggle gfaffix normalization on/off -->

diff --git a/src/cactus/refmap/cactus_graphmap_split.py b/src/cactus/refmap/cactus_graphmap_split.py
@@ -111,6 +111,14 @@ def cactus_graphmap_split(options):
                     for line in rc_file:
                         if len(line.strip()):
                             ref_contigs.add(line.strip().split()[0])
+            # our list has moved from options to ref_contigs, so don't get confused later:
+            options.refContigs = None
+
+            if ref_contigs:
+                max_ref_contigs = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "maxRefContigs", typeFn=int, default=128)
+                if len(ref_contigs) > max_ref_contigs:
+                    logger.warning('You specified {} refContigs, which is greater than the suggested limit of {}. This may cause issues downstream.'.format(len(ref_contigs), max_ref_contigs))
+
 
             if options.otherContig:
                 assert options.otherContig not in ref_contigs
@@ -188,6 +196,14 @@ def graphmap_split_workflow(job, options, config, seq_id_map, seq_name_map, gfa_
     else:
         sanitize_job = Job()
         root_job.addChild(sanitize_job)
+
+    # auto-set --refContigs
+    if not options.refContigs:
+        refcontig_job = sanitize_job.addFollowOnJobFn(detect_ref_contigs, config, options, seq_id_map)
+        ref_contigs = refcontig_job.rv()        
+        options.otherContig = 'chrOther'
+        other_contig = 'chrOther'
+        sanitize_job = refcontig_job
 
     # use file extension to sniff out compressed input
     if gfa_path.endswith(".gz"):
@@ -224,6 +240,46 @@ def graphmap_split_workflow(job, options, config, seq_id_map, seq_name_map, gfa_
     # return all the files, as well as the 2 split logs
     return (seq_name_map, bin_other_job.rv(), split_gfa_job.rv(1), gather_fas_job.rv(1))
 
+def detect_ref_contigs(job, config, options, seq_id_map):
+    """ automatically determine --refContigs from the data """
+    work_dir = job.fileStore.getLocalTempDir()
+    # it will be unzipped since we're after sanitize
+    fa_path = os.path.join(work_dir, options.reference + '.fa')
+    job.fileStore.readGlobalFile(seq_id_map[options.reference], fa_path)
+    cactus_call(parameters=['samtools', 'faidx', fa_path])
+    contigs = []
+    with open(fa_path + '.fai', 'r') as fai_file:
+        for line in fai_file:
+            toks = line.split('\t')
+            assert len(toks) > 2
+            contigs.append((toks[0], float(toks[1])))
+
+    # get the cutoff heuristics 
+    max_ref_contigs = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "maxRefContigs", typeFn=int, default=128)
+    ref_contig_dropoff = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "refContigDropoff", typeFn=float, default=10.0)
+
+    # apply the heuristics to the contigs
+    ref_contigs = []
+    for contig_len in sorted(contigs, key=lambda x : x[1], reverse=True):
+        if len(ref_contigs):
+            dropoff = ref_contigs[0][1] / contig_len[1]
+            if dropoff >= ref_contig_dropoff:
+                break
+        ref_contigs.append(contig_len)        
+        if len(ref_contigs) >= max_ref_contigs:
+            break
+
+    # clean up the names (since they will have been prefixed here)
+    ref_contigs_clean = []
+    for contig in ref_contigs:
+        assert contig[0].startswith('id={}|'.format(options.reference))
+        ref_contigs_clean.append(contig[0][len('id={}|'.format(options.reference)):])
+
+    RealtimeLogger.info("auto-detected --refContigs {} --otherContig chrOther".format(' '.join(ref_contigs_clean)))
+
+    return ref_contigs_clean            
+
+
 def get_mask_bed(job, seq_id_map, min_length):
     """ make a bed file from the fastas """
     beds = []
@@ -560,7 +616,7 @@ def export_split_data(toil, input_name_map, output_id_map, split_log_id, contig_
     empty_contigs = set()
 
     for ref_contig in output_id_map.keys():        
-        if output_id_map[ref_contig] is None:
+        if output_id_map[ref_contig] is None or len(output_id_map[ref_contig]) == 0:
             # todo: check ambigous?
             continue
         ref_contig_path = os.path.join(output_dir, ref_contig)

diff --git a/src/cactus/shared/common.py b/src/cactus/shared/common.py
@@ -305,7 +305,7 @@ def getDockerImage():
 
 def getDockerRelease(gpu=False):
     """Get the most recent docker release."""
-    r = "quay.io/comparative-genomics-toolkit/cactus:v2.5.1"
+    r = "quay.io/comparative-genomics-toolkit/cactus:v2.5.2"
     if gpu:
         r += "-gpu"
     return r