adding kegalign

richard-burhans · Aug 29, 2024 · f032b04 · f032b04
1 parent 167e4bf
commit f032b04
Showing 16 changed files with 1,638 additions and 0 deletions.
diff --git a/tools/kegalign/.shed.yml b/tools/kegalign/.shed.yml
@@ -0,0 +1,12 @@
+categories:
+- Next Gen Mappers
+description: A Scalable GPU System for Pairwise Whole Genome Alignments based on LASTZ's seed-filter-extend paradigm.
+homepage_url: https://github.com/galaxyproject/KegAlign
+long_description: |
+  KegAlign is a scalable, GPU-accelerated system for computing pairwise
+  WGA. KegAlign is based on the standard seed-filter-extend heuristic,
+  in which the filtering stage dominates the runtime.
+name: kegalign
+owner: richard-burhans
+remote_repository_url: https://github.com/richard-burhans/galaxytools/tree/main/tools/kegalign
+type: unrestricted
diff --git a/tools/kegalign/diagonal_partition.py b/tools/kegalign/diagonal_partition.py
@@ -0,0 +1,278 @@
+#!/usr/bin/env python
+
+
+"""
+Diagonal partitioning for segment files output by KegAlign.
+
+Usage:
+diagonal_partition.py <max-segments> <lastz-command>
+
+set <max-segments> = 0 to skip partitioning, -1 to estimate best parameter
+"""
+
+import collections
+import os
+import statistics
+import sys
+import typing
+
+
+def chunks(lst: tuple[str, ...], n: int) -> typing.Iterator[tuple[str, ...]]:
+    """Yield successive n-sized chunks from list."""
+    for i in range(0, len(lst), n):
+        yield lst[i:i + n]
+
+
+if __name__ == "__main__":
+    # TODO: make these optional user defined parameters
+
+    # deletes original segment file after splitting
+    DELETE_AFTER_CHUNKING = True
+
+    # don't partition segment files with line count below this value
+    MIN_CHUNK_SIZE = 5000
+
+    # only used when segment size is being estimated
+    MAX_CHUNK_SIZE = 50000
+
+    # include chosen split size in file name
+    DEBUG = False
+
+    # first parameter contains chunk size
+    chunk_size = int(sys.argv[1])
+    params = sys.argv[2:]
+
+    # don't do anything if 0 chunk size
+    if chunk_size == 0:
+        print(" ".join(params), flush=True)
+        sys.exit(0)
+
+    # Parsing command output from KegAlign
+    segment_key = "--segments="
+    segment_index = None
+    input_file = None
+
+    for index, value in enumerate(params):
+        if value[:len(segment_key)] == segment_key:
+            segment_index = index
+            input_file = value[len(segment_key):]
+            break
+
+    if segment_index is None:
+        sys.exit(f"Error: could not get segment key {segment_key} from parameters {params}")
+
+    if input_file is None:
+        sys.exit(f"Error: could not get segment file from parameters {params}")
+
+    if not os.path.isfile(input_file):
+        sys.exit(f"Error: File {input_file} does not exist")
+
+    # each char is 1 byte
+    line_size = None
+    file_size = os.path.getsize(input_file)
+    with open(input_file, "r") as f:
+        # add 1 for newline
+        line_size = len(f.readline())
+
+    estimated_lines = file_size // line_size
+
+    # check if chunk size should be estimated
+    if chunk_size < 0:
+        # optimization, do not need to get each file size in this case
+        if estimated_lines < MIN_CHUNK_SIZE:
+            print(" ".join(params), flush=True)
+            sys.exit(0)
+
+        # get size of each segment assuming DELETE_AFTER_CHUNKING == True
+        # takes into account already split segments
+        files = [i for i in os.listdir(".") if i.endswith(".segments")]
+
+        if len(files) < 2:
+            # if not enough segment files for estimation, use MAX_CHUNK_SIZE
+            chunk_size = MAX_CHUNK_SIZE
+        else:
+            fdict: typing.DefaultDict[str, int] = collections.defaultdict(int)
+            for filename in files:
+                size = os.path.getsize(filename)
+                f_ = filename.split(".split", 1)[0]
+                fdict[f_] += size
+
+            if len(fdict) < 7:
+                # outliers can heavily skew prediction if <7 data points
+                # to be safe, use 50% quantile
+                chunk_size = int(statistics.quantiles(fdict.values())[1] // line_size)
+            else:
+                # otherwise use 75% quantile
+                chunk_size = int(statistics.quantiles(fdict.values())[-1] // line_size)
+            # if not enough data points, there is a chance of getting unlucky
+            # minimize worst case by using MAX_CHUNK_SIZE
+
+            chunk_size = min(chunk_size, MAX_CHUNK_SIZE)
+
+    # no need to sort if number of lines <= chunk_size
+    if (estimated_lines <= chunk_size):
+        print(" ".join(params), flush=True)
+        sys.exit(0)
+
+    # Find rest of relevant parameters
+    output_key = "--output="
+    output_index = None
+    output_alignment_file = None
+    output_alignment_file_base = None
+    output_format = None
+
+    strand_key = "--strand="
+    strand_index = None
+    for index, value in enumerate(params):
+        if value[:len(output_key)] == output_key:
+            output_index = index
+            output_alignment_file = value[len(output_key):]
+            output_alignment_file_base, output_format = output_alignment_file.rsplit(".", 1)
+
+        if value[:len(strand_key)] == strand_key:
+            strand_index = index
+
+    if output_alignment_file_base is None:
+        sys.exit(f"Error: could not get output alignment file base from parameters {params}")
+
+    if output_format is None:
+        sys.exit(f"Error: could not get output format from parameters {params}")
+
+    if output_index is None:
+        sys.exit(f"Error: could not get output key {output_key} from parameters {params}")
+
+    if strand_index is None:
+        sys.exit(f"Error: could not get strand key {strand_key} from parameters {params}")
+
+    # error file is at very end
+    err_index = -1
+    err_name_base = params[-1].split(".err", 1)[0]
+
+    # dict of list of tuple (x, y, str)
+    data: dict[tuple[str, str], list[tuple[int, int, str]]] = {}
+
+    direction = None
+    if "plus" in params[strand_index]:
+        direction = "f"
+    elif "minus" in params[strand_index]:
+        direction = "r"
+    else:
+        sys.exit(f"Error: could not figure out direction from strand value {params[strand_index]}")
+
+    for line in open(input_file, "r"):
+        if line == "":
+            continue
+        seq1_name, seq1_start, seq1_end, seq2_name, seq2_start, seq2_end, _dir, score = line.split()
+        # data.append((int(seq1_start), int(seq2_start), line))
+        half_dist = int((int(seq1_end) - int(seq1_start)) // 2)
+        assert int(seq1_end) > int(seq1_start)
+        assert int(seq2_end) > int(seq2_start)
+        seq1_mid = int(seq1_start) + half_dist
+        seq2_mid = int(seq2_start) + half_dist
+        data.setdefault((seq1_name, seq2_name), []).append((seq1_mid, seq2_mid, line))
+
+    # If there are chromosome pairs with segment count <= chunk_size
+    # then no need to sort and split these pairs into separate files.
+    # It is better to keep these pairs in a single segment file.
+
+    # pairs that have count <= chunk_size. these will not be sorted
+    skip_pairs = []
+
+    # save query key order
+    # for lastz segment files: 'Query sequence names must appear in the same
+    # order as they do in the query file'
+
+    # NOTE: assuming data.keys() preserves order of keys. Requires Python 3.7+
+
+    query_key_order = list(dict.fromkeys([i[1] for i in data.keys()]))
+
+    if len(data.keys()) > 1:
+        for pair in data.keys():
+            if len(data[pair]) <= chunk_size:
+                skip_pairs.append(pair)
+
+    # sorting for forward segments
+    if direction == "r":
+        for pair in data.keys():
+            if pair not in skip_pairs:
+                data[pair] = sorted(data[pair], key=lambda coord: (coord[1] - coord[0], coord[0]))
+
+    # sorting for reverse segments
+    elif direction == "f":
+        for pair in data.keys():
+            if pair not in skip_pairs:
+                data[pair] = sorted(data[pair], key=lambda coord: (coord[1] + coord[0], coord[0]))
+    else:
+        sys.exit(f"INVALID DIRECTION VALUE: {direction}")
+
+    # Writing file in chunks
+    ctr = 0
+    # [i for i in data_keys if i not in set(skip_pairs)]:
+    for pair in (data.keys() - skip_pairs):
+        for chunk in chunks(list(zip(*data[pair]))[2], chunk_size):
+            ctr += 1
+            name_addition = f".split{ctr}"
+
+            if DEBUG:
+                name_addition = f".{chunk_size}{name_addition}"
+
+            fname = input_file.split(".segments", 1)[0] + name_addition + ".segments"
+
+            assert len(chunk) != 0
+            with open(fname, "w") as f:
+                f.writelines(chunk)
+            # update segment file in command
+            params[segment_index] = segment_key + fname
+            # update output file in command
+            params[output_index] = output_key + output_alignment_file_base + name_addition + "." + output_format
+            # update error file in command
+            params[-1] = err_name_base + name_addition + ".err"
+            print(" ".join(params), flush=True)
+
+    # writing unsorted skipped pairs
+    if len(skip_pairs) > 0:
+        # list of tuples of (pair length, pair)
+        skip_pairs_with_len = sorted([(len(data[p]), p) for p in skip_pairs])
+        # NOTE: This sorting can violate lastz query key order requirement, this is fixed later
+
+        # used for sorting
+        query_key_order_table = {item: idx for idx, item in enumerate(query_key_order)}
+
+        # list of list of pair names
+        aggregated_skip_pairs: list[list[tuple[str, str]]] = []
+        current_count = 0
+        aggregated_skip_pairs.append([])
+        for count, pair in skip_pairs_with_len:
+            if current_count + count <= chunk_size:
+                current_count += count
+                aggregated_skip_pairs[-1].append(pair)
+            else:
+                aggregated_skip_pairs.append([])
+                current_count = count
+                aggregated_skip_pairs[-1].append(pair)
+
+        for aggregate in aggregated_skip_pairs:
+            ctr += 1
+            name_addition = f".split{ctr}"
+
+            if DEBUG:
+                name_addition = f".{chunk_size}{name_addition}"
+
+            fname = input_file.split(".segments", 1)[0] + name_addition + ".segments"
+
+            with open(fname, "w") as f:
+                # fix possible lastz query key order violations
+                # p[1] is query key
+                for pair in sorted(aggregate, key=lambda p: query_key_order_table[p[1]]):
+                    chunk = list(zip(*data[pair]))[2]
+                    f.writelines(chunk)
+            # update segment file in command
+            params[segment_index] = segment_key + fname
+            # update output file in command
+            params[output_index] = output_key + output_alignment_file_base + name_addition + "." + output_format
+            # update error file in command
+            params[-1] = err_name_base + name_addition + ".err"
+            print(" ".join(params), flush=True)
+
+    if DELETE_AFTER_CHUNKING:
+        os.remove(input_file)
diff --git a/tools/kegalign/gapped_extension_options.xml b/tools/kegalign/gapped_extension_options.xml
@@ -0,0 +1,9 @@
+<macros>
+    <xml name="gapped_extension_options">
+        <section name="gapped_extension_options" expanded="false" title="Gapped Extension Options">
+            <param argument="--ydrop" type="integer" value="9430" label="Y-drop value for gapped extension"/>
+            <param argument="--gappedthresh" type="integer" value="" optional="true" label="Score threshold for gapped alignments"/>
+            <param argument="--notrivial" type="boolean" checked="false" label="Don't output a trivial self-alignment block if the target and query sequences are identical"/>
+        </section>
+    </xml>
+</macros>
diff --git a/tools/kegalign/kegalign.xml b/tools/kegalign/kegalign.xml
@@ -0,0 +1,208 @@
+<tool id="kegalign" name="KegAlign" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description>A Scalable GPU System for Pairwise Whole Genome Alignments based on LASTZ's seed-filter-extend paradigm</description>
+    <macros>
+        <import>macros.xml</import>
+        <import>sequence_options.xml</import>
+        <import>scoring_options.xml</import>
+        <import>seeding_options.xml</import>
+        <import>ungapped_extension_options.xml</import>
+        <import>gapped_extension_options.xml</import>
+        <import>output_options.xml</import>
+        <import>system_options.xml</import>
+    </macros>
+    <expand macro="edam_ontology"/>
+    <expand macro="requirements"/>
+    <required_files>
+        <include path="diagonal_partition.py"/>
+        <include path="lastz-cmd.ini"/>
+        <include path="package_output.py"/>
+        <include path="runner.py"/>
+    </required_files>
+    <command detect_errors="exit_code"><![CDATA[
+## Convert input sequences to 2bit -------------------------------------
+
+mkdir -p "\$(pwd)/work" &&
+faToTwoBit <(gzip -cdfq '$target') "\$(pwd)/work/ref.2bit" &&
+faToTwoBit <(gzip -cdfq '$query') "\$(pwd)/work/query.2bit" &&
+
+## Run KegAlign --------------------------------------------------------
+
+## explicitly calling python to bypass a pulsar bug
+## https://github.com/galaxyproject/pulsar/issues/341
+python '$__tool_directory__/runner.py'
+    --output-type tarball
+    --output-file '$kegalign_output'
+    --diagonal-partition
+    --num-cpu \${GALAXY_SLOTS:-2}
+    --tool_directory '$__tool_directory__'
+    '$target'
+    '$query'
+
+## Sequence Options ----------------------------------------------------
+
+    --strand '$sequence_options.strand_selector'
+
+## Scoring Options -----------------------------------------------------
+
+#set $scoring_pathname = str($scoring_options.scoring)
+#if $scoring_pathname != "None":
+    --scoring '$scoring_pathname'
+#end if
+#if str($scoring_options.ambiguous_selector) != "x"
+    #if str($scoring_options.set_ambiguous_params_selector) == "true"
+        #set $argument_value = ','.join($scoring_options.ambiguous_selector, $scoring_options.ambiguous_reward, $scoring_options.ambiguous_penalty)
+    --ambiguous '$argument_value'
+    #else
+    --ambiguous '$ambiguous_selector'
+    #end if
+#end if
+
+## Seeding Options -----------------------------------------------------
+
+#if str($seeding_options.seed.seed_selector) == "custom"
+    --seed '$seeding_options.seed.custom_seed'
+#else
+    --seed '$seeding_options.seed.seed_selector'
+#end if
+    --step '$seeding_options.step'
+#if str($seeding_options.notransition) == "true"
+    --notransition
+#end if
+
+## Ungapped Extension Options ------------------------------------------
+
+    --xdrop '$ungapped_extension_options.xdrop'
+    --hspthresh '$ungapped_extension_options.hspthresh'
+#if str($ungapped_extension_options.noentropy) == "true"
+    --noentropy
+#end if
+
+## Gapped Extension Options --------------------------------------------
+
+    --ydrop '$gapped_extension_options.ydrop'
+#if str($gapped_extension_options.gappedthresh) != ""
+    --gappedthresh '$gapped_extension_options.gappedthresh'
+#end if
+#if str($gapped_extension_options.notrivial) == "true"
+    --notrivial
+#end if
+
+## Output Options -----------------------------------------------------
+
+#if str($output_options.format.format_selector) == "bam"
+    --format '$output_options.format.bam_options'
+#else if str($output_options.format.format_selector) == "general_def"
+    --format general-
+#else if str($output_options.format.format_selector) == "general_full"
+    --format 'general-:${output_options.format.fields}'
+
+#else if str($output_options.format.format_selector) == "maf"
+    --format '$output_options.format.maf_type'
+#else if str($output_options.format.format_selector) == "blastn"
+    --format=BLASTN-
+#else if str($output_options.format.format_selector) == "differences"
+    --format=differences
+#end if
+## todo :: rplot, bam
+##  --action:target=multiple
+##  $output_format.rplot
+##  .if str( $output_format.out.format ) == "bam":
+##      | samtools sort -@\${GALAXY_SLOTS:-2} -T "\${TMPDIR:-.}" -O bam -o '${output}'
+##  .else:
+##      > '${output}'
+##  .end if
+##  .if $output_format.rplot:
+##      &&
+##      Rscript $r_plot > /dev/null 2>&1
+##  .end if
+
+## System Options -----------------------------------------------------
+
+    --wga_chunk_size '$system_options.wga_chunk_size'
+    --lastz_interval_size '$system_options.lastz_interval_size'
+    --seq_block_size '$system_options.seq_block_size'
+    --num_gpu '$system_options.num_gpu'
+#if str($system_options.debug) == "true"
+    --debug
+#end if
+
+## Package Output ----------------------------------------------------
+
+&&
+python '$__tool_directory__/package_output.py'
+    --tool_directory '$__tool_directory__'
+    --format_selector '$output_options.format.format_selector'
+#if str($system_options.debug) == "true"
+    --debug
+#end if
+
+    ]]></command>
+    <inputs>
+        <param name="target" type="data" format="fasta,fasta.gz" label="Target sequence file in FASTA format"/>
+        <param name="query" type="data" format="fasta,fasta.gz" label="Query sequence file in FASTA format"/>
+        <expand macro="sequence_options"/>
+        <expand macro="scoring_options"/>
+        <expand macro="seeding_options"/>
+        <expand macro="ungapped_extension_options"/>
+        <expand macro="gapped_extension_options"/>
+        <expand macro="output_options"/>
+        <expand macro="system_options"/>
+    </inputs>
+    <outputs>
+        <data name="kegalign_output" format="tgz" from_work_dir="data_package.tgz" label="KegAlign on ${on_string}"/>
+    </outputs>
+    <tests>
+        <test expect_num_outputs="1" expect_test_failure="true">
+            <param name="target" value="hg38.chr20.chunk.fa.gz" ftype="fasta.gz"/>
+            <param name="query" value="mm39.chr2.chunk.fa.gz" ftype="fasta.gz"/>
+            <output name="kegalign_output" ftype="tgz">
+                <assert_contents>
+                    <has_archive_member path="galaxy/commands.json"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+KegAlign is a scalable, GPU-accelerated system for computing pairwise WGA. KegAlign is based on the standard seed-filter-extend heuristic, in which the filtering stage dominates the runtime (e.g. 98% for human-mouse WGA), and is accelerated using GPU(s). KegAlign was designed as a faster replacement for lastz pairwise aligner.
+
+**Using this tool**
+
+.. class:: warningmark
+
+This tool is the first part of a two-step process for generation of paiwrise alignments. The output of this tool is used as an input to **Batched LASTZ** tool. The *Batched LASTZ** can be found in the list of tool of this Galaxy instance.
+
+**What it does**
+
+KegAlign processes **Target** and **Query** sequences to identify highly similar regions where gapped extension will be performed to create actual alignments. The actual alignments are generated by **Batched LASTZ** that should be run on the output of this tool.
+
+.. class:: infomark
+
+Although this tool is only the first part of the alignment process all parameters related to generation of alignments are set **during this stage**.
+
+**Scoring Options**
+
+By default the HOXD70 substitution scores are used (from `Chiaromonte et al. 2002 <https://www.ncbi.nlm.nih.gov/pubmed/11928468>`_)::
+
+    bad_score          = X:-1000  # used for sub['X'][*] and sub[*]['X']
+    fill_score         = -100     # used when sub[*][*] is not defined
+    gap_open_penalty   =  400
+    gap_extend_penalty =   30
+
+         A     C     G     T
+    A   91  -114   -31  -123
+    C -114   100  -125   -31
+    G  -31  -125   100  -114
+    T -123   -31  -114    91
+
+Matrix can be supplied as an input to **Read the substitution scores** parameter in *Scoring* section. Substitution matrix can be inferred from your data using another LASTZ-based tool (LASTZ_D: Infer substitution scores).
+
+**Output Options**
+
+.. class:: infomark
+
+The final format in which alignmnets will be generated by **Batched LASTZ** are set here.
+
+The default output is a MAF alignment file. Other formats can be configured in *Output Options* section.  See LASTZ manual <https://lastz.github.io/lastz/#formats>`_ for description of possible formats.
+    ]]></help>
+    <expand macro="citations"/>
+</tool>
diff --git a/tools/kegalign/lastz-cmd.ini b/tools/kegalign/lastz-cmd.ini
@@ -0,0 +1,94 @@
+[arguments]
+flag_args =
+  allgappedbounds
+  anyornone
+  gapped
+  gfextend
+  markend
+  nocensus
+  noentropy
+  nofilter
+  nogapped
+  nogfextend
+  nomirror
+  norecoverseeds
+  notransition
+  notrivial
+  notwins
+  noytrim
+  recoverseeds
+  self
+  version
+  yasra75
+  yasra85
+  yasra85short
+  yasra90
+  yasra95
+  yasra95short
+  yasra98
+
+str_args =
+  action:query
+  action:target
+  ambiguous
+  ball
+  chores
+  filter
+  format
+  gap
+  hspthresh
+  include
+  match
+  maxwordcount
+  mismatch
+  nochain
+  output
+  outputmasking
+  outputmasking+
+  outputmasking+:soft
+  outputmasking:soft
+  querydepth
+  queryhsplimit
+  rdotplot
+  readgroup
+  scores
+  seed
+  segments
+  show
+  strand
+  targetcapsule
+  twins
+  writecapsule
+  writesegments
+
+int_args =
+  allocate:query
+  allocate:target
+  allocate:traceback
+  exact
+  gappedthresh
+  inner
+  masking
+  queryhspbest
+  step
+  word
+  xdrop
+  ydrop
+
+bool_str_args=
+  census
+  census16
+  census32
+  chain
+  entropy
+  help
+  infer
+  inferonly
+  infscores
+  tableonly
+
+bool_int_args=
+  progress
+  progress+masking
+  transition
+
diff --git a/tools/kegalign/macros.xml b/tools/kegalign/macros.xml
@@ -0,0 +1,20 @@
+<macros>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">kegalign-full</requirement>
+        </requirements>
+    </xml>
+    <token name="@TOOL_VERSION@">0.1.2.7</token>
+    <token name="@VERSION_SUFFIX@">1</token>
+    <token name="@PROFILE@">21.05</token>
+    <xml name="edam_ontology">
+        <edam_operations>
+            <edam_operation>operation_0491</edam_operation>
+        </edam_operations>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1109/SC41405.2020.00043</citation>
+        </citations>
+    </xml>
+</macros>
diff --git a/tools/kegalign/output_options.xml b/tools/kegalign/output_options.xml
@@ -0,0 +1,95 @@
+<macros>
+    <xml name="output_options">
+        <section name="output_options" expanded="false" title="Output Options">
+            <conditional name="format">
+                <param name="format_selector" type="select" display="radio" label="Specify the output format">
+                    <option value="bam">BAM --format=sam)</option>
+                    <option value="general_def">General default (--format=general)</option>
+                    <option value="general_full">Customized general (--format=general[:fields])</option>
+                    <option value="maf" selected="true">MAF (--format=maf)</option>
+                    <option value="blastn">blastn (--format=BLASTN)</option>
+                    <option value="differences">Differences (--format=differences)</option>
+                </param>
+                <when value="bam">
+                    <param name="bam_options" type="select" display="radio" argument="--format=sam, --format=softsam" label="Select a BAM flavor to output" help="Lastz actually outputs SAM data but Galaxy converts it into BAM to save space. For alignments that don't reach the end of a query, ‑‑format=sam uses 'hard clipping', while ‑‑format=softsam uses 'soft clipping'. See the section on 'clipped alignment' in the SAM specification for an explanation of what this means. The options ‑‑format=sam- and ‑‑format=softsam- suppress the SAM header lines. This makes them suitable for concatenating output from multiple runs. If you need to specify readgroup information: use AddOrEplaceReadGroups from Picard package">
+                        <option value="sam">BAM</option>
+                        <option value="softsam">soft-clipped BAM</option>
+                        <option value="sam-">BAM without header</option>
+                        <option value="softsam-">soft-clipped BAM without header</option>
+                    </param>
+                </when>
+                <when value="general_def">
+                    <!-- Do nothing -->
+                </when>
+                <when value="general_full">
+                    <param name="fields" type="select" display="checkboxes" multiple="true" label="Select which fields to include" argument="--format=general-[:fields]">
+                        <option value="score" selected="true">score: Score of the alignment block</option>
+                        <option value="name1" selected="true">name1: Name of the target sequence</option>
+                        <option value="number1">number1: Number of the target sequence within the target file</option>
+                        <option value="strand1" selected="true">strand1: Target sequence strand </option>
+                        <option value="size1" selected="true">size1: Size of the entire target sequence</option>
+                        <option value="start1">start1: Starting position of the alignment block in the target, origin-one</option>
+                        <option value="zstart1" selected="true">zstart1: Starting position of the alignment block in the target, origin-zero</option>
+                        <option value="end1" selected="true">end1: Ending position of the alignment block in the target</option>
+                        <option value="length1">length1: Length of the alignment block in the target (excluding gaps)</option>
+                        <option value="text1">text1: Aligned characters in the target, including gap characters</option>
+                        <option value="qalign1">qalign1: The target quality sequence (if there is one) correpsonding to aligned characters</option>
+                        <option value="nucs1">nucs1: The entire target sequence</option>
+                        <option value="name2" selected="true">name2: Name of the query sequence</option>
+                        <option value="number2">number2: Number of the query sequence within the query file</option>
+                        <option value="strand2" selected="true">strand2: Query sequence strand</option>
+                        <option value="size2" selected="true">size2: Size of the entire query sequence</option>
+                        <option value="start2">start2: Starting position of the alignment block in the query, origin-one</option>
+                        <option value="zstart2" selected="true">zstart2: Starting position of the alignment block in the query, origin-one</option>
+                        <option value="end2" selected="true">end2: Ending position of the alignment block in the query</option>
+                        <option value="length2">length2: Length of the alignment block in the query (excluding gaps)</option>
+                        <option value="text2">text2: Aligned characters in the query, including gap characters</option>
+                        <option value="qalign2">qalign2: The query quality sequence (if there is one) correpsonding to aligned characters</option>
+                        <option value="nucs2">nucs2: The entire query sequence</option>
+                        <option value="nmatch">nmatch: Match count</option>
+                        <option value="nmismatch">nmismatch: Mismatch count</option>
+                        <option value="ncolumn">ncolumn: Number of columns in the block. This includes matches, mismatches (substitutions), and gaps</option>
+                        <option value="npair">npair: Number of aligned bases in the block that are matches or mismatches (substitutions)</option>
+                        <option value="ngap">ngap: Gap count, the number of gaps in the block, counting each run of gapped columns as a single gap</option>
+                        <option value="cgap">cgap: Gap column count, the number of gaps in the block, counting each gapped column as a separate gap</option>
+                        <option value="diff">diff: Differences between what would be written for text1 and text2</option>
+                        <option value="cigar">cigar: A CIGAR-like representation of the alignment’s path</option>
+                        <option value="cigarx">cigarx: Same as cigar, but uses a newer syntax that distinguishes matches from substitutions</option>
+                        <option value="identity" selected="true">identity: Fraction of aligned bases in the block that are matches </option>
+                        <option value="idfrac">idfrac: Fraction of aligned bases in the block that are matches </option>
+                        <option value="id%" selected="true">id% Fraction of aligned bases in the block that are matches (as %)</option>
+                        <option value="blastid%">blastid%: Fraction of the alignment block that is matches, as would be reported by NCBI BLAST</option>
+                        <option value="continuity">continuity: Rate of non-gaps (non-indels) in the alignment block</option>
+                        <option value="confrac">confrac: Rate of non-gaps (non-indels) in the alignment block (as fraction)</option>
+                        <option value="con%">con%: Rate of non-gaps (non-indels) in the alignment block (as %)</option>
+                        <option value="coverage" selected="true">coverage: Fraction of the entire input sequence (target or query, whichever is shorter) that is covered by the alignment block</option>
+                        <option value="covfrac">covfrac: Fraction of the entire input sequence (target or query, whichever is shorter) that is covered by the alignment block (as fraction)</option>
+                        <option value="cov%" selected="true">cov%: Fraction of the entire input sequence (target or query, whichever is shorter) that is covered by the alignment block (as %)</option>
+                        <option value="diagonal">diagonal: The diagonal of the start of the alignment block in the DP matrix, expressed as an identifying number start1-start2</option>
+                        <option value="shingle">shingle: A measurement of the shingle overlap between the target and the query</option>
+                        <option value="number">number: The alignment number, counted as alignments are written to output (1-base)</option>
+                        <option value="znumber">znumber: The alignment number, counted as alignments are written to output (0-base)</option>
+                        <sanitizer invalid_char="">
+                            <valid initial="string.letters,string.digits">
+                                <add value="%"/>
+                            </valid>
+                        </sanitizer>
+                    </param>
+                </when>
+                <when value="maf">
+                    <param name="maf_type" type="select" display="radio" argument="--format=maf" label="Select MAF flavor" help="MAF is a multiple alignment format developed at UCSC">
+                        <option value="maf">MAF</option>
+                        <option value="maf+">MAF with additional stats</option>
+                        <option value="maf-" selected="true">MAF without header and comments</option>
+                    </param>
+                </when>
+                <when value="blastn">
+                    <!-- Do nothing -->
+                </when>
+                <when value="differences">
+                    <!-- Do nothing -->
+                </when>
+            </conditional>
+        </section>
+    </xml>
+</macros>
diff --git a/tools/kegalign/package_output.py b/tools/kegalign/package_output.py
@@ -0,0 +1,337 @@
+#!/usr/bin/env python
+
+import argparse
+import configparser
+import json
+import os
+import resource
+import sys
+import tarfile
+import time
+import typing
+
+import bashlex
+
+RUSAGE_ATTRS: typing.Final = ["ru_utime", "ru_stime", "ru_maxrss", "ru_minflt", "ru_majflt", "ru_inblock", "ru_oublock", "ru_nvcsw", "ru_nivcsw"]
+
+
+class PackageFile:
+    def __init__(
+        self,
+        pathname: str = "data_package.tgz",
+        top_dir: str = "galaxy",
+        data_dir: str = "files",
+        config_file: str = "commands.json",
+        format_file: str = "format.txt"
+    ) -> None:
+        self.pathname: str = os.path.realpath(pathname)
+        self.data_root: str = os.path.join(top_dir, data_dir)
+        self.config_path: str = os.path.join(top_dir, config_file)
+        self.config_file: str = config_file
+        self.format_path: str = os.path.join(top_dir, format_file)
+        self.format_file: str = format_file
+        self.tarfile: typing.Optional[tarfile.TarFile] = None
+        self.name_cache: typing.Dict[typing.Any, typing.Any] = {}
+        self.working_dir: str = os.path.realpath(os.getcwd())
+
+    def _initialize(self) -> None:
+        if self.tarfile is None:
+            self.tarfile = tarfile.open(
+                name=self.pathname,
+                mode="w:gz",
+                format=tarfile.GNU_FORMAT,
+                compresslevel=6,
+            )
+
+    def add_config(self, pathname: str) -> None:
+        if self.tarfile is None:
+            self._initialize()
+
+        source_path = os.path.realpath(pathname)
+
+        if self.tarfile is not None:
+            self.tarfile.add(source_path, arcname=self.config_path, recursive=False)
+
+    def add_file(self, pathname: str, arcname: typing.Optional[str] = None) -> None:
+        if self.tarfile is None:
+            self._initialize()
+
+        source_path = os.path.realpath(pathname)
+
+        dest_path = None
+
+        if arcname is None:
+            dest_path = os.path.join(self.data_root, os.path.basename(source_path))
+        else:
+            arc_path = os.path.realpath(arcname)
+            rel_path = os.path.relpath(arc_path, self.working_dir)
+            if not (os.path.isabs(rel_path) or rel_path.startswith("../")):
+                dest_path = os.path.join(self.data_root, rel_path)
+            else:
+                sys.exit("path fail")
+
+        if dest_path is not None:
+            if self.tarfile is not None:
+                if dest_path not in self.name_cache:
+                    try:
+                        self.tarfile.add(
+                            source_path, arcname=dest_path, recursive=False
+                        )
+                    except FileNotFoundError:
+                        sys.exit(f"missing source file {source_path}")
+
+                    self.name_cache[dest_path] = 1
+                    # print(f"added: {dest_path}", flush=True)
+
+    def add_format(self, pathname: str) -> None:
+        if self.tarfile is None:
+            self._initialize()
+
+        source_path = os.path.realpath(pathname)
+
+        if self.tarfile is not None:
+            self.tarfile.add(source_path, arcname=self.format_path, recursive=False)
+
+    def close(self) -> None:
+        if self.tarfile is not None:
+            self.tarfile.close()
+            self.tarfile = None
+
+
+class bashCommandLineFile:
+    def __init__(
+        self,
+        pathname: str,
+        config: configparser.ConfigParser,
+        args: argparse.Namespace,
+        package_file: PackageFile,
+    ) -> None:
+        self.pathname: str = pathname
+        self.config = config
+        self.args = args
+        self.package_file = package_file
+        self.executable: typing.Optional[str] = None
+        self._parse_lines()
+        self._write_format()
+
+    def _parse_lines(self) -> None:
+        with open("commands.json", "w") as ofh:
+            with open(self.pathname) as f:
+                line: str
+                for line in f:
+                    line = line.rstrip("\n")
+                    command_dict = self._parse_line(line)
+                    # we may want to re-write args here
+                    new_args_list = []
+
+                    args_list = command_dict.get("args", [])
+                    for arg in args_list:
+                        if arg.startswith("--target="):
+                            pathname = arg[9:]
+                            new_args_list.append(arg)
+                            if "[" in pathname:
+                                elems = pathname.split("[")
+                                sequence_file = elems.pop(0)
+                                self.package_file.add_file(sequence_file, sequence_file)
+                                for elem in elems:
+                                    if elem.endswith("]"):
+                                        elem = elem[:-1]
+                                        if elem.startswith("subset="):
+                                            subset_file = elem[7:]
+                                            self.package_file.add_file(subset_file)
+
+                        elif arg.startswith("--query="):
+                            pathname = arg[8:]
+                            new_args_list.append(arg)
+                            if "[" in pathname:
+                                elems = pathname.split("[")
+                                sequence_file = elems.pop(0)
+                                self.package_file.add_file(sequence_file, sequence_file)
+                                for elem in elems:
+                                    if elem.endswith("]"):
+                                        elem = elem[:-1]
+                                        if elem.startswith("subset="):
+                                            subset_file = elem[7:]
+                                            self.package_file.add_file(subset_file)
+                        elif arg.startswith("--segments="):
+                            pathname = arg[11:]
+                            new_args_list.append(arg)
+                            self.package_file.add_file(pathname)
+                        elif arg.startswith("--scores="):
+                            pathname = arg[9:]
+                            new_args_list.append("--scores=data/scores.txt")
+                            self.package_file.add_file(pathname, "data/scores.txt")
+                        else:
+                            new_args_list.append(arg)
+
+                    command_dict["args"] = new_args_list
+                    print(json.dumps(command_dict), file=ofh)
+
+        self.package_file.add_config("commands.json")
+
+    def _parse_line(self, line: str) -> typing.Dict[str, typing.Any]:
+        # resolve shell redirects
+        trees: typing.List[typing.Any] = bashlex.parse(line, strictmode=False)
+        positions: typing.List[typing.Tuple[int, int]] = []
+
+        for tree in trees:
+            visitor = nodevisitor(positions)
+            visitor.visit(tree)
+
+        # do replacements from the end so the indicies will be correct
+        positions.reverse()
+
+        processed = list(line)
+        for start, end in positions:
+            processed[start:end] = ""
+
+        processed_line: str = "".join(processed)
+
+        command_dict = self._parse_processed_line(processed_line)
+        command_dict["stdin"] = visitor.stdin
+        command_dict["stdout"] = visitor.stdout
+        command_dict["stderr"] = visitor.stderr
+
+        return command_dict
+
+    def _parse_processed_line(self, line: str) -> typing.Dict[str, typing.Any]:
+        argv: typing.List[str] = list(bashlex.split(line))
+        self.executable = argv.pop(0)
+
+        parser: argparse.ArgumentParser = argparse.ArgumentParser(add_help=False)
+        if "arguments" in self.config:
+            arguments_section = self.config["arguments"]
+
+            arg: str
+            if "flag_args" in arguments_section:
+                for arg in arguments_section["flag_args"].split():
+                    parser.add_argument(f"--{arg}", action="store_true")
+
+            if "str_args" in arguments_section:
+                for arg in arguments_section["str_args"].split():
+                    parser.add_argument(f"--{arg}", type=str)
+
+            if "bool_str_args" in arguments_section:
+                for arg in arguments_section["bool_str_args"].split():
+                    parser.add_argument(
+                        f"--{arg}", nargs="?", const=True, default=False
+                    )
+
+            if "int_args" in arguments_section:
+                for arg in arguments_section["int_args"].split():
+                    parser.add_argument(f"--{arg}", type=int)
+
+            if "bool_int_args" in arguments_section:
+                for arg in arguments_section["bool_int_args"].split():
+                    parser.add_argument(
+                        f"--{arg}", nargs="?", const=True, default=False
+                    )
+
+        namespace, rest = parser.parse_known_intermixed_args(argv)
+        vars_dict = vars(namespace)
+
+        command_dict: typing.Dict[str, typing.Any] = {
+            "executable": self.executable,
+            "args": [],
+        }
+
+        for var in vars_dict.keys():
+            value = vars_dict[var]
+            if value is not None:
+                if isinstance(value, bool):
+                    if value is True:
+                        command_dict["args"].append(f"--{var}")
+                else:
+                    command_dict["args"].append(f"--{var}={value}")
+
+        if len(rest) >= 0:
+            value = rest.pop(0)
+            command_dict["args"].append(f"--target={value}")
+
+        if len(rest) >= 0:
+            value = rest.pop(0)
+            command_dict["args"].append(f"--query={value}")
+
+        return command_dict
+
+    def _write_format(self) -> None:
+        if self.args.format_selector == "bam":
+            format_name = "bam"
+        elif self.args.format_selector == "maf":
+            format_name = "maf"
+        elif self.args.format_selector == "differences":
+            format_name = "interval"
+        else:
+            format_name = "tabular"
+
+        with open("format.txt", "w") as ofh:
+            print(f"{format_name}", file=ofh)
+
+        self.package_file.add_format("format.txt")
+
+
+class nodevisitor(bashlex.ast.nodevisitor):  # type: ignore[misc]
+    def __init__(self, positions: typing.List[typing.Tuple[int, int]]) -> None:
+        self.positions = positions
+        self.stdin = None
+        self.stdout = None
+        self.stderr = None
+
+    def visitredirect(
+        self,
+        n: bashlex.ast.node,
+        n_input: int,
+        n_type: str,
+        output: typing.Any,
+        heredoc: typing.Any,
+    ) -> None:
+        if isinstance(n_input, int) and 0 <= n_input <= 2:
+            if isinstance(output, bashlex.ast.node) and output.kind == "word":
+                self.positions.append(n.pos)
+                if n_input == 0:
+                    self.stdin = output.word
+                elif n_input == 1:
+                    self.stdout = output.word
+                elif n_input == 2:
+                    self.stderr = output.word
+            else:
+                sys.exit(f"oops 1: {type(n_input)}")
+        else:
+            sys.exit(f"oops 2: {type(n_input)}")
+
+    def visitheredoc(self, n: bashlex.ast.node, value: typing.Any) -> None:
+        pass
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--tool_directory", type=str, required=True, help="tool directory")
+    parser.add_argument("--format_selector", type=str, required=True, help="format selector")
+    parser.add_argument("--debug", action="store_true", help="enable debug messages")
+    args = parser.parse_args()
+
+    if args.debug:
+        r_beg = resource.getrusage(resource.RUSAGE_SELF)
+        beg: int = time.monotonic_ns()
+
+    lastz_command_config_file: str = os.path.join(args.tool_directory, "lastz-cmd.ini")
+
+    config: configparser.ConfigParser = configparser.ConfigParser()
+    config.read(lastz_command_config_file)
+
+    package_file = PackageFile()
+    lastz_command_file = "lastz-commands.txt"
+    bashCommandLineFile(lastz_command_file, config, args, package_file)
+    package_file.close()
+
+    if args.debug:
+        ns: int = time.monotonic_ns() - beg
+        r_end = resource.getrusage(resource.RUSAGE_SELF)
+        print(f"package output clock time: {ns} ns", file=sys.stderr, flush=True)
+        for rusage_attr in RUSAGE_ATTRS:
+            value = getattr(r_end, rusage_attr) - getattr(r_beg, rusage_attr)
+            print(f"  package output {rusage_attr}: {value}", file=sys.stderr, flush=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/kegalign/runner.py b/tools/kegalign/runner.py
diff --git a/tools/kegalign/scoring_options.xml b/tools/kegalign/scoring_options.xml
@@ -0,0 +1,25 @@
+<macros>
+    <xml name="scoring_options">
+        <section name="scoring_options" expanded="false" title="Scoring Options">
+            <param argument="--scoring" type="data" value="false" optional="true" format="txt" label="Scoring file in LASTZ format"/>
+            <param name="ambiguous_selector" type="select" label="Ambiguous Nucleotides">
+                <option value="x" selected="true">None</option>
+                <option value="n">N</option>
+                <option value="iupac">IUPAC</option>
+            </param>
+            <conditional name="ambiguous_params">
+                <param name="set_ambiguous_params_selector" type="select" label="Set Ambiguous Nucleotide Scoring Parameters">
+                    <option value="false" selected="true">No</option>
+                    <option value="true">Yes</option>
+                </param>
+                <when value="false">
+                    <!-- Do nothing -->
+                </when>
+                <when value="true">
+                    <param name="ambiguous_reward" type="integer" value="0" label="Ambiguous Nucleotide Reward"/>
+                    <param name="ambiguous_penalty" type="integer" value="0" label="Ambiguous Nucleotide Penalty"/>
+                </when>
+            </conditional>
+        </section>
+    </xml>
+</macros>
diff --git a/tools/kegalign/seeding_options.xml b/tools/kegalign/seeding_options.xml
@@ -0,0 +1,34 @@
+<macros>
+    <xml name="seeding_options">
+        <section name="seeding_options" expanded="false" title="Seeding Options">
+            <conditional name="seed">
+                <param name="seed_selector" type="select" label="Seed patern">
+                    <option value="12of19" selected="true">12of19 (1110100110010101111)</option>
+                    <option value="14of22">14of22 (1110101100110010101111)</option>
+                    <option value="custom">custom</option>
+                </param>
+                <when value="12of19">
+                    <!-- Do nothing -->
+                </when>
+                <when value="14of22">
+                    <!-- Do nothing -->
+                </when>
+                <when value="custom">
+                    <param name="custom_seed" type="text" value="" optional="false" label="Custom seed pattern">
+                        <validator type="empty_field"/>
+                        <validator type="regex" message="Arbitrary pattern of 1s, 0s, and Ts">^[01T]+$</validator>
+                        <sanitizer invalid_char="">
+                            <valid initial="none">
+                                <add value="0"/>
+                                <add value="1"/>
+                                <add value="T"/>
+                            </valid>
+                        </sanitizer>
+                    </param>
+                </when>
+            </conditional>
+            <param argument="--step" type="integer" value="1" label="Offset between the starting positions of successive target words considered for generating seed table"/>
+            <param argument="--notransition" type="boolean" checked="false" label="Don't allow one transition in a seed hit"/>
+        </section>
+    </xml>
+</macros>
diff --git a/tools/kegalign/sequence_options.xml b/tools/kegalign/sequence_options.xml
@@ -0,0 +1,12 @@
+<macros>
+    <xml name="sequence_options">
+        <section name="sequence_options" expanded="false" title="Sequence Options">
+            <param name="strand_selector" type="select" label="Strand to search">
+                <option value="plus">plus</option>
+                <option value="minus">minus</option>
+                <option value="both" selected="true">both</option>
+            </param>
+            <yield/>
+        </section>
+    </xml>
+</macros>
diff --git a/tools/kegalign/system_options.xml b/tools/kegalign/system_options.xml
@@ -0,0 +1,11 @@
+<macros>
+    <xml name="system_options">
+        <section name="system_options" title="System Options">
+            <param argument="--wga_chunk_size" type="integer" value="250000" label="Chunk sizes for GPU calls for Xdrop - ⚠ change only if you are a developer"/>
+            <param argument="--lastz_interval_size" type="integer" value="10000000" label="LASTZ interval for Ydrop - ⚠ change only if you are a developer"/>
+            <param argument="--seq_block_size" type="integer" value="400000000" label="LASTZ interval for Ydrop - ⚠ change only if you are a developer"/>
+            <param argument="--num_gpu" type="integer" value="-1" label="Specify number of GPUs to use - -1 if all the GPUs should be used"/>
+            <param argument="--debug" type="boolean" value="false" label="Print debug messages"/>
+        </section>
+    </xml>
+</macros>
diff --git a/tools/kegalign/test-data/hg38.chr20.chunk.fa.gz b/tools/kegalign/test-data/hg38.chr20.chunk.fa.gz
diff --git a/tools/kegalign/test-data/mm39.chr2.chunk.fa.gz b/tools/kegalign/test-data/mm39.chr2.chunk.fa.gz
diff --git a/tools/kegalign/ungapped_extension_options.xml b/tools/kegalign/ungapped_extension_options.xml
@@ -0,0 +1,9 @@
+<macros>
+    <xml name="ungapped_extension_options">
+        <section name="ungapped_extension_options" expanded="false" title="Ungapped Extension Options">
+            <param argument="--xdrop" type="integer" value="910" label="X-drop value for gap-free extension"/>
+            <param argument="--hspthresh" type="integer" value="3000" label="Segment score threshold for high scoring pairs"/>
+            <param argument="--noentropy" type="boolean" checked="false" label="Don't adjust low score segment pair scores using entropy factor after filtering stage"/>
+        </section>
+    </xml>
+</macros>