Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
adding kegalign
Browse files Browse the repository at this point in the history
Richard Burhans committed Aug 29, 2024
1 parent 167e4bf commit f032b04
Showing 16 changed files with 1,638 additions and 0 deletions.
12 changes: 12 additions & 0 deletions tools/kegalign/.shed.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
categories:
- Next Gen Mappers
description: A Scalable GPU System for Pairwise Whole Genome Alignments based on LASTZ's seed-filter-extend paradigm.
homepage_url: https://github.com/galaxyproject/KegAlign
long_description: |
KegAlign is a scalable, GPU-accelerated system for computing pairwise
WGA. KegAlign is based on the standard seed-filter-extend heuristic,
in which the filtering stage dominates the runtime.
name: kegalign
owner: richard-burhans
remote_repository_url: https://github.com/richard-burhans/galaxytools/tree/main/tools/kegalign
type: unrestricted
278 changes: 278 additions & 0 deletions tools/kegalign/diagonal_partition.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,278 @@
#!/usr/bin/env python


"""
Diagonal partitioning for segment files output by KegAlign.
Usage:
diagonal_partition.py <max-segments> <lastz-command>
set <max-segments> = 0 to skip partitioning, -1 to estimate best parameter
"""

import collections
import os
import statistics
import sys
import typing


def chunks(lst: tuple[str, ...], n: int) -> typing.Iterator[tuple[str, ...]]:
"""Yield successive n-sized chunks from list."""
for i in range(0, len(lst), n):
yield lst[i:i + n]


if __name__ == "__main__":
# TODO: make these optional user defined parameters

# deletes original segment file after splitting
DELETE_AFTER_CHUNKING = True

# don't partition segment files with line count below this value
MIN_CHUNK_SIZE = 5000

# only used when segment size is being estimated
MAX_CHUNK_SIZE = 50000

# include chosen split size in file name
DEBUG = False

# first parameter contains chunk size
chunk_size = int(sys.argv[1])
params = sys.argv[2:]

# don't do anything if 0 chunk size
if chunk_size == 0:
print(" ".join(params), flush=True)
sys.exit(0)

# Parsing command output from KegAlign
segment_key = "--segments="
segment_index = None
input_file = None

for index, value in enumerate(params):
if value[:len(segment_key)] == segment_key:
segment_index = index
input_file = value[len(segment_key):]
break

if segment_index is None:
sys.exit(f"Error: could not get segment key {segment_key} from parameters {params}")

if input_file is None:
sys.exit(f"Error: could not get segment file from parameters {params}")

if not os.path.isfile(input_file):
sys.exit(f"Error: File {input_file} does not exist")

# each char is 1 byte
line_size = None
file_size = os.path.getsize(input_file)
with open(input_file, "r") as f:
# add 1 for newline
line_size = len(f.readline())

estimated_lines = file_size // line_size

# check if chunk size should be estimated
if chunk_size < 0:
# optimization, do not need to get each file size in this case
if estimated_lines < MIN_CHUNK_SIZE:
print(" ".join(params), flush=True)
sys.exit(0)

# get size of each segment assuming DELETE_AFTER_CHUNKING == True
# takes into account already split segments
files = [i for i in os.listdir(".") if i.endswith(".segments")]

if len(files) < 2:
# if not enough segment files for estimation, use MAX_CHUNK_SIZE
chunk_size = MAX_CHUNK_SIZE
else:
fdict: typing.DefaultDict[str, int] = collections.defaultdict(int)
for filename in files:
size = os.path.getsize(filename)
f_ = filename.split(".split", 1)[0]
fdict[f_] += size

if len(fdict) < 7:
# outliers can heavily skew prediction if <7 data points
# to be safe, use 50% quantile
chunk_size = int(statistics.quantiles(fdict.values())[1] // line_size)
else:
# otherwise use 75% quantile
chunk_size = int(statistics.quantiles(fdict.values())[-1] // line_size)
# if not enough data points, there is a chance of getting unlucky
# minimize worst case by using MAX_CHUNK_SIZE

chunk_size = min(chunk_size, MAX_CHUNK_SIZE)

# no need to sort if number of lines <= chunk_size
if (estimated_lines <= chunk_size):
print(" ".join(params), flush=True)
sys.exit(0)

# Find rest of relevant parameters
output_key = "--output="
output_index = None
output_alignment_file = None
output_alignment_file_base = None
output_format = None

strand_key = "--strand="
strand_index = None
for index, value in enumerate(params):
if value[:len(output_key)] == output_key:
output_index = index
output_alignment_file = value[len(output_key):]
output_alignment_file_base, output_format = output_alignment_file.rsplit(".", 1)

if value[:len(strand_key)] == strand_key:
strand_index = index

if output_alignment_file_base is None:
sys.exit(f"Error: could not get output alignment file base from parameters {params}")

if output_format is None:
sys.exit(f"Error: could not get output format from parameters {params}")

if output_index is None:
sys.exit(f"Error: could not get output key {output_key} from parameters {params}")

if strand_index is None:
sys.exit(f"Error: could not get strand key {strand_key} from parameters {params}")

# error file is at very end
err_index = -1
err_name_base = params[-1].split(".err", 1)[0]

# dict of list of tuple (x, y, str)
data: dict[tuple[str, str], list[tuple[int, int, str]]] = {}

direction = None
if "plus" in params[strand_index]:
direction = "f"
elif "minus" in params[strand_index]:
direction = "r"
else:
sys.exit(f"Error: could not figure out direction from strand value {params[strand_index]}")

for line in open(input_file, "r"):
if line == "":
continue
seq1_name, seq1_start, seq1_end, seq2_name, seq2_start, seq2_end, _dir, score = line.split()
# data.append((int(seq1_start), int(seq2_start), line))
half_dist = int((int(seq1_end) - int(seq1_start)) // 2)
assert int(seq1_end) > int(seq1_start)
assert int(seq2_end) > int(seq2_start)
seq1_mid = int(seq1_start) + half_dist
seq2_mid = int(seq2_start) + half_dist
data.setdefault((seq1_name, seq2_name), []).append((seq1_mid, seq2_mid, line))

# If there are chromosome pairs with segment count <= chunk_size
# then no need to sort and split these pairs into separate files.
# It is better to keep these pairs in a single segment file.

# pairs that have count <= chunk_size. these will not be sorted
skip_pairs = []

# save query key order
# for lastz segment files: 'Query sequence names must appear in the same
# order as they do in the query file'

# NOTE: assuming data.keys() preserves order of keys. Requires Python 3.7+

query_key_order = list(dict.fromkeys([i[1] for i in data.keys()]))

if len(data.keys()) > 1:
for pair in data.keys():
if len(data[pair]) <= chunk_size:
skip_pairs.append(pair)

# sorting for forward segments
if direction == "r":
for pair in data.keys():
if pair not in skip_pairs:
data[pair] = sorted(data[pair], key=lambda coord: (coord[1] - coord[0], coord[0]))

# sorting for reverse segments
elif direction == "f":
for pair in data.keys():
if pair not in skip_pairs:
data[pair] = sorted(data[pair], key=lambda coord: (coord[1] + coord[0], coord[0]))
else:
sys.exit(f"INVALID DIRECTION VALUE: {direction}")

# Writing file in chunks
ctr = 0
# [i for i in data_keys if i not in set(skip_pairs)]:
for pair in (data.keys() - skip_pairs):
for chunk in chunks(list(zip(*data[pair]))[2], chunk_size):
ctr += 1
name_addition = f".split{ctr}"

if DEBUG:
name_addition = f".{chunk_size}{name_addition}"

fname = input_file.split(".segments", 1)[0] + name_addition + ".segments"

assert len(chunk) != 0
with open(fname, "w") as f:
f.writelines(chunk)
# update segment file in command
params[segment_index] = segment_key + fname
# update output file in command
params[output_index] = output_key + output_alignment_file_base + name_addition + "." + output_format
# update error file in command
params[-1] = err_name_base + name_addition + ".err"
print(" ".join(params), flush=True)

# writing unsorted skipped pairs
if len(skip_pairs) > 0:
# list of tuples of (pair length, pair)
skip_pairs_with_len = sorted([(len(data[p]), p) for p in skip_pairs])
# NOTE: This sorting can violate lastz query key order requirement, this is fixed later

# used for sorting
query_key_order_table = {item: idx for idx, item in enumerate(query_key_order)}

# list of list of pair names
aggregated_skip_pairs: list[list[tuple[str, str]]] = []
current_count = 0
aggregated_skip_pairs.append([])
for count, pair in skip_pairs_with_len:
if current_count + count <= chunk_size:
current_count += count
aggregated_skip_pairs[-1].append(pair)
else:
aggregated_skip_pairs.append([])
current_count = count
aggregated_skip_pairs[-1].append(pair)

for aggregate in aggregated_skip_pairs:
ctr += 1
name_addition = f".split{ctr}"

if DEBUG:
name_addition = f".{chunk_size}{name_addition}"

fname = input_file.split(".segments", 1)[0] + name_addition + ".segments"

with open(fname, "w") as f:
# fix possible lastz query key order violations
# p[1] is query key
for pair in sorted(aggregate, key=lambda p: query_key_order_table[p[1]]):
chunk = list(zip(*data[pair]))[2]
f.writelines(chunk)
# update segment file in command
params[segment_index] = segment_key + fname
# update output file in command
params[output_index] = output_key + output_alignment_file_base + name_addition + "." + output_format
# update error file in command
params[-1] = err_name_base + name_addition + ".err"
print(" ".join(params), flush=True)

if DELETE_AFTER_CHUNKING:
os.remove(input_file)
9 changes: 9 additions & 0 deletions tools/kegalign/gapped_extension_options.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<macros>
<xml name="gapped_extension_options">
<section name="gapped_extension_options" expanded="false" title="Gapped Extension Options">
<param argument="--ydrop" type="integer" value="9430" label="Y-drop value for gapped extension"/>
<param argument="--gappedthresh" type="integer" value="" optional="true" label="Score threshold for gapped alignments"/>
<param argument="--notrivial" type="boolean" checked="false" label="Don't output a trivial self-alignment block if the target and query sequences are identical"/>
</section>
</xml>
</macros>
208 changes: 208 additions & 0 deletions tools/kegalign/kegalign.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
<tool id="kegalign" name="KegAlign" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
<description>A Scalable GPU System for Pairwise Whole Genome Alignments based on LASTZ's seed-filter-extend paradigm</description>
<macros>
<import>macros.xml</import>
<import>sequence_options.xml</import>
<import>scoring_options.xml</import>
<import>seeding_options.xml</import>
<import>ungapped_extension_options.xml</import>
<import>gapped_extension_options.xml</import>
<import>output_options.xml</import>
<import>system_options.xml</import>
</macros>
<expand macro="edam_ontology"/>
<expand macro="requirements"/>
<required_files>
<include path="diagonal_partition.py"/>
<include path="lastz-cmd.ini"/>
<include path="package_output.py"/>
<include path="runner.py"/>
</required_files>
<command detect_errors="exit_code"><![CDATA[
## Convert input sequences to 2bit -------------------------------------
mkdir -p "\$(pwd)/work" &&
faToTwoBit <(gzip -cdfq '$target') "\$(pwd)/work/ref.2bit" &&
faToTwoBit <(gzip -cdfq '$query') "\$(pwd)/work/query.2bit" &&
## Run KegAlign --------------------------------------------------------
## explicitly calling python to bypass a pulsar bug
## https://github.com/galaxyproject/pulsar/issues/341
python '$__tool_directory__/runner.py'
--output-type tarball
--output-file '$kegalign_output'
--diagonal-partition
--num-cpu \${GALAXY_SLOTS:-2}
--tool_directory '$__tool_directory__'
'$target'
'$query'
## Sequence Options ----------------------------------------------------
--strand '$sequence_options.strand_selector'
## Scoring Options -----------------------------------------------------
#set $scoring_pathname = str($scoring_options.scoring)
#if $scoring_pathname != "None":
--scoring '$scoring_pathname'
#end if
#if str($scoring_options.ambiguous_selector) != "x"
#if str($scoring_options.set_ambiguous_params_selector) == "true"
#set $argument_value = ','.join($scoring_options.ambiguous_selector, $scoring_options.ambiguous_reward, $scoring_options.ambiguous_penalty)
--ambiguous '$argument_value'
#else
--ambiguous '$ambiguous_selector'
#end if
#end if
## Seeding Options -----------------------------------------------------
#if str($seeding_options.seed.seed_selector) == "custom"
--seed '$seeding_options.seed.custom_seed'
#else
--seed '$seeding_options.seed.seed_selector'
#end if
--step '$seeding_options.step'
#if str($seeding_options.notransition) == "true"
--notransition
#end if
## Ungapped Extension Options ------------------------------------------
--xdrop '$ungapped_extension_options.xdrop'
--hspthresh '$ungapped_extension_options.hspthresh'
#if str($ungapped_extension_options.noentropy) == "true"
--noentropy
#end if
## Gapped Extension Options --------------------------------------------
--ydrop '$gapped_extension_options.ydrop'
#if str($gapped_extension_options.gappedthresh) != ""
--gappedthresh '$gapped_extension_options.gappedthresh'
#end if
#if str($gapped_extension_options.notrivial) == "true"
--notrivial
#end if
## Output Options -----------------------------------------------------
#if str($output_options.format.format_selector) == "bam"
--format '$output_options.format.bam_options'
#else if str($output_options.format.format_selector) == "general_def"
--format general-
#else if str($output_options.format.format_selector) == "general_full"
--format 'general-:${output_options.format.fields}'
#else if str($output_options.format.format_selector) == "maf"
--format '$output_options.format.maf_type'
#else if str($output_options.format.format_selector) == "blastn"
--format=BLASTN-
#else if str($output_options.format.format_selector) == "differences"
--format=differences
#end if
## todo :: rplot, bam
## --action:target=multiple
## $output_format.rplot
## .if str( $output_format.out.format ) == "bam":
## | samtools sort -@\${GALAXY_SLOTS:-2} -T "\${TMPDIR:-.}" -O bam -o '${output}'
## .else:
## > '${output}'
## .end if
## .if $output_format.rplot:
## &&
## Rscript $r_plot > /dev/null 2>&1
## .end if
## System Options -----------------------------------------------------
--wga_chunk_size '$system_options.wga_chunk_size'
--lastz_interval_size '$system_options.lastz_interval_size'
--seq_block_size '$system_options.seq_block_size'
--num_gpu '$system_options.num_gpu'
#if str($system_options.debug) == "true"
--debug
#end if
## Package Output ----------------------------------------------------
&&
python '$__tool_directory__/package_output.py'
--tool_directory '$__tool_directory__'
--format_selector '$output_options.format.format_selector'
#if str($system_options.debug) == "true"
--debug
#end if
]]></command>
<inputs>
<param name="target" type="data" format="fasta,fasta.gz" label="Target sequence file in FASTA format"/>
<param name="query" type="data" format="fasta,fasta.gz" label="Query sequence file in FASTA format"/>
<expand macro="sequence_options"/>
<expand macro="scoring_options"/>
<expand macro="seeding_options"/>
<expand macro="ungapped_extension_options"/>
<expand macro="gapped_extension_options"/>
<expand macro="output_options"/>
<expand macro="system_options"/>
</inputs>
<outputs>
<data name="kegalign_output" format="tgz" from_work_dir="data_package.tgz" label="KegAlign on ${on_string}"/>
</outputs>
<tests>
<test expect_num_outputs="1" expect_test_failure="true">
<param name="target" value="hg38.chr20.chunk.fa.gz" ftype="fasta.gz"/>
<param name="query" value="mm39.chr2.chunk.fa.gz" ftype="fasta.gz"/>
<output name="kegalign_output" ftype="tgz">
<assert_contents>
<has_archive_member path="galaxy/commands.json"/>
</assert_contents>
</output>
</test>
</tests>
<help><![CDATA[
KegAlign is a scalable, GPU-accelerated system for computing pairwise WGA. KegAlign is based on the standard seed-filter-extend heuristic, in which the filtering stage dominates the runtime (e.g. 98% for human-mouse WGA), and is accelerated using GPU(s). KegAlign was designed as a faster replacement for lastz pairwise aligner.
**Using this tool**
.. class:: warningmark
This tool is the first part of a two-step process for generation of paiwrise alignments. The output of this tool is used as an input to **Batched LASTZ** tool. The *Batched LASTZ** can be found in the list of tool of this Galaxy instance.
**What it does**
KegAlign processes **Target** and **Query** sequences to identify highly similar regions where gapped extension will be performed to create actual alignments. The actual alignments are generated by **Batched LASTZ** that should be run on the output of this tool.
.. class:: infomark
Although this tool is only the first part of the alignment process all parameters related to generation of alignments are set **during this stage**.
**Scoring Options**
By default the HOXD70 substitution scores are used (from `Chiaromonte et al. 2002 <https://www.ncbi.nlm.nih.gov/pubmed/11928468>`_)::
bad_score = X:-1000 # used for sub['X'][*] and sub[*]['X']
fill_score = -100 # used when sub[*][*] is not defined
gap_open_penalty = 400
gap_extend_penalty = 30
A C G T
A 91 -114 -31 -123
C -114 100 -125 -31
G -31 -125 100 -114
T -123 -31 -114 91
Matrix can be supplied as an input to **Read the substitution scores** parameter in *Scoring* section. Substitution matrix can be inferred from your data using another LASTZ-based tool (LASTZ_D: Infer substitution scores).
**Output Options**
.. class:: infomark
The final format in which alignmnets will be generated by **Batched LASTZ** are set here.
The default output is a MAF alignment file. Other formats can be configured in *Output Options* section. See LASTZ manual <https://lastz.github.io/lastz/#formats>`_ for description of possible formats.
]]></help>
<expand macro="citations"/>
</tool>
94 changes: 94 additions & 0 deletions tools/kegalign/lastz-cmd.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
[arguments]
flag_args =
allgappedbounds
anyornone
gapped
gfextend
markend
nocensus
noentropy
nofilter
nogapped
nogfextend
nomirror
norecoverseeds
notransition
notrivial
notwins
noytrim
recoverseeds
self
version
yasra75
yasra85
yasra85short
yasra90
yasra95
yasra95short
yasra98

str_args =
action:query
action:target
ambiguous
ball
chores
filter
format
gap
hspthresh
include
match
maxwordcount
mismatch
nochain
output
outputmasking
outputmasking+
outputmasking+:soft
outputmasking:soft
querydepth
queryhsplimit
rdotplot
readgroup
scores
seed
segments
show
strand
targetcapsule
twins
writecapsule
writesegments

int_args =
allocate:query
allocate:target
allocate:traceback
exact
gappedthresh
inner
masking
queryhspbest
step
word
xdrop
ydrop

bool_str_args=
census
census16
census32
chain
entropy
help
infer
inferonly
infscores
tableonly

bool_int_args=
progress
progress+masking
transition

20 changes: 20 additions & 0 deletions tools/kegalign/macros.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<macros>
<xml name="requirements">
<requirements>
<requirement type="package" version="@TOOL_VERSION@">kegalign-full</requirement>
</requirements>
</xml>
<token name="@TOOL_VERSION@">0.1.2.7</token>
<token name="@VERSION_SUFFIX@">1</token>
<token name="@PROFILE@">21.05</token>
<xml name="edam_ontology">
<edam_operations>
<edam_operation>operation_0491</edam_operation>
</edam_operations>
</xml>
<xml name="citations">
<citations>
<citation type="doi">10.1109/SC41405.2020.00043</citation>
</citations>
</xml>
</macros>
95 changes: 95 additions & 0 deletions tools/kegalign/output_options.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
<macros>
<xml name="output_options">
<section name="output_options" expanded="false" title="Output Options">
<conditional name="format">
<param name="format_selector" type="select" display="radio" label="Specify the output format">
<option value="bam">BAM --format=sam)</option>
<option value="general_def">General default (--format=general)</option>
<option value="general_full">Customized general (--format=general[:fields])</option>
<option value="maf" selected="true">MAF (--format=maf)</option>
<option value="blastn">blastn (--format=BLASTN)</option>
<option value="differences">Differences (--format=differences)</option>
</param>
<when value="bam">
<param name="bam_options" type="select" display="radio" argument="--format=sam, --format=softsam" label="Select a BAM flavor to output" help="Lastz actually outputs SAM data but Galaxy converts it into BAM to save space. For alignments that don't reach the end of a query, ‑‑format=sam uses 'hard clipping', while ‑‑format=softsam uses 'soft clipping'. See the section on 'clipped alignment' in the SAM specification for an explanation of what this means. The options ‑‑format=sam- and ‑‑format=softsam- suppress the SAM header lines. This makes them suitable for concatenating output from multiple runs. If you need to specify readgroup information: use AddOrEplaceReadGroups from Picard package">
<option value="sam">BAM</option>
<option value="softsam">soft-clipped BAM</option>
<option value="sam-">BAM without header</option>
<option value="softsam-">soft-clipped BAM without header</option>
</param>
</when>
<when value="general_def">
<!-- Do nothing -->
</when>
<when value="general_full">
<param name="fields" type="select" display="checkboxes" multiple="true" label="Select which fields to include" argument="--format=general-[:fields]">
<option value="score" selected="true">score: Score of the alignment block</option>
<option value="name1" selected="true">name1: Name of the target sequence</option>
<option value="number1">number1: Number of the target sequence within the target file</option>
<option value="strand1" selected="true">strand1: Target sequence strand </option>
<option value="size1" selected="true">size1: Size of the entire target sequence</option>
<option value="start1">start1: Starting position of the alignment block in the target, origin-one</option>
<option value="zstart1" selected="true">zstart1: Starting position of the alignment block in the target, origin-zero</option>
<option value="end1" selected="true">end1: Ending position of the alignment block in the target</option>
<option value="length1">length1: Length of the alignment block in the target (excluding gaps)</option>
<option value="text1">text1: Aligned characters in the target, including gap characters</option>
<option value="qalign1">qalign1: The target quality sequence (if there is one) correpsonding to aligned characters</option>
<option value="nucs1">nucs1: The entire target sequence</option>
<option value="name2" selected="true">name2: Name of the query sequence</option>
<option value="number2">number2: Number of the query sequence within the query file</option>
<option value="strand2" selected="true">strand2: Query sequence strand</option>
<option value="size2" selected="true">size2: Size of the entire query sequence</option>
<option value="start2">start2: Starting position of the alignment block in the query, origin-one</option>
<option value="zstart2" selected="true">zstart2: Starting position of the alignment block in the query, origin-one</option>
<option value="end2" selected="true">end2: Ending position of the alignment block in the query</option>
<option value="length2">length2: Length of the alignment block in the query (excluding gaps)</option>
<option value="text2">text2: Aligned characters in the query, including gap characters</option>
<option value="qalign2">qalign2: The query quality sequence (if there is one) correpsonding to aligned characters</option>
<option value="nucs2">nucs2: The entire query sequence</option>
<option value="nmatch">nmatch: Match count</option>
<option value="nmismatch">nmismatch: Mismatch count</option>
<option value="ncolumn">ncolumn: Number of columns in the block. This includes matches, mismatches (substitutions), and gaps</option>
<option value="npair">npair: Number of aligned bases in the block that are matches or mismatches (substitutions)</option>
<option value="ngap">ngap: Gap count, the number of gaps in the block, counting each run of gapped columns as a single gap</option>
<option value="cgap">cgap: Gap column count, the number of gaps in the block, counting each gapped column as a separate gap</option>
<option value="diff">diff: Differences between what would be written for text1 and text2</option>
<option value="cigar">cigar: A CIGAR-like representation of the alignment’s path</option>
<option value="cigarx">cigarx: Same as cigar, but uses a newer syntax that distinguishes matches from substitutions</option>
<option value="identity" selected="true">identity: Fraction of aligned bases in the block that are matches </option>
<option value="idfrac">idfrac: Fraction of aligned bases in the block that are matches </option>
<option value="id%" selected="true">id% Fraction of aligned bases in the block that are matches (as %)</option>
<option value="blastid%">blastid%: Fraction of the alignment block that is matches, as would be reported by NCBI BLAST</option>
<option value="continuity">continuity: Rate of non-gaps (non-indels) in the alignment block</option>
<option value="confrac">confrac: Rate of non-gaps (non-indels) in the alignment block (as fraction)</option>
<option value="con%">con%: Rate of non-gaps (non-indels) in the alignment block (as %)</option>
<option value="coverage" selected="true">coverage: Fraction of the entire input sequence (target or query, whichever is shorter) that is covered by the alignment block</option>
<option value="covfrac">covfrac: Fraction of the entire input sequence (target or query, whichever is shorter) that is covered by the alignment block (as fraction)</option>
<option value="cov%" selected="true">cov%: Fraction of the entire input sequence (target or query, whichever is shorter) that is covered by the alignment block (as %)</option>
<option value="diagonal">diagonal: The diagonal of the start of the alignment block in the DP matrix, expressed as an identifying number start1-start2</option>
<option value="shingle">shingle: A measurement of the shingle overlap between the target and the query</option>
<option value="number">number: The alignment number, counted as alignments are written to output (1-base)</option>
<option value="znumber">znumber: The alignment number, counted as alignments are written to output (0-base)</option>
<sanitizer invalid_char="">
<valid initial="string.letters,string.digits">
<add value="%"/>
</valid>
</sanitizer>
</param>
</when>
<when value="maf">
<param name="maf_type" type="select" display="radio" argument="--format=maf" label="Select MAF flavor" help="MAF is a multiple alignment format developed at UCSC">
<option value="maf">MAF</option>
<option value="maf+">MAF with additional stats</option>
<option value="maf-" selected="true">MAF without header and comments</option>
</param>
</when>
<when value="blastn">
<!-- Do nothing -->
</when>
<when value="differences">
<!-- Do nothing -->
</when>
</conditional>
</section>
</xml>
</macros>
337 changes: 337 additions & 0 deletions tools/kegalign/package_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,337 @@
#!/usr/bin/env python

import argparse
import configparser
import json
import os
import resource
import sys
import tarfile
import time
import typing

import bashlex

RUSAGE_ATTRS: typing.Final = ["ru_utime", "ru_stime", "ru_maxrss", "ru_minflt", "ru_majflt", "ru_inblock", "ru_oublock", "ru_nvcsw", "ru_nivcsw"]


class PackageFile:
def __init__(
self,
pathname: str = "data_package.tgz",
top_dir: str = "galaxy",
data_dir: str = "files",
config_file: str = "commands.json",
format_file: str = "format.txt"
) -> None:
self.pathname: str = os.path.realpath(pathname)
self.data_root: str = os.path.join(top_dir, data_dir)
self.config_path: str = os.path.join(top_dir, config_file)
self.config_file: str = config_file
self.format_path: str = os.path.join(top_dir, format_file)
self.format_file: str = format_file
self.tarfile: typing.Optional[tarfile.TarFile] = None
self.name_cache: typing.Dict[typing.Any, typing.Any] = {}
self.working_dir: str = os.path.realpath(os.getcwd())

def _initialize(self) -> None:
if self.tarfile is None:
self.tarfile = tarfile.open(
name=self.pathname,
mode="w:gz",
format=tarfile.GNU_FORMAT,
compresslevel=6,
)

def add_config(self, pathname: str) -> None:
if self.tarfile is None:
self._initialize()

source_path = os.path.realpath(pathname)

if self.tarfile is not None:
self.tarfile.add(source_path, arcname=self.config_path, recursive=False)

def add_file(self, pathname: str, arcname: typing.Optional[str] = None) -> None:
if self.tarfile is None:
self._initialize()

source_path = os.path.realpath(pathname)

dest_path = None

if arcname is None:
dest_path = os.path.join(self.data_root, os.path.basename(source_path))
else:
arc_path = os.path.realpath(arcname)
rel_path = os.path.relpath(arc_path, self.working_dir)
if not (os.path.isabs(rel_path) or rel_path.startswith("../")):
dest_path = os.path.join(self.data_root, rel_path)
else:
sys.exit("path fail")

if dest_path is not None:
if self.tarfile is not None:
if dest_path not in self.name_cache:
try:
self.tarfile.add(
source_path, arcname=dest_path, recursive=False
)
except FileNotFoundError:
sys.exit(f"missing source file {source_path}")

self.name_cache[dest_path] = 1
# print(f"added: {dest_path}", flush=True)

def add_format(self, pathname: str) -> None:
if self.tarfile is None:
self._initialize()

source_path = os.path.realpath(pathname)

if self.tarfile is not None:
self.tarfile.add(source_path, arcname=self.format_path, recursive=False)

def close(self) -> None:
if self.tarfile is not None:
self.tarfile.close()
self.tarfile = None


class bashCommandLineFile:
def __init__(
self,
pathname: str,
config: configparser.ConfigParser,
args: argparse.Namespace,
package_file: PackageFile,
) -> None:
self.pathname: str = pathname
self.config = config
self.args = args
self.package_file = package_file
self.executable: typing.Optional[str] = None
self._parse_lines()
self._write_format()

def _parse_lines(self) -> None:
with open("commands.json", "w") as ofh:
with open(self.pathname) as f:
line: str
for line in f:
line = line.rstrip("\n")
command_dict = self._parse_line(line)
# we may want to re-write args here
new_args_list = []

args_list = command_dict.get("args", [])
for arg in args_list:
if arg.startswith("--target="):
pathname = arg[9:]
new_args_list.append(arg)
if "[" in pathname:
elems = pathname.split("[")
sequence_file = elems.pop(0)
self.package_file.add_file(sequence_file, sequence_file)
for elem in elems:
if elem.endswith("]"):
elem = elem[:-1]
if elem.startswith("subset="):
subset_file = elem[7:]
self.package_file.add_file(subset_file)

elif arg.startswith("--query="):
pathname = arg[8:]
new_args_list.append(arg)
if "[" in pathname:
elems = pathname.split("[")
sequence_file = elems.pop(0)
self.package_file.add_file(sequence_file, sequence_file)
for elem in elems:
if elem.endswith("]"):
elem = elem[:-1]
if elem.startswith("subset="):
subset_file = elem[7:]
self.package_file.add_file(subset_file)
elif arg.startswith("--segments="):
pathname = arg[11:]
new_args_list.append(arg)
self.package_file.add_file(pathname)
elif arg.startswith("--scores="):
pathname = arg[9:]
new_args_list.append("--scores=data/scores.txt")
self.package_file.add_file(pathname, "data/scores.txt")
else:
new_args_list.append(arg)

command_dict["args"] = new_args_list
print(json.dumps(command_dict), file=ofh)

self.package_file.add_config("commands.json")

def _parse_line(self, line: str) -> typing.Dict[str, typing.Any]:
# resolve shell redirects
trees: typing.List[typing.Any] = bashlex.parse(line, strictmode=False)
positions: typing.List[typing.Tuple[int, int]] = []

for tree in trees:
visitor = nodevisitor(positions)
visitor.visit(tree)

# do replacements from the end so the indicies will be correct
positions.reverse()

processed = list(line)
for start, end in positions:
processed[start:end] = ""

processed_line: str = "".join(processed)

command_dict = self._parse_processed_line(processed_line)
command_dict["stdin"] = visitor.stdin
command_dict["stdout"] = visitor.stdout
command_dict["stderr"] = visitor.stderr

return command_dict

def _parse_processed_line(self, line: str) -> typing.Dict[str, typing.Any]:
argv: typing.List[str] = list(bashlex.split(line))
self.executable = argv.pop(0)

parser: argparse.ArgumentParser = argparse.ArgumentParser(add_help=False)
if "arguments" in self.config:
arguments_section = self.config["arguments"]

arg: str
if "flag_args" in arguments_section:
for arg in arguments_section["flag_args"].split():
parser.add_argument(f"--{arg}", action="store_true")

if "str_args" in arguments_section:
for arg in arguments_section["str_args"].split():
parser.add_argument(f"--{arg}", type=str)

if "bool_str_args" in arguments_section:
for arg in arguments_section["bool_str_args"].split():
parser.add_argument(
f"--{arg}", nargs="?", const=True, default=False
)

if "int_args" in arguments_section:
for arg in arguments_section["int_args"].split():
parser.add_argument(f"--{arg}", type=int)

if "bool_int_args" in arguments_section:
for arg in arguments_section["bool_int_args"].split():
parser.add_argument(
f"--{arg}", nargs="?", const=True, default=False
)

namespace, rest = parser.parse_known_intermixed_args(argv)
vars_dict = vars(namespace)

command_dict: typing.Dict[str, typing.Any] = {
"executable": self.executable,
"args": [],
}

for var in vars_dict.keys():
value = vars_dict[var]
if value is not None:
if isinstance(value, bool):
if value is True:
command_dict["args"].append(f"--{var}")
else:
command_dict["args"].append(f"--{var}={value}")

if len(rest) >= 0:
value = rest.pop(0)
command_dict["args"].append(f"--target={value}")

if len(rest) >= 0:
value = rest.pop(0)
command_dict["args"].append(f"--query={value}")

return command_dict

def _write_format(self) -> None:
if self.args.format_selector == "bam":
format_name = "bam"
elif self.args.format_selector == "maf":
format_name = "maf"
elif self.args.format_selector == "differences":
format_name = "interval"
else:
format_name = "tabular"

with open("format.txt", "w") as ofh:
print(f"{format_name}", file=ofh)

self.package_file.add_format("format.txt")


class nodevisitor(bashlex.ast.nodevisitor): # type: ignore[misc]
def __init__(self, positions: typing.List[typing.Tuple[int, int]]) -> None:
self.positions = positions
self.stdin = None
self.stdout = None
self.stderr = None

def visitredirect(
self,
n: bashlex.ast.node,
n_input: int,
n_type: str,
output: typing.Any,
heredoc: typing.Any,
) -> None:
if isinstance(n_input, int) and 0 <= n_input <= 2:
if isinstance(output, bashlex.ast.node) and output.kind == "word":
self.positions.append(n.pos)
if n_input == 0:
self.stdin = output.word
elif n_input == 1:
self.stdout = output.word
elif n_input == 2:
self.stderr = output.word
else:
sys.exit(f"oops 1: {type(n_input)}")
else:
sys.exit(f"oops 2: {type(n_input)}")

def visitheredoc(self, n: bashlex.ast.node, value: typing.Any) -> None:
pass


def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument("--tool_directory", type=str, required=True, help="tool directory")
parser.add_argument("--format_selector", type=str, required=True, help="format selector")
parser.add_argument("--debug", action="store_true", help="enable debug messages")
args = parser.parse_args()

if args.debug:
r_beg = resource.getrusage(resource.RUSAGE_SELF)
beg: int = time.monotonic_ns()

lastz_command_config_file: str = os.path.join(args.tool_directory, "lastz-cmd.ini")

config: configparser.ConfigParser = configparser.ConfigParser()
config.read(lastz_command_config_file)

package_file = PackageFile()
lastz_command_file = "lastz-commands.txt"
bashCommandLineFile(lastz_command_file, config, args, package_file)
package_file.close()

if args.debug:
ns: int = time.monotonic_ns() - beg
r_end = resource.getrusage(resource.RUSAGE_SELF)
print(f"package output clock time: {ns} ns", file=sys.stderr, flush=True)
for rusage_attr in RUSAGE_ATTRS:
value = getattr(r_end, rusage_attr) - getattr(r_beg, rusage_attr)
print(f" package output {rusage_attr}: {value}", file=sys.stderr, flush=True)


if __name__ == "__main__":
main()
494 changes: 494 additions & 0 deletions tools/kegalign/runner.py

Large diffs are not rendered by default.

25 changes: 25 additions & 0 deletions tools/kegalign/scoring_options.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
<macros>
<xml name="scoring_options">
<section name="scoring_options" expanded="false" title="Scoring Options">
<param argument="--scoring" type="data" value="false" optional="true" format="txt" label="Scoring file in LASTZ format"/>
<param name="ambiguous_selector" type="select" label="Ambiguous Nucleotides">
<option value="x" selected="true">None</option>
<option value="n">N</option>
<option value="iupac">IUPAC</option>
</param>
<conditional name="ambiguous_params">
<param name="set_ambiguous_params_selector" type="select" label="Set Ambiguous Nucleotide Scoring Parameters">
<option value="false" selected="true">No</option>
<option value="true">Yes</option>
</param>
<when value="false">
<!-- Do nothing -->
</when>
<when value="true">
<param name="ambiguous_reward" type="integer" value="0" label="Ambiguous Nucleotide Reward"/>
<param name="ambiguous_penalty" type="integer" value="0" label="Ambiguous Nucleotide Penalty"/>
</when>
</conditional>
</section>
</xml>
</macros>
34 changes: 34 additions & 0 deletions tools/kegalign/seeding_options.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<macros>
<xml name="seeding_options">
<section name="seeding_options" expanded="false" title="Seeding Options">
<conditional name="seed">
<param name="seed_selector" type="select" label="Seed patern">
<option value="12of19" selected="true">12of19 (1110100110010101111)</option>
<option value="14of22">14of22 (1110101100110010101111)</option>
<option value="custom">custom</option>
</param>
<when value="12of19">
<!-- Do nothing -->
</when>
<when value="14of22">
<!-- Do nothing -->
</when>
<when value="custom">
<param name="custom_seed" type="text" value="" optional="false" label="Custom seed pattern">
<validator type="empty_field"/>
<validator type="regex" message="Arbitrary pattern of 1s, 0s, and Ts">^[01T]+$</validator>
<sanitizer invalid_char="">
<valid initial="none">
<add value="0"/>
<add value="1"/>
<add value="T"/>
</valid>
</sanitizer>
</param>
</when>
</conditional>
<param argument="--step" type="integer" value="1" label="Offset between the starting positions of successive target words considered for generating seed table"/>
<param argument="--notransition" type="boolean" checked="false" label="Don't allow one transition in a seed hit"/>
</section>
</xml>
</macros>
12 changes: 12 additions & 0 deletions tools/kegalign/sequence_options.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
<macros>
<xml name="sequence_options">
<section name="sequence_options" expanded="false" title="Sequence Options">
<param name="strand_selector" type="select" label="Strand to search">
<option value="plus">plus</option>
<option value="minus">minus</option>
<option value="both" selected="true">both</option>
</param>
<yield/>
</section>
</xml>
</macros>
11 changes: 11 additions & 0 deletions tools/kegalign/system_options.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<macros>
<xml name="system_options">
<section name="system_options" title="System Options">
<param argument="--wga_chunk_size" type="integer" value="250000" label="Chunk sizes for GPU calls for Xdrop - ⚠ change only if you are a developer"/>
<param argument="--lastz_interval_size" type="integer" value="10000000" label="LASTZ interval for Ydrop - ⚠ change only if you are a developer"/>
<param argument="--seq_block_size" type="integer" value="400000000" label="LASTZ interval for Ydrop - ⚠ change only if you are a developer"/>
<param argument="--num_gpu" type="integer" value="-1" label="Specify number of GPUs to use - -1 if all the GPUs should be used"/>
<param argument="--debug" type="boolean" value="false" label="Print debug messages"/>
</section>
</xml>
</macros>
Binary file added tools/kegalign/test-data/hg38.chr20.chunk.fa.gz
Binary file not shown.
Binary file added tools/kegalign/test-data/mm39.chr2.chunk.fa.gz
Binary file not shown.
9 changes: 9 additions & 0 deletions tools/kegalign/ungapped_extension_options.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<macros>
<xml name="ungapped_extension_options">
<section name="ungapped_extension_options" expanded="false" title="Ungapped Extension Options">
<param argument="--xdrop" type="integer" value="910" label="X-drop value for gap-free extension"/>
<param argument="--hspthresh" type="integer" value="3000" label="Segment score threshold for high scoring pairs"/>
<param argument="--noentropy" type="boolean" checked="false" label="Don't adjust low score segment pair scores using entropy factor after filtering stage"/>
</section>
</xml>
</macros>

0 comments on commit f032b04

Please sign in to comment.