Skip to content

Commit

Permalink
the start of version 2: using binary alignment for a pseudophylogenet…
Browse files Browse the repository at this point in the history
…ic method
  • Loading branch information
lskatz committed Jan 8, 2021
1 parent b966948 commit c77b6cf
Show file tree
Hide file tree
Showing 3 changed files with 127 additions and 12 deletions.
3 changes: 2 additions & 1 deletion Makefile.PL
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,11 @@ WriteMakefile1(
'DBD::SQLite' => 0,
'Bio::SeqIO' => 0,
'Bio::TreeIO' => 0,
'Bio::AlignIO' => 0,
'Bio::Matrix::IO'=> 0,
'Bio::Tree::Statistics'=> 0,
'Bio::Tree::DistanceFactory'=> 0,
'Bio::Sketch::Mash' => 0, # ensures that Mash gets loaded
"Bio::Sketch::Mash" => 0.23,
#'Bio::Kmer' => 0.19,
#'Graph::Dijkstra'=> 0,
#'Readonly' => 0,
Expand Down
22 changes: 13 additions & 9 deletions bin/mashtree
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ use FindBin;
use lib "$FindBin::RealBin/../lib";
use lib "$FindBin::RealBin/../lib/perl5";
use File::Which qw/which/;
use Mashtree qw/logmsg @fastqExt @fastaExt @mshExt @richseqExt _truncateFilename createTreeFromPhylip $MASHTREE_VERSION/;
use Mashtree qw/logmsg @fastqExt @fastaExt @mshExt @richseqExt _truncateFilename sketchesToAlignment createTreeFromBinaryAlignment $MASHTREE_VERSION/;
use Mashtree::Db;
use Bio::Tree::DistanceFactory;
use Bio::Matrix::IO;
Expand Down Expand Up @@ -74,11 +74,13 @@ sub main{
die "need more arguments\n".usage() if(@reads < 1);

# Check for prereq executables.
for my $exe(qw(mash quicktree)){
for my $exe(qw(mash)){
if(!which($exe)){
die "ERROR: could not find $exe in your PATH";
}
}
which("raxmlHPC") || which("raxmlHPC-PTHREADS")
or die "ERROR: need raxml in your PATH";

# Check mash version
my $mashVersion = `mash --version`;
Expand Down Expand Up @@ -143,18 +145,20 @@ sub main{

my $sketches=sketchAll(\@reads,"$$settings{tempdir}",$settings);

my $phylip = mashDistance($sketches,\@reads,$$settings{tempdir},$settings);
my $aln = sketchesToAlignment($sketches,"$$settings{tempdir}",$settings);

my $treeObj = createTreeFromPhylip($phylip,$$settings{tempdir},$settings);
my $tree = createTreeFromBinaryAlignment($aln,$$settings{tempdir}, $settings);

# Write the tree
if($$settings{outtree}){
open(my $treeFh, ">", $$settings{outtree}) or die "ERROR writing tree to $$settings{outtree}: $!";
print $treeFh $treeObj->as_text('newick');
close $treeFh
cp($tree, $$settings{outtree}) or die "ERROR: could not copy $tree => $$settings{outtree}: $!";
} else {
print $treeObj->as_text('newick');
print "\n";
open(my $treeFh, $tree) or die "ERROR: could not read from $tree: $!";
while(<$treeFh>){
print;
}
close $treeFh;
print "\n"; # ensure a newline after the tree
}

return 0;
Expand Down
114 changes: 112 additions & 2 deletions lib/Mashtree.pm
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use strict;
use warnings;
use Exporter qw(import);
use File::Basename qw/fileparse basename dirname/;
use File::Which qw/which/;
use Data::Dumper;
use List::Util qw/shuffle/;
use Scalar::Util qw/looks_like_number/;
Expand All @@ -14,9 +15,11 @@ use threads::shared;
use lib dirname($INC{"Mashtree.pm"});
use Bio::Matrix::IO;
use Bio::TreeIO;
use Bio::Sketch::Mash;
use Bio::AlignIO;

our @EXPORT_OK = qw(
logmsg openFastq _truncateFilename distancesToPhylip createTreeFromPhylip sortNames treeDist mashDist mashHashes raw_mash_distance raw_mash_distance_unequal_sizes
logmsg openFastq _truncateFilename distancesToPhylip createTreeFromPhylip sortNames treeDist mashDist mashHashes raw_mash_distance raw_mash_distance_unequal_sizes sketchesToAlignment createTreeFromBinaryAlignment
@fastqExt @fastaExt @bamExt @vcfExt @richseqExt @mshExt
$MASHTREE_VERSION
);
Expand All @@ -26,7 +29,7 @@ local $0=basename $0;
######
# CONSTANTS

our $VERSION = "1.1.2";
our $VERSION = "2.0";
our $MASHTREE_VERSION=$VERSION;
our @fastqExt=qw(.fastq.gz .fastq .fq .fq.gz);
our @fastaExt=qw(.fasta .fna .faa .mfa .fas .fsa .fa);
Expand Down Expand Up @@ -101,6 +104,68 @@ sub _truncateFilename{
return $name;
}

# Read sketches and create an alignment in phylip format:
# 1. Make a presence/absence perl hash of min-hashes
# 2. Create a "sequence" for each sketch
# 3. Create the MSA of these sequences
# This function uses $settings with keys:
# presence: the nucleotide that stands for presence. Default: "1"
# absence: the nucleotide that stands for absence. Default: "0"
# format: the format of the output alignment. Default: "phylip"
sub sketchesToAlignment{
my($sketches, $outdir, $settings) = @_;

my $presence = $$settings{presence} || 1;
my $absence = $$settings{absence} || 0;
my $format = $$settings{format} || "phylip";

my $outfile = "$outdir/aln.$format";

## Presence/absence of hashes
my %p; # presence/absence
for my $file(@$sketches){
my $msh = Bio::Sketch::Mash->new($file);
my $sketches=$$msh{sketches}[0]{hashes};
for my $s(@$sketches){
$p{$s}{$file}=1;
}
}

## Create pseudo-sequences
my %sampleSeq;
# Sort to help keep output stable
my @hashInt = sort{$a<=>$b} keys(%p);
for my $h(@hashInt){
for my $file(@$sketches){
# If the hash is present, then give the "present" nucleotide
if($p{$h}{$file}){
$sampleSeq{$file} .= $presence;
}
# If the hash is not present, then give the "absent" nucleotide
else{
$sampleSeq{$file} .= $absence;
}
}
}

## Make alignment to string
my $alnStr = "";
for my $file(@$sketches){
my $name = _truncateFilename($file, $settings);
$alnStr .= ">$name\n";
$alnStr .= $sampleSeq{$file}."\n";
}

## Convert alignment to phylip
my $alnin = Bio::AlignIO->new(-string=>$alnStr, -format=>"fasta");
my $alnout= Bio::AlignIO->new(-file=>">".$outfile,-format=>$format);
while(my $aln = $alnin->next_aln){
$alnout->write_aln($aln);
}

return $outfile;
}


# 1. Read the mash distances
# 2. Create a phylip file
Expand Down Expand Up @@ -187,6 +252,51 @@ sub sortNames{
return @sorted;
}

# Create a tree from a binary alignment
sub createTreeFromBinaryAlignment{
my($aln, $outdir, $settings) = @_;

my $outfile = "$outdir/tree.dnd";

my $raxml = which("raxmlHPC") || which("raxmlHPC-PTHREADS");
if(!$raxml){
die "ERROR: could not find raxml in your path.";
}

# Avoid raxml crashing because original files were there
for my $filename (glob("$outdir/RAxML*.raxml")){
unlink $filename;
}

# Run raxml from in the output directory
# -f a bootstrapping algorithm
# -s the alignment
# -n suffix for output files is raxml
# -T number of threads
# -p and -x are seeds
# -N 10 only ten bootstraps for fast analysis
# -m BINGAMMA binary gamma model
my $raxmllog = "$outdir/raxml.log";
logmsg "RAxML log will be in $raxmllog";
system("cd $outdir && $raxml -f a -s aln.phylip -n raxml -T $$settings{numcpus} -p $$settings{seed} -x $$settings{seed} -N 10 -m BINGAMMA > ".basename($raxmllog)." 2>&1");
if($?){
my $raxmlErr = $!;
open(my $logfh, $raxmllog) or logmsg "ERROR opening log file $raxmllog: $!";
while(<$logfh>){
print;
}
close $logfh;

die "ERROR running raxml: $raxmlErr" if $?;
}

# Since we're not too interested in bootstrapping here,
# just return the best tree.
link("$outdir/RAxML_bestTree.raxml", $outfile);

return $outfile;
}

# Create tree file with Quicktree but bioperl
# as a backup.
sub createTreeFromPhylip{
Expand Down

0 comments on commit c77b6cf

Please sign in to comment.