smeg

#!/bin/bash

package=""  # Default to empty package
SMEG_DIR="$( cd "$( dirname "$(readlink -f ${BASH_SOURCE[0]})" )" && pwd )"
WDR=$(pwd)

while getopts ":hv" opt; do
  case ${opt} in
    h )
      echo "Usage:"
      echo "    smeg build_species <options>   Build species database"
      echo "    smeg build_rep <options>       Build dataset-specific representative strains database"
      echo "    smeg growth_est <options>      Estimate strain-specific growth rate"
      echo "    smeg -v                        Version"
      echo "    smeg -h                        Display this help message"
      exit 0
      ;;
    v )
      echo "smeg v1.0.1" 
      exit 0
      ;;
   \? )
     echo "Invalid Option: -$OPTARG" 1>&2
     exit 1
     ;;
  esac
done
###########################################################################
if [ $# -eq 0 ];
then
      echo "Usage:"
      echo "    smeg build_species <options>   Build species database"
      echo "    smeg build_rep <options>       Build dataset-specific representative strains database"
      echo "    smeg growth_est <options>      Estimate strain-specific growth rate"
      echo "    smeg -v                        Version"
      echo "    smeg -h                        Display this help message"
    exit 1
fi
##########################################################################

shift $((OPTIND -1))

subcommand=$1;
case "$subcommand" in
  # Parse options to the first sub command
  build_species)
    package=$1; shift  # Remove 'build_species' from the argument list
    LIST=false
    NUM_THREAD=1
    INT='^[0-9]+$'
    # Process package options
    while getopts ":g:o:l:p:h" opt; do
      case ${opt} in
        g )
          GEN_DIR=$OPTARG
          ;;
        o )
          OUTPUT_DIR=$OPTARG
          ;;
        l )
          LIST=$OPTARG
          ;;
	p )
          NUM_THREAD=$OPTARG
          ;;
	h )
      echo "Usage:"
      echo "    smeg build_species <options>"
      echo "    <options>"
      echo "    -g      Genomes directory"
      echo "    -o      Output directory"
      echo "    -l      Path to file listing a subset of genomes for"
      echo "            database building [default = use all genomes in 'Genomes directory']"
      echo "    -p INT  Number of threads [default 1]"
      echo "    -h      Display this message"
      exit 0
      ;;
        \? )
          echo "Invalid Option: -$OPTARG" 1>&2
          exit 1
          ;;
        : )
          echo "Invalid Option: -$OPTARG requires an argument" 1>&2
          exit 1
          ;;
       *  ) 
          echo "Unimplemented option: -$OPTARG" >&2
          exit 1
          ;;
esac
done

if [ $# -eq 0 ];
then
      echo "Usage:"
      echo "    smeg build_species <options>"
      echo "    <options>"
      echo "    -g      Genomes directory"
      echo "    -o      Output directory"
      echo "    -l      Path to file listing a subset of genomes for"
      echo "            database building [default = use all genomes in 'Genomes directory']"
      echo "    -p INT  Number of threads [default 1]"
      echo "    -h      Display this message"
      exit 1
fi

if [ "x" == "x$GEN_DIR" ]; then
  echo "-g [option] is required"
  exit 1
fi
if [ "x" == "x$OUTPUT_DIR" ]; then
 directory=$(echo "species_database_$(date +%s)") 
 mkdir $WDR/$directory
 OUTPUT_DIR=$WDR/$directory 
fi
if ! [[ $NUM_THREAD =~ $INT ]] && [[ $NUM_THREAD != 1 ]] ; then
    echo "-p [option] requires an integer"
    exit 1
fi
if [ "$LIST" != "false" ]; then
LIS=$(readlink -f $LIST)
fi

shift $((OPTIND -1))
    ;;
###############
 build_rep)
    package=$1; shift  # Remove 'build_rep' from the argument list
   LIST=false
   CUSTOM=false
   NUM_THREAD=1
   DEEPSPLIT=4
   INT='^[0-9]+$'
    # Process package options
    while getopts ":r:o:s:p:d:c:l:h" opt; do
      case ${opt} in
        r )
          READS_DIR=$OPTARG
          ;;
        o )
          OUTPUT_DIR=$OPTARG
          ;;
        s )
          SPECIES_DIR=$OPTARG
          ;;
	p )
          NUM_THREAD=$OPTARG
          ;;
        d )
          DEEPSPLIT=$OPTARG
          ;;
        l )
          LIST=$OPTARG
          ;;
	c )
          CUSTOM=$OPTARG
	  ;;
        h )
      echo "Usage:"
      echo "    smeg build_rep <options>"
      echo "    <options>"
      echo "    -r         Reads directory (paired-end reads)"
      echo "    -o         Output directory"
      echo "    -s         Species database directory"
      echo "    -d  INT    deepSplit for grouping strains into clusters (0 or 4) [default = 4]"
      echo "    -l         Path to file listing a subset of reads to estimate representative"
      echo "               strains [default = use all samples in Reads directory]"
      echo "    -c         Path to file with customized list of strains for representative database (ONLY use this option IF"
      echo "		   you are certain your chosen strains belong to different clusters and are present in your dataset)"
      echo "    -p  INT    Number of threads [default 1]"
      echo "    -h         Display this message"
      exit 0
      ;;
        \? )
          echo "Invalid Option: -$OPTARG" 1>&2
          exit 1
          ;;
        : )
          echo "Invalid Option: -$OPTARG requires an argument" 1>&2
          exit 1
          ;;
      esac
    done

if [ $# -eq 0 ];
then
      echo "Usage:"
      echo "    smeg build_rep <options>"
      echo "    <options>"
      echo "    -r         Reads directory (paired-end reads)"
      echo "    -o         Output directory"
      echo "    -s         Species database directory"
      echo "    -d  INT    deepSplit for grouping strains into clusters (0 or 4) [default = 4]"
      echo "    -l         Path to file listing a subset of reads to estimate representative"
      echo "               strains [default = use all samples in Reads directory]"
      echo "    -c         Path to file with customized list of strains for representative database (ONLY use this option IF"
      echo "               you are certain your chosen strains belong to different clusters and are present in your dataset)"
      echo "    -p  INT    Number of threads [default 1]"
      echo "    -h         Display this message"
      exit 1
fi
if [ "x" == "x$READS_DIR" ] && [ "$CUSTOM" == false ]; then
  echo "-r [option] is required"
  exit 1
fi
if [ "x" == "x$OUTPUT_DIR" ]; then
  directory=$(echo "representative_database_$(date +%s)")
 mkdir $WDR/$directory
 OUTPUT_DIR=$WDR/$directory
fi
if [ "x" == "x$SPECIES_DIR" ]; then
  echo "-s [option] is required"
  exit 1
fi
if  [[ $DEEPSPLIT != 4 ]] && [[ $DEEPSPLIT != 0 ]]; then
    echo "-d [option] requires an integer value of 0 or 4"
    exit 1
fi
if ! [[ $NUM_THREAD =~ $INT ]] && [[ $NUM_THREAD != 1 ]] ; then
    echo "-p [option] requires an integer"
    exit 1
fi
if [ "$LIST" != "false" ]; then
LIS=$(readlink -f $LIST)
fi

if [ "$CUSTOM" != "false" ]; then
CUS=$(readlink -f $CUSTOM)
fi
  shift $((OPTIND -1))
    ;;
###############
 growth_est)
    package=$1; shift  # Remove 'growth_est' from the argument list
   METHOD=0
   LIST=false
   COV_CUTOFF=1
   NUM_THREAD=1
   THETA=0.06
   INT='^[0-9]+$'
   FLOAT='^[0-9]+([.][0-9]+)?$'
   MERGE=false
    # Process package options
    while getopts ":r:o:s:d:l:m:t:c:p:eh" opt; do
      case ${opt} in
        r )
          READS_DIR=$OPTARG
          ;;
        o )
          OUTPUT_DIR=$OPTARG
          ;;
        s )
          SPECIES_DIR=$OPTARG
          ;;
	d )
          REP_DIR=$OPTARG
          ;;
        c )
          COV_CUTOFF=$OPTARG
          ;;
	p )
          NUM_THREAD=$OPTARG
          ;;
        m )
          METHOD=$OPTARG
          ;;
        t )
          THETA=$OPTARG
          ;;
        e )
          MERGE=true
          ;;
        l )
          LIST=$OPTARG
          ;;
        h )
      echo "Usage:"
      echo "    smeg growth_est <options>"
      echo "    <options>"
      echo "    -r         Reads directory (paired-end reads)"
      echo "    -o         Output directory"
      echo "    -s         Species database directory"
      echo "    -d         Representative strains database directory"
      echo "    -c  FLOAT  Coverage cutoff (>= 1) [default 1]"
      echo "    -m  INT    SMEG method (0 = SNP-method, 1 = gene-dosage method) [default = 0]"
      echo "    -t  FLOAT  Theta value for bin size (bin size = no of unique SNPs x theta)" 
      echo "               Not compatible with gene-dosage method (i.e. -m 1)  [default 0.06]"
      echo "    -l         Path to file listing a subset of reads for analysis"
      echo "               [default = analyze all samples in Reads directory]"
      echo "    -e         merge output tables into a single matrix file and generate heatmap"
      echo "    -p  INT    Number of threads [default 1]"
      echo "    -h         Display this message"
      exit 0
      ;;
        \? )
          echo "Invalid Option: -$OPTARG" 1>&2
          exit 1
          ;;
        : )
          echo "Invalid Option: -$OPTARG requires an argument" 1>&2
          exit 1
          ;;
      esac
    done

if [ $# -eq 0 ];
then
 echo "Usage:"
      echo "    smeg growth_est <options>"
      echo "    <options>"
      echo "    -r         Reads directory (paired-end reads)"
      echo "    -o         Output directory"
      echo "    -s         Species database directory"
      echo "    -d         Representative strains database directory"
      echo "    -c  FLOAT  Coverage cutoff (>= 1) [default 1]"
      echo "    -m  INT    SMEG method (0 = SNP-method, 1 = gene-dosage method) [default = 0]"
      echo "    -t  FLOAT  Theta value for bin size (bin size = no of unique SNPs x theta)"
      echo "               Not compatible with gene-dosage method (i.e. -m 1)  [default 0.06]"
      echo "    -l         Path to file listing a subset of reads for analysis"
      echo "               [default = analyze all samples in Reads directory]"
      echo "    -e         merge output tables into a single matrix file and generate heatmap"
      echo "    -p  INT    Number of threads [default 1]"
      echo "    -h         Display this message"
      exit 1
fi

if [ "x" == "x$READS_DIR" ]; then
  echo "-r [option] is required"
  exit 1
fi
if [ "x" == "x$OUTPUT_DIR" ]; then
  directory=$(echo "SMEG_growth_$(date +%s)")
 mkdir $WDR/$directory
 OUTPUT_DIR=$WDR/$directory
fi
if [ "x" == "x$SPECIES_DIR" ]; then
  echo "-s [option] is required"
  exit 1
fi
if [ "x" == "x$REP_DIR" ]; then
  echo "-d [option] is required"
  exit 1
fi
if ! [[ $COV_CUTOFF =~ $FLOAT ]] && [[ $COV_CUTOFF != 1 ]]; then
    echo "-c [option] requires a numeric value"
    exit 1
fi
if (( $(bc <<< "$COV_CUTOFF < 1") )); then
    echo "required minimum coverage cutoff is 1"
    exit 1
fi
if  [[ $METHOD != 1 ]] && [[ $METHOD != 0 ]]; then
    echo "-m [option] requires an integer value of 0 or 1"
    echo "SNP-method = 0, gene-dosage method = 1"
    exit 1
fi
if ! [[ $THETA =~ $FLOAT ]] && [[ $THETA != 0.06 ]]; then
    echo "-t [option] requires a numeric value"
    exit 1
fi
if [[ $THETA != 0.06 ]] && [ $METHOD == 1 ]; then
    echo "-t [option] is incompatible with gene dosage method"
    exit 1
fi
if [[ $THETA == 0 ]]; then
    echo "-t [option] requires a non-zero value"
    exit 1
fi
if ! [[ $NUM_THREAD =~ $INT ]] && [[ $NUM_THREAD != 1 ]] ; then
    echo "-p [option] requires an integer"
    exit 1
fi
if [ "$LIST" != "false" ]; then
LIS=$(readlink -f $LIST)
fi

  shift $((OPTIND -1))
    ;;
esac
###################################
if [ "$package" != "build_species" ] && [ "$package" != "build_rep" ] && [ "$package" != "growth_est" ]; then
  echo "Invalid option"
  exit 1
fi
#############################################
echo " ################ Checking for dependencies ########"
requirements=$(echo "gcc usearch OAU.jar parallel R Mauve roary prokka bowtie2 pathoscope samtools bamtools featureCounts")
for f in `echo $requirements` 
do 
toolCheck=$(type -P $f)
if [ -z $toolCheck ]; then 
echo "ERROR: $f missing" 
echo "Check https://github.com/ohlab/SMEG for required packages"
exit 1
else
echo "$f found"
fi 
done
echo "All required packages found"
echo " ################ Checking for required R libraries ########"
cd $SMEG_DIR
Rscript check_R_libraries.R
if [ -s .checkedR ]; then
echo "ERROR: The following R libraries are missing"
cat .checkedR
echo "Check https://github.com/ohlab/SMEG for required packages"
rm .checkedR
exit 1
else
echo "R libraries ok"
rm .checkedR
fi
##############################################


#############################################
############################################
# build species database
############################################
############################################
if [ "$package" == "build_species" ]
  then
cd $WDR
GDR=$(readlink -f $GEN_DIR)
ODR=$(readlink -f $OUTPUT_DIR)
echo "$package option activated"
echo "$WDR is present directory" 
echo "$GDR is the genomes directory"
echo "$ODR is the output directory"

mkdir -p $ODR

if [ -z "$(ls -A $ODR)" ]; then
   echo "Output directory ok"
else
   echo "Output directory not Empty"
   exit 1
fi

cd $GDR
exec 3>&2
exec 2> /dev/null
if [ "$LIST" == "false" ]; then
ls {*.fna,*.fa,*.fasta} > $ODR/genomes.txt #suppress error to std output
else
cat $LIS > $ODR/genomes.txt
fi
exec 2>&3

####################
exec 3>&2
exec 2> /dev/null
for f in `cat $ODR/genomes.txt`
do
var=$(grep "plasmid" $f | cut -f1 -d' ')
awk '{if (substr($0,1) == "'$var'") censor=1; else if (substr($0,1,1) == ">") censor=0; if (censor==0) print $0}' $f > $ODR/$f.noplasmid

cd $ODR
rename $f.noplasmid $f $f.noplasmid
num_of_contigs=$(grep -c ">" $GDR/$f)
echo -e "$f\t$num_of_contigs" >> num_of_contigs.txt
cd $GDR
done
exec 2>&3
###################

cd $ODR
awk '$2 == 1' num_of_contigs.txt | cut -f1 > reference_genomes.txt
grep -w -v -f reference_genomes.txt num_of_contigs.txt | cut -f1 >  draft_genomes.txt

rm num_of_contigs.txt
rm genomes.txt
mkdir reordered_contigs
#################################################

if [ -s draft_genomes.txt ]; then
path=$(type -P usearch)
oauPath=$(type -P OAU.jar)
for i in `cat reference_genomes.txt`
do
for f in `cat draft_genomes.txt`
do
java -jar $oauPath -f1 $i -f2 $f -n $NUM_THREAD -u $path >> stats.txt 2>&1
done
done

awk '/orthoANI_value/{getline; print}' stats.txt > temp1.txt
awk '/aligned_files/{getline; print}' stats.txt | cut -f2 | sed 's/|/	/g'  > temp2.txt

paste -d'\t' temp1.txt temp2.txt > temp3.txt 
awk '{ print $0,"\011", 1-(($4+$5)/2) }' temp3.txt > ani_output_distance.txt
for f in `cat draft_genomes.txt` 
do 
grep -w "$f" ani_output_distance.txt | sort -k10,10 | head -1 >> input_data_contig_reordering.txt
done
rm temp*
rm ani_output_distance.txt
rm stats.txt
############## Reorder contigs using Mauve ###########################

cut -f9 input_data_contig_reordering.txt | sed 's/ //g' > temp.txt

for i in `cat temp.txt`
do
mkdir $i.dir
ref=$(grep -w "$i" input_data_contig_reordering.txt | cut -f8 )
prefix=$(type -P Mauve |  rev | cut -d'/' -f3- | rev)
mauvepath=$(find $prefix -name "Mauve.jar")
echo "java -cp $mauvepath -Djava.awt.headless=true org.gel.mauve.contigs.ContigOrderer -output $ODR/$i.dir -ref $ODR/$ref -draft $ODR/$i" >> parallel_commands
done
cat parallel_commands | parallel -X --load 80% --noswap '{}'

rm parallel_commands

for i in `cat temp.txt`
do
cd $i.dir
aa=$(ls alignment*/$i.fas | sort | tail -1)
grep -v ">" $aa | tr -d '[:space:]' | fold -w 60 | sed "1 i\>$i" | sed -e '$a\ ' > $ODR/reordered_contigs/$i.fna
cd ../
done

else
echo "Only complete genomes identified"
fi

for f in `cat reference_genomes.txt`
do
cd $GDR
grep -v ">" $f | tr -d '[:space:]' | fold -w 60 | sed "1 i\>$f" | sed -e '$a\ ' > $ODR/reordered_contigs/$f.fna
cd $ODR
done

exec 3>&2
exec 2> /dev/null
rm reference_genomes.txt
rm draft_genomes.txt
rm -rf *.dir
rm temp.txt
rm input_data_contig_reordering.txt
rm {*.fna,*.fa,*.fasta}
exec 2>&3
########### Run prokka ################
cd $ODR/reordered_contigs
ls *.fna > ../genomes.txt
cd $ODR
mkdir gff
for f in `cat genomes.txt`
do
aa=$( echo "$f" | rev | cut -d'.' -f2- | rev)
echo "prokka --kingdom Bacteria --outdir $aa.dir --locustag $aa --prefix $aa $ODR/reordered_contigs/$f" >> parallel_commands
done
cat parallel_commands | parallel -X --load 80% --noswap '{}'

for f in *.dir
do
cp $f/*.gff gff/.
rm -rf $f
done

rm parallel_commands
rm genomes.txt

############ Run Roary ####################
cd $ODR
roary -p $NUM_THREAD -f ./Roary -e -n -s -v $ODR/gff/*.gff

################ Generate phylogenetic tree #########

FastTree -nt -gtr $ODR/Roary/core_gene_alignment.aln > tree.newick

Rscript $SMEG_DIR/create_clusters.R -i tree.newick

mkdir Index
cd $ODR/reordered_contigs

cat *.fna > $ODR/Index/strains.fa
cd $ODR/Index
bowtie2-build strains.fa strains.fa -q --threads $NUM_THREAD
rm strains.fa

#########################################
# build representative strains database
########################################
else
if [ "$package" == "build_rep" ]; then
cd $WDR
RDR=$(readlink -f $READS_DIR)
ODR=$(readlink -f $OUTPUT_DIR)
DBR=$(readlink -f $SPECIES_DIR)
echo "$package option activated"
echo "$WDR is present directory"
echo "$RDR is the reads directory"
echo "$ODR is the output directory"
echo "$DBR is the species database directory"

mkdir -p $ODR

if [ -z "$(ls -A $ODR)" ]; then
   echo "Output directory ok"
else
   echo "Output directory not Empty"
   exit 1
fi

#######
if [ "$CUSTOM" == "false" ]; then

cd $RDR
if [ "$LIST" == "false" ]; then
ls *.fastq | rev | cut -d'_' -f2- | rev | sort | uniq > $ODR/reads.txt
else
cat $LIS | rev | cut -d'_' -f2- | rev | sort | uniq > $ODR/reads.txt
fi

for f in `cat $ODR/reads.txt`
do
echo "bowtie2 -x $DBR/Index/strains.fa -1 $RDR/$f\_1.fastq -2 $RDR/$f\_2.fastq --very-sensitive -S $ODR/$f.sam -p $NUM_THREAD --score-min L,-0.6,0.006" >> $ODR/parallel_commands
done

cd $ODR
cat parallel_commands | parallel -X --load 80% --noswap '{}'
rm parallel_commands

for f in *.sam
do
samtools view -h -S -F 4 $f > $f.mapped.sam
rm $f
echo "pathoscope ID -alignFile $f.mapped.sam -fileType sam -outDir . -expTag $f.out -thetaPrior 10000000000" >> parallel_commands
done
cat parallel_commands | parallel -X --load 80% --noswap '{}'
rm *.sam

########## get representative clusters #################

num_of_samples=$(ls *.tsv | wc -l)

cat *.tsv | grep -v "Read" | cut -f1 | sort | uniq > passed_genomes.txt

for f in `cat passed_genomes.txt`
do
num_of_sam_gen=$(grep -w "$f" *.tsv | grep -v "Read" | cut -f2 | sort -n | wc -l)
add=$(echo "scale=3; ($num_of_samples - $num_of_sam_gen)" |bc)

yes "0" | head -n $add > add
grep -w "$f" *.tsv | grep -v "Read" | cut -f2 | sort -n > add2
cat add add2 > add3

var=$(cat add3 | sort -n | awk 'NR>1{a[++i]=$1}END{if(i%2==1)print a[(i+1)/2];else print (a[i/2]+a[i/2+1])/2}' )
echo -e "$f\t$var" >> temp2_clusters
done

for f in `cat passed_genomes.txt`
do
paste <(awk '$1 == "'$f'" {print $0}' $DBR/clusters_deepSplit$DEEPSPLIT.txt) <(awk '$1 == "'$f'" {print $2}' temp2_clusters) >> filtered_clusters.txt
done


cut -f2 filtered_clusters.txt | sort | uniq > TMP
for f in `cat TMP`
do
awk '$2=="'$f'"' filtered_clusters.txt | sort -g -r -k3,3 | head -1 >> representative_genomes.TMP.txt
done

awk '$3 > 0' representative_genomes.TMP.txt | sort -g -r -k3,3 > representative_genomes.txt

rm filtered_clusters.txt
rm passed_genomes.txt
rm TMP
rm representative_genomes.TMP.txt
rm temp2_clusters
rm add add2 add3
rm *.tsv
rm parallel_commands
rm reads.txt

########## create the representative strains database ########
cut -f1 representative_genomes.txt > temp
mkdir Index
for f in `cat temp`
do
cat $DBR/reordered_contigs/$f.fna >> Index/representative_strains.fa
done
rm temp
cd Index
bowtie2-build representative_strains.fa representative_strains.fa -q --threads $NUM_THREAD

printf '1\ni\nStrain\tCluster\tMedian_relative_abundance\n.\nw\n' | ed -s $ODR/representative_genomes.txt
############################
else
cat $CUS | rev | cut -d'.' -f2- | rev > $ODR/temp
cd $ODR
mkdir Index
for f in `cat temp`
do
cat $DBR/reordered_contigs/$f.fna >> Index/representative_strains.fa
done
cat temp > representative_genomes.txt
rm temp
cd Index
bowtie2-build representative_strains.fa representative_strains.fa -q --threads $NUM_THREAD
printf '1\ni\nStrain\n.\nw\n' | ed -s $ODR/representative_genomes.txt
fi

################ get unique SNP #################
cd $ODR
if [ "$DEEPSPLIT" == 4 ]; then
sed '1d' representative_genomes.txt | cut -f1 > rep_strains
mkdir gff
for f in `cat rep_strains`
do
cp $DBR/gff/$f.gff gff/.
done
roary -p $NUM_THREAD -f ./Roary -e -n -s -v $ODR/gff/*.gff

cd $ODR/Roary

grep ">" core_gene_alignment.aln | sed 's/>//g' > strains.txt

$SMEG_DIR/uniqueSNPmultithreading core_gene_alignment.aln 2 $NUM_THREAD

awk '/feature/{getline; print}' core_alignment_header.embl | cut -f2 -d'=' > temp1
grep 'feature' core_alignment_header.embl | rev | cut -d' ' -f1 | rev | cut -f1 -d'.' > temp2
grep 'feature' core_alignment_header.embl | rev | cut -d' ' -f1 | rev | cut -f3 -d'.' > temp3
paste -d'\t' temp1 temp2 temp3 > coordinates.txt
rm temp2 temp3


for i in `cat strains.txt`
do
samtools faidx core_gene_alignment.aln $i | grep -v ">" | sed 's/\(.\)/\1\n/g' | sed '/^$/d' |  nl -w2 -s' ' | sed 's/^ //g' | sed 's/ /	/g' > $i.co.txt

for f in `cat temp1`
do
start=$(grep -w "$f" coordinates.txt | cut -f2)
stop=$(grep -w "$f" coordinates.txt | cut -f3)

sed -n ''$start','$stop'p;' $i.co.txt > $f.gene.temp.txt

name=$(echo "$i")
geneID=$(grep -w "$f" clustered_proteins | grep -o "\w*$name\_\w*")
reorder_start=$(grep -w "$geneID"  ../gff/$name.gff | cut -f4)
strand=$(grep -w "$geneID" ../gff/$name.gff | cut -f7)

if [ "-" == "$strand" ]; then  #for reverse stramd
grep -v "-" $f.gene.temp.txt | nl -w2 -s' ' -v $reorder_start | sed 's/^ //g' | sed 's/ /	/g' | cut -f1 | sort -nr > $f.num
grep -v "-" $f.gene.temp.txt | sed -E 's/a|A/www/g; s/t|T/xxx/g; s/c|C/yyy/g;  s/g|G/zzz/g; s/www/T/g; s/xxx/A/g; s/yyy/G/g; s/zzz/C/g;' > $f.gene.temp2.txt
paste $f.num $f.gene.temp2.txt > $f.gene_cord.txt
rm $f.gene.temp.txt
rm $f.gene.temp2.txt 
rm $f.num
else
grep -v "-" $f.gene.temp.txt |  nl -w2 -s' ' -v $reorder_start | sed 's/^ //g' | sed 's/ /	/g' > $f.gene_cord.txt 
rm $f.gene.temp.txt
fi
done

cat *.gene_cord.txt > $i.coordinates.txt
rm *.gene_cord.txt
###############################
grep -w "$i" uniqueSNPs  > $i.uniq.txt
Rscript $SMEG_DIR/extract_unique_SNP.R -i $i.uniq.txt -p $i.coordinates.txt -o $i.Input.txt

mv $i.Input.txt $ODR/.
rm $i.coordinates.txt
rm $i.co.txt
rm $i.uniq.txt
done

rm temp1
rm coordinates.txt
fi
exec 3>&2
exec 2> /dev/null
rm $ODR/rep_strains
exec 2>&3
#########################################
## growth_est
#########################################
else
cd $WDR
RDR=$(readlink -f $READS_DIR)
ODR=$(readlink -f $OUTPUT_DIR)
SPEC=$(readlink -f $SPECIES_DIR)
REP=$(readlink -f $REP_DIR)
echo "$package option activated"
echo "$WDR is present directory"
echo "$RDR is the reads directory"
echo "$ODR is the output directory"
echo "$SPEC is the species database directory"
echo "$REP is the representative strains database directory"

mkdir -p $ODR

if [ -z "$(ls -A $ODR)" ]; then
   echo "Output directory ok"
else
   echo "Output directory not Empty"
   exit 1
fi
################

if [ $METHOD == 1 ]; then # i.e. gene dosage
echo "gene dosage method activated"

mkdir -p $ODR
cd $REP
sed '1d' representative_genomes.txt | cut -f1 > $ODR/rep_strains

cd $ODR
for f in `cat rep_strains`
do
grep -v "#" $SPEC/gff/$f.gff | grep "ID=" | cut -f1 -d ';' | sed 's/ID=//g' | cut -f1,4,5,7,9 |  awk -v OFS='\t' '{print $1,"PROKKA","CDS",$2,$3,".",$4,".","gene_id " $5}' > $f.gtf
done

cd $RDR
if [ "$LIST" == "false" ]; then
ls *.fastq | rev | cut -d'_' -f2- | rev | sort | uniq > $ODR/reads.txt
else
cat $LIS | rev | cut -d'_' -f2- | rev | sort | uniq > $ODR/reads.txt
fi
cd $ODR
for i in `cat reads.txt`
do
readlength=$(awk '{if(NR%4==2) print length($1)}' $RDR/$i\_1.fastq | awk '{ total += $1 } END { print total/NR }')
bowtie2 -x $REP/Index/representative_strains.fa -1 $RDR/$i\_1.fastq -2 $RDR/$i\_2.fastq -S $i.sam -p $NUM_THREAD
samtools view -bS $i.sam > $i.bam
rm $i.sam
###############
exec 3>&2
exec 2> /dev/null
samcheck=$(samtools --version | grep "samtools" | cut -f2 -d' ')
exec 2>&3

if [ -z $samcheck ]; then
samtools view -b -F 4 $i.bam | samtools sort - $i.sorted
else
samtools view -b -F 4 $i.bam | samtools sort -o $i.sorted.bam
fi
###########
rm $i.bam
bamtools split -in $i.sorted.bam -reference

for GENOME in `cat rep_strains`
do
samtools view -H $i.sorted.REF_$GENOME.bam | grep -w "$GENOME" >> $i.$GENOME.temp.sam
samtools view $i.sorted.REF_$GENOME.bam >> $i.$GENOME.temp.sam
samtools view -bS $i.$GENOME.temp.sam > $i.$GENOME.bam

rm $i.$GENOME.temp.sam
rm $i.sorted.REF_$GENOME.bam
featureCounts -a $GENOME.gtf -t CDS -o $i.$GENOME.output.temp.txt $i.$GENOME.bam
readsNo=$(grep -w "Assigned" $i.$GENOME.output.temp.txt.summary | cut -f2)
genLen=$(awk '/^>/ {if (seqlen){print seqlen}; print ;seqlen=0;next; } { seqlen += length($0)}END{print seqlen}' $SPEC/reordered_contigs/$GENOME.fna | grep -v ">" | awk '{ total += $1 } END { print total }')

echo "scale=3; ($readsNo*$readlength)/$genLen" |bc > $i\_$GENOME.geneDosage.txt.cov

sed '1,2d' $i.$GENOME.output.temp.txt | awk '{ print $0,"\011", $7/$6 }' | cut -f2,3,8 > $i\_$GENOME.geneDosage.txt
rm $i.$GENOME.bam
rm $i.$GENOME.output.temp.txt
rm $i.$GENOME.output.temp.txt.summary
done
rm $i.sorted.bam
Rscript $SMEG_DIR/geneDosage.R -i $i.txt -c $COV_CUTOFF
printf '1\ni\nStrain\tSMEG\tCoverage\n.\nw\n' | ed -s $i.txt
rm $i\_*.geneDosage.txt
rm $i\_*.geneDosage.txt.cov
done

rm rep_strains reads.txt
rm *.gtf

else
###########
echo "SNP-based method activated"

cd $RDR
samtools faidx $REP/Index/representative_strains.fa 
if [ "$LIST" == "false" ]; then
ls *.fastq | rev | cut -d'_' -f2- | rev | sort | uniq > $ODR/reads.txt
else
cat $LIS | rev | cut -d'_' -f2- | rev | sort | uniq > $ODR/reads.txt
fi
cd $ODR

for i in `cat reads.txt`
do
mkdir $i.dir
cd $i.dir
bowtie2 -x $REP/Index/representative_strains.fa -1 $RDR/$i\_1.fastq -2 $RDR/$i\_2.fastq  -S $i.sam -p $NUM_THREAD

samtools view -bS $i.sam > $i.bam
rm $i.sam

###############
exec 3>&2
exec 2> /dev/null
samcheck=$(samtools --version | grep "samtools" | cut -f2 -d' ')
exec 2>&3

if [ -z $samcheck ]; then
samtools view -b -F 4 $i.bam | samtools sort - $i.sorted
else
samtools view -b -F 4 $i.bam | samtools sort -o $i.sorted.bam
fi
rm $i.bam
###################

samtools mpileup -a -a -A -B -f $REP/Index/representative_strains.fa $i.sorted.bam > $i.pileup
$SMEG_DIR/pileupParser $i.pileup

grep ">" $REP/Index/representative_strains.fa | sed 's/>//g' > snp_strains.txt
for GENOME in `cat snp_strains.txt`
do
name=$(echo $GENOME | rev | cut -d'.' -f2- | rev)
grep -w "$GENOME" polymorphic.site.coverage > $name.out.temp.txt

cp $REP/$name.fna.Input.txt .
Rscript $SMEG_DIR/SMEG_SNP.R -i $name.fna.Input.txt -p $name.out.temp.txt -o $name.coord.txt

rm $name.out.temp.txt
rm $name.fna.Input.txt
awk '$8 ~ "a" || $8 ~ "A" {print $2"\011"$1"\011"$3}' $name.coord.txt >> $name.final.temp
awk '$8 ~ "t" || $8 ~ "T" {print $2"\011"$1"\011"$4}' $name.coord.txt >> $name.final.temp
awk '$8 ~ "g" || $8 ~ "G" {print $2"\011"$1"\011"$5}' $name.coord.txt >> $name.final.temp
awk '$8 ~ "c" || $8 ~ "C" {print $2"\011"$1"\011"$6}' $name.coord.txt >> $name.final.temp

sort -n -k2,2 $name.final.temp > $name.final.txt

rm $name.final.temp
rm $name.coord.txt
done
rm polymorphic.site.coverage
rm $i.pileup
rm $i.sorted.bam
rm snp_strains.txt
Rscript $SMEG_DIR/SNP_method.R -i $i.txt -c $COV_CUTOFF -t $THETA
printf '1\ni\nStrain\tSMEG\tCoverage\n.\nw\n' | ed -s $i.txt
mv $i.txt $ODR/.
cd ../
rm -rf $i.dir
done
rm reads.txt
fi

################## merge option ####################
cd $ODR
if [ "$MERGE" == "true" ]; then
cat *.txt | grep -v "SMEG" | cut -f1 | sort | uniq > SMEG.genomes
ls *.txt | sed 's/\.txt//g' > SMEG_list.txt

for i in `cat SMEG_list.txt`
do
for f in `cat SMEG.genomes`
do
paste <(awk '$1 == "'$f'" {print $1}' SMEG.genomes ) <(awk '$1 == "'$f'" {print $2}' $i.txt )  >> $i.temp.merge.txt
done
printf '1\ni\nGenome\t'$i'\n.\nw\n' | ed -s $i.temp.merge.txt
done

for f in *.temp.merge.txt
do
cut -f2 $f > $f.tmp
rm $f
done

printf '1\ni\nGenome\n.\nw\n' | ed -s SMEG.genomes
paste -d'\t' SMEG.genomes *.temp.merge.txt.tmp > merged_table.txt
rm *.temp.merge.txt.tmp
rm SMEG_list.txt
rm SMEG.genomes
Rscript $SMEG_DIR/heatmap.R
echo "run complete"
else
echo "run complete"
fi
fi
fi