# Count 21-mers with KMC for every sample, once per abundance cutoff.
# For each cutoff a bsub job array named "filter" is submitted with one task
# per line of $DIR/kingsford_<cutoff>.txt (at most 800 tasks running at once).
#
# Quoting convention inside the job command: plain $VAR is expanded here, at
# submission time; "\$" reaches bsub as "$" and "\\\$" reaches it as "\$", so
# those expansions are left for the job side. The escape depth is kept
# exactly as it was.
KMC=~/projects/projects2014-metagenome/metagraph/build_release/KMC/kmc
DIR=~/metagenome/data/kingsford_21
# rm -r $DIR/kmc_21_filtered_2gb;
mkdir -p "$DIR/kmc_21_filtered_2gb" "$DIR/logs"
for cutoff in 1 3 10 20 50; do
  ids="$DIR/kingsford_${cutoff}.txt"
  # Without a non-empty id list the array range would be malformed.
  if [[ ! -s $ids ]]; then
    echo "skipping cutoff ${cutoff}: ${ids} is missing or empty" >&2
    continue
  fi
  bsub -J "filter[1-$(wc -l < "$ids")]%800" \
       -o "$DIR/logs/kmc_count_2gb.lsf" \
       -W 4:00 \
       -n 1 -R "rusage[mem=5000] span[hosts=1] select[model==XeonGold_6140]" \
       "id=\\\$(sed -n \${LSB_JOBINDEX}p $ids); \
        mkdir -p ~/metagenome/scratch/nobackup/stripe_1/\\\${id}.kmc_cache; \
        file=~/metagenome/raw_data/kingsford/data_fasta/\\\${id}.fasta.gz; \
        /usr/bin/time -v $KMC -k21 -m1 -sm -ci$cutoff -fm -t2 \
            \\\${file} \
            $DIR/kmc_21_filtered_2gb/\\\$id \
            ~/metagenome/scratch/nobackup/stripe_1/\\\${id}.kmc_cache \
            2>&1 | tee $DIR/kmc_21_filtered_2gb/\\\${id}.log; \
        rm -r ~/metagenome/scratch/nobackup/stripe_1/\\\${id}.kmc_cache"
done
# Per-sample graphs: for each id in $DIR/kingsford.txt build a canonical
# k=21 graph from the sample's KMC output, dump its primary k-mers as FASTA
# into $DIR/unitigs, then delete the intermediate .dbg files.
# Starts only after the "filter*" jobs (bsub -w).
# NOTE(review): the input is read from $DIR/kmc_21_filtered, while the filter
# jobs write to $DIR/kmc_21_filtered_2gb -- confirm which directory is meant.
# DIR is re-stated so this section can be run on its own.
DIR=~/metagenome/data/kingsford_21
mkdir -p "$DIR/unitigs"
METAGRAPH=~/projects/projects2014-metagenome/metagraph/build_release/metagraph
bsub -J "build_single[1-2652]%500" \
     -w "filter*" \
     -o "$DIR/logs/build_single.lsf" \
     -W 4:00 \
     -n 1 -R "rusage[mem=20000] span[hosts=1] select[model==XeonGold_6140]" \
     "id=\\\$(sed -n \${LSB_JOBINDEX}p $DIR/kingsford.txt); \
      file=$DIR/kmc_21_filtered/\\\${id}.kmc_suf; \
      /usr/bin/time -v $METAGRAPH build \
          -k 21 \
          --mode canonical \
          --mem-cap-gb 8 \
          --disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
          -p 2 \
          -o $DIR/unitigs/\\\$(basename \\\${file%.kmc_suf}) \
          \\\$file; \
      /usr/bin/time -v $METAGRAPH transform \
          --to-fasta --primary-kmers \
          -p 2 \
          -o $DIR/unitigs/\\\$(basename \\\${file%.kmc_suf}) \
          $DIR/unitigs/\\\$(basename \\\${file%.kmc_suf}).dbg; \
      rm $DIR/unitigs/\\\$(basename \\\${file%.kmc_suf}).dbg*"
# Joint graph: after "build_single" finishes, one job
#   1. builds a canonical graph from every *.fasta.gz under $DIR/unitigs,
#   2. dumps its primary k-mers to kingsford_primary.fasta.gz and removes the
#      canonical graph,
#   3. builds the primary graph $DIR/kingsford.dbg from that FASTA and
#      removes the FASTA,
#   4. writes a "small"-state copy as $DIR/kingsford_small.dbg.
# DIR and METAGRAPH are re-stated so this section can be run on its own.
DIR=~/metagenome/data/kingsford_21
METAGRAPH=~/projects/projects2014-metagenome/metagraph/build_release/metagraph
bsub -J "build_graph" \
     -w "build_single" \
     -oo "$DIR/logs/build_graph.lsf" \
     -W 4:00 \
     -n 36 -R "rusage[mem=3000] span[hosts=1] select[model==XeonGold_6140]" \
     "find $DIR/unitigs -name \"*.fasta.gz\" \
        | /usr/bin/time -v $METAGRAPH build -v \
            -k 21 \
            --mode canonical \
            --mem-cap-gb 50 \
            --disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
            -p 72 \
            -o $DIR/kingsford_canonical; \
      /usr/bin/time -v $METAGRAPH transform -v \
            --to-fasta --primary-kmers \
            -o $DIR/kingsford_primary \
            $DIR/kingsford_canonical.dbg \
            -p 36; \
      rm $DIR/kingsford_canonical.dbg; \
      /usr/bin/time -v $METAGRAPH build -v \
            -k 21 \
            --mode primary \
            --mem-cap-gb 50 \
            --disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
            -p 72 \
            -o $DIR/kingsford \
            $DIR/kingsford_primary.fasta.gz; \
      rm $DIR/kingsford_primary.fasta.gz; \
      /usr/bin/time -v $METAGRAPH transform -v \
            --state small \
            -o $DIR/kingsford_small \
            $DIR/kingsford.dbg \
            -p 72"
# Per-sample graphs with k-mer counts: like "build_single", but the graph is
# built with --count-kmers and the unitigs are extracted with `metagraph
# clean --smoothing-window $WINDOW_SIZE`. Results go to
# kingsford_21/smoothing_<WINDOW_SIZE>/unitigs.
# "\$" / "\\\$" mark expansions left for the job side; plain $VAR is
# expanded at submission time.
WINDOW_SIZE=1
# WINDOW_SIZE=1000000000;
DIR=~/metagenome/data/kingsford_21/smoothing_${WINDOW_SIZE}
mkdir -p "$DIR/logs" "$DIR/unitigs"
METAGRAPH=~/projects/projects2014-metagenome/metagraph/build_release/metagraph
bsub -J "build_single_${WINDOW_SIZE}[1-2652]%500" \
     -w "filter*" \
     -o "$DIR/logs/build_single.lsf" \
     -W 4:00 \
     -n 1 -R "rusage[mem=20000] span[hosts=1] select[model==XeonGold_6140]" \
     "id=\\\$(sed -n \${LSB_JOBINDEX}p $DIR/../kingsford.txt); \
      file=$DIR/../kmc_21_filtered/\\\${id}.kmc_suf; \
      /usr/bin/time -v $METAGRAPH build -v \
          -k 21 \
          --mode canonical \
          --count-kmers --count-width 32 \
          --mem-cap-gb 8 \
          --disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
          -p 2 \
          -o $DIR/unitigs/\\\$(basename \\\${file%.kmc_suf}) \
          \\\$file; \
      /usr/bin/time -v $METAGRAPH clean -v \
          --to-fasta --primary-kmers \
          --smoothing-window ${WINDOW_SIZE} \
          -p 2 \
          -o $DIR/unitigs/\\\$(basename \\\${file%.kmc_suf}) \
          $DIR/unitigs/\\\$(basename \\\${file%.kmc_suf}).dbg; \
      rm $DIR/unitigs/\\\$(basename \\\${file%.kmc_suf}).dbg*"
# Shuffle the list of unitig FASTA files and deal it round-robin into ten
# batch files (x00..x09, from `split -d -n r/10`) under $DIR/batches, then
# create $DIR/columns. Runs after "build_single_<WINDOW_SIZE>".
# The quotes around the find pattern are escaped so they stay inside the job
# command instead of ending the string in the submitting shell.
# WINDOW_SIZE is taken from the preceding section.
DIR=~/metagenome/data/kingsford_21/smoothing_${WINDOW_SIZE}
bsub -J "split_${WINDOW_SIZE}" \
     -w "build_single_${WINDOW_SIZE}" \
     -o /dev/null -W 1:00 -n 1 -R "rusage[mem=1000]" \
     "cd $DIR; \
      mkdir -p batches; \
      cd batches; \
      split -d -n r/10 <(find $DIR/unitigs -name \"*.fasta.gz\" | shuf); \
      mkdir -p ${DIR}/columns;"
# Annotate the joint graph with k-mer counts, one job per batch file
# x00..x09: each job feeds its batch list to `metagraph annotate` and writes
# one column per input file (--separately) into $DIR/columns.
# Waits for both "split_<WINDOW_SIZE>" and "build_graph".
# WINDOW_SIZE is taken from the preceding sections.
DIR=~/metagenome/data/kingsford_21/smoothing_${WINDOW_SIZE}
METAGRAPH=~/projects/projects2014-metagenome/metagraph/build_release/metagraph
for N in {0..9}; do
  # Batch files are named with a two-digit suffix.
  list=$(printf 'x%02d' "$N")
  bsub -J "annotate_${WINDOW_SIZE}_${list}" \
       -w "split_${WINDOW_SIZE} && build_graph" \
       -oo "${DIR}/logs/annotate_${list}.lsf" \
       -W 4:00 \
       -n 18 -R "rusage[mem=1500] span[hosts=1] select[model==XeonGold_6140]" \
       "cat $DIR/batches/${list} \
          | /usr/bin/time -v $METAGRAPH annotate \
              -i $DIR/../kingsford.dbg \
              --anno-filename \
              --separately \
              --count-kmers --count-width 32 \
              -o ${DIR}/columns \
              -p 36"
done
# Convert the count columns of smoothing_<WINDOW_SIZE>/columns into a
# row_diff_int_brwt annotation, written under smoothing_<WINDOW_SIZE>_new_enc.
# A single job runs, in order: row-diff stages 0, 1 and 2, the conversion to
# row_diff_int_brwt, and relax_brwt.
WINDOW_SIZE=1
# WINDOW_SIZE=1000000000;
# git checkout 7a9027fa8c6c29742c7885f77f90d414c39c5b53
DIR=~/metagenome/data/kingsford_21/smoothing_${WINDOW_SIZE}_new_enc
# rm -r $DIR;
mkdir -p "$DIR/logs" "$DIR/rd/rd_columns"
ln -s ~/metagenome/data/kingsford_21/kingsford.dbg "${DIR}/rd/graph.dbg"
METAGRAPH=~/projects/projects2014-metagenome/metagraph/build_test/metagraph

# The three row-diff stages differ only in the stage number, so their command
# text is generated here and spliced into the job command below.
rd_stages=""
for stage in 0 1 2; do
  rd_stages+="find ${DIR}/../smoothing_${WINDOW_SIZE}/columns -name \"*.column.annodbg\" \
      | /usr/bin/time -v $METAGRAPH transform_anno -v \
          --anno-type row_diff --count-kmers \
          --row-diff-stage ${stage} \
          --mem-cap-gb 500 \
          --disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
          -i ${DIR}/rd/graph.dbg \
          -o ${DIR}/rd/rd_columns/out \
          -p 72; "
done

bsub -J "count_${WINDOW_SIZE}_rd_brwt" \
     -oo "${DIR}/logs/count_rd_brwt.lsf" \
     -W 24:00 \
     -n 36 -R "rusage[mem=19000] span[hosts=1] select[model==XeonGold_6140]" \
     "${rd_stages}\
      find ${DIR}/rd/rd_columns -name \"*.column.annodbg\" \
        | /usr/bin/time -v $METAGRAPH transform_anno -v \
            --anno-type row_diff_int_brwt \
            --greedy --fast --subsample 1000000 \
            -i ${DIR}/rd/graph.dbg \
            -o ${DIR}/annotation \
            -p 72 --parallel-nodes 10; \
      /usr/bin/time -v $METAGRAPH relax_brwt -v \
            -p 72 \
            --relax-arity 32 \
            -o ${DIR}/annotation.relaxed \
            ${DIR}/annotation.row_diff_int_brwt.annodbg"
# Query timing with the binary from build_test against the "old"
# row_diff_brwt annotation (annotation_old.relaxed) stored under
# finished_projects; the job log goes to $DIR/logs/query_rd_brwt_old.lsf.
DIR=~/metagenome/data/kingsford_21
METAGRAPH=~/projects/projects2014-metagenome/metagraph/build_test/metagraph
bsub -J "kingsford_query_old" \
     -oo "${DIR}/logs/query_rd_brwt_old.lsf" \
     -W 4:00 \
     -n 36 -R "rusage[mem=8000] span[hosts=1] select[model==XeonGold_6140]" \
     "/usr/bin/time -v $METAGRAPH query --count-labels --fast -v \
          --discovery-fraction 0 \
          -i ~/metagenome/finished_projects/counting_dbg/kingsford_21/kingsford_small.dbg \
          -a ~/metagenome/finished_projects/counting_dbg/kingsford_21/annotation_old.relaxed.row_diff_brwt.annodbg \
          ~/projects/projects2014-metagenome/metagraph/tests/data/transcripts_100.fa"
# Query timing with the release binary against the current row_diff_brwt
# annotation in $DIR (same query file and flags as "kingsford_query_old").
# DIR is re-stated so this section can be run on its own.
DIR=~/metagenome/data/kingsford_21
METAGRAPH=~/projects/projects2014-metagenome/metagraph/build_release/metagraph
bsub -J "kingsford_query" \
     -oo "${DIR}/logs/query_rd_brwt.lsf" \
     -W 4:00 \
     -n 36 -R "rusage[mem=8000] span[hosts=1] select[model==XeonGold_6140]" \
     "/usr/bin/time -v $METAGRAPH query --count-labels --fast -v \
          --discovery-fraction 0 \
          -i ${DIR}/kingsford_small.dbg \
          -a ${DIR}/annotation.relaxed.row_diff_brwt.annodbg \
          ~/projects/projects2014-metagenome/metagraph/tests/data/transcripts_100.fa"
# Count-query timing: for each smoothing window, query the relaxed
# row_diff_int_brwt annotation in smoothing_<WINDOW_SIZE> with --query-counts.
# Both submissions share the job name "kingsford_count_query".
# METAGRAPH is re-stated (release build) so this section can be run on its own.
METAGRAPH=~/projects/projects2014-metagenome/metagraph/build_release/metagraph
for WINDOW_SIZE in 1 1000000000; do
  DIR=~/metagenome/data/kingsford_21/smoothing_${WINDOW_SIZE}
  bsub -J "kingsford_count_query" \
       -oo "${DIR}/logs/query_count_rd_brwt.lsf" \
       -W 4:00 \
       -n 36 -R "rusage[mem=2000] span[hosts=1] select[model==XeonGold_6140]" \
       "/usr/bin/time -v $METAGRAPH query --count-labels --query-counts --fast -v \
            --discovery-fraction 0 \
            -i ${DIR}/../kingsford_small.dbg \
            -a ${DIR}/annotation.relaxed.row_diff_int_brwt.annodbg \
            ~/projects/projects2014-metagenome/metagraph/tests/data/transcripts_100.fa"
done
# ## Binary annotation without k-mer counts
# Binary (presence/absence) annotation of the joint graph: same batching as
# the count annotation, but without --count-kmers. The batch lists are reused
# from smoothing_1/batches and the columns are written to $DIR/columns.
DIR=~/metagenome/data/kingsford_21
mkdir -p "${DIR}/columns"
METAGRAPH=~/projects/projects2014-metagenome/metagraph/build_release/metagraph
for N in {0..9}; do
  # Batch files are named with a two-digit suffix.
  list=$(printf 'x%02d' "$N")
  bsub -J "annotate_${list}" \
       -w "split_1 && build_graph" \
       -oo "${DIR}/logs/annotate_${list}.lsf" \
       -W 4:00 \
       -n 18 -R "rusage[mem=1500] span[hosts=1] select[model==XeonGold_6140]" \
       "cat $DIR/smoothing_1/batches/${list} \
          | /usr/bin/time -v $METAGRAPH annotate \
              -i $DIR/kingsford.dbg \
              --anno-filename \
              --separately \
              -o ${DIR}/columns \
              -p 36"
done
# Convert the binary columns in $DIR/columns into a row_diff_brwt annotation.
# A single job runs, in order: row-diff stages 0, 1 and 2, the conversion of
# the resulting *.row_diff.annodbg columns to row_diff_brwt, and relax_brwt.
DIR=~/metagenome/data/kingsford_21
mkdir -p "$DIR/rd/rd_columns"
ln -s ~/metagenome/data/kingsford_21/kingsford.dbg "${DIR}/rd/graph.dbg"
METAGRAPH=~/projects/projects2014-metagenome/metagraph/build_release/metagraph

# The three row-diff stages differ only in the stage number, so their command
# text is generated here and spliced into the job command below.
rd_stages=""
for stage in 0 1 2; do
  rd_stages+="find ${DIR}/columns -name \"*.column.annodbg\" \
      | /usr/bin/time -v $METAGRAPH transform_anno -v \
          --anno-type row_diff \
          --row-diff-stage ${stage} \
          --mem-cap-gb 500 \
          --disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
          -i ${DIR}/rd/graph.dbg \
          -o ${DIR}/rd/rd_columns/out \
          -p 72; "
done

bsub -J "kingsford_rd_brwt" \
     -oo "${DIR}/logs/rd_brwt.lsf" \
     -W 24:00 \
     -n 36 -R "rusage[mem=19000] span[hosts=1] select[model==XeonGold_6140]" \
     "${rd_stages}\
      find ${DIR}/rd/rd_columns -name \"*.row_diff.annodbg\" \
        | /usr/bin/time -v $METAGRAPH transform_anno -v \
            --anno-type row_diff_brwt \
            --greedy --fast --subsample 1000000 \
            -i ${DIR}/rd/graph.dbg \
            -o ${DIR}/annotation \
            -p 72 --parallel-nodes 10; \
      /usr/bin/time -v $METAGRAPH relax_brwt -v \
            -p 72 \
            --relax-arity 32 \
            -o ${DIR}/annotation.relaxed \
            ${DIR}/annotation.row_diff_brwt.annodbg"
# Working directory for the "old" row-diff pipeline: $DIR/rd_old with a
# symlink to the joint graph.
# NOTE(review): the checkout runs in whatever repository is the current
# directory; presumably the build_test2 binary used by the following jobs is
# built from this commit -- confirm.
git checkout 0d9feb76a9840b92031c25571cbc0f23ffd1cbe2
DIR=~/metagenome/data/kingsford_21
mkdir -p "$DIR/rd_old/rd_columns"
ln -s ~/metagenome/data/kingsford_21/kingsford.dbg "${DIR}/rd_old/graph.dbg"
# Old row-diff pipeline, pass 1: transform_anno --anno-type row_diff over
# the columns in smoothing_1/columns, output under $DIR/rd_old/rd_columns.
# The tool's output is additionally copied to logs/old_rd_1.log via tee.
# NOTE(review): the dependency "kingsford_annotate_*" matches none of the
# job names submitted in the visible part of this file (annotate_<list>,
# annotate_<WINDOW_SIZE>_<list>) -- confirm the intended name.
DIR=~/metagenome/data/kingsford_21
METAGRAPH=~/projects/projects2014-metagenome/metagraph/build_test2/metagraph
bsub -J "kingsford_rd_old_1" \
     -w "kingsford_annotate_*" \
     -oo "${DIR}/logs/old_rd_1.lsf" \
     -W 24:00 \
     -n 36 -R "rusage[mem=19000] span[hosts=1]" \
     "find ${DIR}/smoothing_1/columns -name \"*.column.annodbg\" \
        | /usr/bin/time -v $METAGRAPH transform_anno -v \
            --anno-type row_diff \
            --max-path-length 100 \
            --mem-cap-gb 500 \
            --disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
            -i ${DIR}/rd_old/graph.dbg \
            -o ${DIR}/rd_old/rd_columns/out \
            -p 72 \
            2>&1 | tee ${DIR}/logs/old_rd_1.log"
# Old row-diff pipeline, pass 2: the same transform_anno call as pass 1 with
# --optimize added. Runs after "kingsford_rd_old_1"; output is also copied
# to logs/old_rd_2.log via tee.
DIR=~/metagenome/data/kingsford_21
METAGRAPH=~/projects/projects2014-metagenome/metagraph/build_test2/metagraph
bsub -J "kingsford_rd_old_2" \
     -w "kingsford_rd_old_1" \
     -oo "${DIR}/logs/old_rd_2.lsf" \
     -W 24:00 \
     -n 36 -R "rusage[mem=19000] span[hosts=1]" \
     "find ${DIR}/smoothing_1/columns -name \"*.column.annodbg\" \
        | /usr/bin/time -v $METAGRAPH transform_anno -v \
            --anno-type row_diff \
            --optimize \
            --max-path-length 100 \
            --mem-cap-gb 500 \
            --disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
            -i ${DIR}/rd_old/graph.dbg \
            -o ${DIR}/rd_old/rd_columns/out \
            -p 72 \
            2>&1 | tee ${DIR}/logs/old_rd_2.log"
# Old row-diff pipeline, BRWT step: convert the *.row_diff.annodbg columns
# in $DIR/rd_old/rd_columns to row_diff_brwt, written as $DIR/annotation_old.
# Runs after "kingsford_rd_old_2"; output is also copied to
# logs/old_rd_brwt.log via tee.
DIR=~/metagenome/data/kingsford_21
METAGRAPH=~/projects/projects2014-metagenome/metagraph/build_test2/metagraph
bsub -J "kingsford_rd_old_brwt" \
     -w "kingsford_rd_old_2" \
     -oo "${DIR}/logs/old_rd_brwt.lsf" \
     -W 24:00 \
     -n 36 -R "rusage[mem=4000] span[hosts=1]" \
     "find ${DIR}/rd_old/rd_columns -name \"*.row_diff.annodbg\" \
        | /usr/bin/time -v $METAGRAPH transform_anno -v \
            --anno-type row_diff_brwt \
            --greedy --fast --subsample 1000000 \
            -i ${DIR}/rd_old/graph.dbg \
            -o ${DIR}/annotation_old \
            -p 72 --parallel-nodes 10 \
            2>&1 | tee ${DIR}/logs/old_rd_brwt.log"
# Old row-diff pipeline, final step: relax_brwt on annotation_old, producing
# $DIR/annotation_old.relaxed. Runs after "kingsford_rd_old_brwt"; output is
# also copied to logs/old_rd_brwt_relax.log via tee.
DIR=~/metagenome/data/kingsford_21
METAGRAPH=~/projects/projects2014-metagenome/metagraph/build_test2/metagraph
bsub -J "kingsford_rd_old_brwt_relax" \
     -w "kingsford_rd_old_brwt" \
     -oo "${DIR}/logs/old_rd_brwt_relax.lsf" \
     -W 24:00 \
     -n 12 -R "rusage[mem=5000] span[hosts=1]" \
     "/usr/bin/time -v $METAGRAPH relax_brwt -v \
          -p 24 \
          --relax-arity 32 \
          -o ${DIR}/annotation_old.relaxed \
          ${DIR}/annotation_old.row_diff_brwt.annodbg \
          2>&1 | tee ${DIR}/logs/old_rd_brwt_relax.log"
# ## HiFi Viruses Index with k-mer coordinates (lossless read compression)
# ## Compress with tools for sequence compression
# Baseline compressors on the HiFi virus reads.
# list.txt holds every *.fastq.gz under raw_data; each of the 4132 array
# tasks handles 37 consecutive lines of it (lines 37*(idx-1)+1 .. 37*idx).
# Per file, inside $DIR/compressed/<first six characters of the id>/:
#   <id>_no_header.fasta.gz  reads as FASTA with every header reduced to ">",
#                            gzip -9
#   <id>_no_header.spring    the same FASTA compressed with spring
#   <id>_num_bp              total number of sequence characters
# "\$" / "\\\$" mark expansions left for the job side; plain $VAR is
# expanded at submission time.
DIR=~/metagenome/data/hifi_sra/viruses_hifi_data
mkdir -p "$DIR/compressed/logs"
find ~/metagenome/raw_data/hifi_sra/viruses_hifi_data/ -name "*.fastq.gz" > "$DIR/list.txt"
bsub -J "noheader_gzip[1-4132]" \
     -o "$DIR/compressed/logs/noheader_gzip.lsf" \
     -W 24:00 \
     -n 1 -R "rusage[mem=10000] span[hosts=1]" \
     "for i in \\\$(seq \$((\${LSB_JOBINDEX} * 37 - 37 + 1)) \$((\${LSB_JOBINDEX} * 37))); do \
          file=\\\$(sed -n \\\${i}p $DIR/list.txt); \
          id=\\\$(basename \\\${file%.fastq.gz}); \
          mkdir -p $DIR/compressed/\\\${id:0:6}; \
          /usr/bin/time -v zcat \\\$file | sed -n '1~4s/^@/>/p;2~4p' | sed 's/^>.*/>/' | gzip -9 > $DIR/compressed/\\\${id:0:6}/\\\${id}_no_header.fasta.gz; \
          /usr/bin/time -v spring -l -c -g --fasta-input -t 2 \
              -i $DIR/compressed/\\\${id:0:6}/\\\${id}_no_header.fasta.gz \
              -w ${DIR}/compressed/\\\${id:0:6} \
              -o ${DIR}/compressed/\\\${id:0:6}/\\\${id}_no_header.spring; \
          /usr/bin/time -v zcat \\\$file | paste - - - - | cut -f2 | tr -d '\n' | wc -c > $DIR/compressed/\\\${id:0:6}/\\\${id}_num_bp; \
      done"
# Spring compression as its own job array: same batching as "noheader_gzip"
# (37 list lines per task), but spring gets a private working directory
# <id>_temp that is removed afterwards. Also rewrites <id>_num_bp.
# Expects the <id>_no_header.fasta.gz files to exist already.
# DIR is re-stated so this section can be run on its own.
DIR=~/metagenome/data/hifi_sra/viruses_hifi_data
bsub -J "spring[1-4132]" \
     -o "$DIR/compressed/logs/spring.lsf" \
     -W 24:00 \
     -n 1 -R "rusage[mem=10000] span[hosts=1]" \
     "for i in \\\$(seq \$((\${LSB_JOBINDEX} * 37 - 37 + 1)) \$((\${LSB_JOBINDEX} * 37))); do \
          file=\\\$(sed -n \\\${i}p $DIR/list.txt); \
          id=\\\$(basename \\\${file%.fastq.gz}); \
          mkdir -p $DIR/compressed/\\\${id:0:6}/\\\${id}_temp; \
          /usr/bin/time -v spring -l -c -g --fasta-input -t 2 \
              -i $DIR/compressed/\\\${id:0:6}/\\\${id}_no_header.fasta.gz \
              -w ${DIR}/compressed/\\\${id:0:6}/\\\${id}_temp \
              -o ${DIR}/compressed/\\\${id:0:6}/\\\${id}_no_header.spring; \
          rm -r $DIR/compressed/\\\${id:0:6}/\\\${id}_temp; \
          /usr/bin/time -v zcat \\\$file | paste - - - - | cut -f2 | tr -d '\n' | wc -c > $DIR/compressed/\\\${id:0:6}/\\\${id}_num_bp; \
      done"
# BLAST databases: for each read set (37 list lines per array task), copy
# and unpack its header-less FASTA into $DIR/blast/<prefix>/<id>/, run
# makeblastdb (nucleotide, output name <id>) and delete the unpacked FASTA.
DIR=~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data
mkdir -p "$DIR/blast/logs"
bsub -J "blast[1-4132]" \
     -o "$DIR/blast/logs/build_database.lsf" \
     -W 24:00 \
     -n 4 -R "rusage[mem=10000] span[hosts=1]" \
     "for i in \\\$(seq \$((\${LSB_JOBINDEX} * 37 - 37 + 1)) \$((\${LSB_JOBINDEX} * 37))); do \
          file=\\\$(sed -n \\\${i}p $DIR/list.txt); \
          id=\\\$(basename \\\${file%.fastq.gz}); \
          mkdir -p $DIR/blast/\\\${id:0:6}/\\\${id}; \
          cd $DIR/blast/\\\${id:0:6}/\\\${id}; \
          cp $DIR/compressed/\\\${id:0:6}/\\\${id}_no_header.fasta.gz ./; \
          gunzip \\\${id}_no_header.fasta.gz; \
          makeblastdb -in \\\${id}_no_header.fasta -dbtype nucl -out \\\${id}; \
          rm \\\${id}_no_header.fasta; \
      done"
# Pufferfish sparse indexes (k=31), one per read set, plus the re-runs that
# were needed. All five submissions share one job body and differ only in
# the array indices, log file, time limit, core count, the first list line
# handled within a task, whether an existing per-id output directory is
# wiped first, and extra `pufferfish index` options.
DIR=~/metagenome/data/hifi_sra/viruses_hifi_data
mkdir -p "$DIR/pufferfish_sparse/logs"

#######################################
# Submit one "pufferfish_sparse" job array.
# Globals:   DIR (read)
# Arguments: $1 - array index spec, e.g. "1-4132" or "86,104"
#            $2 - log file name inside $DIR/pufferfish_sparse/logs
#            $3 - wall-clock limit passed to bsub -W
#            $4 - number of cores passed to bsub -n
#            $5 - offset (1..37) of the first list line a task processes
#            $6 - "yes" to rm -rf the per-id output directory first
#            $7 - optional extra options for `pufferfish index`
#######################################
submit_pufferfish() {
  local array=$1 log=$2 walltime=$3 cores=$4 first=$5 clean=$6
  local index_opts=${7:-}
  # Job-side path; "\\\$" keeps the ${id...} expansions for the job shell.
  local prefix_dir="$DIR/pufferfish_sparse/\\\${id:0:6}"
  local wipe=""
  if [[ $clean == yes ]]; then
    wipe="rm -rf ${prefix_dir}/\\\${id}; "
  fi
  bsub -J "pufferfish_sparse[${array}]" \
       -o "$DIR/pufferfish_sparse/logs/${log}" \
       -W "$walltime" \
       -n "$cores" -R "rusage[mem=19000] span[hosts=1]" \
       "for i in \\\$(seq \$((\${LSB_JOBINDEX} * 37 - 37 + ${first})) \$((\${LSB_JOBINDEX} * 37))); do \
            file=\\\$(sed -n \\\${i}p $DIR/list.txt); \
            id=\\\$(basename \\\${file%.fastq.gz}); \
            mkdir -p ${prefix_dir}; \
            ${wipe}\
            mkdir -p ${prefix_dir}/\\\${id}; \
            cd ${prefix_dir}/\\\${id}; \
            ~/pufferfish index ${index_opts} -r \\\$file -k 31 -s -o \\\${id}; \
        done"
}

submit_pufferfish "1-4132" build_index.lsf 24:00 4 1 no
# re-running those going out of RAM
submit_pufferfish "86,104,3488,3828,3509" build_index_2.lsf 48:00 36 1 yes
# re-running those going out of time
submit_pufferfish "1354" build_index_3.lsf 72:00 36 14 yes "-p 36"
submit_pufferfish "1354" build_index_4.lsf 24:00 4 16 yes
submit_pufferfish "1354" build_index_5.lsf 24:00 4 26 yes
# MegaBLAST indexes: after all "blast" array tasks finish, symlink each BLAST
# database into $DIR/megablast/<prefix>/<id>/ and run makembindex on it.
DIR=~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data
mkdir -p "$DIR/megablast/logs"
bsub -J "megablast[1-4132]" \
     -w "blast[*]" \
     -o "$DIR/megablast/logs/build_database.lsf" \
     -W 24:00 \
     -n 4 -R "rusage[mem=10000] span[hosts=1]" \
     "for i in \\\$(seq \$((\${LSB_JOBINDEX} * 37 - 37 + 1)) \$((\${LSB_JOBINDEX} * 37))); do \
          file=\\\$(sed -n \\\${i}p $DIR/list.txt); \
          id=\\\$(basename \\\${file%.fastq.gz}); \
          mkdir -p $DIR/megablast/\\\${id:0:6}/\\\${id}; \
          ln -s $DIR/blast/\\\${id:0:6}/\\\${id}/* $DIR/megablast/\\\${id:0:6}/\\\${id}/; \
          makembindex -input $DIR/megablast/\\\${id:0:6}/\\\${id}/\\\${id}; \
      done"
# Re-save the per-read-set graphs in $DIR with --index-ranges 1: once in
# place (graph.dbg) and once in "small" state (graph_small.dbg).
# Each array task handles 37 consecutive lines of $list.
# NOTE(review): the job is named "build_<K>_" like the construction array
# below and has no -o log file. The `seq` line escapes its arithmetic with
# "\\\$" where every other section uses "\$"; kept as found.
K=31
# Earlier location, superseded by the assignment below:
# DIR=~/metagenome/data/hifi_sra/viruses_hifi_data/mtg;
DIR=~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/mtg_fork_opt
mkdir -p "$DIR/logs"
list=~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/list.txt
METAGRAPH=~/projects/projects2014-metagenome/metagraph/build_dna5_test/metagraph
bsub -J "build_${K}_[1-4132]" \
     -W 24:00 \
     -n 1 -R "rusage[mem=20000] span[hosts=1]" \
     "for i in \\\$(seq \\\$((\\\${LSB_JOBINDEX} * 37 - 37 + 1)) \\\$((\\\${LSB_JOBINDEX} * 37))); do \
          file=\\\$(sed -n \\\${i}p $list); \
          id=\\\$(basename \\\${file%.fastq.gz}); \
          mkdir -p ${DIR}/\\\${id:0:6}/\\\${id}/logs; \
          /usr/bin/time -v $METAGRAPH transform -v \
              --index-ranges 1 \
              -o ${DIR}/\\\${id:0:6}/\\\${id}/graph \
              ${DIR}/\\\${id:0:6}/\\\${id}/graph.dbg; \
          /usr/bin/time -v $METAGRAPH transform -v \
              --state small \
              --index-ranges 1 \
              -o ${DIR}/\\\${id:0:6}/\\\${id}/graph_small \
              ${DIR}/\\\${id:0:6}/\\\${id}/graph.dbg; \
      done"
# Build, per read set, a graph and a row_diff_coord (k-mer coordinate)
# annotation under $DIR/<first six characters of the id>/<id>/.
# The first submission processes everything; the two re-runs are conditioned
# on failed tasks (exit(...)) of the earlier arrays. All three share one job
# body, generated by submit_build_array.
# K, DIR and list are re-stated so this section can be run on its own.
K=31
DIR=~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/mtg_fork_opt
list=~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/list.txt
METAGRAPH=~/projects/projects2014-metagenome/metagraph/build_dna5_test/metagraph

#######################################
# Submit one construction job array over all 4132 batches of 37 files.
# Per file the job runs: build -> transform to "small" state -> annotate
# with coordinates -> row-diff stages 0..2 -> removal of intermediate
# files -> conversion to row_diff_coord.
# Globals:   DIR, list, K, METAGRAPH, NUM_THREADS (all read; NUM_THREADS is
#            used both for bsub -n and for the tools' -p)
# Arguments: $1 - job name prefix; "_[1-4132]" is appended
#            $2 - bsub -w dependency expression, or "" for none
#            $3 - log file name inside $DIR/logs
#            $4 - rusage mem value
#            $5 - "yes" to rm -r the per-id output directory first
#######################################
submit_build_array() {
  local name=$1 dependency=$2 log=$3 mem=$4 clean=$5
  # Job-side path; "\\\$" keeps the ${id...} expansions for the job shell.
  local out="${DIR}/\\\${id:0:6}/\\\${id}"

  local dep_args=()
  if [[ -n $dependency ]]; then
    dep_args=(-w "$dependency")
  fi

  local wipe=""
  if [[ $clean == yes ]]; then
    wipe="rm -r ${out}; "
  fi

  # Row-diff stages 0..2 differ only in the stage number and the log name
  # (coord_rd_1..3.log).
  local rd_stages="" stage
  for stage in 0 1 2; do
    rd_stages+="/usr/bin/time -v $METAGRAPH transform_anno -v -p $NUM_THREADS \
        --anno-type row_diff --coordinates \
        --max-path-length 200 \
        --row-diff-stage ${stage} \
        --mem-cap-gb 200 \
        --disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
        -i ${out}/graph.dbg \
        -o ${out}/rd_columns/out \
        ${out}/annotation.column.annodbg \
        2> ${out}/logs/coord_rd_$((stage + 1)).log; "
  done

  # Intermediate files removed once the row-diff stages are done; one rm
  # per file, as before.
  local cleanup="" tmp
  for tmp in annotation.column.annodbg \
             annotation.column.annodbg.coords \
             rd_columns/annotation.row_count \
             rd_columns/annotation.row_reduction \
             graph.dbg.succ \
             graph.dbg.succ_boundary \
             graph.dbg.pred \
             graph.dbg.pred_boundary; do
    cleanup+="rm ${out}/${tmp}; "
  done

  bsub -J "${name}_[1-4132]" \
       "${dep_args[@]}" \
       -o "${DIR}/logs/${log}" \
       -W 24:00 \
       -n "$NUM_THREADS" -R "rusage[mem=${mem}] span[hosts=1]" \
       "for i in \\\$(seq \$((\${LSB_JOBINDEX} * 37 - 37 + 1)) \$((\${LSB_JOBINDEX} * 37))); do \
            file=\\\$(sed -n \\\${i}p $list); \
            id=\\\$(basename \\\${file%.fastq.gz}); \
            mkdir -p ${DIR}/\\\${id:0:6}/; \
            ${wipe}\
            mkdir -p ${out}/logs; \
            /usr/bin/time -v $METAGRAPH build -v -p $NUM_THREADS \
                -k ${K} \
                --mem-cap-gb 40 \
                --disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
                -o ${out}/graph \
                \\\$file \
                2> ${out}/logs/build.log; \
            /usr/bin/time -v $METAGRAPH transform -v -p $NUM_THREADS \
                --state small \
                --index-ranges 1 \
                -o ${out}/graph_small \
                ${out}/graph.dbg \
                2> ${out}/logs/transform.log; \
            /usr/bin/time -v $METAGRAPH annotate -v -p $NUM_THREADS \
                --coordinates \
                --anno-filename \
                --mem-cap-gb 40 \
                --disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
                -i ${out}/graph.dbg \
                -o ${out}/annotation \
                \\\$file \
                2> ${out}/logs/annotate.log; \
            mkdir -p ${out}/rd_columns; \
            ${rd_stages}\
            ${cleanup}\
            /usr/bin/time -v $METAGRAPH transform_anno -v \
                --anno-type row_diff_coord \
                -i ${out}/graph.dbg \
                -o ${out}/annotation \
                ${out}/rd_columns/annotation.column.annodbg; \
        done"
}

NUM_THREADS=18
submit_build_array "build_${K}" "" "construct_${K}.lsf" 10000 no

# Re-run with more cores, conditioned on failed tasks of the first array.
NUM_THREADS=36
submit_build_array "build_${K}_rerun" \
    "exit(build_${K}_[*])" \
    "construct_${K}_rerun.lsf" 10000 no

# Second re-run with more memory; starts from a clean per-id directory.
submit_build_array "build_${K}_rerun2" \
    "exit(build_${K}_[*]) && exit(build_${K}_rerun_[*])" \
    "construct_${K}_rerun2.lsf" 40000 yes
# ## Query
# Alignment timing against each per-read-set index: copy graph_small.dbg and
# the row_diff_coord annotation to node-local /scratch, run `metagraph align`
# on delta_variants.fa and append "<id> <value>" to delta_variants_align.times.
# <value> is field 8 of the first of the last 19 lines of the stderr output
# of `/usr/bin/time -v metagraph align`; the alignments themselves (stdout)
# are discarded.
K=31
DIR=~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/mtg
list=~/metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/list.txt
METAGRAPH=~/projects/projects2014-metagenome/metagraph/build_dna5/metagraph
bsub -J "align_[1-4132]" \
     -o "${DIR}/logs/delta_variants_align.lsf" \
     -W 24:00 \
     -n 1 -R "rusage[mem=50000] span[hosts=1] select[model==XeonGold_6140]" \
     "for i in \\\$(seq \$((\${LSB_JOBINDEX} * 37 - 37 + 1)) \$((\${LSB_JOBINDEX} * 37))); do \
          file=\\\$(sed -n \\\${i}p $list); \
          id=\\\$(basename \\\${file%.fastq.gz}); \
          mkdir -p /scratch/mtg_\\\${id}; \
          cp ${DIR}/\\\${id:0:6}/\\\${id}/graph_small.dbg /scratch/mtg_\\\${id}/; \
          cp ${DIR}/\\\${id:0:6}/\\\${id}/annotation.row_diff_coord.annodbg /scratch/mtg_\\\${id}/; \
          echo \\\${id} \\\$(/usr/bin/time -v $METAGRAPH align -v \
              --align-chain --align-max-seed-length 19 \
              -i /scratch/mtg_\\\${id}/graph_small.dbg \
              -a /scratch/mtg_\\\${id}/annotation.row_diff_coord.annodbg \
              ${DIR}/../mtg/joint/query/redo/delta_variants.fa \
              2>&1 1>/dev/null | tail -n 19 | sed '1q;d' | cut -d' ' -f8) >> ${DIR}/delta_variants_align.times; \
          rm -r /scratch/mtg_\\\${id}; \
      done"
# Query benchmark: time blastn with `-use_index true` against the per-sample
# databases stored under the megablast directory.
# Same array layout as the other query jobs: 4132 elements, 37 lines of $list
# per element. For each id the whole database directory is copied (cp -H, so
# symlinks given on the command line are followed) to /scratch/<id>, blastn is
# run under /usr/bin/time -v with stdout discarded, and the 8th
# space-separated field of the 5th stderr line is appended next to the id in
# delta_variants_query.times (presumably the elapsed wall-clock time reported
# by GNU time -v — TODO confirm). The scratch copy is deleted afterwards.
DIR=~ /metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/megablast;
list=~ /metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/list.txt;
bsub -J " query_[1-4132]" \
-o ${DIR} /logs/delta_variants_query.lsf \
-W 24:00 \
-n 1 -R " rusage[mem=19000] span[hosts=1] select[model==XeonGold_6140]" \
" for i in \\\$ (seq \$ ((\$ {LSB_JOBINDEX} * 37 - 37 + 1)) \$ ((\$ {LSB_JOBINDEX} * 37))); do \
file=\\\$ (sed -n \\\$ {i}p $list ); \
id=\\\$ (basename \\\$ {file%.fastq.gz}); \
mkdir /scratch/\\\$ {id}; \
cp -H ${DIR} /\\\$ {id:0:6}/\\\$ {id}/* /scratch/\\\$ {id}/; \
echo \\\$ {id} \\\$ (/usr/bin/time -v blastn \
-query ${DIR} /../mtg/joint/query/redo/delta_variants.fa \
-db /scratch/\\\$ {id}/\\\$ {id} \
-use_index true \
-max_hsps 1 -outfmt '6 qseqid qseq sseq bitscore nident btop' \
2>&1 1>/dev/null | sed '5q;d' | cut -d' ' -f8) >> ${DIR} /delta_variants_query.times; \
rm -r /scratch/\\\$ {id}; \
done"
# Query benchmark: time plain blastn (no -use_index) against the per-sample
# databases stored under the blast directory.
# Mirrors the megablast job: 4132 array elements x 37 lines of $list, database
# copied to /scratch/blast_<id>, blastn run under /usr/bin/time -v with stdout
# discarded, and "<id> <8th field of the 5th stderr line>" appended to
# delta_variants_query.times (presumably elapsed wall-clock time — TODO
# confirm). The scratch directory is removed after each query.
DIR=~ /metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/blast;
list=~ /metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/list.txt;
bsub -J " query_[1-4132]" \
-o ${DIR} /logs/delta_variants_query.lsf \
-W 24:00 \
-n 1 -R " rusage[mem=19000] span[hosts=1] select[model==XeonGold_6140]" \
" for i in \\\$ (seq \$ ((\$ {LSB_JOBINDEX} * 37 - 37 + 1)) \$ ((\$ {LSB_JOBINDEX} * 37))); do \
file=\\\$ (sed -n \\\$ {i}p $list ); \
id=\\\$ (basename \\\$ {file%.fastq.gz}); \
mkdir /scratch/blast_\\\$ {id}; \
cp -H ${DIR} /\\\$ {id:0:6}/\\\$ {id}/* /scratch/blast_\\\$ {id}/; \
echo \\\$ {id} \\\$ (/usr/bin/time -v blastn \
-query ${DIR} /../mtg/joint/query/redo/delta_variants.fa \
-db /scratch/blast_\\\$ {id}/\\\$ {id} \
-max_hsps 1 -outfmt '6 qseqid qseq sseq bitscore nident btop' \
2>&1 1>/dev/null | sed '5q;d' | cut -d' ' -f8) >> ${DIR} /delta_variants_query.times; \
rm -r /scratch/blast_\\\$ {id}; \
done"
# Query benchmark: time `pufferfish align` against the per-sample indexes
# stored under the pufferfish_sparse directory.
# 4132 array elements x 37 lines of $list each. The index directory
# ${DIR}/<first 6 chars of id>/<id>/<id> is copied recursively into
# /scratch/pf_align_<id>, the aligner output goes to /dev/null (-o), and both
# stdout and stderr are piped on; the 19th line from the end supplies its 8th
# space-separated field, appended with the id to delta_variants_align.times
# (presumably the elapsed wall-clock time from GNU time -v — TODO confirm).
# The scratch copy is removed afterwards.
DIR=~ /metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/pufferfish_sparse;
list=~ /metagenome/finished_projects/counting_dbg/hifi_sra/viruses_hifi_data/list.txt;
bsub -J " align_[1-4132]" \
-o ${DIR} /logs/delta_variants_align.lsf \
-W 24:00 \
-n 1 -R " rusage[mem=200000] span[hosts=1] select[model==XeonGold_6140]" \
" for i in \\\$ (seq \$ ((\$ {LSB_JOBINDEX} * 37 - 37 + 1)) \$ ((\$ {LSB_JOBINDEX} * 37))); do \
file=\\\$ (sed -n \\\$ {i}p $list ); \
id=\\\$ (basename \\\$ {file%.fastq.gz}); \
mkdir /scratch/pf_align_\\\$ {id}; \
cp -r ${DIR} /\\\$ {id:0:6}/\\\$ {id}/\\\$ {id} /scratch/pf_align_\\\$ {id}; \
echo \\\$ {id} \\\$ (/usr/bin/time -v ~/pufferfish align \
--read ${DIR} /../mtg/joint/query/redo/delta_variants.fa \
-i /scratch/pf_align_\\\$ {id}/\\\$ {id}/ \
--genomicReads --primaryAlignment -o /dev/null \
2>&1 | tail -n 19 | sed '1q;d' | cut -d' ' -f8) >> ${DIR} /delta_variants_align.times; \
rm -r /scratch/pf_align_\\\$ {id}; \
done"
# ### Joint index
````bash
# Joint index, step 1: build one graph from every file named in $list.
# Creates the output and log directories, then submits a single LSF job that
#   - pipes the file list into `metagraph build` (k = $K) writing ${DIR}/graph,
#   - converts the resulting graph.dbg with `transform --state small` into
#     ${DIR}/graph_small.
# NUM_THREADS (72) is passed to -p while the job reserves 36 slots (-n 36).
K=31;
DIR=~ /metagenome/data/hifi_sra/viruses_hifi_data/mtg/joint;
mkdir $DIR ;
mkdir $DIR /logs;
list=~ /metagenome/data/hifi_sra/viruses_hifi_data/list.txt;
METAGRAPH=~ /projects/projects2014-metagenome/metagraph/build_dna5/metagraph;
NUM_THREADS=72;
bsub -J " build_${K} " \
-o ${DIR} /logs/construct_joint_${K} .lsf \
-W 24:00 \
-n 36 -R " rusage[mem=19000] span[hosts=1]" \
" cat $list | /usr/bin/time -v $METAGRAPH build -v -p $NUM_THREADS \
-k ${K} \
--mem-cap-gb 40 \
--disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
-o ${DIR} /graph; \
/usr/bin/time -v $METAGRAPH transform -v -p $NUM_THREADS \
--state small \
-o ${DIR} /graph_small \
${DIR} /graph.dbg; \
"
# Joint index, step 2: annotate the joint graph with k-mer coordinates.
# Submits a 1033-element LSF job array; element i feeds lines
# (i*148-147)..(i*148) of $list to `metagraph annotate`, which writes one
# column per input (--separately, labelled by file name via --anno-filename)
# into ${DIR}/columns. Parallelism is 3 files at a time with 6 threads each.
K=31
DIR=~ /metagenome/data/hifi_sra/viruses_hifi_data/mtg/joint;
mkdir $DIR /columns;
list=~ /metagenome/data/hifi_sra/viruses_hifi_data/list.txt;
METAGRAPH=~ /projects/projects2014-metagenome/metagraph/build_dna5/metagraph;
bsub -J " annotate_${K} _[1-1033]" \
-o ${DIR} /logs/annotate_joint_${K} .lsf \
-W 24:00 \
-n 9 -R " rusage[mem=19000] span[hosts=1]" \
" sed -n \$ ((\$ {LSB_JOBINDEX} * 148 - 148 + 1)),\$ ((\$ {LSB_JOBINDEX} * 148))p $list \
| /usr/bin/time -v $METAGRAPH annotate -v \
--separately -p 3 --threads-each 6 \
--coordinates \
--anno-filename \
--mem-cap-gb 40 \
--disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
-i ${DIR} /graph.dbg \
-o ${DIR} /columns; \
"
# Joint index, step 2b: annotate the files named in list2.txt in one job.
# Same annotate options as the array job, but the whole list is piped in at
# once, with 4 files in parallel and 24 threads each, and a separate log file
# (annotate_joint_${K}_rest.lsf). Output columns go to the same
# ${DIR}/columns directory; the mkdir is repeated from the previous step.
# NOTE(review): list2.txt presumably holds the inputs the array job did not
# finish — confirm how it was generated.
K=31
DIR=~ /metagenome/data/hifi_sra/viruses_hifi_data/mtg/joint;
mkdir $DIR /columns;
list=~ /metagenome/data/hifi_sra/viruses_hifi_data/list2.txt;
METAGRAPH=~ /projects/projects2014-metagenome/metagraph/build_dna5/metagraph;
bsub -J " annotate_${K} " \
-o ${DIR} /logs/annotate_joint_${K} _rest.lsf \
-W 24:00 \
-n 48 -R " rusage[mem=40000] span[hosts=1]" \
" cat $list \
| /usr/bin/time -v $METAGRAPH annotate -v \
--separately -p 4 --threads-each 24 \
--coordinates \
--anno-filename \
--mem-cap-gb 40 \
--disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
-i ${DIR} /graph.dbg \
-o ${DIR} /columns; \
"
# Joint index, step 3: row-diff transform of the coordinate columns.
# Three LSF jobs run `transform_anno --anno-type row_diff --coordinates` with
# --row-diff-stage 0, 1 and 2 respectively. Each job lists every
# *.column.annodbg under ${DIR}/columns with find and pipes the paths in;
# output goes to ${DIR}/rd_columns/out. Jobs 2 and 3 carry a -w dependency on
# the previous stage's job name, so the stages run in order.
# All three jobs pass the same -o log file (rd_joint_${K}.lsf).
K=31
DIR=~ /metagenome/data/hifi_sra/viruses_hifi_data/mtg/joint;
mkdir ${DIR} /rd_columns;
METAGRAPH=~ /projects/projects2014-metagenome/metagraph/build_dna5/metagraph;
NUM_THREADS=36;
bsub -J " rd_coord_${K} _1" \
-o ${DIR} /logs/rd_joint_${K} .lsf \
-W 120:00 \
-n 36 -R " rusage[mem=19000] span[hosts=1]" \
" find ${DIR} /columns -name \" *.column.annodbg\" \
| /usr/bin/time -v $METAGRAPH transform_anno -v -p $NUM_THREADS \
--anno-type row_diff --coordinates \
--max-path-length 200 \
--row-diff-stage 0 \
--mem-cap-gb 400 \
--disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
-i ${DIR} /graph.dbg \
-o ${DIR} /rd_columns/out"
bsub -J " rd_coord_${K} _2" \
-w " rd_coord_${K} _1" \
-o ${DIR} /logs/rd_joint_${K} .lsf \
-W 120:00 \
-n 36 -R " rusage[mem=19000] span[hosts=1]" \
" find ${DIR} /columns -name \" *.column.annodbg\" \
| /usr/bin/time -v $METAGRAPH transform_anno -v -p $NUM_THREADS \
--anno-type row_diff --coordinates \
--max-path-length 200 \
--row-diff-stage 1 \
--mem-cap-gb 400 \
--disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
-i ${DIR} /graph.dbg \
-o ${DIR} /rd_columns/out"
bsub -J " rd_coord_${K} _3" \
-w " rd_coord_${K} _2" \
-o ${DIR} /logs/rd_joint_${K} .lsf \
-W 120:00 \
-n 36 -R " rusage[mem=19000] span[hosts=1]" \
" find ${DIR} /columns -name \" *.column.annodbg\" \
| /usr/bin/time -v $METAGRAPH transform_anno -v -p $NUM_THREADS \
--anno-type row_diff --coordinates \
--max-path-length 200 \
--row-diff-stage 2 \
--mem-cap-gb 400 \
--disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
-i ${DIR} /graph.dbg \
-o ${DIR} /rd_columns/out"
# Joint index, step 4 (subsampled variant): convert the row-diff columns in
# ${DIR}/rd_columns to row_diff_brwt_coord using --greedy --fast with a
# 10,000,000 subsample, writing ${DIR}/annotation.
# A second job, dependent on the first via -w, runs `relax_brwt` with
# --relax-arity 32 on annotation.row_diff_brwt_coord.annodbg and writes
# ${DIR}/annotation.relaxed.
K=31
DIR=~ /metagenome/data/hifi_sra/viruses_hifi_data/mtg/joint;
METAGRAPH=~ /projects/projects2014-metagenome/metagraph/build_dna5/metagraph;
bsub -J " rd_brwt_coord_${K} _10M" \
-o ${DIR} /logs/rd_brwt_coord_joint_${K} .lsf \
-W 120:00 \
-n 36 -R " rusage[mem=19000] span[hosts=1]" \
" find ${DIR} /rd_columns -name \" *.column.annodbg\" \
| /usr/bin/time -v $METAGRAPH transform_anno -v -p 72 \
--anno-type row_diff_brwt_coord \
--greedy --fast --subsample 10000000 \
--parallel-nodes 10 \
--disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
-i ${DIR} /graph.dbg \
-o ${DIR} /annotation" ;
bsub -J " relax_rd_brwt_coord_${K} _10M" \
-w " rd_brwt_coord_${K} _10M" \
-o ${DIR} /logs/relax_rd_brwt_coord_joint_${K} .lsf \
-W 72:00 \
-n 36 -R " rusage[mem=19000] span[hosts=1]" \
" /usr/bin/time -v $METAGRAPH relax_brwt -v -p 36 \
--relax-arity 32 \
${DIR} /annotation.row_diff_brwt_coord.annodbg \
-o ${DIR} /annotation.relaxed" ;
# Joint index, step 4 (variant without --greedy/--fast/--subsample): convert
# the row-diff columns to row_diff_brwt_coord, writing ${DIR}/annotation_0 so
# the result does not overwrite the subsampled variant's output.
# The dependent second job relaxes annotation_0.row_diff_brwt_coord.annodbg
# with --relax-arity 32 into ${DIR}/annotation_0.relaxed.
# Both jobs log to rd_brwt_coord / relax_rd_brwt_coord files suffixed _0.
K=31
DIR=~ /metagenome/data/hifi_sra/viruses_hifi_data/mtg/joint;
METAGRAPH=~ /projects/projects2014-metagenome/metagraph/build_dna5/metagraph;
bsub -J " rd_brwt_coord_${K} " \
-o ${DIR} /logs/rd_brwt_coord_joint_${K} _0.lsf \
-W 120:00 \
-n 36 -R " rusage[mem=19000] span[hosts=1]" \
" find ${DIR} /rd_columns -name \" *.column.annodbg\" \
| /usr/bin/time -v $METAGRAPH transform_anno -v -p 72 \
--anno-type row_diff_brwt_coord \
--parallel-nodes 10 \
--disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
-i ${DIR} /graph.dbg \
-o ${DIR} /annotation_0" ;
bsub -J " relax_rd_brwt_coord_${K} " \
-w " rd_brwt_coord_${K} " \
-o ${DIR} /logs/relax_rd_brwt_coord_joint_${K} _0.lsf \
-W 72:00 \
-n 36 -R " rusage[mem=19000] span[hosts=1]" \
" /usr/bin/time -v $METAGRAPH relax_brwt -v -p 72 \
--relax-arity 32 \
${DIR} /annotation_0.row_diff_brwt_coord.annodbg \
-o ${DIR} /annotation_0.relaxed" ;
Index with k-mer coordinates (lossless read compression)
Compress with tools for sequence compression (gzip, Spring)
# Baseline compression of the Kingsford FASTA files with gzip -9.
# The commented-out lines show how the output directories and list.txt
# (one *.fasta.gz path per line) were originally created.
# Submits one array element per line of $list. Each element decompresses its
# input three times and writes, under $DIR/compressed:
#   <id>.fasta.gz            sequence lines only, all newlines removed;
#   <id>.fasta_headers.gz    header lines only, all newlines removed;
#   <id>_no_header.fasta.gz  every header replaced by a bare ">" and the
#                            sequence lines of each record joined onto a
#                            single line (done by the awk program).
# /usr/bin/time -v wraps only the zcat stage of each pipeline.
DIR=~ /metagenome/data/kingsford;
# mkdir $DIR/compressed;
# mkdir $DIR/compressed/logs;
# find ~/metagenome/raw_data/kingsford/data_fasta -name "*.fasta.gz" > $DIR/compressed/list.txt;
list=$DIR /compressed/list.txt;
bsub -J " gzip[1-$( cat $list | wc -l) ]" \
-o ~ /metagenome/data/kingsford/compressed/logs/gzip.lsf \
-W 24:00 \
-n 1 -R " rusage[mem=10000] span[hosts=1]" \
" file=\\\$ (sed -n \$ {LSB_JOBINDEX}p ${list} ); \
id=\\\$ (basename \\\$ {file%.fasta.gz}); \
/usr/bin/time -v zcat \\\$ file | sed '/^>/d' | tr -d '\n' | gzip -9 > $DIR /compressed/\\\$ {id}.fasta.gz; \
/usr/bin/time -v zcat \\\$ file | sed '/^>/!d' | tr -d '\n' | gzip -9 > $DIR /compressed/\\\$ {id}.fasta_headers.gz; \
/usr/bin/time -v zcat \\\$ file | sed 's/^>.*/>/' | awk '/^>/ { if(NR>1) print \\\"\\\" ; printf(\\\" %s\n\\\" ,\\\$ 0); next; } { printf(\\\" %s\\\" ,\\\$ 0);} END {printf(\\\" \n\\\" );}' | gzip -9 > $DIR /compressed/\\\$ {id}_no_header.fasta.gz"
# Count the number of sequence characters in every input file.
# One array element per line of $list (DIR and list come from the gzip step);
# the array waits for the gzip array via -w "done(gzip[*])". Each element
# strips header lines and newlines from its input and stores the byte count
# from `wc -c` in $DIR/compressed/<id>.num_bp.
bsub -J " count_bp[1-$( cat $list | wc -l) ]" \
-w " done(gzip[*])" \
-o ~ /metagenome/data/kingsford/compressed/logs/count_bp.lsf \
-W 4:00 \
-n 1 -R " rusage[mem=10000] span[hosts=1]" \
" file=\\\$ (sed -n \$ {LSB_JOBINDEX}p ${list} ); \
id=\\\$ (basename \\\$ {file%.fasta.gz}); \
/usr/bin/time -v zcat \\\$ file | sed '/^>/d' | tr -d '\n' | wc -c > $DIR /compressed/\\\$ {id}.num_bp;"
# Compress each header-less FASTA with Spring.
# One array element per line of $list. The element recreates a private
# working directory <id>_no_header_temp (removed first if it exists), runs
# `spring -c -g --fasta-input` on <id>_no_header.fasta.gz with 36 threads
# (the job reserves 18 slots), writes <id>_no_header.spring, and deletes the
# working directory afterwards.
bsub -J " spring[1-$( cat ${list} | wc -l) ]" \
-o ~ /metagenome/data/kingsford/compressed/logs/noheader_spring.lsf \
-W 120:00 \
-n 18 -R " rusage[mem=10000] span[hosts=1]" \
" file=\\\$ (sed -n \$ {LSB_JOBINDEX}p ${list} ); \
id=\\\$ (basename \\\$ {file%.fasta.gz}); \
rm -rf $DIR /compressed/\\\$ {id}_no_header_temp; \
mkdir $DIR /compressed/\\\$ {id}_no_header_temp; \
/usr/bin/time -v spring -c -g --fasta-input -t 36 \
-i $DIR /compressed/\\\$ {id}_no_header.fasta.gz \
-w $DIR /compressed/\\\$ {id}_no_header_temp \
-o $DIR /compressed/\\\$ {id}_no_header.spring; \
rm -r $DIR /compressed/\\\$ {id}_no_header_temp"
# Verify the Spring archives by round-tripping them.
# One array element per line of $list. The element decompresses
# <id>_no_header.spring into <id>_no_header.spring.decompressed (any stale
# copy and the working directory are removed first), diffs it against the
# gunzipped <id>_no_header.fasta.gz, and appends "<number of diff output
# lines> <id>" to check_spring.out — a count of 0 means the two files are
# identical. The working directory is deleted at the end; the decompressed
# file is left in place.
bsub -J " check_spring[1-$( cat ${list} | wc -l) ]" \
-o ~ /metagenome/data/kingsford/compressed/logs/check_spring.lsf \
-W 24:00 \
-n 4 -R " rusage[mem=10000] span[hosts=1]" \
" file=\\\$ (sed -n \$ {LSB_JOBINDEX}p ${list} ); \
id=\\\$ (basename \\\$ {file%.fasta.gz}); \
rm -rf $DIR /compressed/\\\$ {id}_no_header_temp; \
mkdir $DIR /compressed/\\\$ {id}_no_header_temp; \
rm -rf $DIR /compressed/\\\$ {id}_no_header.spring.decompressed; \
/usr/bin/time -v spring -d -t 8 \
-i $DIR /compressed/\\\$ {id}_no_header.spring \
-w $DIR /compressed/\\\$ {id}_no_header_temp \
-o $DIR /compressed/\\\$ {id}_no_header.spring.decompressed; \
D=\\\$ (diff <(zcat $DIR /compressed/\\\$ {id}_no_header.fasta.gz) $DIR /compressed/\\\$ {id}_no_header.spring.decompressed | wc -l); \
echo \\\$ D \\\$ {id} >> ~/metagenome/data/kingsford/compressed/logs/check_spring.out; \
rm -r $DIR /compressed/\\\$ {id}_no_header_temp"
# Per-sample coordinate indexes for the Kingsford data set (one graph and one
# coordinate annotation per input file).
# WARNING: the setup lines delete and recreate $DIR (rm -rf) before submitting.
# The commented-out loop shows how a reduced list (list2.txt) of samples
# lacking rd_columns/annotation.column.annodbg.coords can be produced.
#
# Four bsub headers follow: the initial array plus three reruns that raise
# threads, slots, memory and wall time and depend (-w "exit(...)") on the
# earlier arrays having exited with failure.
# NOTE(review): each of the first three headers ends in a line continuation
# that runs straight into the next `NUM_THREADS=36;` line, so as written only
# the last header (rerun3) is attached to the quoted job body — presumably the
# headers are alternatives meant to be pasted together with the body; confirm
# before executing this section as a script.
#
# Job body, per array element:
#   - pick line $LSB_JOBINDEX of $list as the input file;
#   - read L, the trailing integer on the file's first line (presumably the
#     read length — TODO confirm), and exit successfully when L <= 30;
#   - recreate ${DIR}/<basename of file> with a logs/ subdirectory;
#   - build the graph, derive graph_small (--state small, --index-ranges 1),
#     annotate with --coordinates, then run row-diff stages 0, 1 and 2 with
#     --num-kmers-in-seq L-K+1; each step's stderr goes to its own log file;
#   - delete the original column annotation, its .coords file and the
#     row-diff helper files (row_count, row_reduction, succ/pred).
K=31
DIR=~ /metagenome/finished_projects/counting_dbg/kingsford_${K} _coordinates_fork_opt_new;
# git checkout 7a9027fa8c6c29742c7885f77f90d414c39c5b53
rm -rf $DIR ;
mkdir $DIR ;
mkdir $DIR /logs;
list=~ /metagenome/finished_projects/counting_dbg/kingsford_compressed/list.txt;
# rm ~/metagenome/data/kingsford/compressed/list2.txt;
# for file in $(cat ${list}); do
# if [[ ! -f ${DIR}/$(basename $file)/rd_columns/annotation.column.annodbg.coords ]];
# then echo $file >> ~/metagenome/data/kingsford/compressed/list2.txt;
# fi;
# done
# list=~/metagenome/data/kingsford/compressed/list2.txt;
METAGRAPH=~ /projects/projects2014-metagenome/metagraph/build_dna5_test/metagraph;
NUM_THREADS=8;
bsub -J " build_graph_${K} _[1-$( cat $list | wc -l) ]" \
-o ${DIR} /logs/construct_${K} .lsf \
-W 24:00 \
-n 4 -R " rusage[mem=19000] span[hosts=1]" \
NUM_THREADS=36;
bsub -J " build_graph_${K} _rerun_[1-$( cat $list | wc -l) ]" \
-o ${DIR} /logs/construct_${K} _rerun.lsf \
-w " exit(build_graph_${K} _[*])" \
-W 24:00 \
-n 23 -R " rusage[mem=19000] span[hosts=1]" \
NUM_THREADS=36;
bsub -J " build_graph_${K} _rerun2_[1-$( cat $list | wc -l) ]" \
-o ${DIR} /logs/construct_${K} _rerun2.lsf \
-w " exit(build_graph_${K} _[*]) && exit(build_graph_${K} _rerun_[*])" \
-W 48:00 \
-n 36 -R " rusage[mem=25000] span[hosts=1]" \
NUM_THREADS=36;
bsub -J " build_graph_${K} _rerun3_[1-$( cat $list | wc -l) ]" \
-o ${DIR} /logs/construct_${K} _rerun3.lsf \
-w " exit(build_graph_${K} _[*]) && exit(build_graph_${K} _rerun_[*]) && exit(build_graph_${K} _rerun2_[*])" \
-W 48:00 \
-n 36 -R " rusage[mem=40000] span[hosts=1]" \
"
file=\\\$ (sed -n \$ {LSB_JOBINDEX}p $list ); \
L=\\\$ (zless \\\$ file | head -n 1 | grep -Eo '[0-9]+$'); \
if [[ \\\$ L -le 30 ]]; then exit 0; fi; \
id=\\\$ (basename \\\$ file); \
rm -r ${DIR} /\\\$ id; \
mkdir ${DIR} /\\\$ id; \
mkdir ${DIR} /\\\$ id/logs; \
/usr/bin/time -v $METAGRAPH build -v -p $NUM_THREADS \
-k ${K} \
--mem-cap-gb 40 \
--disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
-o ${DIR} /\\\$ {id}/graph \
\\\$ file \
2> ${DIR} /\\\$ {id}/logs/build.log; \
/usr/bin/time -v $METAGRAPH transform -v -p $NUM_THREADS \
--state small \
--index-ranges 1 \
-o ${DIR} /\\\$ {id}/graph_small \
${DIR} /\\\$ {id}/graph.dbg \
2> ${DIR} /\\\$ {id}/logs/transform.log; \
/usr/bin/time -v $METAGRAPH annotate -v -p $NUM_THREADS \
--coordinates \
--anno-filename \
--mem-cap-gb 40 \
--disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
-i ${DIR} /\\\$ {id}/graph.dbg \
-o ${DIR} /\\\$ {id}/annotation \
\\\$ file \
2> ${DIR} /\\\$ {id}/logs/annotate.log; \
mkdir ${DIR} /\\\$ {id}/rd_columns; \
/usr/bin/time -v $METAGRAPH transform_anno -v -p $NUM_THREADS \
--anno-type row_diff --coordinates \
--max-path-length 200 \
--row-diff-stage 0 \
--mem-cap-gb 200 \
--num-kmers-in-seq \\\$ ((\\\$ {L}-${K} +1)) \
--disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
-i ${DIR} /\\\$ {id}/graph.dbg \
-o ${DIR} /\\\$ {id}/rd_columns/out \
${DIR} /\\\$ {id}/annotation.column.annodbg \
2> ${DIR} /\\\$ {id}/logs/coord_rd_1.log; \
/usr/bin/time -v $METAGRAPH transform_anno -v -p $NUM_THREADS \
--anno-type row_diff --coordinates \
--max-path-length 200 \
--row-diff-stage 1 \
--mem-cap-gb 200 \
--num-kmers-in-seq \\\$ ((\\\$ {L}-${K} +1)) \
--disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
-i ${DIR} /\\\$ {id}/graph.dbg \
-o ${DIR} /\\\$ {id}/rd_columns/out \
${DIR} /\\\$ {id}/annotation.column.annodbg \
2> ${DIR} /\\\$ {id}/logs/coord_rd_2.log; \
/usr/bin/time -v $METAGRAPH transform_anno -v -p $NUM_THREADS \
--anno-type row_diff --coordinates \
--max-path-length 200 \
--row-diff-stage 2 \
--mem-cap-gb 200 \
--num-kmers-in-seq \\\$ ((\\\$ {L}-${K} +1)) \
--disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
-i ${DIR} /\\\$ {id}/graph.dbg \
-o ${DIR} /\\\$ {id}/rd_columns/out \
${DIR} /\\\$ {id}/annotation.column.annodbg \
2> ${DIR} /\\\$ {id}/logs/coord_rd_3.log; \
rm ${DIR} /\\\$ {id}/annotation.column.annodbg; \
rm ${DIR} /\\\$ {id}/annotation.column.annodbg.coords; \
rm ${DIR} /\\\$ {id}/rd_columns/annotation.row_count; \
rm ${DIR} /\\\$ {id}/rd_columns/annotation.row_reduction; \
rm ${DIR} /\\\$ {id}/graph.dbg.succ; \
rm ${DIR} /\\\$ {id}/graph.dbg.succ_boundary; \
rm ${DIR} /\\\$ {id}/graph.dbg.pred; \
rm ${DIR} /\\\$ {id}/graph.dbg.pred_boundary" ; \
# Convert a FASTQ file into a header-less, gzip -9 compressed FASTA.
# For each listed file one LSF job is submitted (its log is discarded via
# -o /dev/null). The sed program keeps lines 1 and 2 of every 4-line FASTQ
# record, turning the leading "@" of the header into ">"; the second sed then
# reduces every header to a bare ">". The result is written to
# ~/metagenome/data/coordinates_K/data/<basename without .gz>_no_header.fasta.gz.
# The path is single-quoted, so its "~" is not expanded by the submitting
# shell but passed on inside the job command.
for file in ' ~/metagenome/data/coordinates_K/data/SRR13577847_subreads.fastq.gz' ; do
bsub -J " gzip" \
-o /dev/null \
-W 24:00 \
-n 1 -R " rusage[mem=10000] span[hosts=1]" \
" file=${file} ; \
id=\\\$ (basename \\\$ {file%.gz}); \
/usr/bin/time -v zcat ${file} | sed -n '1~4s/^@/>/p;2~4p' | sed 's/^>.*/>/' | gzip -9 > ~/metagenome/data/coordinates_K/data/\\\$ {id}_no_header.fasta.gz"
done
# Build a coordinate-annotated index for each of ten read sets, for every k in
# the inner brace list (currently only 31).
# For each (file, K) pair the loop creates ~/metagenome/data/coordinates_K/$K
# with a logs/ subdirectory and submits one LSF job (all jobs share the name
# build_graph_${K} and the log construct_${K}.lsf) that:
#   - creates ${DIR}/<basename of file>/ and a logs/ subdirectory (the steps
#     below do not redirect their output there);
#   - builds the graph, derives graph_small (--state small, --index-ranges 3);
#   - annotates the graph with --coordinates --anno-filename;
#   - runs row-diff stages 0, 1 and 2 into rd_columns/;
#   - deletes the original column annotation, its .coords file and the
#     row-diff helper files (row_count, row_reduction, succ/pred).
# The commented-out bsub line is an earlier array-style header.
for file in ' ~/metagenome/finished_projects/counting_dbg/coordinates_K/data/SRR11304401_subreads.fastq.gz' \
' ~/metagenome/finished_projects/counting_dbg/coordinates_K/data/SRR13684276.fastq.gz' \
' ~/metagenome/finished_projects/counting_dbg/coordinates_K/data/SRR13577847_subreads.fastq.gz' \
' /cluster/work/grlab/projects/metagenome/data/alignment/completeness/wgs_samples/fq/fq2/SRR4063132/SRR4063132_subreads.fastq.gz' \
' /cluster/work/grlab/projects/metagenome/data/alignment/completeness/wgs_samples/fq/fq2/SRR386922/SRR386922_subreads.fastq.gz' \
' /cluster/work/grlab/projects/metagenome/data/alignment/completeness/wgs_samples/fq/fq2/SRR3747284/SRR3747284_subreads.fastq.gz' \
' /cluster/work/grlab/projects/metagenome/data/alignment/completeness/wgs_samples/fq/fq2/SRR4235456/SRR4235456_subreads.fastq.gz' \
' /cluster/work/grlab/projects/metagenome/data/alignment/completeness/wgs_samples/fq/fq2/SRR3747411/SRR3747411_subreads.fastq.gz' \
' ~/metagenome/finished_projects/counting_dbg/kingsford_compressed/SRR805801_no_header.fasta.gz' \
' /cluster/work/grlab/projects/metagenome/raw_data/human/HG002/PacBio_SequelII_CCS_11kb/m64011_181218_235052.fastq.gz' ; do
for K in {31,}; do
DIR=~ /metagenome/data/coordinates_K/$K ;
# rm -rf $DIR;
mkdir -p $DIR ;
mkdir -p $DIR /logs;
METAGRAPH=~ /projects/projects2014-metagenome/metagraph/build_dna5_test/metagraph;
NUM_THREADS=16;
# bsub -J "construct_${K}_[1-10]" \
bsub -J " build_graph_${K} " \
-o ${DIR} /logs/construct_${K} .lsf \
-W 24:00 \
-n 8 -R " rusage[mem=40000] span[hosts=1]" \
"
file=$file ; \
id=\\\$ (basename \\\$ file); \
mkdir ${DIR} /\\\$ id; \
mkdir ${DIR} /\\\$ id/logs; \
/usr/bin/time -v $METAGRAPH build -v -p $NUM_THREADS \
-k ${K} \
--mem-cap-gb 40 \
--disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
-o ${DIR} /\\\$ {id}/graph \
\\\$ file; \
/usr/bin/time -v $METAGRAPH transform -v -p $NUM_THREADS \
--state small \
--index-ranges 3 \
-o ${DIR} /\\\$ {id}/graph_small \
${DIR} /\\\$ {id}/graph.dbg; \
/usr/bin/time -v $METAGRAPH annotate -v -p $NUM_THREADS \
--coordinates \
--anno-filename \
-i ${DIR} /\\\$ {id}/graph.dbg \
-o ${DIR} /\\\$ {id}/annotation \
\\\$ file; \
mkdir ${DIR} /\\\$ {id}/rd_columns; \
/usr/bin/time -v $METAGRAPH transform_anno -v -p $NUM_THREADS \
--anno-type row_diff --coordinates \
--max-path-length 200 \
--row-diff-stage 0 \
--mem-cap-gb 200 \
--disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
-i ${DIR} /\\\$ {id}/graph.dbg \
-o ${DIR} /\\\$ {id}/rd_columns/out \
${DIR} /\\\$ {id}/annotation.column.annodbg; \
/usr/bin/time -v $METAGRAPH transform_anno -v -p $NUM_THREADS \
--anno-type row_diff --coordinates \
--max-path-length 200 \
--row-diff-stage 1 \
--mem-cap-gb 200 \
--disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
-i ${DIR} /\\\$ {id}/graph.dbg \
-o ${DIR} /\\\$ {id}/rd_columns/out \
${DIR} /\\\$ {id}/annotation.column.annodbg; \
/usr/bin/time -v $METAGRAPH transform_anno -v -p $NUM_THREADS \
--anno-type row_diff --coordinates \
--max-path-length 200 \
--row-diff-stage 2 \
--mem-cap-gb 200 \
--disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
-i ${DIR} /\\\$ {id}/graph.dbg \
-o ${DIR} /\\\$ {id}/rd_columns/out \
${DIR} /\\\$ {id}/annotation.column.annodbg; \
rm ${DIR} /\\\$ {id}/annotation.column.annodbg; \
rm ${DIR} /\\\$ {id}/annotation.column.annodbg.coords; \
rm ${DIR} /\\\$ {id}/rd_columns/annotation.row_count; \
rm ${DIR} /\\\$ {id}/rd_columns/annotation.row_reduction; \
rm ${DIR} /\\\$ {id}/graph.dbg.succ; \
rm ${DIR} /\\\$ {id}/graph.dbg.succ_boundary; \
rm ${DIR} /\\\$ {id}/graph.dbg.pred; \
rm ${DIR} /\\\$ {id}/graph.dbg.pred_boundary" ; \
done
done
# RefSeq fungi subset: directory setup and size statistics.
# $list names the input files (one gzip-compressed path per line).
# NOTE(review): $DIR here points below finished_projects/counting_dbg while
# the commands that follow hard-code ~/metagenome/data/refseq_fungi_coord —
# presumably the data was moved; confirm which location is current.
list=~ /metagenome/data/row_diff/subsets/refseq/8.txt
DIR=~ /metagenome/finished_projects/counting_dbg/refseq_fungi_coord;
mkdir $DIR ;
mkdir $DIR /logs;
# Recompress every input with gzip -9 next to the original, as <file>.9
# (100 processes in parallel).
cat ~ /metagenome/data/row_diff/subsets/refseq/8.txt | xargs -P 100 -I {} sh -c " zcat {} | gzip -9 > {}.9"
# Collect every line containing ">" into one headers.txt; 200 processes append
# concurrently, so the order of headers in the file is not deterministic.
cat ~ /metagenome/data/row_diff/subsets/refseq/8.txt | xargs -P 200 -I {} sh -c " zcat {} | grep '>' >> ~/metagenome/data/refseq_fungi_coord/headers.txt"
# Total number of sequence characters: per-file count of non-header bytes
# (newlines removed), summed with awk.
cat ~ /metagenome/data/row_diff/subsets/refseq/8.txt | xargs -P 200 -I {} sh -c " zcat {} | grep -v '>' | tr -d '\n' | wc -c" | awk " {sum+=\$ 1}END{print sum}"
# List the recompressed files and pipe the listing into `sizeb`, a command not
# defined in this part of the file (presumably a helper that sums sizes —
# TODO confirm).
cat ~ /metagenome/data/row_diff/subsets/refseq/8.txt | xargs -I {} sh -c " ls -l {}.9" | sizeb
# Sum the 7th space-separated field of every "Number of coordinates" line in
# the annotate log.
cat ~ /metagenome/data/refseq_fungi_coord/logs/annotate_graph.lsf | grep " Number of coordinates" | cut -d' ' -f7 | awk " {sum+=\$ 1}END{print sum}"
# RefSeq fungi: build the graph and annotate it with coordinates.
# Job fungi_graph pipes ${list} (set in the preceding statistics section)
# into `metagraph build` with k=31 and writes $DIR/graph.
# Job annotate_fungi waits for fungi_graph (-w) and annotates the graph with
# --anno-header --separately --coordinates, writing columns to ${DIR}/columns.
# Both jobs use -oo, which overwrites the log file instead of appending.
# Thread counts passed to -p (30 and 36) are twice the reserved slots.
DIR=~ /metagenome/data/refseq_fungi_coord;
METAGRAPH=~ /projects/projects2014-metagenome/metagraph/build_release/metagraph;
bsub -J " fungi_graph" \
-oo $DIR /logs/build_graph.lsf \
-W 12:00 \
-n 15 -R " rusage[mem=19000] span[hosts=1]" \
" cat ${list} \
| /usr/bin/time -v $METAGRAPH build -v \
-k 31 \
-p 30 \
--mem-cap-gb 300 \
--disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
-o $DIR /graph" ; \
mkdir $DIR /columns;
bsub -J " annotate_fungi" \
-w " fungi_graph" \
-oo ${DIR} /logs/annotate_graph.lsf \
-W 12:00 \
-n 18 -R " rusage[mem=15000] span[hosts=1]" \
" cat ${list} \
| /usr/bin/time -v $METAGRAPH annotate \
-i $DIR /graph.dbg \
--anno-header \
--separately \
--coordinates \
-o ${DIR} /columns \
-p 36" ; \
# RefSeq fungi: row-diff transform of the coordinate columns.
# Setup creates ${DIR}/rd and ${DIR}/rd/rd_columns and symlinks the graph into
# rd/ so the files the transform writes next to the graph land in rd/.
# Three jobs (fungi_rd_0, _1, _2) run `transform_anno --anno-type row_diff
# --coordinates` with --row-diff-stage 0, 1 and 2; each feeds every
# *.column.annodbg found under ${DIR}/columns via stdin and writes to
# ${DIR}/rd/rd_columns/out. Stages 1 and 2 depend on the previous job (-w).
# This section switches METAGRAPH to the build_master binary.
DIR=~ /metagenome/data/refseq_fungi_coord;
METAGRAPH=~ /projects/projects2014-metagenome/metagraph/build_master/metagraph;
mkdir $DIR /rd;
mkdir $DIR /rd/rd_columns;
ln -s $DIR /graph.dbg ${DIR} /rd/graph.dbg;
DIR=~ /metagenome/data/refseq_fungi_coord;
bsub -J " fungi_rd_0" \
-oo ${DIR} /logs/rd_coord_0.lsf \
-W 24:00 \
-n 36 -R " rusage[mem=19000] span[hosts=1]" \
" find ${DIR} /columns -name \" *.column.annodbg\" \
| /usr/bin/time -v $METAGRAPH transform_anno -v \
--anno-type row_diff \
--coordinates \
--row-diff-stage 0 \
--mem-cap-gb 500 \
--disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
-i ${DIR} /rd/graph.dbg \
-o ${DIR} /rd/rd_columns/out \
-p 72" ;
DIR=~ /metagenome/data/refseq_fungi_coord;
bsub -J " fungi_rd_1" \
-w " fungi_rd_0" \
-oo ${DIR} /logs/rd_coord_1.lsf \
-W 24:00 \
-n 36 -R " rusage[mem=19000] span[hosts=1]" \
" find ${DIR} /columns -name \" *.column.annodbg\" \
| /usr/bin/time -v $METAGRAPH transform_anno -v \
--anno-type row_diff \
--coordinates \
--row-diff-stage 1 \
--mem-cap-gb 500 \
--disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
-i ${DIR} /rd/graph.dbg \
-o ${DIR} /rd/rd_columns/out \
-p 72" ;
DIR=~ /metagenome/data/refseq_fungi_coord;
bsub -J " fungi_rd_2" \
-w " fungi_rd_1" \
-oo ${DIR} /logs/rd_coord_2.lsf \
-W 24:00 \
-n 36 -R " rusage[mem=19000] span[hosts=1]" \
" find ${DIR} /columns -name \" *.column.annodbg\" \
| /usr/bin/time -v $METAGRAPH transform_anno -v \
--anno-type row_diff \
--coordinates \
--row-diff-stage 2 \
--mem-cap-gb 500 \
--disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
-i ${DIR} /rd/graph.dbg \
-o ${DIR} /rd/rd_columns/out \
-p 72" ;
# RefSeq fungi: final annotation formats built from the row-diff columns in
# ${DIR}/rd/rd_columns.
#   fungi_rd_coord            row_diff_coord annotation; waits for fungi_rd_2.
#   fungi_rd_brwt_coord       row_diff_brwt_coord annotation built with
#                             --greedy --fast --subsample 1000000; submitted
#                             without a -w dependency.
#   fungi_rd_brwt_coord_relax relax_brwt with --relax-arity 8096 on
#                             annotation.row_diff_brwt_coord.annodbg, written
#                             to annotation.relaxed; also without -w.
# The first two jobs both pass -o ${DIR}/annotation; the relax step reads the
# second one's result under the .row_diff_brwt_coord.annodbg suffix.
# The first two jobs use the build_test binary, the relax job build_release.
METAGRAPH=~ /projects/projects2014-metagenome/metagraph/build_test/metagraph;
DIR=~ /metagenome/data/refseq_fungi_coord;
bsub -J " fungi_rd_coord" \
-w " fungi_rd_2" \
-oo ${DIR} /logs/rd_coord.lsf \
-W 24:00 \
-n 36 -R " rusage[mem=4000] span[hosts=1]" \
" find ${DIR} /rd/rd_columns -name \" *.column.annodbg\" \
| /usr/bin/time -v $METAGRAPH transform_anno -v \
--anno-type row_diff_coord \
-i ${DIR} /rd/graph.dbg \
-o ${DIR} /annotation \
-p 72" ;
METAGRAPH=~ /projects/projects2014-metagenome/metagraph/build_test/metagraph;
DIR=~ /metagenome/data/refseq_fungi_coord;
bsub -J " fungi_rd_brwt_coord" \
-oo ${DIR} /logs/rd_brwt_coord.lsf \
-W 24:00 \
-n 36 -R " rusage[mem=19000] span[hosts=1]" \
" find ${DIR} /rd/rd_columns -name \" *.column.annodbg\" \
| /usr/bin/time -v $METAGRAPH transform_anno -v \
--anno-type row_diff_brwt_coord \
--greedy --fast --subsample 1000000 \
-i ${DIR} /rd/graph.dbg \
-o ${DIR} /annotation \
-p 72" ;
METAGRAPH=~ /projects/projects2014-metagenome/metagraph/build_release/metagraph;
DIR=~ /metagenome/data/refseq_fungi_coord;
bsub -J " fungi_rd_brwt_coord_relax" \
-oo ${DIR} /logs/rd_brwt_coord_relax.lsf \
-W 24:00 \
-n 36 -R " rusage[mem=19000] span[hosts=1]" \
" /usr/bin/time -v $METAGRAPH relax_brwt -v \
--relax-arity 8096 \
-o ${DIR} /annotation.relaxed \
${DIR} /annotation.row_diff_brwt_coord.annodbg \
-p 72" ;
# Random-counts experiment: index random_1M.fasta.gz with k-mer counts (k=21).
# Creates $DIR and $DIR/logs, then submits a single LSF job that runs, in
# order:
#   1. `build`                      -> ${DIR}/graph
#   2. `transform --state small`    -> ${DIR}/graph_small (--index-ranges 1)
#   3. `annotate --count-kmers`     -> ${DIR}/annotation (32-bit count width)
#   4. `transform_anno int_brwt`    -> ${DIR}/annotation; log in count_brwt.log
#   5. row-diff stages 0, 1, 2 with --count-kmers -> ${DIR}/rd_columns/out
#   6. removal of the row-diff helper files (row_count, row_reduction,
#      succ/pred) that sit next to the graph and in rd_columns
#   7. `transform_anno row_diff_int_brwt` over the row-diff columns
#      -> ${DIR}/annotation; log in count_rd_brwt.log
# Steps 1-3 and 5 send stderr to per-step files under ${DIR}/logs; steps 4 and
# 7 tee their combined output there.
#
# Step 7 reads the columns from ${DIR}/rd_columns and the graph from
# ${DIR}/graph.dbg, i.e. exactly where steps 1 and 5 of this job put them.
# It previously pointed at ${DIR}/rd/rd_columns and ${DIR}/rd/graph.dbg, a
# layout that only exists in the RefSeq fungi section, so find returned
# nothing and the graph path did not exist.
K=21
DIR=~ /metagenome/data/coordinates_K/data/random_counts;
# rm -rf $DIR;
mkdir $DIR ;
mkdir $DIR /logs;
METAGRAPH=~ /projects/projects2014-metagenome/metagraph/build_release/metagraph;
NUM_THREADS=36;
bsub -J " build_graph" \
-oo ${DIR} /logs/construct.lsf \
-W 24:00 \
-n 36 -R " rusage[mem=19000] span[hosts=1]" \
" /usr/bin/time -v $METAGRAPH build -v -p $NUM_THREADS \
-k ${K} \
--mem-cap-gb 40 \
--disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
-o ${DIR} /graph \
${DIR} /random_1M.fasta.gz \
2> ${DIR} /logs/build.log; \
/usr/bin/time -v $METAGRAPH transform -v -p $NUM_THREADS \
--state small \
--index-ranges 1 \
-o ${DIR} /graph_small \
${DIR} /graph.dbg \
2> ${DIR} /logs/transform.log; \
/usr/bin/time -v $METAGRAPH annotate -v -p $NUM_THREADS \
--count-kmers \
--count-width 32 \
--anno-filename \
--mem-cap-gb 40 \
--disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
-i ${DIR} /graph.dbg \
-o ${DIR} /annotation \
${DIR} /random_1M.fasta.gz \
2> ${DIR} /logs/annotate.log; \
/usr/bin/time -v $METAGRAPH transform_anno -v \
--anno-type int_brwt \
--greedy --fast --subsample 1000000 \
-o ${DIR} /annotation \
${DIR} /annotation.column.annodbg \
-p 72 --parallel-nodes 10 \
2>&1 | tee ${DIR} /logs/count_brwt.log; \
mkdir ${DIR} /rd_columns; \
/usr/bin/time -v $METAGRAPH transform_anno -v -p $NUM_THREADS \
--anno-type row_diff --count-kmers \
--max-path-length 200 \
--row-diff-stage 0 \
--mem-cap-gb 200 \
--disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
-i ${DIR} /graph.dbg \
-o ${DIR} /rd_columns/out \
${DIR} /annotation.column.annodbg \
2> ${DIR} /logs/coord_rd_1.log; \
/usr/bin/time -v $METAGRAPH transform_anno -v -p $NUM_THREADS \
--anno-type row_diff --count-kmers \
--max-path-length 200 \
--row-diff-stage 1 \
--mem-cap-gb 200 \
--disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
-i ${DIR} /graph.dbg \
-o ${DIR} /rd_columns/out \
${DIR} /annotation.column.annodbg \
2> ${DIR} /logs/coord_rd_2.log; \
/usr/bin/time -v $METAGRAPH transform_anno -v -p $NUM_THREADS \
--anno-type row_diff --count-kmers \
--max-path-length 200 \
--row-diff-stage 2 \
--mem-cap-gb 200 \
--disk-swap ~/metagenome/scratch/nobackup/stripe_1 \
-i ${DIR} /graph.dbg \
-o ${DIR} /rd_columns/out \
${DIR} /annotation.column.annodbg \
2> ${DIR} /logs/coord_rd_3.log; \
rm ${DIR} /rd_columns/annotation.row_count; \
rm ${DIR} /rd_columns/annotation.row_reduction; \
rm ${DIR} /graph.dbg.succ; \
rm ${DIR} /graph.dbg.succ_boundary; \
rm ${DIR} /graph.dbg.pred; \
rm ${DIR} /graph.dbg.pred_boundary; \
find ${DIR} /rd_columns -name \" *.column.annodbg\" \
| /usr/bin/time -v $METAGRAPH transform_anno -v \
--anno-type row_diff_int_brwt \
--greedy --fast --subsample 1000000 \
-i ${DIR} /graph.dbg \
-o ${DIR} /annotation \
-p 72 --parallel-nodes 10 \
2>&1 | tee ${DIR} /logs/count_rd_brwt.log" ; \