From f59af368010190cc1084306131ed842c719ce940 Mon Sep 17 00:00:00 2001 From: mencian Date: Tue, 12 Nov 2024 04:26:08 -0600 Subject: [PATCH] cd-hit: add osx-arm64 build --- recipes/cd-hit/build.sh | 20 +- recipes/cd-hit/cd-hit.patch | 1093 +++++++++++++++++++++++++++++++++++ recipes/cd-hit/meta.yaml | 28 +- 3 files changed, 1123 insertions(+), 18 deletions(-) create mode 100644 recipes/cd-hit/cd-hit.patch diff --git a/recipes/cd-hit/build.sh b/recipes/cd-hit/build.sh index e1aab78924f19..87fadaf02c43c 100644 --- a/recipes/cd-hit/build.sh +++ b/recipes/cd-hit/build.sh @@ -1,26 +1,28 @@ -#!/bin/sh +#!/bin/bash -export CFLAGS="-I$PREFIX/include" -export CPPFLAGS="-I$PREFIX/include" -export CXXFLAGS="-I$PREFIX/include" -export LDFLAGS="-L$PREFIX/lib" +mkdir -p $PREFIX/bin + +export CFLAGS="${CFLAGS} -O3" +export CPPFLAGS="${CPPFLAGS} -I$PREFIX/include" +export CXXFLAGS="${CXXFLAGS} -O3 -I$PREFIX/include" +export LDFLAGS="${LDFLAGS} -L$PREFIX/lib" export CPATH=${PREFIX}/include sed -i.bak 's/^CC =$//g' Makefile sed -i.bak 's/^#LDFLAGS.*//g' Makefile - +rm -rf *.bak if [[ "$OSTYPE" == "darwin"* ]]; then #Lines below is commented out until fix provided for OPENMP support on OS X for this program CCFLAGS="$CCFLAGS -Wl,-rpath ${PREFIX}/lib -L${PREFIX}/lib -I${PREFIX}/include -fopenmp" sed -i.bak 's/CCFLAGS = -fopenmp/CCFLAGS += -fopenmp/g' Makefile + rm -rf *.bak LDFLAGS="$LDFLAGS -stdlib=libc++" make CC=$CXX openmp=no MAX_SEQ=1000000 else make CC=$GXX MAX_SEQ=1000000 fi - -mkdir -p $PREFIX/bin -make install PREFIX=$PREFIX/bin + +make install PREFIX="$PREFIX/bin" diff --git a/recipes/cd-hit/cd-hit.patch b/recipes/cd-hit/cd-hit.patch new file mode 100644 index 0000000000000..29ac6a70fd2ec --- /dev/null +++ b/recipes/cd-hit/cd-hit.patch @@ -0,0 +1,1093 @@ +diff --git a/FET.pl b/FET.pl +index 6db320f..bb56529 100755 +--- a/FET.pl ++++ b/FET.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + use Storable; + use strict; +diff --git a/cd-hit-2d-para.pl b/cd-hit-2d-para.pl +index 3cab955..e0c43c7 100755 +--- a/cd-hit-2d-para.pl ++++ b/cd-hit-2d-para.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl -w ++#!/usr/bin/env perl + # ============================================================================= + # CD-HIT + # http://cd-hit.org/ +diff --git a/cd-hit-auxtools/cd-hit-dup-PE-out.pl b/cd-hit-auxtools/cd-hit-dup-PE-out.pl +index bfe5af3..f035229 100755 +--- a/cd-hit-auxtools/cd-hit-dup-PE-out.pl ++++ b/cd-hit-auxtools/cd-hit-dup-PE-out.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + my $script_name = $0; + my $script_dir = $0; +diff --git a/cd-hit-clstr_2_blm8.pl b/cd-hit-clstr_2_blm8.pl +index 42f1e57..cb75ffb 100755 +--- a/cd-hit-clstr_2_blm8.pl ++++ b/cd-hit-clstr_2_blm8.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + # + + my $rep; +@@ -23,7 +23,10 @@ while($ll=<>){ + else { + push(@non_reps, $id); + my @lls = split(/\s+/, $ll); +- my ($a, $iden) = split(/\//, $lls[-1]); ++ # my ($a, $iden) = split(/\//, $lls[-1]); #### bug, with cd-hit-est-2d, there are +/- sign e.g. 10:1029:30:1042/+/97.35% ++ my @mms = split(/\//, $lls[-1]); ++ my $a = $mms[0]; ++ my $iden = $mms[-1]; + chop($iden); ### removing % sign + my ($qb, $qe, $sb, $se) = split(/:/, $a); + my $alnln = $qe-$qb+1; +diff --git a/cd-hit-div.pl b/cd-hit-div.pl +index e349394..db8d942 100755 +--- a/cd-hit-div.pl ++++ b/cd-hit-div.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + #not like cd-hit-div, this script do not sort input + #or throw away seq +diff --git a/cd-hit-para.pl b/cd-hit-para.pl +index 33f1a1b..6ee3ca1 100755 +--- a/cd-hit-para.pl ++++ b/cd-hit-para.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl -w ++#!/usr/bin/env perl + # ============================================================================= + # CD-HIT + # http://cd-hit.org/ +diff --git a/clstr2tree.pl b/clstr2tree.pl +index 73fd37a..56d9fe2 100755 +--- a/clstr2tree.pl ++++ b/clstr2tree.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + $clstr = shift; + $fr = shift; # for nr80.clstr $fr = 0.8 +diff --git a/clstr2txt.pl b/clstr2txt.pl +index 902b083..127537e 100755 +--- a/clstr2txt.pl ++++ b/clstr2txt.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + my $no = 0; + my $clstr_no = ""; +diff --git a/clstr2xml.pl b/clstr2xml.pl +index 10d828c..ba8264a 100755 +--- a/clstr2xml.pl ++++ b/clstr2xml.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + #usage: clstr_xml.pl [-len|-size] level1.clstr [level2.clstr level3.clstr ...] + #purpose: to create xml file from cd-hit or hierarchical cd-hit(h-cd-hit) results +diff --git a/clstr_cut.pl b/clstr_cut.pl +index 498f180..ae0264c 100755 +--- a/clstr_cut.pl ++++ b/clstr_cut.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + #keep only top $no proteins in cluster + +diff --git a/clstr_list.pl b/clstr_list.pl +index 9c6639b..b997402 100755 +--- a/clstr_list.pl ++++ b/clstr_list.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + use Storable; + use strict; +diff --git a/clstr_list_sort.pl b/clstr_list_sort.pl +index e0d20d8..a9bd588 100755 +--- a/clstr_list_sort.pl ++++ b/clstr_list_sort.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + use Storable; + use strict; +diff --git a/clstr_merge.pl b/clstr_merge.pl +index 3fe108e..9186777 100755 +--- a/clstr_merge.pl ++++ b/clstr_merge.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + # the order of clusters need to be identical + my ($master_clstr, @clstr) = @ARGV; +diff --git a/clstr_merge_noorder.pl b/clstr_merge_noorder.pl +index f8acdfc..0852aee 100755 +--- a/clstr_merge_noorder.pl ++++ b/clstr_merge_noorder.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + # order of clusters don't need to be the same + # but then I have to read everything into memory +diff --git a/clstr_quality_eval.pl b/clstr_quality_eval.pl +index 62f2a3d..060ab01 100755 +--- a/clstr_quality_eval.pl ++++ b/clstr_quality_eval.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + ## calculate the sensitivity and specificity of clusters + ## if the input fasta file has pre-defined classification term +diff --git a/clstr_quality_eval_by_link.pl b/clstr_quality_eval_by_link.pl +index 8fba8df..140c05c 100755 +--- a/clstr_quality_eval_by_link.pl ++++ b/clstr_quality_eval_by_link.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + ## calculate the sensitivity and specificity of clusters + ## if the input fasta file has pre-defined classification term +diff --git a/clstr_reduce.pl b/clstr_reduce.pl +index 990f4ad..3621025 100755 +--- a/clstr_reduce.pl ++++ b/clstr_reduce.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + + $file90 = shift; +diff --git a/clstr_renumber.pl b/clstr_renumber.pl +index b542304..c66088d 100755 +--- a/clstr_renumber.pl ++++ b/clstr_renumber.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + $no = 0; + while($ll=<>){ + if ($ll =~ /^>Cluster (\d+)/) { +diff --git a/clstr_rep.pl b/clstr_rep.pl +index 0ebeb88..84b86b3 100755 +--- a/clstr_rep.pl ++++ b/clstr_rep.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + $rep = ""; + $no = 0; +diff --git a/clstr_reps_faa_rev.pl b/clstr_reps_faa_rev.pl +index 80a4a8a..3574b2b 100755 +--- a/clstr_reps_faa_rev.pl ++++ b/clstr_reps_faa_rev.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + # output single fasta file + # for each cluster output at least $cutoff seqs + +diff --git a/clstr_rev.pl b/clstr_rev.pl +index d7efdcc..71134e2 100755 +--- a/clstr_rev.pl ++++ b/clstr_rev.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + # if nr90 from nr100 and + # nr80 from nr90, so I have nr90.clstr and nr80.clstr + # but, in nr80.clstr, some gi numbers whose from nr100 are there +diff --git a/clstr_select.pl b/clstr_select.pl +index 1b168d9..dc70147 100755 +--- a/clstr_select.pl ++++ b/clstr_select.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + #my $by = shift; + my $min; +diff --git a/clstr_select_rep.pl b/clstr_select_rep.pl +index 80c7b7e..f7c38f4 100755 +--- a/clstr_select_rep.pl ++++ b/clstr_select_rep.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + #my $by = shift; + my $min; +diff --git a/clstr_size_histogram.pl b/clstr_size_histogram.pl +index 01ecb63..b726e46 100755 +--- a/clstr_size_histogram.pl ++++ b/clstr_size_histogram.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + if(@ARGV==0){ + print "Usage:\n\tclstr_size_histogram.pl [-bin N] clstr_file\n"; +diff --git a/clstr_size_stat.pl b/clstr_size_stat.pl +index b234b06..ecda7db 100755 +--- a/clstr_size_stat.pl ++++ b/clstr_size_stat.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + if(@ARGV==0){ + print "Usage:\n\tclstr_size_stat.pl clstr_file\n"; +diff --git a/clstr_sort_by.pl b/clstr_sort_by.pl +index 82e9cf8..adb12d8 100755 +--- a/clstr_sort_by.pl ++++ b/clstr_sort_by.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + my $sort_by_what = shift; + $sort_by_what = "no" unless $sort_by_what; +diff --git a/clstr_sort_prot_by.pl b/clstr_sort_prot_by.pl +index 64f19e2..0832b99 100755 +--- a/clstr_sort_prot_by.pl ++++ b/clstr_sort_prot_by.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + my $sort_by = shift; + $sort_by = "len" unless ($sort_by); +diff --git a/clstr_sql_tbl.pl b/clstr_sql_tbl.pl +index f2dba07..68bfd7d 100755 +--- a/clstr_sql_tbl.pl ++++ b/clstr_sql_tbl.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + if(@ARGV==0){ + print "Usage:\n\tclstr_sql_tbl.pl clstr_file tbl_file\n"; +diff --git a/clstr_sql_tbl_sort.pl b/clstr_sql_tbl_sort.pl +index 67d60a8..3dfe9c4 100755 +--- a/clstr_sql_tbl_sort.pl ++++ b/clstr_sql_tbl_sort.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + if(@ARGV==0){ + print "Usage:\n\tclstr_sql_tbl_sort.pl table_file level\n"; +diff --git a/make_multi_seq.pl b/make_multi_seq.pl +index 7b05636..3678654 100755 +--- a/make_multi_seq.pl ++++ b/make_multi_seq.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + #note you have to use "-d 0" in the cd-hit run + #note you better to use "-g 1" in the cd-hit run +diff --git a/plot_2d.pl b/plot_2d.pl +index 418a5cf..91342ca 100755 +--- a/plot_2d.pl ++++ b/plot_2d.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + use Image::Magick; + +diff --git a/plot_len1.pl b/plot_len1.pl +index efcdfe0..e8be6e3 100755 +--- a/plot_len1.pl ++++ b/plot_len1.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + $file90 = shift; + $segs = shift; +diff --git a/psi-cd-hit/cd-hit-div.pl b/psi-cd-hit/cd-hit-div.pl +index e349394..db8d942 100755 +--- a/psi-cd-hit/cd-hit-div.pl ++++ b/psi-cd-hit/cd-hit-div.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + #not like cd-hit-div, this script do not sort input + #or throw away seq +diff --git a/psi-cd-hit/clstr_select_rep.pl b/psi-cd-hit/clstr_select_rep.pl +index b465586..63db0ce 100755 +--- a/psi-cd-hit/clstr_select_rep.pl ++++ b/psi-cd-hit/clstr_select_rep.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + my $by = shift; + my $min; +diff --git a/psi-cd-hit/clstr_select_seq.pl b/psi-cd-hit/clstr_select_seq.pl +index fd7bb8b..598b0e9 100755 +--- a/psi-cd-hit/clstr_select_seq.pl ++++ b/psi-cd-hit/clstr_select_seq.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + my $by = shift; + my $min; +diff --git a/psi-cd-hit/fetch_fasta_by_ids.pl b/psi-cd-hit/fetch_fasta_by_ids.pl +index bfdbb26..9c17504 100755 +--- a/psi-cd-hit/fetch_fasta_by_ids.pl ++++ b/psi-cd-hit/fetch_fasta_by_ids.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + my ($gi_file, $seq_file) = @ARGV; + +diff --git a/psi-cd-hit/fetch_fasta_exclude_ids.pl b/psi-cd-hit/fetch_fasta_exclude_ids.pl +index 90e237e..13d061a 100755 +--- a/psi-cd-hit/fetch_fasta_exclude_ids.pl ++++ b/psi-cd-hit/fetch_fasta_exclude_ids.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + my ($gi_file, $seq_file) = @ARGV; + +diff --git a/psi-cd-hit/psi-2d.pl b/psi-cd-hit/psi-2d.pl +index ab3f655..f3884a3 100755 +--- a/psi-cd-hit/psi-2d.pl ++++ b/psi-cd-hit/psi-2d.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + + + my $script_name = $0; +diff --git a/psi-cd-hit/psi-cd-hit-local-old.pl b/psi-cd-hit/psi-cd-hit-local-old.pl +index f5ab1b1..21fd706 100755 +--- a/psi-cd-hit/psi-cd-hit-local-old.pl ++++ b/psi-cd-hit/psi-cd-hit-local-old.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl -w ++#!/usr/bin/env perl + ################################################################################ + ######### PSI-cd-hit written by Weizhong Li at http://cd-hit.org + ################################################################################ +@@ -1138,7 +1138,7 @@ sub write_remote_perl_script { + + open(REPERL, "> $remote_perl_script") || die; + print REPERL < $remote_perl_script") || die; + print REPERL <){ if (/^@/) {$i++; print ">Sample|$SAMPLE|$i ", substr($_,1); $a=<>; print $a; $a=<>; $a=<>;}}' < $SELF/R1.fq > $SELF/R1.fa & + perl -e '$i=0; while(<>){ if (/^@/) {$i++; print ">Sample|$SAMPLE|$i ", substr($_,1); $a=<>; print $a; $a=<>; $a=<>;}}' < $SELF/R2.fq > $SELF/R2.fa & +- + wait ++gzip $SELF/R1.fa & ++gzip $SELF/R2.fa & ++wait ++ + rm -f $SELF/R1.fq $SELF/R2.fq $SELF/R1-s.fq $SELF/R2-s.fq + ''' + } +@@ -61,41 +65,74 @@ rm -f $SELF/R1.fq $SELF/R2.fq $SELF/R1-s.fq $SELF/R2-s.fq + + NGS_batch_jobs['otu'] = { + 'injobs' : ['qc'], +- 'CMD_opts' : ['150', '100', '0.97', '0.0001', 'path_to_spliced_ref_db-R1', 'path_to_spliced_ref_db-R1', '75'], ++ 'non_zero_files' : ['seq.99f','seq.99f.2','seq.99f-all.clstr','pool.ok'], ++ 'CMD_opts' : ['150', '100', '0.0005', '75', 'path_to_pooled_sample_dir'], + 'execution' : 'qsub_1', # where to execute + 'cores_per_cmd' : 2, # number of threads used by command below + 'no_parallel' : 1, # number of total jobs to run using command below + 'command' : ''' +-#### cluster at 100% PE +-$ENV.CD_HIT_dir/cd-hit-est -i $INJOBS.0/R1.fa -j $INJOBS.0/R2.fa -o $SELF/seq.nr -op $SELF/seq.nr.2 -sf 1 -sc 1 -P 1 -r 0 \\ ++ ++#### 1. cluster at 100% PE ++$ENV.CD_HIT_dir/cd-hit-est -i $INJOBS.0/R1.fa.gz -j $INJOBS.0/R2.fa.gz -o $SELF/seq.nr -op $SELF/seq.nr.2 -sf 1 -sc 1 -P 1 -r 0 \\ + -cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 1.0 -n 10 -G 1 -b 1 -T 1 -M 8000 -d 0 -p 1 > $SELF/seq.nr.log +-#### cluster at 99% PE and SE for R1,R2 +-$ENV.CD_HIT_dir/cd-hit-est -i $SELF/seq.nr -o $SELF/seq.chimeric-clstr.R1 -r 0 -cx $CMDOPTS.6 -c 0.99 -n 10 -G 0 -b 1 -A 50 -T 1 -M 8000 -d 0 -p 1 > $SELF/seq.chimeric-clstr.R1.log +-$ENV.CD_HIT_dir/cd-hit-est -i $SELF/seq.nr.2 -o $SELF/seq.chimeric-clstr.R2 -r 0 -cx $CMDOPTS.6 -c 0.99 -n 10 -G 0 -b 1 -A 50 -T 1 -M 8000 -d 0 -p 1 > $SELF/seq.chimeric-clstr.R2.log ++ ++#### 2. cluster at 99% PE + $ENV.CD_HIT_dir/cd-hit-est -i $SELF/seq.nr -j $SELF/seq.nr.2 -o $SELF/seq.99 -op $SELF/seq.99.2 -P 1 -r 0 \\ + -cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.99 -n 10 -G 1 -b 1 -T 1 -M 8000 -d 0 -p 1 > $SELF/seq.99.log +-$ENV.CD_HIT_dir/usecases/Miseq-16S/filter-chimeric-and-small.pl -c $CMDOPTS.3 -k $SELF/seq.nr.clstr \\ ++ ++#### 3. cluster at 99% SE for R1, R2 ++$ENV.CD_HIT_dir/cd-hit-est -i $SELF/seq.nr -o $SELF/seq.chimeric-clstr.R1 -r 0 -cx $CMDOPTS.3 -c 0.99 -n 10 -G 0 -b 1 -A 50 -T 1 -M 8000 -d 0 -p 1 > $SELF/seq.chimeric-clstr.R1.log ++$ENV.CD_HIT_dir/cd-hit-est -i $SELF/seq.nr.2 -o $SELF/seq.chimeric-clstr.R2 -r 0 -cx $CMDOPTS.3 -c 0.99 -n 10 -G 0 -b 1 -A 50 -T 1 -M 8000 -d 0 -p 1 > $SELF/seq.chimeric-clstr.R2.log ++rm -f $SELF/seq.chimeric-clstr.R1 $SELF/seq.chimeric-clstr.R1.log \\ ++ $SELF/seq.chimeric-clstr.R2 $SELF/seq.chimeric-clstr.R2.log ++ ++#### 4. 5. filter chimeric sequences and sequences in small clusters ++$ENV.CD_HIT_dir/usecases/Miseq-16S/filter-chimeric-and-small.pl -c $CMDOPTS.2 -k $SELF/seq.nr.clstr \\ + -i $SELF/seq.chimeric-clstr.R1.clstr -j $SELF/seq.chimeric-clstr.R2.clstr \\ + -a $SELF/seq.99.clstr -f $SELF/seq.99 -g $SELF/seq.99.2 -o $SELF/seq.99f + $ENV.CD_HIT_dir/clstr_rev.pl $SELF/seq.nr.clstr $SELF/seq.99f.clstr > $SELF/seq.99f-all.clstr +-$ENV.CD_HIT_dir/cd-hit-est -i $SELF/seq.99f -j $SELF/seq.99f.2 -o $SELF/seq.97 -op $SELF/seq.97.2 -P 1 -r 0 \\ +- -cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > $SELF/seq.97.log +-$ENV.CD_HIT_dir/cd-hit-est-2d -i $SELF/seq.97 -j $SELF/seq.97.2 -i2 $CMDOPTS.4 -j2 $CMDOPTS.5 -o $SELF/seq.97.ref -op $SELF/seq.97.ref.2 -P 1 -r 0 \\ +- -cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > $SELF/seq.97.ref.log +-$ENV.CD_HIT_dir/clstr_rev.pl $SELF/seq.99f-all.clstr $SELF/seq.97.clstr > $SELF/seq.97-all.clstr +-$ENV.CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < $SELF/seq.97.ref.clstr > $SELF/seq.97.reftop.clstr +-$ENV.CD_HIT_dir/clstr_merge.pl $SELF/seq.97-all.clstr $SELF/seq.97.reftop.clstr > $SELF/OTU.clstr +- +-rm -f $SELF/seq.chimeric-clstr.R1 $SELF/seq.chimeric-clstr.R1.log $SELF/seq.chimeric-clstr.R2 $SELF/seq.chimeric-clstr.R2.log +-rm -f $SELF/seq.97.ref $SELF/seq.97.ref.2 $SELF/seq.97.ref.log + mv $SELF/seq.99f.log $SELF/chimeric-small-clusters-list.txt + ++ ++#### ++if [ ! -e "$CMDOPTS.4" ]; then ++ mkdir -p $CMDOPTS.4 ++fi ++ ++i="0" ++while [ 1 ]; do ++ ++ if [ -e "$CMDOPTS.4/lock" ]; then ++ echo "wait $CMDOPTS.4/lock" ++ sleep 5 ++ else ++ date > $CMDOPTS.4/lock ++ ++ cat $SELF/seq.99f >> $CMDOPTS.4/seq.99f ++ cat $SELF/seq.99f.2 >> $CMDOPTS.4/seq.99f.2 ++ cat $SELF/seq.99f-all.clstr >> $CMDOPTS.4/seq.99f-all.clstr ++ cat $SELF/chimeric-small-clusters-list.txt >> $CMDOPTS.4/chimeric-small-clusters-list.txt ++ date > $SELF/pool.ok ++ sleep 1 ++ ++ rm -f $CMDOPTS.4/lock ++ break ++ fi ++ ++ i=$[$i+1] ++ if [ "$i" -gt "50" ]; then ++ echo "wait $CMDOPTS.4/lock for too long" ++ break ++ fi ++done ++ + ''' + } + + + NGS_batch_jobs['otu-pooled'] = { +- 'CMD_opts' : ['150', '100', '0.97', '0.0001', 'path_to_spliced_ref_db-R1', 'path_to_spliced_ref_db-R1', '75'], ++ 'CMD_opts' : ['150', '100', '0.97', 'path_to_spliced_ref_db-R1', 'path_to_spliced_ref_db-R1'], ++ 'non_zero_files' : ['OTU.txt'], + 'execution' : 'qsub_1', # where to execute + 'cores_per_cmd' : 2, # number of threads used by command below + 'no_parallel' : 1, # number of total jobs to run using command below +@@ -103,9 +140,9 @@ NGS_batch_jobs['otu-pooled'] = { + #### before running + #### concat seq.99f seq.99f.2 seq.99f-all.clstr chimeric-small-clusters-list.txt + $ENV.CD_HIT_dir/cd-hit-est -i seq.99f -j seq.99f.2 -o seq.97 -op seq.97.2 -P 1 -r 0 \\ +- -cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > seq.97.log +-$ENV.CD_HIT_dir/cd-hit-est-2d -i seq.97 -j seq.97.2 -i2 $CMDOPTS.4 -j2 $CMDOPTS.5 -o seq.97.ref -op seq.97.ref.2 -P 1 -r 0 \\ +- -cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > seq.97.ref.log ++ -cx $CMDOPTS.0 -cy $CMDOPTS.1 -c $CMDOPTS.2 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > seq.97.log ++$ENV.CD_HIT_dir/cd-hit-est-2d -i seq.97 -j seq.97.2 -i2 $CMDOPTS.3 -j2 $CMDOPTS.4 -o seq.97.ref -op seq.97.ref.2 -P 1 -r 0 \\ ++ -cx $CMDOPTS.0 -cy $CMDOPTS.1 -c $CMDOPTS.2 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > seq.97.ref.log + $ENV.CD_HIT_dir/clstr_rev.pl seq.99f-all.clstr seq.97.clstr > seq.97-all.clstr + $ENV.CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < seq.97.ref.clstr > seq.97.reftop.clstr + $ENV.CD_HIT_dir/clstr_merge.pl seq.97-all.clstr seq.97.reftop.clstr > OTU.clstr +diff --git a/usecases/Miseq-16S/NG-Omics-WF.pl b/usecases/Miseq-16S/NG-Omics-WF.pl +index 2f46255..195583e 100755 +--- a/usecases/Miseq-16S/NG-Omics-WF.pl ++++ b/usecases/Miseq-16S/NG-Omics-WF.pl +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + # =============================== NG-Omics-WF ================================== + # _ _ _____ ____ _ __ ________ + # | \ | |/ ____| / __ \ (_) \ \ / / ____| +diff --git a/usecases/Miseq-16S/OTU_2_taxon_table.pl b/usecases/Miseq-16S/OTU_2_taxon_table.pl +new file mode 100755 +index 0000000..86561af +--- /dev/null ++++ b/usecases/Miseq-16S/OTU_2_taxon_table.pl +@@ -0,0 +1,158 @@ ++#!/usr/bin/env perl ++## =========================== NGS tools ========================================== ++## NGS tools for metagenomic sequence analysis ++## May also be used for other type NGS data analysis ++## ++## Weizhong Li, UCSD ++## liwz@sdsc.edu ++## http://weizhongli-lab.org/ ++## ================================================================================ ++ ++use Getopt::Std; ++getopts("i:o:a:t:r:N:c:P:",\%opts); ++die usage() unless ($opts{o} and $opts{i} and $opts{t}); ++ ++ ++my $otu_table = $opts{i}; ### e.g. OTU-short.txt ++my $taxon_file = $opts{t}; ### e.g. OTU-feature.txt ++my $output = $opts{o}; ++ ++my ($i, $j, $k, $ll, $cmd); ++ ++my @samples = (); ++my @otus = (); ++my %otu_mat = (); ++ ++my $fh; ++if ($otu_table eq "-") { $fh = "STDIN";} ++else { ++ open(TMP, $otu_table) || die "can not open $otu_table"; ++ $fh = "TMP"; ++} ++ ++$ll = <$fh>; chop($ll); ++my ($t1, @lls) = split(/\t/, $ll); ++@samples = @lls; ++my $num_samples = $#samples+1; ++ ++while($ll=<$fh>){ ++ next if ($ll =~ /^#/); ++ next unless ($ll =~ /^otu/i); ++ chop($ll); ++ my ($otu, @v) = split(/\t/, $ll); ++ push(@otus, $otu); ++ for ($i=0; $i<$num_samples; $i++) { ++ $otu_mat{$otu}{$samples[$i]} = $v[$i]; ++ } ++} ++ ++open(TMP, $taxon_file) || die "can not open $taxon_file"; ++my %taxon_info = (); ++while($ll=) { ++ chop($ll); ++ next if ($ll =~ /^#/); ++ my ($otu, $taxon, $c) = split(/\t/, $ll); ++ # next unless ($taxon =~ /__/); #### skip unknown OTUs ++ ++#OTUID taxonomy confidence ++#OTU1 Root;k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__ 1.0 ++#OTU2 Root;k__Bacteria;p__TM7;c__TM7-3;o__CW040;f__F16;g__;s__ 1.0 ++#OTU3 Root;k__Bacteria;p__Proteobacteria;c__Epsilonproteobacteria;o__Campylobacterales;f__Helicobacteraceae;g__Flexispira;s__rappini 1.0 ++#OTU4 Root;k__Bacteria;p__Proteobacteria;c__Deltaproteobacteria;o__Desulfovibrionales;f__Desulfovibrionaceae;g__Desulfovibrio;s__C21_c20 1.0 ++ ++ my $k = "unclassified"; ++ my $p = "unclassified"; ++ my $c = "unclassified"; ++ my $o = "unclassified"; ++ my $f = "unclassified"; ++ my $g = "unclassified"; ++ my $s = "unclassified"; ++ ++ $j = $taxon; ++ if ($j =~ /^k__([^;]+)/) {$k = $1;} ++ if ($j =~ /;k__([^;]+)/) {$k = $1;} ++ if ($j =~ /;p__([^;]+)/) {$p = $1;} ++ if ($j =~ /;c__([^;]+)/) {$c = $1;} ++ if ($j =~ /;o__([^;]+)/) {$o = $1;} ++ if ($j =~ /;f__([^;]+)/) {$f = $1;} ++ if ($j =~ /;g__([^;]+)/) {$g = $1;} ++ if ($j =~ /;s__([^;]+)/) {$s = $1;} ++ ++ if ($j =~ /^D_0__([^;]+)/) {$k = $1;} ++ if ($j =~ /;D_0__([^;]+)/) {$k = $1;} ++ if ($j =~ /;D_1__([^;]+)/) {$p = $1;} ++ if ($j =~ /;D_2__([^;]+)/) {$c = $1;} ++ if ($j =~ /;D_3__([^;]+)/) {$o = $1;} ++ if ($j =~ /;D_4__([^;]+)/) {$f = $1;} ++ if ($j =~ /;D_5__([^;]+)/) {$g = $1;} ++ if ($j =~ /;D_6__([^;]+)/) {$s = $1;} ++ ++ if (($g ne "unclassified") and ($s ne "unclassified")) { ++ if ( substr($s, 0, length($g)) ne $g) { #### if species name doesn't contain genus name, add ++ $s = "$g $s"; ++ } ++ } ++ $taxon_info{$otu} = [$k,$p,$c,$o,$f,$g,$s]; ++ ++} ++close(TMP); ++ ++my @ranks = qw/kingdom phylum class order family genus species/; ++my %rank_col = qw/kingdom 0 phylum 1 class 2 order 3 family 4 genus 5 species 6/; ++foreach $rank (@ranks) { ++ next if ($rank eq "kingdom"); ++ ++ my $c = $rank_col{$rank}; ++ my $out = "$output.$rank.txt"; ++ open(OUT, "> $out") || die "can not write to $out"; ++ #### print table header ++ print OUT "#", join("\t", @ranks[0..$c]); ++ print OUT "\t", join("\t", @samples), "\n"; ++ ++ my %rank_ti_info = (); ++ my %rank_mat = (); ++ my %ti_sum = (); ++ foreach $otu (@otus) { ++ my @ann = @{$taxon_info{$otu}}; ++ my $ti = join("|", @ann[0 .. $c] ); ++ ++ if (not defined($rank_ti_info{$ti})) { ++ $rank_ti_info{$ti} = [ @ann[0 .. $c] ]; ++ } ++ foreach $sample (@samples) { ++ $rank_mat{$ti}{$sample} += $otu_mat{$otu}{$sample}; ++ $ti_sum{$ti} += $otu_mat{$otu}{$sample}; ++ } ++ } ++ my @tis = keys %rank_mat; ++ @tis = sort {$ti_sum{$b} <=> $ti_sum{$a} } @tis; ++ ++ foreach $ti (@tis) { ++ print OUT join("\t", @{ $rank_ti_info{$ti} } ); ++ foreach $sample (@samples) { ++ print OUT "\t", $rank_mat{$ti}{$sample}; ++ } ++ print OUT "\n"; ++ } ++ close(OUT); ++} ++ ++ ++sub usage { ++<){ + $count_s{$sample_id}++; + } + else { ++ $id =~ s/^([^\|]+)\|//; ++ $id =~ s/;\./;/g; ++ $id = "Root;$id"; ++ $id =~ s/;D_0__/;k__/; ++ $id =~ s/;D_1__/;p__/; ++ $id =~ s/;D_2__/;c__/; ++ $id =~ s/;D_3__/;o__/; ++ $id =~ s/;D_4__/;f__/; ++ $id =~ s/;D_5__/;g__/; ++ $id =~ s/;D_6__/;s__/; + $OTU_2_ann{$OTU} = $id; +- $tree_flag = 1 if ($id =~ /\|k__Bacteria;.p__/); ++ $tree_flag = 1 if ($id =~ /;k__Bacteria/); + } + } + else { +@@ -45,23 +66,34 @@ close(TMP); + my @sample_ids = sort keys %sample_id; + + open(OUT1, "> $output") || die "can not write $output"; +-print OUT1 "OTU"; ++open(OUT2, "> $output_short") || die "can not write $output_short"; ++open(OUT3, "> $output_feature") || die "can not write $output_feature"; ++ ++print OUT1 "#OTUID"; ++print OUT2 "#OTUID"; ++print OUT3 "#OTUID"; + foreach $sample_id (@sample_ids){ + print OUT1 "\t$sample_id"; ++ print OUT2 "\t$sample_id"; + } + if ($tree_flag) { + print OUT1 "\t", join("\t", qw/Kingdom Phylum Class Order Family Genus Species/); + } + #print OUT1 "\tTotal\n"; + print OUT1 "\tAnnotation\n"; ++print OUT2 "\n"; ++print OUT3 "\ttaxonomy\tconfidence\n"; + + for ($i=1; $i<=$OTU; $i++){ +- $ann = "None"; ++ $ann = ""; + if ($OTU_2_ann{$i}) { $ann = $OTU_2_ann{$i}; } + print OUT1 "OTU$i"; ++ print OUT2 "OTU$i"; ++ print OUT3 "OTU$i"; + foreach $sample_id (@sample_ids){ + $k = $count{$i}{$sample_id}? $count{$i}{$sample_id} : 0; + print OUT1 "\t$k"; ++ print OUT2 "\t$k"; + } + if ($tree_flag) { + my ($tax_k, $tax_p, $tax_c, $tax_o, $tax_f, $tax_g, $tax_s); +@@ -76,7 +108,37 @@ for ($i=1; $i<=$OTU; $i++){ + } + #print OUT1 "\t$count_t{$i}"; + print OUT1 "\t$ann\n"; ++ print OUT2 "\n"; ++ print OUT3 "\t$ann\t1.0\n"; + } + close(OUT1); ++close(OUT2); ++close(OUT3); ++ ++open(OUT, ">$output_meta") || die "can not write to $output_meta"; ++print OUT "#SampleID\tGroup\n"; ++foreach $sample_id (@sample_ids){ ++ print OUT "$sample_id\tnogroup\n"; ++} ++close(OUT); + ++if (-e $biom_exe) { ++ $cmd = `$biom_exe convert -i $output_short -o $output_biom --to-hdf5 --observation-metadata-fp $output_feature --sample-metadata-fp $output_meta`; ++} ++ ++sub usage { ++<){ ++ chop($ll); ++ my ($id, $txt) = split(/\s+/, $ll, 2); ++ $txt =~ s/ /./g; ++ $id_2_ann{$id} = $txt; ++} ++close(TMP); ++ ++my %id_2_seq = (); ++my $id = ""; ++open(TMP, $fasta) || die "can not open $fasta"; ++while($ll=){ ++ if ($ll =~ /^>(\S+)/) { ++ chop($ll); ++ $id = $1; ++ $ann = $id_2_ann{$id}; ++ $id = "$id|$ann" if ($ann); ++ } ++ else { ++ $id_2_seq{$id} .= $ll; ++ } ++} ++ ++close(TMP); ++ ++my @ids = keys %id_2_seq; ++ @ids = sort {length($b) <=> length($a) } @ids; ++ ++open(OUT, "> $output") || die "can not write to $output"; ++foreach $id (@ids) { ++ print OUT ">$id\n$id_2_seq{$id}"; ++} ++close(OUT); ++ ++ ++ ++sub usage { ++< /dev/null about: - home: https://github.com/weizhongli/cdhit - license: GPLv2 - summary: Clusters and compares protein or nucleotide sequences + home: "https://github.com/weizhongli/cdhit" + license: "GPL-2.0-or-later" + license_family: GPL + license_file: "license.txt" + summary: "Clusters and compares protein or nucleotide sequences." + dev_url: "https://github.com/weizhongli/cdhit" + doc_url: "https://github.com/weizhongli/cdhit/wiki" extra: additional-platforms: - - linux-aarch64 \ No newline at end of file + - linux-aarch64 + - osx-arm64 + identifiers: + - doi:10.1093/bioinformatics/17.3.282 + - doi:10.1093/bioinformatics/18.1.77 + - biotools:cd-hit + - usegalaxy-eu:cd_hit