From f59af368010190cc1084306131ed842c719ce940 Mon Sep 17 00:00:00 2001
From: mencian <joshua.zhuang@yahoo.com>
Date: Tue, 12 Nov 2024 04:26:08 -0600
Subject: [PATCH] cd-hit: add osx-arm64 build

---
 recipes/cd-hit/build.sh     |   20 +-
 recipes/cd-hit/cd-hit.patch | 1093 +++++++++++++++++++++++++++++++++++
 recipes/cd-hit/meta.yaml    |   28 +-
 3 files changed, 1123 insertions(+), 18 deletions(-)
 create mode 100644 recipes/cd-hit/cd-hit.patch

diff --git a/recipes/cd-hit/build.sh b/recipes/cd-hit/build.sh
index e1aab78924f19..87fadaf02c43c 100644
--- a/recipes/cd-hit/build.sh
+++ b/recipes/cd-hit/build.sh
@@ -1,26 +1,28 @@
-#!/bin/sh
+#!/bin/bash
 
-export CFLAGS="-I$PREFIX/include"
-export CPPFLAGS="-I$PREFIX/include"
-export CXXFLAGS="-I$PREFIX/include"
-export LDFLAGS="-L$PREFIX/lib"
+mkdir -p $PREFIX/bin
+
+export CFLAGS="${CFLAGS} -O3"
+export CPPFLAGS="${CPPFLAGS} -I$PREFIX/include"
+export CXXFLAGS="${CXXFLAGS} -O3 -I$PREFIX/include"
+export LDFLAGS="${LDFLAGS} -L$PREFIX/lib"
 export CPATH=${PREFIX}/include
 
 sed -i.bak 's/^CC =$//g' Makefile
 sed -i.bak 's/^#LDFLAGS.*//g' Makefile
-
+rm -rf *.bak
 
 if [[ "$OSTYPE" == "darwin"* ]]; then
   #Lines below is commented out until fix provided for OPENMP support on OS X for this program
  
   CCFLAGS="$CCFLAGS -Wl,-rpath ${PREFIX}/lib -L${PREFIX}/lib -I${PREFIX}/include -fopenmp"
   sed -i.bak 's/CCFLAGS = -fopenmp/CCFLAGS += -fopenmp/g' Makefile
+  rm -rf *.bak
   LDFLAGS="$LDFLAGS -stdlib=libc++"
   
   make CC=$CXX openmp=no MAX_SEQ=1000000
 else
   make CC=$GXX MAX_SEQ=1000000
 fi
-
-mkdir -p $PREFIX/bin 
-make install PREFIX=$PREFIX/bin 
+ 
+make install PREFIX="$PREFIX/bin"
diff --git a/recipes/cd-hit/cd-hit.patch b/recipes/cd-hit/cd-hit.patch
new file mode 100644
index 0000000000000..29ac6a70fd2ec
--- /dev/null
+++ b/recipes/cd-hit/cd-hit.patch
@@ -0,0 +1,1093 @@
+diff --git a/FET.pl b/FET.pl
+index 6db320f..bb56529 100755
+--- a/FET.pl
++++ b/FET.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ use Storable;
+ use strict;
+diff --git a/cd-hit-2d-para.pl b/cd-hit-2d-para.pl
+index 3cab955..e0c43c7 100755
+--- a/cd-hit-2d-para.pl
++++ b/cd-hit-2d-para.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl -w
++#!/usr/bin/env perl
+ # =============================================================================
+ # CD-HIT
+ # http://cd-hit.org/
+diff --git a/cd-hit-auxtools/cd-hit-dup-PE-out.pl b/cd-hit-auxtools/cd-hit-dup-PE-out.pl
+index bfe5af3..f035229 100755
+--- a/cd-hit-auxtools/cd-hit-dup-PE-out.pl
++++ b/cd-hit-auxtools/cd-hit-dup-PE-out.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ my $script_name = $0;
+ my $script_dir = $0;
+diff --git a/cd-hit-clstr_2_blm8.pl b/cd-hit-clstr_2_blm8.pl
+index 42f1e57..cb75ffb 100755
+--- a/cd-hit-clstr_2_blm8.pl
++++ b/cd-hit-clstr_2_blm8.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ #
+ 
+ my $rep;
+@@ -23,7 +23,10 @@ while($ll=<>){
+       else              { 
+         push(@non_reps, $id);
+         my @lls = split(/\s+/, $ll);
+-        my ($a, $iden) = split(/\//, $lls[-1]);
++        # my ($a, $iden) = split(/\//, $lls[-1]); #### bug, with cd-hit-est-2d, there are +/- sign e.g. 10:1029:30:1042/+/97.35%
++        my @mms = split(/\//, $lls[-1]);
++        my $a = $mms[0];
++        my $iden = $mms[-1];
+         chop($iden); ### removing % sign
+         my ($qb, $qe, $sb, $se) = split(/:/, $a);
+         my $alnln = $qe-$qb+1;
+diff --git a/cd-hit-div.pl b/cd-hit-div.pl
+index e349394..db8d942 100755
+--- a/cd-hit-div.pl
++++ b/cd-hit-div.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ #not like cd-hit-div, this script do not sort input
+ #or throw away seq
+diff --git a/cd-hit-para.pl b/cd-hit-para.pl
+index 33f1a1b..6ee3ca1 100755
+--- a/cd-hit-para.pl
++++ b/cd-hit-para.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl -w
++#!/usr/bin/env perl
+ # =============================================================================
+ # CD-HIT
+ # http://cd-hit.org/
+diff --git a/clstr2tree.pl b/clstr2tree.pl
+index 73fd37a..56d9fe2 100755
+--- a/clstr2tree.pl
++++ b/clstr2tree.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ $clstr = shift;
+ $fr = shift; # for nr80.clstr $fr = 0.8
+diff --git a/clstr2txt.pl b/clstr2txt.pl
+index 902b083..127537e 100755
+--- a/clstr2txt.pl
++++ b/clstr2txt.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ my $no = 0;
+ my $clstr_no = "";
+diff --git a/clstr2xml.pl b/clstr2xml.pl
+index 10d828c..ba8264a 100755
+--- a/clstr2xml.pl
++++ b/clstr2xml.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ #usage: clstr_xml.pl [-len|-size] level1.clstr [level2.clstr level3.clstr ...]
+ #purpose: to create xml file from cd-hit or hierarchical cd-hit(h-cd-hit) results
+diff --git a/clstr_cut.pl b/clstr_cut.pl
+index 498f180..ae0264c 100755
+--- a/clstr_cut.pl
++++ b/clstr_cut.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ #keep only top $no proteins in cluster
+ 
+diff --git a/clstr_list.pl b/clstr_list.pl
+index 9c6639b..b997402 100755
+--- a/clstr_list.pl
++++ b/clstr_list.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ use Storable;
+ use strict;
+diff --git a/clstr_list_sort.pl b/clstr_list_sort.pl
+index e0d20d8..a9bd588 100755
+--- a/clstr_list_sort.pl
++++ b/clstr_list_sort.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ use Storable;
+ use strict;
+diff --git a/clstr_merge.pl b/clstr_merge.pl
+index 3fe108e..9186777 100755
+--- a/clstr_merge.pl
++++ b/clstr_merge.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ # the order of clusters need to be identical
+ my ($master_clstr, @clstr) = @ARGV;
+diff --git a/clstr_merge_noorder.pl b/clstr_merge_noorder.pl
+index f8acdfc..0852aee 100755
+--- a/clstr_merge_noorder.pl
++++ b/clstr_merge_noorder.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ # order of clusters don't need to be the same
+ # but then I have to read everything into memory
+diff --git a/clstr_quality_eval.pl b/clstr_quality_eval.pl
+index 62f2a3d..060ab01 100755
+--- a/clstr_quality_eval.pl
++++ b/clstr_quality_eval.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ ## calculate the sensitivity and specificity of clusters
+ ## if the input fasta file has pre-defined classification term
+diff --git a/clstr_quality_eval_by_link.pl b/clstr_quality_eval_by_link.pl
+index 8fba8df..140c05c 100755
+--- a/clstr_quality_eval_by_link.pl
++++ b/clstr_quality_eval_by_link.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ ## calculate the sensitivity and specificity of clusters
+ ## if the input fasta file has pre-defined classification term
+diff --git a/clstr_reduce.pl b/clstr_reduce.pl
+index 990f4ad..3621025 100755
+--- a/clstr_reduce.pl
++++ b/clstr_reduce.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ 
+ $file90 = shift;
+diff --git a/clstr_renumber.pl b/clstr_renumber.pl
+index b542304..c66088d 100755
+--- a/clstr_renumber.pl
++++ b/clstr_renumber.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ $no = 0;
+ while($ll=<>){
+   if ($ll =~ /^>Cluster (\d+)/) {
+diff --git a/clstr_rep.pl b/clstr_rep.pl
+index 0ebeb88..84b86b3 100755
+--- a/clstr_rep.pl
++++ b/clstr_rep.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ $rep = "";
+ $no = 0;
+diff --git a/clstr_reps_faa_rev.pl b/clstr_reps_faa_rev.pl
+index 80a4a8a..3574b2b 100755
+--- a/clstr_reps_faa_rev.pl
++++ b/clstr_reps_faa_rev.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ # output single fasta file
+ # for each cluster output at least $cutoff seqs
+ 
+diff --git a/clstr_rev.pl b/clstr_rev.pl
+index d7efdcc..71134e2 100755
+--- a/clstr_rev.pl
++++ b/clstr_rev.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ # if nr90 from nr100 and
+ #    nr80 from nr90, so I have nr90.clstr and nr80.clstr
+ # but, in nr80.clstr, some gi numbers whose from nr100 are there
+diff --git a/clstr_select.pl b/clstr_select.pl
+index 1b168d9..dc70147 100755
+--- a/clstr_select.pl
++++ b/clstr_select.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ #my $by = shift;
+ my $min;
+diff --git a/clstr_select_rep.pl b/clstr_select_rep.pl
+index 80c7b7e..f7c38f4 100755
+--- a/clstr_select_rep.pl
++++ b/clstr_select_rep.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ #my $by = shift;
+ my $min;
+diff --git a/clstr_size_histogram.pl b/clstr_size_histogram.pl
+index 01ecb63..b726e46 100755
+--- a/clstr_size_histogram.pl
++++ b/clstr_size_histogram.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ if(@ARGV==0){
+    print "Usage:\n\tclstr_size_histogram.pl [-bin N] clstr_file\n";
+diff --git a/clstr_size_stat.pl b/clstr_size_stat.pl
+index b234b06..ecda7db 100755
+--- a/clstr_size_stat.pl
++++ b/clstr_size_stat.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ if(@ARGV==0){
+    print "Usage:\n\tclstr_size_stat.pl clstr_file\n";
+diff --git a/clstr_sort_by.pl b/clstr_sort_by.pl
+index 82e9cf8..adb12d8 100755
+--- a/clstr_sort_by.pl
++++ b/clstr_sort_by.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ my $sort_by_what = shift;
+    $sort_by_what = "no" unless $sort_by_what;
+diff --git a/clstr_sort_prot_by.pl b/clstr_sort_prot_by.pl
+index 64f19e2..0832b99 100755
+--- a/clstr_sort_prot_by.pl
++++ b/clstr_sort_prot_by.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ my $sort_by = shift;
+    $sort_by = "len" unless ($sort_by);
+diff --git a/clstr_sql_tbl.pl b/clstr_sql_tbl.pl
+index f2dba07..68bfd7d 100755
+--- a/clstr_sql_tbl.pl
++++ b/clstr_sql_tbl.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ if(@ARGV==0){
+   print "Usage:\n\tclstr_sql_tbl.pl clstr_file tbl_file\n";
+diff --git a/clstr_sql_tbl_sort.pl b/clstr_sql_tbl_sort.pl
+index 67d60a8..3dfe9c4 100755
+--- a/clstr_sql_tbl_sort.pl
++++ b/clstr_sql_tbl_sort.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ if(@ARGV==0){
+    print "Usage:\n\tclstr_sql_tbl_sort.pl table_file level\n";
+diff --git a/make_multi_seq.pl b/make_multi_seq.pl
+index 7b05636..3678654 100755
+--- a/make_multi_seq.pl
++++ b/make_multi_seq.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ #note you have to use "-d 0" in the cd-hit run
+ #note you better to use "-g 1" in the cd-hit run
+diff --git a/plot_2d.pl b/plot_2d.pl
+index 418a5cf..91342ca 100755
+--- a/plot_2d.pl
++++ b/plot_2d.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ use Image::Magick;
+ 
+diff --git a/plot_len1.pl b/plot_len1.pl
+index efcdfe0..e8be6e3 100755
+--- a/plot_len1.pl
++++ b/plot_len1.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ $file90 = shift;
+ $segs = shift;
+diff --git a/psi-cd-hit/cd-hit-div.pl b/psi-cd-hit/cd-hit-div.pl
+index e349394..db8d942 100755
+--- a/psi-cd-hit/cd-hit-div.pl
++++ b/psi-cd-hit/cd-hit-div.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ #not like cd-hit-div, this script do not sort input
+ #or throw away seq
+diff --git a/psi-cd-hit/clstr_select_rep.pl b/psi-cd-hit/clstr_select_rep.pl
+index b465586..63db0ce 100755
+--- a/psi-cd-hit/clstr_select_rep.pl
++++ b/psi-cd-hit/clstr_select_rep.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ my $by = shift;
+ my $min;
+diff --git a/psi-cd-hit/clstr_select_seq.pl b/psi-cd-hit/clstr_select_seq.pl
+index fd7bb8b..598b0e9 100755
+--- a/psi-cd-hit/clstr_select_seq.pl
++++ b/psi-cd-hit/clstr_select_seq.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ my $by = shift;
+ my $min;
+diff --git a/psi-cd-hit/fetch_fasta_by_ids.pl b/psi-cd-hit/fetch_fasta_by_ids.pl
+index bfdbb26..9c17504 100755
+--- a/psi-cd-hit/fetch_fasta_by_ids.pl
++++ b/psi-cd-hit/fetch_fasta_by_ids.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ my ($gi_file, $seq_file) = @ARGV;
+ 
+diff --git a/psi-cd-hit/fetch_fasta_exclude_ids.pl b/psi-cd-hit/fetch_fasta_exclude_ids.pl
+index 90e237e..13d061a 100755
+--- a/psi-cd-hit/fetch_fasta_exclude_ids.pl
++++ b/psi-cd-hit/fetch_fasta_exclude_ids.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ my ($gi_file, $seq_file) = @ARGV;
+ 
+diff --git a/psi-cd-hit/psi-2d.pl b/psi-cd-hit/psi-2d.pl
+index ab3f655..f3884a3 100755
+--- a/psi-cd-hit/psi-2d.pl
++++ b/psi-cd-hit/psi-2d.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ 
+ my $script_name = $0;
+diff --git a/psi-cd-hit/psi-cd-hit-local-old.pl b/psi-cd-hit/psi-cd-hit-local-old.pl
+index f5ab1b1..21fd706 100755
+--- a/psi-cd-hit/psi-cd-hit-local-old.pl
++++ b/psi-cd-hit/psi-cd-hit-local-old.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl -w
++#!/usr/bin/env perl
+ ################################################################################
+ ######### PSI-cd-hit written by Weizhong Li at http://cd-hit.org
+ ################################################################################
+@@ -1138,7 +1138,7 @@ sub write_remote_perl_script {
+ 
+   open(REPERL, "> $remote_perl_script") || die;
+   print REPERL <<EOD;
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ \$host = shift;
+ \$arg = shift;
+ 
+diff --git a/psi-cd-hit/psi-cd-hit-local.pl b/psi-cd-hit/psi-cd-hit-local.pl
+index edc7180..461057f 100755
+--- a/psi-cd-hit/psi-cd-hit-local.pl
++++ b/psi-cd-hit/psi-cd-hit-local.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl -w
++#!/usr/bin/env perl
+ ################################################################################
+ ######### PSI-cd-hit written by Weizhong Li at http://cd-hit.org
+ ################################################################################
+@@ -1372,7 +1372,7 @@ sub write_remote_perl_script {
+ 
+   open(REPERL, "> $remote_perl_script") || die;
+   print REPERL <<EOD;
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ \$host = shift;
+ \$instance = shift;
+ \$arg = shift;
+diff --git a/psi-cd-hit/psi-cd-hit-old.pl b/psi-cd-hit/psi-cd-hit-old.pl
+index bb4f512..4fbf688 100755
+--- a/psi-cd-hit/psi-cd-hit-old.pl
++++ b/psi-cd-hit/psi-cd-hit-old.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl -w
++#!/usr/bin/env perl
+ ################################################################################
+ ######### PSI-cd-hit written by Weizhong Li at http://cd-hit.org
+ ################################################################################
+diff --git a/psi-cd-hit/psi-cd-hit.pl b/psi-cd-hit/psi-cd-hit.pl
+index 6d209c6..3d5912c 100755
+--- a/psi-cd-hit/psi-cd-hit.pl
++++ b/psi-cd-hit/psi-cd-hit.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl -w
++#!/usr/bin/env perl
+ ################################################################################
+ ######### PSI-cd-hit written by Weizhong Li at http://cd-hit.org
+ ################################################################################
+diff --git a/usecases/Miseq-16S/16S-ref-db-PE-splice.pl b/usecases/Miseq-16S/16S-ref-db-PE-splice.pl
+index dc51f93..7836d1c 100755
+--- a/usecases/Miseq-16S/16S-ref-db-PE-splice.pl
++++ b/usecases/Miseq-16S/16S-ref-db-PE-splice.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ ## =========================== NGS tools ==========================================
+ ## NGS tools for metagenomic sequence analysis
+ ## May also be used for other type NGS data analysis
+diff --git a/usecases/Miseq-16S/NG-Omics-Miseq-16S.pl b/usecases/Miseq-16S/NG-Omics-Miseq-16S.pl
+index 1059323..0cc747a 100644
+--- a/usecases/Miseq-16S/NG-Omics-Miseq-16S.pl
++++ b/usecases/Miseq-16S/NG-Omics-Miseq-16S.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ ################################################################################
+ # NGS workflow by Weizhong Li, http://weizhongli-lab.org
+ ################################################################################
+diff --git a/usecases/Miseq-16S/NG-Omics-Miseq-16S.py b/usecases/Miseq-16S/NG-Omics-Miseq-16S.py
+index 273e008..6e75dde 100644
+--- a/usecases/Miseq-16S/NG-Omics-Miseq-16S.py
++++ b/usecases/Miseq-16S/NG-Omics-Miseq-16S.py
+@@ -43,6 +43,7 @@ NGS_executions['sh_1'] = {
+ NGS_batch_jobs = {}
+ NGS_batch_jobs['qc'] = {
+   'CMD_opts'         : ['100'],
++  'non_zero_files'   : ['R1.fa.gz','R2.fa.gz'],
+   'execution'        : 'qsub_1',               # where to execute
+   'cores_per_cmd'    : 4,                    # number of threads used by command below
+   'no_parallel'      : 1,                    # number of total jobs to run using command below
+@@ -52,8 +53,11 @@ java -jar $ENV.NGS_prog_trimmomatic PE -threads 4 -phred33 $DATA.0 $DATA.1 $SELF
+ 
+ perl -e '$i=0; while(<>){ if (/^@/) {$i++;  print ">Sample|$SAMPLE|$i ", substr($_,1); $a=<>; print $a; $a=<>; $a=<>;}}' < $SELF/R1.fq > $SELF/R1.fa &
+ perl -e '$i=0; while(<>){ if (/^@/) {$i++;  print ">Sample|$SAMPLE|$i ", substr($_,1); $a=<>; print $a; $a=<>; $a=<>;}}' < $SELF/R2.fq > $SELF/R2.fa &
+-
+ wait
++gzip $SELF/R1.fa &
++gzip $SELF/R2.fa &
++wait
++
+ rm -f $SELF/R1.fq $SELF/R2.fq $SELF/R1-s.fq $SELF/R2-s.fq
+ '''
+ }
+@@ -61,41 +65,74 @@ rm -f $SELF/R1.fq $SELF/R2.fq $SELF/R1-s.fq $SELF/R2-s.fq
+ 
+ NGS_batch_jobs['otu'] = {
+   'injobs'            : ['qc'],
+-  'CMD_opts'          : ['150', '100', '0.97', '0.0001', 'path_to_spliced_ref_db-R1', 'path_to_spliced_ref_db-R1', '75'],
++  'non_zero_files'    : ['seq.99f','seq.99f.2','seq.99f-all.clstr','pool.ok'],
++  'CMD_opts'          : ['150', '100', '0.0005', '75', 'path_to_pooled_sample_dir'],
+   'execution'         : 'qsub_1',               # where to execute
+   'cores_per_cmd'     : 2,                    # number of threads used by command below
+   'no_parallel'       : 1,                    # number of total jobs to run using command below
+   'command'           : '''
+-#### cluster at 100% PE
+-$ENV.CD_HIT_dir/cd-hit-est -i $INJOBS.0/R1.fa -j $INJOBS.0/R2.fa -o $SELF/seq.nr -op $SELF/seq.nr.2 -sf 1 -sc 1 -P 1 -r 0 \\
++
++#### 1. cluster at 100% PE
++$ENV.CD_HIT_dir/cd-hit-est -i $INJOBS.0/R1.fa.gz -j $INJOBS.0/R2.fa.gz -o $SELF/seq.nr -op $SELF/seq.nr.2 -sf 1 -sc 1 -P 1 -r 0 \\
+     -cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 1.0  -n 10 -G 1 -b 1  -T 1 -M 8000  -d 0 -p 1 > $SELF/seq.nr.log
+-#### cluster at 99% PE and SE for R1,R2 
+-$ENV.CD_HIT_dir/cd-hit-est -i $SELF/seq.nr   -o $SELF/seq.chimeric-clstr.R1 -r 0 -cx $CMDOPTS.6 -c 0.99 -n 10 -G 0 -b 1 -A 50 -T 1 -M 8000  -d 0 -p 1 > $SELF/seq.chimeric-clstr.R1.log
+-$ENV.CD_HIT_dir/cd-hit-est -i $SELF/seq.nr.2 -o $SELF/seq.chimeric-clstr.R2 -r 0 -cx $CMDOPTS.6 -c 0.99 -n 10 -G 0 -b 1 -A 50 -T 1 -M 8000  -d 0 -p 1 > $SELF/seq.chimeric-clstr.R2.log
++
++#### 2. cluster at 99% PE 
+ $ENV.CD_HIT_dir/cd-hit-est -i $SELF/seq.nr -j $SELF/seq.nr.2 -o $SELF/seq.99 -op $SELF/seq.99.2 -P 1 -r 0 \\
+     -cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.99 -n 10 -G 1 -b 1  -T 1 -M 8000  -d 0 -p 1 > $SELF/seq.99.log
+-$ENV.CD_HIT_dir/usecases/Miseq-16S/filter-chimeric-and-small.pl -c $CMDOPTS.3 -k $SELF/seq.nr.clstr \\
++
++#### 3. cluster at 99% SE for R1, R2 
++$ENV.CD_HIT_dir/cd-hit-est -i $SELF/seq.nr   -o $SELF/seq.chimeric-clstr.R1 -r 0 -cx $CMDOPTS.3 -c 0.99 -n 10 -G 0 -b 1 -A 50 -T 1 -M 8000  -d 0 -p 1 > $SELF/seq.chimeric-clstr.R1.log
++$ENV.CD_HIT_dir/cd-hit-est -i $SELF/seq.nr.2 -o $SELF/seq.chimeric-clstr.R2 -r 0 -cx $CMDOPTS.3 -c 0.99 -n 10 -G 0 -b 1 -A 50 -T 1 -M 8000  -d 0 -p 1 > $SELF/seq.chimeric-clstr.R2.log
++rm -f $SELF/seq.chimeric-clstr.R1    $SELF/seq.chimeric-clstr.R1.log \\
++      $SELF/seq.chimeric-clstr.R2    $SELF/seq.chimeric-clstr.R2.log 
++
++#### 4. 5. filter chimeric sequences and sequences in small clusters
++$ENV.CD_HIT_dir/usecases/Miseq-16S/filter-chimeric-and-small.pl -c $CMDOPTS.2 -k $SELF/seq.nr.clstr \\
+     -i $SELF/seq.chimeric-clstr.R1.clstr -j $SELF/seq.chimeric-clstr.R2.clstr \\
+     -a $SELF/seq.99.clstr -f $SELF/seq.99 -g $SELF/seq.99.2 -o $SELF/seq.99f
+ $ENV.CD_HIT_dir/clstr_rev.pl $SELF/seq.nr.clstr       $SELF/seq.99f.clstr     > $SELF/seq.99f-all.clstr
+-$ENV.CD_HIT_dir/cd-hit-est -i $SELF/seq.99f -j $SELF/seq.99f.2 -o $SELF/seq.97 -op $SELF/seq.97.2 -P 1 -r 0 \\
+-    -cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10  -T 1 -M 8000  -d 0 -p 1 > $SELF/seq.97.log
+-$ENV.CD_HIT_dir/cd-hit-est-2d -i $SELF/seq.97 -j $SELF/seq.97.2 -i2 $CMDOPTS.4 -j2 $CMDOPTS.5 -o $SELF/seq.97.ref -op $SELF/seq.97.ref.2 -P 1 -r 0 \\
+-    -cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10  -T 1 -M 8000  -d 0 -p 1 > $SELF/seq.97.ref.log
+-$ENV.CD_HIT_dir/clstr_rev.pl $SELF/seq.99f-all.clstr $SELF/seq.97.clstr > $SELF/seq.97-all.clstr
+-$ENV.CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < $SELF/seq.97.ref.clstr > $SELF/seq.97.reftop.clstr
+-$ENV.CD_HIT_dir/clstr_merge.pl $SELF/seq.97-all.clstr $SELF/seq.97.reftop.clstr > $SELF/OTU.clstr
+-
+-rm -f $SELF/seq.chimeric-clstr.R1    $SELF/seq.chimeric-clstr.R1.log    $SELF/seq.chimeric-clstr.R2    $SELF/seq.chimeric-clstr.R2.log 
+-rm -f $SELF/seq.97.ref $SELF/seq.97.ref.2 $SELF/seq.97.ref.log
+ mv $SELF/seq.99f.log $SELF/chimeric-small-clusters-list.txt
+ 
++
++####
++if [ ! -e "$CMDOPTS.4" ]; then
++  mkdir -p $CMDOPTS.4
++fi
++
++i="0"
++while [ 1 ]; do
++
++  if [ -e "$CMDOPTS.4/lock" ]; then
++    echo "wait $CMDOPTS.4/lock"
++    sleep 5
++  else
++    date > $CMDOPTS.4/lock
++    
++    cat $SELF/seq.99f                          >> $CMDOPTS.4/seq.99f
++    cat $SELF/seq.99f.2                        >> $CMDOPTS.4/seq.99f.2
++    cat $SELF/seq.99f-all.clstr                >> $CMDOPTS.4/seq.99f-all.clstr
++    cat $SELF/chimeric-small-clusters-list.txt >> $CMDOPTS.4/chimeric-small-clusters-list.txt
++    date > $SELF/pool.ok
++    sleep 1
++
++    rm -f $CMDOPTS.4/lock
++    break
++  fi
++
++  i=$[$i+1]
++  if [ "$i" -gt "50" ]; then
++    echo "wait $CMDOPTS.4/lock for too long"
++    break
++  fi
++done
++
+ '''
+ }
+ 
+ 
+ NGS_batch_jobs['otu-pooled'] = {
+-  'CMD_opts'          : ['150', '100', '0.97', '0.0001', 'path_to_spliced_ref_db-R1', 'path_to_spliced_ref_db-R1', '75'],
++  'CMD_opts'          : ['150', '100', '0.97', 'path_to_spliced_ref_db-R1', 'path_to_spliced_ref_db-R1'],
++  'non_zero_files'    : ['OTU.txt'],
+   'execution'         : 'qsub_1',               # where to execute
+   'cores_per_cmd'     : 2,                    # number of threads used by command below
+   'no_parallel'       : 1,                    # number of total jobs to run using command below
+@@ -103,9 +140,9 @@ NGS_batch_jobs['otu-pooled'] = {
+ #### before running
+ #### concat seq.99f seq.99f.2 seq.99f-all.clstr chimeric-small-clusters-list.txt
+ $ENV.CD_HIT_dir/cd-hit-est -i seq.99f -j seq.99f.2 -o seq.97 -op seq.97.2 -P 1 -r 0 \\
+-    -cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10  -T 1 -M 8000  -d 0 -p 1 > seq.97.log
+-$ENV.CD_HIT_dir/cd-hit-est-2d -i seq.97 -j seq.97.2 -i2 $CMDOPTS.4 -j2 $CMDOPTS.5 -o seq.97.ref -op seq.97.ref.2 -P 1 -r 0 \\
+-    -cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10  -T 1 -M 8000  -d 0 -p 1 > seq.97.ref.log
++    -cx $CMDOPTS.0 -cy $CMDOPTS.1 -c $CMDOPTS.2  -n 10 -G 1 -b 10  -T 1 -M 8000  -d 0 -p 1 > seq.97.log
++$ENV.CD_HIT_dir/cd-hit-est-2d -i seq.97 -j seq.97.2 -i2 $CMDOPTS.3 -j2 $CMDOPTS.4 -o seq.97.ref -op seq.97.ref.2 -P 1 -r 0 \\
++    -cx $CMDOPTS.0 -cy $CMDOPTS.1 -c $CMDOPTS.2  -n 10 -G 1 -b 10  -T 1 -M 8000  -d 0 -p 1 > seq.97.ref.log
+ $ENV.CD_HIT_dir/clstr_rev.pl seq.99f-all.clstr seq.97.clstr > seq.97-all.clstr
+ $ENV.CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < seq.97.ref.clstr > seq.97.reftop.clstr
+ $ENV.CD_HIT_dir/clstr_merge.pl seq.97-all.clstr seq.97.reftop.clstr > OTU.clstr
+diff --git a/usecases/Miseq-16S/NG-Omics-WF.pl b/usecases/Miseq-16S/NG-Omics-WF.pl
+index 2f46255..195583e 100755
+--- a/usecases/Miseq-16S/NG-Omics-WF.pl
++++ b/usecases/Miseq-16S/NG-Omics-WF.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ # =============================== NG-Omics-WF ==================================
+ #  _   _  _____         ____            _              __          ________ 
+ # | \ | |/ ____|       / __ \          (_)             \ \        / /  ____|
+diff --git a/usecases/Miseq-16S/OTU_2_taxon_table.pl b/usecases/Miseq-16S/OTU_2_taxon_table.pl
+new file mode 100755
+index 0000000..86561af
+--- /dev/null
++++ b/usecases/Miseq-16S/OTU_2_taxon_table.pl
+@@ -0,0 +1,158 @@
++#!/usr/bin/env perl
++## =========================== NGS tools ==========================================
++## NGS tools for metagenomic sequence analysis
++## May also be used for other type NGS data analysis
++##
++##                                      Weizhong Li, UCSD
++##                                      liwz@sdsc.edu
++## http://weizhongli-lab.org/
++## ================================================================================
++
++use Getopt::Std;
++getopts("i:o:a:t:r:N:c:P:",\%opts);
++die usage() unless ($opts{o} and $opts{i} and $opts{t});
++
++
++my $otu_table   = $opts{i}; ### e.g. OTU-short.txt
++my $taxon_file  = $opts{t}; ### e.g. OTU-feature.txt
++my $output      = $opts{o};
++
++my ($i, $j, $k, $ll, $cmd);
++
++my @samples = ();
++my @otus = ();
++my %otu_mat = ();
++
++my $fh;
++if ($otu_table eq "-") { $fh = "STDIN";}
++else {
++  open(TMP, $otu_table) || die "can not open $otu_table";
++  $fh = "TMP";
++}
++
++$ll = <$fh>; chop($ll);
++my ($t1, @lls) = split(/\t/, $ll);
++@samples = @lls;
++my $num_samples = $#samples+1;
++
++while($ll=<$fh>){
++  next if ($ll =~ /^#/);
++  next unless ($ll =~ /^otu/i);
++  chop($ll);
++  my ($otu, @v) = split(/\t/, $ll);
++  push(@otus, $otu);
++  for ($i=0; $i<$num_samples; $i++) {
++    $otu_mat{$otu}{$samples[$i]} = $v[$i];
++  }
++}
++
++open(TMP, $taxon_file) || die "can not open $taxon_file";
++my %taxon_info = ();
++while($ll=<TMP>) {
++  chop($ll);
++  next if ($ll =~ /^#/);
++  my ($otu, $taxon, $c) = split(/\t/, $ll);
++  # next unless ($taxon =~ /__/); #### skip unknown OTUs
++
++#OTUID	taxonomy	confidence
++#OTU1	Root;k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__	1.0
++#OTU2	Root;k__Bacteria;p__TM7;c__TM7-3;o__CW040;f__F16;g__;s__	1.0
++#OTU3	Root;k__Bacteria;p__Proteobacteria;c__Epsilonproteobacteria;o__Campylobacterales;f__Helicobacteraceae;g__Flexispira;s__rappini	1.0
++#OTU4	Root;k__Bacteria;p__Proteobacteria;c__Deltaproteobacteria;o__Desulfovibrionales;f__Desulfovibrionaceae;g__Desulfovibrio;s__C21_c20	1.0
++
++  my $k = "unclassified";
++  my $p = "unclassified";
++  my $c = "unclassified";
++  my $o = "unclassified";
++  my $f = "unclassified";
++  my $g = "unclassified";
++  my $s = "unclassified";
++
++  $j = $taxon;
++  if ($j =~ /^k__([^;]+)/) {$k = $1;}
++  if ($j =~ /;k__([^;]+)/) {$k = $1;}
++  if ($j =~ /;p__([^;]+)/) {$p = $1;}
++  if ($j =~ /;c__([^;]+)/) {$c = $1;}
++  if ($j =~ /;o__([^;]+)/) {$o = $1;}
++  if ($j =~ /;f__([^;]+)/) {$f = $1;}
++  if ($j =~ /;g__([^;]+)/) {$g = $1;}
++  if ($j =~ /;s__([^;]+)/) {$s = $1;}
++
++  if ($j =~ /^D_0__([^;]+)/) {$k = $1;}
++  if ($j =~ /;D_0__([^;]+)/) {$k = $1;}
++  if ($j =~ /;D_1__([^;]+)/) {$p = $1;}
++  if ($j =~ /;D_2__([^;]+)/) {$c = $1;}
++  if ($j =~ /;D_3__([^;]+)/) {$o = $1;}
++  if ($j =~ /;D_4__([^;]+)/) {$f = $1;}
++  if ($j =~ /;D_5__([^;]+)/) {$g = $1;}
++  if ($j =~ /;D_6__([^;]+)/) {$s = $1;}
++
++  if (($g ne "unclassified") and ($s ne "unclassified")) {
++    if ( substr($s, 0, length($g)) ne $g) { #### if species name doesn't contain genus name, add
++      $s = "$g $s";
++    }
++  } 
++  $taxon_info{$otu} = [$k,$p,$c,$o,$f,$g,$s];
++
++}
++close(TMP);
++
++my @ranks = qw/kingdom phylum class order family genus species/;
++my %rank_col = qw/kingdom 0 phylum 1 class 2 order 3 family 4 genus 5 species 6/;
++foreach $rank (@ranks) {
++  next if ($rank eq "kingdom");
++
++  my $c = $rank_col{$rank};
++  my $out = "$output.$rank.txt";
++  open(OUT, "> $out") || die "can not write to $out";
++  #### print table header
++  print OUT "#", join("\t", @ranks[0..$c]);
++  print OUT "\t", join("\t", @samples), "\n";
++
++  my %rank_ti_info = ();
++  my %rank_mat = ();
++  my %ti_sum = ();
++  foreach $otu (@otus) {
++    my @ann = @{$taxon_info{$otu}};
++    my $ti = join("|", @ann[0 .. $c] );
++
++    if (not defined($rank_ti_info{$ti})) {
++      $rank_ti_info{$ti} = [ @ann[0 .. $c] ];
++    }
++    foreach $sample (@samples) {
++      $rank_mat{$ti}{$sample} += $otu_mat{$otu}{$sample}; 
++      $ti_sum{$ti} += $otu_mat{$otu}{$sample};
++    }
++  }
++  my @tis = keys %rank_mat;
++     @tis = sort {$ti_sum{$b} <=> $ti_sum{$a} } @tis;
++
++  foreach $ti (@tis) {
++    print OUT join("\t", @{ $rank_ti_info{$ti} } );
++    foreach $sample (@samples) {
++      print OUT "\t", $rank_mat{$ti}{$sample};
++    }
++    print OUT "\n";
++  }
++  close(OUT);
++}
++
++
++sub usage {
++<<EOD
++Given cd-hit-otu pipeline output, this script generate
++taxonomy abundance tables
++
++usage:
++  $0  -i otu_table -t taxon_info_file -o output
++
++  options
++    -i input OTU table from cd-hit-otu 16S pipeline, default STDIN
++       e.g. OTU-short.txt
++    -t taxon info, e.g. OTU-features.txt
++    -o output file
++
++EOD
++}
++########## END usage
++
+diff --git a/usecases/Miseq-16S/cd-hit-otu-miseq-PE.pl b/usecases/Miseq-16S/cd-hit-otu-miseq-PE.pl
+index e746d60..5c2424c 100755
+--- a/usecases/Miseq-16S/cd-hit-otu-miseq-PE.pl
++++ b/usecases/Miseq-16S/cd-hit-otu-miseq-PE.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ use Getopt::Std;
+ my $script_name = $0;
+diff --git a/usecases/Miseq-16S/clstr_2_OTU_table.pl b/usecases/Miseq-16S/clstr_2_OTU_table.pl
+index 0ad13e3..0a4cccb 100755
+--- a/usecases/Miseq-16S/clstr_2_OTU_table.pl
++++ b/usecases/Miseq-16S/clstr_2_OTU_table.pl
+@@ -1,10 +1,21 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ #
+ use Getopt::Std;
+-getopts("i:s:S:o:f:j:",\%opts);
++getopts("i:s:S:o:f:j:b:h:",\%opts);
++
++die usage() unless($opts{o});
+ 
+ my $input             = $opts{i}; $input   = "OTU.clstr" unless $input;
+-my $output            = $opts{o}; $output  = "OTU.txt" unless ($output);
++my $output            = $opts{o};
++my $output_pre        = $output;
++   $output_pre        =~ s/\.([^\.])+$//;
++my $output_meta       = "$output_pre-sample-meta.txt";
++my $output_feature    = "$output_pre-feature.txt";
++my $output_short      = "$output_pre-short.txt";
++my $output_biom       = "$output_pre.biom";
++my $biom_exe          = $opts{b};
++
++
+ my ($i, $j, $k, $str, $cmd, $ll);
+ 
+ my %count = ();
+@@ -31,8 +42,18 @@ while($ll=<TMP>){
+         $count_s{$sample_id}++;
+       }
+       else {
++        $id =~ s/^([^\|]+)\|//;
++        $id =~ s/;\./;/g;
++        $id = "Root;$id";
++        $id =~ s/;D_0__/;k__/;
++        $id =~ s/;D_1__/;p__/;
++        $id =~ s/;D_2__/;c__/;
++        $id =~ s/;D_3__/;o__/;
++        $id =~ s/;D_4__/;f__/;
++        $id =~ s/;D_5__/;g__/;
++        $id =~ s/;D_6__/;s__/;
+         $OTU_2_ann{$OTU} = $id;
+-        $tree_flag = 1 if ($id =~ /\|k__Bacteria;.p__/);
++        $tree_flag = 1 if ($id =~ /;k__Bacteria/);
+       }
+     }
+     else {
+@@ -45,23 +66,34 @@ close(TMP);
+ my @sample_ids = sort keys %sample_id;
+ 
+ open(OUT1, "> $output") || die "can not write $output";
+-print OUT1 "OTU";
++open(OUT2, "> $output_short") || die "can not write $output_short";
++open(OUT3, "> $output_feature") || die "can not write $output_feature";
++
++print OUT1 "#OTUID";
++print OUT2 "#OTUID";
++print OUT3 "#OTUID";
+ foreach $sample_id (@sample_ids){
+   print OUT1 "\t$sample_id";
++  print OUT2 "\t$sample_id";
+ }
+ if ($tree_flag) {
+   print OUT1 "\t", join("\t", qw/Kingdom Phylum Class Order Family Genus Species/);
+ }
+ #print OUT1 "\tTotal\n";
+ print OUT1 "\tAnnotation\n";
++print OUT2 "\n";
++print OUT3 "\ttaxonomy\tconfidence\n";
+ 
+ for ($i=1; $i<=$OTU; $i++){
+-  $ann = "None";
++  $ann = "";
+   if ($OTU_2_ann{$i}) { $ann = $OTU_2_ann{$i}; }
+   print OUT1 "OTU$i";
++  print OUT2 "OTU$i";
++  print OUT3 "OTU$i";
+   foreach $sample_id (@sample_ids){
+     $k = $count{$i}{$sample_id}? $count{$i}{$sample_id} : 0;
+     print OUT1 "\t$k";
++    print OUT2 "\t$k";
+   }
+   if ($tree_flag) {
+     my ($tax_k, $tax_p, $tax_c, $tax_o, $tax_f, $tax_g, $tax_s);
+@@ -76,7 +108,37 @@ for ($i=1; $i<=$OTU; $i++){
+   }
+   #print OUT1 "\t$count_t{$i}";
+   print OUT1 "\t$ann\n";
++  print OUT2 "\n";
++  print OUT3 "\t$ann\t1.0\n";
+ }
+ close(OUT1);
++close(OUT2);
++close(OUT3);
++
++open(OUT, ">$output_meta") || die "can not write to $output_meta";
++print OUT "#SampleID\tGroup\n";
++foreach $sample_id (@sample_ids){
++  print OUT "$sample_id\tnogroup\n";
++}
++close(OUT);
+ 
++if (-e $biom_exe) {
++  $cmd = `$biom_exe convert -i $output_short -o $output_biom --to-hdf5 --observation-metadata-fp $output_feature --sample-metadata-fp $output_meta`;
++}
++
++sub usage {
++<<EOF
++This script converts OTU clusters to tsv format, and if biom is available,
++convert .biom file
++
++Usage:
++$script_name -i OTU.clstr -o OTU.txt
++
++Options:
++    -i input OTU.clstr, by cd-hit-otu-miseq
++    -o output OTU table
++    -b path to biom executible, if provided, will make .biom file
++
++EOF
++}
+ 
+diff --git a/usecases/Miseq-16S/filter-chimeric-and-small.pl b/usecases/Miseq-16S/filter-chimeric-and-small.pl
+index de303fe..3e07d35 100755
+--- a/usecases/Miseq-16S/filter-chimeric-and-small.pl
++++ b/usecases/Miseq-16S/filter-chimeric-and-small.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ use Getopt::Std;
+ my $script_name = $0;
+diff --git a/usecases/Miseq-16S/filter-chimeric-by-ref.pl b/usecases/Miseq-16S/filter-chimeric-by-ref.pl
+index 5e9b32a..9423398 100755
+--- a/usecases/Miseq-16S/filter-chimeric-by-ref.pl
++++ b/usecases/Miseq-16S/filter-chimeric-by-ref.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ use Getopt::Std;
+ my $script_name = $0;
+diff --git a/usecases/Miseq-16S/filter-nontop-ref.pl b/usecases/Miseq-16S/filter-nontop-ref.pl
+index 71d8ec3..4200212 100755
+--- a/usecases/Miseq-16S/filter-nontop-ref.pl
++++ b/usecases/Miseq-16S/filter-nontop-ref.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ use Getopt::Std;
+ my $script_name = $0;
+diff --git a/usecases/Miseq-16S/filter-refonly-cluster.pl b/usecases/Miseq-16S/filter-refonly-cluster.pl
+index c4c8ab6..bba8f12 100755
+--- a/usecases/Miseq-16S/filter-refonly-cluster.pl
++++ b/usecases/Miseq-16S/filter-refonly-cluster.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ use Getopt::Std;
+ my $script_name = $0;
+diff --git a/usecases/Miseq-16S/greengene-ann1.pl b/usecases/Miseq-16S/greengene-ann1.pl
+index fde835e..cf2fc6b 100755
+--- a/usecases/Miseq-16S/greengene-ann1.pl
++++ b/usecases/Miseq-16S/greengene-ann1.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ ## =========================== NGS tools ==========================================
+ ## NGS tools for metagenomic sequence analysis
+ ## May also be used for other type NGS data analysis
+diff --git a/usecases/Miseq-16S/pool_samples.pl b/usecases/Miseq-16S/pool_samples.pl
+index 44cc208..9e5c3a4 100755
+--- a/usecases/Miseq-16S/pool_samples.pl
++++ b/usecases/Miseq-16S/pool_samples.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ #
+ use Getopt::Std;
+ getopts("s:S:o:f:j:",\%opts);
+diff --git a/usecases/Miseq-16S/silva-ana1.pl b/usecases/Miseq-16S/silva-ann1.pl
+similarity index 98%
+rename from usecases/Miseq-16S/silva-ana1.pl
+rename to usecases/Miseq-16S/silva-ann1.pl
+index 03959a3..19fecc1 100755
+--- a/usecases/Miseq-16S/silva-ana1.pl
++++ b/usecases/Miseq-16S/silva-ann1.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ ## =========================== NGS tools ==========================================
+ ## NGS tools for metagenomic sequence analysis
+ ## May also be used for other type NGS data analysis
+diff --git a/usecases/Miseq-16S/silva-qiime-format-ann1.pl b/usecases/Miseq-16S/silva-qiime-format-ann1.pl
+new file mode 100755
+index 0000000..66e9e39
+--- /dev/null
++++ b/usecases/Miseq-16S/silva-qiime-format-ann1.pl
+@@ -0,0 +1,75 @@
++#!/usr/bin/env perl
++## =========================== NGS tools ==========================================
++## NGS tools for metagenomic sequence analysis
++## May also be used for other type NGS data analysis
++##
++##                                      Weizhong Li, UCSD
++##                                      liwz@sdsc.edu
++## http://weizhongli-lab.org/
++## ================================================================================
++
++use Getopt::Std;
++getopts("i:j:o:r:e:p:q:c:d:N:t:u:d:M:T:S:",\%opts);
++die usage() unless ($opts{i} and $opts{j} and $opts{o});
++my ($i, $j, $k, $cmd);
++my ($ll, $lla, $llb, $id, $ida, $idb, $seq, $seqa, $seqb, $qua, $quaa, $quab);
++my ($len, $lena, $lenb);
++
++my $file1 = $opts{i};
++my $fasta = $opts{j};
++my $output  = $opts{o};
++
++my %id_2_ann;
++open(TMP, $file1) || die "can not open $file1";
++while($ll=<TMP>){
++  chop($ll);
++  my ($id, $txt) = split(/\s+/, $ll, 2);
++  $txt =~ s/ /./g;
++  $id_2_ann{$id} = $txt;
++}
++close(TMP);
++
++my %id_2_seq = ();
++my $id = "";
++open(TMP, $fasta) || die "can not open $fasta";
++while($ll=<TMP>){
++  if ($ll =~ /^>(\S+)/) {
++    chop($ll);
++    $id = $1;
++    $ann = $id_2_ann{$id};
++    $id = "$id|$ann" if ($ann);
++  }
++  else {
++    $id_2_seq{$id} .= $ll;
++  }
++}
++
++close(TMP);
++
++my @ids = keys %id_2_seq;
++   @ids = sort {length($b) <=> length($a) } @ids;
++
++open(OUT, "> $output") || die "can not write to $output";
++foreach $id (@ids) {
++  print OUT ">$id\n$id_2_seq{$id}";
++}
++close(OUT);
++
++
++
++sub usage {
++<<EOD;
++This script formats Qiime comtatible Silva FASTA file for CD-HIT-OTU-MiSeq. You should download Silva sequences
++from https://www.arb-silva.de/fileadmin/silva_databases/qiime/Silva_xxx_release.zip
++unzip this file
++
++Run this script as $0 -i SILVA_132_QIIME_release/taxonomy/16S_only/99/consensus_taxonomy_7_levels.txt \\
++  -j SILVA_132_QIIME_release/rep_set/rep_set_16S_only/99/silva_132_99_16S.fna -o silva_132_99_16S_processed.fna
++
++Options:
++======================
++        -i path for SILVA_132_QIIME_release/taxonomy/16S_only/99/consensus_taxonomy_7_levels.txt
++        -j path for SILVA_132_QIIME_release/rep_set/rep_set_16S_only/99/silva_132_99_16S.fna -o silva_132_99_16S_processed.fna
++        -o output FASTA file of formatted Silva reference DB
++EOD
++}
+diff --git a/usecases/miRNA-seq/NG-Omics-miRNA-seq.pl b/usecases/miRNA-seq/NG-Omics-miRNA-seq.pl
+index 010a088..20c8b78 100644
+--- a/usecases/miRNA-seq/NG-Omics-miRNA-seq.pl
++++ b/usecases/miRNA-seq/NG-Omics-miRNA-seq.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ ################################################################################
+ # NGS workflow by Weizhong Li, http://weizhongli-lab.org
+ ################################################################################
+diff --git a/usecases/miRNA-seq/clstr_2_miRNA-table.pl b/usecases/miRNA-seq/clstr_2_miRNA-table.pl
+index a98688a..2f45874 100755
+--- a/usecases/miRNA-seq/clstr_2_miRNA-table.pl
++++ b/usecases/miRNA-seq/clstr_2_miRNA-table.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ #
+ use Getopt::Std;
+ getopts("i:s:S:o:f:j:",\%opts);
+diff --git a/usecases/miRNA-seq/filter-small-cluster.pl b/usecases/miRNA-seq/filter-small-cluster.pl
+index f649797..00dfae4 100755
+--- a/usecases/miRNA-seq/filter-small-cluster.pl
++++ b/usecases/miRNA-seq/filter-small-cluster.pl
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl
++#!/usr/bin/env perl
+ 
+ use Getopt::Std;
+ my $script_name = $0;
diff --git a/recipes/cd-hit/meta.yaml b/recipes/cd-hit/meta.yaml
index 8163e48de1d69..b46294a67d0a4 100644
--- a/recipes/cd-hit/meta.yaml
+++ b/recipes/cd-hit/meta.yaml
@@ -8,9 +8,11 @@ package:
 source:
   url: https://github.com/weizhongli/cdhit/archive/V{{ version }}.tar.gz
   sha256: {{ sha256 }}
+  patches:
+    - cd-hit.patch
 
 build:
-  number: 10
+  number: 11
   run_exports:
     - {{ pin_subpackage('cd-hit', max_pin="x") }}
 
@@ -19,22 +21,30 @@ requirements:
     - make
     - {{ compiler('cxx') }}
     - {{ compiler('c') }}
-    - llvm-openmp  # [osx]
-    - libgomp  # [linux]
   host:
     - zlib
-  run:
-    - zlib
+    - llvm-openmp  # [osx]
+    - libgomp  # [linux]
 
 test:
   commands:
     - cd-hit --help | grep 'Usage' > /dev/null
 
 about:
-  home: https://github.com/weizhongli/cdhit
-  license: GPLv2
-  summary: Clusters and compares protein or nucleotide sequences
+  home: "https://github.com/weizhongli/cdhit"
+  license: "GPL-2.0-or-later"
+  license_family: GPL
+  license_file: "license.txt"
+  summary: "Clusters and compares protein or nucleotide sequences."
+  dev_url: "https://github.com/weizhongli/cdhit"
+  doc_url: "https://github.com/weizhongli/cdhit/wiki"
 
 extra:
   additional-platforms:
-    - linux-aarch64
\ No newline at end of file
+    - linux-aarch64
+    - osx-arm64
+  identifiers:
+    - doi:10.1093/bioinformatics/17.3.282
+    - doi:10.1093/bioinformatics/18.1.77
+    - biotools:cd-hit
+    - usegalaxy-eu:cd_hit