diff --git a/bin/ChopGO_ChromoGoatee.pl b/bin/ChopGO_ChromoGoatee.pl deleted file mode 100755 index cca398a..0000000 --- a/bin/ChopGO_ChromoGoatee.pl +++ /dev/null @@ -1,495 +0,0 @@ -#!/usr/bin/perl -use warnings; -use strict; -use Getopt::Long qw(GetOptions); -use Cwd qw(abs_path cwd); -use File::Path qw(make_path); - -# INITIALIZE PATH -my $binPath = abs_path($0); -$0 =~ s/^.*\///; -$binPath =~ s/\/$0$//; # vast-tools/bin - -my $GO_file; -my $input; -my $background; -my $species; -my $helpFlag=0; -my $path_to_DB="$binPath/../VASTDB"; -my $plot = 1; -my $pval_cutoff=0.05; -my $num_cutoff=10; -my $plot_only=0; -my $plot_open=0; -my $install_topGO; -my $meth="none"; -my $sort_enrich=0; -my $min_genes=2; -my $max_genes=100000; -my $infer=0; - -GetOptions( "help" => \$helpFlag, - 'i=s' => \$input, - "bg=s" => \$background, - 'sp=s' => \$species, - "db=s" => \$path_to_DB, - "GO_file=s" => \$GO_file, - "plot" => \$plot, - "pval=s" => \$pval_cutoff, - "pval_type=s" => \$meth, - "max_plot=s" => \$num_cutoff, - "plot_only" => \$plot_only, - "open_plot" => \$plot_open, - "install_topGO" => \$install_topGO, - "filt_enrich=s" => \$sort_enrich, - "min_genes_req=s" => \$min_genes, - "max_genes_req=s" => \$max_genes, - "allow_inferred" => \$infer, - ); - - - -my $EXIT_STATUS=0; -if (defined $input && ((defined $species && defined $path_to_DB) || (defined $GO_file))){ - print "Starting...\n\n"; -} -else{ - $EXIT_STATUS=1; -} - -if ($helpFlag or $EXIT_STATUS){ - die " -usage: ChopGO.pl -i (-sp -db OR --GO_file Custom_GO_file) [options] - -compulsory: - -i Input list of genes (compulsory). Sep by \'\\n\'. Optional second column for subdivisions of list. - - -sp Species must be (Hsa,Bla,Mmu code). Check availability in DB. -OR - --GO_file Custom file with GeneID\tGO_term associations - -optional: - -bg Background must be a list of genes sep by \\n. - -db Database, give path to DB of GO annotations. Default (./VASTDB) - -install_topGO Install topGO in R. - -plotting options: - - -plot Plot (BOOLEAN, optional), will create a histogram/s. Default=0=(OFF). - -pval Choose pval cutoff for histogram plots. Default=0.05. - -pval_type Choose correction (holm, hochberg, hommel, bonferroni, BH, BY, fdr or none). Default=none. - -filt_enrich Filter by fold enrichment. Default=0. - -max_plot Max number of results to plot for each histogram (Bp,MF,CC). Default=10. - -min_genes_req Choose minimum number of genes containing each GO term. Default=2. - -max_genes_req Choose maximum number of genes containing each GO term. Default=100000. - -allow_inferred Allow inferred parent (GO, not in original file) into final enrichment. Default=0=OFF. - -plot_only Plot Only (BOOLEAN, optional), don't do GO enrichments. Default=0=OFF. - -open_plot Open plots once made (BOOLEAN, optional). Default=0=OFF. - -Pre-requisites: - R - -*** Questions: - Chris Wyatt (chris.wyatt\@crg.eu) - Manuel Irimia (mirimia\@gmail.com) - -" -} - - -my $path_to_GOs; -#my inferred file path -if ($infer){} -elsif ($GO_file){ - $path_to_GOs=$GO_file; - $infer=$path_to_GOs; -} -else { - $path_to_GOs="$path_to_DB\/$species\/FILES/GO_FILE_$species"; - $infer=$path_to_GOs; -} - -if (-e $path_to_GOs){ - #good for you -} -else{ - print " - -GO file does not exist:\n -= $path_to_GOs\n -This may be because there is no GO file in the DB you are pointing at.\n -If this is the case, you might do the following: \n -1, Go to https://github.com/vastgroup/vast-tools. -2, Download the GO_file of the species of interest. - -Alternatively, you may download your own annotation, e.g.: - -1, Go to http://www.ensembl.org/index.html.\n -2, Go to BioMart.\n -3, Choose database Ensembl Genes (latest version).\n -4, Choose species.\n -5, Click Attributes.\n -6, Under Gene, select \"Gene ID\" only.\n -7, Under External, select : \"GO Term Accession\" and \"GO Term Name\".\n -8, Click results, Then Go (to save the file to your computer).\n -9, Finally, make sure file is tab separated, and it goes GENE ID (e.g. ENG000..), GO (e.g.GO:000001), GO name (e.g. Liver related). IN this order.\n -10, Save file as GO_FILE_species (e.g. GO_FILE_Hsa), in the DB folder under the same three letter species name.\n - " -} - - -#if you just want to plot -if ($plot_only == 0){ - -#Files to be printed, filled in later. -my @ALL_made_files; - -my @methods=("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none"); - - - -### DATABASE CHOICE, print to user -print "GO = $species GO\nDb = $path_to_DB\n\n" if !defined $GO_file; -print "GO file = $GO_file\n\n" if defined $GO_file; - -###BACKGROUND### -my $outfile="BACKGROUND\.forR"; -my %Background_hash; -if($background){ - ## FIND BACKGROUND GENES - open(my $IN_b, "<", $background) or die "Could not open $background \n"; - while (my $line=<$IN_b>){ - $line=~s/\r//g; - chomp $line; - my @input_here=split("\t", $line); - if (scalar @input_here > 1.5){ - $Background_hash{$input_here[0]}="HIT"; - } - else{ - $Background_hash{$line}="HIT"; - } - - } - my $n_back = keys %Background_hash; - - ###MAKE BACKGROUND FOR R; - my $Table_b1; - if ($GO_file){ - $Table_b1 = $GO_file; - } - else { - $Table_b1 = "$path_to_DB\/$species\/FILES/GO_FILE_$species"; - } - open(my $IN, "<", $Table_b1) or die "Could not open $Table_b1 \n"; - open(my $outhandle, ">", $outfile) or die "Could not open $outfile \n"; - my %Gene_Go_Hash; - my %tot_genes; - while (my $line=<$IN>){ - $line=~s/\r//g; - chomp $line; - my @linesplit= split("\t", $line); - my $gene=$linesplit[0]; - $tot_genes{$gene}="HIT"; - if ($Background_hash{$gene}){ - my $GO=$linesplit[1]; - if ($GO){ - if ($Gene_Go_Hash{$gene}){ - my $old=$Gene_Go_Hash{$gene}; - #print "2+ $gene $old\t$GO\n"; - $Gene_Go_Hash{$gene}="$old\",\"$GO"; - } - else{ - $Gene_Go_Hash{$gene}=$GO; - #print "1st $gene $GO\n"; - } - } - } - } - my $n_all=keys %tot_genes; - print "BACKGROUND\nOf $n_all total Genes with GO annotation, $n_back were chosen for the background (based on user -b list)\n\n"; - - print $outhandle "Chop.gene2GO<- list()\n"; - foreach my $key ( keys %Gene_Go_Hash ){ - print $outhandle "Chop.gene2GO\$$key <- c(\"$Gene_Go_Hash{$key}\")\n"; - } -} -else { - #all genes as background. No background chosen by user. - ###MAKE BACKGROUND FOR R; - my $Table_b2; - if ($GO_file){ - $Table_b2 = $GO_file; - } - else { - $Table_b2 = "$path_to_DB\/$species\/FILES/GO_FILE_$species"; - } - open(my $IN, "<", $Table_b2) or die "Could not open $Table_b2 \n"; - open(my $outhandle, ">", $outfile) or die "Could not open $outfile \n"; - my %Gene_Go_Hash; - while (my $line=<$IN>){ - $line=~s/\r//g; - chomp $line; - #$line=~s/-/_/g; - #$line=~s/:/_/g; - my @linesplit= split("\t", $line); - my $gene=$linesplit[0]; - $gene=~s/-/_/g; - $gene=~s/:/_/g; - my $GO=$linesplit[1]; - if ($GO){ - if ($Gene_Go_Hash{$gene}){ - my $old=$Gene_Go_Hash{$gene}; - #print "2+ $gene $old\t$GO\n"; - $Gene_Go_Hash{$gene}="$old\",\"$GO"; - } - else{ - $Gene_Go_Hash{$gene}=$GO; - #print "1st $gene $GO\n"; - } - } - } - - print $outhandle "Chop.gene2GO<- list()\n"; - foreach my $key ( keys %Gene_Go_Hash ){ - print $outhandle "Chop.gene2GO\$$key <- c(\"$Gene_Go_Hash{$key}\")\n"; - } -} - - -###QUERY INPUT### - -my $Table_i = $input; -open(my $IN, "<", $Table_i) or die "Could not open $Table_i \n"; - -if ($Table_i=~ m/\//){ - my @split=split ("\/", $Table_i); - $Table_i=$split[-1]; - $Table_i=~ s/\-/\_/g; - -} - -if ($Table_i =~ m/\-/){ - print "Name of input file contains \"-\"\'s , replacing with \"\_\"\'s, for processing in R\n"; - $Table_i=~ s/\-/\_/g; -} - -my $outfile2="$Table_i\.converted"; -my $outfile3="$Table_i\.R_GO_subs"; - -open(my $outhandle2, ">", $outfile2) or die "Could not open $outfile2 \n"; -open(my $outhandle3, ">", $outfile3) or die "Could not open $outfile3 \n"; - -print "\nStep 2\n\n"; - -my %chromo_score; - -my %Gene_Go_Hash; -while (my $line=<$IN>){ - $line =~ s/\r//g; - $line =~ s/\"//g; - chomp $line; - #print "$line\n"; - my @linesplit= split("\t", $line); - my $gene=$linesplit[0]; - $gene=~s/-/_/g; - $gene=~s/:/_/g; - - - if ($linesplit[1]){ - my $GROUP=$linesplit[1]; - if ($Gene_Go_Hash{$GROUP}){ - my $old=$Gene_Go_Hash{$GROUP}; - $Gene_Go_Hash{$GROUP}="$old\",\"$gene"; - $chromo_score{$GROUP}++; - } - else{ - $Gene_Go_Hash{$GROUP}=$gene; - $chromo_score{$GROUP}=1; - } - } - else{ - my $GROUP="$Table_i"; - if ($Gene_Go_Hash{$GROUP}){ - my $old=$Gene_Go_Hash{$GROUP}; - $Gene_Go_Hash{$GROUP}="$old\",\"$gene"; - } - else{ - $Gene_Go_Hash{$GROUP}=$gene; - } - } -} - - -#Go through chromosomes and if any contain 20 or less genes, ignore them, as topGO may fail. -if (%chromo_score){ - foreach my $key ( keys %chromo_score ){ - my $val=$chromo_score{$key}; - if ($val <=20){ - #Remove entry from hash, as its too small really for GO. - delete($Gene_Go_Hash{$key}); - } - } -} - -#Read through multi GROUP file and ensure no lists are less than 20 genes (which won't often run through GO). - - - -### RUN R COMPARISONS ### - -print $outhandle2 "Chop.WGCNA2Gene<- list()\n"; -foreach my $key ( keys %Gene_Go_Hash ){ - - print $outhandle2 "Chop.WGCNA2Gene\$$key <- c(\"$Gene_Go_Hash{$key}\")\n"; - - #print GO runs - print $outhandle3 "selGenes<-Chop.WGCNA2Gene\$",$key,"\n"; - print $outhandle3 "inGenes <- factor(as.integer(names(Chop.gene2GO) %in% selGenes))\n"; - print $outhandle3 "names(inGenes) <- names(Chop.gene2GO)\n"; - - #BP - print $outhandle3 "GOdata <- new(\"topGOdata\", ontology=\"BP\", allGenes=inGenes, annot=annFUN.gene2GO, gene2GO=Chop.gene2GO)\n"; - print $outhandle3 "resultFisher <- runTest(GOdata, algorithm = \"classic\", statistic = \"fisher\")\n"; - print $outhandle3 "allRes_BP <- GenTable(GOdata, classicFisher = resultFisher,orderBy = \"classicFisher\", ranksOf = \"classicFisher\", topNodes = 50)\n"; - foreach my $meths (@methods){ - print $outhandle3 "allRes_BP\$",$meths,"<-p.adjust(allRes_BP\$classicFisher, method = \"",$meths,"\")\n"; - } - print $outhandle3 "allRes_BP\$FoldChange<-allRes_BP\$Significant/allRes_BP\$Expected\n"; - print $outhandle3 "allRes_BP\$ontology<-\"BP\"\n"; - - #MF - print $outhandle3 "GOdata <- new(\"topGOdata\", ontology=\"MF\", allGenes=inGenes, annot=annFUN.gene2GO, gene2GO=Chop.gene2GO)\n"; - print $outhandle3 "resultFisher <- runTest(GOdata, algorithm = \"classic\", statistic = \"fisher\")\n"; - print $outhandle3 "allRes_MF <- GenTable(GOdata, classicFisher = resultFisher,orderBy = \"classicFisher\", ranksOf = \"classicFisher\", topNodes = 50)\n"; - foreach my $meths (@methods){ - print $outhandle3 "allRes_MF\$",$meths,"<-p.adjust(allRes_MF\$classicFisher, method = \"",$meths,"\")\n"; - } - print $outhandle3 "allRes_MF\$FoldChange<-allRes_MF\$Significant/allRes_MF\$Expected\n"; - print $outhandle3 "allRes_MF\$ontology<-\"MF\"\n"; - - #CC - print $outhandle3 "GOdata <- new(\"topGOdata\", ontology=\"CC\", allGenes=inGenes, annot=annFUN.gene2GO, gene2GO=Chop.gene2GO)\n"; - print $outhandle3 "resultFisher <- runTest(GOdata, algorithm = \"classic\", statistic = \"fisher\")\n"; - print $outhandle3 "allRes_CC <- GenTable(GOdata, classicFisher = resultFisher,orderBy = \"classicFisher\", ranksOf = \"classicFisher\", topNodes = 50)\n"; - foreach my $meths (@methods){ - print $outhandle3 "allRes_CC\$",$meths,"<-p.adjust(allRes_CC\$classicFisher, method = \"",$meths,"\")\n"; - } - print $outhandle3 "allRes_CC\$FoldChange<-allRes_CC\$Significant/allRes_CC\$Expected\n"; - print $outhandle3 "allRes_CC\$ontology<-\"CC\"\n"; - - #ALL MERGE - print $outhandle3 "ALL_res_",$key,"<-rbind(allRes_BP, allRes_MF, allRes_CC)\n"; - print $outhandle3 "x.sub <- subset(ALL_res_",$key,", none < ",$pval_cutoff,")\n"; - print $outhandle3 "f.sub<-x.sub[sort.list(x.sub\$none),]\n"; - print $outhandle3 "e.sub <- subset(f.sub, FoldChange > ",$sort_enrich,")\n"; - print $outhandle3 "write.table(e.sub, \"$species\_$key\_res.tab\", sep=\"\\t\", quote=FALSE, eol=\"\\n\", row.names=F) \n"; - my $out_name="$species\_$key\_res.tab"; - push (@ALL_made_files, $out_name); -} - -print "Finished compiling scripts, now running R... (can take ~40 seconds for each query)\n\n"; - -### RUN MAIN CODE ### -my $outfile4="R_CODE_TO_RUN"; -open(my $outhandle4, ">", $outfile4) or die "Could not open $outfile4 \n"; -if (defined $install_topGO){ - print $outhandle4 "source(\"http://bioconductor.org/biocLite.R\")\n"; - print $outhandle4 "biocLite(\"topGO\")\n"; -} - -print $outhandle4 "suppressWarnings(suppressMessages(library (topGO)))\n"; -print $outhandle4 "source (\"$outfile\")\nsource (\"$outfile2\")\nsource (\"$outfile3\")\n"; -print $outhandle4 "save.image(\"Image.Rdata\")\nquit(save = \"no\")\n"; - -`R --vanilla < R_CODE_TO_RUN >output.ofthis.test`; - -print "Start Plotting\n\n"; - -### RUN HITOGRAM CODE ### - -if (defined $plot){ - my @files=@ALL_made_files; - foreach my $results (@files){ - chomp $results; - my $in_here="$binPath\/MakeHist_fromChartGO-jcvi.pl"; - if ($in_here=~ m/ /){ - $in_here=~ s/ /\\ /g; - } - if ($in_here=~ m/\(/){ - $in_here=~ s/\(/\\(/g; - } - if ($in_here=~ m/\)/){ - $in_here=~ s/\)/\\)/g; - } - #print "HERE: $in_here\n"; - `$in_here $results $pval_cutoff $num_cutoff $plot_open $meth $sort_enrich $min_genes $max_genes $infer`; - } -} - -#if you just want to plot. Miss all the other steps. -} -else { - my $Table_i = $input; - open(my $IN, "<", $Table_i) or die "Could not open $Table_i \n"; - my %Gene_Go_Hash; - while (my $line=<$IN>){ - $line =~ s/\r//g; - $line =~ s/\"//g; - chomp $line; - #print "$line\n"; - my @linesplit= split("\t", $line); - my $gene=$linesplit[0]; - if ($linesplit[1]){ - my $GROUP=$linesplit[1]; - if ($Gene_Go_Hash{$GROUP}){ - my $old=$Gene_Go_Hash{$GROUP}; - $Gene_Go_Hash{$GROUP}="$old\",\"$gene"; - } - else{ - $Gene_Go_Hash{$GROUP}=$gene; - } - } - else{ - my $GROUP="$Table_i"; - if ($Gene_Go_Hash{$GROUP}){ - my $old=$Gene_Go_Hash{$GROUP}; - $Gene_Go_Hash{$GROUP}="$old\",\"$gene"; - } - else{ - $Gene_Go_Hash{$GROUP}=$gene; - } - } - } - - my @ALL_made_files; - foreach my $key ( keys %Gene_Go_Hash ){ - my $out_name="$species\_$key\_res.tab.tab"; - push (@ALL_made_files, $out_name); - } - - - foreach my $results (@ALL_made_files){ - chomp $results; - my $in_here="$binPath\/MakeHist_fromChartGO-jcvi.pl"; - if ($in_here=~ m/ /){ - $in_here=~ s/ /\\ /g; - } - if ($in_here=~ m/\(/){ - $in_here=~ s/\(/\\(/g; - } - if ($in_here=~ m/\)/){ - $in_here=~ s/\)/\\)/g; - } - #print "HERE: $in_here\n"; - print "$in_here $results $pval_cutoff $num_cutoff $plot_open $meth $sort_enrich $min_genes $max_genes $infer\n"; - `perl $in_here $results $pval_cutoff $num_cutoff $plot_open $meth $sort_enrich $min_genes $max_genes $infer`; - } -} - -print "Script completed\n\n"; - -#TIDY UP , temporary files -$input=~s/\-/\_/g; -#`rm -f ACTUAL_R_CODE BACKGROUND.forR R_CODE_TO_RUN output.ofthis.test $input\.converted $input\.R_GO_subs tmp_file Image.Rdata`; - - diff --git a/bin/ChopGO_VTS2.pl b/bin/ChopGO_VTS2.pl deleted file mode 100755 index ac07244..0000000 --- a/bin/ChopGO_VTS2.pl +++ /dev/null @@ -1,476 +0,0 @@ -#!/usr/bin/perl -use warnings; -use strict; -use Getopt::Long qw(GetOptions); -use Cwd qw(abs_path cwd); -use File::Path qw(make_path); - -# INITIALIZE PATH -my $binPath = abs_path($0); -$0 =~ s/^.*\///; -$binPath =~ s/\/$0$//; # vast-tools/bin - -my $GO_file; -my $input; -my $background; -my $species; -my $helpFlag=0; -my $path_to_DB="$binPath/../VASTDB"; -my $plot = 1; -my $pval_cutoff=0.05; -my $num_cutoff=10; -my $plot_only=0; -my $plot_open=0; -my $install_topGO; -my $meth="none"; -my $sort_enrich=0; -my $min_genes=2; -my $max_genes=100000; -my $infer=0; - -GetOptions( "help" => \$helpFlag, - 'i=s' => \$input, - "bg=s" => \$background, - 'sp=s' => \$species, - "db=s" => \$path_to_DB, - "GO_file=s" => \$GO_file, - "plot" => \$plot, - "pval=s" => \$pval_cutoff, - "pval_type=s" => \$meth, - "max_plot=s" => \$num_cutoff, - "plot_only" => \$plot_only, - "open_plot" => \$plot_open, - "install_topGO" => \$install_topGO, - "filt_enrich=s" => \$sort_enrich, - "min_genes_req=s" => \$min_genes, - "max_genes_req=s" => \$max_genes, - "allow_inferred" => \$infer, - ); - - - -my $EXIT_STATUS=0; -if (defined $input && ((defined $species && defined $path_to_DB) || (defined $GO_file))){ - print "Starting...\n\n"; -} -else{ - $EXIT_STATUS=1; -} - -if ($helpFlag or $EXIT_STATUS){ - die " -usage: ChopGO.pl -i (-sp -db OR --GO_file Custom_GO_file) [options] - -compulsory: - -i Input list of genes (compulsory). Sep by \'\\n\'. Optional second column for subdivisions of list. - - -sp Species must be (Hsa,Bla,Mmu code). Check availability in DB. -OR - --GO_file Custom file with GeneID\tGO_term associations - -optional: - -bg Background must be a list of genes sep by \\n. - -db Database, give path to DB of GO annotations. Default (./VASTDB) - -install_topGO Install topGO in R. - -plotting options: - - -plot Plot (BOOLEAN, optional), will create a histogram/s. Default=0=(OFF). - -pval Choose pval cutoff for histogram plots. Default=0.05. - -pval_type Choose correction (holm, hochberg, hommel, bonferroni, BH, BY, fdr or none). Default=none. - -filt_enrich Filter by fold enrichment. Default=0. - -max_plot Max number of results to plot for each histogram (Bp,MF,CC). Default=10. - -min_genes_req Choose minimum number of genes containing each GO term. Default=2. - -max_genes_req Choose maximum number of genes containing each GO term. Default=100000. - -allow_inferred Allow inferred parent (GO, not in original file) into final enrichment. Default=0=OFF. - -plot_only Plot Only (BOOLEAN, optional), don't do GO enrichments. Default=0=OFF. - -open_plot Open plots once made (BOOLEAN, optional). Default=0=OFF. - -Pre-requisites: - R - -*** Questions: - Chris Wyatt - -" -} - - -my $path_to_GOs; -#my inferred file path -if ($infer){} -elsif ($GO_file){ - $path_to_GOs=$GO_file; - $infer=$path_to_GOs; -} -else { - $path_to_GOs="$path_to_DB\/$species\/FILES/GO_FILE_$species"; - $infer=$path_to_GOs; -} - -if (-e $path_to_GOs){ - #good for you -} -else{ - print " - -GO file does not exist:\n -= $path_to_GOs\n -This may be because there is no GO file in the DB you are pointing at.\n -If this is the case, you might do the following: \n -1, Go to https://github.com/vastgroup/vast-tools. -2, Download the GO_file of the species of interest. - -Alternatively, you may download your own annotation, e.g.: - -1, Go to http://www.ensembl.org/index.html.\n -2, Go to BioMart.\n -3, Choose database Ensembl Genes (latest version).\n -4, Choose species.\n -5, Click Attributes.\n -6, Under Gene, select \"Gene ID\" only.\n -7, Under External, select : \"GO Term Accession\" and \"GO Term Name\".\n -8, Click results, Then Go (to save the file to your computer).\n -9, Finally, make sure file is tab separated, and it goes GENE ID (e.g. ENG000..), GO (e.g.GO:000001), GO name (e.g. Liver related). IN this order.\n -10, Save file as GO_FILE_species (e.g. GO_FILE_Hsa), in the DB folder under the same three letter species name.\n - " -} - - -#if you just want to plot -if ($plot_only == 0){ - -#Files to be printed, filled in later. -my @ALL_made_files; - -my @methods=("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none"); - - - -### DATABASE CHOICE, print to user -print "GO = $species GO\nDb = $path_to_DB\n\n" if !defined $GO_file; -print "GO file = $GO_file\n\n" if defined $GO_file; - -###BACKGROUND### -my $outfile="BACKGROUND\.forR"; -my %Background_hash; -if($background){ - ## FIND BACKGROUND GENES - open(my $IN_b, "<", $background) or die "Could not open $background \n"; - while (my $line=<$IN_b>){ - $line=~s/\r//g; - $line=~s/\-/\_/g; - chomp $line; - my @input_here=split("\t", $line); - if (scalar @input_here > 1.5){ - $Background_hash{$input_here[0]}="HIT"; - } - else{ - $Background_hash{$line}="HIT"; - } - - } - my $n_back = keys %Background_hash; - - ###MAKE BACKGROUND FOR R; - my $Table_b1; - if ($GO_file){ - $Table_b1 = $GO_file; - } - else { - $Table_b1 = "$path_to_DB\/$species\/FILES/GO_FILE_$species"; - } - open(my $IN, "<", $Table_b1) or die "Could not open $Table_b1 \n"; - open(my $outhandle, ">", $outfile) or die "Could not open $outfile \n"; - my %Gene_Go_Hash; - my %tot_genes; - while (my $line=<$IN>){ - $line=~s/\r//g; - $line=~s/\-/\_/g; - chomp $line; - my @linesplit= split("\t", $line); - my $gene=$linesplit[0]; - $tot_genes{$gene}="HIT"; - if ($Background_hash{$gene}){ - my $GO=$linesplit[1]; - if ($GO){ - if ($Gene_Go_Hash{$gene}){ - my $old=$Gene_Go_Hash{$gene}; - #print "2+ $gene $old\t$GO\n"; - $Gene_Go_Hash{$gene}="$old\",\"$GO"; - } - else{ - $Gene_Go_Hash{$gene}=$GO; - #print "1st $gene $GO\n"; - } - } - } - } - my $n_all=keys %tot_genes; - print "BACKGROUND\nOf $n_all total Genes with GO annotation, $n_back were chosen for the background (based on user -b list)\n\n"; - - print $outhandle "Chop.gene2GO<- list()\n"; - foreach my $key ( keys %Gene_Go_Hash ){ - print $outhandle "Chop.gene2GO\$$key <- c(\"$Gene_Go_Hash{$key}\")\n"; - } -} -else { - #all genes as background. No background chosen by user. - ###MAKE BACKGROUND FOR R; - my $Table_b2; - if ($GO_file){ - $Table_b2 = $GO_file; - } - else { - $Table_b2 = "$path_to_DB\/$species\/FILES/GO_FILE_$species"; - } - open(my $IN, "<", $Table_b2) or die "Could not open $Table_b2 \n"; - open(my $outhandle, ">", $outfile) or die "Could not open $outfile \n"; - my %Gene_Go_Hash; - while (my $line=<$IN>){ - $line=~s/\r//g; - chomp $line; - my @linesplit= split("\t", $line); - my $gene=$linesplit[0]; - $gene=~s/-/_/g; - $gene=~s/:/_/g; - my $GO=$linesplit[1]; - if ($GO){ - if ($Gene_Go_Hash{$gene}){ - my $old=$Gene_Go_Hash{$gene}; - #print "2+ $gene $old\t$GO\n"; - $Gene_Go_Hash{$gene}="$old\",\"$GO"; - } - else{ - $Gene_Go_Hash{$gene}=$GO; - #print "1st $gene $GO\n"; - } - } - } - - print $outhandle "Chop.gene2GO<- list()\n"; - foreach my $key ( keys %Gene_Go_Hash ){ - print $outhandle "Chop.gene2GO\$$key <- c(\"$Gene_Go_Hash{$key}\")\n"; - } -} - - -###QUERY INPUT### - -my $Table_i = $input; -open(my $IN, "<", $Table_i) or die "Could not open $Table_i \n"; - -if ($Table_i=~ m/\//){ - my @split=split ("\/", $Table_i); - $Table_i=$split[-1]; - $Table_i=~ s/\-/\_/g; - -} - -if ($Table_i =~ m/\-/){ - print "Name of input file contains \"-\"\'s , replacing with \"\_\"\'s, for processing in R\n"; - $Table_i=~ s/\-/\_/g; -} - -my $outfile2="$Table_i\.converted"; -my $outfile3="$Table_i\.R_GO_subs"; - -open(my $outhandle2, ">", $outfile2) or die "Could not open $outfile2 \n"; -open(my $outhandle3, ">", $outfile3) or die "Could not open $outfile3 \n"; - -print "\nStep 2\n\n"; - -my %Gene_Go_Hash; -while (my $line=<$IN>){ - $line =~ s/\r//g; - $line =~ s/\"//g; - chomp $line; - #print "$line\n"; - my @linesplit= split("\t", $line); - my $gene=$linesplit[0]; - $gene=~s/\-/\_/g; - $gene=~s/\:/\_/g; - - - if ($linesplit[1]){ - my $GROUP=$linesplit[1]; - if ($Gene_Go_Hash{$GROUP}){ - my $old=$Gene_Go_Hash{$GROUP}; - $Gene_Go_Hash{$GROUP}="$old\",\"$gene"; - } - else{ - $Gene_Go_Hash{$GROUP}=$gene; - } - } - else{ - my $GROUP="$Table_i"; - if ($Gene_Go_Hash{$GROUP}){ - my $old=$Gene_Go_Hash{$GROUP}; - $Gene_Go_Hash{$GROUP}="$old\",\"$gene"; - } - else{ - $Gene_Go_Hash{$GROUP}=$gene; - } - } -} - - -### RUN R COMPARISONS ### - -print $outhandle2 "Chop.WGCNA2Gene<- list()\n"; -foreach my $key ( keys %Gene_Go_Hash ){ - - print $outhandle2 "Chop.WGCNA2Gene\$$key <- c(\"$Gene_Go_Hash{$key}\")\n"; - - #print GO runs - print $outhandle3 "selGenes<-Chop.WGCNA2Gene\$",$key,"\n"; - print $outhandle3 "inGenes <- factor(as.integer(names(Chop.gene2GO) %in% selGenes))\n"; - print $outhandle3 "names(inGenes) <- names(Chop.gene2GO)\n"; - - #BP - print $outhandle3 "GOdata <- new(\"topGOdata\", ontology=\"BP\", allGenes=inGenes, annot=annFUN.gene2GO, gene2GO=Chop.gene2GO)\n"; - print $outhandle3 "resultFisher <- runTest(GOdata, algorithm = \"classic\", statistic = \"fisher\")\n"; - print $outhandle3 "allRes_BP <- GenTable(GOdata, classicFisher = resultFisher,orderBy = \"classicFisher\", ranksOf = \"classicFisher\", topNodes = 50)\n"; - foreach my $meths (@methods){ - print $outhandle3 "allRes_BP\$",$meths,"<-p.adjust(allRes_BP\$classicFisher, method = \"",$meths,"\")\n"; - } - print $outhandle3 "allRes_BP\$FoldChange<-allRes_BP\$Significant/allRes_BP\$Expected\n"; - print $outhandle3 "allRes_BP\$ontology<-\"BP\"\n"; - - #MF - print $outhandle3 "GOdata <- new(\"topGOdata\", ontology=\"MF\", allGenes=inGenes, annot=annFUN.gene2GO, gene2GO=Chop.gene2GO)\n"; - print $outhandle3 "resultFisher <- runTest(GOdata, algorithm = \"classic\", statistic = \"fisher\")\n"; - print $outhandle3 "allRes_MF <- GenTable(GOdata, classicFisher = resultFisher,orderBy = \"classicFisher\", ranksOf = \"classicFisher\", topNodes = 50)\n"; - foreach my $meths (@methods){ - print $outhandle3 "allRes_MF\$",$meths,"<-p.adjust(allRes_MF\$classicFisher, method = \"",$meths,"\")\n"; - } - print $outhandle3 "allRes_MF\$FoldChange<-allRes_MF\$Significant/allRes_MF\$Expected\n"; - print $outhandle3 "allRes_MF\$ontology<-\"MF\"\n"; - - #CC - print $outhandle3 "GOdata <- new(\"topGOdata\", ontology=\"CC\", allGenes=inGenes, annot=annFUN.gene2GO, gene2GO=Chop.gene2GO)\n"; - print $outhandle3 "resultFisher <- runTest(GOdata, algorithm = \"classic\", statistic = \"fisher\")\n"; - print $outhandle3 "allRes_CC <- GenTable(GOdata, classicFisher = resultFisher,orderBy = \"classicFisher\", ranksOf = \"classicFisher\", topNodes = 50)\n"; - foreach my $meths (@methods){ - print $outhandle3 "allRes_CC\$",$meths,"<-p.adjust(allRes_CC\$classicFisher, method = \"",$meths,"\")\n"; - } - print $outhandle3 "allRes_CC\$FoldChange<-allRes_CC\$Significant/allRes_CC\$Expected\n"; - print $outhandle3 "allRes_CC\$ontology<-\"CC\"\n"; - - #ALL MERGE - print $outhandle3 "ALL_res_",$key,"<-rbind(allRes_BP, allRes_MF, allRes_CC)\n"; - print $outhandle3 "x.sub <- subset(ALL_res_",$key,", none < ",$pval_cutoff,")\n"; - print $outhandle3 "f.sub<-x.sub[sort.list(x.sub\$none),]\n"; - print $outhandle3 "e.sub <- subset(f.sub, FoldChange > ",$sort_enrich,")\n"; - print $outhandle3 "write.table(e.sub, \"",$key,"_TopGo_results_ALL.tab\", sep=\"\\t\", quote=FALSE, eol=\"\\n\", row.names=F) \n"; - my $out_name="$key\_TopGo_results_ALL.tab"; - push (@ALL_made_files, $out_name); -} - -print "Finished compiling scripts, now running R... (can take ~40 seconds for each query)\n\n"; - -### RUN MAIN CODE ### -my $outfile4="R_CODE_TO_RUN"; -open(my $outhandle4, ">", $outfile4) or die "Could not open $outfile4 \n"; -if (defined $install_topGO){ - print $outhandle4 "source(\"http://bioconductor.org/biocLite.R\")\n"; - print $outhandle4 "biocLite(\"topGO\")\n"; -} - -print $outhandle4 "suppressWarnings(suppressMessages(library (topGO)))\n"; -print $outhandle4 "source (\"$outfile\")\nsource (\"$outfile2\")\nsource (\"$outfile3\")\n"; -print $outhandle4 "save.image(\"Image.Rdata\")\nquit(save = \"no\")\n"; - -`R --vanilla < R_CODE_TO_RUN >output.ofthis.test`; - -print "Start Plotting\n\n"; - -### RUN HITOGRAM CODE ### - -if (defined $plot){ - my @files=@ALL_made_files; - foreach my $results (@files){ - chomp $results; - my $in_here="$binPath\/MakeHist_fromChartGO-10.pl"; - if ($in_here=~ m/ /){ - $in_here=~ s/ /\\ /g; - } - if ($in_here=~ m/\(/){ - $in_here=~ s/\(/\\(/g; - } - if ($in_here=~ m/\)/){ - $in_here=~ s/\)/\\)/g; - } - - if ($results){ - `MakeHist_fromChartGO-10.pl $results $pval_cutoff $num_cutoff $plot_open $meth $sort_enrich $min_genes $max_genes $infer`; - } - } -} - -#if you just want to plot. Miss all the other steps. -} -else { - my $Table_i = $input; - open(my $IN, "<", $Table_i) or die "Could not open $Table_i \n"; - my %Gene_Go_Hash; - while (my $line=<$IN>){ - $line =~ s/\r//g; - $line =~ s/\"//g; - chomp $line; - #print "$line\n"; - my @linesplit= split("\t", $line); - my $gene=$linesplit[0]; - if ($linesplit[1]){ - my $GROUP=$linesplit[1]; - if ($Gene_Go_Hash{$GROUP}){ - my $old=$Gene_Go_Hash{$GROUP}; - $Gene_Go_Hash{$GROUP}="$old\",\"$gene"; - } - else{ - $Gene_Go_Hash{$GROUP}=$gene; - } - } - else{ - my $GROUP="$Table_i"; - if ($Gene_Go_Hash{$GROUP}){ - my $old=$Gene_Go_Hash{$GROUP}; - $Gene_Go_Hash{$GROUP}="$old\",\"$gene"; - } - else{ - $Gene_Go_Hash{$GROUP}=$gene; - } - } - } - - my @ALL_made_files; - foreach my $key ( keys %Gene_Go_Hash ){ - my $out_name="$key\_TopGo_results_ALL.tab"; - push (@ALL_made_files, $out_name); - } - - - foreach my $results (@ALL_made_files){ - chomp $results; - my $in_here="$binPath\/MakeHist_fromChartGO-10.pl"; - if ($in_here=~ m/ /){ - $in_here=~ s/ /\\ /g; - } - if ($in_here=~ m/\(/){ - $in_here=~ s/\(/\\(/g; - } - if ($in_here=~ m/\)/){ - $in_here=~ s/\)/\\)/g; - } - if ($results){ - print "MakeHist_fromChartGO-10.pl $results $pval_cutoff $num_cutoff $plot_open $meth $sort_enrich $min_genes $max_genes $infer\n"; - `MakeHist_fromChartGO-10.pl $results $pval_cutoff $num_cutoff $plot_open $meth $sort_enrich $min_genes $max_genes $infer`; - } - } -} - -print "Script completed\n\n"; - -#TIDY UP , temporary files -$input=~s/\-/\_/g; -`rm -f ACTUAL_R_CODE BACKGROUND.forR R_CODE_TO_RUN output.ofthis.test $input\.converted $input\.R_GO_subs tmp_file Image.Rdata`; diff --git a/bin/Expansion_summary.pl b/bin/Expansion_summary.pl deleted file mode 100755 index ada422a..0000000 --- a/bin/Expansion_summary.pl +++ /dev/null @@ -1,123 +0,0 @@ -#!/usr/bin/perl -use warnings; -use strict; - - -print "Please be in folder with the *expansion.txt files\n\n"; - -#Find out all the names of GO terms found within our dataset. -my $ALL_go_terms=`cat *.txt | cut -f 1 | sort | uniq`; - -#Download the GO iD to name hash: -`download_go_names.R > output`; - -#Put go id to name infor into a hash. -my %ID_to_name; -my $input="GO_to_name"; -open(my $goin, "<", $input) or die "Could not open $input\n"; -while (my $lineH=<$goin>){ - chomp $lineH; - my @split=split("\t", $lineH); - $ID_to_name{$split[1]}=$split[2]; - #print "$split[1] $split[2]\n"; -} - -#Read in file and infer species from file name -my @in_gofile=`ls *expansions.txt`; - -my %species_go_hash; -foreach my $species_files (@in_gofile){ - chomp $species_files; - my @name=split(/\./, $species_files); - my $species=$name[0]; - open(my $filein, "<", $species_files) or die "Could not open $species_files\n"; - while (my $line=<$filein>){ - chomp $line; - my @split=split("\t", $line); - $species_go_hash{$species}{$split[0]}=$split[1]; - } - close $filein; -} - -#Set the full output name -my $out="GO_table_counts.tsv"; -open(my $fileout, ">", $out) or die "Could not open $out\n"; - -#Set the output name -my $out2="GO_table_counts_forCAFE.tsv"; -open(my $fileout2, ">", $out2) or die "Could not open $out2\n"; - - -#Print header: -print $fileout "Desc\tFamily ID"; -print $fileout2 "Desc\tFamily ID"; -foreach my $species (keys %species_go_hash){ - print $fileout "\t$species"; - print $fileout2 "\t$species"; -} -print $fileout "\n"; -print $fileout2 "\n"; - -my %line_to_score; #This hash we keep to we can print off the top 300 most variable terms. - -#Print table contents -foreach my $GO_terms (keys %ID_to_name){ - print $fileout "$ID_to_name{$GO_terms}\t$GO_terms"; - my $count=0; - my $sum=0; - my $max=0; #Start the max at zero. - my $min=100000000000; # We set min super high, so that the next number should be lower than this. - my @row_info_for_hash=(); - #Now for each species fill in the number of genes associated with each term. - foreach my $species (keys %species_go_hash){ - if (exists $species_go_hash{$species}{$GO_terms}){ - print $fileout "\t$species_go_hash{$species}{$GO_terms}"; - $count++; - $sum+=$species_go_hash{$species}{$GO_terms}; - if ($species_go_hash{$species}{$GO_terms} > $max){ - $max=$species_go_hash{$species}{$GO_terms}; - } - if ($species_go_hash{$species}{$GO_terms} < $min){ - $min=$species_go_hash{$species}{$GO_terms}; - } - push(@row_info_for_hash, $species_go_hash{$species}{$GO_terms}); - } - else{ - print $fileout "\t0"; - $count++; - $min=0; - push(@row_info_for_hash, "0"); - } - } - - my $aver=$sum/$count; - my $range=$max-$min; - my $score; - if ($aver == 0){ - $score=0; - } - elsif ($range == 0){ - $score=0; - } - else{ - $score=$range/$aver; - } - - print $fileout "\t$score\n"; - - my $join=join("\t", @row_info_for_hash); - my $full_line="$ID_to_name{$GO_terms}\t$GO_terms\t$join"; - $line_to_score{$full_line}=$score; -} - -#Now print off the top hits for cafe: -my $top300=300; -foreach my $key (sort {$line_to_score{$b} <=> $line_to_score{$a}} keys %line_to_score) { - if ($top300){ - print $fileout2 "$key\n"; - $top300--; - } -} - -print "Script complete\n\n"; - diff --git a/bin/Fix_fasta.pl b/bin/Fix_fasta.pl deleted file mode 100755 index 070e53c..0000000 --- a/bin/Fix_fasta.pl +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/perl -use strict; -use warnings; - -my $line; -while ($line = ) { - chomp $line; - my $line2= ; - chomp $line2; - if ($line2 =~ m/Sequence unavailable/){ - #do nothing - } - else{ - print ">$line\n$line2\n"; - } -} diff --git a/bin/GO_make_isoform_hash.pl b/bin/GO_make_isoform_hash.pl deleted file mode 100755 index 29cf61e..0000000 --- a/bin/GO_make_isoform_hash.pl +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/perl -use warnings; -use strict; - - -print "Please be in folder with *_gene_alltran_list.txt file and the GO protein file *_Result_All_Combine_GO_format\n\n"; - - -my $in_gofile=`ls *_Result_All_Combine_GO_format`; -chomp $in_gofile; -my @namesplit=split(/\./, $in_gofile); -my $species=$namesplit[0]; - -my $gene2tran="$species\_gene_alltran_list.txt"; - -print "My species is $species\n\n"; - -open(my $filein, "<", $in_gofile) or die "Could not open $in_gofile\n"; - -my $out="$species\.transcripts_Combine_GO_format.txt"; -open(my $fileout, ">", $out) or die "Could not open $out\n"; - - -my %GO_hash; - -while (my $line=<$filein>){ - chomp $line; - my @split=split("\t", $line); - if ($GO_hash{$split[0]}){ - my $old=$GO_hash{$split[0]}; - $GO_hash{$split[0]}="$old\,$split[1]"; - } - else{ - $GO_hash{$split[0]}=$split[1]; - } -} - -open(my $tranin, "<", $gene2tran) or die "Could not open $gene2tran\n"; - -while (my $line2=<$tranin>){ - chomp $line2; - my @split=split("\t", $line2); - my $gene=$split[0]; - my @trans=split(/\,/, $split[1]); - foreach my $transcripts (@trans){ - if ($GO_hash{$gene}){ - my @go_terms_of_gene=split(/\,/, $GO_hash{$gene}); - foreach my $GO_term_of_gene (@go_terms_of_gene){ - print $fileout "$transcripts\t$GO_term_of_gene\n"; - } - } - } -} - diff --git a/bin/Goatee_ortho_go_match.pl b/bin/Goatee_ortho_go_match.pl deleted file mode 100755 index 2345cce..0000000 --- a/bin/Goatee_ortho_go_match.pl +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/perl -#loopfold.sortmerna.pl -use warnings; -use strict; - -die "Please specify (1) Orthofinder.csv (2) File focal species fasta\n" unless(@ARGV==2); - -my $ORTHOFINDER = $ARGV[0]; -open(my $filein, "<", $ORTHOFINDER) or die "Could not open $ORTHOFINDER\n"; - -my $FOCAL = $ARGV[1]; -#open(my $focal, "<", $FOCAL) or die "Could not open $FOCAL\n"; -if ($FOCAL =~ m/gz$/){ - my @sl=split(".gz", $FOCAL); - my $NEWFOCAL = "$sl[0]"; - `zcat $FOCAL > $NEWFOCAL`; - $FOCAL = $NEWFOCAL; - print "ungzipping\n"; -} -else{ - print "nah\n"; -} - -my $outfile="Result_All_Data"; -open(my $outhandle, ">", $outfile) or die "Could not open $outfile \n"; -my $outfile2="$FOCAL\_Result_All_Combine_GO_format"; -open(my $outhandle2, ">", $outfile2) or die "Could not open $outfile2 \n"; -my $outfile3="OG_GO_format.tsv"; -open(my $outhandle3, ">", $outfile3) or die "Could not open $outfile3 \n"; - -# -print "\n#####################################################\n"; -print "Starting script to join orthofinder results to GO IDs\n\n"; -print "Step 1: Work out header column order and make species to GO hashes\n\n"; - - -#Read the header off the orthofinder file to match up the species with each column (then later, read the GO files and make a hash) -my $header=<$filein>; -chomp $header; -my @split_head=split("\t", $header); -my $orthogroup_name=shift @split_head; #Remove orthogroup column -print "Header of Orthofinder file:\n$header\n\n"; -my $colposition=0; -my %col_to_sp_store; -my %sp_store_to_col; - -my %Nested_species_go_store; - -foreach my $col (@split_head){ - chomp $col; - $col =~ s/\r\n|\n|\R|\r//g; - #my @split_dot=split("\.", $col); - my $exp_gofile="$col\.go.txt"; - print "Col of Orthofinder table: $col\n"; - - print "TESTED: $exp_gofile\n"; - if (-e "$exp_gofile"){ - print "3. Match to GO database\n"; - $col_to_sp_store{$colposition}=$exp_gofile; - $sp_store_to_col{$exp_gofile}=$colposition; - #Read in the go txt files and make them into a hash of species gene ID to a list of all GO terms associated (e.g. ENS23523525 -> "GO:000032342,GO:0000248214") - my $GO="$exp_gofile"; - open(my $goin, "<", $GO) or die "Could not open $GO\n"; - my $ignoreheader=<$goin>; - while (my $line=<$goin>){ - chomp $line; - $line =~ s/\r\n|\n|\R|\r//g; - my @sp=split("\t", $line); - my $gene=$sp[1]; - my $go_id=$sp[2]; - #print "$line\n"; - if ($go_id){ - if ($go_id =~ m/GO/){ - - #print "<-$go_id\->\n"; - if ($Nested_species_go_store{$exp_gofile}{$gene}){ - my $old=$Nested_species_go_store{$exp_gofile}{$gene}; - $Nested_species_go_store{$exp_gofile}{$gene}="$old\t$go_id"; - } - else{ - $Nested_species_go_store{$exp_gofile}{$gene}=$go_id; - } - } - } - } - } - #If we find the focal column: - elsif($FOCAL eq "$col\.fa" || $FOCAL eq "$col\.fasta"){ - print "1. This happened $FOCAL eq $col\.fa || $FOCAL eq $col\.fasta \n"; - $col_to_sp_store{$colposition}=$FOCAL; - } - #Else, means we probably messed up or a typo: - else{ - print "2. Did not find a match for $col, expected to be: $exp_gofile\nMake sure this is the behaviour you expect, else, maybe you didn't put the correctly titled file in the folder with the GO or protein files.\n"; - } - $colposition++; -} - - - - -print "\n\nRead in the Orthofinder to GO information ( step 1 )\n\n"; -print "Step 2: Matching focal genes to Orthofinder orthologs and stripping GO assiugnments\n\n"; - -#Now read through the rest of the orthofinder file to match each Orthogroups genes for each species to its corresponding lines in their GO files. So we can get for say orthogroup 000001: Apis has two genes, and they have 30 GO annotations from their ensembl ortho data. -my %focal_to_orthofinder_store; -my %target_orthoid_to_GOterms; -while (my $line=<$filein>){ - chomp $filein; - #print "$line\n"; - my @split=split("\t", $line); - my $orthogroup_name=shift @split; #Remove orthogroup column - my $current_col=0; - - foreach my $sp_line (@split){ - #print "$sp_line\n"; - my $species=$col_to_sp_store{$current_col}; - if ($species){ - - my @genes=split(", ",$sp_line); - - - #If the focal species, just add a key from focal gene ID to Orthogroup ID: - #print "HERE: $FOCAL eq $species\n"; - if ($FOCAL eq "$species"){ - #print "yes\n"; - foreach my $gene (@genes){ - $focal_to_orthofinder_store{$gene}=$orthogroup_name; - } - } - #else for the rest of the species we find the gene go hash entries for each gene: - else{ - #print "No: $FOCAL eq $species\n"; - foreach my $gene (@genes){ - if ($Nested_species_go_store{$species}{$gene}){ - if ($target_orthoid_to_GOterms{$orthogroup_name}{$species}){ - my $old=$target_orthoid_to_GOterms{$orthogroup_name}{$species}; - $target_orthoid_to_GOterms{$orthogroup_name}{$species}="$old\t$Nested_species_go_store{$species}{$gene}"; - } - else{ - $target_orthoid_to_GOterms{$orthogroup_name}{$species}=$Nested_species_go_store{$species}{$gene}; - } - } - } - } - } - $current_col++; - } -} - - -print "\n\nLinked orthofinder IDs to GO terms ( step 2 )\n\nStep 3:Now get the focal IDs to GO terms per species\n\n"; - -#Then pull everything together. - -my %done; #This is to make sure we dont get duplicates in the result; - -foreach my $focal_genes ( keys %focal_to_orthofinder_store ){ - - my $OG=$focal_to_orthofinder_store{$focal_genes}; - - #print "$OG\t$focal_genes\t$focal_to_orthofinder_store{$focal_genes}\n"; - - - foreach my $species ( keys %{ $target_orthoid_to_GOterms{$OG} } ){ - #print "here something $species\n"; - my $goterms=$target_orthoid_to_GOterms{$OG}{$species}; - print $outhandle "$focal_genes\t$OG\t$species\t$goterms\n"; - my @goterms_sep=split("\t", $goterms); - foreach my $terms (@goterms_sep){ - my $combtest="$focal_genes\t$terms"; - if ($done{$combtest}){ - #do nothing, already in final doc. - } - else{ - if($focal_genes =~ /^\d/){ - $focal_genes="GENE_$focal_genes"; - } - print $outhandle2 "$focal_genes\t$terms\n"; - print $outhandle3 "$OG\t$terms\n"; - $done{$combtest}="DONE"; - } - } - } -} - -close $filein; -close $outhandle; -close $outhandle2; -close $outhandle3; -print "Finished\n"; - diff --git a/bin/MakeHist_fromChartGO-10.pl b/bin/MakeHist_fromChartGO-10.pl deleted file mode 100755 index 6184177..0000000 --- a/bin/MakeHist_fromChartGO-10.pl +++ /dev/null @@ -1,266 +0,0 @@ -#!/usr/bin/perl - -die "\nUsage: MakeHist_fromChartGO_v2.pl topGO_Chart_output [min_p-value max_number_categories]\n\n". - " >> Min Raw p-value for a GO category to be plotted (default 0.01)\n". - " >> Maximum number of GO categories to be plotted per Type (BP, MF, CC) (default 10)\n". - " >> Open or not (default 0)\n". - "\n" if !$ARGV[0]; - -$min_p=$ARGV[1]; -$num_results=$ARGV[2]; -$open_me=$ARGV[3]; -$correction=$ARGV[4]; #none or bonferroni. -$enrich=$ARGV[5]; #0 -$min_gene=$ARGV[6]; #6 -$max_gene=$ARGV[7]; #100000 -$inferred=$ARGV[8]; #FILES/GO_FILE - -### Set colors: -$col_BP = "red"; -$col_MF = "forestgreen"; -$col_CC = "blue3"; - -### open input and output files -if (-e $ARGV[0]){ - open (I, $ARGV[0]) || die "Can't open the input file \"$ARGV[0]\"\n"; - $root=$ARGV[0]; - $out_file="$root\_$correction\_pval\-$correction\-$min_p.ALL.tab"; - open (O, ">$out_file") || die "Can't open the output file"; - - print O "TYPE\tGO\tTERM\tLogP-$correction\n"; - - - open (I_2, $inferred); - my %hits; - while ($line=){ - #print "Stuff $line\n"; - chomp $line; - @t_2=split(/\t/, $line); - ($go_2)=$t_2[1]; - $hits{$go_2}="HITS"; - } - - - $header=; - chomp $header; - @split_head=split("\t", $header); - my $n_h=0; - my $pval_choice_col; - foreach my $cols_head (@split_head){ - #print "head: $cols_head $n_h\n"; - if ($cols_head eq $correction){ - $pval_choice_col=$n_h; - #print "$cols_head eq $correction = $pval_choice_col\n"; - } - $n_h++; - } - print "type $pval_choice_col\n"; - - while (){ - #print "Stuff\n"; - if ($_ =~m/GO:0016645/){print "Here\n"}; - my $save=$_; - chomp; - #if ($save =~m/GO:0016645/){print "$save $save $save\n"}; - @t=split(/\t/); - ($go)=$t[0]; - print "dfasfef $go" if ($save =~m/GO:0016645/); - if ($inferred){ - print "dfasfef $go" if ($save =~m/GO:0016645/); - if ($hits{$go}){ - #if ($save =~m/GO:0016645/){print "Here2\n"}; - ($term)=$t[1]; - if ($term=~ m/\'/){ - #print "YES $term\n"; - $term=~ s/\'/p/g; - } - ($type)=$t[-1]; - ($fold_enrich)=$t[-2]; - if ($fold_enrich=~ m/inf/){ - $fold_enrich="100"; - } - ($annotated)=$t[2]; - - if ($annotated <= $min_gene){ - #Do nothing - #if ($save =~m/GO:0016645/){print "Here2\n"}; - } - else{ - #if ($save =~m/GO:0016645/){print "Here2\n"}; - if ($annotated >= $max_gene){ - #Do nothing - } - else{ - $p; - $e; - $log_p; - if ($fold_enrich >= $enrich){ - #$e=$t[-2]; - $p=$t[$pval_choice_col]; - #print "$p \n"; - if ($p ==0){ - $log_p=$p; - } - elsif ($p =~ m /\< 1e-30/){ - $log_p="1e-30"; - } - else{ - $log_p=sprintf("%.2f",-log($p)/log(10)); - } - - if ($p<$min_p){ - $terms{$go}=$term; - push(@{$order{$type}},"$log_p=$go"); - } - else{ - #print "NO $p\n"; - } - } - } - } - } - else{ - #NADA - } - } - else{ - print "catastophe $go" if ($save =~m/GO:0016645/); - ($term)=$t[1]; - ($type)=$t[-1]; - ($annotated)=$t[2]; - if ($annotated <= $min_gene){ - #Do nothing - #print "$go" if ($save =~m/GO:0016645/); - } - else{ - #print "dfasfef $go" if ($save =~m/GO:0016645/); - if ($annotated >= $max_gene){ - #Do nothing - } - else{ - $p; - $e; - $log_p; - if ($fold_enrich >= $enrich){ - #$e=$t[-2]; - $p=$t[$pval_choice_col]; - $log_p=sprintf("%.2f",-log($p)/log(10)); - if ($p<$min_p){ - $terms{$go}=$term; - push(@{$order{$type}},"$log_p=$go"); - } - else{ - #print "NO $p\n"; - } - } - } - } - } - - } - - - - @cols; - $Good_to_Go=0; - - #print gap for the key - push (@cols,$col_MF); - print O "BLANK\t\t\t\n"; - push (@cols,$col_MF); - print O "BLANK\t\t\t\n"; - - @t=sort{$b<=>$a}(@{$order{CC}}); - @t=reverse @t; - my $len_1=scalar(@t); - foreach $t (@t){ - if (($len_1 <= $num_results) || !$num_results){ - ($p,$go)=$t=~/(.+?)\=(.+)/; - $a++; - print O "CC\t$go\t$terms{$go}\t$p\n"; - push (@cols,$col_CC); - $Good_to_Go=1; - } - $len_1--; - } - push (@cols,$col_CC); - print O "BLANK\t\t\t\n"; - #print "CC $ARGV[0] $Good_to_Go\n"; - - @t=sort{$b<=>$a}(@{$order{MF}}); - @t=reverse @t; - my $len_2=scalar(@t); - foreach $t (@t){ - if (($len_2 <= $num_results) || !$num_results){ - ($p,$go)=$t=~/(.+?)\=(.+)/; - $a++; - print O "MF\t$go\t$terms{$go}\t$p\n"; - push (@cols,$col_MF); - $Good_to_Go=1; - } - $len_2--; - } - push (@cols,$col_MF); - print O "BLANK\t\t\t\n"; - #print "MF $ARGV[0] $Good_to_Go\n"; - - @t=sort{$b<=>$a}(@{$order{BP}}); - @t=reverse @t; - my $len_3=scalar(@t); - foreach $t (@t){ - if (($len_3 <= $num_results) || !$num_results){ - ($p,$go)=$t=~/(.+?)\=(.+)/; - $a++; - print O "BP\t$go\t$terms{$go}\t$p\n"; - push (@cols,$col_BP); - $Good_to_Go=1; - } - $len_3--; - } - - #print "BP $ARGV[0] $Good_to_Go\n"; - - my $length_result=scalar(@cols); - my $number_of_bars_total=$num_results*3; - my $fract=$length_result/$number_of_bars_total; - - - - $plot_file="TopGO_Pval_barplot_"."$root.pdf"; - - my $RCOMMAND="ACTUAL_R_CODE"; - open(my $out_Rcmds, "> $RCOMMAND") or die "error opening $RCOMMAND. $!"; - my $height=2+(8*$fract); - print $out_Rcmds "pdf (\"$plot_file\", width=10, height=$height)\n"; - print $out_Rcmds "par(las=2)\n"; - print $out_Rcmds "BP<-read.table(\"$out_file\", h\=T,sep=\"\t\")\n"; - print $out_Rcmds "par(mar=c(5,25,4,10))\n"; - my @split_col=split("_", $ARGV[0]); - - my $palette_here=join ("\"\,\"", @cols); - #my $Types=join(/","/, @cols); - print $out_Rcmds "barplot(BP\$LogP.$type_p, names.arg = BP\$TERM, horiz=TRUE, xlab = \"log pvalues (correction=",$correction,")\", main= \"$root\", cex.main=2, width = 1, col=c(\"$palette_here\"))\n"; - - print $out_Rcmds "legend(\"bottomright\", title=\"GO Categories\", c(\"BP\",\"MF\",\"CC\"), ". - "fill=c(\"$col_BP\", \"$col_MF\", \"$col_CC\"), horiz=TRUE, cex=0.8)\n"; - print $out_Rcmds "dev.off()\n"; - - #my $length_out=`grep [MF|CC|BF] $out_file`; - - #print "DONE\n"; - if (!$Good_to_Go){ - print STDERR "FAILED, TopGO_Pval_barplot_"."$root.pdf.\nNo significant hits with these thresholds. Try lowering pval or removing correction\n"; - } - else{ - `R --vanilla output.ofthis.test`; - print STDERR "Plotted TopGO_Pval_barplot_"."$root.pdf\n"; - if ($open_me){ - `open $plot_file`; - } - } - - - #Tidy up - `rm -f $out_file`; - -} diff --git a/bin/Orthofinder_duplicate_go.pl b/bin/Orthofinder_duplicate_go.pl deleted file mode 100755 index 9afaad1..0000000 --- a/bin/Orthofinder_duplicate_go.pl +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/perl -use warnings; -use strict; - - -print "Please be in folder with orthofinder file, plus the GO file of interest\n\n"; - -my $in_orth="Orthogroups.tsv"; -my $in_gofile=`ls *GO_format`; -chomp $in_gofile; -my @namesplit=split(/\./, $in_gofile); -my $species=$namesplit[0]; - -print "My species is $species\n\n"; - -open(my $filein, "<", $in_orth) or die "Could not open $in_orth\n"; - -my $out="$species\_duplications.txt"; -open(my $fileout, ">", $out) or die "Could not open $out\n"; - -my $head=<$filein>; -chomp $head; -#print "$head"; -my @splithead=split("\t",$head); -my $n=0; -my $sp_col; -foreach my $col (@splithead){ - my @orthnamesp=split(/\./, $col); - my $species_orth=$orthnamesp[0]; - #print "$species_orth eq $species $n\n"; - if ($species_orth eq $species){ - $sp_col=$n; - } - $n++; -} - -print "Our species col is $sp_col\n\n"; - -my $output=0; - -while (my $line=<$filein>){ - chomp $line; - my @split=split("\t", $line); - my $orthogroup=$split[0]; - my $genes=$split[$sp_col]; - if ($genes){ - if ($genes =~ m/, /g){ - #print "$genes\n"; - my @all=split(/, /, $genes); - foreach my $bit (@all){ - #if the gene starts with a number, add a letter before it. - if ($bit =~ /^\d/){ - $bit="GENE_$bit"; - } - print $fileout "$bit\n"; - $output++; - } - } - } -} - - -if ($output){ - print "Now run the GO enrichment test\n\n"; - `ChopGO_VTS2.pl -i $species\_duplications.txt --GO_file $in_gofile` -} -else{ - print "No genes satisfied the cutoff, will not run orthofinder duplicate go\n"; -} diff --git a/bin/Orthofinder_gene_expansion.pl b/bin/Orthofinder_gene_expansion.pl deleted file mode 100755 index d75158d..0000000 --- a/bin/Orthofinder_gene_expansion.pl +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/perl -use warnings; -use strict; - - -print "Please be in folder with the GO file of interest\n\n"; - -#Read in file and infer species from file name -my $in_gofile=`ls *GO_format`; -chomp $in_gofile; -my @namesplit=split(/\./, $in_gofile); -my $species=$namesplit[0]; -open(my $filein, "<", $in_gofile) or die "Could not open $in_gofile\n"; - -print "My species is $species\n\n"; - -#Set the output name -my $out="$species\.go_family_expansions.txt"; -open(my $fileout, ">", $out) or die "Could not open $out\n"; - -my %uniq; -my %go_count; -while (my $line=<$filein>){ - chomp $line; - my @split=split("\t", $line); - - if ($uniq{$line}){ - #do nothing, we already read in this line - } - else{ - $go_count{$split[1]}++; - $uniq{$line}="EXISTS"; - } -} - -print "Now print out the GO counts\n\n"; - -foreach my $GO_term (sort keys %go_count){ - print $fileout "$GO_term\t$go_count{$GO_term}\n"; -} - diff --git a/bin/R_biomart.R b/bin/R_biomart.R deleted file mode 100755 index b405ce3..0000000 --- a/bin/R_biomart.R +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/Rscript -#R script to obtain the go data for the species submitted to the script, plus the protein.fasta file. -args = commandArgs(trailingOnly=TRUE) -library(biomaRt) - -biomartCacheClear() - -ensembl_entry <- useEnsemblGenomes(biomart = (args[1]) , dataset = (args[2])) - -get_go_ids <- getBM(attributes = c('ensembl_gene_id','go_id'), mart = ensembl_entry) - -write.table(get_go_ids, "go_hash.txt", row.names=T, quote=F, sep="\t" ) - -d<-unique(get_go_ids$ensembl_gene_id) - -new_ids<- split(d, ceiling(seq_along(d)/200)) - -for(i in 1:length(new_ids)) { -bazooka2 <- getBM(attributes = c('ensembl_gene_id','peptide'), mart = ensembl_entry, values=new_ids[i], filters='ensembl_gene_id', uniqueRows=T) -Sys.sleep(runif(1, 5, 10)) -outname <- paste("Myoutput_", i, ".fasta", sep="") -write.table(bazooka2, file=outname, row.names=F, quote=F, sep="\n") -} \ No newline at end of file diff --git a/bin/cafe_go.pl b/bin/cafe_go.pl deleted file mode 100755 index 3aedd7a..0000000 --- a/bin/cafe_go.pl +++ /dev/null @@ -1,459 +0,0 @@ -#!/usr/bin/perl -use warnings; -use strict; -use Scalar::Util qw(looks_like_number); - -print "Please be in folder with N0.tsv, Base/Gamma_change.tab, Base/Gamma_branch_probabilities.tab and the Go folder\n"; - -my $pval = $ARGV[0]; -my $type = $ARGV[1]; -my $go_max_plot = $ARGV[2]; - -#Set up output name -my $outname1="CAFE_summary.txt"; -open(my $out1, ">", $outname1) or die "Could not open $outname1\n"; - -#Read in OG hash -my $go="OG_GO_format.tsv"; - -#Read in Orthofinder file -my $file=`ls N0.tsv`; -chomp $file; -open(my $filein, "<", $file) or die "Could not open $file\n"; -my $header=<$filein>; -#get rid of mac and and endings -$header =~ s/\n//g; -$header =~ s/\r//g; -chomp $header; -my @head_N0=split("\t", $header); - -#print "header : $header\n"; - -my %HOG_TO_OG; -my %GENE_DATA; -my %GENE_DATA_OG; -my %OG_DATA; -#Make a hash for the whole line per HOG -my %HOG_to_line; - -#Create a background OG list for each species, for GO. See in next section. -my %Background_OGs; -print "Create Background OG list\n"; - -while (my $line = <$filein>){ - chomp $line; - $line =~ s/\n//g; - $line =~ s/\r//g; - my @splitl=split("\t", $line); - my $hog=$splitl[0]; - my $og=$splitl[1]; - $HOG_TO_OG{$hog}=$og; - my $n=0; - #add hog to line: - $HOG_to_line{$hog}=$line; - foreach my $col (@splitl){ - - #Save -> Species, HOG, and genes - #This will print any gene that is associated with each OG, which may be in a HOG not changing. - $GENE_DATA{$head_N0[$n]}{$hog}=$col; - - if ($GENE_DATA_OG{$head_N0[$n]}{$og}){ - my $old_h=$GENE_DATA_OG{$head_N0[$n]}{$og}; - $GENE_DATA_OG{$head_N0[$n]}{$og}="$old_h\, $col"; - #print "IOt happens $head_N0[$n] $og $old_h GENEs $col \n"; - } - else{ - $GENE_DATA_OG{$head_N0[$n]}{$og}=$col; - } - - $OG_DATA{$head_N0[$n]}{$hog}=$og; - - #Create a background for each species, to show which OGs are present in each species, and hence were tested in each species (Background for GO). - if ($col){ - if ($Background_OGs{$head_N0[$n]}){ - my $olde=$Background_OGs{$head_N0[$n]}; - $Background_OGs{$head_N0[$n]}="$olde\n$og"; - } - else{ - $Background_OGs{$head_N0[$n]}=$og; - } - } - $n++; - #print "<$col>"; - } - #my $len=scalar(@splitl); - #print "$hog $n\n"; -} - -#Read in Pvalue and count file -my %PVALUE_DATA; -my %COUNT_DATA; -my $file2=`ls Out_cafe/Base_branch_probabilities.tab`; -my $file3=`ls Out_cafe/Base_change.tab`; -chomp $file2; -chomp $file3; -open(my $filein2, "<", $file2) or die "Could not open $file2\n"; -open(my $filein3, "<", $file3) or die "Could not open $file3\n"; -my $header2=<$filein2>; -my $header3=<$filein3>; -chomp $header2; -chomp $header3; - -my @head_pval=split("\t", $header2); -my @head_pval_simp; -foreach my $colh (@head_pval){ - if ($colh =~ m/^", $colh); - my @spo2=split("\<", $spo[0]); - #print "$spo2[1]\n"; - push (@head_pval_simp, $spo2[1]); - } - else{ - my @spo=split("\<", $colh); - push (@head_pval_simp, $spo[0]); - } -} - -print "Reading in Out_cafe/Base_branch_probabilities.tab\n"; -while (my $line2 = <$filein2>){ - chomp $line2; - my @splitl=split("\t", $line2); - my $hog=$splitl[0]; - my $n=0; - foreach my $col (@splitl){ - #Save -> Species, HOG, and genes - my $sp=$head_pval_simp[$n]; - $PVALUE_DATA{$sp}{$hog}=$col; - $n++; - } -} - -print "Reading in Out_cafe/Base_change.tab\n"; -while (my $line_count = <$filein3>){ - chomp $line_count; - my @splitc=split("\t", $line_count); - my $hog=$splitc[0]; - my $m=0; - foreach my $col (@splitc){ - my $sp=$head_pval_simp[$m]; - $COUNT_DATA{$sp}{$hog}=$col; - $m++; - } -} - -my %SPECIES_TOTAL; -my %SPECIES_EXPANSION; -my %SPECIES_CONTRACTION; - -my %SPECIES_EXPANSION_SUM; -my %SPECIES_CONTRACTION_SUM; - -print "Calculate significant expanded and contracted gene families\n"; - -foreach my $species (keys %PVALUE_DATA){ - - - #set up output gene list files - my $outnagenes5="$species\.sig.neg.genes.txt"; - open(my $outgene5, ">", $outnagenes5) or die "Could not open $outnagenes5\n"; - my $outnagenes6="$species\.sig.pos.genes.txt"; - open(my $outgene6, ">", $outnagenes6) or die "Could not open $outnagenes6\n"; - - print $outgene6 "$header\n"; - print $outgene5 "$header\n"; - - #print "SP $species\n"; - foreach my $hog (keys %{$PVALUE_DATA{$species}}){ - my $pval=$PVALUE_DATA{$species}{$hog}; - #print "HOGGY $hog\n"; - if (looks_like_number($pval) ){ - if($pval <= 0.05){ - my $direction=$COUNT_DATA{$species}{$hog}; - #print "HERE $direction\n"; - $SPECIES_TOTAL{$species}++; - if ($direction eq "\+0" || $direction eq "0"){ - #Do nothing - print "HEY, why is this true:\n$species $hog $PVALUE_DATA{$species}{$hog} has a direction $direction\n"; - } - elsif($direction =~ m/\-/g){ - $SPECIES_CONTRACTION{$species}++; - $SPECIES_CONTRACTION_SUM{$species}+=$direction; - - #print the hog lines of contractions - print $outgene5 "$HOG_to_line{$hog}\n"; - } - else { - $SPECIES_EXPANSION{$species}++; - $SPECIES_EXPANSION_SUM{$species}+=$direction; - - #print the hog lines of expansions - print $outgene6 "$HOG_to_line{$hog}\n"; - } - } - } - } -} - - - -#Count the number of HOGs with + or - change, not regarding pvalue. -my %SPECIES_TOTAL_C; -my %SPECIES_EXPANSION_C; -my %SPECIES_CONTRACTION_C; -my %SPECIES_C_OGS; - -print "Run through the Count data\n"; - -#Run through the COUNT DATA and count -foreach my $species3 (keys %COUNT_DATA){ - #print "HEEER: $species3\n"; - - #set up output gene list files - my $outnagenes3="$species3\.neg.genes.txt"; - open(my $outgene3, ">", $outnagenes3) or die "Could not open $outnagenes3\n"; - my $outnagenes4="$species3\.pos.genes.txt"; - open(my $outgene4, ">", $outnagenes4) or die "Could not open $outnagenes4\n"; - - - foreach my $hog (keys %{$COUNT_DATA{$species3}}){ - if ($COUNT_DATA{$species3}{$hog} eq "\+0" || $COUNT_DATA{$species3}{$hog} eq "0"){ - #Do nothing - #print "NO $species3 $hog $COUNT_DATA{$species3}{$hog}\n"; - } - elsif($COUNT_DATA{$species3}{$hog} =~ m/\-/g){ - - # Just a print line to check things are working.-> print "OORR $species3 $hog $COUNT_DATA{$species3}{$hog} $HOG_TO_OG{$hog}\n"; - $SPECIES_CONTRACTION_C{$species3}++; - $SPECIES_TOTAL_C{$species3}++; - my $og=$HOG_TO_OG{$hog}; - - - - - if ($SPECIES_C_OGS{$species3}{'neg'}){ - my $old=$SPECIES_C_OGS{$species3}{'neg'}; - $SPECIES_C_OGS{$species3}{'neg'}="$old\n$og"; - my $genes_related_to_OG=$GENE_DATA_OG{$species3}{$og}; - - #print "genes: $genes_related_to_OG $species3 $og\n"; - if ($genes_related_to_OG){ - print $outgene3 "$og\t$genes_related_to_OG\n"; - } - else{ - print $outgene3 "$og\tNo_genes\n"; - } - - } - else{ - $SPECIES_C_OGS{$species3}{'neg'}="$og"; - - #print "genes: $genes_related_to_OG $species3 $og\n"; - my $genes_related_to_OG=$GENE_DATA_OG{$species3}{$og}; - if ($genes_related_to_OG){ - print $outgene3 "$og\t$genes_related_to_OG\n"; - } - else{ - print $outgene3 "$og\tNo_genes\n"; - } - } - } - else { - #print "X $species3 $hog $COUNT_DATA{$species3}{$hog}\n"; - $SPECIES_EXPANSION_C{$species3}++; - $SPECIES_TOTAL_C{$species3}++; - my $og=$HOG_TO_OG{$hog}; - - if ($SPECIES_C_OGS{$species3}{'pos'}){ - my $old=$SPECIES_C_OGS{$species3}{'pos'}; - $SPECIES_C_OGS{$species3}{'pos'}="$old\n$og"; - - #print "genes: $genes_related_to_OG $species3 $og\n"; - my $genes_related_to_OG=$GENE_DATA_OG{$species3}{$og}; - if ($genes_related_to_OG){ - print $outgene4 "$og\t$genes_related_to_OG\n"; - } - else{ - print $outgene4 "$og\tNo_genes\n"; - } - } - else{ - $SPECIES_C_OGS{$species3}{'pos'}="$og"; - - #print "genes: $genes_related_to_OG $species3 $og\n"; - my $genes_related_to_OG=$GENE_DATA_OG{$species3}{$og}; - if ($genes_related_to_OG){ - print $outgene4 "$og\t$genes_related_to_OG\n"; - } - else{ - print $outgene4 "$og\tNo_genes\n"; - } - } - } - } -} - -print "Print GO_lists\n"; - -foreach my $species4 (keys %SPECIES_C_OGS){ - #print "$species4 $SPECIES_C_OGS{$species4}\n"; - if ($species4 eq "#FamilyID" || $species4 eq "FamilyID"){ - #Do nothing, this is not a useful line - } - else{ - - if ($SPECIES_C_OGS{$species4}{'pos'}){ - my $outna2="$species4\.pos.txt"; - open(my $out2, ">", $outna2) or die "Could not open $outna2\n"; - print $out2 "$SPECIES_C_OGS{$species4}{'pos'}\n"; - close $out2; - } - - if ($SPECIES_C_OGS{$species4}{'neg'}){ - my $outna3="$species4\.neg.txt"; - open(my $out3, ">", $outna3) or die "Could not open $outna3\n"; - print $out3 "$SPECIES_C_OGS{$species4}{'neg'}\n"; - close $out3; - } - - } - -} - -print "Print Background GO lists\n"; - -foreach my $species6 (keys %Background_OGs){ - chomp $species6; - - if ($Background_OGs{$species6}){ - if ($species6 =~ m/Gene Tree Parent Clade/){ - #ignore it, this is the header. - } - else{ - #print "ITS HERE: $species6 \n"; - my $out_back="$species6\.BK.txt"; - open(my $outb, ">", $out_back) or die "Could not open $out_back\n"; - print $outb "$Background_OGs{$species6}\n"; - `sort $species6\.BK.txt | uniq > $species6\.BK.txt.uniq`; - } - - } - -} - - - - -print "Print a summary table:\n"; - -print $out1 "Total_HOGs_significant\tExpansion_HOGs_significant\tContraction_HOGs_significant\tExpansion_genes_significant\tContraction_genes_significant\tExpansion_HOGs_total\tContraction_HOGs_total\n"; - -foreach my $species2 (sort keys %SPECIES_TOTAL){ - print $out1 "$species2\t$SPECIES_TOTAL{$species2}"; - if ($SPECIES_EXPANSION{$species2}){ - print $out1 "\t$SPECIES_EXPANSION{$species2}"; - } - else{ - print $out1 "\t0"; - } - if ($SPECIES_CONTRACTION{$species2}){ - print $out1 "\t$SPECIES_CONTRACTION{$species2}"; - } - else{ - print $out1 "\t0"; - } - - - if ($SPECIES_EXPANSION_SUM{$species2}){ - print $out1 "\t$SPECIES_EXPANSION_SUM{$species2}"; - } - else{ - print $out1 "\t0"; - } - if ($SPECIES_CONTRACTION_SUM{$species2}){ - print $out1 "\t$SPECIES_CONTRACTION_SUM{$species2}"; - } - else{ - print $out1 "\t0"; - } - - #Not significant expansions and contractions: - if ($SPECIES_EXPANSION_C{$species2}){ - print $out1 "\t$SPECIES_EXPANSION_C{$species2}"; - } - else{ - print $out1 "\t0"; - } - if ($SPECIES_CONTRACTION_C{$species2}){ - print $out1 "\t$SPECIES_CONTRACTION_C{$species2}"; - } - else{ - print $out1 "\t0"; - } - print $out1 "\n"; -} - - - -#Now run Chopgo - -foreach my $species5 (keys %SPECIES_TOTAL){ - print "Running GO on $species5\n"; - if (looks_like_number($species5)){ - #`mv $species5\.pos.txt Node_$species5\.pos.txt`; - #`mv $species5\.neg.txt Node_$species5\.neg.txt`; - - if ( -e "$species5\.pos.txt" ){ - `cut -f 1 $species5\.pos.txt > Node_$species5\.pos.txt`; - my $line_count = `wc -l Node_$species5\.pos.txt | awk '{print \$1}'`; - if ( $line_count >= 10){ - print "ChopGO_VTS2.pl -i Node_$species5\.pos.txt --GO_file $go -bg OG_GO_format.tsv -pval $pval -pval_type $type -max_plot $go_max_plot\n"; - `ChopGO_VTS2.pl -i Node_$species5\.pos.txt --GO_file $go -bg OG_GO_format.tsv -pval $pval -pval_type $type -max_plot $go_max_plot`; - } - if ( $line_count <= 100){ - print "WARNING, GO likely to fail on $line_count genes (Run $species5)\n"; - } - - } - if ( -e "$species5\.neg.txt" ){ - `cut -f 1 $species5\.neg.txt > Node_$species5\.neg.txt`; - my $line_count = `wc -l Node_$species5\.neg.txt | awk '{print \$1}'`; - if ( $line_count >= 10){ - print "ChopGO_VTS2.pl -i Node_$species5\.neg.txt --GO_file $go -bg OG_GO_format.tsv -pval $pval -pval_type $type -max_plot $go_max_plot\n"; - `ChopGO_VTS2.pl -i Node_$species5\.neg.txt --GO_file $go -bg OG_GO_format.tsv -pval $pval -pval_type $type -max_plot $go_max_plot`; - } - if ( $line_count <= 100){ - print "WARNING, GO likely to fail on $line_count genes (Run $species5)\n"; - } - } - - } - else{ - if ( -e "$species5\.pos.txt" ){ - my $line_count = `wc -l $species5\.pos.txt | awk '{print \$1}'`; - if ( $line_count <= 100){ - print "WARNING, GO likely to fail on $line_count genes (Run $species5)\n"; - } - if ( $line_count >= 10){ - `ChopGO_VTS2.pl -i $species5\.pos.txt --GO_file $go -bg $species5\.BK.txt.uniq -pval $pval -pval_type $type -max_plot $go_max_plot`; - } - } - if( -e "$species5\.neg.txt" ){ - my $line_count = `wc -l $species5\.neg.txt | awk '{print \$1}'`; - if ( $line_count <= 100){ - print "WARNING, GO likely to fail on $line_count genes (Run $species5)\n"; - } - if ( $line_count >= 10){ - `ChopGO_VTS2.pl -i $species5\.neg.txt --GO_file $go -bg $species5\.BK.txt.uniq -pval $pval -pval_type $type -max_plot $go_max_plot`; - } - } - } -} - - -close $out1; - -print "\nFinished Running\n\n"; - - - diff --git a/bin/cafe_prep.R b/bin/cafe_prep.R deleted file mode 100755 index 6bd7e04..0000000 --- a/bin/cafe_prep.R +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/Rscript -library(ape) -library(data.table) - -tre <- read.tree('pruned_tree') -stopifnot(is.binary(tre)) -stopifnot(is.rooted(tre)) - -if(is.ultrametric(tre)) { - utre <- tre -} else{ - utre <- chronos(tre) -} -write.tree(utre, 'SpeciesTree_rooted_ultra.txt') - -hog <- fread('N0.tsv') -hog[, OG := NULL] -hog[, `Gene Tree Parent Clade` := NULL] -hog <- melt(hog, id.vars='HOG', variable.name='species', value.name='pid') -hog <- hog[pid != ''] -hog$n <- sapply(hog$pid, function(x) length(strsplit(x, ', ')[[1]])) - -# Exclude HOGs with lots of genes in a one or more species. -# See also cafe tutorial about filtering gene families -keep <- hog[, list(n_max=max(n)), HOG][n_max < 100]$HOG -hog <- hog[HOG %in% keep] - -# Exclude HOGs present in only 1 species -keep <- hog[, .N, HOG][N > 1]$HOG -hog <- hog[HOG %in% keep] - -counts <- dcast(hog, HOG ~ species, value.var='n', fill=0) -counts[, Desc := 'n/a'] -setcolorder(counts, 'Desc') -fwrite(counts, 'hog_gene_counts.tsv', sep='\t') diff --git a/bin/download_go_names.R b/bin/download_go_names.R deleted file mode 100755 index 8626c20..0000000 --- a/bin/download_go_names.R +++ /dev/null @@ -1,5 +0,0 @@ -#!/usr/bin/Rscript -library(GO.db) -go <- keys(GO.db, keytype="GOID") -df <- select(GO.db, columns=c("GOID","TERM"), keys=go, keytype="GOID") -write.table(df, file="GO_to_name", sep="\t", quote=F) diff --git a/bin/go_chromosome.pl b/bin/go_chromosome.pl deleted file mode 100755 index 1aae365..0000000 --- a/bin/go_chromosome.pl +++ /dev/null @@ -1,243 +0,0 @@ -#!/usr/bin/perl -use warnings; -use strict; - -print "Please be in folder with focal gff3 file and GO hashes\n\n"; - -my @goes=`ls *Result_All_Combine_GO_format`; -my @in_gfffile=`ls *.gff3`; -my $ortho="Orthogroups.tsv"; - - -#Store orthogroup names hash/ -my %orthogroup_hash; -open(my $orthin, "<", $ortho) or die "Could not open $ortho\n"; -my $header=<$orthin>; -chomp $header; -my @colHeadsplit=split("\t", $header); -while (my $lineOrtho=<$orthin>){ - chomp $lineOrtho; - my $n=1; - my @colsplit=split("\t", $lineOrtho); - my $OG= shift(@colsplit); - foreach my $row (@colsplit){ - my @isoform_sp=split(", ", $row); - - #print "HERE $OG : $n\n"; - my @big_name=split(/\./, $colHeadsplit[$n]); - my $tidyname=$big_name[0]; - foreach my $iso (@isoform_sp){ - $orthogroup_hash{$tidyname}{$iso}=$OG; - } - $n++; - } -} - - - -my @jobs; #To store pairs of gff and go files to run - -#Check we have the right files for each species: -foreach my $gofile (@goes){ - chomp $gofile; - #print "$gofile\n"; - my @sp_gp =split(/\./, $gofile); - my $match=0; - foreach my $gfffile (@in_gfffile){ - chomp $gfffile; - my @sp_gff=split(/\./, $gfffile); - #print "$gfffile cat $sp_gp[0] and $sp_gff[0]\n"; - - - if ($sp_gp[0] eq $sp_gff[0]){ - $match=1; - push (@jobs, "$gofile $gfffile"); - } - } - if ($match){ - #print "$sp_gp[0] is matched\n"; - } - else{ - print "ERROR: $sp_gp[0] does not have an equivalent gff file\n" - } -} - - - - -#Now run through the jobs and prepare the input files. -my %Gene_tran_hash; - -foreach my $species (@jobs){ - my @sp=split(/\ /, $species); - my $go=$sp[0]; - my $gff=$sp[1]; - chomp $gff; - - my @go_split =split(/\./, $go); - my $species_name=$go_split[0]; - - my $out="$species_name\.go_r_file.txt"; - open(my $fileout, ">", $out) or die "Could not open $out\n"; - - my $out2="$species_name\.go_r_file.noDuplicates.txt"; - open(my $fileout2, ">", $out2) or die "Could not open $out2\n"; - - open(my $filein, "<", $gff) or die "Could not open $gff\n"; - while (my $line=<$filein>){ - chomp $line; - my @split=split("\t", $line); - #if line is an mRNA line, then we can check the gene to isoform IDs. - my $gene; - my $tran; - my $scaffold=$split[0]; - - #ignore blank lines, starting with #/ - if ($line =~ /^#/){ - #do nothing - } - else{ - #print "$line\n"; - if ($split[2] eq "mRNA"){ - - #Do different split if AUGUSTUS or NCBI - if ($split[1] eq "AUGUSTUS"){ - #Its an AUGUSTUS GFF - my @lsplit=split("\;", $split[8]); - my @genesp=split("\=", $lsplit[1]); - my @transp=split("\=", $lsplit[0]); - $gene=$genesp[1]; - $tran=$transp[1]; - } - elsif($split[1] eq "maker"){ - #Its probably a maker gff with ID and Parent for gene and transcript isoform names: - my @lsplit=split("\;", $split[8]); - my %temp_h; - foreach my $bits (@lsplit){ - my @spbit=split("\=", $bits); - $temp_h{$spbit[0]}=$spbit[1]; - } - $gene=$temp_h{"Parent"}; - $tran=$temp_h{"ID"}; - } - else{ - #Its probably a normal NCBI type: - my @lsplit=split("\;", $split[8]); - my %temp_h; - foreach my $bits (@lsplit){ - my @spbit=split("\=", $bits); - $temp_h{$spbit[0]}=$spbit[1]; - } - my $fullgene=$temp_h{"Parent"}; - my @fullsp=split("\:", $fullgene); - my @fullminusdash=split("\-",$fullsp[-1]); - $gene=$fullminusdash[-1]; - $tran=$temp_h{"transcript_id"}; - } - - - print $fileout "$gene\t$scaffold\n"; - #if the species gene exists in the orthofinder table, then print it to the OG run: - if ($orthogroup_hash{$species_name}{$gene}){ - print $fileout2 "$orthogroup_hash{$species_name}{$gene}\t$scaffold\n"; - } - - if ($Gene_tran_hash{$gene}){ - my $old=$Gene_tran_hash{$gene}; - $Gene_tran_hash{$gene}="$old\,$tran"; - } - else{ - $Gene_tran_hash{$gene}=$tran; - } - } - } - } - - #Now make a Orthogroup background file: - - open(my $filego, "<", $go) or die "Could not open $go\n"; - - my $out3="$species_name\.go_r_file.noDuplicates_BK.txt"; - open(my $fileout3, ">", $out3) or die "Could not open $out3\n"; - - my %exist_hit; - while (my $linego=<$filego>){ - chomp $linego; - my @splitgo=split("\t", $linego); - - if ($orthogroup_hash{$species_name}{$splitgo[0]}){ - if ($exist_hit{"$orthogroup_hash{$species_name}{$splitgo[0]}\t$splitgo[1]"}){ - #do nothing; hit exists. - } - else{ - print $fileout3 "$orthogroup_hash{$species_name}{$splitgo[0]}\t$splitgo[1]\n"; - $exist_hit{"$orthogroup_hash{$species_name}{$splitgo[0]}\t$splitgo[1]"}="YES"; - } - } - } - - - #close handles, sort duplicated go list, and run ChopGO. - - close $fileout; - close $filein; - print "Now running go chromosome analysis for $species_name\n"; - - #Sort the duplicate file: - `sort $species_name\.go_r_file.noDuplicates.txt | uniq > $species_name\.go_r_file.noDuplicates.uniq.txt`; - - my $out4="$species_name\.go_r_file.noDuplicates.noSameScaffold.txt"; - open(my $fileout4, ">", $out4) or die "Could not open $out4\n"; - open(my $filein4, "<", "$species_name\.go_r_file.noDuplicates.uniq.txt") or die "Could not open $species_name\.go_r_file.noDuplicates.uniq.txt\n"; - my %hit; - my %bad; - while (my $line4=<$filein4>){ - chomp $line4; - my @split=split("\t", $line4); - if ($hit{$split[0]}){ - #delete $hit{$split[0]}; - $bad{$split[0]}="duplicate_scaffold"; - } - $hit{$split[0]}=$split[1]; - } - - foreach my $key ( keys %hit ){ - if ($bad{$key}){ - #don't use it - } - else{ - print $fileout4 "$key\t$hit{$key}\n"; - } - } - - close $fileout4; - close $filein4; - - #Remove background duplicated orthogroups. - my $out5="$species_name\.go_r_file.noDuplicates_BK.noDuplicates.txt"; - open(my $fileout5, ">", $out5) or die "Could not open $out5\n"; - open(my $filein5, "<", "$species_name\.go_r_file.noDuplicates_BK.txt") or die "Could not open $species_name\.go_r_file.noDuplicates_BK.txt\n"; - while (my $line5=<$filein5>){ - chomp $line5; - my @split=split("\t", $line5); - if ($bad{$split[0]}){ - #do nothing - } - else{ - print $fileout5 "$line5\n"; - } - } - close $fileout5; - close $filein5; - - `mkdir Unfiltered_Go_$species_name`; - `ChopGO_ChromoGoatee.pl -i $species_name\.go_r_file.txt --GO_file $go -sp $species_name`; - `mv *_res.tab Unfiltered_Go_$species_name`; - `mv *_res.tab.pdf Unfiltered_Go_$species_name`; - `mkdir Filtered_dup_Go_$species_name`; - `ChopGO_ChromoGoatee.pl -i $species_name\.go_r_file.noDuplicates.noSameScaffold.txt --GO_file $species_name\.go_r_file.noDuplicates_BK.noDuplicates.txt -sp $species_name`; - `mv *_res.tab Filtered_dup_Go_$species_name`; - `mv *_res.tab.pdf Filtered_dup_Go_$species_name`; -} - -print "\n\nScript finished\n"; diff --git a/bin/plotting_go.R b/bin/plotting_go.R deleted file mode 100755 index e37ac91..0000000 --- a/bin/plotting_go.R +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/Rscript - -library(pheatmap) - -erefd<-read.table("Go_summary_neg.tsv", h=T, sep="\t") -newdata <- erefd[order(erefd$Count_significant, decreasing = T),] -rownames(newdata) <- paste(newdata$GO_ID,newdata$GO_term) -df = subset(newdata, select = -c(GO_term,GO_ID,Count_significant) ) -df[is.na(df)] <- 1 -df2<-as.matrix(df) -my_palette <- colorRampPalette(c("purple","red", "orange", "yellow", "white")) (n=20) -pdf("Go_summary_neg.pdf", width=9, height=5) -pheatmap::pheatmap(log10(head(df2, n=30)), col=my_palette, cluster_rows = F, treeheight_row = 0, treeheight_col = 0, legend=T) -dev.off() - -mat_clean <- df2[, !grepl("Node", colnames(df2))] -pdf("Go_summary_neg_noNode.pdf", width=9, height=5) -pheatmap::pheatmap(log10(head(mat_clean, n=30)), col=my_palette, cluster_rows = F, treeheight_row = 0, treeheight_col = 0, legend=T) -dev.off() - -erefd<-read.table("Go_summary_pos.tsv", h=T, sep="\t") -newdata <- erefd[order(erefd$Count_significant, decreasing = T),] -rownames(newdata) <- paste(newdata$GO_ID,newdata$GO_term) -df = subset(newdata, select = -c(GO_term,GO_ID,Count_significant) ) -df[is.na(df)] <- 1 -df2<-as.matrix(df) -my_palette <- colorRampPalette(c("purple","red", "orange", "yellow", "white")) (n=20) -pdf("Go_summary_pos.pdf", width=9, height=5) -pheatmap::pheatmap(log10(head(df2, n=30)), col=my_palette, cluster_rows = F, treeheight_row = 0, treeheight_col = 0, legend=T) -dev.off() - - -mat_clean2 <- df2[, !grepl("Node", colnames(df2))] -pdf("Go_summary_pos_noNode.pdf", width=9, height=5) -pheatmap::pheatmap(log10(head(mat_clean2, n=30)), col=my_palette, cluster_rows = F, treeheight_row = 0, treeheight_col = 0, legend=T) -dev.off() - - diff --git a/bin/post_processing/combine_agat.py b/bin/post_processing/combine_agat.py deleted file mode 100644 index 45ae30e..0000000 --- a/bin/post_processing/combine_agat.py +++ /dev/null @@ -1,78 +0,0 @@ -import os -import pandas as pd -import argparse - -def extract_mrna_section(filepath): - """ - Extracts the mrna section from the input file and returns it as a DataFrame. - """ - with open(filepath, 'r') as file: - lines = file.readlines() - - # Find the starting line for the mrna section - start_idx = None - for i, line in enumerate(lines): - if "mrna" in line.lower() and "-----" in line: - start_idx = i + 1 - break - - if start_idx is None: - raise ValueError(f"mrna section not found in file: {filepath}") - - # Collect lines until an empty line or another section starts - mrna_data = [] - for line in lines[start_idx:]: - if line.strip() == "" or ("-----" in line): - break - mrna_data.append(line.strip()) - - # Parse the extracted data into a dictionary - data_dict = {} - for line in mrna_data: - if line: - key, value = line.rsplit(maxsplit=1) - data_dict[key.strip()] = value.strip() - - # Convert the dictionary to a DataFrame - df = pd.DataFrame.from_dict(data_dict, orient='index', columns=[os.path.basename(filepath).split('.')[0]]) - - return df - -def combine_files(file_list, output_filepath): - # Initialize an empty DataFrame to store the consolidated data - consolidated_df = pd.DataFrame() - - # Loop through each file in the provided file list - for filepath in file_list: - # Extract the mrna section data as a DataFrame - df = extract_mrna_section(filepath) - - # Merge the current file's DataFrame with the consolidated DataFrame - if consolidated_df.empty: - consolidated_df = df - else: - consolidated_df = consolidated_df.join(df, how='outer') - - # Reset the index to have the metric names as the first column - consolidated_df.reset_index(inplace=True) - - # Rename the first column to 'Metric' - consolidated_df.rename(columns={'index': 'Metric'}, inplace=True) - - # Save the consolidated table to a CSV file - consolidated_df.to_csv(output_filepath, index=False) - - print(f"Consolidated data saved to {output_filepath}") - -if __name__ == "__main__": - # Set up argument parser - parser = argparse.ArgumentParser(description="Extract mrna section data from multiple files and combine them into a single CSV file.") - parser.add_argument('files', metavar='F', type=str, nargs='+', help='Input files to process (e.g., *.txt)') - parser.add_argument('-o', '--output', type=str, default='consolidated_mrna_table.csv', help='Output CSV file name') - - # Parse the arguments - args = parser.parse_args() - - # Combine files and save to the output file - combine_files(args.files, args.output) - diff --git a/bin/post_processing/combine_busco.py b/bin/post_processing/combine_busco.py deleted file mode 100644 index a5f13d2..0000000 --- a/bin/post_processing/combine_busco.py +++ /dev/null @@ -1,80 +0,0 @@ -import os -import pandas as pd -import argparse - -def extract_busco_data(filepath): - """ - Extracts the relevant BUSCO metrics from the input file and returns it as a DataFrame. - """ - data_dict = {} - - with open(filepath, 'r') as file: - lines = file.readlines() - - for line in lines: - if line.startswith("C:"): - # Extract percentages (Complete, Fragmented, Missing) - metrics = line.split(',') - data_dict['Complete BUSCOs (C)'] = metrics[0].split(':')[1].split('%')[0].strip() - data_dict['Fragmented BUSCOs (F)'] = metrics[1].split(':')[1].split('%')[0].strip() - data_dict['Missing BUSCOs (M)'] = metrics[2].split(':')[1].split('%')[0].strip() - elif "Complete BUSCOs (C)" in line: - data_dict['Complete BUSCOs'] = line.split()[0].strip() - elif "Complete and single-copy BUSCOs (S)" in line: - data_dict['Complete and single-copy BUSCOs'] = line.split()[0].strip() - elif "Complete and duplicated BUSCOs (D)" in line: - data_dict['Complete and duplicated BUSCOs'] = line.split()[0].strip() - elif "Fragmented BUSCOs (F)" in line: - data_dict['Fragmented BUSCOs'] = line.split()[0].strip() - elif "Missing BUSCOs (M)" in line: - data_dict['Missing BUSCOs'] = line.split()[0].strip() - elif "Total BUSCO groups searched" in line: - data_dict['Total BUSCO groups searched'] = line.split()[0].strip() - - # Convert the dictionary to a DataFrame - df = pd.DataFrame.from_dict(data_dict, orient='index', columns=[os.path.basename(filepath).split('.')[0]]) - - return df - -def combine_files(file_list, output_filepath): - # Initialize an empty DataFrame to store the consolidated data - consolidated_df = pd.DataFrame() - - # Loop through each file in the provided file list - for filepath in file_list: - # Extract the BUSCO data as a DataFrame - df = extract_busco_data(filepath) - - # Ensure unique column names by appending the full basename - column_prefix = os.path.basename(filepath).replace('.', '_') - df.columns = [f"{column_prefix}_{col}" for col in df.columns] - - # Merge the current file's DataFrame with the consolidated DataFrame - if consolidated_df.empty: - consolidated_df = df - else: - consolidated_df = consolidated_df.join(df, how='outer') - - # Reset the index to have the metric names as the first column - consolidated_df.reset_index(inplace=True) - - # Rename the first column to 'Metric' - consolidated_df.rename(columns={'index': 'Metric'}, inplace=True) - - # Save the consolidated table to a CSV file - consolidated_df.to_csv(output_filepath, index=False) - - print(f"Consolidated data saved to {output_filepath}") - -if __name__ == "__main__": - # Set up argument parser - parser = argparse.ArgumentParser(description="Extract BUSCO data from multiple files and combine them into a single CSV file.") - parser.add_argument('files', metavar='F', type=str, nargs='+', help='Input BUSCO summary files (e.g., *.txt)') - parser.add_argument('-o', '--output', type=str, default='consolidated_busco_table.csv', help='Output CSV file name') - - # Parse the arguments - args = parser.parse_args() - - # Combine files and save to the output file - combine_files(args.files, args.output) - diff --git a/bin/post_processing/combine_quast.py b/bin/post_processing/combine_quast.py deleted file mode 100644 index 4e0cb9d..0000000 --- a/bin/post_processing/combine_quast.py +++ /dev/null @@ -1,49 +0,0 @@ -import os -import pandas as pd -import argparse - -def combine_files(file_list, output_filepath): - # Initialize an empty DataFrame to store the consolidated data - consolidated_df = pd.DataFrame() - - # Loop through each file in the provided file list - for filepath in file_list: - # Get the filename without the directory path - filename = os.path.basename(filepath) - - # Read the file into a DataFrame, skipping the first line (header) - df = pd.read_csv(filepath, sep="\t", skiprows=1, header=None, index_col=0) - - # Use the file's name (without extension) as the column header - column_name = os.path.splitext(filename)[0] - df.columns = [column_name] - - # Merge the current file's DataFrame with the consolidated DataFrame - if consolidated_df.empty: - consolidated_df = df - else: - consolidated_df = consolidated_df.join(df, how='outer') - - # Reset the index to have the metric names as the first column - consolidated_df.reset_index(inplace=True) - - # Rename the first column to 'Metric' - consolidated_df.rename(columns={'index': 'Metric'}, inplace=True) - - # Save the consolidated table to a CSV file - consolidated_df.to_csv(output_filepath, index=False) - - print(f"Consolidated data saved to {output_filepath}") - -if __name__ == "__main__": - # Set up argument parser - parser = argparse.ArgumentParser(description="Combine multiple QUAST summary files into a single CSV file.") - parser.add_argument('files', metavar='F', type=str, nargs='+', help='Input QUAST summary files (e.g., *.tsv)') - parser.add_argument('-o', '--output', type=str, default='consolidated_table.csv', help='Output CSV file name') - - # Parse the arguments - args = parser.parse_args() - - # Combine files and save to the output file - combine_files(args.files, args.output) - diff --git a/bin/post_processing/combine_tables.py b/bin/post_processing/combine_tables.py deleted file mode 100644 index b63b124..0000000 --- a/bin/post_processing/combine_tables.py +++ /dev/null @@ -1,28 +0,0 @@ -import argparse - -def combine_csv_files(file_list, output_filepath): - with open(output_filepath, 'w') as outfile: - for i, filepath in enumerate(file_list): - with open(filepath, 'r') as infile: - if i == 0: - # Write the first file entirely including its header - outfile.write(infile.read()) - else: - # Skip the header for subsequent files - next(infile) - outfile.write(infile.read()) - - print(f"Combined data saved to {output_filepath}") - -if __name__ == "__main__": - # Set up argument parser - parser = argparse.ArgumentParser(description="Combine multiple CSV files with the same header into one CSV file.") - parser.add_argument('files', metavar='F', type=str, nargs='+', help='Input CSV files (e.g., *.csv)') - parser.add_argument('-o', '--output', type=str, default='combined_output.csv', help='Output CSV file name') - - # Parse the arguments - args = parser.parse_args() - - # Combine the CSV files and save to the output file - combine_csv_files(args.files, args.output) - diff --git a/bin/post_processing/tree_with_annotation.R b/bin/post_processing/tree_with_annotation.R deleted file mode 100644 index fb6ea74..0000000 --- a/bin/post_processing/tree_with_annotation.R +++ /dev/null @@ -1,159 +0,0 @@ -# Load necessary libraries -if (!requireNamespace("argparse", quietly = TRUE)) { - install.packages("argparse") -} -if (!requireNamespace("ggtree", quietly = TRUE)) { - BiocManager::install("ggtree") -} -if (!requireNamespace("ggplot2", quietly = TRUE)) { - install.packages("ggplot2") -} -if (!requireNamespace("cowplot", quietly = TRUE)) { - install.packages("cowplot") -} - -library(ggtree) -library(ggplot2) -library(cowplot) -library(argparse) -library(dplyr) - -# Parse command-line arguments -parser <- ArgumentParser(description = 'Plot phylogenetic tree with statistics and true/false data') -parser$add_argument('tree_file', type = 'character', help = 'Path to the Newick formatted tree file') -parser$add_argument('data_file', type = 'character', help = 'Path to the CSV file with statistic and true/false data') -parser$add_argument('--text_size', type = 'double', default = 3, help = 'Text size for the plots') -args <- parser$parse_args() - -# Read the Newick tree from the file -tree <- read.tree(args$tree_file) - -# Clean tree tip labels -tree$tip.label <- trimws(tree$tip.label) -tree$tip.label <- tolower(tree$tip.label) - -# Read the data table from the file, ensuring species column is read as character -data <- read.csv(args$data_file, sep = "\t", colClasses = c("species" = "character")) - -# Clean data species names -data$species <- trimws(data$species) -data$species <- tolower(data$species) - -# Extract the column headers and the second row to determine plot types -plot_types <- as.character(data[1, ]) -data <- data[-1, ] # Remove the second row used for plot types - -# Debugging: Print species names from the tree and the data -cat("Species names in the tree based on nw:\n") -print(tree$tip.label) -cat("\nSpecies names in the data table before processing:\n") -print(data$species) - -# Debugging: Print the data frame to check its structure -cat("\nData frame:\n") -print(data) - -# Ensure species names match the tree tips -missing_in_tree <- setdiff(data$species, tree$tip.label) -missing_in_data <- setdiff(tree$tip.label, data$species) - -if (length(missing_in_tree) > 0) { - cat("Species in data but not in tree:\n") - print(missing_in_tree) -} - -if (length(missing_in_data) > 0) { - cat("Species in tree but not in data:\n") - print(missing_in_data) -} - -if (length(missing_in_tree) > 0 || length(missing_in_data) > 0) { - stop("Species names in the data do not match the tree tips") -} - -# Plot the phylogenetic tree -tree_plot <- ggtree(tree) + - geom_tiplab(size=args$text_size) + # Add labels to the tips of the tree - ggtitle("Phylogenetic Tree") + - theme(plot.margin = margin(0, 0, 0, 0)) - -# Extract the exact tree tip names:axis.text -tree$tip.label <- get_taxa_name(tree_plot) - -# Reorder the data based on the order of species in the tree -data <- data %>% arrange(match(species, tree$tip.label)) - -# Debugging: Print plot types -cat("\nPlot types:\n") -print(plot_types) - -# Debugging: Print the reordered data frame to check its structure -cat("\nReordered data frame:\n") -print(data) - -write.table(data, "Reordered_output_tree.tsv", sep="\t", quote=F) - -# Check if the number of unique species matches the number of tree tips -if (length(unique(data$species)) != length(tree$tip.label)) { - warning("The number of unique species in the data does not match the number of tree tips.") -} - -# Find column headers -column_headers <- colnames(data) - -# Create plots based on the plot types -plots <- list() -for (i in 2:length(column_headers)) { - column_name <- column_headers[i] - plot_type <- plot_types[i] - - if (plot_type == "bar") { - bar_plot <- ggplot(data, aes(x = as.numeric(!!sym(column_name)), y = factor(species, levels = rev(tree$tip.label)))) + - geom_bar(stat = "identity", fill = "darkblue") + - geom_text(aes(label = !!sym(column_name)), hjust = -0.1, size = args$text_size) + # Add text labels - theme_minimal() + - theme(axis.title.y = element_blank(), - axis.text.y = element_blank(), - axis.ticks.y = element_blank(), - axis.title.x = element_blank(), - axis.text.x = element_blank(), - axis.ticks.x = element_blank(), - panel.grid.major.y = element_blank(), # Remove horizontal grid lines - panel.grid.minor.y = element_blank(), - plot.margin = margin(0, 0, 0, 0)) + - labs(x = column_name) + - ggtitle(column_name) - plots[[length(plots) + 1]] <- bar_plot - } else if (plot_type == "text") { - text_plot <- ggplot(data, aes(y = factor(species, levels = rev(tree$tip.label)), x = 1, label = !!sym(column_name))) + - geom_text(size = args$text_size, hjust = 0) + - theme_void() + - labs(x = NULL, y = NULL) + - theme(axis.title.x = element_blank(), - axis.text.x = element_blank(), - axis.ticks.x = element_blank(), - plot.margin = margin(0, 0, 0, 0)) + - ggtitle(column_name) - plots[[length(plots) + 1]] <- text_plot - } else { - cat("Unknown plot type:", plot_type, "for column:", column_name, "\n") - } -} - -# Debugging: Print the list of plots -cat("List of plots:\n") -print(plots) - -# Combine the plots if there are any -if (length(plots) > 0) { - combined_plot <- plot_grid(tree_plot, plot_grid(plotlist = plots, ncol = length(plots)), - ncol = 2, rel_widths = c(1, length(plots))) # Adjust widths - - # Save the plot to a PDF file - ggsave("Phyloplot.pdf", plot = combined_plot, width = 10, height = 8) -} else { - cat("No plots to combine.\n") -} - -# Print warnings -warnings() diff --git a/bin/sum_cafe.pl b/bin/sum_cafe.pl deleted file mode 100755 index 3ffae9e..0000000 --- a/bin/sum_cafe.pl +++ /dev/null @@ -1,165 +0,0 @@ -#!/usr/bin/perl -use warnings; -use strict; - - -print "Please be in folder with all the Species Go Summarys\n"; - - -my %go_key; -my %go_names; -my %species_list; -my @gos=`ls *results_ALL.tab`; -my $species; -my $subset; -foreach my $sp (@gos){ - chomp $sp; - my @split=split(/\./, $sp); - #my @sp_folder=split("\/", $split[0]); - $species=$split[0]; - $subset=$split[1]; - #print "$species $subset\n"; - $species_list{$species}="Done"; - open(my $filein, "<", $sp) or die "Could not open $sp\n"; - my $header=<$filein>; - while (my $line = <$filein>){ - chomp $line; - #print "$line\n"; - my @split2=split("\t", $line); - $go_key{$split2[0]}{$subset}{$species}=$split2[9]; - $go_names{$split2[0]}=$split2[1]; - #print "$split2[0] $subset $species $split2[9]\n"; - } -} - - - -#Create a species array: -my @species_array; -foreach my $sp (keys %species_list) { - push (@species_array, $sp); -} - -#Create job type array: -my @job_type_array=("pos","neg"); - -#Create output files -my $outname="Go_summary_posneg_merged.tsv"; -open(my $outhandle, ">", $outname) or die "Could not open $outname merged\n"; -my $outname2="Go_summary_pos.tsv"; -open(my $outhandle2, ">", $outname2) or die "Could not open $outname2 pos\n"; -my $outname3="Go_summary_neg.tsv"; -open(my $outhandle3, ">", $outname3) or die "Could not open $outname3 neg\n"; - - -print $outhandle "GO_ID\tGO_term"; -print $outhandle2 "GO_ID\tGO_term"; -print $outhandle3 "GO_ID\tGO_term"; - - - -foreach my $types ( @job_type_array ) { - foreach my $species ( @species_array ) { - print $outhandle "\t$species\_$types"; - } -} - -foreach my $species ( @species_array ) { - print $outhandle2 "\t$species"; - print $outhandle3 "\t$species"; -} - -print $outhandle "\tCount_posSynteny\tCount_negSynteny\n"; -print $outhandle2 "\tCount_significant\n"; -print $outhandle3 "\tCount_significant\n"; - - - -foreach my $goterms (keys %go_key) { - print $outhandle "$goterms\t$go_names{$goterms}"; - print $outhandle2 "$goterms\t$go_names{$goterms}"; - print $outhandle3 "$goterms\t$go_names{$goterms}"; - - #All job_type_array - #My tests_signif count for all sampled: - my %count_pval; - - #For each test (e.g. topSynteny","botSynteny) - foreach my $types ( @job_type_array ) { - - foreach my $species ( @species_array ) { - if (exists $go_key{$goterms}{$types}{$species} ){ - print $outhandle "\t$go_key{$goterms}{$types}{$species}"; - - #if p value is less than 0.05, add to counts for GO term: - if ($go_key{$goterms}{$types}{$species} <= 0.05){ - if ($count_pval{$goterms}{$types}){ - $count_pval{$goterms}{$types}++; - } - else{ - $count_pval{$goterms}{$types}=1; - } - - } - - if($types eq "pos"){ - print $outhandle2 "\t$go_key{$goterms}{$types}{$species}"; - } - if($types eq "neg"){ - print $outhandle3 "\t$go_key{$goterms}{$types}{$species}"; - } - - } - else{ - #ALL types: - print $outhandle "\tNA"; - - if($types eq "pos"){ - print $outhandle2 "\tNA"; - } - if($types eq "neg"){ - print $outhandle3 "\tNA"; - } - - } - } - - #Now print off the significant counts per row: - foreach my $types ( @job_type_array ) { - if ($count_pval{$goterms}{$types}){ - print $outhandle "\t$count_pval{$goterms}{$types}"; - } - else{ - print $outhandle "\t0"; - } - - } - - #Now do for each output table: - if($types eq "pos"){ - if ($count_pval{$goterms}{$types}){ - print $outhandle2 "\t$count_pval{$goterms}{$types}"; - } - else{ - print $outhandle2 "\t0"; - } - } - if($types eq "neg"){ - if ($count_pval{$goterms}{$types}){ - print $outhandle3 "\t$count_pval{$goterms}{$types}"; - } - else{ - print $outhandle3 "\t0"; - } - } - } - - print $outhandle "\n"; - print $outhandle2 "\n"; - print $outhandle3 "\n"; - -} - - - -