-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* validate taxonomy script * unit testing for taxonomy * unit testing for taxonomy * moved XXXXXX entries to a todo file * validating names.dmp and added new entries to make taxonomy more complete * Contributing.md doc * link to contributing.md * more description under contributions Co-authored-by: Lee Katz - Aspen <[email protected]>
- Loading branch information
Showing
9 changed files
with
203 additions
and
33 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
# Continuous-integration workflow: on every push, check that the taxonomy
# dump files are internally consistent and that every taxid referenced by the
# chromosome and plasmid tables exists in the taxonomy.
on: [push]
name: Validate taxonomy

jobs:
  build:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: ['ubuntu-18.04' ]
        perl: [ '5.32' ]
    name: Perl ${{ matrix.perl }} on ${{ matrix.os }}
    steps:
      - name: Set up perl
        uses: shogo82148/actions-setup-perl@v1
        with:
          perl-version: ${{ matrix.perl }}
          multi-thread: "true"
      - name: checkout my repo
        uses: actions/checkout@v2
        with:
          path: Kalamari

      - name: validate taxonomy
        run: perl Kalamari/bin/validateTaxonomy.pl Kalamari/src/taxonomy
      # Each one-liner loads the taxids of one .dmp file in BEGIN, then checks
      # columns 3 and 4 (taxid and parent taxid) of every table row against them.
      # The END block turns any missing taxid into a non-zero exit status so
      # that the step actually fails instead of only printing a message.
      - name: matching taxids
        run: |
          echo "Making sure that all taxids in chromosomes.tsv and plasmids.tsv are present in nodes.dmp and names.dmp"
          tail -q -n +2 Kalamari/src/chromosomes.tsv Kalamari/src/plasmids.tsv | perl -F'\t' -lane 'BEGIN{@node=`cat Kalamari/src/taxonomy/nodes.dmp`; for $n(@node){($taxid)=split(/\t/, $n); $taxid{$taxid}++; } } for my $t($F[2], $F[3]){ if(!$taxid{$t}){ print "Could not find $t taxid"; $missing++; } } END{ $? = 1 if $missing; }'
          tail -q -n +2 Kalamari/src/chromosomes.tsv Kalamari/src/plasmids.tsv | perl -F'\t' -lane 'BEGIN{@name=`cat Kalamari/src/taxonomy/names.dmp`; for $n(@name){($taxid)=split(/\t/, $n); $taxid{$taxid}++; } } for my $t($F[2], $F[3]){ if(!$taxid{$t}){ print "Could not find $t taxid"; $missing++; } } END{ $? = 1 if $missing; }'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# Contributing | ||
|
||
There are many ways to contribute to this project; here are a couple of them.
Contributions will almost always result in a pull request. | ||
Contributions must pass the automated testing. | ||
|
||
## Add a taxon | ||
|
||
To add a taxon, add it to src/taxonomy/nodes.dmp and src/taxonomy/names.dmp.
If it is present in the NCBI taxonomy, please use that identifier. | ||
Please adhere to the [NCBI taxonomy format specification](https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump_readme.txt). | ||
For names.dmp, the scientific name field is required. | ||
|
||
The second step for adding a taxon is to add its representative chromosome(s).
See the section below for details. | ||
You cannot add a taxon to this project without a representative chromosome. | ||
|
||
## Add a chromosome | ||
|
||
Add an entry to either src/chromosomes.tsv or src/plasmids.tsv. | ||
The format is four columns, separated by tab: | ||
|
||
* scientific name or similar | ||
* NCBI nucleotide accession | ||
* taxonomy ID | ||
* parent taxonomy ID | ||
|
||
The taxonomy IDs in each line must be represented in names.dmp and nodes.dmp in the folder src/taxonomy. | ||
|
||
New nucleotide entries must be | ||
|
||
* Trusted - subject matter experts must agree that this is a representative genome for the taxon | ||
* Completed - no gaps | ||
* Nonredundant - for the most part, taxa are not represented by multiple assemblies
|
||
Note: some species such as _Vibrio cholerae_ have multiple chromosomes. | ||
These can be denoted with multiple lines, one per nucleotide accession. | ||
|
||
## Other contributions | ||
|
||
Please open a new issue on GitHub and describe the potential contribution.
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
#!/usr/bin/env perl | ||
use strict; | ||
use warnings; | ||
use Getopt::Long qw/GetOptions/; | ||
use File::Basename qw/basename/; | ||
use File::Path qw/make_path/; | ||
use Data::Dumper qw/Dumper/; | ||
|
||
local $0 = basename $0; | ||
# Write one line to STDERR, prefixed with the current value of $0.
# All arguments are interpolated into a single string, so multiple arguments
# are joined with the list separator (a single space unless $" was changed).
sub logmsg{
  my $message = "@_";
  print STDERR "$0: $message\n";
}
|
||
exit main(); | ||
|
||
# Entry point of the script.
#
# Parses the command line, validates every taxonomy directory named in @ARGV
# and reports the result for each one on STDERR.
#
# Returns the process exit code: 0 when every directory is valid, 1 when at
# least one directory failed validation. The caller passes this straight to
# exit(), so a broken taxonomy makes automated checks fail.
sub main{
  my $settings={};
  # $! is not set by GetOptions, so report a fixed message instead of errno.
  GetOptions($settings,qw(help)) or die "ERROR: could not parse the command line options\n";
  # usage() prints the help text and exits by itself; it never returns.
  die usage() if($$settings{help} || !@ARGV);

  my $num_invalid = 0;
  for my $taxdir (@ARGV){
    if(validateTaxonomy($taxdir, $settings)){
      logmsg("Valid: $taxdir");
    } else {
      # validateTaxonomy has already logged the specific reason.
      logmsg("INVALID: $taxdir");
      $num_invalid++;
    }
  }

  return $num_invalid ? 1 : 0;
}
|
||
# Check one taxonomy directory for internal consistency.
#
# Arguments:
#   $dir      - directory that contains names.dmp and nodes.dmp
#   $settings - hash reference of options, handed on to readDmp
#
# Returns 1 if the taxonomy is good and 0 if not. The first problem found is
# logged through logmsg and ends the check immediately.
sub validateTaxonomy{
  my($dir, $settings) = @_;

  my $names = readDmp("$dir/names.dmp", $settings);
  my $nodes = readDmp("$dir/nodes.dmp", $settings);

  for my $taxid (keys %$nodes){
    # The first remaining field of a nodes.dmp record is the parent taxid.
    my $parent = $nodes->{$taxid}[0];

    # A parent of 1 or 0 is accepted without a node of its own;
    # any other parent must be present in nodes.dmp.
    my $parent_is_known = $nodes->{$parent} ? 1 : 0;
    if(!$parent_is_known && $parent > 1){
      logmsg("ERROR: could not find node $parent which is the parent of $taxid");
      return 0;
    }

    # Both the node and its parent need an entry in names.dmp,
    # again with the exception of taxids 1 and 0.
    if($taxid > 1 && !$names->{$taxid}){
      logmsg("ERROR: could not find an entry in names.dmp for $taxid");
      return 0;
    }
    if($parent > 1 && !$names->{$parent}){
      logmsg("ERROR: could not find an entry in names.dmp for $parent");
      return 0;
    }
  }

  return 1;
}
|
||
# Read a taxonomy dump file into memory.
#
# Each record is one line whose fields are separated by "\t|\t" and which may
# end with a trailing "\t|". The first field is used as the taxid.
#
# Arguments:
#   $dmp      - path to the .dmp file
#   $settings - hash reference of options (currently unused here)
#
# Returns a hash reference: taxid => array reference of the remaining fields.
# When a taxid occurs on several lines, the last line wins.
# Dies if the file cannot be opened.
sub readDmp{
  my($dmp, $settings) = @_;
  my %dmp;
  # Three-argument open: the path can never be misread as a mode or a pipe.
  open(my $fh, '<', $dmp) or die "ERROR: could not read $dmp: $!";
  while(my $line = <$fh>){
    chomp $line;
    # A blank line would leave @F empty, and modifying $F[-1] of an empty
    # array is a fatal error, so skip such lines.
    next if($line =~ /^\s*$/);

    my @F = split(/\t\|\t/, $line);
    $F[-1] =~ s/\t\|$//; # remove trailing chars for last field
    my $taxid = shift(@F);
    $dmp{$taxid} = \@F;
  }
  close($fh);

  return \%dmp;
}
|
||
# Print the help text to STDOUT and end the program with exit code 0.
# This sub never returns to its caller.
sub usage{
  my @help_lines = (
    "Validate a folder of taxonomy containing nodes.dmp and names.dmp",
    "Usage: $0 taxonomy/ [taxonomy2...]",
  );
  print map { "$_\n" } @help_lines;
  exit 0;
}
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
Arcobacter butzleri XXXXXX 28197 28196 | ||
Arcobacter cloacae XXXXXX 1054034 28196 | ||
Arcobacter cryaerophilus XXXXXX 28198 28196 | ||
Arcobacter nitrofigilis XXXXXX 28199 28196 | ||
Arcobacter venerupis XXXXXX 1054033 28196 | ||
Campylobacter canadensis XXXXXX 449520 194 | ||
Campylobacter corcagiensis XXXXXX 1448857 194 | ||
Campylobacter curvus XXXXXX 200 194 | ||
Campylobacter iguaniorum XXXXXX 1244531 194 | ||
Campylobacter jejuni doylei XXXXXX 32021 197 | ||
Campylobacter mucosalis XXXXXX 202 194 | ||
Campylobacter rectus XXXXXX 203 194 | ||
Campylobacter showae XXXXXX 204 194 | ||
Campylobacter upsaliensis XXXXXX 28080 194 | ||
Helicobacter bilis XXXXXX 37372 209 | ||
Helicobacter cinaedi XXXXXX 213 209 | ||
Helicobacter pullorum XXXXXX 35818 209 | ||
Helicobacter winghamensis XXXXXX 157268 209 | ||
Helicobacter valdiviensis XXXXXX 1458358 209 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters