From 6f4fd35b61dca9a89ba7e3306f0b2118dca9fd4f Mon Sep 17 00:00:00 2001 From: Abdullah Almsaeed Date: Fri, 3 Aug 2018 10:06:35 -0400 Subject: [PATCH 1/2] Add dev seed docs --- README.md | 33 +- src/Console/Commands/InitCommand.php | 1 + .../{DevseedSeeder.php => DevSeedSeeder.php} | 0 .../tests/DatabaseSeeders/DevSeedSeeder.php | 383 ++++++++++++++++++ 4 files changed, 416 insertions(+), 1 deletion(-) rename stubs/{DevseedSeeder.php => DevSeedSeeder.php} (100%) create mode 100644 tests/test_module/tests/DatabaseSeeders/DevSeedSeeder.php diff --git a/README.md b/README.md index 6a63b64..d179ddd 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,8 @@ with data for use in testing). - [Creating Database Seeders](#creating-database-seeders) - [Using Database Seeders](#using-database-seeders) - [Running Seeders](#running-seeders) + - [Retrieving Seeder Data](#retrieving-seeder-data) + - [Using DevSeed for Quick Biological Data Seeding]() - [Data Factories](#factories) - [Defining Factories](#defining-factories) - [Using Factories](#using-factories) @@ -48,13 +50,14 @@ From your module's directory, execute: ```bash # You may specify the module name or leave it blank. # When left blank, the name of the current directory will be used as the module name. -./vendor/bin/tripaltest init MODULE_NAME +./vendor/bin/tripaltest init [MODULE_NAME] ``` This will - Set up the testing framework by creating the tests directory, phpunit.xml and tests/bootstrap.php - Create an example test in tests/ExampleTest.php - Create a DatabaseSeeders folder and an example seeder in tests/DatabaseSeeders/UsersTableSeeder.php +- Create DevSeedSeeder.php in DatabaseSeeders. See the [DevSeed section](#using-devseed-for-quick-biological-data-seeding) to learn more about automatically populating the database with biological data. - Create an example `.env` file. 
- Create `.travis.yml` configured to use a tripal3 docker container to run your tests @@ -230,6 +233,34 @@ class MyTest extends TripalTestCase { } ``` +#### Using DevSeed for Quick Biological Data Seeding + +Tripal Test Suite ships with a default seeder called `DevSeedSeeder`. This seeder provides a quick +and automated way of seeding your database with biological data such as organisms, mRNAs, BLAST +annotations and InterProScan annotations. The data in the default seeder is obtained +from [Tripal DevSeed](https://github.com/statonlab/tripal_dev_seed), which is a developer +mini-set of biological data. + +**NOTE:** DevSeedSeeder.php becomes available after running `tripaltest init`. If you do not have +the seeder available, you can find it your `vendor/statonlab/tripal-test-suite/stubs/DevSeedSeeder.php` +and make it available by copying it to `tests/DatabaseSeeders/DevSeedSeeder.php`. The `init` command will +not override your files unless you specify the `--force` flag so it it's safe to run it to get only +the DevSeeder. + +To run the dev seed seeder, you first have to configure it by uncommenting the type of data you +want seeded. Then, you can run the seeder using `tripaltest db:seed DevSeedSeeder`. + +1. Open `DatabaseSeeders/DevSeedSeeder.php` +2. You'll notice a few commented properties in the top of the file. +3. Uncomment and modify the properties to your need +4. Next, run `tripaltest db:seed DevSeedSeeder` +5. If the seeder runs successfully, you'll be able to see all the records in your Chado database. + +The records provided by DevSeed are not published to your site as entities. You can do that +by adding `$this->publish('CHADO_TABLE')` at the end of the `up()` method of the `DevSeedSeeder`. +Replace `CHADO_TABLE` with the name of the table such as `feature` for mRNAs and `analysis` for analyses. +Or, if you prefer, you can use the Tripal admin interface to publish the records. 
+ ### Factories DB factories provide a method to populate the database with fake data. Using factories, you won't have to run SQL queries to populate the Database in every test. Since they are reusable, diff --git a/src/Console/Commands/InitCommand.php b/src/Console/Commands/InitCommand.php index 4fb562e..8f3763b 100644 --- a/src/Console/Commands/InitCommand.php +++ b/src/Console/Commands/InitCommand.php @@ -66,6 +66,7 @@ protected function handle() 'ExampleTest.php' => 'tests/ExampleTest.php', 'UsersTableSeeder.php' => 'tests/DatabaseSeeders/UsersTableSeeder.php', 'DataFactory.php' => 'tests/DataFactory.php', + 'DevSeedSeeder.php' => 'tests/DatabaseSeeders/DevSeedSeeder.php', ]); try { diff --git a/stubs/DevseedSeeder.php b/stubs/DevSeedSeeder.php similarity index 100% rename from stubs/DevseedSeeder.php rename to stubs/DevSeedSeeder.php diff --git a/tests/test_module/tests/DatabaseSeeders/DevSeedSeeder.php b/tests/test_module/tests/DatabaseSeeders/DevSeedSeeder.php new file mode 100644 index 0000000..ee1d675 --- /dev/null +++ b/tests/test_module/tests/DatabaseSeeders/DevSeedSeeder.php @@ -0,0 +1,383 @@ + 'F. excelsior miniature', +// 'genus' => 'Fraxinus', +// 'species' => 'excelsior', +// 'abbreviation' => 'F. excelsor', +// 'comment' => 'Loaded with TripalDev Seed.', +// ]; +// +// protected $sequence_analysis = [ +// 'name' => 'Fraxinus exclesior miniature dataset', +// 'description' => 'Tripal Dev Seed', +// ]; +// +// protected $expression_analysis = [ +// +// 'name' => 'Fraxinus exclesior miniature dataset Expression Analysis', +// 'description' => 'Tripal Dev Seed', +// ]; +// +// protected $blastdb = [ +// 'name' => 'DevSeed Database: TREMBL', +// 'description' => 'A dummy database created by DevSeed', +// ]; + + /** + * Part 2: + * Files. + * Each importer will take a file argument. This argument should be an array + * with one of the following two keys: file_remote => url where the file is + * located file_local => server path where the file is located. 
+ */ + + // protected $landmark_file = ['file_remote' => 'https://raw.githubusercontent.com/statonlab/tripal_dev_seed/master/Fexcel_mini/sequences/empty_landmarks.fasta']; + + protected $landmark_type = 'scaffold'; + + // protected $mRNA_file = ['file_remote' => 'https://raw.githubusercontent.com/statonlab/tripal_dev_seed/master/Fexcel_mini/sequences/mrna_mini.fasta']; + + // protected $protein_file = ['file_remote' => 'https://raw.githubusercontent.com/statonlab/tripal_dev_seed/master/Fexcel_mini/sequences/polypeptide_mini.fasta']; + + // protected $gff_file = ['file_remote' => 'https://raw.githubusercontent.com/statonlab/tripal_dev_seed/master/Fexcel_mini/gff/filtered.gff']; + + // protected $blast_file = ['file_remote' => 'https://raw.githubusercontent.com/statonlab/tripal_dev_seed/master/Fexcel_mini/gff/filtered.gff']; + + // protected $biomaterial_file = ['file_remote' => 'https://raw.githubusercontent.com/statonlab/tripal_dev_seed/master/Fexcel_mini/biomaterials/biomaterials.xml']; + + // protected $expression_file = ['file_remote' => 'https://raw.githubusercontent.com/statonlab/tripal_dev_seed/master/Fexcel_mini/expression/expression.tsv']; + + // protected $interpro_file = ['file_remote' => 'https://raw.githubusercontent.com/statonlab/tripal_dev_seed/master/Fexcel_mini/ips/polypeptide_mini.fasta.xml']; + + // Regular expression that will link the protein name to the mRNA parent feature name. 
+    // protected $prot_regexp = '/(FRA.*?)(?=:)/';
+
+    protected $prot_regexp = null;
+
+    /**
+     * Swaps the configured property arrays for database records.
+     *
+     * Each of $organism, $sequence_analysis, $expression_analysis and
+     * $blastdb that is set is passed to fetch_chado_record() and replaced by
+     * the value that call returns. The two analyses are only resolved when an
+     * organism is configured; the BLAST database is resolved independently.
+     *
+     * If any lookup throws, the exception message is echoed and the script
+     * terminates.
+     */
+    public function __construct()
+    {
+        if ($this->organism) {
+            try {
+                $organism = $this->fetch_chado_record('chado.organism', [
+                    'common_name',
+                    'organism_id',
+                ], $this->organism);
+            } catch (\Exception $e) {
+                echo $e->getMessage();
+                exit;
+            }
+
+            $this->organism = $organism;
+
+            if ($this->sequence_analysis) {
+                try {
+                    $seq_analysis = $this->fetch_chado_record('chado.analysis', ['analysis_id'],
+                        $this->sequence_analysis);
+                } catch (\Exception $e) {
+                    echo $e->getMessage();
+                    exit;
+                }
+
+                $this->sequence_analysis = $seq_analysis;
+            }
+
+            if ($this->expression_analysis) {
+                try {
+                    $expression_analysis = $this->fetch_chado_record('chado.analysis', ['analysis_id'],
+                        $this->expression_analysis);
+                } catch (\Exception $e) {
+                    echo $e->getMessage();
+                    exit;
+                }
+
+                $this->expression_analysis = $expression_analysis;
+            }
+        }
+
+        if ($this->blastdb) {
+            try {
+                $blastdb = $this->fetch_chado_record('chado.db', ['db_id'], $this->blastdb);
+            } catch (\Exception $e) {
+                // Stop here like the handlers above: carrying on would leave
+                // $blastdb undefined for the assignment below.
+                echo $e->getMessage();
+                exit;
+            }
+
+            $this->blastdb = $blastdb;
+        }
+    }
+
+    /**
+     * Runs all loaders.
+     * Will only run loaders where the files have been uncommented at the start
+     * of the class.
+     */
+    public function up()
+    {
+        // Each section below builds the argument array for one importer and
+        // runs it only when the matching *_file property is set.
+
+        if ($this->landmark_file) {
+            $run_args = [
+                'organism_id' => $this->organism->organism_id,
+                'analysis_id' => $this->sequence_analysis->analysis_id,
+                'seqtype' => $this->landmark_type,
+                'method' => 2, //default insert and update
+                'match_type' => 1, //unique name default
+                //optional
+                're_name' => null,
+                're_uname' => null,
+                're_accession' => null,
+                'db_id' => null,
+                'rel_type' => null,
+                're_subject' => null,
+                'parent_type' => null,
+            ];
+            $this->load_landmarks($run_args, $this->landmark_file);
+        }
+
+        if ($this->gff_file) {
+            $run_args = [
+                'analysis_id' => $this->sequence_analysis->analysis_id,
+                'organism_id' => $this->organism->organism_id,
+                'use_transaction' => 1,
+                'add_only' => 0,
+                'update' => 1,
+                'create_organism' => 0,
+                'create_target' => 0,
+
+                ///regexps for mRNA and protein.
+                're_mrna' => null,
+                're_protein' => $this->prot_regexp,
+                //optional
+                'target_organism_id' => null,
+                'target_type' => null,
+                'start_line' => null,
+                'landmark_type' => null,
+                'alt_id_attr' => null,
+            ];
+            $this->load_GFF($run_args, $this->gff_file);
+        }
+
+        if ($this->mRNA_file) {
+            $run_args = [
+                'organism_id' => $this->organism->organism_id,
+                'analysis_id' => $this->sequence_analysis->analysis_id,
+                'seqtype' => 'mRNA',
+                'method' => 2, //default insert and update
+                'match_type' => 1, //unique name default
+                //optional
+                're_name' => null,
+                're_uname' => null,
+                're_accession' => null,
+                'db_id' => null,
+                'rel_type' => null,
+                're_subject' => null,
+                'parent_type' => null,
+            ];
+            $this->load_mRNA_FASTA($run_args, $this->mRNA_file);
+        }
+
+        if ($this->protein_file) {
+            $run_args = [
+                'organism_id' => $this->organism->organism_id,
+                'analysis_id' => $this->sequence_analysis->analysis_id,
+                'seqtype' => 'polypeptide',
+                'method' => 2,
+                'match_type' => 1,
+                //optional
+                're_name' => null,
+                're_uname' => null,
+                're_accession' => null,
+                'db_id' => null,
+            ];
+
+            if ($this->prot_regexp) {
+                //links polypeptide to mRNA
+                $run_args['rel_type'] = 'derives_from';
+                $run_args['re_subject'] = $this->prot_regexp;
+                $run_args['parent_type'] = 'mRNA';
+            }
+            $this->load_polypeptide_FASTA($run_args, $this->protein_file);
+        }
+
+        if ($this->interpro_file) {
+            $run_args = [
+                'analysis_id' => $this->sequence_analysis->analysis_id,
+                //optional
+                // Quoted: the bare word mRNA is an undefined constant, which
+                // is an Error on PHP 8 and a warning on PHP 7.
+                'query_type' => 'mRNA',
+                'query_re' => $this->prot_regexp,
+                'query_uniquename' => null,
+                'parsego' => true,
+            ];
+
+            $this->load_interpro_annotations($run_args, $this->interpro_file);
+        }
+
+        if ($this->blast_file) {
+            $run_args = [
+                'analysis_id' => $this->sequence_analysis->analysis_id,
+                'no_parsed' => 25,//number results to parse
+                'query_type' => 'mRNA',
+                //optional
+                'blastdb' => $this->blastdb->db_id,
+                'blastfile_ext' => null,
+                'is_concat' => 0,
+                'query_re' => null,
+                'query_uniquename' => 0,
+            ];
+
+            $this->load_blast_annotations($run_args, $this->blast_file);
+        }
+
+        if ($this->biomaterial_file) {
+            $run_args = [
+                'organism_id' => $this->organism->organism_id,
+                'analysis_id' => $this->sequence_analysis->analysis_id,
+            ];
+            //optional: specifies specific CVterms for properties/property values. Not used here.
+            //'cvterm_configuration' => NULL,
+            //'cvalue_configuration' => NULL];
+
+            $this->load_biomaterials($run_args, $this->biomaterial_file);
+        }
+
+        if ($this->expression_file) {
+            $run_args = [
+                'filetype' => 'mat', //matrix file type
+                'organism_id' => $this->organism->organism_id,
+                'analysis_id' => $this->sequence_analysis->analysis_id,
+                //optional
+                'fileext' => null,
+                're_start' => null,
+                're_stop' => null,
+                // NOTE(review): this key used to be listed twice ('uniq', then
+                // null). PHP keeps the last value, so null is what was actually
+                // passed and is kept here. Confirm whether 'uniq' was intended.
+                'feature_uniquenames' => null,
+                'quantificationunits' => null,
+            ];
+            $this->load_expression($run_args, $this->expression_file);
+        }
+    }
+
+    /**
+     * Runs the FASTA importer for the landmark sequences.
+     *
+     * @param array $run_args Arguments handed to the importer's create().
+     * @param array $file File descriptor handed to the importer's create().
+     */
+    private function load_landmarks($run_args, $file)
+    {
+        module_load_include('inc', 'tripal_chado', 'includes/TripalImporter/FASTAImporter');
+
+        $importer = new \FASTAImporter();
+        $importer->create($run_args, $file);
+        $importer->prepareFiles();
+        $importer->run();
+    }
+
+    /**
+     * Runs the FASTA importer for the mRNA sequences.
+     *
+     * @param array $run_args Arguments handed to the importer's create().
+     * @param array $file File descriptor handed to the importer's create().
+     */
+    private function load_mRNA_FASTA($run_args, $file)
+    {
+        module_load_include('inc', 'tripal_chado', 'includes/TripalImporter/FASTAImporter');
+
+        $importer = new \FASTAImporter();
+        $importer->create($run_args, $file);
+        $importer->prepareFiles();
+        $importer->run();
+    }
+
+    /**
+     * Runs the FASTA importer for the polypeptide sequences.
+     *
+     * @param array $run_args Arguments handed to the importer's create().
+     * @param array $file File descriptor handed to the importer's create().
+     */
+    private function load_polypeptide_FASTA($run_args, $file)
+    {
+        module_load_include('inc', 'tripal_chado', 'includes/TripalImporter/FASTAImporter');
+
+        $importer = new \FASTAImporter();
+        $importer->create($run_args, $file);
+        $importer->prepareFiles();
+        $importer->run();
+    }
+
+    /**
+     * Runs the InterPro importer.
+     *
+     * @param array $run_args Arguments handed to the importer's create().
+     * @param array $file File descriptor handed to the importer's create().
+     */
+    private function load_interpro_annotations($run_args, $file)
+    {
+        module_load_include('inc', 'tripal_analysis_interpro', 'includes/TripalImporter/InterProImporter');
+
+        $importer = new \InterProImporter();
+        $importer->create($run_args, $file);
+        $importer->prepareFiles();
+        $importer->run();
+    }
+
+    /**
+     * Runs the GFF3 importer.
+     *
+     * @param array $run_args Arguments handed to the importer's create().
+     * @param array $file File descriptor handed to the importer's create().
+     */
+    private function load_GFF($run_args, $file)
+    {
+        module_load_include('inc', 'tripal_chado', 'includes/TripalImporter/GFF3Importer');
+
+        $importer = new \GFF3Importer();
+        $importer->create($run_args, $file);
+        $importer->prepareFiles();
+        $importer->run();
+    }
+
+    /**
+     * Runs the BLAST importer.
+     *
+     * @param array $run_args Arguments handed to the importer's create().
+     * @param array $file File descriptor handed to the importer's create().
+     */
+    private function load_blast_annotations($run_args, $file)
+    {
+        module_load_include('inc', 'tripal_analysis_blast', 'includes/TripalImporter/BlastImporter');
+
+        $importer = new \BlastImporter();
+        $importer->create($run_args, $file);
+        $importer->prepareFiles();
+        $importer->run();
+    }
+
+    /**
+     * Runs the biomaterial importer.
+     *
+     * @param array $run_args Arguments handed to the importer's create().
+     * @param array $file File descriptor handed to the importer's create().
+     */
+    private function load_biomaterials($run_args, $file)
+    {
+        module_load_include('inc', 'tripal_biomaterial', 'includes/TripalImporter/tripal_biomaterial_loader_v3');
+
+        $importer = new \tripal_biomaterial_loader_v3();
+        $importer->create($run_args, $file);
+        $importer->prepareFiles();
+        $importer->run();
+    }
+
+    /**
+     * Runs the expression data importer.
+     *
+     * @param array $run_args Arguments handed to the importer's create().
+     * @param array $file File descriptor handed to the importer's create().
+     */
+    private function load_expression($run_args, $file)
+    {
+        module_load_include('inc', 'tripal_analysis_expression',
+            'includes/TripalImporter/tripal_expression_data_loader');
+
+        $importer = new \tripal_expression_data_loader();
+        $importer->create($run_args, $file);
+        $importer->prepareFiles();
+        $importer->run();
+    }
+
+    /**
+     * Returns the one record matching the given values, creating it if absent.
+     *
+     * Selects from $table with an equality condition for every entry of
+     * $factory_array and counts the matches:
+     *  - no match: a record is created through factory($table) and returned;
+     *  - exactly one match: that row is fetched and returned as an object.
+     *
+     * @param string $table Table name passed to db_select() and factory().
+     * @param array $fields Columns to select from the matching row.
+     * @param array $factory_array Column => value pairs used both as the
+     *   lookup conditions and as the attributes of a newly created record.
+     *
+     * @return mixed The fetched row object or the factory-created record.
+     *
+     * @throws \Exception When more than one record matches.
+     */
+    private function fetch_chado_record($table, $fields, $factory_array)
+    {
+        $query = db_select($table, 't')->fields('t', $fields);
+
+        foreach ($factory_array as $key => $value) {
+            $query->condition($key, $value);
+        }
+
+        // Keep the counted value: comparing the query object itself against
+        // 0 or 1 can never be true, which made every call fall through to the
+        // exception below.
+        $count = (int) $query->countQuery()->execute()->fetchField();
+
+        if ($count === 0) {
+            return factory($table)->create($factory_array);
+        }
+
+        if ($count === 1) {
+            return $query->execute()->fetchObject();
+        }
+
+        // Fully qualified so it resolves to the global class that the
+        // constructor's catch (\Exception) blocks expect.
+        throw new \Exception("Error creating object for: ".$table.".\n Array supplied matches ".$count." results, must match 1.");
+    }
+}
From e92710d55210e03cbc3dfc62ffe509648f4942a3 Mon Sep 17 00:00:00 2001 From: Bradford Condon Date: Fri, 3 Aug 2018 10:17:00 -0400 Subject: [PATCH 2/2] Update README.md --- README.md | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index d179ddd..4cc8377 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ with data for use in testing). 
- [Using Database Seeders](#using-database-seeders) - [Running Seeders](#running-seeders) - [Retrieving Seeder Data](#retrieving-seeder-data) - - [Using DevSeed for Quick Biological Data Seeding]() + - [Using DevSeed for Quick Biological Data Seeding](#using-devseed-for-quick-biological-data-seeding) - [Data Factories](#factories) - [Defining Factories](#defining-factories) - [Using Factories](#using-factories) @@ -241,20 +241,18 @@ annotations and InterProScan annotations. The data in the default seeder is obta from [Tripal DevSeed](https://github.com/statonlab/tripal_dev_seed), which is a developer mini-set of biological data. -**NOTE:** DevSeedSeeder.php becomes available after running `tripaltest init`. If you do not have -the seeder available, you can find it your `vendor/statonlab/tripal-test-suite/stubs/DevSeedSeeder.php` -and make it available by copying it to `tests/DatabaseSeeders/DevSeedSeeder.php`. The `init` command will -not override your files unless you specify the `--force` flag so it it's safe to run it to get only +**NOTE:** DevSeedSeeder.php becomes available after running `tripaltest init`. The `init` command will +not override existing files unless you specify the `--force` flag so it's safe to run it to get only the DevSeeder. -To run the dev seed seeder, you first have to configure it by uncommenting the type of data you -want seeded. Then, you can run the seeder using `tripaltest db:seed DevSeedSeeder`. +By default, the DevSeed comes with all sub-loaders disabled. To run the DevSeed seeder, you first have to configure it by uncommenting the type of data you want seeded. Then, you can run the seeder using `tripaltest db:seed DevSeedSeeder`. 1. Open `DatabaseSeeders/DevSeedSeeder.php` 2. You'll notice a few commented properties in the top of the file. -3. Uncomment and modify the properties to your need -4. Next, run `tripaltest db:seed DevSeedSeeder` -5. 
If the seeder runs successfully, you'll be able to see all the records in your Chado database. +3. Uncomment and modify the properties to your need. +4. Carefully follow the instructions in this section. All loaders require an organism as well, but some are dependent on previous loaders. +5. Next, run `tripaltest db:seed DevSeedSeeder` +6. If the seeder runs successfully, you'll be able to see all the records in your Chado database. The records provided by DevSeed are not published to your site as entities. You can do that by adding `$this->publish('CHADO_TABLE')` at the end of the `up()` method of the `DevSeedSeeder`.