From 68c3ae751a9db2dc77871dfe2a840ec0fd796da3 Mon Sep 17 00:00:00 2001 From: Michael Nuhn Date: Fri, 18 Mar 2016 10:57:36 +0000 Subject: [PATCH] Various changes, mostly related to storing paths and files relevant to the quality check. That makes it easier for users to investigate the results. --- .../EnsEMBL/Funcgen/Hive/Config/QC_Chance.pm | 1 + .../EnsEMBL/Funcgen/Hive/Config/QC_Fastqc.pm | 1 + .../Funcgen/Hive/Config/QC_Flagstats.pm | 3 +++ .../Funcgen/Hive/Config/QC_PhantomPeaks.pm | 3 +++ .../Config/QC_ProportionOfReadsInPeaks.pm | 2 +- scripts/sequencing/load_argenrich_qc_file.pl | 10 ++++++--- .../sequencing/load_fastqc_summary_file.pl | 7 +++++-- scripts/sequencing/load_phantom_peak_file.pl | 21 ++++++++++++++----- scripts/sequencing/load_samtools_flagstats.pl | 15 +++++++++---- .../proportion_of_reads_in_peaks.pl | 9 ++++++-- 10 files changed, 55 insertions(+), 17 deletions(-) diff --git a/modules/Bio/EnsEMBL/Funcgen/Hive/Config/QC_Chance.pm b/modules/Bio/EnsEMBL/Funcgen/Hive/Config/QC_Chance.pm index b08af22a6..a19c60964 100644 --- a/modules/Bio/EnsEMBL/Funcgen/Hive/Config/QC_Chance.pm +++ b/modules/Bio/EnsEMBL/Funcgen/Hive/Config/QC_Chance.pm @@ -145,6 +145,7 @@ sub pipeline_analyses { . qq( --pass #tracking_db_pass# ) . qq( --host #tracking_db_host# ) . qq( --dbname #tracking_db_name# ) + . qq( --work_dir #tempdir# ) }, }, ]; diff --git a/modules/Bio/EnsEMBL/Funcgen/Hive/Config/QC_Fastqc.pm b/modules/Bio/EnsEMBL/Funcgen/Hive/Config/QC_Fastqc.pm index bf39f762d..64117503b 100644 --- a/modules/Bio/EnsEMBL/Funcgen/Hive/Config/QC_Fastqc.pm +++ b/modules/Bio/EnsEMBL/Funcgen/Hive/Config/QC_Fastqc.pm @@ -100,6 +100,7 @@ sub pipeline_analyses { cmd => qq(load_fastqc_summary_file.pl ) . qq( --input_subset_id #input_subset_id# ) . qq( --summary_file #fastqc_summary_file# ) + . qq( --work_dir #tempdir# ) . qq( | mysql ) . qq( --host #tracking_db_host# ) . 
qq( --port #tracking_db_port# ) diff --git a/modules/Bio/EnsEMBL/Funcgen/Hive/Config/QC_Flagstats.pm b/modules/Bio/EnsEMBL/Funcgen/Hive/Config/QC_Flagstats.pm index fffc4857a..1ffe5c7f1 100644 --- a/modules/Bio/EnsEMBL/Funcgen/Hive/Config/QC_Flagstats.pm +++ b/modules/Bio/EnsEMBL/Funcgen/Hive/Config/QC_Flagstats.pm @@ -66,6 +66,9 @@ sub pipeline_analyses { . qq( --result_set_id #result_set_id# ) . qq( --flagstats_file #flagstats_file# ) . qq( --user #tracking_db_user# --pass #tracking_db_pass# --host #tracking_db_host# --dbname #tracking_db_name# ) + . qq( --work_dir #tempdir# ) + . qq( --bam_file #bam_file# ) + }, }, diff --git a/modules/Bio/EnsEMBL/Funcgen/Hive/Config/QC_PhantomPeaks.pm b/modules/Bio/EnsEMBL/Funcgen/Hive/Config/QC_PhantomPeaks.pm index 3e75af004..35cca3f18 100644 --- a/modules/Bio/EnsEMBL/Funcgen/Hive/Config/QC_PhantomPeaks.pm +++ b/modules/Bio/EnsEMBL/Funcgen/Hive/Config/QC_PhantomPeaks.pm @@ -107,6 +107,9 @@ sub pipeline_analyses { . qq( --pass #tracking_db_pass# ) . qq( --host #tracking_db_host# ) . qq( --dbname #tracking_db_name# ) + . qq( --work_dir #tempdir# ) + . qq( --bam_file #bam_file# ) + }, }, ]; diff --git a/modules/Bio/EnsEMBL/Funcgen/Hive/Config/QC_ProportionOfReadsInPeaks.pm b/modules/Bio/EnsEMBL/Funcgen/Hive/Config/QC_ProportionOfReadsInPeaks.pm index 0f4b94664..72e31dd88 100644 --- a/modules/Bio/EnsEMBL/Funcgen/Hive/Config/QC_ProportionOfReadsInPeaks.pm +++ b/modules/Bio/EnsEMBL/Funcgen/Hive/Config/QC_ProportionOfReadsInPeaks.pm @@ -54,13 +54,13 @@ sub pipeline_analyses { cmd => qq( proportion_of_reads_in_peaks.pl ) . qq( --peak_file #peak_file# ) . qq( --temp_dir #temp_dir# ) - . qq( --bam_file #bam_file# ) . qq( --peak_caller #peak_caller# ) . qq( --feature_set_id #feature_set_id# ) . qq( --user #tracking_db_user# ) . qq( --pass #tracking_db_pass# ) . qq( --host #tracking_db_host# ) . qq( --dbname #tracking_db_name# ) + . 
qq( --bam_file #bam_file# ) }, -rc_name => 'normal_2GB', }, diff --git a/scripts/sequencing/load_argenrich_qc_file.pl b/scripts/sequencing/load_argenrich_qc_file.pl index ff030fe3e..7074886cb 100755 --- a/scripts/sequencing/load_argenrich_qc_file.pl +++ b/scripts/sequencing/load_argenrich_qc_file.pl @@ -78,6 +78,7 @@ =head1 DESCRIPTION my $host; my $dbname; my $signal_result_set_id; +my $work_dir; my %config_hash = ( "argenrich_file" => \$argenrich_file, @@ -88,6 +89,7 @@ =head1 DESCRIPTION 'pass' => \$pass, 'host' => \$host, 'dbname' => \$dbname, + 'work_dir' => \$work_dir, ); my $result = GetOptions( @@ -100,6 +102,7 @@ =head1 DESCRIPTION 'pass=s', 'host=s', 'dbname=s', + 'work_dir=s', ); die unless(-e $argenrich_file); @@ -147,7 +150,7 @@ =head1 DESCRIPTION signal_result_set_id, analysis_id, p, q, divergence, z_score, percent_genome_enriched, input_scaling_factor, differential_percentage_enrichment, control_enrichment_stronger_than_chip_at_bin, first_nonzero_bin_at, - pcr_amplification_bias_in_Input_coverage_of_1_percent_of_genome + pcr_amplification_bias_in_Input_coverage_of_1_percent_of_genome, path ) values ( $signal_result_set_id, $analysis_id, @@ -160,7 +163,8 @@ =head1 DESCRIPTION $key_value_pairs{'differential_percentage_enrichment'}, $key_value_pairs{'Control enrichment stronger than ChIP at bin'}, $key_value_pairs{'Zero-enriched IP, maximum difference at bin'}, - $key_value_pairs{'PCR amplification bias in Input, coverage of 1% of genome'} + $key_value_pairs{'PCR amplification bias in Input, coverage of 1% of genome'}, + '$work_dir' ) ); @@ -276,7 +280,7 @@ sub create_table { -- greater deviations from that are reported. 
-- `pcr_amplification_bias_in_Input_coverage_of_1_percent_of_genome`double default NULL, - `path` varchar(100) NOT NULL, + `path` varchar(512) NOT NULL, PRIMARY KEY (`result_set_qc_chance_id`) ); SQL diff --git a/scripts/sequencing/load_fastqc_summary_file.pl b/scripts/sequencing/load_fastqc_summary_file.pl index 2a477771c..eeb8241e0 100755 --- a/scripts/sequencing/load_fastqc_summary_file.pl +++ b/scripts/sequencing/load_fastqc_summary_file.pl @@ -74,16 +74,19 @@ =head1 DESCRIPTION my $summary_file; my $input_subset_id; +my $work_dir; my %config_hash = ( "summary_file" => \$summary_file, "input_subset_id" => \$input_subset_id, + "work_dir" => \$work_dir, ); my $result = GetOptions( \%config_hash, 'input_subset_id=s', 'summary_file=s', + 'work_dir=s', ); die unless(-e $summary_file); @@ -98,7 +101,7 @@ =head1 DESCRIPTION #print " - $current_line\n"; my @f = split "\t", $current_line; #print Dumper(\@f); - my $sql = "INSERT ignore INTO input_subset_fastqc (input_subset_id,status,title,file_name) VALUES (".$input_subset_id.", '".$f[0]."', '".$f[1]."', '".$f[2]."')"; + my $sql = "INSERT ignore INTO input_subset_fastqc (input_subset_id,status,title,file_name,path) VALUES (".$input_subset_id.", '".$f[0]."', '".$f[1]."', '".$f[2]."', '".$work_dir."')"; print "$sql;\n"; } @@ -114,7 +117,7 @@ sub create_table_sql { `status` varchar(100) NOT NULL, `title` varchar(100) NOT NULL, `file_name` varchar(100) NOT NULL, - `path` varchar(100) NOT NULL, + `path` varchar(512) NOT NULL, PRIMARY KEY (`input_subset_qc_id`), UNIQUE KEY `name_exp_idx` (`input_subset_id`,`title`) ) ENGINE=MyISAM; diff --git a/scripts/sequencing/load_phantom_peak_file.pl b/scripts/sequencing/load_phantom_peak_file.pl index bb5a060a0..9280a050d 100755 --- a/scripts/sequencing/load_phantom_peak_file.pl +++ b/scripts/sequencing/load_phantom_peak_file.pl @@ -59,6 +59,8 @@ =head1 DESCRIPTION my $pass; my $host; my $dbname; +my $work_dir; +my $bam_file; my %config_hash = ( "result_file" => \$result_file, @@ -68,6 
+70,8 @@ =head1 DESCRIPTION 'pass' => \$pass, 'host' => \$host, 'dbname' => \$dbname, + 'work_dir' => \$work_dir, + 'bam_file' => \$bam_file, ); my $result = GetOptions( @@ -79,6 +83,8 @@ =head1 DESCRIPTION 'pass=s', 'host=s', 'dbname=s', + 'work_dir=s', + 'bam_file=s', ); if (! $result_file) { @@ -133,6 +139,7 @@ =head1 DESCRIPTION } else { $sql_processor = sub { my $sql = shift; + $logger->info($sql . "\n"); $dbc->do($sql); }; } @@ -219,13 +226,15 @@ =head1 DESCRIPTION . "min_corr, " . "NSC, " . "RSC, " - . "QualityTag " + . "QualityTag, " + . "path " . ") VALUES (" . ( join ', ', ( $result_set_id, $analysis_id, - quote($filename), + #quote($filename), + quote($bam_file), $numReads, $estFragLen, @@ -243,6 +252,7 @@ =head1 DESCRIPTION $NSC, $RSC, $QualityTag, + quote($work_dir) ) ) . ");"; @@ -265,7 +275,7 @@ sub create_table { `result_set_qc_phantom_peak_id` int(10) unsigned NOT NULL AUTO_INCREMENT, `analysis_id` int(10) unsigned, `result_set_id` int(10) unsigned NOT NULL, - `filename` varchar(100) NOT NULL, + `filename` varchar(512) NOT NULL, `numReads` int(10) unsigned NOT NULL, `estFragLen` double default NULL, `estFragLen2` double default NULL, @@ -309,9 +319,10 @@ sub create_table { -- Quality values derived from the RSC -- `QualityTag` int(10), - `path` varchar(100) NOT NULL, + `path` varchar(512) NOT NULL, PRIMARY KEY (`result_set_qc_phantom_peak_id`), - UNIQUE KEY `filename_idx` (`filename`) +-- UNIQUE KEY `filename_idx` (`filename`) + KEY `filename_idx` (`filename`) ); SQL ; diff --git a/scripts/sequencing/load_samtools_flagstats.pl b/scripts/sequencing/load_samtools_flagstats.pl index e8b490d7b..047eedcf1 100755 --- a/scripts/sequencing/load_samtools_flagstats.pl +++ b/scripts/sequencing/load_samtools_flagstats.pl @@ -51,6 +51,8 @@ =head1 DESCRIPTION my $pass; my $host; my $dbname; +my $work_dir; +my $bam_file; my %config_hash = ( 'flagstats_file' => \$flagstats_file, @@ -60,6 +62,8 @@ =head1 DESCRIPTION 'pass' => \$pass, 'host' => \$host, 'dbname' => 
\$dbname, + 'work_dir' => \$work_dir, + 'bam_file' => \$bam_file, ); # Loading command line paramters into variables and into a hash. @@ -72,6 +76,8 @@ =head1 DESCRIPTION 'pass=s', 'host=s', 'dbname=s', + 'work_dir=s', + 'bam_file=s', ); die unless(-e $flagstats_file); @@ -146,7 +152,7 @@ sub create_insert_sql { my $sql_processor = $param->{sql_processor}; open IN, $flagstats_file; - + while (my $current_line = <IN>) { chomp $current_line; my $recognized = $current_line =~ /^(\d+) \+ (\d) (.+)$/; @@ -156,9 +162,9 @@ sub create_insert_sql { my $category = $3; my $sql = "INSERT INTO result_set_qc_flagstats " - . "(result_set_id,analysis_id,category,qc_passed_reads,qc_failed_reads) " + . "(result_set_id, analysis_id, category, qc_passed_reads, qc_failed_reads, path, bam_file) " . "VALUES " - . "($result_set_id, $analysis_id, '$category', $qc_passed_reads, $qc_failed_reads);"; + . "($result_set_id, $analysis_id, '$category', $qc_passed_reads, $qc_failed_reads, '$work_dir', '$bam_file');"; $sql_processor->($sql); } else { $logger->debug("Can't parse: " . $current_line . "\n"); @@ -182,7 +188,8 @@ sub create_flagstats_table { `category` varchar(100) NOT NULL, `qc_passed_reads` int(10) unsigned, `qc_failed_reads` int(10) unsigned, - `path` varchar(100) NOT NULL, + `path` varchar(512) NOT NULL, + `bam_file` varchar(512) NOT NULL, PRIMARY KEY (`result_set_qc_id`), UNIQUE KEY `name_exp_idx` (`result_set_qc_id`,`category`) ); diff --git a/scripts/sequencing/proportion_of_reads_in_peaks.pl b/scripts/sequencing/proportion_of_reads_in_peaks.pl index 322d5a83d..edbe24d94 100755 --- a/scripts/sequencing/proportion_of_reads_in_peaks.pl +++ b/scripts/sequencing/proportion_of_reads_in_peaks.pl @@ -394,7 +394,9 @@ sub create_insert_sql { . "analysis_id, " . "feature_set_id, " . "prop_reads_in_peaks, " - . "total_reads" + . "total_reads, " + . "path," + . "bam_file" . ") VALUES (" . 
( join ', ', ( @@ -402,6 +404,8 @@ sub create_insert_sql { $feature_set_id, $proportion_of_reads_in_peaks, $num_reads_in_total, + "'$temp_dir'", + "'$bam_file'" ) ) . ");"; @@ -523,7 +527,8 @@ sub create_table { `feature_set_id` int(10) unsigned NOT NULL, `prop_reads_in_peaks` double default NULL, `total_reads` int(10) default NULL, - `path` varchar(100) NOT NULL, + `path` varchar(512) NOT NULL, + `bam_file` varchar(512) NOT NULL, PRIMARY KEY (`feature_set_qc_prop_reads_in_peaks_id`) ); SQL