From 61c9b6e842cc6cc9d3ffc55d7af0356761b9ed63 Mon Sep 17 00:00:00 2001 From: Murray Wham Date: Fri, 22 Mar 2019 12:19:23 +0000 Subject: [PATCH 1/6] EdinburghGenomics/Reporting-App#205: Renaming GenderValidation -> SexCheck, removing duplicate aliasing, schema change. --- README.md | 4 ++-- analysis_driver/pipelines/bcbio.py | 4 ++-- analysis_driver/quality_control/__init__.py | 2 +- .../quality_control/relatedness.py | 12 ++--------- .../{gender_validation.py => sex_check.py} | 21 +++++++++++++------ analysis_driver/report_generation/crawler.py | 12 ++--------- .../report_generation/sample_crawler.py | 14 ++++++------- bin/run_qc.py | 10 ++++----- etc/output_files.yaml | 2 +- integration_tests/integration_test.py | 4 ++-- .../expected_sample_crawler_data.json | 8 ++++--- tests/test_crawlers.py | 2 +- .../test_quality_control/test_relatedness.py | 13 +++++------- ...gender_validation.py => test_sex_check.py} | 10 ++++----- 14 files changed, 55 insertions(+), 63 deletions(-) rename analysis_driver/quality_control/{gender_validation.py => sex_check.py} (61%) rename tests/test_quality_control/{test_gender_validation.py => test_sex_check.py} (52%) diff --git a/README.md b/README.md index 32923c40..23cedc44 100644 --- a/README.md +++ b/README.md @@ -32,8 +32,8 @@ Classes that run checks on output files generated from the main pipeline. - GenotypeValidation - Uses bwa, samtools and gatk to validate called snps against a test dataset. Compares a sample's genotype with an expected, queries the LIMS for an expected genotype vcf, and writes a file containing the results of comparing the observed and expected vcfs -- GenderValidation - Quantifies X-chromosome heterozygosity in BCBio's output haplotype vcf. Produces a file - containing the called gender, to be compared against the gender supplied in the Lims +- SexCheck - Quantifies X-chromosome heterozygosity in BCBio's output haplotype vcf. Produces a file + containing the called sex, to be compared against that supplied in the Lims - FastqScreen - Checks fastqs for sample contamination using [fastqscreen](http://www.bioinformatics.babraham.ac.uk/projects/download.html#fastqscreen) - Blast - Checks a fastq file for contamination using NCBI Blast - VerifyBamID - Checks a Bam file for species contamination using [VerifyBamID](http://genome.sph.umich.edu/wiki/VerifyBamID) diff --git a/analysis_driver/pipelines/bcbio.py b/analysis_driver/pipelines/bcbio.py index f5398cea..f7d14fd0 100755 --- a/analysis_driver/pipelines/bcbio.py +++ b/analysis_driver/pipelines/bcbio.py @@ -130,11 +130,11 @@ def stage(cls, **params): bcbio_and_qc = [fix_unmapped, fastqc, contam_check, blast, geno_val] - gender_val = stage(qc.GenderValidation, vcf_file=bcbio.vcf_path, previous_stages=bcbio_and_qc), + sex_check = stage(qc.SexCheck, vcf_file=bcbio.vcf_path, previous_stages=bcbio_and_qc), vcfstats = stage(qc.VCFStats, vcf_file=bcbio.vcf_path, previous_stages=bcbio_and_qc), verify_bam_id = stage(qc.VerifyBamID, bam_file=bcbio.bam_path_fixed, previous_stages=bcbio_and_qc), samtools_depth = stage(qc.SamtoolsDepth, bam_file=bcbio.bam_path_fixed, previous_stages=bcbio_and_qc) - post_bcbio_qc = [gender_val, vcfstats, verify_bam_id, samtools_depth] + post_bcbio_qc = [sex_check, vcfstats, verify_bam_id, samtools_depth] output = stage(common.SampleDataOutput, previous_stages=post_bcbio_qc, output_fileset='bcbio') cleanup = stage(common.Cleanup, previous_stages=[output]) diff --git a/analysis_driver/quality_control/__init__.py b/analysis_driver/quality_control/__init__.py index ebae42ea..d1424789 100644 --- a/analysis_driver/quality_control/__init__.py +++ b/analysis_driver/quality_control/__init__.py @@ -1,6 +1,6 @@ from .genotype_validation import GenotypeValidation from .contamination_checks import FastqScreen, VerifyBamID, VCFStats, Blast -from .gender_validation import GenderValidation +from .sex_check import SexCheck from .median_coverage import SamtoolsDepth from .relatedness import Relatedness, Peddy, GenotypeGVCFs, ParseRelatedness from .bcl_validation import BCLValidator diff --git a/analysis_driver/quality_control/relatedness.py b/analysis_driver/quality_control/relatedness.py index d9c6d267..5a75f820 100644 --- a/analysis_driver/quality_control/relatedness.py +++ b/analysis_driver/quality_control/relatedness.py @@ -7,22 +7,14 @@ from analysis_driver.util.bash_commands import java_command from analysis_driver.segmentation import Stage, Parameter, ListParameter from analysis_driver.exceptions import PipelineError +from analysis_driver.quality_control.sex_check import sex_alias class RelatednessStage(Stage): - _gender_aliases = {'female': ['f', 'female', 'girl', 'woman'], 'male': ['m', 'male', 'boy', 'man']} - @property def gatk_outfile(self): return os.path.join(self.job_dir, self.dataset.name + '_genotype_gvcfs.vcf') - @classmethod - def gender_alias(cls, gender): - for key in cls._gender_aliases: - if str(gender).lower() in cls._gender_aliases[key]: - return key - return 'unknown' - @staticmethod def family_id(sample_id): return clarity.get_sample(sample_id).udf.get('Family ID') or 'No_ID' @@ -245,7 +237,7 @@ def get_member_details(self, family, all_families): family_lines = [] family_info = self.relationships(all_families[family]) for member in all_families[family]: - sex = self.gender_alias(clarity.get_sample(member).udf.get('Sex')) + sex = sex_alias(clarity.get_sample_sex(member)) sex_codes = {'male': '1', 'female': '2', 'unknown': '0'} relationship = self.relationship(member) member_id = clarity.get_user_sample_name(member) diff --git a/analysis_driver/quality_control/gender_validation.py b/analysis_driver/quality_control/sex_check.py similarity index 61% rename from analysis_driver/quality_control/gender_validation.py rename to analysis_driver/quality_control/sex_check.py index 09e02ee8..b605ab78 100644 --- a/analysis_driver/quality_control/gender_validation.py +++ b/analysis_driver/quality_control/sex_check.py @@ -2,12 +2,21 @@ from egcg_core import executor, util from analysis_driver.segmentation import Parameter, Stage +_sex_aliases = {'female': ['f', 'female', 'girl', 'woman'], 'male': ['m', 'male', 'boy', 'man']} -class GenderValidation(Stage): + +def sex_alias(sex): + for key in _sex_aliases: + if str(sex).lower() in _sex_aliases[key]: + return key + return 'unknown' + + +class SexCheck(Stage): vcf_file = Parameter() def _run(self): - """Detect gender of the sample based on the %het on the X chromosome.""" + """Detect sex of the sample based on the %het on the X chromosome.""" name, ext = os.path.splitext(util.find_file(self.vcf_file)) if ext == '.gz': file_opener = 'zcat' @@ -15,21 +24,21 @@ def _run(self): else: file_opener = 'cat' - gender_call_file = name + '.sex' + sex_check_file = name + '.sex' command = util.str_join( '%s %s' % (file_opener, self.vcf_file), "grep -P '^chrX|^X'", "awk '{split($10,a,\":\"); count[a[1]]++; total++} END{for (g in count){print g\" \"count[g]/total}}'", "grep '0/1'", - "awk '{if ($2>.35){gender=\"FEMALE\"}else{if ($2<.15){gender=\"MALE\"}else{gender=\"UNKNOWN\"}} print gender, $2}'", + "awk '{if ($2>.35){sex=\"FEMALE\"}else{if ($2<.15){sex=\"MALE\"}else{sex=\"UNKNOWN\"}} print sex, $2}'", separator=' | ' - ) + ' > ' + gender_call_file + ) + ' > ' + sex_check_file self.info(command) return executor.execute( command, - job_name='sex_detection', + job_name='sex_check', working_dir=self.job_dir, walltime=6, cpus=1, diff --git a/analysis_driver/report_generation/crawler.py b/analysis_driver/report_generation/crawler.py index 9c757d45..ef130a7a 100644 --- a/analysis_driver/report_generation/crawler.py +++ b/analysis_driver/report_generation/crawler.py @@ -1,25 +1,17 @@ from egcg_core import clarity from egcg_core import constants as c from egcg_core.app_logging import AppLogger +from analysis_driver.quality_control.sex_check import sex_alias class Crawler(AppLogger): - _gender_aliases = {'female': ['f', 'female', 'girl', 'woman'], 'male': ['m', 'male', 'boy', 'man']} - - @classmethod - def gender_alias(cls, gender): - for key in cls._gender_aliases: - if str(gender).lower() in cls._gender_aliases[key]: - return key - return 'unknown' - @classmethod def get_sample_information_from_lims(cls, sample_name): lims_sample = clarity.get_sample(sample_name) sample_info = { c.ELEMENT_SAMPLE_EXTERNAL_ID: clarity.get_user_sample_name(sample_name, lenient=True), c.ELEMENT_SAMPLE_PLATE: clarity.get_plate_id_and_well(sample_name)[0], # returns [plate_id, well] - c.ELEMENT_PROVIDED_GENDER: cls.gender_alias(clarity.get_sample_gender(sample_name)), + c.ELEMENT_SEX_VALIDATION: {c.ELEMENT_PROVIDED_SEX: sex_alias(clarity.get_sample_sex(sample_name))}, c.ELEMENT_SAMPLE_SPECIES: clarity.get_species_from_sample(sample_name) } if 'Yield for Quoted Coverage (Gb)' in lims_sample.udf: diff --git a/analysis_driver/report_generation/sample_crawler.py b/analysis_driver/report_generation/sample_crawler.py index fe8d7d26..49115f00 100755 --- a/analysis_driver/report_generation/sample_crawler.py +++ b/analysis_driver/report_generation/sample_crawler.py @@ -3,7 +3,7 @@ from egcg_core.rest_communication import post_or_patch as pp from analysis_driver.reader import demultiplexing_parsers as dm, mapping_stats_parsers as mp from analysis_driver.config import output_file_config -from .crawler import Crawler +from .crawler import Crawler, sex_alias class SampleCrawler(Crawler): @@ -58,12 +58,12 @@ def _populate_lib_info(self): else: self.critical('Missing *-sort-callable.bed') - sex_file_path = self.get_output_file('gender_call') + sex_file_path = self.get_output_file('sex_check') if sex_file_path: with open(sex_file_path) as f: - gender, het_x = f.read().strip().split() - sample[ELEMENT_CALLED_GENDER] = self.gender_alias(gender) - sample[ELEMENT_GENDER_VALIDATION] = {ELEMENT_GENDER_HETX: het_x} + sex, het_x = f.read().strip().split() + sample[ELEMENT_SEX_VALIDATION][ELEMENT_CALLED_SEX] = sex_alias(sex) + sample[ELEMENT_SEX_VALIDATION][ELEMENT_SEX_HETX] = het_x genotype_validation_path = self.get_output_file('genoval') if genotype_validation_path: @@ -106,10 +106,10 @@ def _populate_lib_info(self): } sample[ELEMENT_COVERAGE_STATISTICS] = coverage_statistics sample[ELEMENT_MEDIAN_COVERAGE] = median - if ELEMENT_GENDER_VALIDATION in sample: + if ELEMENT_SEX_VALIDATION in sample: cov_y = dm.get_coverage_y_chrom(coverage_statistics_path) if cov_y: - sample[ELEMENT_GENDER_VALIDATION][ELEMENT_GENDER_COVY] = cov_y + sample[ELEMENT_SEX_VALIDATION][ELEMENT_SEX_COVY] = cov_y else: self.critical('coverage statistics unavailable for %s', self.sample_id) diff --git a/bin/run_qc.py b/bin/run_qc.py index bfecc795..8ffa6288 100644 --- a/bin/run_qc.py +++ b/bin/run_qc.py @@ -44,9 +44,9 @@ def _parse_args(): sample_contamination.add_argument('--bam_file', required=True) sample_contamination.set_defaults(func=run_sample_contamination_check) - gender_val = subparsers.add_parser('gender_validation') - gender_val.add_argument('-v', '--vcf_file', dest='vcf_file', type=str, help='vcf file used to detect gender') - gender_val.set_defaults(func=run_gender_validation) + sex_check = subparsers.add_parser('sex_check') + sex_check.add_argument('-v', '--vcf_file', dest='vcf_file', type=str, help='vcf file used to detect sex') + sex_check.set_defaults(func=run_sex_check) median_cov = subparsers.add_parser('median_coverage') median_cov.add_argument('--bam_file', required=True, help='the fastq file pairs') @@ -135,9 +135,9 @@ def run_sample_contamination_check(dataset, args): v.run() -def run_gender_validation(dataset, args): +def run_sex_check(dataset, args): os.makedirs(os.path.join(cfg['jobs_dir'], dataset.name), exist_ok=True) - g = qc.GenderValidation(dataset=dataset, vcf_file=args.vcf_file) + g = qc.SexCheck(dataset=dataset, vcf_file=args.vcf_file) g.run() diff --git a/etc/output_files.yaml b/etc/output_files.yaml index 3506c73c..ebae0816 100644 --- a/etc/output_files.yaml +++ b/etc/output_files.yaml @@ -101,7 +101,7 @@ bcbio: basename: '{sample_id}-chr22-vbi.selfSM' new_name: '{user_sample_id}-chr22-vbi.selfSM' - gender_call: + sex_check: location: ['samples_{sample_id}-merged', 'final', '*_{user_sample_id}'] basename: '{user_sample_id}-joint-gatk-haplotype-joint.sex' new_name: '{user_sample_id}.sex' diff --git a/integration_tests/integration_test.py b/integration_tests/integration_test.py index 93fbb5e9..7d0e803c 100644 --- a/integration_tests/integration_test.py +++ b/integration_tests/integration_test.py @@ -15,7 +15,7 @@ class IntegrationTest(ReportingAppIntegrationTest): patch('egcg_core.clarity.find_project_name_from_sample', return_value='10015AT'), patch('egcg_core.clarity.get_plate_id_and_well', new=mocked_data.fake_get_plate_id_and_well), patch('egcg_core.clarity.get_project', return_value=mocked_data.mocked_clarity_project), - patch('egcg_core.clarity.get_sample_gender'), + patch('egcg_core.clarity.get_sample_sex'), patch('egcg_core.clarity.get_sample_genotype', return_value=set()), patch('egcg_core.clarity.get_sample_names_from_project', return_value=set()), patch('egcg_core.clarity.get_samples_arrived_with', return_value=set()), @@ -289,7 +289,7 @@ def test_bcbio(self): ) self.expect_stage_data(['mergefastqs', 'fastqc', 'genotypevalidation', 'bcbio', 'fastqscreen', - 'fixunmapped', 'blast', 'gendervalidation', 'vcfstats', 'samtoolsdepth', + 'fixunmapped', 'blast', 'sexcheck', 'vcfstats', 'samtoolsdepth', 'verifybamid', 'sampledataoutput', 'md5sum', 'cleanup', 'samplereview']) self.expect_equal( diff --git a/tests/assets/test_crawlers/expected_sample_crawler_data.json b/tests/assets/test_crawlers/expected_sample_crawler_data.json index 7d19e10a..8b7448aa 100644 --- a/tests/assets/test_crawlers/expected_sample_crawler_data.json +++ b/tests/assets/test_crawlers/expected_sample_crawler_data.json @@ -8,8 +8,6 @@ "project_id": "test_project", "bam_file_reads": 7928618, "mapped_reads": 7892452, - "called_gender": "male", - "provided_gender": "female", "species_name": "Homo sapiens", "species_contamination": { "contaminant_unique_mapped": { @@ -25,7 +23,11 @@ "total_reads_mapped": 100000 }, "sample_contamination": {"het_hom_ratio": 1.6, "ti_tv_ratio": 2.01}, - "gender_validation": {"hetX": "0.10"}, + "sex_validation": { + "called": "male", + "provided": "female", + "hetX": "0.10" + }, "coverage": { "median": 478, "std_dev": 189.1911391390011, diff --git a/tests/test_crawlers.py b/tests/test_crawlers.py index 4af04728..f61f04e4 100644 --- a/tests/test_crawlers.py +++ b/tests/test_crawlers.py @@ -119,7 +119,7 @@ def setUp(self): self.expected_output = json.load(open(os.path.join(self.test_data, 'expected_sample_crawler_data.json'))) patched_sample_info = patch( ppath + 'SampleCrawler.get_sample_information_from_lims', - return_value={'user_sample_id': 'test_sample', 'provided_gender': 'female', 'species_name': 'Homo sapiens'} + return_value={'user_sample_id': 'test_sample', 'sex_validation': {'provided': 'female'}, 'species_name': 'Homo sapiens'} ) patched_user_sample_id = patch(ppath + 'sample_crawler.clarity.get_user_sample_name', return_value='test_sample') with patched_sample_info, patched_user_sample_id: diff --git a/tests/test_quality_control/test_relatedness.py b/tests/test_quality_control/test_relatedness.py index a38eb2ab..96884a08 100644 --- a/tests/test_quality_control/test_relatedness.py +++ b/tests/test_quality_control/test_relatedness.py @@ -1,5 +1,5 @@ import os -from unittest.mock import patch, Mock +from unittest.mock import patch from tests.test_quality_control.qc_tester import QCTester from analysis_driver.quality_control.relatedness import Relatedness, GenotypeGVCFs, Peddy, ParseRelatedness from analysis_driver.exceptions import PipelineError @@ -140,15 +140,13 @@ def test_ped_file_content(self, pfams, pmem, pfam): pfam.side_effect = ['FAM1', 'FAM1', 'No_ID'] pmem.side_effect = [[['FAM1', '0', '0', '2', '0'], ['FAM1', '0', 'test_sample1', '1', '0']], [['No_ID', '0', '0', '1', '0']]] pfams.return_value = {'FAM1': ['test_sample1', 'test_sample2'], 'No_ID': ['test_sample3']} - assert self.p.ped_file_content == [['FAM1', '0', '0', '2', '0'], ['FAM1', '0', 'test_sample1', '1', '0'], ['No_ID', '0', '0', '1', '0']] + assert self.p.ped_fi1le_content == [['FAM1', '0', '0', '2', '0'], ['FAM1', '0', 'test_sample1', '1', '0'], ['No_ID', '0', '0', '1', '0']] @patch(ppath + 'Peddy.relationships') @patch(ppath + 'Peddy.relationship') - @patch(ppath + 'Peddy.gender_alias') @patch(ppath + 'clarity.get_user_sample_name') - @patch(ppath + 'clarity.get_sample') - def test_get_member_details(self, psample, pname, psex, prel, prels): - psample.return_value = Mock(udf={'Sex': 'F'}) + @patch(ppath + 'clarity.get_sample_sex', side_effect=['F', 'M']) + def test_get_member_details(self, psex, pname, prel, prels): prels.return_value = {'Proband': {'Mother': 'test_sample1', 'Father': '0'}, 'Mother': {'Mother': '0', 'Father': '0'}, 'Father': {'Mother': '0', 'Father': '0'}, @@ -156,7 +154,6 @@ def test_get_member_details(self, psample, pname, psex, prel, prels): 'Brother': {'Mother': 'test_sample1', 'Father': '0'}, 'Other': {'Mother': '0', 'Father': '0'}} prel.side_effect = ['Mother', 'Proband'] - psex.side_effect = ['female', 'male'] pname.side_effect = ['usersample1', 'usersample2', 'usersample1'] all_families = {'FAM1': ['test_sample1', 'test_sample2'], 'FAM2': ['test_sample3']} assert self.p.get_member_details('FAM1', all_families) == [ @@ -171,7 +168,7 @@ def test_get_member_details(self, psample, pname, psex, prel, prels): 'Other': {'Mother': '0', 'Father': '0'}} prel.side_effect = ['Other', 'Other', 'Proband'] - psex.side_effect = ['unknown', 'unknown', 'male'] + psex.side_effect = ['unknown', 'unknown', 'M'] pname.side_effect = ['usersample1', 'usersample2', 'usersample3'] all_families = {'FAM1': ['test_sample1', 'test_sample2', 'test_sample3']} assert self.p.get_member_details('FAM1', all_families) == [['FAM1', 'usersample1', '0', '0', '0', '0'], diff --git a/tests/test_quality_control/test_gender_validation.py b/tests/test_quality_control/test_sex_check.py similarity index 52% rename from tests/test_quality_control/test_gender_validation.py rename to tests/test_quality_control/test_sex_check.py index 31591171..6d5a0b8e 100644 --- a/tests/test_quality_control/test_gender_validation.py +++ b/tests/test_quality_control/test_sex_check.py @@ -1,20 +1,20 @@ from unittest.mock import patch -from analysis_driver.quality_control.gender_validation import GenderValidation +from analysis_driver.quality_control.sex_check import SexCheck from tests.test_quality_control.qc_tester import QCTester -class TestGenderValidation(QCTester): +class TestSexCheck(QCTester): @patch('egcg_core.executor.execute') def test_run(self, mocked_execute): - validator = GenderValidation(dataset=self.dataset, vcf_file='path/to/test/vcf') + validator = SexCheck(dataset=self.dataset, vcf_file='path/to/test/vcf') - with patch('analysis_driver.quality_control.gender_validation.util.find_file', new=self.fake_find_file): + with patch('analysis_driver.quality_control.sex_check.util.find_file', new=self.fake_find_file): validator._run() command = mocked_execute.call_args[0][0] assert command.startswith('cat') assert len(command.split(' | ')) == 5 - validator = GenderValidation(dataset=self.dataset, vcf_file='path/to/test/vcf.gz') + validator = SexCheck(dataset=self.dataset, vcf_file='path/to/test/vcf.gz') validator._run() assert mocked_execute.call_args[0][0].startswith('zcat') From e54f6e9caacae6204954f8c350289c9ab2fe9656 Mon Sep 17 00:00:00 2001 From: Murray Wham Date: Thu, 23 May 2019 15:51:10 +0100 Subject: [PATCH 2/6] SexCheck -> SexValidation --- README.md | 2 +- analysis_driver/pipelines/bcbio.py | 4 ++-- analysis_driver/quality_control/__init__.py | 2 +- analysis_driver/quality_control/relatedness.py | 2 +- .../{sex_check.py => sex_validation.py} | 8 ++++---- analysis_driver/report_generation/crawler.py | 2 +- analysis_driver/report_generation/sample_crawler.py | 2 +- bin/run_qc.py | 10 +++++----- etc/output_files.yaml | 2 +- tests/test_quality_control/test_relatedness.py | 2 +- tests/test_quality_control/test_sex_check.py | 10 +++++----- 11 files changed, 23 insertions(+), 23 deletions(-) rename analysis_driver/quality_control/{sex_check.py => sex_validation.py} (91%) diff --git a/README.md b/README.md index 23cedc44..a1c84d20 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ Classes that run checks on output files generated from the main pipeline. - GenotypeValidation - Uses bwa, samtools and gatk to validate called snps against a test dataset. Compares a sample's genotype with an expected, queries the LIMS for an expected genotype vcf, and writes a file containing the results of comparing the observed and expected vcfs -- SexCheck - Quantifies X-chromosome heterozygosity in BCBio's output haplotype vcf. Produces a file +- SexValidation - Quantifies X-chromosome heterozygosity in BCBio's output haplotype vcf. Produces a file containing the called sex, to be compared against that supplied in the Lims - FastqScreen - Checks fastqs for sample contamination using [fastqscreen](http://www.bioinformatics.babraham.ac.uk/projects/download.html#fastqscreen) - Blast - Checks a fastq file for contamination using NCBI Blast diff --git a/analysis_driver/pipelines/bcbio.py b/analysis_driver/pipelines/bcbio.py index f7d14fd0..2743a10f 100755 --- a/analysis_driver/pipelines/bcbio.py +++ b/analysis_driver/pipelines/bcbio.py @@ -130,11 +130,11 @@ def stage(cls, **params): bcbio_and_qc = [fix_unmapped, fastqc, contam_check, blast, geno_val] - sex_check = stage(qc.SexCheck, vcf_file=bcbio.vcf_path, previous_stages=bcbio_and_qc), + sex_val = stage(qc.SexValidation, vcf_file=bcbio.vcf_path, previous_stages=bcbio_and_qc), vcfstats = stage(qc.VCFStats, vcf_file=bcbio.vcf_path, previous_stages=bcbio_and_qc), verify_bam_id = stage(qc.VerifyBamID, bam_file=bcbio.bam_path_fixed, previous_stages=bcbio_and_qc), samtools_depth = stage(qc.SamtoolsDepth, bam_file=bcbio.bam_path_fixed, previous_stages=bcbio_and_qc) - post_bcbio_qc = [sex_check, vcfstats, verify_bam_id, samtools_depth] + post_bcbio_qc = [sex_val, vcfstats, verify_bam_id, samtools_depth] output = stage(common.SampleDataOutput, previous_stages=post_bcbio_qc, output_fileset='bcbio') cleanup = stage(common.Cleanup, previous_stages=[output]) diff --git a/analysis_driver/quality_control/__init__.py b/analysis_driver/quality_control/__init__.py index d1424789..90d83f6d 100644 --- a/analysis_driver/quality_control/__init__.py +++ b/analysis_driver/quality_control/__init__.py @@ -1,6 +1,6 @@ from .genotype_validation import GenotypeValidation from .contamination_checks import FastqScreen, VerifyBamID, VCFStats, Blast -from .sex_check import SexCheck +from .sex_validation import SexValidation from .median_coverage import SamtoolsDepth from .relatedness import Relatedness, Peddy, GenotypeGVCFs, ParseRelatedness from .bcl_validation import BCLValidator diff --git a/analysis_driver/quality_control/relatedness.py b/analysis_driver/quality_control/relatedness.py index 5a75f820..cfbc6beb 100644 --- a/analysis_driver/quality_control/relatedness.py +++ b/analysis_driver/quality_control/relatedness.py @@ -7,7 +7,7 @@ from analysis_driver.util.bash_commands import java_command from analysis_driver.segmentation import Stage, Parameter, ListParameter from analysis_driver.exceptions import PipelineError -from analysis_driver.quality_control.sex_check import sex_alias +from analysis_driver.quality_control.sex_validation import sex_alias class RelatednessStage(Stage): diff --git a/analysis_driver/quality_control/sex_check.py b/analysis_driver/quality_control/sex_validation.py similarity index 91% rename from analysis_driver/quality_control/sex_check.py rename to analysis_driver/quality_control/sex_validation.py index b605ab78..af33a139 100644 --- a/analysis_driver/quality_control/sex_check.py +++ b/analysis_driver/quality_control/sex_validation.py @@ -12,7 +12,7 @@ def sex_alias(sex): return 'unknown' -class SexCheck(Stage): +class SexValidation(Stage): vcf_file = Parameter() def _run(self): @@ -24,7 +24,7 @@ def _run(self): else: file_opener = 'cat' - sex_check_file = name + '.sex' + sex_file = name + '.sex' command = util.str_join( '%s %s' % (file_opener, self.vcf_file), @@ -33,12 +33,12 @@ def _run(self): "grep '0/1'", "awk '{if ($2>.35){sex=\"FEMALE\"}else{if ($2<.15){sex=\"MALE\"}else{sex=\"UNKNOWN\"}} print sex, $2}'", separator=' | ' - ) + ' > ' + sex_check_file + ) + ' > ' + sex_file self.info(command) return executor.execute( command, - job_name='sex_check', + job_name='sex_validation', working_dir=self.job_dir, walltime=6, cpus=1, diff --git a/analysis_driver/report_generation/crawler.py b/analysis_driver/report_generation/crawler.py index ef130a7a..1195d36f 100644 --- a/analysis_driver/report_generation/crawler.py +++ b/analysis_driver/report_generation/crawler.py @@ -1,7 +1,7 @@ from egcg_core import clarity from egcg_core import constants as c from egcg_core.app_logging import AppLogger -from analysis_driver.quality_control.sex_check import sex_alias +from analysis_driver.quality_control.sex_validation import sex_alias class Crawler(AppLogger): diff --git a/analysis_driver/report_generation/sample_crawler.py b/analysis_driver/report_generation/sample_crawler.py index 49115f00..3c695e79 100755 --- a/analysis_driver/report_generation/sample_crawler.py +++ b/analysis_driver/report_generation/sample_crawler.py @@ -58,7 +58,7 @@ def _populate_lib_info(self): else: self.critical('Missing *-sort-callable.bed') - sex_file_path = self.get_output_file('sex_check') + sex_file_path = self.get_output_file('sex_validation') if sex_file_path: with open(sex_file_path) as f: sex, het_x = f.read().strip().split() diff --git a/bin/run_qc.py b/bin/run_qc.py index 8ffa6288..ce7b3fc8 100644 --- a/bin/run_qc.py +++ b/bin/run_qc.py @@ -44,9 +44,9 @@ def _parse_args(): sample_contamination.add_argument('--bam_file', required=True) sample_contamination.set_defaults(func=run_sample_contamination_check) - sex_check = subparsers.add_parser('sex_check') - sex_check.add_argument('-v', '--vcf_file', dest='vcf_file', type=str, help='vcf file used to detect sex') - sex_check.set_defaults(func=run_sex_check) + sex_validation = subparsers.add_parser('sex_validation') + sex_validation.add_argument('-v', '--vcf_file', dest='vcf_file', type=str, help='vcf file used to detect sex') + sex_validation.set_defaults(func=run_sex_validation) median_cov = subparsers.add_parser('median_coverage') median_cov.add_argument('--bam_file', required=True, help='the fastq file pairs') @@ -135,9 +135,9 @@ def run_sample_contamination_check(dataset, args): v.run() -def run_sex_check(dataset, args): +def run_sex_validation(dataset, args): os.makedirs(os.path.join(cfg['jobs_dir'], dataset.name), exist_ok=True) - g = qc.SexCheck(dataset=dataset, vcf_file=args.vcf_file) + g = qc.SexValidation(dataset=dataset, vcf_file=args.vcf_file) g.run() diff --git a/etc/output_files.yaml b/etc/output_files.yaml index ebae0816..01020562 100644 --- a/etc/output_files.yaml +++ b/etc/output_files.yaml @@ -101,7 +101,7 @@ bcbio: basename: '{sample_id}-chr22-vbi.selfSM' new_name: '{user_sample_id}-chr22-vbi.selfSM' - sex_check: + sex_validation: location: ['samples_{sample_id}-merged', 'final', '*_{user_sample_id}'] basename: '{user_sample_id}-joint-gatk-haplotype-joint.sex' new_name: '{user_sample_id}.sex' diff --git a/tests/test_quality_control/test_relatedness.py b/tests/test_quality_control/test_relatedness.py index 96884a08..274644ba 100644 --- a/tests/test_quality_control/test_relatedness.py +++ b/tests/test_quality_control/test_relatedness.py @@ -140,7 +140,7 @@ def test_ped_file_content(self, pfams, pmem, pfam): pfam.side_effect = ['FAM1', 'FAM1', 'No_ID'] pmem.side_effect = [[['FAM1', '0', '0', '2', '0'], ['FAM1', '0', 'test_sample1', '1', '0']], [['No_ID', '0', '0', '1', '0']]] pfams.return_value = {'FAM1': ['test_sample1', 'test_sample2'], 'No_ID': ['test_sample3']} - assert self.p.ped_fi1le_content == [['FAM1', '0', '0', '2', '0'], ['FAM1', '0', 'test_sample1', '1', '0'], ['No_ID', '0', '0', '1', '0']] + assert self.p.ped_file_content == [['FAM1', '0', '0', '2', '0'], ['FAM1', '0', 'test_sample1', '1', '0'], ['No_ID', '0', '0', '1', '0']] @patch(ppath + 'Peddy.relationships') @patch(ppath + 'Peddy.relationship') diff --git a/tests/test_quality_control/test_sex_check.py b/tests/test_quality_control/test_sex_check.py index 6d5a0b8e..061dfb98 100644 --- a/tests/test_quality_control/test_sex_check.py +++ b/tests/test_quality_control/test_sex_check.py @@ -1,20 +1,20 @@ from unittest.mock import patch -from analysis_driver.quality_control.sex_check import SexCheck +from analysis_driver.quality_control.sex_validation import SexValidation from tests.test_quality_control.qc_tester import QCTester -class TestSexCheck(QCTester): +class TestSexValdiation(QCTester): @patch('egcg_core.executor.execute') def test_run(self, mocked_execute): - validator = SexCheck(dataset=self.dataset, vcf_file='path/to/test/vcf') + validator = SexValidation(dataset=self.dataset, vcf_file='path/to/test/vcf') - with patch('analysis_driver.quality_control.sex_check.util.find_file', new=self.fake_find_file): + with patch('egcg_core.util.find_file', new=self.fake_find_file): validator._run() command = mocked_execute.call_args[0][0] assert command.startswith('cat') assert len(command.split(' | ')) == 5 - validator = SexCheck(dataset=self.dataset, vcf_file='path/to/test/vcf.gz') + validator = SexValidation(dataset=self.dataset, vcf_file='path/to/test/vcf.gz') validator._run() assert mocked_execute.call_args[0][0].startswith('zcat') From c1815ebd17956b703d958e55962919d5dde20af5 Mon Sep 17 00:00:00 2001 From: Murray Wham Date: Mon, 10 Jun 2019 15:26:16 +0100 Subject: [PATCH 3/6] Updating EGCG-Core --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 5272828d..7c43b501 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -EGCG-Core==0.9.1 +EGCG-Core==0.11 luigi==2.8.0 ete3==3.0.0b35 pandas<0.21 From ef66e3dbd6b19f29f3540eae4b1260b89ee86e5d Mon Sep 17 00:00:00 2001 From: Murray Wham Date: Wed, 19 Jun 2019 16:44:28 +0100 Subject: [PATCH 4/6] Fixing integration tests --- integration_tests/integration_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integration_tests/integration_test.py b/integration_tests/integration_test.py index 739277c7..dd9f07fe 100644 --- a/integration_tests/integration_test.py +++ b/integration_tests/integration_test.py @@ -334,7 +334,7 @@ def test_bcbio(self): ) self.expect_stage_data(['mergefastqs', 'fastqc', 'genotypevalidation', 'bcbio', 'fastqscreen', - 'fixunmapped', 'blast', 'sexcheck', 'vcfstats', 'samtoolsdepth', + 'fixunmapped', 'blast', 'sexvalidation', 'vcfstats', 'samtoolsdepth', 'verifybamid', 'sampledataoutput', 'md5sum', 'cleanup', 'samplereview']) ad_proc = rest_communication.get_document('analysis_driver_procs') @@ -682,7 +682,7 @@ def test_gatk4_var_calling_human(self): self.expect_stage_data([ 'gathervcfvc', 'mergebamanddup', 'splitgenotypegvcfs', 'selectsnps', 'mergefastqs', 'cleanup', 'splithaplotypecallervc', 'variantannotation', 'genotypevalidation', 'gatherbqsrreport', - 'selectindels', 'verifybamid', 'gathergvcf', 'gendervalidation', 'fastqscreen', 'sampledataoutput', + 'selectindels', 'verifybamid', 'gathergvcf', 'sexvalidation', 'fastqscreen', 'sampledataoutput', 'gatherrecalbam', 'indelsfiltration', 'samtoolsdepth', 'scatterapplybqsr', 'samplereview', 'fastqindex', 'scatterbaserecalibrator', 'merge_variants_hard_filter', 'blast', 'splitbwa', 'vcfstats', 'md5sum', 'samtoolsstats', 'snpsfiltration' From 72a297ad54e8b474e88931d2fb8fc19c30f4b7b7 Mon Sep 17 00:00:00 2001 From: tcezard Date: Tue, 25 Jun 2019 11:05:16 +0100 Subject: [PATCH 5/6] Add support for extra space in the IDT barcode --- analysis_driver/dataset.py | 4 ++-- integration_tests/mocked_data.py | 2 +- tests/test_dataset.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/analysis_driver/dataset.py b/analysis_driver/dataset.py index 77270874..bc5304cb 100644 --- a/analysis_driver/dataset.py +++ b/analysis_driver/dataset.py @@ -402,8 +402,8 @@ def _run_elements_from_lims(self): for pattern in ( # TruSeq label, e.g, A412-A208 (ATGCATGC-CTGACTGA) '(\w{4})-(\w{4}) \(([ATCG]{8})-([ATCG]{8})\)', - # IDT label, e.g, 001A IDT-ILMN TruSeq DNA-RNA UD 96 Indexes Plate_UDI0001 (ATGCATGC-CTGACTGA) - '(\w{4}) IDT-ILMN TruSeq DNA-RNA UD 96 Indexes (Plate_\w{7}) \(([ATGC]{8})-([ATGC]{8})\)' + # IDT label, e.g, 001A IDT-ILMN TruSeq DNA-RNA UD 96 Indexes Plate_UDI0001 (ATGCATGC-CTGACTGA) + '(\w{4}) IDT-ILMN TruSeq DNA-RNA UD 96 Indexes\s+(Plate_\w{7}) \(([ATGC]{8})-([ATGC]{8})\)' ): match = re.match(pattern, reagent_label) if match: diff --git a/integration_tests/mocked_data.py b/integration_tests/mocked_data.py index 3111a8af..db39cb7e 100644 --- a/integration_tests/mocked_data.py +++ b/integration_tests/mocked_data.py @@ -22,7 +22,7 @@ class MockedSample(NamedMock): samples=[MockedSample(real_name='10015AT0002', id='LP6002014-DTP_A02')]), Mock(reagent_labels=['D703-D502 (CGCTCATT-ATAGAGGC)'], samples=[MockedSample(real_name='10015AT0003', id='LP6002014-DTP_A03')]), - Mock(reagent_labels=['001A IDT-ILMN TruSeq DNA-RNA UD 96 Indexes Plate_UDI0001 (GAGATTCC-ATAGAGGC)'], + Mock(reagent_labels=['001A IDT-ILMN TruSeq DNA-RNA UD 96 Indexes Plate_UDI0001 (GAGATTCC-ATAGAGGC)'], samples=[MockedSample(real_name='10015AT0004', id='LP6002014-DTP_A04')]), Mock(reagent_labels=['D705-D502 (ATTCAGAA-ATAGAGGC)'], samples=[MockedSample(real_name='10015AT0006', id='LP6002014-DTP_A05')]), diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 47b21047..ce5672b4 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -220,7 +220,7 @@ def test_pipeline_instruction(self): mocked_lane_artifact8 = NamedMock(real_name='art8', reagent_labels=['D706-D502 (GAATTCGG-ATAGAGGC)'], samples=[MockedSample(real_name='sample8', udf={})]) mocked_idt_artifact = NamedMock( real_name='idt_art', - reagent_labels=['001A IDT-ILMN TruSeq DNA-RNA UD 96 Indexes Plate_UDI0001 (CCGCGGTT-AGCGCTAG)'], + reagent_labels=['001A IDT-ILMN TruSeq DNA-RNA UD 96 Indexes Plate_UDI0001 (CCGCGGTT-AGCGCTAG)'], samples=[MockedSample(real_name='idt_sample')] ) mocked_lane_artifact_pool = NamedMock(real_name='artpool', reagent_labels=[ From 6920bc10875d87e286961c358fa7210582bd5860 Mon Sep 17 00:00:00 2001 From: Murray Wham Date: Mon, 1 Jul 2019 11:03:07 +0100 Subject: [PATCH 6/6] Fixing human gatk4 var calling --- analysis_driver/pipelines/human_variant_calling_gatk4.py | 4 ++-- etc/output_files.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/analysis_driver/pipelines/human_variant_calling_gatk4.py b/analysis_driver/pipelines/human_variant_calling_gatk4.py index 0ecda1bd..edc4c2ad 100755 --- a/analysis_driver/pipelines/human_variant_calling_gatk4.py +++ b/analysis_driver/pipelines/human_variant_calling_gatk4.py @@ -198,11 +198,11 @@ def stage(cls, **params): output_vcf_file=hard_filter_indels.hard_filtered_vcf, previous_stages=[hard_filter_snps, hard_filter_indels]) - gender_val = stage(qc.GenderValidation, vcf_file=merge_hf.hard_filtered_vcf, previous_stages=[merge_hf]) + sex_val = stage(qc.SexValidation, vcf_file=merge_hf.hard_filtered_vcf, previous_stages=[merge_hf]) vcfstats = stage(qc.VCFStats, vcf_file=merge_hf.hard_filtered_vcf, previous_stages=[merge_hf]) - final_stages = [contam, blast, geno_val, gender_val, vcfstats, verify_bam_id, samtools_depth, samtools_stat, + final_stages = [contam, blast, geno_val, sex_val, vcfstats, verify_bam_id, samtools_depth, samtools_stat, gather_gcvf] output = stage(common.SampleDataOutput, previous_stages=final_stages, output_fileset='gatk4_human_var_calling') diff --git a/etc/output_files.yaml b/etc/output_files.yaml index 5b4db0e8..b1a591ff 100644 --- a/etc/output_files.yaml +++ b/etc/output_files.yaml @@ -291,7 +291,7 @@ gatk4_human_var_calling: basename: '{sample_id}-chr22-vbi.selfSM' new_name: '{user_sample_id}-chr22-vbi.selfSM' - gender_call: + sex_validation: location: ['gatk4'] basename: '{user_sample_id}_hard_filter.sex' new_name: '{user_sample_id}.sex'