diff --git a/README.md b/README.md index 32923c40..a1c84d20 100644 --- a/README.md +++ b/README.md @@ -32,8 +32,8 @@ Classes that run checks on output files generated from the main pipeline. - GenotypeValidation - Uses bwa, samtools and gatk to validate called snps against a test dataset. Compares a sample's genotype with an expected, queries the LIMS for an expected genotype vcf, and writes a file containing the results of comparing the observed and expected vcfs -- GenderValidation - Quantifies X-chromosome heterozygosity in BCBio's output haplotype vcf. Produces a file - containing the called gender, to be compared against the gender supplied in the Lims +- SexValidation - Quantifies X-chromosome heterozygosity in BCBio's output haplotype vcf. Produces a file + containing the called sex, to be compared against that supplied in the Lims - FastqScreen - Checks fastqs for sample contamination using [fastqscreen](http://www.bioinformatics.babraham.ac.uk/projects/download.html#fastqscreen) - Blast - Checks a fastq file for contamination using NCBI Blast - VerifyBamID - Checks a Bam file for species contamination using [VerifyBamID](http://genome.sph.umich.edu/wiki/VerifyBamID) diff --git a/analysis_driver/pipelines/bcbio.py b/analysis_driver/pipelines/bcbio.py index f5398cea..2743a10f 100755 --- a/analysis_driver/pipelines/bcbio.py +++ b/analysis_driver/pipelines/bcbio.py @@ -130,11 +130,11 @@ def stage(cls, **params): bcbio_and_qc = [fix_unmapped, fastqc, contam_check, blast, geno_val] - gender_val = stage(qc.GenderValidation, vcf_file=bcbio.vcf_path, previous_stages=bcbio_and_qc), + sex_val = stage(qc.SexValidation, vcf_file=bcbio.vcf_path, previous_stages=bcbio_and_qc), vcfstats = stage(qc.VCFStats, vcf_file=bcbio.vcf_path, previous_stages=bcbio_and_qc), verify_bam_id = stage(qc.VerifyBamID, bam_file=bcbio.bam_path_fixed, previous_stages=bcbio_and_qc), samtools_depth = stage(qc.SamtoolsDepth, bam_file=bcbio.bam_path_fixed, previous_stages=bcbio_and_qc) - post_bcbio_qc = [gender_val, vcfstats, verify_bam_id, samtools_depth] + post_bcbio_qc = [sex_val, vcfstats, verify_bam_id, samtools_depth] output = stage(common.SampleDataOutput, previous_stages=post_bcbio_qc, output_fileset='bcbio') cleanup = stage(common.Cleanup, previous_stages=[output]) diff --git a/analysis_driver/quality_control/__init__.py b/analysis_driver/quality_control/__init__.py index ebae42ea..90d83f6d 100644 --- a/analysis_driver/quality_control/__init__.py +++ b/analysis_driver/quality_control/__init__.py @@ -1,6 +1,6 @@ from .genotype_validation import GenotypeValidation from .contamination_checks import FastqScreen, VerifyBamID, VCFStats, Blast -from .gender_validation import GenderValidation +from .sex_validation import SexValidation from .median_coverage import SamtoolsDepth from .relatedness import Relatedness, Peddy, GenotypeGVCFs, ParseRelatedness from .bcl_validation import BCLValidator diff --git a/analysis_driver/quality_control/relatedness.py b/analysis_driver/quality_control/relatedness.py index d9c6d267..cfbc6beb 100644 --- a/analysis_driver/quality_control/relatedness.py +++ b/analysis_driver/quality_control/relatedness.py @@ -7,22 +7,14 @@ from analysis_driver.util.bash_commands import java_command from analysis_driver.segmentation import Stage, Parameter, ListParameter from analysis_driver.exceptions import PipelineError +from analysis_driver.quality_control.sex_validation import sex_alias class RelatednessStage(Stage): - _gender_aliases = {'female': ['f', 'female', 'girl', 'woman'], 'male': ['m', 'male', 'boy', 'man']} - @property def gatk_outfile(self): return os.path.join(self.job_dir, self.dataset.name + '_genotype_gvcfs.vcf') - @classmethod - def gender_alias(cls, gender): - for key in cls._gender_aliases: - if str(gender).lower() in cls._gender_aliases[key]: - return key - return 'unknown' - @staticmethod def family_id(sample_id): return clarity.get_sample(sample_id).udf.get('Family ID') or 'No_ID' @@ -245,7 +237,7 @@ def get_member_details(self, family, all_families): family_lines = [] family_info = self.relationships(all_families[family]) for member in all_families[family]: - sex = self.gender_alias(clarity.get_sample(member).udf.get('Sex')) + sex = sex_alias(clarity.get_sample_sex(member)) sex_codes = {'male': '1', 'female': '2', 'unknown': '0'} relationship = self.relationship(member) member_id = clarity.get_user_sample_name(member) diff --git a/analysis_driver/quality_control/gender_validation.py b/analysis_driver/quality_control/sex_validation.py similarity index 61% rename from analysis_driver/quality_control/gender_validation.py rename to analysis_driver/quality_control/sex_validation.py index 09e02ee8..af33a139 100644 --- a/analysis_driver/quality_control/gender_validation.py +++ b/analysis_driver/quality_control/sex_validation.py @@ -2,12 +2,21 @@ from egcg_core import executor, util from analysis_driver.segmentation import Parameter, Stage +_sex_aliases = {'female': ['f', 'female', 'girl', 'woman'], 'male': ['m', 'male', 'boy', 'man']} -class GenderValidation(Stage): + +def sex_alias(sex): + for key in _sex_aliases: + if str(sex).lower() in _sex_aliases[key]: + return key + return 'unknown' + + +class SexValidation(Stage): vcf_file = Parameter() def _run(self): - """Detect gender of the sample based on the %het on the X chromosome.""" + """Detect sex of the sample based on the %het on the X chromosome.""" name, ext = os.path.splitext(util.find_file(self.vcf_file)) if ext == '.gz': file_opener = 'zcat' @@ -15,21 +24,21 @@ def _run(self): else: file_opener = 'cat' - gender_call_file = name + '.sex' + sex_file = name + '.sex' command = util.str_join( '%s %s' % (file_opener, self.vcf_file), "grep -P '^chrX|^X'", "awk '{split($10,a,\":\"); count[a[1]]++; total++} END{for (g in count){print g\" \"count[g]/total}}'", "grep '0/1'", - "awk '{if ($2>.35){gender=\"FEMALE\"}else{if ($2<.15){gender=\"MALE\"}else{gender=\"UNKNOWN\"}} print gender, $2}'", + "awk '{if ($2>.35){sex=\"FEMALE\"}else{if ($2<.15){sex=\"MALE\"}else{sex=\"UNKNOWN\"}} print sex, $2}'", separator=' | ' - ) + ' > ' + gender_call_file + ) + ' > ' + sex_file self.info(command) return executor.execute( command, - job_name='sex_detection', + job_name='sex_validation', working_dir=self.job_dir, walltime=6, cpus=1, diff --git a/analysis_driver/report_generation/crawler.py b/analysis_driver/report_generation/crawler.py index 9c757d45..1195d36f 100644 --- a/analysis_driver/report_generation/crawler.py +++ b/analysis_driver/report_generation/crawler.py @@ -1,25 +1,17 @@ from egcg_core import clarity from egcg_core import constants as c from egcg_core.app_logging import AppLogger +from analysis_driver.quality_control.sex_validation import sex_alias class Crawler(AppLogger): - _gender_aliases = {'female': ['f', 'female', 'girl', 'woman'], 'male': ['m', 'male', 'boy', 'man']} - - @classmethod - def gender_alias(cls, gender): - for key in cls._gender_aliases: - if str(gender).lower() in cls._gender_aliases[key]: - return key - return 'unknown' - @classmethod def get_sample_information_from_lims(cls, sample_name): lims_sample = clarity.get_sample(sample_name) sample_info = { c.ELEMENT_SAMPLE_EXTERNAL_ID: clarity.get_user_sample_name(sample_name, lenient=True), c.ELEMENT_SAMPLE_PLATE: clarity.get_plate_id_and_well(sample_name)[0], # returns [plate_id, well] - c.ELEMENT_PROVIDED_GENDER: cls.gender_alias(clarity.get_sample_gender(sample_name)), + c.ELEMENT_SEX_VALIDATION: {c.ELEMENT_PROVIDED_SEX: sex_alias(clarity.get_sample_sex(sample_name))}, c.ELEMENT_SAMPLE_SPECIES: clarity.get_species_from_sample(sample_name) } if 'Yield for Quoted Coverage (Gb)' in lims_sample.udf: diff --git a/analysis_driver/report_generation/sample_crawler.py b/analysis_driver/report_generation/sample_crawler.py index fe8d7d26..3c695e79 100755 --- a/analysis_driver/report_generation/sample_crawler.py +++ b/analysis_driver/report_generation/sample_crawler.py @@ -3,7 +3,7 @@ from egcg_core.rest_communication import post_or_patch as pp from analysis_driver.reader import demultiplexing_parsers as dm, mapping_stats_parsers as mp from analysis_driver.config import output_file_config -from .crawler import Crawler +from .crawler import Crawler, sex_alias class SampleCrawler(Crawler): @@ -58,12 +58,12 @@ def _populate_lib_info(self): else: self.critical('Missing *-sort-callable.bed') - sex_file_path = self.get_output_file('gender_call') + sex_file_path = self.get_output_file('sex_validation') if sex_file_path: with open(sex_file_path) as f: - gender, het_x = f.read().strip().split() - sample[ELEMENT_CALLED_GENDER] = self.gender_alias(gender) - sample[ELEMENT_GENDER_VALIDATION] = {ELEMENT_GENDER_HETX: het_x} + sex, het_x = f.read().strip().split() + sample[ELEMENT_SEX_VALIDATION][ELEMENT_CALLED_SEX] = sex_alias(sex) + sample[ELEMENT_SEX_VALIDATION][ELEMENT_SEX_HETX] = het_x genotype_validation_path = self.get_output_file('genoval') if genotype_validation_path: @@ -106,10 +106,10 @@ def _populate_lib_info(self): } sample[ELEMENT_COVERAGE_STATISTICS] = coverage_statistics sample[ELEMENT_MEDIAN_COVERAGE] = median - if ELEMENT_GENDER_VALIDATION in sample: + if ELEMENT_SEX_VALIDATION in sample: cov_y = dm.get_coverage_y_chrom(coverage_statistics_path) if cov_y: - sample[ELEMENT_GENDER_VALIDATION][ELEMENT_GENDER_COVY] = cov_y + sample[ELEMENT_SEX_VALIDATION][ELEMENT_SEX_COVY] = cov_y else: self.critical('coverage statistics unavailable for %s', self.sample_id) diff --git a/bin/run_qc.py b/bin/run_qc.py index bfecc795..ce7b3fc8 100644 --- a/bin/run_qc.py +++ b/bin/run_qc.py @@ -44,9 +44,9 @@ def _parse_args(): sample_contamination.add_argument('--bam_file', required=True) sample_contamination.set_defaults(func=run_sample_contamination_check) - gender_val = subparsers.add_parser('gender_validation') - gender_val.add_argument('-v', '--vcf_file', dest='vcf_file', type=str, help='vcf file used to detect gender') - gender_val.set_defaults(func=run_gender_validation) + sex_validation = subparsers.add_parser('sex_validation') + sex_validation.add_argument('-v', '--vcf_file', dest='vcf_file', type=str, help='vcf file used to detect sex') + sex_validation.set_defaults(func=run_sex_validation) median_cov = subparsers.add_parser('median_coverage') median_cov.add_argument('--bam_file', required=True, help='the fastq file pairs') @@ -135,9 +135,9 @@ def run_sample_contamination_check(dataset, args): v.run() -def run_gender_validation(dataset, args): +def run_sex_validation(dataset, args): os.makedirs(os.path.join(cfg['jobs_dir'], dataset.name), exist_ok=True) - g = qc.GenderValidation(dataset=dataset, vcf_file=args.vcf_file) + g = qc.SexValidation(dataset=dataset, vcf_file=args.vcf_file) g.run() diff --git a/etc/output_files.yaml b/etc/output_files.yaml index cfe3e699..5b4db0e8 100644 --- a/etc/output_files.yaml +++ b/etc/output_files.yaml @@ -101,7 +101,7 @@ bcbio: basename: '{sample_id}-chr22-vbi.selfSM' new_name: '{user_sample_id}-chr22-vbi.selfSM' - gender_call: + sex_validation: location: ['samples_{sample_id}-merged', 'final', '*_{user_sample_id}'] basename: '{user_sample_id}-joint-gatk-haplotype-joint.sex' new_name: '{user_sample_id}.sex' diff --git a/integration_tests/integration_test.py b/integration_tests/integration_test.py index 84ac8bd0..dd9f07fe 100644 --- a/integration_tests/integration_test.py +++ b/integration_tests/integration_test.py @@ -17,7 +17,7 @@ class IntegrationTest(ReportingAppIntegrationTest): patch('egcg_core.clarity.find_project_name_from_sample', return_value='10015AT'), patch('egcg_core.clarity.get_plate_id_and_well', new=mocked_data.fake_get_plate_id_and_well), patch('egcg_core.clarity.get_project', return_value=mocked_data.mocked_clarity_project), - patch('egcg_core.clarity.get_sample_gender'), + patch('egcg_core.clarity.get_sample_sex'), patch('egcg_core.clarity.get_sample_genotype', return_value=set()), patch('egcg_core.clarity.get_sample_names_from_project', return_value=set()), patch('egcg_core.clarity.get_samples_arrived_with', return_value=set()), @@ -334,7 +334,7 @@ def test_bcbio(self): ) self.expect_stage_data(['mergefastqs', 'fastqc', 'genotypevalidation', 'bcbio', 'fastqscreen', - 'fixunmapped', 'blast', 'gendervalidation', 'vcfstats', 'samtoolsdepth', + 'fixunmapped', 'blast', 'sexvalidation', 'vcfstats', 'samtoolsdepth', 'verifybamid', 'sampledataoutput', 'md5sum', 'cleanup', 'samplereview']) ad_proc = rest_communication.get_document('analysis_driver_procs') @@ -682,7 +682,7 @@ def test_gatk4_var_calling_human(self): self.expect_stage_data([ 'gathervcfvc', 'mergebamanddup', 'splitgenotypegvcfs', 'selectsnps', 'mergefastqs', 'cleanup', 'splithaplotypecallervc', 'variantannotation', 'genotypevalidation', 'gatherbqsrreport', - 'selectindels', 'verifybamid', 'gathergvcf', 'gendervalidation', 'fastqscreen', 'sampledataoutput', + 'selectindels', 'verifybamid', 'gathergvcf', 'sexvalidation', 'fastqscreen', 'sampledataoutput', 'gatherrecalbam', 'indelsfiltration', 'samtoolsdepth', 'scatterapplybqsr', 'samplereview', 'fastqindex', 'scatterbaserecalibrator', 'merge_variants_hard_filter', 'blast', 'splitbwa', 'vcfstats', 'md5sum', 'samtoolsstats', 'snpsfiltration' diff --git a/requirements.txt b/requirements.txt index 137caa81..7c43b501 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -EGCG-Core==0.10 +EGCG-Core==0.11 luigi==2.8.0 ete3==3.0.0b35 pandas<0.21 diff --git a/tests/assets/test_crawlers/expected_sample_crawler_data.json b/tests/assets/test_crawlers/expected_sample_crawler_data.json index 7d19e10a..8b7448aa 100644 --- a/tests/assets/test_crawlers/expected_sample_crawler_data.json +++ b/tests/assets/test_crawlers/expected_sample_crawler_data.json @@ -8,8 +8,6 @@ "project_id": "test_project", "bam_file_reads": 7928618, "mapped_reads": 7892452, - "called_gender": "male", - "provided_gender": "female", "species_name": "Homo sapiens", "species_contamination": { "contaminant_unique_mapped": { @@ -25,7 +23,11 @@ "total_reads_mapped": 100000 }, "sample_contamination": {"het_hom_ratio": 1.6, "ti_tv_ratio": 2.01}, - "gender_validation": {"hetX": "0.10"}, + "sex_validation": { + "called": "male", + "provided": "female", + "hetX": "0.10" + }, "coverage": { "median": 478, "std_dev": 189.1911391390011, diff --git a/tests/test_crawlers.py b/tests/test_crawlers.py index 4af04728..f61f04e4 100644 --- a/tests/test_crawlers.py +++ b/tests/test_crawlers.py @@ -119,7 +119,7 @@ def setUp(self): self.expected_output = json.load(open(os.path.join(self.test_data, 'expected_sample_crawler_data.json'))) patched_sample_info = patch( ppath + 'SampleCrawler.get_sample_information_from_lims', - return_value={'user_sample_id': 'test_sample', 'provided_gender': 'female', 'species_name': 'Homo sapiens'} + return_value={'user_sample_id': 'test_sample', 'sex_validation': {'provided': 'female'}, 'species_name': 'Homo sapiens'} ) patched_user_sample_id = patch(ppath + 'sample_crawler.clarity.get_user_sample_name', return_value='test_sample') with patched_sample_info, patched_user_sample_id: diff --git a/tests/test_quality_control/test_relatedness.py b/tests/test_quality_control/test_relatedness.py index a38eb2ab..274644ba 100644 --- a/tests/test_quality_control/test_relatedness.py +++ b/tests/test_quality_control/test_relatedness.py @@ -1,5 +1,5 @@ import os -from unittest.mock import patch, Mock +from unittest.mock import patch from tests.test_quality_control.qc_tester import QCTester from analysis_driver.quality_control.relatedness import Relatedness, GenotypeGVCFs, Peddy, ParseRelatedness from analysis_driver.exceptions import PipelineError @@ -144,11 +144,9 @@ def test_ped_file_content(self, pfams, pmem, pfam): @patch(ppath + 'Peddy.relationships') @patch(ppath + 'Peddy.relationship') - @patch(ppath + 'Peddy.gender_alias') @patch(ppath + 'clarity.get_user_sample_name') - @patch(ppath + 'clarity.get_sample') - def test_get_member_details(self, psample, pname, psex, prel, prels): - psample.return_value = Mock(udf={'Sex': 'F'}) + @patch(ppath + 'clarity.get_sample_sex', side_effect=['F', 'M']) + def test_get_member_details(self, psex, pname, prel, prels): prels.return_value = {'Proband': {'Mother': 'test_sample1', 'Father': '0'}, 'Mother': {'Mother': '0', 'Father': '0'}, 'Father': {'Mother': '0', 'Father': '0'}, @@ -156,7 +154,6 @@ def test_get_member_details(self, psample, pname, psex, prel, prels): 'Brother': {'Mother': 'test_sample1', 'Father': '0'}, 'Other': {'Mother': '0', 'Father': '0'}} prel.side_effect = ['Mother', 'Proband'] - psex.side_effect = ['female', 'male'] pname.side_effect = ['usersample1', 'usersample2', 'usersample1'] all_families = {'FAM1': ['test_sample1', 'test_sample2'], 'FAM2': ['test_sample3']} assert self.p.get_member_details('FAM1', all_families) == [ @@ -171,7 +168,7 @@ def test_get_member_details(self, psample, pname, psex, prel, prels): 'Other': {'Mother': '0', 'Father': '0'}} prel.side_effect = ['Other', 'Other', 'Proband'] - psex.side_effect = ['unknown', 'unknown', 'male'] + psex.side_effect = ['unknown', 'unknown', 'M'] pname.side_effect = ['usersample1', 'usersample2', 'usersample3'] all_families = {'FAM1': ['test_sample1', 'test_sample2', 'test_sample3']} assert self.p.get_member_details('FAM1', all_families) == [['FAM1', 'usersample1', '0', '0', '0', '0'], diff --git a/tests/test_quality_control/test_gender_validation.py b/tests/test_quality_control/test_sex_check.py similarity index 52% rename from tests/test_quality_control/test_gender_validation.py rename to tests/test_quality_control/test_sex_check.py index 31591171..061dfb98 100644 --- a/tests/test_quality_control/test_gender_validation.py +++ b/tests/test_quality_control/test_sex_check.py @@ -1,20 +1,20 @@ from unittest.mock import patch -from analysis_driver.quality_control.gender_validation import GenderValidation +from analysis_driver.quality_control.sex_validation import SexValidation from tests.test_quality_control.qc_tester import QCTester -class TestGenderValidation(QCTester): +class TestSexValdiation(QCTester): @patch('egcg_core.executor.execute') def test_run(self, mocked_execute): - validator = GenderValidation(dataset=self.dataset, vcf_file='path/to/test/vcf') + validator = SexValidation(dataset=self.dataset, vcf_file='path/to/test/vcf') - with patch('analysis_driver.quality_control.gender_validation.util.find_file', new=self.fake_find_file): + with patch('egcg_core.util.find_file', new=self.fake_find_file): validator._run() command = mocked_execute.call_args[0][0] assert command.startswith('cat') assert len(command.split(' | ')) == 5 - validator = GenderValidation(dataset=self.dataset, vcf_file='path/to/test/vcf.gz') + validator = SexValidation(dataset=self.dataset, vcf_file='path/to/test/vcf.gz') validator._run() assert mocked_execute.call_args[0][0].startswith('zcat')