Merge pull request #409 from EdinburghGenomics/Release_v0.25.1

Release v0.25.1
EdinburghGenomics · Jul 10, 2019 · 0dd8cbe · 0dd8cbe
2 parents ac2ba91 + e566f91
commit 0dd8cbe
Show file tree

Hide file tree

Showing 19 changed files with 63 additions and 71 deletions.
diff --git a/README.md b/README.md
@@ -32,8 +32,8 @@ Classes that run checks on output files generated from the main pipeline.
 - GenotypeValidation - Uses bwa, samtools and gatk to validate called snps against a test dataset. Compares
   a sample's genotype with an expected, queries the LIMS for an expected genotype vcf, and writes a file containing the
   results of comparing the observed and expected vcfs
-- GenderValidation - Quantifies X-chromosome heterozygosity in BCBio's output haplotype vcf. Produces a file
-  containing the called gender, to be compared against the gender supplied in the Lims
+- SexValidation - Quantifies X-chromosome heterozygosity in BCBio's output haplotype vcf. Produces a file
+  containing the called sex, to be compared against that supplied in the Lims
 - FastqScreen - Checks fastqs for sample contamination using [fastqscreen](http://www.bioinformatics.babraham.ac.uk/projects/download.html#fastqscreen)
 - Blast - Checks a fastq file for contamination using NCBI Blast
 - VerifyBamID - Checks a Bam file for species contamination using [VerifyBamID](http://genome.sph.umich.edu/wiki/VerifyBamID)

diff --git a/analysis_driver/dataset.py b/analysis_driver/dataset.py
@@ -402,8 +402,8 @@ def _run_elements_from_lims(self):
                     for pattern in (
                         # TruSeq label, e.g, A412-A208 (ATGCATGC-CTGACTGA)
                         '(\w{4})-(\w{4}) \(([ATCG]{8})-([ATCG]{8})\)',
-                        # IDT label, e.g, 001A IDT-ILMN TruSeq DNA-RNA UD 96 Indexes Plate_UDI0001 (ATGCATGC-CTGACTGA)
-                        '(\w{4}) IDT-ILMN TruSeq DNA-RNA UD 96 Indexes (Plate_\w{7}) \(([ATGC]{8})-([ATGC]{8})\)'
+                        # IDT label, e.g, 001A IDT-ILMN TruSeq DNA-RNA UD 96 Indexes  Plate_UDI0001 (ATGCATGC-CTGACTGA)
+                        '(\w{4}) IDT-ILMN TruSeq DNA-RNA UD 96 Indexes\s+(Plate_\w{7}) \(([ATGC]{8})-([ATGC]{8})\)'
                     ):
                         match = re.match(pattern, reagent_label)
                         if match:

diff --git a/analysis_driver/pipelines/bcbio.py b/analysis_driver/pipelines/bcbio.py
@@ -130,11 +130,11 @@ def stage(cls, **params):
 
     bcbio_and_qc = [fix_unmapped, fastqc, contam_check, blast, geno_val]
 
-    gender_val = stage(qc.GenderValidation, vcf_file=bcbio.vcf_path, previous_stages=bcbio_and_qc),
+    sex_val = stage(qc.SexValidation, vcf_file=bcbio.vcf_path, previous_stages=bcbio_and_qc),
     vcfstats = stage(qc.VCFStats, vcf_file=bcbio.vcf_path, previous_stages=bcbio_and_qc),
     verify_bam_id = stage(qc.VerifyBamID, bam_file=bcbio.bam_path_fixed, previous_stages=bcbio_and_qc),
     samtools_depth = stage(qc.SamtoolsDepth, bam_file=bcbio.bam_path_fixed, previous_stages=bcbio_and_qc)
-    post_bcbio_qc = [gender_val, vcfstats, verify_bam_id, samtools_depth]
+    post_bcbio_qc = [sex_val, vcfstats, verify_bam_id, samtools_depth]
 
     output = stage(common.SampleDataOutput, previous_stages=post_bcbio_qc, output_fileset='bcbio')
     cleanup = stage(common.Cleanup, previous_stages=[output])

diff --git a/analysis_driver/pipelines/human_variant_calling_gatk4.py b/analysis_driver/pipelines/human_variant_calling_gatk4.py
@@ -198,11 +198,11 @@ def stage(cls, **params):
                      output_vcf_file=hard_filter_indels.hard_filtered_vcf,
                      previous_stages=[hard_filter_snps, hard_filter_indels])
 
-    gender_val = stage(qc.GenderValidation, vcf_file=merge_hf.hard_filtered_vcf, previous_stages=[merge_hf])
+    sex_val = stage(qc.SexValidation, vcf_file=merge_hf.hard_filtered_vcf, previous_stages=[merge_hf])
 
     vcfstats = stage(qc.VCFStats, vcf_file=merge_hf.hard_filtered_vcf, previous_stages=[merge_hf])
 
-    final_stages = [contam, blast, geno_val, gender_val, vcfstats, verify_bam_id, samtools_depth, samtools_stat,
+    final_stages = [contam, blast, geno_val, sex_val, vcfstats, verify_bam_id, samtools_depth, samtools_stat,
                     gather_gcvf]
 
     output = stage(common.SampleDataOutput, previous_stages=final_stages, output_fileset='gatk4_human_var_calling')

diff --git a/analysis_driver/quality_control/__init__.py b/analysis_driver/quality_control/__init__.py
@@ -1,6 +1,6 @@
 from .genotype_validation import GenotypeValidation
 from .contamination_checks import FastqScreen, VerifyBamID, VCFStats, Blast
-from .gender_validation import GenderValidation
+from .sex_validation import SexValidation
 from .median_coverage import SamtoolsDepth
 from .relatedness import Relatedness, Peddy, GenotypeGVCFs, ParseRelatedness
 from .bcl_validation import BCLValidator

diff --git a/analysis_driver/quality_control/relatedness.py b/analysis_driver/quality_control/relatedness.py
@@ -7,22 +7,14 @@
 from analysis_driver.util.bash_commands import java_command
 from analysis_driver.segmentation import Stage, Parameter, ListParameter
 from analysis_driver.exceptions import PipelineError
+from analysis_driver.quality_control.sex_validation import sex_alias
 
 
 class RelatednessStage(Stage):
-    _gender_aliases = {'female': ['f', 'female', 'girl', 'woman'], 'male': ['m', 'male', 'boy', 'man']}
-
     @property
     def gatk_outfile(self):
         return os.path.join(self.job_dir, self.dataset.name + '_genotype_gvcfs.vcf')
 
-    @classmethod
-    def gender_alias(cls, gender):
-        for key in cls._gender_aliases:
-            if str(gender).lower() in cls._gender_aliases[key]:
-                return key
-        return 'unknown'
-
     @staticmethod
     def family_id(sample_id):
         return clarity.get_sample(sample_id).udf.get('Family ID') or 'No_ID'
@@ -245,7 +237,7 @@ def get_member_details(self, family, all_families):
         family_lines = []
         family_info = self.relationships(all_families[family])
         for member in all_families[family]:
-            sex = self.gender_alias(clarity.get_sample(member).udf.get('Sex'))
+            sex = sex_alias(clarity.get_sample_sex(member))
             sex_codes = {'male': '1', 'female': '2', 'unknown': '0'}
             relationship = self.relationship(member)
             member_id = clarity.get_user_sample_name(member)

diff --git a/...iver/quality_control/gender_validation.py → ..._driver/quality_control/sex_validation.py b/...iver/quality_control/gender_validation.py → ..._driver/quality_control/sex_validation.py
@@ -2,34 +2,43 @@
 from egcg_core import executor, util
 from analysis_driver.segmentation import Parameter, Stage
 
+_sex_aliases = {'female': ['f', 'female', 'girl', 'woman'], 'male': ['m', 'male', 'boy', 'man']}
 
-class GenderValidation(Stage):
+
+def sex_alias(sex):
+    for key in _sex_aliases:
+        if str(sex).lower() in _sex_aliases[key]:
+            return key
+    return 'unknown'
+
+
+class SexValidation(Stage):
     vcf_file = Parameter()
 
     def _run(self):
-        """Detect gender of the sample based on the %het on the X chromosome."""
+        """Detect sex of the sample based on the %het on the X chromosome."""
         name, ext = os.path.splitext(util.find_file(self.vcf_file))
         if ext == '.gz':
             file_opener = 'zcat'
             name, gz = os.path.splitext(name)
         else:
             file_opener = 'cat'
 
-        gender_call_file = name + '.sex'
+        sex_file = name + '.sex'
 
         command = util.str_join(
             '%s %s' % (file_opener, self.vcf_file),
             "grep -P '^chrX|^X'",
             "awk '{split($10,a,\":\"); count[a[1]]++; total++} END{for (g in count){print g\" \"count[g]/total}}'",
             "grep '0/1'",
-            "awk '{if ($2>.35){gender=\"FEMALE\"}else{if ($2<.15){gender=\"MALE\"}else{gender=\"UNKNOWN\"}} print gender, $2}'",
+            "awk '{if ($2>.35){sex=\"FEMALE\"}else{if ($2<.15){sex=\"MALE\"}else{sex=\"UNKNOWN\"}} print sex, $2}'",
             separator=' | '
-        ) + ' > ' + gender_call_file
+        ) + ' > ' + sex_file
         self.info(command)
 
         return executor.execute(
             command,
-            job_name='sex_detection',
+            job_name='sex_validation',
             working_dir=self.job_dir,
             walltime=6,
             cpus=1,

diff --git a/analysis_driver/report_generation/crawler.py b/analysis_driver/report_generation/crawler.py
@@ -1,25 +1,17 @@
 from egcg_core import clarity
 from egcg_core import constants as c
 from egcg_core.app_logging import AppLogger
+from analysis_driver.quality_control.sex_validation import sex_alias
 
 
 class Crawler(AppLogger):
-    _gender_aliases = {'female': ['f', 'female', 'girl', 'woman'], 'male': ['m', 'male', 'boy', 'man']}
-
-    @classmethod
-    def gender_alias(cls, gender):
-        for key in cls._gender_aliases:
-            if str(gender).lower() in cls._gender_aliases[key]:
-                return key
-        return 'unknown'
-
     @classmethod
     def get_sample_information_from_lims(cls, sample_name):
         lims_sample = clarity.get_sample(sample_name)
         sample_info = {
             c.ELEMENT_SAMPLE_EXTERNAL_ID: clarity.get_user_sample_name(sample_name, lenient=True),
             c.ELEMENT_SAMPLE_PLATE: clarity.get_plate_id_and_well(sample_name)[0],  # returns [plate_id, well]
-            c.ELEMENT_PROVIDED_GENDER: cls.gender_alias(clarity.get_sample_gender(sample_name)),
+            c.ELEMENT_SEX_VALIDATION: {c.ELEMENT_PROVIDED_SEX: sex_alias(clarity.get_sample_sex(sample_name))},
             c.ELEMENT_SAMPLE_SPECIES: clarity.get_species_from_sample(sample_name)
         }
         if 'Yield for Quoted Coverage (Gb)' in lims_sample.udf:

diff --git a/analysis_driver/report_generation/sample_crawler.py b/analysis_driver/report_generation/sample_crawler.py
@@ -3,7 +3,7 @@
 from egcg_core.rest_communication import post_or_patch as pp
 from analysis_driver.reader import demultiplexing_parsers as dm, mapping_stats_parsers as mp
 from analysis_driver.config import output_file_config
-from .crawler import Crawler
+from .crawler import Crawler, sex_alias
 
 
 class SampleCrawler(Crawler):
@@ -58,12 +58,12 @@ def _populate_lib_info(self):
         else:
             self.critical('Missing *-sort-callable.bed')
 
-        sex_file_path = self.get_output_file('gender_call')
+        sex_file_path = self.get_output_file('sex_validation')
         if sex_file_path:
             with open(sex_file_path) as f:
-                gender, het_x = f.read().strip().split()
-                sample[ELEMENT_CALLED_GENDER] = self.gender_alias(gender)
-                sample[ELEMENT_GENDER_VALIDATION] = {ELEMENT_GENDER_HETX: het_x}
+                sex, het_x = f.read().strip().split()
+                sample[ELEMENT_SEX_VALIDATION][ELEMENT_CALLED_SEX] = sex_alias(sex)
+                sample[ELEMENT_SEX_VALIDATION][ELEMENT_SEX_HETX] = het_x
 
         genotype_validation_path = self.get_output_file('genoval')
         if genotype_validation_path:
@@ -106,10 +106,10 @@ def _populate_lib_info(self):
             }
             sample[ELEMENT_COVERAGE_STATISTICS] = coverage_statistics
             sample[ELEMENT_MEDIAN_COVERAGE] = median
-            if ELEMENT_GENDER_VALIDATION in sample:
+            if ELEMENT_SEX_VALIDATION in sample:
                 cov_y = dm.get_coverage_y_chrom(coverage_statistics_path)
                 if cov_y:
-                    sample[ELEMENT_GENDER_VALIDATION][ELEMENT_GENDER_COVY] = cov_y
+                    sample[ELEMENT_SEX_VALIDATION][ELEMENT_SEX_COVY] = cov_y
         else:
             self.critical('coverage statistics unavailable for %s', self.sample_id)
 

diff --git a/bin/run_qc.py b/bin/run_qc.py
@@ -44,9 +44,9 @@ def _parse_args():
     sample_contamination.add_argument('--bam_file', required=True)
     sample_contamination.set_defaults(func=run_sample_contamination_check)
 
-    gender_val = subparsers.add_parser('gender_validation')
-    gender_val.add_argument('-v', '--vcf_file', dest='vcf_file', type=str, help='vcf file used to detect gender')
-    gender_val.set_defaults(func=run_gender_validation)
+    sex_validation = subparsers.add_parser('sex_validation')
+    sex_validation.add_argument('-v', '--vcf_file', dest='vcf_file', type=str, help='vcf file used to detect sex')
+    sex_validation.set_defaults(func=run_sex_validation)
 
     median_cov = subparsers.add_parser('median_coverage')
     median_cov.add_argument('--bam_file', required=True, help='the fastq file pairs')
@@ -135,9 +135,9 @@ def run_sample_contamination_check(dataset, args):
     v.run()
 
 
-def run_gender_validation(dataset, args):
+def run_sex_validation(dataset, args):
     os.makedirs(os.path.join(cfg['jobs_dir'], dataset.name), exist_ok=True)
-    g = qc.GenderValidation(dataset=dataset, vcf_file=args.vcf_file)
+    g = qc.SexValidation(dataset=dataset, vcf_file=args.vcf_file)
     g.run()
 
 

diff --git a/etc/output_files.yaml b/etc/output_files.yaml
@@ -101,7 +101,7 @@ bcbio:
         basename: '{sample_id}-chr22-vbi.selfSM'
         new_name: '{user_sample_id}-chr22-vbi.selfSM'
 
-    gender_call:
+    sex_validation:
         location: ['samples_{sample_id}-merged', 'final', '*_{user_sample_id}']
         basename: '{user_sample_id}-joint-gatk-haplotype-joint.sex'
         new_name: '{user_sample_id}.sex'
@@ -291,7 +291,7 @@ gatk4_human_var_calling:
         basename: '{sample_id}-chr22-vbi.selfSM'
         new_name: '{user_sample_id}-chr22-vbi.selfSM'
 
-    gender_call:
+    sex_validation:
         location: ['gatk4']
         basename: '{user_sample_id}_hard_filter.sex'
         new_name: '{user_sample_id}.sex'

diff --git a/integration_tests/integration_test.py b/integration_tests/integration_test.py
@@ -17,7 +17,7 @@ class IntegrationTest(ReportingAppIntegrationTest):
         patch('egcg_core.clarity.find_project_name_from_sample', return_value='10015AT'),
         patch('egcg_core.clarity.get_plate_id_and_well', new=mocked_data.fake_get_plate_id_and_well),
         patch('egcg_core.clarity.get_project', return_value=mocked_data.mocked_clarity_project),
-        patch('egcg_core.clarity.get_sample_gender'),
+        patch('egcg_core.clarity.get_sample_sex'),
         patch('egcg_core.clarity.get_sample_genotype', return_value=set()),
         patch('egcg_core.clarity.get_sample_names_from_project', return_value=set()),
         patch('egcg_core.clarity.get_samples_arrived_with', return_value=set()),
@@ -334,7 +334,7 @@ def test_bcbio(self):
         )
 
         self.expect_stage_data(['mergefastqs', 'fastqc', 'genotypevalidation', 'bcbio', 'fastqscreen',
-                                'fixunmapped', 'blast', 'gendervalidation', 'vcfstats', 'samtoolsdepth',
+                                'fixunmapped', 'blast', 'sexvalidation', 'vcfstats', 'samtoolsdepth',
                                 'verifybamid', 'sampledataoutput', 'md5sum', 'cleanup', 'samplereview'])
 
         ad_proc = rest_communication.get_document('analysis_driver_procs')
@@ -682,7 +682,7 @@ def test_gatk4_var_calling_human(self):
         self.expect_stage_data([
             'gathervcfvc', 'mergebamanddup', 'splitgenotypegvcfs', 'selectsnps', 'mergefastqs', 'cleanup',
             'splithaplotypecallervc', 'variantannotation', 'genotypevalidation', 'gatherbqsrreport',
-            'selectindels', 'verifybamid', 'gathergvcf', 'gendervalidation', 'fastqscreen', 'sampledataoutput',
+            'selectindels', 'verifybamid', 'gathergvcf', 'sexvalidation', 'fastqscreen', 'sampledataoutput',
             'gatherrecalbam', 'indelsfiltration', 'samtoolsdepth', 'scatterapplybqsr', 'samplereview', 'fastqindex',
             'scatterbaserecalibrator', 'merge_variants_hard_filter', 'blast', 'splitbwa', 'vcfstats', 'md5sum',
             'samtoolsstats', 'snpsfiltration'

diff --git a/integration_tests/mocked_data.py b/integration_tests/mocked_data.py
@@ -22,7 +22,7 @@ class MockedSample(NamedMock):
                  samples=[MockedSample(real_name='10015AT0002', id='LP6002014-DTP_A02')]),
             Mock(reagent_labels=['D703-D502 (CGCTCATT-ATAGAGGC)'],
                  samples=[MockedSample(real_name='10015AT0003', id='LP6002014-DTP_A03')]),
-            Mock(reagent_labels=['001A IDT-ILMN TruSeq DNA-RNA UD 96 Indexes Plate_UDI0001 (GAGATTCC-ATAGAGGC)'],
+            Mock(reagent_labels=['001A IDT-ILMN TruSeq DNA-RNA UD 96 Indexes  Plate_UDI0001 (GAGATTCC-ATAGAGGC)'],
                  samples=[MockedSample(real_name='10015AT0004', id='LP6002014-DTP_A04')]),
             Mock(reagent_labels=['D705-D502 (ATTCAGAA-ATAGAGGC)'],
                  samples=[MockedSample(real_name='10015AT0006', id='LP6002014-DTP_A05')]),

diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,4 @@
-EGCG-Core==0.10
+EGCG-Core==0.11
 luigi==2.8.0
 ete3==3.0.0b35
 pandas<0.21

diff --git a/tests/assets/test_crawlers/expected_sample_crawler_data.json b/tests/assets/test_crawlers/expected_sample_crawler_data.json
@@ -8,8 +8,6 @@
     "project_id": "test_project",
     "bam_file_reads": 7928618,
     "mapped_reads": 7892452,
-    "called_gender": "male",
-    "provided_gender": "female",
     "species_name": "Homo sapiens",
     "species_contamination": {
         "contaminant_unique_mapped": {
@@ -25,7 +23,11 @@
         "total_reads_mapped": 100000
     },
     "sample_contamination": {"het_hom_ratio": 1.6, "ti_tv_ratio": 2.01},
-    "gender_validation": {"hetX": "0.10"},
+    "sex_validation": {
+        "called": "male",
+        "provided": "female",
+        "hetX": "0.10"
+    },
     "coverage": {
         "median": 478,
         "std_dev": 189.1911391390011,

diff --git a/tests/test_crawlers.py b/tests/test_crawlers.py
@@ -119,7 +119,7 @@ def setUp(self):
         self.expected_output = json.load(open(os.path.join(self.test_data, 'expected_sample_crawler_data.json')))
         patched_sample_info = patch(
             ppath + 'SampleCrawler.get_sample_information_from_lims',
-            return_value={'user_sample_id': 'test_sample', 'provided_gender': 'female', 'species_name': 'Homo sapiens'}
+            return_value={'user_sample_id': 'test_sample', 'sex_validation': {'provided': 'female'}, 'species_name': 'Homo sapiens'}
         )
         patched_user_sample_id = patch(ppath + 'sample_crawler.clarity.get_user_sample_name', return_value='test_sample')
         with patched_sample_info, patched_user_sample_id:

diff --git a/tests/test_dataset.py b/tests/test_dataset.py
@@ -220,7 +220,7 @@ def test_pipeline_instruction(self):
 mocked_lane_artifact8 = NamedMock(real_name='art8', reagent_labels=['D706-D502 (GAATTCGG-ATAGAGGC)'], samples=[MockedSample(real_name='sample8', udf={})])
 mocked_idt_artifact = NamedMock(
     real_name='idt_art',
-    reagent_labels=['001A IDT-ILMN TruSeq DNA-RNA UD 96 Indexes Plate_UDI0001 (CCGCGGTT-AGCGCTAG)'],
+    reagent_labels=['001A IDT-ILMN TruSeq DNA-RNA UD 96 Indexes  Plate_UDI0001 (CCGCGGTT-AGCGCTAG)'],
     samples=[MockedSample(real_name='idt_sample')]
 )
 mocked_lane_artifact_pool = NamedMock(real_name='artpool', reagent_labels=[

diff --git a/tests/test_quality_control/test_relatedness.py b/tests/test_quality_control/test_relatedness.py
@@ -1,5 +1,5 @@
 import os
-from unittest.mock import patch, Mock
+from unittest.mock import patch
 from tests.test_quality_control.qc_tester import QCTester
 from analysis_driver.quality_control.relatedness import Relatedness, GenotypeGVCFs, Peddy, ParseRelatedness
 from analysis_driver.exceptions import PipelineError
@@ -144,19 +144,16 @@ def test_ped_file_content(self, pfams, pmem, pfam):
 
     @patch(ppath + 'Peddy.relationships')
     @patch(ppath + 'Peddy.relationship')
-    @patch(ppath + 'Peddy.gender_alias')
     @patch(ppath + 'clarity.get_user_sample_name')
-    @patch(ppath + 'clarity.get_sample')
-    def test_get_member_details(self, psample, pname, psex, prel, prels):
-        psample.return_value = Mock(udf={'Sex': 'F'})
+    @patch(ppath + 'clarity.get_sample_sex', side_effect=['F', 'M'])
+    def test_get_member_details(self, psex, pname, prel, prels):
         prels.return_value = {'Proband': {'Mother': 'test_sample1', 'Father': '0'},
                               'Mother': {'Mother': '0', 'Father': '0'},
                               'Father': {'Mother': '0', 'Father': '0'},
                               'Sister': {'Mother': 'test_sample1', 'Father': '0'},
                               'Brother': {'Mother': 'test_sample1', 'Father': '0'},
                               'Other': {'Mother': '0', 'Father': '0'}}
         prel.side_effect = ['Mother', 'Proband']
-        psex.side_effect = ['female', 'male']
         pname.side_effect = ['usersample1', 'usersample2', 'usersample1']
         all_families = {'FAM1': ['test_sample1', 'test_sample2'], 'FAM2': ['test_sample3']}
         assert self.p.get_member_details('FAM1', all_families) == [
@@ -171,7 +168,7 @@ def test_get_member_details(self, psample, pname, psex, prel, prels):
                               'Other': {'Mother': '0', 'Father': '0'}}
 
         prel.side_effect = ['Other', 'Other', 'Proband']
-        psex.side_effect = ['unknown', 'unknown', 'male']
+        psex.side_effect = ['unknown', 'unknown', 'M']
         pname.side_effect = ['usersample1', 'usersample2', 'usersample3']
         all_families = {'FAM1': ['test_sample1', 'test_sample2', 'test_sample3']}
         assert self.p.get_member_details('FAM1', all_families) == [['FAM1', 'usersample1', '0', '0', '0', '0'],