EdinburghGenomics · mwhamgenomics · Jun 20, 2019 · Mar 22, 2019 · May 23, 2019 · Jun 10, 2019
diff --git a/README.md b/README.md
@@ -32,8 +32,8 @@ Classes that run checks on output files generated from the main pipeline.
 - GenotypeValidation - Uses bwa, samtools and gatk to validate called snps against a test dataset. Compares
   a sample's genotype with an expected, queries the LIMS for an expected genotype vcf, and writes a file containing the
   results of comparing the observed and expected vcfs
-- GenderValidation - Quantifies X-chromosome heterozygosity in BCBio's output haplotype vcf. Produces a file
-  containing the called gender, to be compared against the gender supplied in the Lims
+- SexValidation - Quantifies X-chromosome heterozygosity in BCBio's output haplotype vcf. Produces a file
+  containing the called sex, to be compared against that supplied in the Lims
 - FastqScreen - Checks fastqs for sample contamination using [fastqscreen](http://www.bioinformatics.babraham.ac.uk/projects/download.html#fastqscreen)
 - Blast - Checks a fastq file for contamination using NCBI Blast
 - VerifyBamID - Checks a Bam file for species contamination using [VerifyBamID](http://genome.sph.umich.edu/wiki/VerifyBamID)

diff --git a/analysis_driver/pipelines/bcbio.py b/analysis_driver/pipelines/bcbio.py
@@ -130,11 +130,11 @@ def stage(cls, **params):
 
     bcbio_and_qc = [fix_unmapped, fastqc, contam_check, blast, geno_val]
 
-    gender_val = stage(qc.GenderValidation, vcf_file=bcbio.vcf_path, previous_stages=bcbio_and_qc),
+    sex_val = stage(qc.SexValidation, vcf_file=bcbio.vcf_path, previous_stages=bcbio_and_qc),
     vcfstats = stage(qc.VCFStats, vcf_file=bcbio.vcf_path, previous_stages=bcbio_and_qc),
     verify_bam_id = stage(qc.VerifyBamID, bam_file=bcbio.bam_path_fixed, previous_stages=bcbio_and_qc),
     samtools_depth = stage(qc.SamtoolsDepth, bam_file=bcbio.bam_path_fixed, previous_stages=bcbio_and_qc)
-    post_bcbio_qc = [gender_val, vcfstats, verify_bam_id, samtools_depth]
+    post_bcbio_qc = [sex_val, vcfstats, verify_bam_id, samtools_depth]
 
     output = stage(common.SampleDataOutput, previous_stages=post_bcbio_qc, output_fileset='bcbio')
     cleanup = stage(common.Cleanup, previous_stages=[output])

diff --git a/analysis_driver/quality_control/__init__.py b/analysis_driver/quality_control/__init__.py
@@ -1,6 +1,6 @@
 from .genotype_validation import GenotypeValidation
 from .contamination_checks import FastqScreen, VerifyBamID, VCFStats, Blast
-from .gender_validation import GenderValidation
+from .sex_validation import SexValidation
 from .median_coverage import SamtoolsDepth
 from .relatedness import Relatedness, Peddy, GenotypeGVCFs, ParseRelatedness
 from .bcl_validation import BCLValidator

diff --git a/analysis_driver/quality_control/relatedness.py b/analysis_driver/quality_control/relatedness.py
@@ -7,22 +7,14 @@
 from analysis_driver.util.bash_commands import java_command
 from analysis_driver.segmentation import Stage, Parameter, ListParameter
 from analysis_driver.exceptions import PipelineError
+from analysis_driver.quality_control.sex_validation import sex_alias
 
 
 class RelatednessStage(Stage):
-    _gender_aliases = {'female': ['f', 'female', 'girl', 'woman'], 'male': ['m', 'male', 'boy', 'man']}
-
     @property
     def gatk_outfile(self):
         return os.path.join(self.job_dir, self.dataset.name + '_genotype_gvcfs.vcf')
 
-    @classmethod
-    def gender_alias(cls, gender):
-        for key in cls._gender_aliases:
-            if str(gender).lower() in cls._gender_aliases[key]:
-                return key
-        return 'unknown'
-
     @staticmethod
     def family_id(sample_id):
         return clarity.get_sample(sample_id).udf.get('Family ID') or 'No_ID'
@@ -245,7 +237,7 @@ def get_member_details(self, family, all_families):
         family_lines = []
         family_info = self.relationships(all_families[family])
         for member in all_families[family]:
-            sex = self.gender_alias(clarity.get_sample(member).udf.get('Sex'))
+            sex = sex_alias(clarity.get_sample_sex(member))
             sex_codes = {'male': '1', 'female': '2', 'unknown': '0'}
             relationship = self.relationship(member)
             member_id = clarity.get_user_sample_name(member)

diff --git a/...iver/quality_control/gender_validation.py → ..._driver/quality_control/sex_validation.py b/...iver/quality_control/gender_validation.py → ..._driver/quality_control/sex_validation.py
@@ -2,34 +2,43 @@
 from egcg_core import executor, util
 from analysis_driver.segmentation import Parameter, Stage
 
+_sex_aliases = {'female': ['f', 'female', 'girl', 'woman'], 'male': ['m', 'male', 'boy', 'man']}
 
-class GenderValidation(Stage):
+
+def sex_alias(sex):
+    for key in _sex_aliases:
+        if str(sex).lower() in _sex_aliases[key]:
+            return key
+    return 'unknown'
+
+
+class SexValidation(Stage):
     vcf_file = Parameter()
 
     def _run(self):
-        """Detect gender of the sample based on the %het on the X chromosome."""
+        """Detect sex of the sample based on the %het on the X chromosome."""
         name, ext = os.path.splitext(util.find_file(self.vcf_file))
         if ext == '.gz':
             file_opener = 'zcat'
             name, gz = os.path.splitext(name)
         else:
             file_opener = 'cat'
 
-        gender_call_file = name + '.sex'
+        sex_file = name + '.sex'
 
         command = util.str_join(
             '%s %s' % (file_opener, self.vcf_file),
             "grep -P '^chrX|^X'",
             "awk '{split($10,a,\":\"); count[a[1]]++; total++} END{for (g in count){print g\" \"count[g]/total}}'",
             "grep '0/1'",
-            "awk '{if ($2>.35){gender=\"FEMALE\"}else{if ($2<.15){gender=\"MALE\"}else{gender=\"UNKNOWN\"}} print gender, $2}'",
+            "awk '{if ($2>.35){sex=\"FEMALE\"}else{if ($2<.15){sex=\"MALE\"}else{sex=\"UNKNOWN\"}} print sex, $2}'",
             separator=' | '
-        ) + ' > ' + gender_call_file
+        ) + ' > ' + sex_file
         self.info(command)
 
         return executor.execute(
             command,
-            job_name='sex_detection',
+            job_name='sex_validation',
             working_dir=self.job_dir,
             walltime=6,
             cpus=1,

diff --git a/analysis_driver/report_generation/crawler.py b/analysis_driver/report_generation/crawler.py
@@ -1,25 +1,17 @@
 from egcg_core import clarity
 from egcg_core import constants as c
 from egcg_core.app_logging import AppLogger
+from analysis_driver.quality_control.sex_validation import sex_alias
 
 
 class Crawler(AppLogger):
-    _gender_aliases = {'female': ['f', 'female', 'girl', 'woman'], 'male': ['m', 'male', 'boy', 'man']}
-
-    @classmethod
-    def gender_alias(cls, gender):
-        for key in cls._gender_aliases:
-            if str(gender).lower() in cls._gender_aliases[key]:
-                return key
-        return 'unknown'
-
     @classmethod
     def get_sample_information_from_lims(cls, sample_name):
         lims_sample = clarity.get_sample(sample_name)
         sample_info = {
             c.ELEMENT_SAMPLE_EXTERNAL_ID: clarity.get_user_sample_name(sample_name, lenient=True),
             c.ELEMENT_SAMPLE_PLATE: clarity.get_plate_id_and_well(sample_name)[0],  # returns [plate_id, well]
-            c.ELEMENT_PROVIDED_GENDER: cls.gender_alias(clarity.get_sample_gender(sample_name)),
+            c.ELEMENT_SEX_VALIDATION: {c.ELEMENT_PROVIDED_SEX: sex_alias(clarity.get_sample_sex(sample_name))},
             c.ELEMENT_SAMPLE_SPECIES: clarity.get_species_from_sample(sample_name)
         }
         if 'Yield for Quoted Coverage (Gb)' in lims_sample.udf:

diff --git a/analysis_driver/report_generation/sample_crawler.py b/analysis_driver/report_generation/sample_crawler.py
@@ -3,7 +3,7 @@
 from egcg_core.rest_communication import post_or_patch as pp
 from analysis_driver.reader import demultiplexing_parsers as dm, mapping_stats_parsers as mp
 from analysis_driver.config import output_file_config
-from .crawler import Crawler
+from .crawler import Crawler, sex_alias
 
 
 class SampleCrawler(Crawler):
@@ -58,12 +58,12 @@ def _populate_lib_info(self):
         else:
             self.critical('Missing *-sort-callable.bed')
 
-        sex_file_path = self.get_output_file('gender_call')
+        sex_file_path = self.get_output_file('sex_validation')
         if sex_file_path:
             with open(sex_file_path) as f:
-                gender, het_x = f.read().strip().split()
-                sample[ELEMENT_CALLED_GENDER] = self.gender_alias(gender)
-                sample[ELEMENT_GENDER_VALIDATION] = {ELEMENT_GENDER_HETX: het_x}
+                sex, het_x = f.read().strip().split()
+                sample[ELEMENT_SEX_VALIDATION][ELEMENT_CALLED_SEX] = sex_alias(sex)
+                sample[ELEMENT_SEX_VALIDATION][ELEMENT_SEX_HETX] = het_x
 
         genotype_validation_path = self.get_output_file('genoval')
         if genotype_validation_path:
@@ -106,10 +106,10 @@ def _populate_lib_info(self):
             }
             sample[ELEMENT_COVERAGE_STATISTICS] = coverage_statistics
             sample[ELEMENT_MEDIAN_COVERAGE] = median
-            if ELEMENT_GENDER_VALIDATION in sample:
+            if ELEMENT_SEX_VALIDATION in sample:
                 cov_y = dm.get_coverage_y_chrom(coverage_statistics_path)
                 if cov_y:
-                    sample[ELEMENT_GENDER_VALIDATION][ELEMENT_GENDER_COVY] = cov_y
+                    sample[ELEMENT_SEX_VALIDATION][ELEMENT_SEX_COVY] = cov_y
         else:
             self.critical('coverage statistics unavailable for %s', self.sample_id)
 

diff --git a/bin/run_qc.py b/bin/run_qc.py
@@ -44,9 +44,9 @@ def _parse_args():
     sample_contamination.add_argument('--bam_file', required=True)
     sample_contamination.set_defaults(func=run_sample_contamination_check)
 
-    gender_val = subparsers.add_parser('gender_validation')
-    gender_val.add_argument('-v', '--vcf_file', dest='vcf_file', type=str, help='vcf file used to detect gender')
-    gender_val.set_defaults(func=run_gender_validation)
+    sex_validation = subparsers.add_parser('sex_validation')
+    sex_validation.add_argument('-v', '--vcf_file', dest='vcf_file', type=str, help='vcf file used to detect sex')
+    sex_validation.set_defaults(func=run_sex_validation)
 
     median_cov = subparsers.add_parser('median_coverage')
     median_cov.add_argument('--bam_file', required=True, help='the fastq file pairs')
@@ -135,9 +135,9 @@ def run_sample_contamination_check(dataset, args):
     v.run()
 
 
-def run_gender_validation(dataset, args):
+def run_sex_validation(dataset, args):
     os.makedirs(os.path.join(cfg['jobs_dir'], dataset.name), exist_ok=True)
-    g = qc.GenderValidation(dataset=dataset, vcf_file=args.vcf_file)
+    g = qc.SexValidation(dataset=dataset, vcf_file=args.vcf_file)
     g.run()
 
 

diff --git a/etc/output_files.yaml b/etc/output_files.yaml
@@ -101,7 +101,7 @@ bcbio:
         basename: '{sample_id}-chr22-vbi.selfSM'
         new_name: '{user_sample_id}-chr22-vbi.selfSM'
 
-    gender_call:
+    sex_validation:
         location: ['samples_{sample_id}-merged', 'final', '*_{user_sample_id}']
         basename: '{user_sample_id}-joint-gatk-haplotype-joint.sex'
         new_name: '{user_sample_id}.sex'

diff --git a/integration_tests/integration_test.py b/integration_tests/integration_test.py
@@ -17,7 +17,7 @@ class IntegrationTest(ReportingAppIntegrationTest):
         patch('egcg_core.clarity.find_project_name_from_sample', return_value='10015AT'),
         patch('egcg_core.clarity.get_plate_id_and_well', new=mocked_data.fake_get_plate_id_and_well),
         patch('egcg_core.clarity.get_project', return_value=mocked_data.mocked_clarity_project),
-        patch('egcg_core.clarity.get_sample_gender'),
+        patch('egcg_core.clarity.get_sample_sex'),
         patch('egcg_core.clarity.get_sample_genotype', return_value=set()),
         patch('egcg_core.clarity.get_sample_names_from_project', return_value=set()),
         patch('egcg_core.clarity.get_samples_arrived_with', return_value=set()),
@@ -333,7 +333,7 @@ def test_bcbio(self):
         )
 
         self.expect_stage_data(['mergefastqs', 'fastqc', 'genotypevalidation', 'bcbio', 'fastqscreen',
-                                'fixunmapped', 'blast', 'gendervalidation', 'vcfstats', 'samtoolsdepth',
+                                'fixunmapped', 'blast', 'sexcheck', 'vcfstats', 'samtoolsdepth',
                                 'verifybamid', 'sampledataoutput', 'md5sum', 'cleanup', 'samplereview'])
 
         self.expect_equal(

diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,4 @@
-EGCG-Core==0.10
+EGCG-Core==0.11
 luigi==2.8.0
 ete3==3.0.0b35
 pandas<0.21

diff --git a/tests/assets/test_crawlers/expected_sample_crawler_data.json b/tests/assets/test_crawlers/expected_sample_crawler_data.json
@@ -8,8 +8,6 @@
     "project_id": "test_project",
     "bam_file_reads": 7928618,
     "mapped_reads": 7892452,
-    "called_gender": "male",
-    "provided_gender": "female",
     "species_name": "Homo sapiens",
     "species_contamination": {
         "contaminant_unique_mapped": {
@@ -25,7 +23,11 @@
         "total_reads_mapped": 100000
     },
     "sample_contamination": {"het_hom_ratio": 1.6, "ti_tv_ratio": 2.01},
-    "gender_validation": {"hetX": "0.10"},
+    "sex_validation": {
+        "called": "male",
+        "provided": "female",
+        "hetX": "0.10"
+    },
     "coverage": {
         "median": 478,
         "std_dev": 189.1911391390011,

diff --git a/tests/test_crawlers.py b/tests/test_crawlers.py
@@ -119,7 +119,7 @@ def setUp(self):
         self.expected_output = json.load(open(os.path.join(self.test_data, 'expected_sample_crawler_data.json')))
         patched_sample_info = patch(
             ppath + 'SampleCrawler.get_sample_information_from_lims',
-            return_value={'user_sample_id': 'test_sample', 'provided_gender': 'female', 'species_name': 'Homo sapiens'}
+            return_value={'user_sample_id': 'test_sample', 'sex_validation': {'provided': 'female'}, 'species_name': 'Homo sapiens'}
         )
         patched_user_sample_id = patch(ppath + 'sample_crawler.clarity.get_user_sample_name', return_value='test_sample')
         with patched_sample_info, patched_user_sample_id:

diff --git a/tests/test_quality_control/test_relatedness.py b/tests/test_quality_control/test_relatedness.py
@@ -1,5 +1,5 @@
 import os
-from unittest.mock import patch, Mock
+from unittest.mock import patch
 from tests.test_quality_control.qc_tester import QCTester
 from analysis_driver.quality_control.relatedness import Relatedness, GenotypeGVCFs, Peddy, ParseRelatedness
 from analysis_driver.exceptions import PipelineError
@@ -144,19 +144,16 @@ def test_ped_file_content(self, pfams, pmem, pfam):
 
     @patch(ppath + 'Peddy.relationships')
     @patch(ppath + 'Peddy.relationship')
-    @patch(ppath + 'Peddy.gender_alias')
     @patch(ppath + 'clarity.get_user_sample_name')
-    @patch(ppath + 'clarity.get_sample')
-    def test_get_member_details(self, psample, pname, psex, prel, prels):
-        psample.return_value = Mock(udf={'Sex': 'F'})
+    @patch(ppath + 'clarity.get_sample_sex', side_effect=['F', 'M'])
+    def test_get_member_details(self, psex, pname, prel, prels):
         prels.return_value = {'Proband': {'Mother': 'test_sample1', 'Father': '0'},
                               'Mother': {'Mother': '0', 'Father': '0'},
                               'Father': {'Mother': '0', 'Father': '0'},
                               'Sister': {'Mother': 'test_sample1', 'Father': '0'},
                               'Brother': {'Mother': 'test_sample1', 'Father': '0'},
                               'Other': {'Mother': '0', 'Father': '0'}}
         prel.side_effect = ['Mother', 'Proband']
-        psex.side_effect = ['female', 'male']
         pname.side_effect = ['usersample1', 'usersample2', 'usersample1']
         all_families = {'FAM1': ['test_sample1', 'test_sample2'], 'FAM2': ['test_sample3']}
         assert self.p.get_member_details('FAM1', all_families) == [
@@ -171,7 +168,7 @@ def test_get_member_details(self, psample, pname, psex, prel, prels):
                               'Other': {'Mother': '0', 'Father': '0'}}
 
         prel.side_effect = ['Other', 'Other', 'Proband']
-        psex.side_effect = ['unknown', 'unknown', 'male']
+        psex.side_effect = ['unknown', 'unknown', 'M']
         pname.side_effect = ['usersample1', 'usersample2', 'usersample3']
         all_families = {'FAM1': ['test_sample1', 'test_sample2', 'test_sample3']}
         assert self.p.get_member_details('FAM1', all_families) == [['FAM1', 'usersample1', '0', '0', '0', '0'],

diff --git a/...quality_control/test_gender_validation.py → tests/test_quality_control/test_sex_check.py b/...quality_control/test_gender_validation.py → tests/test_quality_control/test_sex_check.py
@@ -1,20 +1,20 @@
 from unittest.mock import patch
-from analysis_driver.quality_control.gender_validation import GenderValidation
+from analysis_driver.quality_control.sex_validation import SexValidation
 from tests.test_quality_control.qc_tester import QCTester
 
 
-class TestGenderValidation(QCTester):
+class TestSexValdiation(QCTester):
     @patch('egcg_core.executor.execute')
     def test_run(self, mocked_execute):
-        validator = GenderValidation(dataset=self.dataset, vcf_file='path/to/test/vcf')
+        validator = SexValidation(dataset=self.dataset, vcf_file='path/to/test/vcf')
 
-        with patch('analysis_driver.quality_control.gender_validation.util.find_file', new=self.fake_find_file):
+        with patch('egcg_core.util.find_file', new=self.fake_find_file):
             validator._run()
 
             command = mocked_execute.call_args[0][0]
             assert command.startswith('cat')
             assert len(command.split(' | ')) == 5
 
-            validator = GenderValidation(dataset=self.dataset, vcf_file='path/to/test/vcf.gz')
+            validator = SexValidation(dataset=self.dataset, vcf_file='path/to/test/vcf.gz')
             validator._run()
             assert mocked_execute.call_args[0][0].startswith('zcat')