From 61c9b6e842cc6cc9d3ffc55d7af0356761b9ed63 Mon Sep 17 00:00:00 2001
From: Murray Wham <murray.wham@ed.ac.uk>
Date: Fri, 22 Mar 2019 12:19:23 +0000
Subject: [PATCH 1/6] EdinburghGenomics/Reporting-App#205: Renaming
 GenderValidation -> SexCheck, removing duplicate aliasing, schema change.

---
 README.md                                     |  4 ++--
 analysis_driver/pipelines/bcbio.py            |  4 ++--
 analysis_driver/quality_control/__init__.py   |  2 +-
 .../quality_control/relatedness.py            | 12 ++---------
 .../{gender_validation.py => sex_check.py}    | 21 +++++++++++++------
 analysis_driver/report_generation/crawler.py  | 12 ++---------
 .../report_generation/sample_crawler.py       | 14 ++++++-------
 bin/run_qc.py                                 | 10 ++++-----
 etc/output_files.yaml                         |  2 +-
 integration_tests/integration_test.py         |  4 ++--
 .../expected_sample_crawler_data.json         |  8 ++++---
 tests/test_crawlers.py                        |  2 +-
 .../test_quality_control/test_relatedness.py  | 13 +++++-------
 ...gender_validation.py => test_sex_check.py} | 10 ++++-----
 14 files changed, 55 insertions(+), 63 deletions(-)
 rename analysis_driver/quality_control/{gender_validation.py => sex_check.py} (61%)
 rename tests/test_quality_control/{test_gender_validation.py => test_sex_check.py} (52%)

diff --git a/README.md b/README.md
index 32923c40..23cedc44 100644
--- a/README.md
+++ b/README.md
@@ -32,8 +32,8 @@ Classes that run checks on output files generated from the main pipeline.
 - GenotypeValidation - Uses bwa, samtools and gatk to validate called snps against a test dataset. Compares
   a sample's genotype with an expected, queries the LIMS for an expected genotype vcf, and writes a file containing the
   results of comparing the observed and expected vcfs
-- GenderValidation - Quantifies X-chromosome heterozygosity in BCBio's output haplotype vcf. Produces a file
-  containing the called gender, to be compared against the gender supplied in the Lims
+- SexCheck - Quantifies X-chromosome heterozygosity in BCBio's output haplotype vcf. Produces a file
+  containing the called sex, to be compared against that supplied in the Lims
 - FastqScreen - Checks fastqs for sample contamination using [fastqscreen](http://www.bioinformatics.babraham.ac.uk/projects/download.html#fastqscreen)
 - Blast - Checks a fastq file for contamination using NCBI Blast
 - VerifyBamID - Checks a Bam file for species contamination using [VerifyBamID](http://genome.sph.umich.edu/wiki/VerifyBamID)
diff --git a/analysis_driver/pipelines/bcbio.py b/analysis_driver/pipelines/bcbio.py
index f5398cea..f7d14fd0 100755
--- a/analysis_driver/pipelines/bcbio.py
+++ b/analysis_driver/pipelines/bcbio.py
@@ -130,11 +130,11 @@ def stage(cls, **params):
 
     bcbio_and_qc = [fix_unmapped, fastqc, contam_check, blast, geno_val]
 
-    gender_val = stage(qc.GenderValidation, vcf_file=bcbio.vcf_path, previous_stages=bcbio_and_qc),
+    sex_check = stage(qc.SexCheck, vcf_file=bcbio.vcf_path, previous_stages=bcbio_and_qc),
     vcfstats = stage(qc.VCFStats, vcf_file=bcbio.vcf_path, previous_stages=bcbio_and_qc),
     verify_bam_id = stage(qc.VerifyBamID, bam_file=bcbio.bam_path_fixed, previous_stages=bcbio_and_qc),
     samtools_depth = stage(qc.SamtoolsDepth, bam_file=bcbio.bam_path_fixed, previous_stages=bcbio_and_qc)
-    post_bcbio_qc = [gender_val, vcfstats, verify_bam_id, samtools_depth]
+    post_bcbio_qc = [sex_check, vcfstats, verify_bam_id, samtools_depth]
 
     output = stage(common.SampleDataOutput, previous_stages=post_bcbio_qc, output_fileset='bcbio')
     cleanup = stage(common.Cleanup, previous_stages=[output])
diff --git a/analysis_driver/quality_control/__init__.py b/analysis_driver/quality_control/__init__.py
index ebae42ea..d1424789 100644
--- a/analysis_driver/quality_control/__init__.py
+++ b/analysis_driver/quality_control/__init__.py
@@ -1,6 +1,6 @@
 from .genotype_validation import GenotypeValidation
 from .contamination_checks import FastqScreen, VerifyBamID, VCFStats, Blast
-from .gender_validation import GenderValidation
+from .sex_check import SexCheck
 from .median_coverage import SamtoolsDepth
 from .relatedness import Relatedness, Peddy, GenotypeGVCFs, ParseRelatedness
 from .bcl_validation import BCLValidator
diff --git a/analysis_driver/quality_control/relatedness.py b/analysis_driver/quality_control/relatedness.py
index d9c6d267..5a75f820 100644
--- a/analysis_driver/quality_control/relatedness.py
+++ b/analysis_driver/quality_control/relatedness.py
@@ -7,22 +7,14 @@
 from analysis_driver.util.bash_commands import java_command
 from analysis_driver.segmentation import Stage, Parameter, ListParameter
 from analysis_driver.exceptions import PipelineError
+from analysis_driver.quality_control.sex_check import sex_alias
 
 
 class RelatednessStage(Stage):
-    _gender_aliases = {'female': ['f', 'female', 'girl', 'woman'], 'male': ['m', 'male', 'boy', 'man']}
-
     @property
     def gatk_outfile(self):
         return os.path.join(self.job_dir, self.dataset.name + '_genotype_gvcfs.vcf')
 
-    @classmethod
-    def gender_alias(cls, gender):
-        for key in cls._gender_aliases:
-            if str(gender).lower() in cls._gender_aliases[key]:
-                return key
-        return 'unknown'
-
     @staticmethod
     def family_id(sample_id):
         return clarity.get_sample(sample_id).udf.get('Family ID') or 'No_ID'
@@ -245,7 +237,7 @@ def get_member_details(self, family, all_families):
         family_lines = []
         family_info = self.relationships(all_families[family])
         for member in all_families[family]:
-            sex = self.gender_alias(clarity.get_sample(member).udf.get('Sex'))
+            sex = sex_alias(clarity.get_sample_sex(member))
             sex_codes = {'male': '1', 'female': '2', 'unknown': '0'}
             relationship = self.relationship(member)
             member_id = clarity.get_user_sample_name(member)
diff --git a/analysis_driver/quality_control/gender_validation.py b/analysis_driver/quality_control/sex_check.py
similarity index 61%
rename from analysis_driver/quality_control/gender_validation.py
rename to analysis_driver/quality_control/sex_check.py
index 09e02ee8..b605ab78 100644
--- a/analysis_driver/quality_control/gender_validation.py
+++ b/analysis_driver/quality_control/sex_check.py
@@ -2,12 +2,21 @@
 from egcg_core import executor, util
 from analysis_driver.segmentation import Parameter, Stage
 
+_sex_aliases = {'female': ['f', 'female', 'girl', 'woman'], 'male': ['m', 'male', 'boy', 'man']}
 
-class GenderValidation(Stage):
+
+def sex_alias(sex):
+    for key in _sex_aliases:
+        if str(sex).lower() in _sex_aliases[key]:
+            return key
+    return 'unknown'
+
+
+class SexCheck(Stage):
     vcf_file = Parameter()
 
     def _run(self):
-        """Detect gender of the sample based on the %het on the X chromosome."""
+        """Detect sex of the sample based on the %het on the X chromosome."""
         name, ext = os.path.splitext(util.find_file(self.vcf_file))
         if ext == '.gz':
             file_opener = 'zcat'
@@ -15,21 +24,21 @@ def _run(self):
         else:
             file_opener = 'cat'
 
-        gender_call_file = name + '.sex'
+        sex_check_file = name + '.sex'
 
         command = util.str_join(
             '%s %s' % (file_opener, self.vcf_file),
             "grep -P '^chrX|^X'",
             "awk '{split($10,a,\":\"); count[a[1]]++; total++} END{for (g in count){print g\" \"count[g]/total}}'",
             "grep '0/1'",
-            "awk '{if ($2>.35){gender=\"FEMALE\"}else{if ($2<.15){gender=\"MALE\"}else{gender=\"UNKNOWN\"}} print gender, $2}'",
+            "awk '{if ($2>.35){sex=\"FEMALE\"}else{if ($2<.15){sex=\"MALE\"}else{sex=\"UNKNOWN\"}} print sex, $2}'",
             separator=' | '
-        ) + ' > ' + gender_call_file
+        ) + ' > ' + sex_check_file
         self.info(command)
 
         return executor.execute(
             command,
-            job_name='sex_detection',
+            job_name='sex_check',
             working_dir=self.job_dir,
             walltime=6,
             cpus=1,
diff --git a/analysis_driver/report_generation/crawler.py b/analysis_driver/report_generation/crawler.py
index 9c757d45..ef130a7a 100644
--- a/analysis_driver/report_generation/crawler.py
+++ b/analysis_driver/report_generation/crawler.py
@@ -1,25 +1,17 @@
 from egcg_core import clarity
 from egcg_core import constants as c
 from egcg_core.app_logging import AppLogger
+from analysis_driver.quality_control.sex_check import sex_alias
 
 
 class Crawler(AppLogger):
-    _gender_aliases = {'female': ['f', 'female', 'girl', 'woman'], 'male': ['m', 'male', 'boy', 'man']}
-
-    @classmethod
-    def gender_alias(cls, gender):
-        for key in cls._gender_aliases:
-            if str(gender).lower() in cls._gender_aliases[key]:
-                return key
-        return 'unknown'
-
     @classmethod
     def get_sample_information_from_lims(cls, sample_name):
         lims_sample = clarity.get_sample(sample_name)
         sample_info = {
             c.ELEMENT_SAMPLE_EXTERNAL_ID: clarity.get_user_sample_name(sample_name, lenient=True),
             c.ELEMENT_SAMPLE_PLATE: clarity.get_plate_id_and_well(sample_name)[0],  # returns [plate_id, well]
-            c.ELEMENT_PROVIDED_GENDER: cls.gender_alias(clarity.get_sample_gender(sample_name)),
+            c.ELEMENT_SEX_VALIDATION: {c.ELEMENT_PROVIDED_SEX: sex_alias(clarity.get_sample_sex(sample_name))},
             c.ELEMENT_SAMPLE_SPECIES: clarity.get_species_from_sample(sample_name)
         }
         if 'Yield for Quoted Coverage (Gb)' in lims_sample.udf:
diff --git a/analysis_driver/report_generation/sample_crawler.py b/analysis_driver/report_generation/sample_crawler.py
index fe8d7d26..49115f00 100755
--- a/analysis_driver/report_generation/sample_crawler.py
+++ b/analysis_driver/report_generation/sample_crawler.py
@@ -3,7 +3,7 @@
 from egcg_core.rest_communication import post_or_patch as pp
 from analysis_driver.reader import demultiplexing_parsers as dm, mapping_stats_parsers as mp
 from analysis_driver.config import output_file_config
-from .crawler import Crawler
+from .crawler import Crawler, sex_alias
 
 
 class SampleCrawler(Crawler):
@@ -58,12 +58,12 @@ def _populate_lib_info(self):
         else:
             self.critical('Missing *-sort-callable.bed')
 
-        sex_file_path = self.get_output_file('gender_call')
+        sex_file_path = self.get_output_file('sex_check')
         if sex_file_path:
             with open(sex_file_path) as f:
-                gender, het_x = f.read().strip().split()
-                sample[ELEMENT_CALLED_GENDER] = self.gender_alias(gender)
-                sample[ELEMENT_GENDER_VALIDATION] = {ELEMENT_GENDER_HETX: het_x}
+                sex, het_x = f.read().strip().split()
+                sample[ELEMENT_SEX_VALIDATION][ELEMENT_CALLED_SEX] = sex_alias(sex)
+                sample[ELEMENT_SEX_VALIDATION][ELEMENT_SEX_HETX] = het_x
 
         genotype_validation_path = self.get_output_file('genoval')
         if genotype_validation_path:
@@ -106,10 +106,10 @@ def _populate_lib_info(self):
             }
             sample[ELEMENT_COVERAGE_STATISTICS] = coverage_statistics
             sample[ELEMENT_MEDIAN_COVERAGE] = median
-            if ELEMENT_GENDER_VALIDATION in sample:
+            if ELEMENT_SEX_VALIDATION in sample:
                 cov_y = dm.get_coverage_y_chrom(coverage_statistics_path)
                 if cov_y:
-                    sample[ELEMENT_GENDER_VALIDATION][ELEMENT_GENDER_COVY] = cov_y
+                    sample[ELEMENT_SEX_VALIDATION][ELEMENT_SEX_COVY] = cov_y
         else:
             self.critical('coverage statistics unavailable for %s', self.sample_id)
 
diff --git a/bin/run_qc.py b/bin/run_qc.py
index bfecc795..8ffa6288 100644
--- a/bin/run_qc.py
+++ b/bin/run_qc.py
@@ -44,9 +44,9 @@ def _parse_args():
     sample_contamination.add_argument('--bam_file', required=True)
     sample_contamination.set_defaults(func=run_sample_contamination_check)
 
-    gender_val = subparsers.add_parser('gender_validation')
-    gender_val.add_argument('-v', '--vcf_file', dest='vcf_file', type=str, help='vcf file used to detect gender')
-    gender_val.set_defaults(func=run_gender_validation)
+    sex_check = subparsers.add_parser('sex_check')
+    sex_check.add_argument('-v', '--vcf_file', dest='vcf_file', type=str, help='vcf file used to detect sex')
+    sex_check.set_defaults(func=run_sex_check)
 
     median_cov = subparsers.add_parser('median_coverage')
     median_cov.add_argument('--bam_file', required=True, help='the fastq file pairs')
@@ -135,9 +135,9 @@ def run_sample_contamination_check(dataset, args):
     v.run()
 
 
-def run_gender_validation(dataset, args):
+def run_sex_check(dataset, args):
     os.makedirs(os.path.join(cfg['jobs_dir'], dataset.name), exist_ok=True)
-    g = qc.GenderValidation(dataset=dataset, vcf_file=args.vcf_file)
+    g = qc.SexCheck(dataset=dataset, vcf_file=args.vcf_file)
     g.run()
 
 
diff --git a/etc/output_files.yaml b/etc/output_files.yaml
index 3506c73c..ebae0816 100644
--- a/etc/output_files.yaml
+++ b/etc/output_files.yaml
@@ -101,7 +101,7 @@ bcbio:
         basename: '{sample_id}-chr22-vbi.selfSM'
         new_name: '{user_sample_id}-chr22-vbi.selfSM'
 
-    gender_call:
+    sex_check:
         location: ['samples_{sample_id}-merged', 'final', '*_{user_sample_id}']
         basename: '{user_sample_id}-joint-gatk-haplotype-joint.sex'
         new_name: '{user_sample_id}.sex'
diff --git a/integration_tests/integration_test.py b/integration_tests/integration_test.py
index 93fbb5e9..7d0e803c 100644
--- a/integration_tests/integration_test.py
+++ b/integration_tests/integration_test.py
@@ -15,7 +15,7 @@ class IntegrationTest(ReportingAppIntegrationTest):
         patch('egcg_core.clarity.find_project_name_from_sample', return_value='10015AT'),
         patch('egcg_core.clarity.get_plate_id_and_well', new=mocked_data.fake_get_plate_id_and_well),
         patch('egcg_core.clarity.get_project', return_value=mocked_data.mocked_clarity_project),
-        patch('egcg_core.clarity.get_sample_gender'),
+        patch('egcg_core.clarity.get_sample_sex'),
         patch('egcg_core.clarity.get_sample_genotype', return_value=set()),
         patch('egcg_core.clarity.get_sample_names_from_project', return_value=set()),
         patch('egcg_core.clarity.get_samples_arrived_with', return_value=set()),
@@ -289,7 +289,7 @@ def test_bcbio(self):
         )
 
         self.expect_stage_data(['mergefastqs', 'fastqc', 'genotypevalidation', 'bcbio', 'fastqscreen',
-                                'fixunmapped', 'blast', 'gendervalidation', 'vcfstats', 'samtoolsdepth',
+                                'fixunmapped', 'blast', 'sexcheck', 'vcfstats', 'samtoolsdepth',
                                 'verifybamid', 'sampledataoutput', 'md5sum', 'cleanup', 'samplereview'])
 
         self.expect_equal(
diff --git a/tests/assets/test_crawlers/expected_sample_crawler_data.json b/tests/assets/test_crawlers/expected_sample_crawler_data.json
index 7d19e10a..8b7448aa 100644
--- a/tests/assets/test_crawlers/expected_sample_crawler_data.json
+++ b/tests/assets/test_crawlers/expected_sample_crawler_data.json
@@ -8,8 +8,6 @@
     "project_id": "test_project",
     "bam_file_reads": 7928618,
     "mapped_reads": 7892452,
-    "called_gender": "male",
-    "provided_gender": "female",
     "species_name": "Homo sapiens",
     "species_contamination": {
         "contaminant_unique_mapped": {
@@ -25,7 +23,11 @@
         "total_reads_mapped": 100000
     },
     "sample_contamination": {"het_hom_ratio": 1.6, "ti_tv_ratio": 2.01},
-    "gender_validation": {"hetX": "0.10"},
+    "sex_validation": {
+        "called": "male",
+        "provided": "female",
+        "hetX": "0.10"
+    },
     "coverage": {
         "median": 478,
         "std_dev": 189.1911391390011,
diff --git a/tests/test_crawlers.py b/tests/test_crawlers.py
index 4af04728..f61f04e4 100644
--- a/tests/test_crawlers.py
+++ b/tests/test_crawlers.py
@@ -119,7 +119,7 @@ def setUp(self):
         self.expected_output = json.load(open(os.path.join(self.test_data, 'expected_sample_crawler_data.json')))
         patched_sample_info = patch(
             ppath + 'SampleCrawler.get_sample_information_from_lims',
-            return_value={'user_sample_id': 'test_sample', 'provided_gender': 'female', 'species_name': 'Homo sapiens'}
+            return_value={'user_sample_id': 'test_sample', 'sex_validation': {'provided': 'female'}, 'species_name': 'Homo sapiens'}
         )
         patched_user_sample_id = patch(ppath + 'sample_crawler.clarity.get_user_sample_name', return_value='test_sample')
         with patched_sample_info, patched_user_sample_id:
diff --git a/tests/test_quality_control/test_relatedness.py b/tests/test_quality_control/test_relatedness.py
index a38eb2ab..96884a08 100644
--- a/tests/test_quality_control/test_relatedness.py
+++ b/tests/test_quality_control/test_relatedness.py
@@ -1,5 +1,5 @@
 import os
-from unittest.mock import patch, Mock
+from unittest.mock import patch
 from tests.test_quality_control.qc_tester import QCTester
 from analysis_driver.quality_control.relatedness import Relatedness, GenotypeGVCFs, Peddy, ParseRelatedness
 from analysis_driver.exceptions import PipelineError
@@ -140,15 +140,13 @@ def test_ped_file_content(self, pfams, pmem, pfam):
         pfam.side_effect = ['FAM1', 'FAM1', 'No_ID']
         pmem.side_effect = [[['FAM1', '0', '0', '2', '0'], ['FAM1', '0', 'test_sample1', '1', '0']], [['No_ID', '0', '0', '1', '0']]]
         pfams.return_value = {'FAM1': ['test_sample1', 'test_sample2'], 'No_ID': ['test_sample3']}
-        assert self.p.ped_file_content == [['FAM1', '0', '0', '2', '0'], ['FAM1', '0', 'test_sample1', '1', '0'], ['No_ID', '0', '0', '1', '0']]
+        assert self.p.ped_fi1le_content == [['FAM1', '0', '0', '2', '0'], ['FAM1', '0', 'test_sample1', '1', '0'], ['No_ID', '0', '0', '1', '0']]
 
     @patch(ppath + 'Peddy.relationships')
     @patch(ppath + 'Peddy.relationship')
-    @patch(ppath + 'Peddy.gender_alias')
     @patch(ppath + 'clarity.get_user_sample_name')
-    @patch(ppath + 'clarity.get_sample')
-    def test_get_member_details(self, psample, pname, psex, prel, prels):
-        psample.return_value = Mock(udf={'Sex': 'F'})
+    @patch(ppath + 'clarity.get_sample_sex', side_effect=['F', 'M'])
+    def test_get_member_details(self, psex, pname, prel, prels):
         prels.return_value = {'Proband': {'Mother': 'test_sample1', 'Father': '0'},
                               'Mother': {'Mother': '0', 'Father': '0'},
                               'Father': {'Mother': '0', 'Father': '0'},
@@ -156,7 +154,6 @@ def test_get_member_details(self, psample, pname, psex, prel, prels):
                               'Brother': {'Mother': 'test_sample1', 'Father': '0'},
                               'Other': {'Mother': '0', 'Father': '0'}}
         prel.side_effect = ['Mother', 'Proband']
-        psex.side_effect = ['female', 'male']
         pname.side_effect = ['usersample1', 'usersample2', 'usersample1']
         all_families = {'FAM1': ['test_sample1', 'test_sample2'], 'FAM2': ['test_sample3']}
         assert self.p.get_member_details('FAM1', all_families) == [
@@ -171,7 +168,7 @@ def test_get_member_details(self, psample, pname, psex, prel, prels):
                               'Other': {'Mother': '0', 'Father': '0'}}
 
         prel.side_effect = ['Other', 'Other', 'Proband']
-        psex.side_effect = ['unknown', 'unknown', 'male']
+        psex.side_effect = ['unknown', 'unknown', 'M']
         pname.side_effect = ['usersample1', 'usersample2', 'usersample3']
         all_families = {'FAM1': ['test_sample1', 'test_sample2', 'test_sample3']}
         assert self.p.get_member_details('FAM1', all_families) == [['FAM1', 'usersample1', '0', '0', '0', '0'],
diff --git a/tests/test_quality_control/test_gender_validation.py b/tests/test_quality_control/test_sex_check.py
similarity index 52%
rename from tests/test_quality_control/test_gender_validation.py
rename to tests/test_quality_control/test_sex_check.py
index 31591171..6d5a0b8e 100644
--- a/tests/test_quality_control/test_gender_validation.py
+++ b/tests/test_quality_control/test_sex_check.py
@@ -1,20 +1,20 @@
 from unittest.mock import patch
-from analysis_driver.quality_control.gender_validation import GenderValidation
+from analysis_driver.quality_control.sex_check import SexCheck
 from tests.test_quality_control.qc_tester import QCTester
 
 
-class TestGenderValidation(QCTester):
+class TestSexCheck(QCTester):
     @patch('egcg_core.executor.execute')
     def test_run(self, mocked_execute):
-        validator = GenderValidation(dataset=self.dataset, vcf_file='path/to/test/vcf')
+        validator = SexCheck(dataset=self.dataset, vcf_file='path/to/test/vcf')
 
-        with patch('analysis_driver.quality_control.gender_validation.util.find_file', new=self.fake_find_file):
+        with patch('analysis_driver.quality_control.sex_check.util.find_file', new=self.fake_find_file):
             validator._run()
 
             command = mocked_execute.call_args[0][0]
             assert command.startswith('cat')
             assert len(command.split(' | ')) == 5
 
-            validator = GenderValidation(dataset=self.dataset, vcf_file='path/to/test/vcf.gz')
+            validator = SexCheck(dataset=self.dataset, vcf_file='path/to/test/vcf.gz')
             validator._run()
             assert mocked_execute.call_args[0][0].startswith('zcat')

From e54f6e9caacae6204954f8c350289c9ab2fe9656 Mon Sep 17 00:00:00 2001
From: Murray Wham <murray.wham@ed.ac.uk>
Date: Thu, 23 May 2019 15:51:10 +0100
Subject: [PATCH 2/6] SexCheck -> SexValidation

---
 README.md                                              |  2 +-
 analysis_driver/pipelines/bcbio.py                     |  4 ++--
 analysis_driver/quality_control/__init__.py            |  2 +-
 analysis_driver/quality_control/relatedness.py         |  2 +-
 .../{sex_check.py => sex_validation.py}                |  8 ++++----
 analysis_driver/report_generation/crawler.py           |  2 +-
 analysis_driver/report_generation/sample_crawler.py    |  2 +-
 bin/run_qc.py                                          | 10 +++++-----
 etc/output_files.yaml                                  |  2 +-
 tests/test_quality_control/test_relatedness.py         |  2 +-
 tests/test_quality_control/test_sex_check.py           | 10 +++++-----
 11 files changed, 23 insertions(+), 23 deletions(-)
 rename analysis_driver/quality_control/{sex_check.py => sex_validation.py} (91%)

diff --git a/README.md b/README.md
index 23cedc44..a1c84d20 100644
--- a/README.md
+++ b/README.md
@@ -32,7 +32,7 @@ Classes that run checks on output files generated from the main pipeline.
 - GenotypeValidation - Uses bwa, samtools and gatk to validate called snps against a test dataset. Compares
   a sample's genotype with an expected, queries the LIMS for an expected genotype vcf, and writes a file containing the
   results of comparing the observed and expected vcfs
-- SexCheck - Quantifies X-chromosome heterozygosity in BCBio's output haplotype vcf. Produces a file
+- SexValidation - Quantifies X-chromosome heterozygosity in BCBio's output haplotype vcf. Produces a file
   containing the called sex, to be compared against that supplied in the Lims
 - FastqScreen - Checks fastqs for sample contamination using [fastqscreen](http://www.bioinformatics.babraham.ac.uk/projects/download.html#fastqscreen)
 - Blast - Checks a fastq file for contamination using NCBI Blast
diff --git a/analysis_driver/pipelines/bcbio.py b/analysis_driver/pipelines/bcbio.py
index f7d14fd0..2743a10f 100755
--- a/analysis_driver/pipelines/bcbio.py
+++ b/analysis_driver/pipelines/bcbio.py
@@ -130,11 +130,11 @@ def stage(cls, **params):
 
     bcbio_and_qc = [fix_unmapped, fastqc, contam_check, blast, geno_val]
 
-    sex_check = stage(qc.SexCheck, vcf_file=bcbio.vcf_path, previous_stages=bcbio_and_qc),
+    sex_val = stage(qc.SexValidation, vcf_file=bcbio.vcf_path, previous_stages=bcbio_and_qc),
     vcfstats = stage(qc.VCFStats, vcf_file=bcbio.vcf_path, previous_stages=bcbio_and_qc),
     verify_bam_id = stage(qc.VerifyBamID, bam_file=bcbio.bam_path_fixed, previous_stages=bcbio_and_qc),
     samtools_depth = stage(qc.SamtoolsDepth, bam_file=bcbio.bam_path_fixed, previous_stages=bcbio_and_qc)
-    post_bcbio_qc = [sex_check, vcfstats, verify_bam_id, samtools_depth]
+    post_bcbio_qc = [sex_val, vcfstats, verify_bam_id, samtools_depth]
 
     output = stage(common.SampleDataOutput, previous_stages=post_bcbio_qc, output_fileset='bcbio')
     cleanup = stage(common.Cleanup, previous_stages=[output])
diff --git a/analysis_driver/quality_control/__init__.py b/analysis_driver/quality_control/__init__.py
index d1424789..90d83f6d 100644
--- a/analysis_driver/quality_control/__init__.py
+++ b/analysis_driver/quality_control/__init__.py
@@ -1,6 +1,6 @@
 from .genotype_validation import GenotypeValidation
 from .contamination_checks import FastqScreen, VerifyBamID, VCFStats, Blast
-from .sex_check import SexCheck
+from .sex_validation import SexValidation
 from .median_coverage import SamtoolsDepth
 from .relatedness import Relatedness, Peddy, GenotypeGVCFs, ParseRelatedness
 from .bcl_validation import BCLValidator
diff --git a/analysis_driver/quality_control/relatedness.py b/analysis_driver/quality_control/relatedness.py
index 5a75f820..cfbc6beb 100644
--- a/analysis_driver/quality_control/relatedness.py
+++ b/analysis_driver/quality_control/relatedness.py
@@ -7,7 +7,7 @@
 from analysis_driver.util.bash_commands import java_command
 from analysis_driver.segmentation import Stage, Parameter, ListParameter
 from analysis_driver.exceptions import PipelineError
-from analysis_driver.quality_control.sex_check import sex_alias
+from analysis_driver.quality_control.sex_validation import sex_alias
 
 
 class RelatednessStage(Stage):
diff --git a/analysis_driver/quality_control/sex_check.py b/analysis_driver/quality_control/sex_validation.py
similarity index 91%
rename from analysis_driver/quality_control/sex_check.py
rename to analysis_driver/quality_control/sex_validation.py
index b605ab78..af33a139 100644
--- a/analysis_driver/quality_control/sex_check.py
+++ b/analysis_driver/quality_control/sex_validation.py
@@ -12,7 +12,7 @@ def sex_alias(sex):
     return 'unknown'
 
 
-class SexCheck(Stage):
+class SexValidation(Stage):
     vcf_file = Parameter()
 
     def _run(self):
@@ -24,7 +24,7 @@ def _run(self):
         else:
             file_opener = 'cat'
 
-        sex_check_file = name + '.sex'
+        sex_file = name + '.sex'
 
         command = util.str_join(
             '%s %s' % (file_opener, self.vcf_file),
@@ -33,12 +33,12 @@ def _run(self):
             "grep '0/1'",
             "awk '{if ($2>.35){sex=\"FEMALE\"}else{if ($2<.15){sex=\"MALE\"}else{sex=\"UNKNOWN\"}} print sex, $2}'",
             separator=' | '
-        ) + ' > ' + sex_check_file
+        ) + ' > ' + sex_file
         self.info(command)
 
         return executor.execute(
             command,
-            job_name='sex_check',
+            job_name='sex_validation',
             working_dir=self.job_dir,
             walltime=6,
             cpus=1,
diff --git a/analysis_driver/report_generation/crawler.py b/analysis_driver/report_generation/crawler.py
index ef130a7a..1195d36f 100644
--- a/analysis_driver/report_generation/crawler.py
+++ b/analysis_driver/report_generation/crawler.py
@@ -1,7 +1,7 @@
 from egcg_core import clarity
 from egcg_core import constants as c
 from egcg_core.app_logging import AppLogger
-from analysis_driver.quality_control.sex_check import sex_alias
+from analysis_driver.quality_control.sex_validation import sex_alias
 
 
 class Crawler(AppLogger):
diff --git a/analysis_driver/report_generation/sample_crawler.py b/analysis_driver/report_generation/sample_crawler.py
index 49115f00..3c695e79 100755
--- a/analysis_driver/report_generation/sample_crawler.py
+++ b/analysis_driver/report_generation/sample_crawler.py
@@ -58,7 +58,7 @@ def _populate_lib_info(self):
         else:
             self.critical('Missing *-sort-callable.bed')
 
-        sex_file_path = self.get_output_file('sex_check')
+        sex_file_path = self.get_output_file('sex_validation')
         if sex_file_path:
             with open(sex_file_path) as f:
                 sex, het_x = f.read().strip().split()
diff --git a/bin/run_qc.py b/bin/run_qc.py
index 8ffa6288..ce7b3fc8 100644
--- a/bin/run_qc.py
+++ b/bin/run_qc.py
@@ -44,9 +44,9 @@ def _parse_args():
     sample_contamination.add_argument('--bam_file', required=True)
     sample_contamination.set_defaults(func=run_sample_contamination_check)
 
-    sex_check = subparsers.add_parser('sex_check')
-    sex_check.add_argument('-v', '--vcf_file', dest='vcf_file', type=str, help='vcf file used to detect sex')
-    sex_check.set_defaults(func=run_sex_check)
+    sex_validation = subparsers.add_parser('sex_validation')
+    sex_validation.add_argument('-v', '--vcf_file', dest='vcf_file', type=str, help='vcf file used to detect sex')
+    sex_validation.set_defaults(func=run_sex_validation)
 
     median_cov = subparsers.add_parser('median_coverage')
     median_cov.add_argument('--bam_file', required=True, help='the fastq file pairs')
@@ -135,9 +135,9 @@ def run_sample_contamination_check(dataset, args):
     v.run()
 
 
-def run_sex_check(dataset, args):
+def run_sex_validation(dataset, args):
     os.makedirs(os.path.join(cfg['jobs_dir'], dataset.name), exist_ok=True)
-    g = qc.SexCheck(dataset=dataset, vcf_file=args.vcf_file)
+    g = qc.SexValidation(dataset=dataset, vcf_file=args.vcf_file)
     g.run()
 
 
diff --git a/etc/output_files.yaml b/etc/output_files.yaml
index ebae0816..01020562 100644
--- a/etc/output_files.yaml
+++ b/etc/output_files.yaml
@@ -101,7 +101,7 @@ bcbio:
         basename: '{sample_id}-chr22-vbi.selfSM'
         new_name: '{user_sample_id}-chr22-vbi.selfSM'
 
-    sex_check:
+    sex_validation:
         location: ['samples_{sample_id}-merged', 'final', '*_{user_sample_id}']
         basename: '{user_sample_id}-joint-gatk-haplotype-joint.sex'
         new_name: '{user_sample_id}.sex'
diff --git a/tests/test_quality_control/test_relatedness.py b/tests/test_quality_control/test_relatedness.py
index 96884a08..274644ba 100644
--- a/tests/test_quality_control/test_relatedness.py
+++ b/tests/test_quality_control/test_relatedness.py
@@ -140,7 +140,7 @@ def test_ped_file_content(self, pfams, pmem, pfam):
         pfam.side_effect = ['FAM1', 'FAM1', 'No_ID']
         pmem.side_effect = [[['FAM1', '0', '0', '2', '0'], ['FAM1', '0', 'test_sample1', '1', '0']], [['No_ID', '0', '0', '1', '0']]]
         pfams.return_value = {'FAM1': ['test_sample1', 'test_sample2'], 'No_ID': ['test_sample3']}
-        assert self.p.ped_fi1le_content == [['FAM1', '0', '0', '2', '0'], ['FAM1', '0', 'test_sample1', '1', '0'], ['No_ID', '0', '0', '1', '0']]
+        assert self.p.ped_file_content == [['FAM1', '0', '0', '2', '0'], ['FAM1', '0', 'test_sample1', '1', '0'], ['No_ID', '0', '0', '1', '0']]
 
     @patch(ppath + 'Peddy.relationships')
     @patch(ppath + 'Peddy.relationship')
diff --git a/tests/test_quality_control/test_sex_check.py b/tests/test_quality_control/test_sex_check.py
index 6d5a0b8e..061dfb98 100644
--- a/tests/test_quality_control/test_sex_check.py
+++ b/tests/test_quality_control/test_sex_check.py
@@ -1,20 +1,20 @@
 from unittest.mock import patch
-from analysis_driver.quality_control.sex_check import SexCheck
+from analysis_driver.quality_control.sex_validation import SexValidation
 from tests.test_quality_control.qc_tester import QCTester
 
 
-class TestSexCheck(QCTester):
+class TestSexValidation(QCTester):
     @patch('egcg_core.executor.execute')
     def test_run(self, mocked_execute):
-        validator = SexCheck(dataset=self.dataset, vcf_file='path/to/test/vcf')
+        validator = SexValidation(dataset=self.dataset, vcf_file='path/to/test/vcf')
 
-        with patch('analysis_driver.quality_control.sex_check.util.find_file', new=self.fake_find_file):
+        with patch('egcg_core.util.find_file', new=self.fake_find_file):
             validator._run()
 
             command = mocked_execute.call_args[0][0]
             assert command.startswith('cat')
             assert len(command.split(' | ')) == 5
 
-            validator = SexCheck(dataset=self.dataset, vcf_file='path/to/test/vcf.gz')
+            validator = SexValidation(dataset=self.dataset, vcf_file='path/to/test/vcf.gz')
             validator._run()
             assert mocked_execute.call_args[0][0].startswith('zcat')

From c1815ebd17956b703d958e55962919d5dde20af5 Mon Sep 17 00:00:00 2001
From: Murray Wham <murray.wham@ed.ac.uk>
Date: Mon, 10 Jun 2019 15:26:16 +0100
Subject: [PATCH 3/6] Updating EGCG-Core

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 5272828d..7c43b501 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-EGCG-Core==0.9.1
+EGCG-Core==0.11
 luigi==2.8.0
 ete3==3.0.0b35
 pandas<0.21

From ef66e3dbd6b19f29f3540eae4b1260b89ee86e5d Mon Sep 17 00:00:00 2001
From: Murray Wham <murray.wham@ed.ac.uk>
Date: Wed, 19 Jun 2019 16:44:28 +0100
Subject: [PATCH 4/6] Fixing integration tests

---
 integration_tests/integration_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/integration_tests/integration_test.py b/integration_tests/integration_test.py
index 739277c7..dd9f07fe 100644
--- a/integration_tests/integration_test.py
+++ b/integration_tests/integration_test.py
@@ -334,7 +334,7 @@ def test_bcbio(self):
         )
 
         self.expect_stage_data(['mergefastqs', 'fastqc', 'genotypevalidation', 'bcbio', 'fastqscreen',
-                                'fixunmapped', 'blast', 'sexcheck', 'vcfstats', 'samtoolsdepth',
+                                'fixunmapped', 'blast', 'sexvalidation', 'vcfstats', 'samtoolsdepth',
                                 'verifybamid', 'sampledataoutput', 'md5sum', 'cleanup', 'samplereview'])
 
         ad_proc = rest_communication.get_document('analysis_driver_procs')
@@ -682,7 +682,7 @@ def test_gatk4_var_calling_human(self):
         self.expect_stage_data([
             'gathervcfvc', 'mergebamanddup', 'splitgenotypegvcfs', 'selectsnps', 'mergefastqs', 'cleanup',
             'splithaplotypecallervc', 'variantannotation', 'genotypevalidation', 'gatherbqsrreport',
-            'selectindels', 'verifybamid', 'gathergvcf', 'gendervalidation', 'fastqscreen', 'sampledataoutput',
+            'selectindels', 'verifybamid', 'gathergvcf', 'sexvalidation', 'fastqscreen', 'sampledataoutput',
             'gatherrecalbam', 'indelsfiltration', 'samtoolsdepth', 'scatterapplybqsr', 'samplereview', 'fastqindex',
             'scatterbaserecalibrator', 'merge_variants_hard_filter', 'blast', 'splitbwa', 'vcfstats', 'md5sum',
             'samtoolsstats', 'snpsfiltration'

From 72a297ad54e8b474e88931d2fb8fc19c30f4b7b7 Mon Sep 17 00:00:00 2001
From: tcezard <timothee.cezard@ed.ac.uk>
Date: Tue, 25 Jun 2019 11:05:16 +0100
Subject: [PATCH 5/6] Add support for extra space in the IDT barcode

---
 analysis_driver/dataset.py       | 4 ++--
 integration_tests/mocked_data.py | 2 +-
 tests/test_dataset.py            | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/analysis_driver/dataset.py b/analysis_driver/dataset.py
index 77270874..bc5304cb 100644
--- a/analysis_driver/dataset.py
+++ b/analysis_driver/dataset.py
@@ -402,8 +402,8 @@ def _run_elements_from_lims(self):
                     for pattern in (
                         # TruSeq label, e.g, A412-A208 (ATGCATGC-CTGACTGA)
                         '(\w{4})-(\w{4}) \(([ATCG]{8})-([ATCG]{8})\)',
-                        # IDT label, e.g, 001A IDT-ILMN TruSeq DNA-RNA UD 96 Indexes Plate_UDI0001 (ATGCATGC-CTGACTGA)
-                        '(\w{4}) IDT-ILMN TruSeq DNA-RNA UD 96 Indexes (Plate_\w{7}) \(([ATGC]{8})-([ATGC]{8})\)'
+                        # IDT label (any run of whitespace may precede 'Plate_'), e.g, 001A IDT-ILMN TruSeq DNA-RNA UD 96 Indexes  Plate_UDI0001 (ATGCATGC-CTGACTGA)
+                        '(\w{4}) IDT-ILMN TruSeq DNA-RNA UD 96 Indexes\s+(Plate_\w{7}) \(([ATGC]{8})-([ATGC]{8})\)'
                     ):
                         match = re.match(pattern, reagent_label)
                         if match:
diff --git a/integration_tests/mocked_data.py b/integration_tests/mocked_data.py
index 3111a8af..db39cb7e 100644
--- a/integration_tests/mocked_data.py
+++ b/integration_tests/mocked_data.py
@@ -22,7 +22,7 @@ class MockedSample(NamedMock):
                  samples=[MockedSample(real_name='10015AT0002', id='LP6002014-DTP_A02')]),
             Mock(reagent_labels=['D703-D502 (CGCTCATT-ATAGAGGC)'],
                  samples=[MockedSample(real_name='10015AT0003', id='LP6002014-DTP_A03')]),
-            Mock(reagent_labels=['001A IDT-ILMN TruSeq DNA-RNA UD 96 Indexes Plate_UDI0001 (GAGATTCC-ATAGAGGC)'],
+            Mock(reagent_labels=['001A IDT-ILMN TruSeq DNA-RNA UD 96 Indexes  Plate_UDI0001 (GAGATTCC-ATAGAGGC)'],
                  samples=[MockedSample(real_name='10015AT0004', id='LP6002014-DTP_A04')]),
             Mock(reagent_labels=['D705-D502 (ATTCAGAA-ATAGAGGC)'],
                  samples=[MockedSample(real_name='10015AT0006', id='LP6002014-DTP_A05')]),
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index 47b21047..ce5672b4 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -220,7 +220,7 @@ def test_pipeline_instruction(self):
 mocked_lane_artifact8 = NamedMock(real_name='art8', reagent_labels=['D706-D502 (GAATTCGG-ATAGAGGC)'], samples=[MockedSample(real_name='sample8', udf={})])
 mocked_idt_artifact = NamedMock(
     real_name='idt_art',
-    reagent_labels=['001A IDT-ILMN TruSeq DNA-RNA UD 96 Indexes Plate_UDI0001 (CCGCGGTT-AGCGCTAG)'],
+    reagent_labels=['001A IDT-ILMN TruSeq DNA-RNA UD 96 Indexes  Plate_UDI0001 (CCGCGGTT-AGCGCTAG)'],
     samples=[MockedSample(real_name='idt_sample')]
 )
 mocked_lane_artifact_pool = NamedMock(real_name='artpool', reagent_labels=[

From 6920bc10875d87e286961c358fa7210582bd5860 Mon Sep 17 00:00:00 2001
From: Murray Wham <murray.wham@ed.ac.uk>
Date: Mon, 1 Jul 2019 11:03:07 +0100
Subject: [PATCH 6/6] Fixing human gatk4 var calling

---
 analysis_driver/pipelines/human_variant_calling_gatk4.py | 4 ++--
 etc/output_files.yaml                                    | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/analysis_driver/pipelines/human_variant_calling_gatk4.py b/analysis_driver/pipelines/human_variant_calling_gatk4.py
index 0ecda1bd..edc4c2ad 100755
--- a/analysis_driver/pipelines/human_variant_calling_gatk4.py
+++ b/analysis_driver/pipelines/human_variant_calling_gatk4.py
@@ -198,11 +198,11 @@ def stage(cls, **params):
                      output_vcf_file=hard_filter_indels.hard_filtered_vcf,
                      previous_stages=[hard_filter_snps, hard_filter_indels])
 
-    gender_val = stage(qc.GenderValidation, vcf_file=merge_hf.hard_filtered_vcf, previous_stages=[merge_hf])
+    sex_val = stage(qc.SexValidation, vcf_file=merge_hf.hard_filtered_vcf, previous_stages=[merge_hf])
 
     vcfstats = stage(qc.VCFStats, vcf_file=merge_hf.hard_filtered_vcf, previous_stages=[merge_hf])
 
-    final_stages = [contam, blast, geno_val, gender_val, vcfstats, verify_bam_id, samtools_depth, samtools_stat,
+    final_stages = [contam, blast, geno_val, sex_val, vcfstats, verify_bam_id, samtools_depth, samtools_stat,
                     gather_gcvf]
 
     output = stage(common.SampleDataOutput, previous_stages=final_stages, output_fileset='gatk4_human_var_calling')
diff --git a/etc/output_files.yaml b/etc/output_files.yaml
index 5b4db0e8..b1a591ff 100644
--- a/etc/output_files.yaml
+++ b/etc/output_files.yaml
@@ -291,7 +291,7 @@ gatk4_human_var_calling:
         basename: '{sample_id}-chr22-vbi.selfSM'
         new_name: '{user_sample_id}-chr22-vbi.selfSM'
 
-    gender_call:
+    sex_validation:
         location: ['gatk4']
         basename: '{user_sample_id}_hard_filter.sex'
         new_name: '{user_sample_id}.sex'