Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Renaming gender to sex #400

Merged
merged 6 commits into from
Jun 20, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ Classes that run checks on output files generated from the main pipeline.
- GenotypeValidation - Uses bwa, samtools and gatk to validate called snps against a test dataset. Compares
a sample's genotype with an expected, queries the LIMS for an expected genotype vcf, and writes a file containing the
results of comparing the observed and expected vcfs
- GenderValidation - Quantifies X-chromosome heterozygosity in BCBio's output haplotype vcf. Produces a file
containing the called gender, to be compared against the gender supplied in the Lims
- SexValidation - Quantifies X-chromosome heterozygosity in BCBio's output haplotype vcf. Produces a file
containing the called sex, to be compared against that supplied in the Lims
- FastqScreen - Checks fastqs for sample contamination using [fastqscreen](http://www.bioinformatics.babraham.ac.uk/projects/download.html#fastqscreen)
- Blast - Checks a fastq file for contamination using NCBI Blast
- VerifyBamID - Checks a Bam file for species contamination using [VerifyBamID](http://genome.sph.umich.edu/wiki/VerifyBamID)
Expand Down
4 changes: 2 additions & 2 deletions analysis_driver/pipelines/bcbio.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,11 +130,11 @@ def stage(cls, **params):

bcbio_and_qc = [fix_unmapped, fastqc, contam_check, blast, geno_val]

gender_val = stage(qc.GenderValidation, vcf_file=bcbio.vcf_path, previous_stages=bcbio_and_qc),
sex_val = stage(qc.SexValidation, vcf_file=bcbio.vcf_path, previous_stages=bcbio_and_qc),
vcfstats = stage(qc.VCFStats, vcf_file=bcbio.vcf_path, previous_stages=bcbio_and_qc),
verify_bam_id = stage(qc.VerifyBamID, bam_file=bcbio.bam_path_fixed, previous_stages=bcbio_and_qc),
samtools_depth = stage(qc.SamtoolsDepth, bam_file=bcbio.bam_path_fixed, previous_stages=bcbio_and_qc)
post_bcbio_qc = [gender_val, vcfstats, verify_bam_id, samtools_depth]
post_bcbio_qc = [sex_val, vcfstats, verify_bam_id, samtools_depth]

output = stage(common.SampleDataOutput, previous_stages=post_bcbio_qc, output_fileset='bcbio')
cleanup = stage(common.Cleanup, previous_stages=[output])
Expand Down
2 changes: 1 addition & 1 deletion analysis_driver/quality_control/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .genotype_validation import GenotypeValidation
from .contamination_checks import FastqScreen, VerifyBamID, VCFStats, Blast
from .gender_validation import GenderValidation
from .sex_validation import SexValidation
from .median_coverage import SamtoolsDepth
from .relatedness import Relatedness, Peddy, GenotypeGVCFs, ParseRelatedness
from .bcl_validation import BCLValidator
Expand Down
12 changes: 2 additions & 10 deletions analysis_driver/quality_control/relatedness.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,14 @@
from analysis_driver.util.bash_commands import java_command
from analysis_driver.segmentation import Stage, Parameter, ListParameter
from analysis_driver.exceptions import PipelineError
from analysis_driver.quality_control.sex_validation import sex_alias


class RelatednessStage(Stage):
_gender_aliases = {'female': ['f', 'female', 'girl', 'woman'], 'male': ['m', 'male', 'boy', 'man']}

@property
def gatk_outfile(self):
return os.path.join(self.job_dir, self.dataset.name + '_genotype_gvcfs.vcf')

@classmethod
def gender_alias(cls, gender):
for key in cls._gender_aliases:
if str(gender).lower() in cls._gender_aliases[key]:
return key
return 'unknown'

@staticmethod
def family_id(sample_id):
return clarity.get_sample(sample_id).udf.get('Family ID') or 'No_ID'
Expand Down Expand Up @@ -245,7 +237,7 @@ def get_member_details(self, family, all_families):
family_lines = []
family_info = self.relationships(all_families[family])
for member in all_families[family]:
sex = self.gender_alias(clarity.get_sample(member).udf.get('Sex'))
sex = sex_alias(clarity.get_sample_sex(member))
sex_codes = {'male': '1', 'female': '2', 'unknown': '0'}
relationship = self.relationship(member)
member_id = clarity.get_user_sample_name(member)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,34 +2,43 @@
from egcg_core import executor, util
from analysis_driver.segmentation import Parameter, Stage

_sex_aliases = {'female': ['f', 'female', 'girl', 'woman'], 'male': ['m', 'male', 'boy', 'man']}

class GenderValidation(Stage):

def sex_alias(sex):
for key in _sex_aliases:
if str(sex).lower() in _sex_aliases[key]:
return key
return 'unknown'


class SexValidation(Stage):
vcf_file = Parameter()

def _run(self):
"""Detect gender of the sample based on the %het on the X chromosome."""
"""Detect sex of the sample based on the %het on the X chromosome."""
name, ext = os.path.splitext(util.find_file(self.vcf_file))
if ext == '.gz':
file_opener = 'zcat'
name, gz = os.path.splitext(name)
else:
file_opener = 'cat'

gender_call_file = name + '.sex'
sex_file = name + '.sex'

command = util.str_join(
'%s %s' % (file_opener, self.vcf_file),
"grep -P '^chrX|^X'",
"awk '{split($10,a,\":\"); count[a[1]]++; total++} END{for (g in count){print g\" \"count[g]/total}}'",
"grep '0/1'",
"awk '{if ($2>.35){gender=\"FEMALE\"}else{if ($2<.15){gender=\"MALE\"}else{gender=\"UNKNOWN\"}} print gender, $2}'",
"awk '{if ($2>.35){sex=\"FEMALE\"}else{if ($2<.15){sex=\"MALE\"}else{sex=\"UNKNOWN\"}} print sex, $2}'",
separator=' | '
) + ' > ' + gender_call_file
) + ' > ' + sex_file
self.info(command)

return executor.execute(
command,
job_name='sex_detection',
job_name='sex_validation',
working_dir=self.job_dir,
walltime=6,
cpus=1,
Expand Down
12 changes: 2 additions & 10 deletions analysis_driver/report_generation/crawler.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,17 @@
from egcg_core import clarity
from egcg_core import constants as c
from egcg_core.app_logging import AppLogger
from analysis_driver.quality_control.sex_validation import sex_alias


class Crawler(AppLogger):
_gender_aliases = {'female': ['f', 'female', 'girl', 'woman'], 'male': ['m', 'male', 'boy', 'man']}

@classmethod
def gender_alias(cls, gender):
for key in cls._gender_aliases:
if str(gender).lower() in cls._gender_aliases[key]:
return key
return 'unknown'

@classmethod
def get_sample_information_from_lims(cls, sample_name):
lims_sample = clarity.get_sample(sample_name)
sample_info = {
c.ELEMENT_SAMPLE_EXTERNAL_ID: clarity.get_user_sample_name(sample_name, lenient=True),
c.ELEMENT_SAMPLE_PLATE: clarity.get_plate_id_and_well(sample_name)[0], # returns [plate_id, well]
c.ELEMENT_PROVIDED_GENDER: cls.gender_alias(clarity.get_sample_gender(sample_name)),
c.ELEMENT_SEX_VALIDATION: {c.ELEMENT_PROVIDED_SEX: sex_alias(clarity.get_sample_sex(sample_name))},
c.ELEMENT_SAMPLE_SPECIES: clarity.get_species_from_sample(sample_name)
}
if 'Yield for Quoted Coverage (Gb)' in lims_sample.udf:
Expand Down
14 changes: 7 additions & 7 deletions analysis_driver/report_generation/sample_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from egcg_core.rest_communication import post_or_patch as pp
from analysis_driver.reader import demultiplexing_parsers as dm, mapping_stats_parsers as mp
from analysis_driver.config import output_file_config
from .crawler import Crawler
from .crawler import Crawler, sex_alias


class SampleCrawler(Crawler):
Expand Down Expand Up @@ -58,12 +58,12 @@ def _populate_lib_info(self):
else:
self.critical('Missing *-sort-callable.bed')

sex_file_path = self.get_output_file('gender_call')
sex_file_path = self.get_output_file('sex_validation')
if sex_file_path:
with open(sex_file_path) as f:
gender, het_x = f.read().strip().split()
sample[ELEMENT_CALLED_GENDER] = self.gender_alias(gender)
sample[ELEMENT_GENDER_VALIDATION] = {ELEMENT_GENDER_HETX: het_x}
sex, het_x = f.read().strip().split()
sample[ELEMENT_SEX_VALIDATION][ELEMENT_CALLED_SEX] = sex_alias(sex)
sample[ELEMENT_SEX_VALIDATION][ELEMENT_SEX_HETX] = het_x

genotype_validation_path = self.get_output_file('genoval')
if genotype_validation_path:
Expand Down Expand Up @@ -106,10 +106,10 @@ def _populate_lib_info(self):
}
sample[ELEMENT_COVERAGE_STATISTICS] = coverage_statistics
sample[ELEMENT_MEDIAN_COVERAGE] = median
if ELEMENT_GENDER_VALIDATION in sample:
if ELEMENT_SEX_VALIDATION in sample:
cov_y = dm.get_coverage_y_chrom(coverage_statistics_path)
if cov_y:
sample[ELEMENT_GENDER_VALIDATION][ELEMENT_GENDER_COVY] = cov_y
sample[ELEMENT_SEX_VALIDATION][ELEMENT_SEX_COVY] = cov_y
else:
self.critical('coverage statistics unavailable for %s', self.sample_id)

Expand Down
10 changes: 5 additions & 5 deletions bin/run_qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,9 @@ def _parse_args():
sample_contamination.add_argument('--bam_file', required=True)
sample_contamination.set_defaults(func=run_sample_contamination_check)

gender_val = subparsers.add_parser('gender_validation')
gender_val.add_argument('-v', '--vcf_file', dest='vcf_file', type=str, help='vcf file used to detect gender')
gender_val.set_defaults(func=run_gender_validation)
sex_validation = subparsers.add_parser('sex_validation')
sex_validation.add_argument('-v', '--vcf_file', dest='vcf_file', type=str, help='vcf file used to detect sex')
sex_validation.set_defaults(func=run_sex_validation)

median_cov = subparsers.add_parser('median_coverage')
median_cov.add_argument('--bam_file', required=True, help='the fastq file pairs')
Expand Down Expand Up @@ -135,9 +135,9 @@ def run_sample_contamination_check(dataset, args):
v.run()


def run_gender_validation(dataset, args):
def run_sex_validation(dataset, args):
os.makedirs(os.path.join(cfg['jobs_dir'], dataset.name), exist_ok=True)
g = qc.GenderValidation(dataset=dataset, vcf_file=args.vcf_file)
g = qc.SexValidation(dataset=dataset, vcf_file=args.vcf_file)
g.run()


Expand Down
2 changes: 1 addition & 1 deletion etc/output_files.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ bcbio:
basename: '{sample_id}-chr22-vbi.selfSM'
new_name: '{user_sample_id}-chr22-vbi.selfSM'

gender_call:
sex_validation:
location: ['samples_{sample_id}-merged', 'final', '*_{user_sample_id}']
basename: '{user_sample_id}-joint-gatk-haplotype-joint.sex'
new_name: '{user_sample_id}.sex'
Expand Down
6 changes: 3 additions & 3 deletions integration_tests/integration_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class IntegrationTest(ReportingAppIntegrationTest):
patch('egcg_core.clarity.find_project_name_from_sample', return_value='10015AT'),
patch('egcg_core.clarity.get_plate_id_and_well', new=mocked_data.fake_get_plate_id_and_well),
patch('egcg_core.clarity.get_project', return_value=mocked_data.mocked_clarity_project),
patch('egcg_core.clarity.get_sample_gender'),
patch('egcg_core.clarity.get_sample_sex'),
patch('egcg_core.clarity.get_sample_genotype', return_value=set()),
patch('egcg_core.clarity.get_sample_names_from_project', return_value=set()),
patch('egcg_core.clarity.get_samples_arrived_with', return_value=set()),
Expand Down Expand Up @@ -334,7 +334,7 @@ def test_bcbio(self):
)

self.expect_stage_data(['mergefastqs', 'fastqc', 'genotypevalidation', 'bcbio', 'fastqscreen',
'fixunmapped', 'blast', 'gendervalidation', 'vcfstats', 'samtoolsdepth',
'fixunmapped', 'blast', 'sexvalidation', 'vcfstats', 'samtoolsdepth',
'verifybamid', 'sampledataoutput', 'md5sum', 'cleanup', 'samplereview'])

ad_proc = rest_communication.get_document('analysis_driver_procs')
Expand Down Expand Up @@ -682,7 +682,7 @@ def test_gatk4_var_calling_human(self):
self.expect_stage_data([
'gathervcfvc', 'mergebamanddup', 'splitgenotypegvcfs', 'selectsnps', 'mergefastqs', 'cleanup',
'splithaplotypecallervc', 'variantannotation', 'genotypevalidation', 'gatherbqsrreport',
'selectindels', 'verifybamid', 'gathergvcf', 'gendervalidation', 'fastqscreen', 'sampledataoutput',
'selectindels', 'verifybamid', 'gathergvcf', 'sexvalidation', 'fastqscreen', 'sampledataoutput',
'gatherrecalbam', 'indelsfiltration', 'samtoolsdepth', 'scatterapplybqsr', 'samplereview', 'fastqindex',
'scatterbaserecalibrator', 'merge_variants_hard_filter', 'blast', 'splitbwa', 'vcfstats', 'md5sum',
'samtoolsstats', 'snpsfiltration'
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
EGCG-Core==0.10
EGCG-Core==0.11
luigi==2.8.0
ete3==3.0.0b35
pandas<0.21
Expand Down
8 changes: 5 additions & 3 deletions tests/assets/test_crawlers/expected_sample_crawler_data.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
"project_id": "test_project",
"bam_file_reads": 7928618,
"mapped_reads": 7892452,
"called_gender": "male",
"provided_gender": "female",
"species_name": "Homo sapiens",
"species_contamination": {
"contaminant_unique_mapped": {
Expand All @@ -25,7 +23,11 @@
"total_reads_mapped": 100000
},
"sample_contamination": {"het_hom_ratio": 1.6, "ti_tv_ratio": 2.01},
"gender_validation": {"hetX": "0.10"},
"sex_validation": {
"called": "male",
"provided": "female",
"hetX": "0.10"
},
"coverage": {
"median": 478,
"std_dev": 189.1911391390011,
Expand Down
2 changes: 1 addition & 1 deletion tests/test_crawlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ def setUp(self):
self.expected_output = json.load(open(os.path.join(self.test_data, 'expected_sample_crawler_data.json')))
patched_sample_info = patch(
ppath + 'SampleCrawler.get_sample_information_from_lims',
return_value={'user_sample_id': 'test_sample', 'provided_gender': 'female', 'species_name': 'Homo sapiens'}
return_value={'user_sample_id': 'test_sample', 'sex_validation': {'provided': 'female'}, 'species_name': 'Homo sapiens'}
)
patched_user_sample_id = patch(ppath + 'sample_crawler.clarity.get_user_sample_name', return_value='test_sample')
with patched_sample_info, patched_user_sample_id:
Expand Down
11 changes: 4 additions & 7 deletions tests/test_quality_control/test_relatedness.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os
from unittest.mock import patch, Mock
from unittest.mock import patch
from tests.test_quality_control.qc_tester import QCTester
from analysis_driver.quality_control.relatedness import Relatedness, GenotypeGVCFs, Peddy, ParseRelatedness
from analysis_driver.exceptions import PipelineError
Expand Down Expand Up @@ -144,19 +144,16 @@ def test_ped_file_content(self, pfams, pmem, pfam):

@patch(ppath + 'Peddy.relationships')
@patch(ppath + 'Peddy.relationship')
@patch(ppath + 'Peddy.gender_alias')
@patch(ppath + 'clarity.get_user_sample_name')
@patch(ppath + 'clarity.get_sample')
def test_get_member_details(self, psample, pname, psex, prel, prels):
psample.return_value = Mock(udf={'Sex': 'F'})
@patch(ppath + 'clarity.get_sample_sex', side_effect=['F', 'M'])
def test_get_member_details(self, psex, pname, prel, prels):
prels.return_value = {'Proband': {'Mother': 'test_sample1', 'Father': '0'},
'Mother': {'Mother': '0', 'Father': '0'},
'Father': {'Mother': '0', 'Father': '0'},
'Sister': {'Mother': 'test_sample1', 'Father': '0'},
'Brother': {'Mother': 'test_sample1', 'Father': '0'},
'Other': {'Mother': '0', 'Father': '0'}}
prel.side_effect = ['Mother', 'Proband']
psex.side_effect = ['female', 'male']
pname.side_effect = ['usersample1', 'usersample2', 'usersample1']
all_families = {'FAM1': ['test_sample1', 'test_sample2'], 'FAM2': ['test_sample3']}
assert self.p.get_member_details('FAM1', all_families) == [
Expand All @@ -171,7 +168,7 @@ def test_get_member_details(self, psample, pname, psex, prel, prels):
'Other': {'Mother': '0', 'Father': '0'}}

prel.side_effect = ['Other', 'Other', 'Proband']
psex.side_effect = ['unknown', 'unknown', 'male']
psex.side_effect = ['unknown', 'unknown', 'M']
pname.side_effect = ['usersample1', 'usersample2', 'usersample3']
all_families = {'FAM1': ['test_sample1', 'test_sample2', 'test_sample3']}
assert self.p.get_member_details('FAM1', all_families) == [['FAM1', 'usersample1', '0', '0', '0', '0'],
Expand Down
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
from unittest.mock import patch
from analysis_driver.quality_control.gender_validation import GenderValidation
from analysis_driver.quality_control.sex_validation import SexValidation
from tests.test_quality_control.qc_tester import QCTester


class TestGenderValidation(QCTester):
class TestSexValdiation(QCTester):
@patch('egcg_core.executor.execute')
def test_run(self, mocked_execute):
validator = GenderValidation(dataset=self.dataset, vcf_file='path/to/test/vcf')
validator = SexValidation(dataset=self.dataset, vcf_file='path/to/test/vcf')

with patch('analysis_driver.quality_control.gender_validation.util.find_file', new=self.fake_find_file):
with patch('egcg_core.util.find_file', new=self.fake_find_file):
validator._run()

command = mocked_execute.call_args[0][0]
assert command.startswith('cat')
assert len(command.split(' | ')) == 5

validator = GenderValidation(dataset=self.dataset, vcf_file='path/to/test/vcf.gz')
validator = SexValidation(dataset=self.dataset, vcf_file='path/to/test/vcf.gz')
validator._run()
assert mocked_execute.call_args[0][0].startswith('zcat')