Skip to content

Commit

Permalink
Merge pull request #409 from EdinburghGenomics/Release_v0.25.1
Browse files Browse the repository at this point in the history
Release v0.25.1
  • Loading branch information
mwhamgenomics authored Jul 10, 2019
2 parents ac2ba91 + e566f91 commit 0dd8cbe
Show file tree
Hide file tree
Showing 19 changed files with 63 additions and 71 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ Classes that run checks on output files generated from the main pipeline.
- GenotypeValidation - Uses bwa, samtools and gatk to validate called SNPs against a test dataset. Compares
a sample's genotype with an expected one: queries the LIMS for the expected genotype vcf, and writes a file containing
the results of comparing the observed and expected vcfs
- GenderValidation - Quantifies X-chromosome heterozygosity in BCBio's output haplotype vcf. Produces a file
containing the called gender, to be compared against the gender supplied in the Lims
- SexValidation - Quantifies X-chromosome heterozygosity in BCBio's output haplotype vcf. Produces a file
containing the called sex, to be compared against the sex supplied in the LIMS
- FastqScreen - Checks fastqs for sample contamination using [fastqscreen](http://www.bioinformatics.babraham.ac.uk/projects/download.html#fastqscreen)
- Blast - Checks a fastq file for contamination using NCBI Blast
- VerifyBamID - Checks a Bam file for species contamination using [VerifyBamID](http://genome.sph.umich.edu/wiki/VerifyBamID)
Expand Down
4 changes: 2 additions & 2 deletions analysis_driver/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,8 +402,8 @@ def _run_elements_from_lims(self):
for pattern in (
# TruSeq label, e.g, A412-A208 (ATGCATGC-CTGACTGA)
'(\w{4})-(\w{4}) \(([ATCG]{8})-([ATCG]{8})\)',
# IDT label, e.g, 001A IDT-ILMN TruSeq DNA-RNA UD 96 Indexes Plate_UDI0001 (ATGCATGC-CTGACTGA)
'(\w{4}) IDT-ILMN TruSeq DNA-RNA UD 96 Indexes (Plate_\w{7}) \(([ATGC]{8})-([ATGC]{8})\)'
# IDT label, e.g, 001A IDT-ILMN TruSeq DNA-RNA UD 96 Indexes Plate_UDI0001 (ATGCATGC-CTGACTGA)
'(\w{4}) IDT-ILMN TruSeq DNA-RNA UD 96 Indexes\s+(Plate_\w{7}) \(([ATGC]{8})-([ATGC]{8})\)'
):
match = re.match(pattern, reagent_label)
if match:
Expand Down
4 changes: 2 additions & 2 deletions analysis_driver/pipelines/bcbio.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,11 +130,11 @@ def stage(cls, **params):

bcbio_and_qc = [fix_unmapped, fastqc, contam_check, blast, geno_val]

gender_val = stage(qc.GenderValidation, vcf_file=bcbio.vcf_path, previous_stages=bcbio_and_qc),
sex_val = stage(qc.SexValidation, vcf_file=bcbio.vcf_path, previous_stages=bcbio_and_qc),
vcfstats = stage(qc.VCFStats, vcf_file=bcbio.vcf_path, previous_stages=bcbio_and_qc),
verify_bam_id = stage(qc.VerifyBamID, bam_file=bcbio.bam_path_fixed, previous_stages=bcbio_and_qc),
samtools_depth = stage(qc.SamtoolsDepth, bam_file=bcbio.bam_path_fixed, previous_stages=bcbio_and_qc)
post_bcbio_qc = [gender_val, vcfstats, verify_bam_id, samtools_depth]
post_bcbio_qc = [sex_val, vcfstats, verify_bam_id, samtools_depth]

output = stage(common.SampleDataOutput, previous_stages=post_bcbio_qc, output_fileset='bcbio')
cleanup = stage(common.Cleanup, previous_stages=[output])
Expand Down
4 changes: 2 additions & 2 deletions analysis_driver/pipelines/human_variant_calling_gatk4.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,11 +198,11 @@ def stage(cls, **params):
output_vcf_file=hard_filter_indels.hard_filtered_vcf,
previous_stages=[hard_filter_snps, hard_filter_indels])

gender_val = stage(qc.GenderValidation, vcf_file=merge_hf.hard_filtered_vcf, previous_stages=[merge_hf])
sex_val = stage(qc.SexValidation, vcf_file=merge_hf.hard_filtered_vcf, previous_stages=[merge_hf])

vcfstats = stage(qc.VCFStats, vcf_file=merge_hf.hard_filtered_vcf, previous_stages=[merge_hf])

final_stages = [contam, blast, geno_val, gender_val, vcfstats, verify_bam_id, samtools_depth, samtools_stat,
final_stages = [contam, blast, geno_val, sex_val, vcfstats, verify_bam_id, samtools_depth, samtools_stat,
gather_gcvf]

output = stage(common.SampleDataOutput, previous_stages=final_stages, output_fileset='gatk4_human_var_calling')
Expand Down
2 changes: 1 addition & 1 deletion analysis_driver/quality_control/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .genotype_validation import GenotypeValidation
from .contamination_checks import FastqScreen, VerifyBamID, VCFStats, Blast
from .gender_validation import GenderValidation
from .sex_validation import SexValidation
from .median_coverage import SamtoolsDepth
from .relatedness import Relatedness, Peddy, GenotypeGVCFs, ParseRelatedness
from .bcl_validation import BCLValidator
Expand Down
12 changes: 2 additions & 10 deletions analysis_driver/quality_control/relatedness.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,14 @@
from analysis_driver.util.bash_commands import java_command
from analysis_driver.segmentation import Stage, Parameter, ListParameter
from analysis_driver.exceptions import PipelineError
from analysis_driver.quality_control.sex_validation import sex_alias


class RelatednessStage(Stage):
_gender_aliases = {'female': ['f', 'female', 'girl', 'woman'], 'male': ['m', 'male', 'boy', 'man']}

@property
def gatk_outfile(self):
return os.path.join(self.job_dir, self.dataset.name + '_genotype_gvcfs.vcf')

@classmethod
def gender_alias(cls, gender):
for key in cls._gender_aliases:
if str(gender).lower() in cls._gender_aliases[key]:
return key
return 'unknown'

@staticmethod
def family_id(sample_id):
return clarity.get_sample(sample_id).udf.get('Family ID') or 'No_ID'
Expand Down Expand Up @@ -245,7 +237,7 @@ def get_member_details(self, family, all_families):
family_lines = []
family_info = self.relationships(all_families[family])
for member in all_families[family]:
sex = self.gender_alias(clarity.get_sample(member).udf.get('Sex'))
sex = sex_alias(clarity.get_sample_sex(member))
sex_codes = {'male': '1', 'female': '2', 'unknown': '0'}
relationship = self.relationship(member)
member_id = clarity.get_user_sample_name(member)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,34 +2,43 @@
from egcg_core import executor, util
from analysis_driver.segmentation import Parameter, Stage

_sex_aliases = {'female': ['f', 'female', 'girl', 'woman'], 'male': ['m', 'male', 'boy', 'man']}

class GenderValidation(Stage):

def sex_alias(sex):
    """
    Map a free-text sex value onto one of the canonical keys of `_sex_aliases`.

    The value is converted with `str()` and lower-cased before being looked up, so any
    object may be passed in. The first canonical key whose alias list contains the
    lower-cased value is returned; if no alias list contains it, 'unknown' is returned.

    :param sex: The value to normalise, e.g. 'F', 'Male' or None
    :return: A key of `_sex_aliases`, or 'unknown'
    """
    normalised = str(sex).lower()
    return next(
        (canonical for canonical, aliases in _sex_aliases.items() if normalised in aliases),
        'unknown'
    )


class SexValidation(Stage):
vcf_file = Parameter()

def _run(self):
"""Detect gender of the sample based on the %het on the X chromosome."""
"""Detect sex of the sample based on the %het on the X chromosome."""
name, ext = os.path.splitext(util.find_file(self.vcf_file))
if ext == '.gz':
file_opener = 'zcat'
name, gz = os.path.splitext(name)
else:
file_opener = 'cat'

gender_call_file = name + '.sex'
sex_file = name + '.sex'

command = util.str_join(
'%s %s' % (file_opener, self.vcf_file),
"grep -P '^chrX|^X'",
"awk '{split($10,a,\":\"); count[a[1]]++; total++} END{for (g in count){print g\" \"count[g]/total}}'",
"grep '0/1'",
"awk '{if ($2>.35){gender=\"FEMALE\"}else{if ($2<.15){gender=\"MALE\"}else{gender=\"UNKNOWN\"}} print gender, $2}'",
"awk '{if ($2>.35){sex=\"FEMALE\"}else{if ($2<.15){sex=\"MALE\"}else{sex=\"UNKNOWN\"}} print sex, $2}'",
separator=' | '
) + ' > ' + gender_call_file
) + ' > ' + sex_file
self.info(command)

return executor.execute(
command,
job_name='sex_detection',
job_name='sex_validation',
working_dir=self.job_dir,
walltime=6,
cpus=1,
Expand Down
12 changes: 2 additions & 10 deletions analysis_driver/report_generation/crawler.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,17 @@
from egcg_core import clarity
from egcg_core import constants as c
from egcg_core.app_logging import AppLogger
from analysis_driver.quality_control.sex_validation import sex_alias


class Crawler(AppLogger):
_gender_aliases = {'female': ['f', 'female', 'girl', 'woman'], 'male': ['m', 'male', 'boy', 'man']}

@classmethod
def gender_alias(cls, gender):
for key in cls._gender_aliases:
if str(gender).lower() in cls._gender_aliases[key]:
return key
return 'unknown'

@classmethod
def get_sample_information_from_lims(cls, sample_name):
lims_sample = clarity.get_sample(sample_name)
sample_info = {
c.ELEMENT_SAMPLE_EXTERNAL_ID: clarity.get_user_sample_name(sample_name, lenient=True),
c.ELEMENT_SAMPLE_PLATE: clarity.get_plate_id_and_well(sample_name)[0], # returns [plate_id, well]
c.ELEMENT_PROVIDED_GENDER: cls.gender_alias(clarity.get_sample_gender(sample_name)),
c.ELEMENT_SEX_VALIDATION: {c.ELEMENT_PROVIDED_SEX: sex_alias(clarity.get_sample_sex(sample_name))},
c.ELEMENT_SAMPLE_SPECIES: clarity.get_species_from_sample(sample_name)
}
if 'Yield for Quoted Coverage (Gb)' in lims_sample.udf:
Expand Down
14 changes: 7 additions & 7 deletions analysis_driver/report_generation/sample_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from egcg_core.rest_communication import post_or_patch as pp
from analysis_driver.reader import demultiplexing_parsers as dm, mapping_stats_parsers as mp
from analysis_driver.config import output_file_config
from .crawler import Crawler
from .crawler import Crawler, sex_alias


class SampleCrawler(Crawler):
Expand Down Expand Up @@ -58,12 +58,12 @@ def _populate_lib_info(self):
else:
self.critical('Missing *-sort-callable.bed')

sex_file_path = self.get_output_file('gender_call')
sex_file_path = self.get_output_file('sex_validation')
if sex_file_path:
with open(sex_file_path) as f:
gender, het_x = f.read().strip().split()
sample[ELEMENT_CALLED_GENDER] = self.gender_alias(gender)
sample[ELEMENT_GENDER_VALIDATION] = {ELEMENT_GENDER_HETX: het_x}
sex, het_x = f.read().strip().split()
sample[ELEMENT_SEX_VALIDATION][ELEMENT_CALLED_SEX] = sex_alias(sex)
sample[ELEMENT_SEX_VALIDATION][ELEMENT_SEX_HETX] = het_x

genotype_validation_path = self.get_output_file('genoval')
if genotype_validation_path:
Expand Down Expand Up @@ -106,10 +106,10 @@ def _populate_lib_info(self):
}
sample[ELEMENT_COVERAGE_STATISTICS] = coverage_statistics
sample[ELEMENT_MEDIAN_COVERAGE] = median
if ELEMENT_GENDER_VALIDATION in sample:
if ELEMENT_SEX_VALIDATION in sample:
cov_y = dm.get_coverage_y_chrom(coverage_statistics_path)
if cov_y:
sample[ELEMENT_GENDER_VALIDATION][ELEMENT_GENDER_COVY] = cov_y
sample[ELEMENT_SEX_VALIDATION][ELEMENT_SEX_COVY] = cov_y
else:
self.critical('coverage statistics unavailable for %s', self.sample_id)

Expand Down
10 changes: 5 additions & 5 deletions bin/run_qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,9 @@ def _parse_args():
sample_contamination.add_argument('--bam_file', required=True)
sample_contamination.set_defaults(func=run_sample_contamination_check)

gender_val = subparsers.add_parser('gender_validation')
gender_val.add_argument('-v', '--vcf_file', dest='vcf_file', type=str, help='vcf file used to detect gender')
gender_val.set_defaults(func=run_gender_validation)
sex_validation = subparsers.add_parser('sex_validation')
sex_validation.add_argument('-v', '--vcf_file', dest='vcf_file', type=str, help='vcf file used to detect sex')
sex_validation.set_defaults(func=run_sex_validation)

median_cov = subparsers.add_parser('median_coverage')
median_cov.add_argument('--bam_file', required=True, help='the fastq file pairs')
Expand Down Expand Up @@ -135,9 +135,9 @@ def run_sample_contamination_check(dataset, args):
v.run()


def run_gender_validation(dataset, args):
def run_sex_validation(dataset, args):
os.makedirs(os.path.join(cfg['jobs_dir'], dataset.name), exist_ok=True)
g = qc.GenderValidation(dataset=dataset, vcf_file=args.vcf_file)
g = qc.SexValidation(dataset=dataset, vcf_file=args.vcf_file)
g.run()


Expand Down
4 changes: 2 additions & 2 deletions etc/output_files.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ bcbio:
basename: '{sample_id}-chr22-vbi.selfSM'
new_name: '{user_sample_id}-chr22-vbi.selfSM'

gender_call:
sex_validation:
location: ['samples_{sample_id}-merged', 'final', '*_{user_sample_id}']
basename: '{user_sample_id}-joint-gatk-haplotype-joint.sex'
new_name: '{user_sample_id}.sex'
Expand Down Expand Up @@ -291,7 +291,7 @@ gatk4_human_var_calling:
basename: '{sample_id}-chr22-vbi.selfSM'
new_name: '{user_sample_id}-chr22-vbi.selfSM'

gender_call:
sex_validation:
location: ['gatk4']
basename: '{user_sample_id}_hard_filter.sex'
new_name: '{user_sample_id}.sex'
Expand Down
6 changes: 3 additions & 3 deletions integration_tests/integration_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class IntegrationTest(ReportingAppIntegrationTest):
patch('egcg_core.clarity.find_project_name_from_sample', return_value='10015AT'),
patch('egcg_core.clarity.get_plate_id_and_well', new=mocked_data.fake_get_plate_id_and_well),
patch('egcg_core.clarity.get_project', return_value=mocked_data.mocked_clarity_project),
patch('egcg_core.clarity.get_sample_gender'),
patch('egcg_core.clarity.get_sample_sex'),
patch('egcg_core.clarity.get_sample_genotype', return_value=set()),
patch('egcg_core.clarity.get_sample_names_from_project', return_value=set()),
patch('egcg_core.clarity.get_samples_arrived_with', return_value=set()),
Expand Down Expand Up @@ -334,7 +334,7 @@ def test_bcbio(self):
)

self.expect_stage_data(['mergefastqs', 'fastqc', 'genotypevalidation', 'bcbio', 'fastqscreen',
'fixunmapped', 'blast', 'gendervalidation', 'vcfstats', 'samtoolsdepth',
'fixunmapped', 'blast', 'sexvalidation', 'vcfstats', 'samtoolsdepth',
'verifybamid', 'sampledataoutput', 'md5sum', 'cleanup', 'samplereview'])

ad_proc = rest_communication.get_document('analysis_driver_procs')
Expand Down Expand Up @@ -682,7 +682,7 @@ def test_gatk4_var_calling_human(self):
self.expect_stage_data([
'gathervcfvc', 'mergebamanddup', 'splitgenotypegvcfs', 'selectsnps', 'mergefastqs', 'cleanup',
'splithaplotypecallervc', 'variantannotation', 'genotypevalidation', 'gatherbqsrreport',
'selectindels', 'verifybamid', 'gathergvcf', 'gendervalidation', 'fastqscreen', 'sampledataoutput',
'selectindels', 'verifybamid', 'gathergvcf', 'sexvalidation', 'fastqscreen', 'sampledataoutput',
'gatherrecalbam', 'indelsfiltration', 'samtoolsdepth', 'scatterapplybqsr', 'samplereview', 'fastqindex',
'scatterbaserecalibrator', 'merge_variants_hard_filter', 'blast', 'splitbwa', 'vcfstats', 'md5sum',
'samtoolsstats', 'snpsfiltration'
Expand Down
2 changes: 1 addition & 1 deletion integration_tests/mocked_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class MockedSample(NamedMock):
samples=[MockedSample(real_name='10015AT0002', id='LP6002014-DTP_A02')]),
Mock(reagent_labels=['D703-D502 (CGCTCATT-ATAGAGGC)'],
samples=[MockedSample(real_name='10015AT0003', id='LP6002014-DTP_A03')]),
Mock(reagent_labels=['001A IDT-ILMN TruSeq DNA-RNA UD 96 Indexes Plate_UDI0001 (GAGATTCC-ATAGAGGC)'],
Mock(reagent_labels=['001A IDT-ILMN TruSeq DNA-RNA UD 96 Indexes Plate_UDI0001 (GAGATTCC-ATAGAGGC)'],
samples=[MockedSample(real_name='10015AT0004', id='LP6002014-DTP_A04')]),
Mock(reagent_labels=['D705-D502 (ATTCAGAA-ATAGAGGC)'],
samples=[MockedSample(real_name='10015AT0006', id='LP6002014-DTP_A05')]),
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
EGCG-Core==0.10
EGCG-Core==0.11
luigi==2.8.0
ete3==3.0.0b35
pandas<0.21
Expand Down
8 changes: 5 additions & 3 deletions tests/assets/test_crawlers/expected_sample_crawler_data.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
"project_id": "test_project",
"bam_file_reads": 7928618,
"mapped_reads": 7892452,
"called_gender": "male",
"provided_gender": "female",
"species_name": "Homo sapiens",
"species_contamination": {
"contaminant_unique_mapped": {
Expand All @@ -25,7 +23,11 @@
"total_reads_mapped": 100000
},
"sample_contamination": {"het_hom_ratio": 1.6, "ti_tv_ratio": 2.01},
"gender_validation": {"hetX": "0.10"},
"sex_validation": {
"called": "male",
"provided": "female",
"hetX": "0.10"
},
"coverage": {
"median": 478,
"std_dev": 189.1911391390011,
Expand Down
2 changes: 1 addition & 1 deletion tests/test_crawlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ def setUp(self):
self.expected_output = json.load(open(os.path.join(self.test_data, 'expected_sample_crawler_data.json')))
patched_sample_info = patch(
ppath + 'SampleCrawler.get_sample_information_from_lims',
return_value={'user_sample_id': 'test_sample', 'provided_gender': 'female', 'species_name': 'Homo sapiens'}
return_value={'user_sample_id': 'test_sample', 'sex_validation': {'provided': 'female'}, 'species_name': 'Homo sapiens'}
)
patched_user_sample_id = patch(ppath + 'sample_crawler.clarity.get_user_sample_name', return_value='test_sample')
with patched_sample_info, patched_user_sample_id:
Expand Down
2 changes: 1 addition & 1 deletion tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ def test_pipeline_instruction(self):
mocked_lane_artifact8 = NamedMock(real_name='art8', reagent_labels=['D706-D502 (GAATTCGG-ATAGAGGC)'], samples=[MockedSample(real_name='sample8', udf={})])
mocked_idt_artifact = NamedMock(
real_name='idt_art',
reagent_labels=['001A IDT-ILMN TruSeq DNA-RNA UD 96 Indexes Plate_UDI0001 (CCGCGGTT-AGCGCTAG)'],
reagent_labels=['001A IDT-ILMN TruSeq DNA-RNA UD 96 Indexes Plate_UDI0001 (CCGCGGTT-AGCGCTAG)'],
samples=[MockedSample(real_name='idt_sample')]
)
mocked_lane_artifact_pool = NamedMock(real_name='artpool', reagent_labels=[
Expand Down
11 changes: 4 additions & 7 deletions tests/test_quality_control/test_relatedness.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os
from unittest.mock import patch, Mock
from unittest.mock import patch
from tests.test_quality_control.qc_tester import QCTester
from analysis_driver.quality_control.relatedness import Relatedness, GenotypeGVCFs, Peddy, ParseRelatedness
from analysis_driver.exceptions import PipelineError
Expand Down Expand Up @@ -144,19 +144,16 @@ def test_ped_file_content(self, pfams, pmem, pfam):

@patch(ppath + 'Peddy.relationships')
@patch(ppath + 'Peddy.relationship')
@patch(ppath + 'Peddy.gender_alias')
@patch(ppath + 'clarity.get_user_sample_name')
@patch(ppath + 'clarity.get_sample')
def test_get_member_details(self, psample, pname, psex, prel, prels):
psample.return_value = Mock(udf={'Sex': 'F'})
@patch(ppath + 'clarity.get_sample_sex', side_effect=['F', 'M'])
def test_get_member_details(self, psex, pname, prel, prels):
prels.return_value = {'Proband': {'Mother': 'test_sample1', 'Father': '0'},
'Mother': {'Mother': '0', 'Father': '0'},
'Father': {'Mother': '0', 'Father': '0'},
'Sister': {'Mother': 'test_sample1', 'Father': '0'},
'Brother': {'Mother': 'test_sample1', 'Father': '0'},
'Other': {'Mother': '0', 'Father': '0'}}
prel.side_effect = ['Mother', 'Proband']
psex.side_effect = ['female', 'male']
pname.side_effect = ['usersample1', 'usersample2', 'usersample1']
all_families = {'FAM1': ['test_sample1', 'test_sample2'], 'FAM2': ['test_sample3']}
assert self.p.get_member_details('FAM1', all_families) == [
Expand All @@ -171,7 +168,7 @@ def test_get_member_details(self, psample, pname, psex, prel, prels):
'Other': {'Mother': '0', 'Father': '0'}}

prel.side_effect = ['Other', 'Other', 'Proband']
psex.side_effect = ['unknown', 'unknown', 'male']
psex.side_effect = ['unknown', 'unknown', 'M']
pname.side_effect = ['usersample1', 'usersample2', 'usersample3']
all_families = {'FAM1': ['test_sample1', 'test_sample2', 'test_sample3']}
assert self.p.get_member_details('FAM1', all_families) == [['FAM1', 'usersample1', '0', '0', '0', '0'],
Expand Down
Loading

0 comments on commit 0dd8cbe

Please sign in to comment.