Skip to content

Commit

Permalink
DeepVariant 1 (generating pileups with pysam) (deepchem#4027)
Browse files Browse the repository at this point in the history
  • Loading branch information
KitVB authored Jul 1, 2024
1 parent 0355e3d commit c738d88
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 5 deletions.
15 changes: 13 additions & 2 deletions deepchem/data/data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -1997,6 +1997,8 @@ class BAMLoader(DataLoader):
Here, we extract Query Name, Query Sequence, Query Length, Reference Name,
Reference Start, CIGAR and Mapping Quality of each read in the BAM file.
This class provides methods to load and featurize data from BAM files.
Additionally, we can also get pileups from BAM files by setting
get_pileup=True.
Examples
--------
Expand All @@ -2012,13 +2014,19 @@ class BAMLoader(DataLoader):
or MacOS X. To use Pysam on Windows, use Windows Subsystem for Linux(WSL).
"""

def __init__(self, featurizer: Optional[Featurizer] = None):
def __init__(self,
featurizer: Optional[Featurizer] = None,
get_pileup: Optional[bool] = False):
"""Initialize BAMLoader.
Parameters
----------
featurizer: Featurizer (default: None)
The Featurizer to be used for the loaded BAM data.
get_pileup: bool, optional (default: False)
If True, the pileup of reads at each position is
returned, to be used in DeepVariant.
"""

# Set attributes
Expand All @@ -2030,7 +2038,10 @@ def __init__(self, featurizer: Optional[Featurizer] = None):
self.user_specified_features = featurizer.feature_fields
elif featurizer is None: # Default featurizer
from deepchem.feat import BAMFeaturizer
featurizer = BAMFeaturizer(max_records=None)
if get_pileup:
featurizer = BAMFeaturizer(max_records=None, get_pileup=True)
else:
featurizer = BAMFeaturizer(max_records=None)

# Set self.featurizer
self.featurizer = featurizer
Expand Down
11 changes: 11 additions & 0 deletions deepchem/data/tests/test_bam_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,17 @@ def test_bam_featurizer(self):

assert dataset.shape == (5, 7)

def test_bam_featurizer_with_pileup(self):
"""
Tests BAMFeaturizer with pileup generation.
"""
bam_featurizer = dc.feat.BAMFeaturizer(max_records=5, get_pileup=True)
bam_file_path = os.path.join(self.current_dir, "example.bam")
bamfile = pysam.AlignmentFile(bam_file_path, "rb")
dataset = bam_featurizer._featurize(bamfile)

assert dataset.shape == (4, 8)


if __name__ == "__main__":
unittest.main()
46 changes: 43 additions & 3 deletions deepchem/feat/bio_seq_featurizer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import numpy as np
from typing import Optional
from deepchem.feat import Featurizer


Expand Down Expand Up @@ -105,15 +106,31 @@ class BAMFeaturizer(Featurizer):
Sequence, Query Length, Reference Name, Reference Start, CIGAR and Mapping
Quality of the alignment in the BAM file.
This is the default featurizer used by BAMLoader, and it extracts the following
fields from each read in each BAM file in the given order:-
This is the default featurizer used by BAMLoader, and it extracts the
following fields from each read in each BAM file in the given order:-
- Column 0: Query Name
- Column 1: Query Sequence
- Column 2: Query Length
- Column 3: Reference Name
- Column 4: Reference Start
- Column 5: CIGAR
- Column 6: Mapping Quality
- Column 7: Pileup Information (if get_pileup=True)
Additionally, we can also get pileups from BAM files by setting
get_pileup=True.A pileup is a summary of the alignment of reads
at each position in a reference sequence. Specifically, it
provides information on the position on the reference genome,
the depth of coverage (i.e., the number of reads aligned to that
position), and the actual bases from the aligned reads at that
position, along with their quality scores. This data structure
is useful for identifying variations, such as single nucleotide
polymorphisms (SNPs), insertions, and deletions by comparing the
aligned reads to the reference genome. A pileup can be visualized
as a vertical stack of aligned sequences, showing how each read
matches or mismatches the reference at each position.
In DeepVariant, pileups are utilized during the initial stages to
select candidate windows for further analysis.
Examples
--------
Expand All @@ -132,7 +149,7 @@ class BAMFeaturizer(Featurizer):
"""

def __init__(self, max_records=None):
def __init__(self, max_records=None, get_pileup: Optional[bool] = False):
"""
Initialize BAMFeaturizer.
Expand All @@ -141,9 +158,13 @@ def __init__(self, max_records=None):
max_records : int or None, optional
The maximum number of records to extract from the BAM file. If None, all
records will be extracted.
get_pileup : bool, optional
If True, pileup information will be extracted from the BAM file.
This is used in DeepVariant. False by default.
"""
self.max_records = max_records
self.get_pileup = get_pileup

def _featurize(self, datapoint):
"""
Expand Down Expand Up @@ -176,6 +197,25 @@ def _featurize(self, datapoint):
record.mapping_quality,
]

if (self.get_pileup):
pileup_columns = []
for pileupcolumn in datapoint.pileup(
reference=record.reference_name,
start=record.reference_start,
end=record.reference_end):
pileup_info = {
"pos":
pileupcolumn.reference_pos,
"depth":
pileupcolumn.nsegments,
"reads": [
pileupread.alignment.query_sequence
for pileupread in pileupcolumn.pileups
]
}
pileup_columns.append(pileup_info)
feature_vector.append(pileup_columns)

features.append(feature_vector)
record_count += 1

Expand Down

0 comments on commit c738d88

Please sign in to comment.