DeepVariant 1 (generating pileups with pysam) (deepchem#4027)

GreatRSingh · Jul 1, 2024 · c738d88 · c738d88
1 parent 0355e3d
commit c738d88
Show file tree

Hide file tree

Showing 3 changed files with 67 additions and 5 deletions.
diff --git a/deepchem/data/data_loader.py b/deepchem/data/data_loader.py
@@ -1997,6 +1997,8 @@ class BAMLoader(DataLoader):
     Here, we extract Query Name, Query Sequence, Query Length, Reference Name,
     Reference Start, CIGAR and Mapping Quality of each read in the BAM file.
     This class provides methods to load and featurize data from BAM files.
+    Additionally, we can also get pileups from BAM files by setting
+    get_pileup=True.
 
     Examples
     --------
@@ -2012,13 +2014,19 @@ class BAMLoader(DataLoader):
     or MacOS X. To use Pysam on Windows, use Windows Subsystem for Linux(WSL).
     """
 
-    def __init__(self, featurizer: Optional[Featurizer] = None):
+    def __init__(self,
+                 featurizer: Optional[Featurizer] = None,
+                 get_pileup: Optional[bool] = False):
         """Initialize BAMLoader.
 
         Parameters
         ----------
         featurizer: Featurizer (default: None)
             The Featurizer to be used for the loaded BAM data.
+        get_pileup: bool, optional (default: False)
+            If True, the pileup of reads at each position is
+            returned, to be used in DeepVariant.
+        
        """
 
         # Set attributes
@@ -2030,7 +2038,10 @@ def __init__(self, featurizer: Optional[Featurizer] = None):
             self.user_specified_features = featurizer.feature_fields
         elif featurizer is None:  # Default featurizer
             from deepchem.feat import BAMFeaturizer
-            featurizer = BAMFeaturizer(max_records=None)
+            if get_pileup:
+                featurizer = BAMFeaturizer(max_records=None, get_pileup=True)
+            else:
+                featurizer = BAMFeaturizer(max_records=None)
 
         # Set self.featurizer
         self.featurizer = featurizer

diff --git a/deepchem/data/tests/test_bam_loader.py b/deepchem/data/tests/test_bam_loader.py
@@ -56,6 +56,17 @@ def test_bam_featurizer(self):
 
         assert dataset.shape == (5, 7)
 
+    def test_bam_featurizer_with_pileup(self):
+        """
+        Tests BAMFeaturizer with pileup generation.
+        """
+        bam_featurizer = dc.feat.BAMFeaturizer(max_records=5, get_pileup=True)
+        bam_file_path = os.path.join(self.current_dir, "example.bam")
+        bamfile = pysam.AlignmentFile(bam_file_path, "rb")
+        dataset = bam_featurizer._featurize(bamfile)
+
+        assert dataset.shape == (4, 8)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/deepchem/feat/bio_seq_featurizer.py b/deepchem/feat/bio_seq_featurizer.py
@@ -1,4 +1,5 @@
 import numpy as np
+from typing import Optional
 from deepchem.feat import Featurizer
 
 
@@ -105,15 +106,31 @@ class BAMFeaturizer(Featurizer):
     Sequence, Query Length, Reference Name, Reference Start, CIGAR and Mapping
     Quality of the alignment in the BAM file.
 
-    This is the default featurizer used by BAMLoader, and it extracts the following
-    fields from each read in each BAM file in the given order:-
+    This is the default featurizer used by BAMLoader, and it extracts the
+    following fields from each read in each BAM file in the given order:-
     - Column 0: Query Name
     - Column 1: Query Sequence
     - Column 2: Query Length
     - Column 3: Reference Name
     - Column 4: Reference Start
     - Column 5: CIGAR
     - Column 6: Mapping Quality
+    - Column 7: Pileup Information (if get_pileup=True)
+
+    Additionally, we can also get pileups from BAM files by setting
+    get_pileup=True.A pileup is a summary of the alignment of reads
+    at each position in a reference sequence. Specifically, it
+    provides information on the position on the reference genome,
+    the depth of coverage (i.e., the number of reads aligned to that
+    position), and the actual bases from the aligned reads at that
+    position, along with their quality scores. This data structure
+    is useful for identifying variations, such as single nucleotide
+    polymorphisms (SNPs), insertions, and deletions by comparing the
+    aligned reads to the reference genome. A pileup can be visualized
+    as a vertical stack of aligned sequences, showing how each read
+    matches or mismatches the reference at each position.
+    In DeepVariant, pileups are utilized during the initial stages to
+    select candidate windows for further analysis.
 
     Examples
     --------
@@ -132,7 +149,7 @@ class BAMFeaturizer(Featurizer):
 
     """
 
-    def __init__(self, max_records=None):
+    def __init__(self, max_records=None, get_pileup: Optional[bool] = False):
         """
         Initialize BAMFeaturizer.
 
@@ -141,9 +158,13 @@ def __init__(self, max_records=None):
         max_records : int or None, optional
             The maximum number of records to extract from the BAM file. If None, all
             records will be extracted.
+        get_pileup : bool, optional
+            If True, pileup information will be extracted from the BAM file.
+            This is used in DeepVariant. False by default.
 
         """
         self.max_records = max_records
+        self.get_pileup = get_pileup
 
     def _featurize(self, datapoint):
         """
@@ -176,6 +197,25 @@ def _featurize(self, datapoint):
                 record.mapping_quality,
             ]
 
+            if (self.get_pileup):
+                pileup_columns = []
+                for pileupcolumn in datapoint.pileup(
+                        reference=record.reference_name,
+                        start=record.reference_start,
+                        end=record.reference_end):
+                    pileup_info = {
+                        "pos":
+                            pileupcolumn.reference_pos,
+                        "depth":
+                            pileupcolumn.nsegments,
+                        "reads": [
+                            pileupread.alignment.query_sequence
+                            for pileupread in pileupcolumn.pileups
+                        ]
+                    }
+                    pileup_columns.append(pileup_info)
+                feature_vector.append(pileup_columns)
+
             features.append(feature_vector)
             record_count += 1