[#83] Introduce abstractions for signals and fast5 files (#84)
* [#83] Introduce abstractions for signals and fast5 files

* Removed class method from FractionalizedSignal

* Architecting signal and fast5 data organization

Co-authored-by: Jessica Dunstan <[email protected]>
Co-authored-by: Katie Doroschak <[email protected]>
3 people committed Feb 15, 2021
1 parent 4acc013 commit 3aa47ac
Showing 8 changed files with 504 additions and 603 deletions.
7 changes: 7 additions & 0 deletions poretitioner/__init__.py
@@ -0,0 +1,7 @@
import matplotlib

from . import signals, utils

# We're using TK as a matplotlib backend since it doesn't require any extra dependencies.
# If you need a different backend, it can be configured here.
matplotlib.use("TkAgg")
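The package pins the Tk backend once, at import time, before any plotting code runs. A minimal sketch of overriding that choice in downstream code (the Agg backend named here is only an example of an alternative, not something this commit uses):

import matplotlib

# Agg renders to files without needing a display, which suits headless servers;
# any other installed backend name could be substituted.
matplotlib.use("Agg")

import matplotlib.pyplot as plt  # import pyplot only after picking the backend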
43 changes: 20 additions & 23 deletions poretitioner/fast5s.py
@@ -6,7 +6,6 @@
Classes for reading, writing and validating fast5 files.
"""

from dataclasses import dataclass
from os import PathLike
from pathlib import Path, PurePosixPath
from typing import List, NewType, Optional, Union
@@ -21,14 +20,14 @@
RawSignal,
VoltageSignal,
)

from .utils.classify import (
NULL_CLASSIFICATION_RESULT,
ClassificationResult,
ClassifierConfiguration,
ClassifierDetails,
NullClassificationResult,
)
from .utils.core import NumpyArrayLike

__all__ = ["BulkFile", "CaptureFile", "channel_path_for_read_id", "signal_path_for_read_id"]

@@ -66,17 +65,17 @@ def __init__(self, filepath: Union[str, PathLike], logger: Logger = getLogger())
Parameters
----------
filepath : PathLike
Path to the fast5 file to interface with.
filepath : [type]
[description]
logger : Logger, optional
Logger to use, by default getLogger()
[description], by default getLogger()
Raises
------
OSError
File couldn't be opened (e.g. didn't exist, OS 'Resource temporarily unavailable'). Details in message.
Bulk file couldn't be opened (e.g. didn't exist, OS 'Resource temporarily unavailable'). Details in message.
ValueError
File had validation errors, details in message.
Bulk file had validation errors, details in message.
"""

self.filepath = Path(filepath).expanduser().resolve()
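`Path(filepath).expanduser().resolve()` normalizes whatever the caller passes in, so the rest of the class can assume an absolute path. A small sketch with an invented path:

from pathlib import Path

# "~" expands to the user's home directory; resolve() makes the path absolute
# and collapses any ".." segments or symlinks.
filepath = Path("~/runs/bulk_example.fast5").expanduser().resolve()
assert filepath.is_absolute()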
@@ -154,7 +153,7 @@ def __init__(self, bulk_filepath: PathLike, logger: Logger = getLogger()):
Parameters
----------
bulk_filepath : PathLike
File path to the bulk fast 5 file.
Absolute path to the bulk fast 5 file.
logger : Logger, optional
Logger to use, by default getLogger()
@@ -232,6 +231,11 @@ def sampling_rate(self) -> int:
Also referred to as the sample rate, sample frequency, or sampling
frequency.
Parameters
----------
f5 : h5py.File
Fast5 file, open for reading using h5py.File.
Returns
-------
int
@@ -303,6 +307,9 @@ def get_raw_signal(
Parameters
----------
f5 : h5py.File
Fast5 file, open for reading using h5py.File.
channel_number : int
Channel number for which to retrieve raw signal.
start : Optional[int], optional
@@ -340,7 +347,7 @@ def get_voltage(self, start=None, end=None) -> VoltageSignal:
Returns
-------
NumpyArrayLike[int]
VoltageSignal[int]
Voltage(s) in millivolts (mV).
"""
bias_voltage_multiplier = 5 # Signal changes in increments of 5 mV.
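Since the stored bias-voltage samples step in 5 mV increments, the millivolt trace is presumably just the raw samples scaled by that multiplier. A rough sketch with invented values:

import numpy as np

bias_voltage_multiplier = 5  # each raw unit corresponds to 5 mV
raw_bias = np.array([-36, -36, -35])  # hypothetical raw bias-voltage samples
voltage_mV = raw_bias * bias_voltage_multiplier  # -> [-180, -180, -175] mV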
@@ -358,7 +365,7 @@ def __init__(self, capture_filepath: PathLike, logger: Logger = getLogger()):
capture_filepath : PathLike
Path to the capture file. Capture files are the result of running `poretitioner segment` on a bulk file.
logger : Logger, optional
Logger to use, by default getLogger()
[description], by default getLogger()
Raises
------
@@ -369,7 +376,7 @@ def __init__(self, capture_filepath: PathLike, logger: Logger = getLogger()):
"""
super().__init__(capture_filepath, logger=logger)
if not self.filepath.exists():
error_msg = f"Capture fast5 file does not exist at path: {self.filepath}. Make sure the capture file is in this location."
error_msg = f"Capture fast5 file does not exist at path: {self.filepath}. Make sure the bulk file is in this location."
raise OSError(error_msg)
self.validate(self.filepath, logger)

@@ -456,6 +463,8 @@ def get_channel_calibration_for_read(self, read_id: str) -> ChannelCalibration:
Parameters
----------
f5 : h5py.File
Fast5 file, open for reading using h5py.File.
read_id : str
Read id to retrieve raw signal. Can be formatted as a path ("read_xxx...")
or just the read id ("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx").
@@ -470,18 +479,6 @@ def get_channel_calibration_for_read(self, read_id: str) -> ChannelCalibration:
return calibration

def get_capture_metadata_for_read(self, read_id: str) -> CaptureMetadata:
"""Retrieve the capture metadata for given read.
Parameters
----------
read_id : str
Which read to fetch the metadata for.
Returns
-------
CaptureMetadata
Metadata around the captures in this read.
"""
channel_path = channel_path_for_read_id(read_id)
signal_path = signal_path_for_read_id(read_id)
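`channel_path_for_read_id` and `signal_path_for_read_id` map a read id to locations inside the capture file's HDF5 hierarchy. The sketch below is only a guess at that layout; the real group names are defined elsewhere in fast5s.py and may differ:

from pathlib import PurePosixPath

def hypothetical_channel_path_for_read_id(read_id: str) -> str:
    # Assumed layout: one group per read, holding the channel metadata.
    return str(PurePosixPath("/", f"read_{read_id}", "channel_id"))

def hypothetical_signal_path_for_read_id(read_id: str) -> str:
    # Assumed layout: the read's signal dataset lives beside the channel info.
    return str(PurePosixPath("/", f"read_{read_id}", "Signal"))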

124 changes: 93 additions & 31 deletions poretitioner/signals.py
@@ -18,8 +18,8 @@
from .utils.core import NumpyArrayLike, Window

__all__ = [
"compute_fractional_blockage",
"find_open_channel_current",
"find_segments_below_threshold",
"Capture",
"ChannelCalibration",
"FractionalizedSignal",
@@ -31,8 +31,6 @@
"DEFAULT_OPEN_CHANNEL_BOUND",
]

Logger = logger.Logger # Logger type

DEFAULT_OPEN_CHANNEL_GUESS = 220 # In picoAmperes (pA).
DEFAULT_OPEN_CHANNEL_BOUND = 15 # In picoAmperes (pA).

@@ -108,8 +106,6 @@ class BaseSignal(NumpyArrayLike):
"""Base class for signals (time series data generated from a nanopore channel).
Do not use this class directly; instead, subclass it.
Creates a view of an underlying numpy array.
Subclasses must implement:
`def __array_finalize__(self, obj):` and `def __reduce__(self):`. See `FractionalizedSignal` for an example.
@@ -350,7 +346,7 @@ def to_picoamperes(self) -> PicoampereSignal:
"""
picoamps = compute_picoamperes_from_fractional_blockage(self)
picoamperes = PicoampereSignal(
picoamps, self.channel_number, self.calibration, read_id=self.read_id
picoamps, self.channel_number, self.calibration, read_id=self.read_id, *args, **kwargs
)
return picoamperes

@@ -386,21 +382,12 @@ def to_picoamperes(self) -> PicoampereSignal:
"""
picoamps = calculate_raw_in_picoamperes(self, self.calibration)
picoamperes = PicoampereSignal(
picoamps, self.channel_number, self.calibration, read_id=self.read_id
picoamps, self.channel_number, self.calibration, read_id=self.read_id, *args, **kwargs
)
return picoamperes

def to_fractionalized(
self,
open_channel_guess=DEFAULT_OPEN_CHANNEL_GUESS,
open_channel_bound=None,
default=DEFAULT_OPEN_CHANNEL_GUESS,
) -> FractionalizedSignal:
return self.to_picoamperes().to_fractionalized(
open_channel_guess=open_channel_guess,
open_channel_bound=open_channel_bound,
default=default,
)
def to_fractionalized(self, *args, **kwargs) -> FractionalizedSignal:
return self.to_picoamperes(*args, **kwargs).to_fractionalized(*args, **kwargs)
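The conversions now chain cleanly: a raw ADC signal becomes picoamperes, and picoamperes become a fraction of the open-channel current. A minimal usage sketch, assuming `raw` is a RawSignal obtained from a bulk file:

pico = raw.to_picoamperes()      # ADC units -> picoamperes
frac = pico.to_fractionalized()  # picoamperes -> fraction of open-channel current
# or in one step, which simply routes through to_picoamperes():
frac = raw.to_fractionalized()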


class PicoampereSignal(CurrentSignal):
@@ -772,9 +759,7 @@ def digitize_current(
)


def compute_fractional_blockage(
picoamperes: NumpyArrayLike, open_channel: float
) -> NumpyArrayLike:
def compute_fractional_blockage(picoamperes, open_channel) -> NumpyArrayLike:
"""Converts a nanopore signal (in units of pA) to fractionalized current
in the range (0, 1).
@@ -792,8 +777,9 @@ def compute_picoamperes_from_fractional_blockage(
[NumpyArrayLike]
Array of fractionalized nanopore current in the range [0, 1]
"""

frac = np.clip(picoamperes / open_channel, a_min=0.0, a_max=1.0)
pico = np.frombuffer(picoamperes, dtype=float)
pico /= open_channel
frac = np.clip(pico, a_max=1.0, a_min=0.0)
return frac
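Either variant amounts to dividing by the open-channel current and clipping into [0, 1], so a fully blocked pore reads near 0 and an open pore reads near 1. A worked example with invented numbers:

import numpy as np

open_channel = 220.0  # picoamperes
picoamperes = np.array([110.0, 220.0, 260.0])
frac = np.clip(picoamperes / open_channel, a_min=0.0, a_max=1.0)
# -> [0.5, 1.0, 1.0]; values above the open-channel estimate are clipped to 1.0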


@@ -818,7 +804,7 @@ def compute_picoamperes_from_fractional_blockage(
Array of fractionalized nanopore current in the range [0, 1]
"""

pico = fractional * fractional.open_channel_pA
pico = np.frombuffer(fractional * fractional.open_channel_pA, dtype=float)
return pico
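The inverse mapping simply rescales by the open-channel current carried on the fractionalized signal. A short sketch with invented values:

import numpy as np

open_channel_pA = 220.0
fractional = np.array([0.5, 1.0])
picoamperes = fractional * open_channel_pA  # -> [110.0, 220.0]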


@@ -837,15 +823,15 @@ def find_open_channel_current(
----------
picoamperes : NumpyArrayLike
Array representing sampled nanopore current, scaled to pA.
open_channel_guess : float, optional
open_channel_guess : numeric, optional
Approximate estimate of the open channel current value, by default DEFAULT_OPEN_CHANNEL_GUESS.
bound : float, optional
bound : numeric, optional
Approximate estimate of the variance in open channel current value from
channel to channel (AKA the range to search). If no bound is specified,
the default is to use 10% of the open_channel guess.
default : float, optional
default : numeric, optional
Default open channel current value to use if one could not be calculated, by default DEFAULT_OPEN_CHANNEL_GUESS.
log: logger: Logger, optional
log: logger, optional
Logger to use, defaults to singleton logger.
Returns
-------
@@ -854,10 +840,9 @@
"""
if open_channel_bound is None:
open_channel_bound = 0.1 * open_channel_guess

lower_bound = open_channel_guess - open_channel_bound
upper_bound = open_channel_guess + open_channel_bound
ix_in_range = np.where(np.logical_and(lower_bound < picoamperes, picoamperes <= upper_bound))[
lower_bound = open_channel_guess - open_channel_bound
ix_in_range = np.where(np.logical_and(picoamperes <= upper_bound, picoamperes > lower_bound))[
0
]
if len(ix_in_range) == 0:
Expand All @@ -869,3 +854,80 @@ def find_open_channel_current(
else:
open_channel = np.median(picoamperes[ix_in_range])[()]
return open_channel
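In other words, the open-channel estimate is the median of the samples that fall within the guess plus or minus the bound, with a fallback to the default when no samples land there. A compact sketch of the same idea with invented values:

import numpy as np

picoamperes = np.array([218.0, 221.0, 400.0, 219.5, 12.0])
guess, bound, default = 220.0, 15.0, 220.0

in_range = picoamperes[(picoamperes > guess - bound) & (picoamperes <= guess + bound)]
open_channel = float(np.median(in_range)) if in_range.size else default
# median of [218.0, 221.0, 219.5] -> 219.5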


def find_signal_off_regions(
picoamperes: PicoampereSignal, window_sz=200, slide=100, current_range=50
) -> List[Window]:
"""Helper function for judge_channels(). Finds regions of current where the
channel is likely off.
Parameters
----------
picoamperes : PicoampereSignal
Nanopore current (in units of pA).
window_sz : int, optional
Sliding window width, by default 200
slide : int, optional
How much to slide the window by, by default 100
current_range : int, optional
How much the current is allowed to deviate, by default 50
Returns
-------
List[Window]
List of Windows (start, end).
Start and end points for where the channel is likely off.
"""
off = []
for start in range(0, len(picoamperes), slide):
window_mean = np.mean(picoamperes[start : start + window_sz])
if -np.abs(current_range) < window_mean and window_mean < np.abs(current_range):
off.append(True)
else:
off.append(False)
off_locs = np.multiply(np.where(off)[0], slide)
loc = None
if len(off_locs) > 0:
last_loc = off_locs[0]
start = last_loc
regions = []
for loc in off_locs[1:]:
if loc - last_loc != slide:
regions.append((start, last_loc))
start = loc
last_loc = loc
if loc is not None:
regions.append((start, loc))
return regions
else:
return []
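The detection is a sliding-window mean: any window whose average current sits within plus or minus `current_range` of zero is treated as "off", and consecutive off windows are merged into regions. A rough sketch of the window test on an invented trace:

import numpy as np

# Invented trace: ~0 pA (channel off) for 400 samples, then ~220 pA (open).
picoamperes = np.concatenate([np.zeros(400), np.full(600, 220.0)])

window_sz, slide, current_range = 200, 100, 50
off = [
    abs(np.mean(picoamperes[start : start + window_sz])) < current_range
    for start in range(0, len(picoamperes), slide)
]
# Windows starting at samples 0, 100, and 200 average well under 50 pA,
# so the start of the trace would be reported as an off region.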


def find_segments_below_threshold(time_series, threshold) -> List[Window]:
"""
Find regions where the time series data points drop at or below the
specified threshold.
Parameters
----------
time_series : np.array
Array containing time series data points.
threshold : numeric
Find regions where the time series drops at or below this value
Returns
-------
List[Window]
List of capture windows (start, end).
Each window in the list represents the (start, end) points of regions
where the input array drops at or below the threshold.
"""
diff_points = np.where(np.abs(np.diff(np.where(time_series <= threshold, 1, 0))) == 1)[0]
if time_series[0] <= threshold:
diff_points = np.hstack([[0], diff_points])
if time_series[-1] <= threshold:
diff_points = np.hstack([diff_points, [len(time_series)]])

windows = [Window(start, end) for start, end in zip(diff_points[::2], diff_points[1::2])]
return windows
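The boundaries come from differencing a 0/1 mask of "at or below threshold", then pairing consecutive boundary indices into windows. A worked example with an invented series:

import numpy as np

time_series = np.array([0.9, 0.2, 0.1, 0.8, 0.9, 0.3, 0.2])
threshold = 0.5

below = np.where(time_series <= threshold, 1, 0)        # [0, 1, 1, 0, 0, 1, 1]
diff_points = np.where(np.abs(np.diff(below)) == 1)[0]  # boundaries at 0, 2, 4
if time_series[-1] <= threshold:                        # series ends below threshold,
    diff_points = np.hstack([diff_points, [len(time_series)]])  # so close the last window
windows = list(zip(diff_points[::2], diff_points[1::2]))
# -> [(0, 2), (4, 7)]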
2 changes: 1 addition & 1 deletion poretitioner/utils/__init__.py
@@ -1,4 +1,4 @@
# from . import core
from . import core

# Re-import once we've figured out our module structure.
# from . import classify