[#83] Introduce abstractions for signals and fast5 files (#84)
* [#83] Introduce abstractions for signals and fast5 files

* Removed class method from FractionalizedSignal

* Architecting signal and fast5 data organization

Co-authored-by: Jessica Dunstan <[email protected]>
Co-authored-by: Katie Doroschak <[email protected]>
3 people committed Feb 15, 2021
1 parent 4acc013 commit 3aa47ac
Showing 8 changed files with 504 additions and 603 deletions.
7 changes: 7 additions & 0 deletions poretitioner/__init__.py
@@ -0,0 +1,7 @@
import matplotlib

from . import signals, utils

# We're using TK as a matplotlib backend since it doesn't require any extra dependencies.
# If you need a different backend, it can be configured here.
matplotlib.use("TkAgg")
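The package pins the Tk backend once, at import time, before any plotting code runs. A minimal sketch of overriding that choice in downstream code (the Agg backend named here is only an example of an alternative, not something this commit uses):

import matplotlib

# Agg renders to files without needing a display, which suits headless servers;
# any other installed backend name could be substituted.
matplotlib.use("Agg")

import matplotlib.pyplot as plt  # import pyplot only after picking the backend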
43 changes: 20 additions & 23 deletions poretitioner/fast5s.py
@@ -6,7 +6,6 @@
Classes for reading, writing and validating fast5 files.
"""

from dataclasses import dataclass
from os import PathLike
from pathlib import Path, PurePosixPath
from typing import List, NewType, Optional, Union
@@ -21,14 +20,14 @@
RawSignal,
VoltageSignal,
)

from .utils.classify import (
NULL_CLASSIFICATION_RESULT,
ClassificationResult,
ClassifierConfiguration,
ClassifierDetails,
NullClassificationResult,
)
from .utils.core import NumpyArrayLike

__all__ = ["BulkFile", "CaptureFile", "channel_path_for_read_id", "signal_path_for_read_id"]

@@ -66,17 +65,17 @@ def __init__(self, filepath: Union[str, PathLike], logger: Logger = getLogger())
Parameters
----------
filepath : PathLike
Path to the fast5 file to interface with.
filepath : [type]
[description]
logger : Logger, optional
Logger to use, by default getLogger()
[description], by default getLogger()
Raises
------
OSError
File couldn't be opened (e.g. didn't exist, OS 'Resource temporarily unavailable'). Details in message.
Bulk file couldn't be opened (e.g. didn't exist, OS 'Resource temporarily unavailable'). Details in message.
ValueError
File had validation errors, details in message.
Bulk file had validation errors, details in message.
"""

self.filepath = Path(filepath).expanduser().resolve()
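`Path(filepath).expanduser().resolve()` normalizes whatever the caller passes in, so the rest of the class can assume an absolute path. A small sketch with an invented path:

from pathlib import Path

# "~" expands to the user's home directory; resolve() makes the path absolute
# and collapses any ".." segments or symlinks.
filepath = Path("~/runs/bulk_example.fast5").expanduser().resolve()
assert filepath.is_absolute()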
@@ -154,7 +153,7 @@ def __init__(self, bulk_filepath: PathLike, logger: Logger = getLogger()):
Parameters
----------
bulk_filepath : PathLike
File path to the bulk fast 5 file.
Absolute path to the bulk fast 5 file.
logger : Logger, optional
Logger to use, by default getLogger()
@@ -232,6 +231,11 @@ def sampling_rate(self) -> int:
Also referred to as the sample rate, sample frequency, or sampling
frequency.
Parameters
----------
f5 : h5py.File
Fast5 file, open for reading using h5py.File.
Returns
-------
int
@@ -303,6 +307,9 @@ def get_raw_signal(
Parameters
----------
f5 : h5py.File
Fast5 file, open for reading using h5py.File.
channel_number : int
Channel number for which to retrieve raw signal.
start : Optional[int], optional
@@ -340,7 +347,7 @@ def get_voltage(self, start=None, end=None) -> VoltageSignal:
Returns
-------
NumpyArrayLike[int]
VoltageSignal[int]
Voltage(s) in millivolts (mV).
"""
bias_voltage_multiplier = 5 # Signal changes in increments of 5 mV.
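Since the stored bias-voltage samples step in 5 mV increments, the millivolt trace is presumably just the raw samples scaled by that multiplier. A rough sketch with invented values:

import numpy as np

bias_voltage_multiplier = 5  # each raw unit corresponds to 5 mV
raw_bias = np.array([-36, -36, -35])  # hypothetical raw bias-voltage samples
voltage_mV = raw_bias * bias_voltage_multiplier  # -> [-180, -180, -175] mV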
@@ -358,7 +365,7 @@ def __init__(self, capture_filepath: PathLike, logger: Logger = getLogger()):
capture_filepath : PathLike
Path to the capture file. Capture files are the result of running `poretitioner segment` on a bulk file.
logger : Logger, optional
Logger to use, by default getLogger()
[description], by default getLogger()
Raises
------
@@ -369,7 +376,7 @@ def __init__(self, capture_filepath: PathLike, logger: Logger = getLogger()):
"""
super().__init__(capture_filepath, logger=logger)
if not self.filepath.exists():
error_msg = f"Capture fast5 file does not exist at path: {self.filepath}. Make sure the capture file is in this location."
error_msg = f"Capture fast5 file does not exist at path: {self.filepath}. Make sure the bulk file is in this location."
raise OSError(error_msg)
self.validate(self.filepath, logger)

@@ -456,6 +463,8 @@ def get_channel_calibration_for_read(self, read_id: str) -> ChannelCalibration:
Parameters
----------
f5 : h5py.File
Fast5 file, open for reading using h5py.File.
read_id : str
Read id to retrieve raw signal. Can be formatted as a path ("read_xxx...")
or just the read id ("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx").
@@ -470,18 +479,6 @@ def get_channel_calibration_for_read(self, read_id: str) -> ChannelCalibration:
return calibration

def get_capture_metadata_for_read(self, read_id: str) -> CaptureMetadata:
"""Retrieve the capture metadata for given read.
Parameters
----------
read_id : str
Which read to fetch the metadata for.
Returns
-------
CaptureMetadata
Metadata around the captures in this read.
"""
channel_path = channel_path_for_read_id(read_id)
signal_path = signal_path_for_read_id(read_id)
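`channel_path_for_read_id` and `signal_path_for_read_id` map a read id to locations inside the capture file's HDF5 hierarchy. The sketch below is only a guess at that layout; the real group names are defined elsewhere in fast5s.py and may differ:

from pathlib import PurePosixPath

def hypothetical_channel_path_for_read_id(read_id: str) -> str:
    # Assumed layout: one group per read, holding the channel metadata.
    return str(PurePosixPath("/", f"read_{read_id}", "channel_id"))

def hypothetical_signal_path_for_read_id(read_id: str) -> str:
    # Assumed layout: the read's signal dataset lives beside the channel info.
    return str(PurePosixPath("/", f"read_{read_id}", "Signal"))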

124 changes: 93 additions & 31 deletions poretitioner/signals.py
@@ -18,8 +18,8 @@
from .utils.core import NumpyArrayLike, Window

__all__ = [
"compute_fractional_blockage",
"find_open_channel_current",
"find_segments_below_threshold",
"Capture",
"ChannelCalibration",
"FractionalizedSignal",
@@ -31,8 +31,6 @@
"DEFAULT_OPEN_CHANNEL_BOUND",
]

Logger = logger.Logger # Logger type

DEFAULT_OPEN_CHANNEL_GUESS = 220 # In picoAmperes (pA).
DEFAULT_OPEN_CHANNEL_BOUND = 15 # In picoAmperes (pA).

@@ -108,8 +106,6 @@ class BaseSignal(NumpyArrayLike):
"""Base class for signals (time series data generated from a nanopore channel).
Do not use this class directly; instead, subclass it.
Creates a view of an underlying numpy array.
Subclasses must implement:
`def __array_finalize__(self, obj):` and `def __reduce__(self):`. See `FractionalizedSignal` for an example.
@@ -350,7 +346,7 @@ def to_picoamperes(self) -> PicoampereSignal:
"""
picoamps = compute_picoamperes_from_fractional_blockage(self)
picoamperes = PicoampereSignal(
picoamps, self.channel_number, self.calibration, read_id=self.read_id
picoamps, self.channel_number, self.calibration, read_id=self.read_id, *args, **kwargs
)
return picoamperes

@@ -386,21 +382,12 @@ def to_picoamperes(self) -> PicoampereSignal:
"""
picoamps = calculate_raw_in_picoamperes(self, self.calibration)
picoamperes = PicoampereSignal(
picoamps, self.channel_number, self.calibration, read_id=self.read_id
picoamps, self.channel_number, self.calibration, read_id=self.read_id, *args, **kwargs
)
return picoamperes

def to_fractionalized(
self,
open_channel_guess=DEFAULT_OPEN_CHANNEL_GUESS,
open_channel_bound=None,
default=DEFAULT_OPEN_CHANNEL_GUESS,
) -> FractionalizedSignal:
return self.to_picoamperes().to_fractionalized(
open_channel_guess=open_channel_guess,
open_channel_bound=open_channel_bound,
default=default,
)
def to_fractionalized(self, *args, **kwargs) -> FractionalizedSignal:
return self.to_picoamperes(*args, **kwargs).to_fractionalized(*args, **kwargs)
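The conversions now chain cleanly: a raw ADC signal becomes picoamperes, and picoamperes become a fraction of the open-channel current. A minimal usage sketch, assuming `raw` is a RawSignal obtained from a bulk file:

pico = raw.to_picoamperes()      # ADC units -> picoamperes
frac = pico.to_fractionalized()  # picoamperes -> fraction of open-channel current
# or in one step, which simply routes through to_picoamperes():
frac = raw.to_fractionalized()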


class PicoampereSignal(CurrentSignal):
@@ -772,9 +759,7 @@ def digitize_current(
)


def compute_fractional_blockage(
picoamperes: NumpyArrayLike, open_channel: float
) -> NumpyArrayLike:
def compute_fractional_blockage(picoamperes, open_channel) -> NumpyArrayLike:
"""Converts a nanopore signal (in units of pA) to fractionalized current
in the range (0, 1).
@@ -792,8 +777,9 @@ def compute_picoamperes_from_fractional_blockage(
[NumpyArrayLike]
Array of fractionalized nanopore current in the range [0, 1]
"""

frac = np.clip(picoamperes / open_channel, a_min=0.0, a_max=1.0)
pico = np.frombuffer(picoamperes, dtype=float)
pico /= open_channel
frac = np.clip(pico, a_max=1.0, a_min=0.0)
return frac
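Either variant amounts to dividing by the open-channel current and clipping into [0, 1], so a fully blocked pore reads near 0 and an open pore reads near 1. A worked example with invented numbers:

import numpy as np

open_channel = 220.0  # picoamperes
picoamperes = np.array([110.0, 220.0, 260.0])
frac = np.clip(picoamperes / open_channel, a_min=0.0, a_max=1.0)
# -> [0.5, 1.0, 1.0]; values above the open-channel estimate are clipped to 1.0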


@@ -818,7 +804,7 @@ def compute_picoamperes_from_fractional_blockage(
Array of fractionalized nanopore current in the range [0, 1]
"""

pico = fractional * fractional.open_channel_pA
pico = np.frombuffer(fractional * fractional.open_channel_pA, dtype=float)
return pico
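The inverse mapping simply rescales by the open-channel current carried on the fractionalized signal. A short sketch with invented values:

import numpy as np

open_channel_pA = 220.0
fractional = np.array([0.5, 1.0])
picoamperes = fractional * open_channel_pA  # -> [110.0, 220.0]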


@@ -837,15 +823,15 @@ def find_open_channel_current(
----------
picoamperes : NumpyArrayLike
Array representing sampled nanopore current, scaled to pA.
open_channel_guess : float, optional
open_channel_guess : numeric, optional
Approximate estimate of the open channel current value, by default DEFAULT_OPEN_CHANNEL_GUESS.
bound : float, optional
bound : numeric, optional
Approximate estimate of the variance in open channel current value from
channel to channel (AKA the range to search). If no bound is specified,
the default is to use 10% of the open_channel guess.
default : float, optional
default : numeric, optional
Default open channel current value to use if one could not be calculated, by default DEFAULT_OPEN_CHANNEL_GUESS.
log: logger: Logger, optional
log: logger, optional
Logger to use, defaults to singleton logger.
Returns
-------
@@ -854,10 +840,9 @@
"""
if open_channel_bound is None:
open_channel_bound = 0.1 * open_channel_guess

lower_bound = open_channel_guess - open_channel_bound
upper_bound = open_channel_guess + open_channel_bound
ix_in_range = np.where(np.logical_and(lower_bound < picoamperes, picoamperes <= upper_bound))[
lower_bound = open_channel_guess - open_channel_bound
ix_in_range = np.where(np.logical_and(picoamperes <= upper_bound, picoamperes > lower_bound))[
0
]
if len(ix_in_range) == 0:
Expand All @@ -869,3 +854,80 @@ def find_open_channel_current(
else:
open_channel = np.median(picoamperes[ix_in_range])[()]
return open_channel
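In other words, the open-channel estimate is the median of the samples that fall within the guess plus or minus the bound, with a fallback to the default when no samples land there. A compact sketch of the same idea with invented values:

import numpy as np

picoamperes = np.array([218.0, 221.0, 400.0, 219.5, 12.0])
guess, bound, default = 220.0, 15.0, 220.0

in_range = picoamperes[(picoamperes > guess - bound) & (picoamperes <= guess + bound)]
open_channel = float(np.median(in_range)) if in_range.size else default
# median of [218.0, 221.0, 219.5] -> 219.5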


def find_signal_off_regions(
picoamperes: PicoampereSignal, window_sz=200, slide=100, current_range=50
) -> List[Window]:
"""Helper function for judge_channels(). Finds regions of current where the
channel is likely off.
Parameters
----------
picoamperes : PicoampereSignal
Nanopore current (in units of pA).
window_sz : int, optional
Sliding window width, by default 200
slide : int, optional
How much to slide the window by, by default 100
current_range : int, optional
How much the current is allowed to deviate, by default 50
Returns
-------
List[Window]
List of Windows (start, end).
Start and end points for where the channel is likely off.
"""
off = []
for start in range(0, len(picoamperes), slide):
window_mean = np.mean(picoamperes[start : start + window_sz])
if -np.abs(current_range) < window_mean and window_mean < np.abs(current_range):
off.append(True)
else:
off.append(False)
off_locs = np.multiply(np.where(off)[0], slide)
loc = None
if len(off_locs) > 0:
last_loc = off_locs[0]
start = last_loc
regions = []
for loc in off_locs[1:]:
if loc - last_loc != slide:
regions.append((start, last_loc))
start = loc
last_loc = loc
if loc is not None:
regions.append((start, loc))
return regions
else:
return []
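The detection is a sliding-window mean: any window whose average current sits within plus or minus `current_range` of zero is treated as "off", and consecutive off windows are merged into regions. A rough sketch of the window test on an invented trace:

import numpy as np

# Invented trace: ~0 pA (channel off) for 400 samples, then ~220 pA (open).
picoamperes = np.concatenate([np.zeros(400), np.full(600, 220.0)])

window_sz, slide, current_range = 200, 100, 50
off = [
    abs(np.mean(picoamperes[start : start + window_sz])) < current_range
    for start in range(0, len(picoamperes), slide)
]
# Windows starting at samples 0, 100, and 200 average well under 50 pA,
# so the start of the trace would be reported as an off region.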


def find_segments_below_threshold(time_series, threshold) -> List[Window]:
"""
Find regions where the time series data points drop at or below the
specified threshold.
Parameters
----------
time_series : np.array
Array containing time series data points.
threshold : numeric
Find regions where the time series drops at or below this value
Returns
-------
List[Window]
List of capture windows (start, end).
Each window in the list represents the (start, end) points of regions
where the input array drops at or below the threshold.
"""
diff_points = np.where(np.abs(np.diff(np.where(time_series <= threshold, 1, 0))) == 1)[0]
if time_series[0] <= threshold:
diff_points = np.hstack([[0], diff_points])
if time_series[-1] <= threshold:
diff_points = np.hstack([diff_points, [len(time_series)]])

windows = [Window(start, end) for start, end in zip(diff_points[::2], diff_points[1::2])]
return windows
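The boundaries come from differencing a 0/1 mask of "at or below threshold", then pairing consecutive boundary indices into windows. A worked example with an invented series:

import numpy as np

time_series = np.array([0.9, 0.2, 0.1, 0.8, 0.9, 0.3, 0.2])
threshold = 0.5

below = np.where(time_series <= threshold, 1, 0)        # [0, 1, 1, 0, 0, 1, 1]
diff_points = np.where(np.abs(np.diff(below)) == 1)[0]  # boundaries at 0, 2, 4
if time_series[-1] <= threshold:                        # series ends below threshold,
    diff_points = np.hstack([diff_points, [len(time_series)]])  # so close the last window
windows = list(zip(diff_points[::2], diff_points[1::2]))
# -> [(0, 2), (4, 7)]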
2 changes: 1 addition & 1 deletion poretitioner/utils/__init__.py
@@ -1,4 +1,4 @@
# from . import core
from . import core

# Re-import once we've figured out our module structure.
# from . import classify