Skip to content

Commit

Permalink
Fixes for iterative inference
Browse files Browse the repository at this point in the history
  • Loading branch information
dominikstanojevic committed Feb 22, 2022
1 parent e78ed98 commit 3ac4778
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 11 deletions.
8 changes: 4 additions & 4 deletions rockfish/model/datasets.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
from dataclasses import dataclass
from io import BufferedReader
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import math

import pytorch_lightning as pl

from dataclasses import dataclass
from io import BufferedReader
import struct
import sys

Expand Down Expand Up @@ -74,7 +74,7 @@ class Example:
read_id: str
ctg: int
pos: int
signal: List[float]
signal: np.ndarray
q_indices: List[int]
lengths: List[int]
bases: str
Expand Down
48 changes: 41 additions & 7 deletions rockfish/model/iterative_inference.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,21 @@
from numpy import dtype
import torch
from torch.nn import DataParallel
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import IterableDataset, DataLoader
from tqdm import tqdm
import numpy as np

import math
import struct
import argparse
import collections

from datasets import read_offsets, parse_ctgs, Example, MappingEncodings
from model import Rockfish

from typing import *

ENCODING = {b: i for i, b in enumerate('ACGT')}


Expand Down Expand Up @@ -46,7 +51,7 @@ def __init__(self,
self.end = len(self.offsets)

def __iter__(self):
bins = [list() for _ in range(9)]
bins = [list() for _ in range(10)]
stored = 0

self.fd.seek(self.offsets[self.start])
Expand Down Expand Up @@ -75,8 +80,12 @@ def __iter__(self):
stored -= len(bins[bin])
bins[bin].clear()
elif stored >= 4 * self.batch_size:
for bin in bins:
for example in bin:
batch_processed = 0

for bin in reversed(bins):
while len(bin) > 0:
example = bin.pop()

signal = torch.tensor(example.signal,
dtype=torch.half).unfold(
-1, self.block_size,
Expand All @@ -92,9 +101,34 @@ def __iter__(self):
example.
ctg], example.pos, signal, bases, r_pos_enc, q_indices

bin.clear()
stored = 0

batch_processed += 1
stored -= 1

if batch_processed == self.batch_size:
break

if batch_processed == self.batch_size:
break

for bin in bins:
for example in bin:
signal = torch.tensor(example.signal,
dtype=torch.half).unfold(
-1, self.block_size,
self.block_size)
bases = torch.tensor(
[ENCODING[b] for b in example.bases])
q_indices = torch.tensor(example.q_indices)
lengths = torch.tensor(example.lengths)

r_pos_enc = self.mapping_encodings(lengths)

yield example.read_id, self.ctgs[
example.
ctg], example.pos, signal, bases, r_pos_enc, q_indices

bin.clear()

def read_iter_example(self):
read_id, ctg, pos, n_points, q_indices_len = struct.unpack(
'=36sHIHH', self.fd.read(46))
Expand All @@ -105,7 +139,7 @@ def read_iter_example(self):
self.fd.read(n_bytes))
event_len_start = n_points + q_indices_len

return Example(read_id.decode(), ctg, pos, data[:n_points],
return Example(read_id.decode(), ctg, pos, np.array(data[:n_points], dtype=np.half),
data[n_points:event_len_start],
data[event_len_start:-1], data[-1].decode())

Expand Down

0 comments on commit 3ac4778

Please sign in to comment.