Skip to content

Commit

Permalink
Improved memory usage in gCNV. (#5781)
Browse files: browse the repository at this point in the history
* Changed sampling of denoised copy ratios to address memory spike and updated output formats and filenames.

* Updated theano version to 1.0.4 and changed numpy install source to conda defaults to enable MKL.

* Updated theano flags to use MKL and OpenMP elemwise.
  • Loading branch information
samuelklee authored Mar 18, 2019
1 parent 69f4006 commit 0429d5a
Show file tree
Hide file tree
Showing 10 changed files with 38 additions and 26 deletions.
4 changes: 2 additions & 2 deletions scripts/gatkcondaenv.yml.template
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ dependencies:
- intel-openmp=2018.0.0
- mkl=2018.0.1
- mkl-service=1.1.2
- defaults::numpy==1.13.3
- openssl=1.0.2l=0
- pip=9.0.1=py36_1
- python=3.6.2=0
Expand All @@ -29,7 +30,6 @@ dependencies:
- keras==2.2.0
- markdown==2.6.9
- matplotlib==2.1.0
- numpy==1.13.3
- pandas==0.21.0
- patsy==0.4.1
- protobuf==3.5.0.post1
Expand All @@ -44,7 +44,7 @@ dependencies:
- scipy==1.0.0
- six==1.11.0
- $tensorFlowDependency
- theano==0.9.0
- theano==1.0.4
- tqdm==4.19.4
- werkzeug==0.12.2
- gatkPythonPackageArchive.zip
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@
default_class_log_posterior_tsv_filename = "log_q_tau_tk.tsv"
default_baseline_copy_number_tsv_filename = "baseline_copy_number_t.tsv"
default_copy_number_segments_tsv_filename = "copy_number_segments.tsv"
default_denoised_copy_ratios_mean_tsv_filename = "denoised_copy_ratios_mu.tsv"
default_denoised_copy_ratios_std_tsv_filename = "denoised_copy_ratios_std.tsv"
default_denoised_copy_ratios_mean_tsv_filename = "mu_denoised_copy_ratio_t.tsv"
default_denoised_copy_ratios_std_tsv_filename = "std_denoised_copy_ratio_t.tsv"

default_denoising_config_json_filename = "denoising_config.json"
default_calling_config_json_filename = "calling_config.json"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -154,12 +154,12 @@ def __call__(self):
self.denoising_model_approx)

# compute approximate denoised copy ratios
denoising_copy_ratios_approx_generator = commons.get_sampling_generator_for_model_approximation(
model_approx=self.denoising_model_approx, model_var_name='denoised_copy_ratios')
denoised_copy_ratios_mean, denoised_copy_ratios_variance =\
math.calculate_mean_and_variance_online(denoising_copy_ratios_approx_generator)
denoised_copy_ratios_mean = np.transpose(denoised_copy_ratios_mean)
denoised_copy_ratios_std = np.transpose(np.sqrt(denoised_copy_ratios_variance))
_logger.info("Sampling and approximating posteriors for denoised copy ratios...")
denoising_copy_ratios_st_approx_generator = commons.get_sampling_generator_for_model_approximation(
model_approx=self.denoising_model_approx, node=self.denoising_model['denoised_copy_ratio_st'])
mu_denoised_copy_ratio_st, var_denoised_copy_ratio_st =\
math.calculate_mean_and_variance_online(denoising_copy_ratios_st_approx_generator)
std_denoised_copy_ratio_st = np.sqrt(var_denoised_copy_ratio_st)

for si, sample_name in enumerate(self.denoising_calling_workspace.sample_names):
sample_name_comment_line = [io_consts.sample_name_sam_header_prefix + sample_name]
Expand Down Expand Up @@ -201,20 +201,20 @@ def __call__(self):
write_shape_info=False)

# write denoised copy ratio means
denoised_copy_ratio_mu_s = denoised_copy_ratios_mean[:, si]
mu_denoised_copy_ratio_t = mu_denoised_copy_ratio_st[si, :]
io_commons.write_ndarray_to_tsv(
os.path.join(sample_posterior_path, io_consts.default_denoised_copy_ratios_mean_tsv_filename),
denoised_copy_ratio_mu_s,
mu_denoised_copy_ratio_t,
extra_comment_lines=sample_name_comment_line,
header=io_consts.denoised_copy_ratio_mean_column_name,
write_shape_info=False
)

# write denoised copy ratio standard deviations
denoised_copy_ratio_std_s = denoised_copy_ratios_std[:, si]
std_denoised_copy_ratio_t = std_denoised_copy_ratio_st[si, :]
io_commons.write_ndarray_to_tsv(
os.path.join(sample_posterior_path, io_consts.default_denoised_copy_ratios_std_tsv_filename),
denoised_copy_ratio_std_s,
std_denoised_copy_ratio_t,
extra_comment_lines=sample_name_comment_line,
header=io_consts.denoised_copy_ratio_std_column_name,
write_shape_info=False
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -226,16 +226,18 @@ def add_sample_to_cum_sum(posterior_sample, _cum_sum):
return outputs[-1] / size


def get_sampling_generator_for_model_approximation(model_approx: pm.MeanField, model_var_name: str,
def get_sampling_generator_for_model_approximation(model_approx: pm.MeanField, node,
num_samples: int = 20) -> Generator:
"""Get a generator that returns samples of a precomputed model approximation for a specific variable in that model
Args:
model_approx: an instance of PyMC3 mean-field approximation
model_var_name: a stochastic node in the model
node: a stochastic node in the model
num_samples: number of samples to draw
Returns:
A generator that will yield `num_samples` samples from an approximation to a posterior
"""
return (model_approx.sample()[model_var_name] for _ in range(num_samples))

sample = model_approx.sample_node(node, size=1)[0]
return (sample.eval() for _ in range(num_samples))
Original file line number Diff line number Diff line change
Expand Up @@ -437,13 +437,18 @@ def __init__(self,
self.log_trans_tkk: Optional[np.ndarray] = None

# GC bias factors
# (to be initialize by calling `initialize_bias_inference_vars`)
# (to be initialized by calling `initialize_bias_inference_vars`)
self.W_gc_tg: Optional[tst.SparseConstant] = None

# auxiliary data structures for hybrid q_c_expectation_mode calculation
# (to be initialize by calling `initialize_bias_inference_vars`)
# (to be initialized by calling `initialize_bias_inference_vars`)
self.interval_neighbor_index_list: Optional[List[List[int]]] = None

# denoised copy ratios
denoised_copy_ratio_st = np.zeros((self.num_samples, self.num_intervals), dtype=types.floatX)
self.denoised_copy_ratio_st: types.TensorSharedVariable = th.shared(
denoised_copy_ratio_st, name="denoised_copy_ratio_st", borrow=config.borrow_numpy)

# initialize posterior
posterior_initializer.initialize_posterior(denoising_config, calling_config, self)
self.initialize_bias_inference_vars()
Expand Down Expand Up @@ -778,10 +783,10 @@ def __init__(self,
# the expected number of erroneously mapped reads
mean_mapping_error_correction_s = eps * read_depth_s * shared_workspace.average_ploidy_s

denoised_copy_ratios = ((shared_workspace.n_st - mean_mapping_error_correction_s.dimshuffle(0, 'x'))
denoised_copy_ratio_st = ((shared_workspace.n_st - mean_mapping_error_correction_s.dimshuffle(0, 'x'))
/ ((1.0 - eps) * read_depth_s.dimshuffle(0, 'x') * bias_st))

Deterministic(name='denoised_copy_ratios', var=denoised_copy_ratios)
Deterministic(name='denoised_copy_ratio_st', var=denoised_copy_ratio_st)

mu_stc = ((1.0 - eps) * read_depth_s.dimshuffle(0, 'x', 'x')
* bias_st.dimshuffle(0, 1, 'x')
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import os

# set theano flags
os.environ["THEANO_FLAGS"] = "device=cpu,floatX=float64,optimizer=fast_run,compute_test_value=ignore,openmp=true"
os.environ["THEANO_FLAGS"] = "device=cpu,floatX=float64,optimizer=fast_run,compute_test_value=ignore," + \
"openmp=true,blas.ldflags=-lmkl_rt,openmp_elemwise_minsize=10"

import logging
import argparse
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import os

# set theano flags
os.environ["THEANO_FLAGS"] = "device=cpu,floatX=float64,optimizer=fast_run,compute_test_value=ignore,openmp=true"
os.environ["THEANO_FLAGS"] = "device=cpu,floatX=float64,optimizer=fast_run,compute_test_value=ignore," + \
"openmp=true,blas.ldflags=-lmkl_rt,openmp_elemwise_minsize=10"

import argparse
import gcnvkernel
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
import shutil

# set theano flags
os.environ["THEANO_FLAGS"] = "device=cpu,floatX=float64,optimizer=fast_run,compute_test_value=ignore,openmp=true"
os.environ["THEANO_FLAGS"] = "device=cpu,floatX=float64,optimizer=fast_run,compute_test_value=ignore," + \
"openmp=true,blas.ldflags=-lmkl_rt,openmp_elemwise_minsize=10"

import logging
import argparse
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import os

# set theano flags
os.environ["THEANO_FLAGS"] = "device=cpu,floatX=float64,optimizer=fast_run,compute_test_value=ignore,openmp=true"
os.environ["THEANO_FLAGS"] = "device=cpu,floatX=float64,optimizer=fast_run,compute_test_value=ignore," + \
"openmp=true,blas.ldflags=-lmkl_rt,openmp_elemwise_minsize=10"

import argparse
import gcnvkernel
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import os

# set theano flags
os.environ["THEANO_FLAGS"] = "device=cpu,floatX=float64,optimizer=fast_run,compute_test_value=ignore,openmp=true"
os.environ["THEANO_FLAGS"] = "device=cpu,floatX=float64,optimizer=fast_run,compute_test_value=ignore," + \
"openmp=true,blas.ldflags=-lmkl_rt,openmp_elemwise_minsize=10"

import logging
import argparse
Expand Down

0 comments on commit 0429d5a

Please sign in to comment.