diff --git a/deepcpg/__init__.py b/deepcpg/__init__.py index a6221b3..3f6fab6 100644 --- a/deepcpg/__init__.py +++ b/deepcpg/__init__.py @@ -1 +1 @@ -__version__ = '1.0.2' +__version__ = '1.0.3' diff --git a/deepcpg/data/utils.py b/deepcpg/data/utils.py index 985b629..aaf41ee 100644 --- a/deepcpg/data/utils.py +++ b/deepcpg/data/utils.py @@ -147,8 +147,24 @@ def sample_frame(frame): return frame -def read_cpg_profile(filename, chromos=None, nb_sample=None, round=True, +def is_binary(values): + """Check if values are binary, i.e. zero or one.""" + return ~np.any((values > 0) & (values < 1)) + + +def read_cpg_profile(filename, chromos=None, nb_sample=None, round=False, sort=True, nb_sample_chromo=None): + """Read CpG profile. + + Reads CpG profile from either tab delimited file with columns + `chromo`, `pos`, `value`. `value` or bedGraph file. `value` columns contains + methylation states, which can be binary or continuous. + + Returns + ------- + Pandas table with columns `chromo`, `pos`, `value`. + """ + if is_bedgraph(filename): usecols = [0, 1, 3] skiprows = 1 @@ -162,6 +178,8 @@ def read_cpg_profile(filename, chromos=None, nb_sample=None, round=True, d = pd.read_table(filename, header=None, comment='#', nrows=nrows, usecols=usecols, dtype=dtype, skiprows=skiprows) d.columns = ['chromo', 'pos', 'value'] + if np.any((d['value'] < 0) | (d['value'] > 1)): + raise ValueError('Methylation values must be between 0 and 1!') d['chromo'] = format_chromo(d['chromo']) if chromos is not None: if not isinstance(chromos, list): @@ -175,8 +193,8 @@ def read_cpg_profile(filename, chromos=None, nb_sample=None, round=True, d.sort_values(['chromo', 'pos'], inplace=True) if round: d['value'] = np.round(d.value) - if not np.all((d.value == 0) | (d.value == 1)): - raise 'Invalid methylation states' + if is_binary(d['value']): + d['value'] = d['value'].astype(np.int8) return d diff --git a/deepcpg/evaluation.py b/deepcpg/evaluation.py index 2081dc7..674c2b3 100644 --- a/deepcpg/evaluation.py +++ b/deepcpg/evaluation.py @@ -14,7 +14,7 @@ def cor(y, z): - """Computes Pearon's correlation.""" + """Compute Pearson correlation coefficient.""" return np.corrcoef(y, z)[0, 1] diff --git a/deepcpg/models/utils.py b/deepcpg/models/utils.py index b2bf714..acaca46 100644 --- a/deepcpg/models/utils.py +++ b/deepcpg/models/utils.py @@ -357,8 +357,8 @@ def _prepro_cpg(self, states, dists): for state, dist in zip(states, dists): nan = state == dat.CPG_NAN if np.any(nan): - tmp = np.sum(state == 1) / state.size - state[nan] = np.random.binomial(1, tmp, nan.sum()) + state[nan] = np.random.binomial(1, state[~nan].mean(), + nan.sum()) dist[nan] = self.cpg_max_dist dist = np.minimum(dist, self.cpg_max_dist) / self.cpg_max_dist prepro_states.append(np.expand_dims(state, 1)) diff --git a/docs/fig1.png b/docs/fig1.png deleted file mode 100644 index 81c4089..0000000 Binary files a/docs/fig1.png and /dev/null differ diff --git a/docs/source/conf.py b/docs/source/conf.py index 0b36c78..26d06b4 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -68,9 +68,9 @@ # built documents. # # The short X.Y version. -version = '1.0.2' +version = '1.0.3' # The full version, including alpha/beta/rc tags. -release = '1.0.2' +release = '1.0.3' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/source/data.rst b/docs/source/data.rst new file mode 100644 index 0000000..0367774 --- /dev/null +++ b/docs/source/data.rst @@ -0,0 +1,192 @@ +.. _data: + +============= +Data creation +============= + + +This tutorial describes how to create and analyze the input data of DeepCpG. + +Creating data +============= + +``dcpg_data.py`` creates the data for model training and evaluation, given multiple methylation profiles: + +.. code:: bash + + dcpg_data.py + --cpg_profiles ./cpg/*.tsv + --dna_files mm10/*.dna.*.fa.gz + --cpg_wlen 50 + --dna_wlen 1001 + --out_dir ./data + +``--cpg_profiles`` specifies a list of files that store the observed CpG methylation states for either a bulk- or single-cell methylation profile. Supports bedGraph and TSV files. + +BedGraph files must start with ``track type=bedGraph``. Each following line represents the methylation state of a CpG site, for example: + +.. parsed-literal:: + + track type=bedGraph + chr1 3007532 3007533 1.0 + chr1 3007580 3007581 0.4 + chr1 3012096 3012097 1.0 + chr1 3017509 3017510 0.1 + +The columns have the following meaning: + +* Column 1: the chromosome of the CpG site starting with ``chr``. +* Column 2: the location of the C of the CpG site. Positions are enumerated starting at one. +* Column 3: the position of the G of the CpG site. +* Column 4: the observed methylation value (:math:`\in (0;1)`) of the CpG site. If all values are binary, i.e. either zero or one, they will also be stored by DeepCpG as binary values, which reduces disk usage. Continuous values are required for representing hemi-methylation or bulk methylation profiles. + + +TSV files do not start with a track column and only contain three columns, for example: + +.. parsed-literal:: + + chr1 3007532 1.0 + chr1 3007580 0.4 + chr1 3012096 1.0 + chr1 3017509 0.1 + +``--cpg_profiles`` files can be gzip-compressed (``*.gz``) to reduce disk usage. + +``--dna_files`` specifies a list of FASTA files, where each file stores the DNA sequence of a particular chromosome. Files can be downloaded from `Ensembl `_, e.g. `mm10 `_ for mouse or `hg38 `_ for human, and specified either via a glob pattern, e.g. ``--dna_files mm10/*.dna.*fa.gz`` or simply by the directory name, e.g. ``--dna_files mm10``. The argument ``--dna_files`` is not required for imputing methylation states from neighboring methylation states without using the DNA sequence. + +``--cpg_wlen`` specifies the sum of CpG sites to the left and right of the target site that DeepCpG will use for making predictions. For example, DeepCpG will use 25 CpG sites to the left and right of the target CpG site using ``--cpg_wlen 50``. A value of about 50 usually covers a wide methylation context and is sufficient to achieve a good performance. If you are dealing with many cells, I recommend using a smaller value to reduce disk usage. + +``--dna_wlen`` specifies the width of DNA sequence windows in base pairs that are centered on the target CpG site. Wider windows usually improve prediction accuracy but increase compute- and storage costs. I recommend ``--dna_wlen 1001``. + +These are the most important arguments for imputing methylation profiles. ``dcpg_data.py`` provides additional arguments for debugging and predicting statistics across profiles, e.g. the mean methylation rate or cell-to-cell variance. + + +Debugging +--------- + +For debugging, testing, or reducing compute costs, ``--chromos`` can be used the select certain chromosomes. ``--nb_sample_chromo`` randomly samples a certain number of CpG sites from each chromosome, and ``--nb_sample`` specifies the maximum number of CpG sites in total. + + +Predicting statistics +--------------------- + +For predicting statistics across methylation profiles, ``--stats`` and ``--win_stats`` can be used. These arguments specify a list of statistics that are computed across profiles for either a single CpG site or in a window of size ``--win_stats_wlen`` that is centered on a target CpG site. Following statistics are supported: + +* ``mean``: the mean methylation rate. +* ``mode``: the mode of methylation rates. +* ``var``: the cell-to-cell variance. +* ``cat_var``: three categories of cell-to-cell variance, i.e. low, medium, or high variance. +* ``cat2_var``: two categories of cell-to-cell variance, i.e. low or high variance. +* ``entropy``: the entropy across cells. +* ``diff``: if a CpG site is differentially methylated, i.e. methylated in one profile but zero in others. +* ``cov``: the CpG coverage, i.e. the number of profiles for which the methylation state of the target CpG site is observed. + +Statistics are only computed or CpG sites that are covered by at least ``--stats_cov`` (default 1) cells. Increasing ``--stats_cov`` will lead to more robust estimates. + + +Common issues +------------- + +**Why am I getting warnings 'No CpG site at position X!' when using ``dcpg_data.py``?** + +This means that some sites in ``--cpg_profile`` files are not CpG sites, i.e. there is no CG dinucleotide at the given position in the DNA sequence. Make sure that ``--dna_files`` point to the correct genome and CpG sites are correctly aligned. Since DeepCpG currently does not support allele-specific methylation, data from different alleles must be merged (recommended) or only one allele be used. + + +Computing data statistics +========================= + +``dcpg_data_stats.py`` enables to compute statistics for a list of DeepCpG input files: + +.. code:: bash + + dcpg_data_stats.py ./data/c1_000000-001000.h5 ./data/c13_000000-001000.h5 + +.. parsed-literal:: + + output nb_tot nb_obs frac_obs mean var + 0 cpg/BS27_1_SER 2000 391 0.1955 0.790281 0.165737 + 1 cpg/BS27_3_SER 2000 408 0.2040 0.740196 0.192306 + 2 cpg/BS27_5_SER 2000 393 0.1965 0.692112 0.213093 + 3 cpg/BS27_6_SER 2000 402 0.2010 0.666667 0.222222 + 4 cpg/BS27_8_SER 2000 408 0.2040 0.776961 0.173293 + +The columns have the following meaning: + +* ``output``: The name of the target cell. +* ``nb_tot``: The total number of CpG sites. +* ``nb_obs``: The number of CpG sites for which the true label of ``output`` is observed. +* ``frac_obs``: The fraction ``nb_obs/nb_tot`` of observed CpG sites. +* ``mean``: The mean of ``output``, e.g. the mean methylation rate. +* ``var``: The variance of ``output``, e.g. the variance in CpG methylation levels. + +``--nb_tot`` and ``--nb_obs`` are particularly useful for deciding how to split the data into a training, test, validation set as explained in the :ref:`training tutorial `. Statistics can be written to a TSV file using ``--out_tsv`` and be visualized using ``--out_plot``. + + +Printing data +============= + +``dcpg_data_show.py`` enables to selectively print the content of a list of DeepCpG data files. Using ``--outputs`` prints all DeepCpG model outputs in a selected region: + +.. code:: bash + + dcpg_data_show.py ./data/c1_000000-001000.h5 --chromo 1 --start 189118909 --end 189867450 --outputs + +.. parsed-literal:: + + loc outputs + chromo pos cpg/BS27_1_SER cpg/BS27_3_SER cpg/BS27_5_SER cpg/BS27_6_SER cpg/BS27_8_SER + 950 1 189118909 -1 -1 1 -1 -1 + 951 1 189314906 -1 -1 1 -1 -1 + 952 1 189506185 1 -1 -1 -1 -1 + 953 1 189688256 -1 0 -1 -1 -1 + 954 1 189688274 -1 -1 -1 -1 0 + 955 1 189699529 -1 -1 -1 1 -1 + 956 1 189728263 -1 -1 0 -1 -1 + 957 1 189741539 -1 1 -1 -1 -1 + 958 1 189850865 -1 -1 -1 1 -1 + 959 1 189867450 -1 1 -1 -1 -1 + + +``-1`` indicates unobserved methylation states. If ``--outputs`` is followed by a list of output names, only they will be printed. The arguments ``--cpg``, ``--cpg_wlen``, and ``--cpg_dist`` control how many (``--cpg_wlen``) neighboring methylation states (``--cpg``) and corresponding distances (``--cpg_dist``) are printed. For example, the following commands prints the state and distance of four neighboring CpG sites of cell *BS27_1_SER*: + +.. code:: bash + + dcpg_data_show.py ./data/c1_000000-001000.h5 --chromo 1 --start 189118909 --end 189867450 --outputs cpg/BS27_1_SER --cpg BS27_1_SER --cpg_wlen 4 --cpg_dist + +.. parsed-literal:: + + loc outputs BS27_1_SER/state BS27_1_SER/dist + chromo pos cpg/BS27_1_SER -2 -1 +1 +2 -2 -1 +1 +2 + 950 1 189118909 -1 1 1 1 1 84023.0 65557.0 114153.0 373437.0 + 951 1 189314906 -1 1 1 1 1 261554.0 81844.0 177440.0 191279.0 + 952 1 189506185 1 1 1 1 0 273123.0 13839.0 162360.0 662239.0 + 953 1 189688256 -1 1 1 0 1 182071.0 19711.0 480168.0 705968.0 + 954 1 189688274 -1 1 1 0 1 182089.0 19729.0 480150.0 705950.0 + 955 1 189699529 -1 1 1 0 1 193344.0 30984.0 468895.0 694695.0 + 956 1 189728263 -1 1 1 0 1 222078.0 59718.0 440161.0 665961.0 + 957 1 189741539 -1 1 1 0 1 235354.0 72994.0 426885.0 652685.0 + 958 1 189850865 -1 1 1 0 1 344680.0 182320.0 317559.0 543359.0 + 959 1 189867450 -1 1 1 0 1 361265.0 198905.0 300974.0 526774.0 + +Analogously, ``--dna_wlen`` prints the DNA sequence window of that length centered on the target CpG sites: + +.. code:: bash + + dcpg_data_show.py ./data/c1_000000-001000.h5 --chromo 1 --start 189118909 --end 189867450 --outputs cpg/BS27_1_SER --dna_wlen 11 + +.. parsed-literal:: + + loc outputs dna + chromo pos cpg/BS27_1_SER -5 -4 -3 -2 -1 0 +1 +2 +3 +4 +5 + 950 1 189118909 -1 2 1 0 0 0 3 2 2 0 0 3 + 951 1 189314906 -1 3 1 3 3 2 3 2 3 0 1 3 + 952 1 189506185 1 0 3 3 3 0 3 2 2 2 0 1 + 953 1 189688256 -1 2 3 3 2 2 3 2 2 3 2 2 + 954 1 189688274 -1 2 3 0 2 0 3 2 1 3 2 2 + 955 1 189699529 -1 2 3 2 2 0 3 2 3 1 1 1 + 956 1 189728263 -1 3 1 3 3 3 3 2 2 3 3 2 + 957 1 189741539 -1 2 0 2 1 2 3 2 1 2 2 3 + 958 1 189850865 -1 2 2 3 2 2 3 2 2 3 2 2 + 959 1 189867450 -1 3 1 3 0 3 3 2 1 2 3 0 + +With ``--out_hdf``, the selected data can be stored as `Pandas data frame `_ to a HDF5 file. diff --git a/docs/source/fig1.png b/docs/source/fig1.png new file mode 100644 index 0000000..429616e Binary files /dev/null and b/docs/source/fig1.png differ diff --git a/docs/source/index.rst b/docs/source/index.rst index 4b5b31d..d9e9190 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -2,11 +2,19 @@ DeepCpG: Deep neural networks for predicting single-cell DNA methylation ======================================================================== -DeepCpG is a deep neural network for predicting the methylation state of CpG dinucleotides in multiple cells. It allows to accurately impute incomplete DNA methylation profiles, to discover predictive sequence motifs, and to quantify the effect of sequence mutations. (`Angermueller et al, 2017 `_). +DeepCpG [1]_ is a deep neural network for predicting the methylation state of CpG dinucleotides in multiple cells. It allows to accurately impute incomplete DNA methylation profiles, to discover predictive sequence motifs, and to quantify the effect of sequence mutations. (`Angermueller et al, 2017 `_). -:: +.. figure:: fig1.png + :width: 640 px + :align: left + :alt: DeepCpG model architecture and applications - Angermueller, Christof, Heather Lee, Wolf Reik, and Oliver Stegle. Accurate Prediction of Single-Cell DNA Methylation States Using Deep Learning. http://biorxiv.org/content/early/2017/02/01/055715 bioRxiv, February 1, 2017, 55715. doi:10.1101/055715. + **DeepCpG model architecture and applications.** + + \(a\) Sparse single-cell CpG profiles as obtained from scBS-seq or scRRBS-seq. Methylated CpG sites are denoted by ones, unmethylated CpG sites by zeros, and question marks denote CpG sites with unknown methylation state (missing data). (b) DeepCpG model architecture. The DNA model consists of two convolutional and pooling layers to identify predictive motifs from the local sequence context, and one fully connected layer to model motif interactions. The CpG model scans the CpG neighborhood of multiple cells (rows in b), using a bidirectional gated recurrent network (GRU), yielding compressed features in a vector of constant size. The Joint model learns interactions between higher-level features derived from the DNA- and CpG model to predict methylation states in all cells. (c, d) The trained DeepCpG model can be used for different downstream analyses, including genome-wide imputation of missing CpG sites (c) and the discovery of DNA sequence motifs that are associated with DNA methylation levels or cell-to-cell variability (d). + + +.. [1] Angermueller, Christof, Heather Lee, Wolf Reik, and Oliver Stegle. Accurate Prediction of Single-Cell DNA Methylation States Using Deep Learning. http://biorxiv.org/content/early/2017/02/01/055715 bioRxiv, February 1, 2017, 55715. doi:10.1101/055715. Installation @@ -41,9 +49,11 @@ Interactive examples on how to use DeepCpG can be found `here `__. ResNets are very deep networks with skip connections to improve the gradient flow and to allow learning how many layers to use. A residual network consists of multiple @@ -109,22 +109,22 @@ convolutional layers to speed up computations. ``ResNet01`` and units, respectively. ResNets are slower than CNNs, but can perform better on large datasets. -Modules starting with ``ResConv`` are ResNets with modified residual +Models starting with ``ResConv`` are ResNets with modified residual units that have two convolutional layers instead of a bottleneck -architecture. ``ResConv`` modules performed worse than ``ResNet`` -modules in my experiments. +architecture. ``ResConv`` models performed worse than ``ResNet`` +models in my experiments. -Modules starting with ``ResAtrous`` are ResNets with modified residual +Models starting with ``ResAtrous`` are ResNets with modified residual units that use `Atrous convolutional layers `__ instead of normal convolutional layers. Atrous convolutional layers have dilated filters, i.e. filters with 'holes', which allow scanning wider regions in the inputs sequence and thereby better capturing distant patters in the DNA -sequence. However, ``ResAtrous`` modules performed worse than ``ResNet`` -modules in my experiments +sequence. However, ``ResAtrous`` models performed worse than ``ResNet`` +models in my experiments -CpG modules -=========== +CpG model architectures +======================= +---------+--------------+-----------------------------------+ | Name | Parameters | Specification | @@ -136,12 +136,12 @@ CpG modules | RnnL2 | 1,100,000 | fc[256]\_bgru[128]\_bgru[256]\_do | +---------+--------------+-----------------------------------+ -``FcAvg`` is a lightweight module with only 54000 parameters, which +``FcAvg`` is a lightweight model with only 54000 parameters, which first transforms observed neighboring CpG sites of all cells independently, and than averages the transformed features across cells. -``FcAvg`` is very fast, but performs worse than RNN modules. +``FcAvg`` is very fast, but performs worse than RNN models. -``Rnn`` modules consists of bidirectional recurrent neural networks +``Rnn`` models consists of bidirectional recurrent neural networks (RNNs) with gated recurrent units (GRUs) to summarize the methylation neighborhood of cells in a more clever way than averaging. ``RnnL1`` consists of one fully-connected layer with 256 units to transform the @@ -151,8 +151,8 @@ methylation neighborhood of cells. ``RnnL2`` has two instead of one GRU layer. ``RnnL1`` is faster and performed as good as ``RnnL2`` in my experiments. -Joint modules -============= +Joint model architectures +========================= +---------------+--------------+---------------------------+ | Name | Parameters | Specification | @@ -166,8 +166,8 @@ Joint modules | JointL3h512 | 1,000,000 | fc[512]\_fc[512]\_fc[512] | +---------------+--------------+---------------------------+ -Joint modules join the feature from the DNA and CpG module. ``JointL0`` +Joint models join the feature from the DNA and CpG model. ``JointL0`` simply concatenates the features and has no learnable parameters (ultra fast). ``JointLXh512`` has ``X`` fully-connect layers with 512 neurons. -Modules with more layers usually perform better, at the cost of a higher +Models with more layers usually perform better, at the cost of a higher runtime. I recommend using ``JointL2h512`` or ``JointL3h12``. diff --git a/docs/source/scripts/index.rst b/docs/source/scripts/index.rst new file mode 100644 index 0000000..2fb6397 --- /dev/null +++ b/docs/source/scripts/index.rst @@ -0,0 +1,71 @@ +.. _scripts: + +======= +Scripts +======= + +Documentation of DeepCpG scripts. + +.. toctree:: + :maxdepth: 2 + + +dcpg_data.py +============= + +.. automodule:: scripts.dcpg_data + :members: + +dcpg_data_show.py +================= + +.. automodule:: scripts.dcpg_data_show + :members: + +dcpg_data_stats.py +================== + +.. automodule:: scripts.dcpg_data_stats + :members: + +dcpg_download.py +================ + +.. automodule:: scripts.dcpg_download + :members: + +dcpg_eval.py +============= + +.. automodule:: scripts.dcpg_eval + :members: + +dcpg_eval_export.py +=================== + +.. automodule:: scripts.dcpg_eval_export + :members: + +dcpg_filter_act.py +================== + +.. automodule:: scripts.dcpg_filter_act + :members: + +dcpg_filter_motifs.py +===================== + +.. automodule:: scripts.dcpg_filter_motifs + :members: + +dcpg_train.py +============= + +.. automodule:: scripts.dcpg_train + :members: + +dcpg_train_viz.py +================= + +.. automodule:: scripts.dcpg_train_viz + :members: diff --git a/docs/source/train.rst b/docs/source/train.rst index 79de709..ac1d583 100644 --- a/docs/source/train.rst +++ b/docs/source/train.rst @@ -26,7 +26,7 @@ test set: train_files="$data_dir/c{1,2,3,4,5,7,9,11,13}_*.h5 val_files="$data_dir/c{14,15,16,17,18,19}_*.h5" test_files="$data_dir/c{6,8,10,12,14}_*.h5" - + dcpg_train.py $train_files --val_file $val_files @@ -38,18 +38,18 @@ example, you could use ``train_files=$data_Dir/c*_[01].h5`` to select all data files starting with index 0 or 1 for training, and use the remaining files for validation. -If you are not concerend about comparing DeepCpG with other models, you +If you are not concerned about comparing DeepCpG with other models, you do not need a test set. In this case, you could, for example, leave out -chromsome 14-19 as validation set, and use the remaining chromosomes for +chromosome 14-19 as validation set, and use the remaining chromosomes for training. If your data were generated using whole-genome scBS-seq, then the number of CpG sites on few chromosomes is usually already sufficient for -training. For example, chromsome 1, 3, and 5 from *Smallwood et al +training. For example, chromosome 1, 3, and 5 from *Smallwood et al (2014)* cover already more than 3 million CpG sites. I found about 3 million CpG sites as sufficient for training models without overfitting. However, if you are working with scRRBS-seq data, you probably need more -chromosomes for trainig. To check how many CpG sites are stored in a set +chromosomes for training. To check how many CpG sites are stored in a set of DeepCpG data files, you can use the ``dcpg_data_stats.py``. The following command will compute different statistics for the training set, including the number number of CpG sites: @@ -79,18 +79,18 @@ mean methylation rate, and ``var`` the variance of the methylation rate. .. _train_joint: -Training DeepCpG modules jointly +Training DeepCpG models jointly ================================ As described in `Angermueller et al (2017) `__, DeepCpG -consists of a DNA, CpG, and joint module. The DNA module recognizes +consists of a DNA, CpG, and Joint model. The DNA model recognizes features in the DNA sequence window that is centered on a target site, -the CpG module recognizes features in observed neighboring methylation -states of multiple cells, and the joint module integrates features from -the DNA and CpG module and predicts the methylation state of all cells. +the CpG model recognizes features in observed neighboring methylation +states of multiple cells, and the Joint model integrates features from +the DNA and CpG model and predicts the methylation state of all cells. -The easiest way is to train all modules jointly: +The easiest way is to train all models jointly: .. code:: bash @@ -102,27 +102,27 @@ The easiest way is to train all modules jointly: --out_dir $models_dir/joint --nb_epoch 30 -``--dna_model``, ``--cpg_model``, and ``--joint_module`` specify the -name of DNA, CpG, and joint architecture, respectively, which are -described in `here <./modules.rst>_`. +``--dna_model``, ``--cpg_model``, and ``--joint_model`` specify the +architecture of the DNA, CpG, and Joint model, respectively, which are +described in `here <./models.rst>_`. .. _train_sep: -Training DeepCpG modules separately +Training DeepCpG models separately =================================== -Although it is convenient to train all modules jointly by running only a -single command as described above, I suggest to train modules -separately. First, because it enables to train the DNA and CpG module in +Although it is convenient to train all models jointly by running only a +single command as described above, I suggest to train models +separately. First, because it enables to train the DNA and CpG model in parallel on separate machines and thereby to reduce the training time. -Second, it enables to compare how predictive the DNA module is relative -to CpG module. If you think the CpG module is already accurate enough -alone, you might not need the DNA module. Thirdly, I obtained better -results by training the modules separately. However, this may not be +Second, it enables to compare how predictive the DNA model is relative +to CpG model. If you think the CpG model is already accurate enough +alone, you might not need the DNA model. Thirdly, I obtained better +results by training the models separately. However, this may not be true for your particular dataset. -You can train the CpG module separately by only using the -``--cpg_model`` argument, but not ``--dna_module``: +You can train the CpG model separately by only using the +``--cpg_model`` argument, but not ``--dna_model``: .. code:: bash @@ -133,7 +133,7 @@ You can train the CpG module separately by only using the --out_dir $models_dir/dna --nb_epoch 30 -You can train the DNA module separately by only using ``--dna_model``: +You can train the DNA model separately by only using ``--dna_model``: .. code:: bash @@ -144,8 +144,8 @@ You can train the DNA module separately by only using ``--dna_model``: --out_dir $models_dir/cpg --nb_epoch 30 -After training the CpG and DNA module, we are joining them by specifying -the name of the joint module with ``--joint_module``: +After training the CpG and DNA model, we are joining them by specifying +the name of the Joint model with ``--joint_model``: .. code:: bash @@ -158,8 +158,8 @@ the name of the joint module with ``--joint_module``: --out_dir $models_dir/joint --nb_epoch 10 -``--dna_module`` and ``--cpg_module`` point to the output training -directory of the DNA and CpG module, respectively, which contains their +``--dna_model`` and ``--cpg_model`` point to the output training +directory of the DNA and CpG model, respectively, which contains their specification and weights: .. code:: bash @@ -184,10 +184,10 @@ i.e. the validation weights will be used. The training weights can be used by ``--dna_model ./dna/model.json ./dna/model_weights_train.h5`` In the command above, we used ``--joint_only`` to only train the -paramters of the joint module without training the pre-trained DNA and -CpG module. Although the ``--joint_only`` arguments reduces training -time, you might obtain better results by also fine-tuning the paramters -of the DNA and CpG module without using ``--joint_only``: +parameters of the Joint model without training the pre-trained DNA and +CpG model. Although the ``--joint_only`` arguments reduces training +time, you might obtain better results by also fine-tuning the parameters +of the DNA and CpG model without using ``--joint_only``: .. _train_monitor: @@ -244,7 +244,7 @@ hardware of your machine. On a large dataset, you have to train for fewer epochs than on a small dataset, since your model will have seen already a lot of training samples after one epoch. For training on about 3,000,000 samples, good default values are 20 for the DNA and CpG -module, and 10 for the joint module. +model, and 10 for the Joint model. Early stopping stops training if the loss on the validation set did not improve after the number of epochs that is specified by @@ -269,22 +269,23 @@ the training loss starts to saturate regardless of ``--nb_epoch`` or Optimizing hyper-parameters =========================== -DeepCpG has differernt hyper-parameters, such as the learning rate, -dropout rate, or module architectures. Although the performance of +DeepCpG has different hyper-parameters, such as the learning rate, +dropout rate, or model architectures. Although the performance of DeepCpG is relatively robust to different hyper-parameters, you can tweak performances by trying out different parameter combinations. For -example, you could train different models with different paramters on a +example, you could train different models with different parameters on a subset of your data, select the parameters with the highest performance on the validation set, and then train the full model. -The following hyper-parameters are most important (default values -shown): 1. Learning rate: ``--learning_rate 0.0001`` 2. Dropout rate: -``--dropout 0.0`` 3. DNA model architecture: ``--dna_model CnnL2h128`` -4. Joint model architecture: ``--joint_model JointL2h512`` 5. CpG model -architecture: ``--cpg_model RnnL1`` 6. L2 weight decay: -``--l2_decay 0.0001`` +The following hyper-parameters are most important (default values shown): +1. Learning rate: ``--learning_rate 0.0001`` +2. Dropout rate: ``--dropout 0.0`` +3. DNA model architecture: ``--dna_model CnnL2h128`` +4. Joint model architecture: ``--joint_model JointL2h512`` +5. CpG model architecture: ``--cpg_model RnnL1`` +6. L2 weight decay: ``--l2_decay 0.0001`` -The learning rate defines how agressively model parameters are updated +The learning rate defines how aggressively model parameters are updated during training. If the training loss :ref:`changes only slowly `, you could try increasing the learning rate. If your model is overfitting of if the training loss fluctuates, you should decrease the learning @@ -296,10 +297,10 @@ have only few data and your model is overfitting, then you should increase the dropout rate. Reasonable values are, e.g., 0.0, 0.2, 0.4. DeepCpG provides different architectures for the DNA, CpG, and joint -module. Architectures are more or less complex, depending on how many +model. Architectures are more or less complex, depending on how many layers and neurons say have. More complex model might yield better performances, but take longer to train and might overfit your data. You can find -more information about available module architecture :doc:`here `. +more information about available model architecture :doc:`here `. L2 weight decay is an alternative to dropout for regularizing model training. If your model is overfitting, you might try 0.001, or 0.005. @@ -323,19 +324,19 @@ the first three outputs, and ``--output_names cpg/.*SER.*`` only on outputs that include 'SER' in their name. Analogously, ``--nb_replicate`` and ``--replicate_name`` define the -number and name of cells that are used as input to the CpG module. +number and name of cells that are used as input to the CpG model. ``--nb_replicate 3`` will only use observed methylation states from the -first three cells, and allows to briefly test the CpG module. +first three cells, and allows to briefly test the CpG model. ``--dna_wlen`` specifies the size of DNA sequence windows that will be -used as input to the DNA module. For example, ``--dna_wlen 101`` will +used as input to the DNA model. For example, ``--dna_wlen 101`` will train only on windows of size 101, instead of using the full window length that was specified when creating data files with ``dcpg_data.py``. Analogously, ``--cpg_wlen`` specifies the sum of the number of observed CpG sites to the left and the right of the target CpG site for training -the CpG module. For example, ``--cpg_wlen 10`` will use 5 observed CpG +the CpG model. For example, ``--cpg_wlen 10`` will use 5 observed CpG sites to the left and to the right. .. _train_tune: @@ -349,18 +350,18 @@ train only some components of a model. With ``--fine_tune``, only the output layer will be trained. As the name implies, this argument is useful for fine-tuning a pre-trained model. -``--train_models`` specifies which modules are trained. For example, -``--train_models joint`` will train the joint module, but not the DNA -and CpG module. ``--train_models cpg joint`` will train the CpG and -joint module, but not the DNA module. +``--train_models`` specifies which models are trained. For example, +``--train_models joint`` will train the Joint model, but not the DNA +and CpG model. ``--train_models cpg joint`` will train the CpG and +Joint model, but not the DNA model. ``--trainable`` and ``--not_trainable`` allow including and excluding certain layers. For example, ``--not_trainable '.*' --trainable 'dna/.*_2'`` will only train the -second layers of the DNA module. +second layers of the DNA model. ``--freeze_filter`` excludes the first convolutional layer of the DNA -module from training. +model from training. .. _train_backend: diff --git a/examples/README.md b/examples/README.md index 8deebdf..7e0f245 100644 --- a/examples/README.md +++ b/examples/README.md @@ -20,5 +20,5 @@ bash setup.sh * [lib.sh](./scripts/lib.sh): Global variable definitions and functions. * [data.sh](./scripts/data.sh): Creates DeepCpG data files. -* [train.sh](./scripts/train.sh): Trains DeepCpG DNA, CpG, and joint module separately. +* [train.sh](./scripts/train.sh): Trains DeepCpG DNA, CpG, and Joint model separately. * [eval.sh](./scripts/eval.sh): Evaluates prediction performances and imputes methylation profiles. diff --git a/examples/notebooks/fine_tune/index.ipynb b/examples/notebooks/fine_tune/index.ipynb index 945a902..6ad81eb 100644 --- a/examples/notebooks/fine_tune/index.ipynb +++ b/examples/notebooks/fine_tune/index.ipynb @@ -2,21 +2,30 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "# Fine-tuning a pre-trained model" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "This tutorial describes how to fine-tune a pre-trained model from the [DeepCpG model zoo](https://github.com/cangermueller/deepcpg/blob/master/docs/models.md). Fine-tuning a model that has been pre-trained on a cells which are similar to the cells of interest can considerably decrease training time. " ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Initialization\n", "We first initialize some variables that will be used throughout the tutorial. `test_mode=1` should be used for testing purposes, which speeds up computations by only using a subset of the data. For real applications, `test_mode=0` should be used." @@ -26,7 +35,9 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -53,14 +64,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Creating DeepCpG data files" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "First, we create DeepCpG data files using `dcpg_data.py`. Since we will fine-tune a CpG model, we do not extract sequence windows. Otherwise, `--dna_files` and `--dna_wlen` must to be specified." ] @@ -69,7 +86,9 @@ "cell_type": "code", "execution_count": 4, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -108,14 +127,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Downloading a pre-trained model" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "`dcpg_download.py` downloads a pre-trained model from the DeepCpG model zoo. Available models and their corresponding description can be found on the [model zoo website](https://github.com/cangermueller/deepcpg/blob/master/docs/source/zoo.md), or retrieved with `dcpg_download.py --show`:" ] @@ -124,7 +149,9 @@ "cell_type": "code", "execution_count": 5, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -156,9 +183,12 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ - "A model name consist of three parts, which are separated by '_'. The first part corresponds to the publication, the second to the cell type, and the third to the module type(CpG, DNA, or joint module). Cells from 'Hou2016' were profiled using scRRBS-seq, cells from 'Smallwood2014' using scBS-seq. 'HCC' and 'HepG2' are human cancer cells, and the rest mouse cells. You should use the cell-type that is most similar to the cell-type you are working with. More information about the available models can be found [here](https://github.com/cangermueller/deepcpg/blob/master/docs/models.md). \n", + "A model name consist of three parts, which are separated by '_'. The first part corresponds to the publication, the second to the cell type, and the third to the modle type(CpG, DNA, or Joint model). Cells from 'Hou2016' were profiled using scRRBS-seq, cells from 'Smallwood2014' using scBS-seq. 'HCC' and 'HepG2' are human cancer cells, and the rest mouse cells. You should use the cell-type that is most similar to the cell-type you are working with. More information about the available models can be found [here](https://github.com/cangermueller/deepcpg/blob/master/docs/models.md). \n", "\n", "Since we are dealing with 2i cells and want to train a CpG model, we will fine-tune 'Smallwood2014_2i_cpg':" ] @@ -167,7 +197,9 @@ "cell_type": "code", "execution_count": 6, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -226,7 +258,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The command downloads and stores model files in the output directory, including the weights and JSON file with the model specification:" ] @@ -235,7 +270,9 @@ "cell_type": "code", "execution_count": 7, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -253,21 +290,30 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "`model.json` stores the model specification, and `model_weights_train.h5` and `model_weights_val.h5` the weights that yielded the highest performance on the training and validation set, respectively. `model.h5` combines `model.json` and `model_weights_val.h5`." ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Fine-tuning the model" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "To fine-tune the downloaded model, we use `--cpg_model` followed by the model directory, and `--fine_tune` to only train the output layers.\n", "\n", @@ -280,7 +326,9 @@ "cell_type": "code", "execution_count": 8, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -434,14 +482,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Model evaluation " ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Finally, we evaluate our fine-tuned model and impute methylation profiles using `dcpg_eval.py`:" ] @@ -450,7 +504,9 @@ "cell_type": "code", "execution_count": 9, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { diff --git a/examples/notebooks/motifs/index.ipynb b/examples/notebooks/motifs/index.ipynb index 1634625..f836759 100644 --- a/examples/notebooks/motifs/index.ipynb +++ b/examples/notebooks/motifs/index.ipynb @@ -2,30 +2,42 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "# Motif analysis" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ - "This tutorial describes how to visualize and analyze motifs learnt by DeepCpG, i.e. the filters of the first convolutional layer of the DNA module.\n", + "This tutorial describes how to visualize and analyze motifs learnt by DeepCpG, i.e. the filters of the first convolutional layer of the DNA model.\n", "\n", "We fill first compute the activations (occurrence frequencies) of motifs in sequence windows, and then extract and align sequence fragments that maximally activate each motif. Finally, we will visualize the resulting alignments via [Weblogo](http://weblogo.threeplusone.com/) and compare them to known motifs via [Tomtom](http://web.mit.edu/meme_v4.9.0/doc/tomtom.html)." ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Requirements " ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "[Tomtom](http://web.mit.edu/meme_v4.9.0/doc/tomtom.html) is required for comparing motifs and is part of the [MEME-Suite](http://meme-suite.org/), which can be downloaded [here](http://meme-suite.org/doc/download.html).\n", "\n", @@ -38,7 +50,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Initialization\n", "We first initialize some variables that will be used throughout the tutorial. `test_mode=1` should be used for testing purposes, which speeds up computations by only using a subset of the data. For real applications, `test_mode=0` should be used." @@ -48,7 +63,9 @@ "cell_type": "code", "execution_count": 11, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -77,23 +94,31 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Dowloading the DeepCpG model " ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ - "We will analyze the filters of the DeepCpG DNA module that was trained on serum cells from *Smallwood et al. (2014)*, and which is described in the DeepCpG publication. This model can be downloaded with `dcpg_download.py`:" + "We will analyze the filters of the DeepCpG DNA model that was trained on serum cells from *Smallwood et al. (2014)*, and which is described in the DeepCpG publication. This model can be downloaded with `dcpg_download.py`:" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -187,14 +212,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Creating DeepCpG data files" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Next, we use `dcpg_data.py` to extract sequence windows for computing filter activations. If you have already created a dataset, you can skip this step. We use `--nb_sample_chromo` to randomly sample only 1500 CpG sites from each chromosome, which is sufficient for visualizing motifs and reduces compute costs." ] @@ -203,7 +234,9 @@ "cell_type": "code", "execution_count": 13, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -243,14 +276,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Computing filter activations " ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Now we use `dcpg_filter_act.py` to compute the activation of filters in DNA sequence windows. We are using `--store_inputs` to copy DNA sequences from the input file to the output file, since they are needed in the following step align sequence fragments. We are considering only 30000 CpG sites, which is usually sufficient for visualizing motifs." ] @@ -259,7 +298,9 @@ "cell_type": "code", "execution_count": 14, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -306,14 +347,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Visualizing and analyzing motifs" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "`dcpg_filter_motifs.py` enables to visualize and cluster motifs, compare motifs to known motifs, and compute motif statistics. We will compare motifs to motifs in the CIS-BP database, and plot motif heatmaps, motif activations, as well as the first two principal components of motif activations via the `--plot_heat`, `--plot_dens` and `--plot_pca` argument, respectively. In test mode, we only select filter 0, 1, and 2:" ] @@ -322,7 +369,9 @@ "cell_type": "code", "execution_count": 15, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -410,7 +459,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "We can now have a look into the output motif directory. `report_top.csv` contains for each DeepCpG motif the most similar motif in the CIS-BP database. `report.csv` lists all motifs that are similar to DeepCpG motifs, not only the top motif. Both files include the following columns:\n", "* `idx`: Index of the filter starting with zero.\n", @@ -428,7 +480,9 @@ "cell_type": "code", "execution_count": 16, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -448,7 +502,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Sequence logos are stored in the `logos/` directory. Let's have a look at the top-ranked motif 121-Ctcf (not true in test mode):\n", "" @@ -457,7 +514,9 @@ { "cell_type": "markdown", "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "source": [ "The corresponding PWM weight matrix stored in `heat/` looks as follows:\n", @@ -466,7 +525,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Densities of motif activations are stored in `dens/`, e.g. the density of the selected motif:\n", "" @@ -474,7 +536,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Finally, `plot_wmean.png` shows the two first principal components of weighted-mean motif activations in sequence windows. Motifs with similar activaton pattern cluster close to each other:\n", "" diff --git a/examples/notebooks/train/index.ipynb b/examples/notebooks/train/index.ipynb deleted file mode 100644 index 8b4cfef..0000000 --- a/examples/notebooks/train/index.ipynb +++ /dev/null @@ -1,371 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Model training" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here you can find information about how to train DeepCpG models." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Splitting data into training, validation, and test set" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For comparing different models, it is necessary to train, select hyper-parameters, and test models on distinct data. In holdout validation, the dataset is split into a training set (~60% of the data), validation set (~20% of the data), and test set (~20% of the data). Models are trained on the training set, hyper-parameters selected on the validation set, and the selected models compared on the test set. For example, you could use chromosome 1-5, 7, 9, 11, 13 as training set, chromosome 14-19 as validation set, and chromosome 6, 8, 10, 12, 14 as test set:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "train_files=\"$data_dir/c{1,2,3,4,5,7,9,11,13}_*.h5\n", - "val_files=\"$data_dir/c{14,15,16,17,18,19}_*.h5\"\n", - "test_files=\"$data_dir/c{6,8,10,12,14}_*.h5\"\n", - "\n", - "dcpg_train.py\n", - " $train_files\n", - " --val_file $val_files\n", - " ... " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As you can see, DeepCpG allows to easily split the data by glob patterns. You do not have to split the dataset by chromosomes. For example, you could use `train_files=$data_Dir/c*_[01].h5` to select all data files starting with index 0 or 1 for training, and use the remaining files for validation. \n", - "\n", - "If you are not concerend about comparing DeepCpG with other models, you do not need a test set. In this case, you could, for example, leave out chromsome 14-19 as validation set, and use the remaining chromosomes for training.\n", - "\n", - "If your data were generated using whole-genome scBS-seq, then the number of CpG sites on few chromosomes is usually already sufficient for training. For example, chromsome 1, 3, and 5 from *Smallwood et al (2014)* cover already more than 3 million CpG sites. I found about 3 million CpG sites as sufficient for training models without overfitting. However, if you are working with scRRBS-seq data, you probably need more chromosomes for trainig. To check how many CpG sites are stored in a set of DeepCpG data files, you can use the `dcpg_data_stats.py`. The following command will compute different statistics for the training set, including the number number of CpG sites:" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\r\n", - "#################################\r\n", - "dcpg_data_stats.py ./data/c19_000000-032768.h5 ./data/c19_032768-050000.h5\r\n", - "#################################\r\n", - " output nb_tot nb_obs frac_obs mean var\r\n", - "0 cpg/BS27_1_SER 50000 20621 0.41242 0.665972 0.222453\r\n", - "1 cpg/BS27_3_SER 50000 13488 0.26976 0.573102 0.244656\r\n", - "2 cpg/BS27_5_SER 50000 25748 0.51496 0.529633 0.249122\r\n", - "3 cpg/BS27_6_SER 50000 17618 0.35236 0.508117 0.249934\r\n", - "4 cpg/BS27_8_SER 50000 16998 0.33996 0.661019 0.224073\r\n" - ] - } - ], - "source": [ - "dcpg_data_stats.py $data_dir/$train_files" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For each output cell, `nb_tot` is the total number of CpG sites, `nb_obs` the number of CpG sites with known methylation state, `frac_obs` the ratio between `nb_obs` and `nb_tot`, `mean` the mean methylation rate, and `var` the variance of the methylation rate." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Training DeepCpG modules jointly\n", - "\n", - "As described in [Angermueller et al (2017)](http://biorxiv.org/content/early/2017/02/01/055715), DeepCpG consists of a DNA, CpG, and joint module. The DNA module recognizes features in the DNA sequence window that is centered on a target site, the CpG module recognizes features in observed neighboring methylation states of multiple cells, and the joint module integrates features from the DNA and CpG module and predicts the methylation state of all cells.\n", - "\n", - "The easiest way is to train all modules jointly:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "dcpg_train.py\n", - " $train_files\n", - " --val_files $val_files\n", - " --dna_model CnnL2h128\n", - " --cpg_model RnnL1\n", - " --out_dir $models_dir/joint\n", - " --nb_epoch 30" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`--dna_model`, `--cpg_model`, and `--joint_module` specify the name of DNA, CpG, and joint architecture, respectively, which are described in **X**." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Training DeepCpG modules separately\n", - "\n", - "Although it is convenient to train all modules jointly by running only a single command as described above, I suggest to train modules separately. First, because it enables to train the DNA and CpG module in parallel on separate machines and thereby to reduce the training time. Second, it enables to compare how predictive the DNA module is relative to CpG module. If you think the CpG module is already accurate enough alone, you might not need the DNA module. Thirdly, I obtained better results by training the modules separately. However, this may not be true for your particular dataset.\n", - "\n", - "You can train the CpG module separately by only using the `--cpg_model` argument, but not `--dna_module`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "dcpg_train.py\n", - " $train_files\n", - " --val_files $val_files\n", - " --dna_model CnnL2h128\n", - " --out_dir $models_dir/dna\n", - " --nb_epoch 30" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can train the DNA module separately by only using `--dna_model`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "dcpg_train.py\n", - " $train_files\n", - " --val_files $val_files\n", - " --cpg_model RnnL1\n", - " --out_dir $models_dir/cpg\n", - " --nb_epoch 30" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After training the CpG and DNA module, we are joining them by specifying the name of the joint module with `--joint_module`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "dcpg_train.py\n", - " $train_files\n", - " --val_files $val_files\n", - " --dna_model $models_dir/dna\n", - " --cpg_model $models_dir/cpg\n", - " --joint_model JointL2h512\n", - " --out_dir $models_dir/joint\n", - " --nb_epoch 10" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`--dna_module` and `--cpg_module` point to the output training directory of the DNA and CpG module, respectively, which contains their specification and weights:" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "events.out.tfevents.1488213772.lawrence model.json\r\n", - "lc_train.csv model_weights_train.h5\r\n", - "lc_val.csv model_weights_val.h5\r\n", - "model.h5\r\n" - ] - } - ], - "source": [ - "ls $models_dir/dna" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`model.json` is the specification of the trained model, `model_weights_train.h5` the weights with the best performance on the training set, and `model_weights_val.h5` the weights with the best performance on the validation set. `--dna_model ./dna` is equivalent to using `--dna_model ./dna/model.json ./dna/model_weights_val.h5`, i.e. the validation weights will be used. The training weights can be used by `--dna_model ./dna/model.json ./dna/model_weights_train.h5` " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the command above, we used `--joint_only` to only train the paramters of the joint module without training the pre-trained DNA and CpG module. Although the `--joint_only` arguments reduces training time, you might obtain better results by also fine-tuning the paramters of the DNA and CpG module without using `--joint_only`:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Monitoring training progress\n", - "\n", - "To check if your model is training correctly, you should monitor the training and validation loss. DeepCpG prints the loss and performance metrics for each output to the console as you can see from the previous commands. `loss` is the loss on the training set, `val_loss` the loss on the validation set, and `cpg/X_acc`, is, for example, the accuracy for output cell X. DeepCpG also stores these metrics in `X.csv` in the training output directory.\n", - "\n", - "Both the training loss and validation loss should continually decrease until saturation. If at some point the validation loss starts to increase while the training loss is still decreasing, your model is overfitting the training set and you should stop training. DeepCpG will automatically stop training if the validation loss does not increase over the number of epochs that is specified by `--early_stopping` (by default 5). If your model is overfitting already after few epochs, your training set might be to small, and you could try to regularize your model model by choosing a higher value for `--dropout` or `--l2_decay`.\n", - "\n", - "If your training loss fluctuates or increases, then you should decrease the learning rate. For more information on interpreting learning curves I recommend this tutorial.\n", - "\n", - "To stop training before reaching the number of epochs specified by `--nb_epoch`, you can create a stop file (default name `STOP`) in the training output directory with `touch STOP`. See also **X**.\n", - "\n", - "Watching numeric console outputs is not particular user friendly. [TensorBoard](https://www.tensorflow.org/get_started/summaries_and_tensorboard) provides a more convenient and visually appealing way to mointor training. You can use TensorBoard provided that you are using the Tensorflow backend (**see X**). Simply go to the training output directory and run `tensorboard --logdir .`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Deciding how long to train\n", - "The arguments `--nb_epoch` and `--early_stopping` control how long models are trained. \n", - "\n", - "`--nb_epoch` defines the maximum number of training epochs (default 30). After one epoch, the model has seen the entire training set once. The time per epoch hence depends on the size of the training set, but also on the complexity of the model that you are training and the hardware of your machine. On a large dataset, you have to train for fewer epochs than on a small dataset, since your model will have seen already a lot of training samples after one epoch. For training on about 3,000,000 samples, good default values are 20 for the DNA and CpG module, and 10 for the joint module.\n", - "\n", - "Early stopping stops training if the loss on the validation set did not improve after the number of epochs that is specified by `--early_stopping` (default 5). If you are training without specifying a validation set with `--val_files`, early stopping will be deactivated.\n", - "\n", - "`--max_time` sets the maximum training time in hours. This guarantees that training terminates after a certain amount of time regardless of the `--nb_epoch` or `--early_stopping` argument.\n", - "\n", - "`--stop_file` defines the path of a file that, if it exists, stop training after the end of the current epoch. This is useful if you are monitoring training and want to terminate training manually as soon as the training loss starts to saturate regardless of `--nb_epoch` or `--early_stopping`. For example, when using `--stop_file ./train/STOP`, you can create an empty file with `touch ./train/STOP` to stop training at the end of the current epoch." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Optimizing hyper-parameters\n", - "\n", - "DeepCpG has differernt hyper-parameters, such as the learning rate, dropout rate, or module architectures. Although the performance of DeepCpG is relatively robust to different hyper-parameters, you can tweak performances by trying out different parameter combinations. For example, you could train different models with different paramters on a subset of your data, select the parameters with the highest performance on the validation set, and then train the full model.\n", - "\n", - "The following hyper-parameters are most important (default values shown):\n", - "1. Learning rate: `--learning_rate 0.0001`\n", - "2. Dropout rate: `--dropout 0.0`\n", - "3. DNA model architecture: `--dna_model CnnL2h128`\n", - "4. Joint model architecture: `--joint_model JointL2h512`\n", - "5. CpG model architecture: `--cpg_model RnnL1`\n", - "6. L2 weight decay: `--l2_decay 0.0001`\n", - "\n", - "The learning rate defines how agressively model parameters are updated during training. If the training loss changes only slowly (**see X**), you could try increasing the learning rate. If your model is overfitting of if the training loss fluctuates, you should decrease the learning rate. Reasonable values are 0.001, 0.0005, 0.0001, 0.00001, or values in between.\n", - "\n", - "The dropout rate defines how strongly your model is regularized. If you have only few data and your model is overfitting, then you should increase the dropout rate. Reasonable values are, e.g., 0.0, 0.2, 0.4.\n", - "\n", - "DeepCpG provides different architectures for the DNA, CpG, and joint module. Architectures are more or less complex, depending on how many layers and neurons say have. More complex model might yield better performances, but take longer to train and might overfit your data. See **X** for more information about different architectures.\n", - "\n", - "L2 weight decay is an alternative to dropout for regularizing model training. If your model is overfitting, you might try 0.001, or 0.005." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Testing training\n", - "\n", - "`dcpg_train.py` provides different arguments that allow to briefly test training before training the full model for a about a day.\n", - "\n", - "`--nb_train_sample` and `--nb_val_sample` specify the number of training and validation samples. When using `--nb_train_sample 500`, the training loss should briefly decay and your model should start overfitting.\n", - "\n", - "`--nb_output` and `--output_names` define the maximum number and the name of model outputs. For example, `--nb_output 3` will train only on the first three outputs, and `--output_names cpg/.*SER.*` only on outputs that include 'SER' in their name.\n", - "\n", - "Analogously, `--nb_replicate` and `--replicate_name` define the number and name of cells that are used as input to the CpG module. `--nb_replicate 3` will only use observed methylation states from the first three cells, and allows to briefly test the CpG module.\n", - "\n", - "`--dna_wlen` specifies the size of DNA sequence windows that will be used as input to the DNA module. For example, `--dna_wlen 101` will train only on windows of size 101, instead of using the full window length that was specified when creating data files with `dcpg_data.py`.\n", - "\n", - "Analogously, `--cpg_wlen` specifies the sum of the number of observed CpG sites to the left and the right of the target CpG site for training the CpG module. For example, `--cpg_wlen 10` will use 5 observed CpG sites to the left and to the right." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Fine-tuning and training selected components\n", - "`dcpg_train.py` provides different arguments that allow to selectively train only some components of a model. \n", - "\n", - "With `--fine_tune`, only the output layer will be trained. As the name implies, this argument is useful for fine-tuning a pre-trained model.\n", - "\n", - "`--train_models` specifies which modules are trained. For example, `--train_models joint` will train the joint module, but not the DNA and CpG module. `--train_models cpg joint` will train the CpG and joint module, but not the DNA module.\n", - "\n", - "`--trainable` and `--not_trainable` allow including and excluding certain layers. For example, `--not_trainable '.*' --trainable 'dna/.*_2'` will only train the second layers of the DNA module.\n", - "\n", - "\n", - "`--freeze_filter` excludes the first convolutional layer of the DNA module from training." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Configuring the Keras backend\n", - "\n", - "DeepCpG use the [Keras](https://keras.io) deep learning library, which supports [Theano](http://deeplearning.net/software/theano/) or [Tensorflow](https://www.tensorflow.org/) as backend. While Theano has long been the dominant deep learning library, Tensorflow is more suited for parallelizing computations on multiple GPUs and CPUs, and provides [TensorBoard](https://www.tensorflow.org/get_started/summaries_and_tensorboard) to interactively monitor training.\n", - "\n", - "You can configure the backend by setting the `backend` attribute in `~/.keras/keras.json` to `tensorflow` or `theano`. Alternatively you can set the environemnt variable `KERAS_BACKEND='tensorflow'` to use Tensorflow, or `KERAS_BACKEND='theano'` to use Theano.\n", - "\n", - "You can find more information about Keras backends [here](https://keras.io/backend/)." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Bash", - "language": "bash", - "name": "bash" - }, - "language_info": { - "codemirror_mode": "shell", - "file_extension": ".sh", - "mimetype": "text/x-sh", - "name": "bash" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/examples/scripts/eval.sh b/examples/scripts/eval.sh index 30530d3..a8c1b43 100755 --- a/examples/scripts/eval.sh +++ b/examples/scripts/eval.sh @@ -13,7 +13,7 @@ run "rm -rf $eval_dir" run "mkdir -p $eval_dir" # Evaluate model and impute methylation profiles. You can change the input model -# to `dna` or `cpg` for evaluation the DNA or CpG module, respectively. +# to `dna` or `cpg` for evaluation the DNA or CpG model, respectively. cmd="dcpg_eval.py $data_dir/*.h5 --model $models_dir/joint diff --git a/examples/scripts/train.sh b/examples/scripts/train.sh index 2fe825d..36a88d3 100755 --- a/examples/scripts/train.sh +++ b/examples/scripts/train.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -# Trains DeepCpG modules separately. +# Trains DeepCpG models separately. # Source dependencies. source "./lib.sh" @@ -23,7 +23,7 @@ if [[ $test_mode -eq 1 ]]; then fi -# Train DNA module. +# Train DNA model. cmd="dcpg_train.py $train_files --val_files $val_files @@ -45,7 +45,7 @@ fi run $cmd -# Train CpG module. +# Train CpG model. cmd="dcpg_train.py $train_files --val_files $val_files @@ -67,7 +67,7 @@ fi run $cmd -# Train joint module. +# Train Joint model. cmd="dcpg_train.py $train_files --val_files $val_files diff --git a/scripts/dcpg_data.py b/scripts/dcpg_data.py index 9e3560a..32b54f7 100755 --- a/scripts/dcpg_data.py +++ b/scripts/dcpg_data.py @@ -1,19 +1,46 @@ #!/usr/bin/env python -"""Creates DeepCpG input data from incomplete methylation profiles. +"""Create DeepCpG input data from incomplete methylation profiles. Takes as input incomplete CpG methylation profiles of multiple cells, extracts neighboring CpG sites and/or DNA sequences windows, and writes data chunk files to output directory. Output data can than be used for model training using -`dcpg_train.py` model evaluation using `dcpg_eval.py`. - -Examples: - dcpg_data.py \ - --cpg_profiles cell1.dcpg cell2.dcpg cell3.dcpg \ - --cpg_wlen 50 \ - --dna_files ./mm10/ \ - --dna_wlen 1001 \ +``dcpg_train.py`` model evaluation using ``dcpg_eval.py``. + +Examples +-------- +Create data files for training a CpG and DNA model, using 50 neighboring +methylation states and DNA sequence windows of 1001 bp from the mm10 genome +build: + +.. code:: bash + + dcpg_data.py + --cpg_profiles ./cpg/*.tsv + --cpg_wlen 50 + --dna_files ./mm10 + --dna_wlen 1001 + --out_dir ./data + +Create data files from gzip-compressed bedGraph files for predicting the mean +methylation rate and cell-to-cell variance from the DNA sequence: + +.. code:: bash + + dcpg_data.py + --cpg_profiles ./cpg/*.bedGraph.gz + --dna_files ./mm10 + --dna_wlen 1001 + --win_stats mean var + --win_stats_wlen 1001 2001 3001 4001 5001 --out_dir ./data + + +See Also +-------- +* ``dcpg_data_stats.py``: For computing statistics of data files. +* ``dcpg_data_show.py``: For showing the content of data files. +* ``dcpg_train.py``: For training a model. """ from __future__ import print_function @@ -62,10 +89,21 @@ def prepro_pos_table(pos_tables): def split_ext(filename): + """Remove file extension from `filename`.""" return os.path.basename(filename).split(os.extsep)[0] def read_cpg_profiles(filenames, log=None, *args, **kwargs): + """Read methylation profiles. + + Input files can be gzip compressed. + + Returns + ------- + `dict (key, value)`, where `key` is the output name and `value` the CpG + table. + """ + cpg_profiles = OrderedDict() for filename in filenames: if log: @@ -119,7 +157,10 @@ def extract_seq_windows(seq, pos, wlen, seq_index=1, assert_cpg=False): def map_values(values, pos, target_pos, dtype=None, nan=dat.CPG_NAN): - """Maps `values` array at positions `pos` to `target_pos`.""" + """Maps `values` array at positions `pos` to `target_pos`. + + Inserts `nan` for uncovered positions. + """ assert len(values) == len(pos) assert np.all(pos == np.sort(pos)) assert np.all(target_pos == np.sort(target_pos)) @@ -145,6 +186,7 @@ def map_cpg_tables(cpg_tables, chromo, chromo_pos): """Maps values from cpg_tables to `chromo_pos`. Positions in `cpg_tables` for `chromo` must be a subset of `chromo_pos`. + Inserts `dat.CPG_NAN` for uncovered positions. """ chromo_pos.sort() mapped_tables = OrderedDict() @@ -234,11 +276,6 @@ def create_parser(self, name): ' methylation state is known in at least that many cells.', type=int, default=1) - p.add_argument( - '--bulk_profiles', - help='Input bulk methylation profiles in dcpg or bedGraph format' - ' that are to be imputed', - nargs='+') p.add_argument( '--dna_files', help='Directory or FASTA files named "*.chromosome.`chromo`.fa*"' @@ -286,7 +323,7 @@ def create_parser(self, name): help='Window lengths for computing statistics', type=int, nargs='+', - default=[3001]) + default=[1001, 2001, 3001, 4001, 5001]) g = p.add_argument_group('advanced arguments') g.add_argument( @@ -335,7 +372,7 @@ def main(self, name, opts): log.debug(opts) # Check input arguments - if not (opts.cpg_profiles or opts.bulk_profiles): + if not opts.cpg_profiles: if not (opts.pos_file or opts.dna_files): raise ValueError('Position table and DNA database expected!') @@ -365,14 +402,6 @@ def main(self, name, opts): nb_sample_chromo=opts.nb_sample_chromo, log=log.info) - if opts.bulk_profiles: - log.info('Reading bulk profiles ...') - outputs['bulk'] = read_cpg_profiles(opts.bulk_profiles, - chromos=opts.chromos, - nb_sample=opts.nb_sample, - round=False, - log=log.info) - # Create table with unique positions if opts.pos_file: # Read positions from file @@ -418,11 +447,6 @@ def main(self, name, opts): list(chromo_outputs['cpg'].values())).T assert len(chromo_outputs['cpg_mat']) == len(chromo_pos) - if 'bulk' in outputs: - # Concatenate CpG tables into single nb_site x nb_output matrix - chromo_outputs['bulk'] = map_cpg_tables(outputs['bulk'], - chromo, chromo_pos) - if 'cpg_mat' in chromo_outputs and opts.cpg_cov: cov = np.sum(chromo_outputs['cpg_mat'] != dat.CPG_NAN, axis=1) assert np.all(cov >= 1) @@ -478,8 +502,9 @@ def main(self, name, opts): if 'cpg' in chunk_outputs: for name, value in six.iteritems(chunk_outputs['cpg']): assert len(value) == len(chunk_pos) + # Round continuous values out_group.create_dataset('cpg/%s' % name, - data=value, + data=value.round(), dtype=np.int8, compression='gzip') # Compute and write statistics @@ -498,15 +523,6 @@ def main(self, name, opts): dtype=fun[1], compression='gzip') - # Write bulk profiles - if 'bulk' in chunk_outputs: - for name, value in six.iteritems(chunk_outputs['bulk']): - assert len(value) == len(chunk_pos) - out_group.create_dataset('bulk/%s' % name, - data=value, - dtype=np.float32, - compression='gzip') - # Write input features in_group = chunk_file.create_group('inputs') @@ -534,12 +550,12 @@ def main(self, name, opts): nan = np.isnan(state) state[nan] = dat.CPG_NAN dist[nan] = dat.CPG_NAN - state = state.astype(np.int8, copy=False) + # States can be binary (np.int8) or continuous + # (np.float32). + state = state.astype(cpg_table.value.dtype, copy=False) dist = dist.astype(np.float32, copy=False) assert len(state) == len(chunk_pos) - assert np.all((state == 0) | (state == 1) | - (state == dat.CPG_NAN)) assert len(dist) == len(chunk_pos) assert np.all((dist > 0) | (dist == dat.CPG_NAN)) diff --git a/scripts/dcpg_data_show.py b/scripts/dcpg_data_show.py index e21fbcc..8a6e517 100755 --- a/scripts/dcpg_data_show.py +++ b/scripts/dcpg_data_show.py @@ -1,46 +1,53 @@ #!/usr/bin/env python -"""Shows the content of DeepCpG data files. - -Allows to selectively show the content of `dcpg_data.py` output files, for -example to analyze CpG sites in a particular region. - -Examples: - Show the output methylation state of CpG sites on on chromosome 19 between - position 3028955 and 3079682: - - dcpg_data_show.py \ - ./data/*.h5 \ - --chromo 1 \ - --start 3028955 \ - --end 3079682 \ - --outputs - - Show output methylation states and the state as well as the distance of - 10 neighboring CpG sites of cell BS27_1_SER: - - dcpg_data_show.py \ - ./data/*.h5 \ - --chromo 1 \ - --start 3028955 \ - --end 3079682 \ - --outputs cpg/BS27_1_SER \ - --cpg BS27_1_SER \ - --cpg_wlen 10 \ - --cpg_dist - - Show output methylation states and DNA sequence windows of length 11 and - store the results in HDF5 file `selected.h5`: - - dcpg_data_show.py \ - ./data/*.h5 \ - --chromo 1 \ - --start 3028955 \ - --end 3079682 \ - --outputs \ - --dna_wlen 11 \ - --out_file selected.h5 - +"""Show the content of DeepCpG data files. + +Shows the content of ``dcpg_data.py`` output files for a selected region, for +example the methylation state of the target CpG site, neighboring CpG sites, or +the DNA sequence. + +Examples +-------- +Show the output methylation state of CpG sites on on chromosome 19 between +position 3028955 and 3079682: + +.. code:: bash + + dcpg_data_show.py + ./data/*.h5 + --chromo 1 + --start 3028955 + --end 3079682 + --outputs + +Show output methylation states and the state as well as the distance of +10 neighboring CpG sites of cell BS27_1_SER: + +.. code:: bash + + dcpg_data_show.py + ./data/*.h5 + --chromo 1 + --start 3028955 + --end 3079682 + --outputs cpg/BS27_1_SER + --cpg BS27_1_SER + --cpg_wlen 10 + --cpg_dist + +Show output methylation states and DNA sequence windows of length 11 and +store the results in HDF5 file ``selected.h5``: + +.. code:: bash + + dcpg_data_show.py + ./data/*.h5 + --chromo 1 + --start 3028955 + --end 3079682 + --outputs + --dna_wlen 11 + --out_hdf selected.h5 """ from __future__ import print_function @@ -53,6 +60,7 @@ import argparse import h5py as h5 import logging +import numpy.random import pandas as pd from deepcpg.data import hdf @@ -84,7 +92,7 @@ def create_parser(self, name): nargs='+', help='Data files') p.add_argument( - '-o', '--out_file', + '-o', '--out_hdf', help='Write selected data to HDF5 file') p.add_argument( '--outputs', @@ -212,8 +220,8 @@ def main(self, name, opts): data = pd.concat(data) data[('loc', 'chromo')] = [x.decode() for x in data['loc']['chromo']] - if opts.out_file: - data.to_hdf(opts.out_file, '/data') + if opts.out_hdf: + data.to_hdf(opts.out_hdf, '/data') else: print(data.to_string()) diff --git a/scripts/dcpg_data_stats.py b/scripts/dcpg_data_stats.py index bd3f47a..5c17cab 100755 --- a/scripts/dcpg_data_stats.py +++ b/scripts/dcpg_data_stats.py @@ -1,12 +1,16 @@ #!/usr/bin/env python -"""Computes summary statistics of data files. +"""Compute summary statistics of data files. Computes summary statistics of data files such as the number of samples or the mean and variance of output variables. -Examples: - dcpg_data_stats.py \ +Examples +-------- + +.. code:: bash + + dcpg_data_stats.py ./data/*.h5 """ @@ -42,9 +46,9 @@ def get_output_stats(output): def plot_stats(stats): stats = stats.sort_values('frac_obs', ascending=False) stats = pd.melt(stats, id_vars=['output'], var_name='metric') - stats = stats.loc[stats.metric.isin(['frac_obs', 'frac_one'])] - stats.metric = stats.metric.str.replace('frac_obs', 'cov') - stats.metric = stats.metric.str.replace('frac_one', 'met') + # stats = stats.loc[stats.metric.isin(['frac_obs', 'frac_one'])] + # stats.metric = stats.metric.str.replace('frac_obs', 'cov') + # stats.metric = stats.metric.str.replace('frac_one', 'met') grid = sns.FacetGrid(data=stats, col='metric', sharex=False) grid.map(sns.barplot, 'value', 'output') for ax in grid.axes.ravel(): @@ -70,8 +74,8 @@ def create_parser(self, name): nargs='+', help='Data files') p.add_argument( - '-o', '--out_csv', - help='Write statistics to csv file') + '-o', '--out_tsv', + help='Write statistics to tsv file') p.add_argument( '-f', '--out_fig', help='Create output figure') @@ -118,8 +122,8 @@ def main(self, name, opts): stats.reset_index(inplace=True) print(stats.to_string()) - if opts.out_csv: - stats.to_csv(opts.out_csv, sep='\t', index=False) + if opts.out_tsv: + stats.to_csv(opts.out_tsv, sep='\t', index=False) if opts.out_fig: plot_stats(stats).savefig(opts.out_fig) diff --git a/scripts/dcpg_download.py b/scripts/dcpg_download.py index 33b17bb..0422307 100755 --- a/scripts/dcpg_download.py +++ b/scripts/dcpg_download.py @@ -1,13 +1,24 @@ #!/usr/bin/env python -"""Downloads a pre-trained model from DeepCpG model zoo. +"""Download a pre-trained model from DeepCpG model zoo. Downloads a pre-trained model from the DeepCpG model zoo by its identifier. Model descriptions can be found on online. -Example: - dcpg_download.py \ - Smallwood2014_serum_dna \ +Examples +-------- +Show available models: + +.. code:: bash + + dcpg_download --show + +Download DNA model trained on serum cells from Smallwood et al: + +.. code:: bash + + dcpg_download.py + Smallwood2014_serum_dna -o ./model """ diff --git a/scripts/dcpg_eval.py b/scripts/dcpg_eval.py index 3253cd3..55e8411 100755 --- a/scripts/dcpg_eval.py +++ b/scripts/dcpg_eval.py @@ -1,14 +1,28 @@ #!/usr/bin/env python -"""Evaluates prediction performance of a DeepCpG model. +"""Evaluate the prediction performance of a DeepCpG model. -Imputes missing methylation states and evaluates model on observered states. +Imputes missing methylation states and evaluates model on observed states. +``--out_report`` will write evaluation metrics to a TSV file using. +``--out_data`` will write predicted and observed methylation state to a HDF5 +file with following structure: -Example: - dcpg_eval.py \ - ./data/*.h5 \ - --out_file ./eval.h5 \ - --out_report ./eval.tsv +* ``chromo``: The chromosome of the CpG site. +* ``pos``: The position of the CpG site on the chromosome. +* ``outputs``: The input methylation state of each cell and CpG site, which \ + can either observed or missing (-1). +* ``preds``: The predicted methylation state of each cell and CpG site. + +Examples +-------- + +.. code:: bash + + dcpg_eval.py + ./data/*.h5 + --model_files ./model + --out_data ./eval/data.h5 + --out_report ./eval/report.tsv """ from __future__ import print_function diff --git a/scripts/dcpg_eval_export.py b/scripts/dcpg_eval_export.py index 2caa010..cb8c5c8 100755 --- a/scripts/dcpg_eval_export.py +++ b/scripts/dcpg_eval_export.py @@ -1,19 +1,33 @@ #!/usr/bin/env python -"""Exports imputed methylation profiles. +"""Export imputed methylation profiles. Exports imputed methylation profiles from `dcpg_eval.py` output file to -different data formats. Creates for each methylation profile one file in -output directory. +different data formats. Outputs for each CpG site and cell either the +experimentally observed or predicted methylation state depending on whether or +not the methylation state was observed in the input file or not, respectively. +Creates for each methylation profile one file in the output directory. -Example: - Export profile of cell Ca01 and chromosomes 4 5 to `./eval`: +Examples +-------- +Export profiles of all cells as HDF5 files to `./eval`: - dcpg_export.py \ - ./eval/data.h5 \ - --out_dir ./eval \ - --output cpg/Ca01 \ - --chromo 4 5 +.. code:: bash + + dcpg_eval_export.py + ./eval/data.h5 + --out_dir ./eval + +Export the profile of cell Ca01 for chromosomes 4 and 5 to a bedGraph file: + +.. code:: bash + + dcpg_eval_export.py + ./eval/data.h5 + --output cpg/Ca01 + --chromo 4 5 + --format bedGraph + --out_dir ./eval """ from __future__ import print_function diff --git a/scripts/dcpg_filter_act.py b/scripts/dcpg_filter_act.py index bd35959..a131fdc 100755 --- a/scripts/dcpg_filter_act.py +++ b/scripts/dcpg_filter_act.py @@ -1,31 +1,40 @@ #!/usr/bin/env python -"""Computes filter activations of a DeepCpG model. +"""Compute filter activations of a DeepCpG model. -Computes the activation of filters of the first convolutional layer for a +Computes the activation of the filters of the first convolutional layer for a given DNA model. The resulting activations can be used to visualize and cluster motifs, or correlated with model outputs. -Example: - Compute activations in 25000 sequence windows and also store DNA sequences. - For example to visualize motifs. - - dcpg_filter_act.py \ - ./data/*.h5 \ - --model_files ./models/dna \ - --out_file ./activations.h5 \ - --nb_sample 25000 \ - --store_inputs - - Compute the weighted mean activation in each sequence window and also store - model predictions. For example to cluster motifs or to correlated mean motif - activations with model predictions. - - dcpg_filter_act.py \ - ./data/*.h5 \ - --model_files ./models/dna \ - --out_file ./activations.h5 \ - --act_fun wmean +Examples +-------- +Compute activations in 25000 sequence windows and also store DNA sequences. +For example to visualize motifs. + +.. code:: bash + + dcpg_filter_act.py + ./data/*.h5 + --model_files ./models/dna + --out_file ./activations.h5 + --nb_sample 25000 + --store_inputs + +Compute the weighted mean activation in each sequence window and also store +model predictions. For example to cluster motifs or to correlated mean motif +activations with model predictions. + +.. code:: bash + + dcpg_filter_act.py + ./data/*.h5 + --model_files ./models/dna + --out_file ./activations.h5 + --act_fun wmean + +See Also +-------- +* ``dcpg_filter_motifs.py``: For motif visualization and analysis. """ from __future__ import print_function diff --git a/scripts/dcpg_filter_motifs.py b/scripts/dcpg_filter_motifs.py index 7219082..76235f1 100755 --- a/scripts/dcpg_filter_motifs.py +++ b/scripts/dcpg_filter_motifs.py @@ -2,30 +2,37 @@ """Visualizes and analyzes filter motifs. -Allows to visualize motifs as sequence logos, compare motifs to annotated +Enables to visualize motifs as sequence logos, compare motifs to annotated motifs, cluster motifs, and compute motif summary statistics. Requires Weblogo3 for visualization, and Tomtom for motif comparison. -Examples: - Compute filter activations and also store input DNA sequence windows: +Copyright (c) 2015 David Kelley since since parts of the code are based on the +`Basset `_ script ``basset_motifs.py`` +from David Kelley. - dcpg_filter_act.py \ - ./data/*.h5 \ - --out_file ./activations.h5 \ - --store_inputs \ - --nb_sample 100000 +Examples +-------- +Compute filter activations and also store input DNA sequence windows: - Visualize and analyze motifs: +.. code:: bash - dcpg_filter_motifs.py \ - ./activations.h5 \ - --out_dir ./motifs \ - --motif_db ./motif_databases/CIS-BP/Mus_musculus.meme \ - --plot_heat \ - --plot_dens \ - --plot_pca + dcpg_filter_act.py + ./data/*.h5 + --out_file ./activations.h5 + --store_inputs + --nb_sample 100000 -Based on `basset_motifs.py`, therefore Copyright (c) 2015 David Kelley. +Visualize and analyze motifs: + +.. code:: bash + + dcpg_filter_motifs.py + ./activations.h5 + --out_dir ./motifs + --motif_db ./motif_databases/CIS-BP/Mus_musculus.meme + --plot_heat + --plot_dens + --plot_pca """ from __future__ import print_function @@ -561,7 +568,7 @@ def main(self, name, opts): print() print('\nFilter statistics:') print(filter_stats.to_string()) - filter_stats.to_csv(pt.join(opts.out_dir, 'stats.csv'), + filter_stats.to_csv(pt.join(opts.out_dir, 'stats.tsv'), float_format='%.4f', sep='\t', index=False) @@ -580,16 +587,16 @@ def main(self, name, opts): for motif_db in opts.motif_dbs: meme_motifs.append(read_meme_db(motif_db)) meme_motifs = pd.concat(meme_motifs) - tmp = pt.join(opts.out_dir, 'tomtom', 'meme_motifs.csv') + tmp = pt.join(opts.out_dir, 'tomtom', 'meme_motifs.tsv') meme_motifs.to_csv(tmp, sep='\t', index=False) report = get_report( - pt.join(opts.out_dir, 'stats.csv'), + pt.join(opts.out_dir, 'stats.tsv'), pt.join(opts.out_dir, 'tomtom', 'tomtom.txt'), meme_motifs) report.sort_values(['idx', 'q-value', 'act_mean'], ascending=[True, True, False], inplace=True) - report.to_csv(pt.join(opts.out_dir, 'report.csv'), index=False, + report.to_csv(pt.join(opts.out_dir, 'report.tsv'), index=False, sep='\t', float_format='%.3f') report_top = report.groupby('idx').first().reset_index() @@ -597,7 +604,7 @@ def main(self, name, opts): ascending=[True, False], inplace=True) report_top.index = range(len(report_top)) report_top.to_csv(pt.join(opts.out_dir, - 'report_top.csv'), index=False, + 'report_top.tsv'), index=False, sep='\t', float_format='%.3f') print('\nTomtom results:') diff --git a/scripts/dcpg_train.py b/scripts/dcpg_train.py index 5fad735..9f0d9f1 100755 --- a/scripts/dcpg_train.py +++ b/scripts/dcpg_train.py @@ -1,38 +1,49 @@ #!/usr/bin/env python -"""Trains DeepCpG model to predict DNA methylation. +"""Train a DeepCpG model to predict DNA methylation. -Trains model on DNA (DNA model), neighboring methylation states (CpG model), -or both (Joint model) to predict CpG methylation of multiple cells. +Trains a DeepCpG model on DNA (DNA model), neighboring methylation states +(CpG model), or both (Joint model) to predict CpG methylation of multiple cells. Allows to fine-tune individual models or to train them from scratch. -Examples: - Train DNA model on chromosome 1, 3, and 5, and use chromosome 13, 14, and - 15 for validation. - - dcpg_train.py \ - ./data/c{1,3,5}_*.h5 \ - --val_files ./data/c{13,14,15}_*.h5 \ - --dna_model CnnL2h128 \ - --out_dir ./models/dna - - Train CpG model: - - dcpg_train.py \ - ./data/c{1,3,5}_*.h5 \ - --val_files ./data/c{13,14,15}_*.h5 \ - --cpg_model RnnL1 \ - --out_dir ./models/cpg - - Train joint model based on pre-trained DNA and CpG model: - - dcpg_train.py \ - ./data/c{1,3,5}_*.h5 \ - --val_files ./data/c{13,14,15}_*.h5 \ - --dna_model ./models/dna \ - --cpg_model ./models/cpg \ - --out_dir ./models/joint \ - --fine_tune +Examples +-------- +Train a DNA model on chromosome 1, 3, and 5, and use chromosome 13, 14, and +15 for validation: + +.. code:: bash + + dcpg_train.py + ./data/c{1,3,5}_*.h5 + --val_files ./data/c{13,14,15}_*.h5 + --dna_model CnnL2h128 + --out_dir ./models/dna + +Train a CpG model: + +.. code:: bash + + dcpg_train.py + ./data/c{1,3,5}_*.h5 + --val_files ./data/c{13,14,15}_*.h5 + --cpg_model RnnL1 + --out_dir ./models/cpg + +Train a Joint model using a pre-trained DNA and CpG model: + +.. code:: bash + + dcpg_train.py + ./data/c{1,3,5}_*.h5 + --val_files ./data/c{13,14,15}_*.h5 + --dna_model ./models/dna + --cpg_model ./models/cpg + --out_dir ./models/joint + --fine_tune + +See Also +-------- +* ``dcpg_eval.py``: For evaluating a trained model and imputing methylation profiles. """ from __future__ import print_function @@ -227,7 +238,7 @@ def create_parser(self, name): models = sorted(list(mod.joint.list_models().keys())) g.add_argument( '--joint_model', - help='Name of joint model.' + help='Name of Joint model.' ' Available models: %s' % ', '.join(models), default='JointL2h512') g.add_argument( @@ -415,8 +426,8 @@ def learning_rate_schedule(epoch): callbacks.append(kcbk.LearningRateScheduler(learning_rate_schedule)) def save_lc(epoch, epoch_logs, val_epoch_logs): - logs = {'lc_train.csv': epoch_logs, - 'lc_val.csv': val_epoch_logs} + logs = {'lc_train.tsv': epoch_logs, + 'lc_val.tsv': val_epoch_logs} for name, logs in six.iteritems(logs): if not logs: continue diff --git a/scripts/dcpg_train_viz.py b/scripts/dcpg_train_viz.py index 4f803fb..dd2783d 100755 --- a/scripts/dcpg_train_viz.py +++ b/scripts/dcpg_train_viz.py @@ -1,13 +1,17 @@ #!/usr/bin/env python -"""Visualizes learning curves of `dcpg_train.py`. +"""Visualizes learning curves of ``dcpg_train.py``. -Visualizes training and validation learning curve that `dcpg_train.py` stores in -training output directory. For advanced visualization use Tensorboard. +Visualizes training and validation learning from `dcpg_train.py`. Tensorboard is +recommended for advanced visualization. -Example: - dcpg_train_viz.py \ - ./model/lc_train.csv ./model/lc_val.csv \ +Examples +-------- + +.. code:: bash + + dcpg_train_viz.py + ./model/lc_train.tsv ./model/lc_val.tsv --out_file ./lc.pdf """ diff --git a/setup.py b/setup.py index d7ab7da..bd40363 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): setup(name='deepcpg', - version='1.0.2', + version='1.0.3', description='Deep learning for predicting CpG methylation', long_description=read('README.rst'), author='Christof Angermueller',