Merge pull request #8 from Gaius-Augustus/migrate_tf
Migrate tf
LarsGab authored Nov 4, 2024
2 parents a16f8e8 + 0d885a0 commit 823fd69
Showing 22 changed files with 1,140 additions and 762 deletions.
45 changes: 24 additions & 21 deletions README.md
@@ -9,56 +9,59 @@ and long short-term memory layers with a differentiable HMM layer. It can be use
 Currently, we provide only model weights for mammalian species and Tiberius does not predict alternative splicing variants.
 
 
-## Installation
-
-:warning: **We will be migrating Tiberius from TensorFlow 2.10 to TensorFlow 2.17 during the week of October 7–12., which will make the installation of a compatible TensorFlow version easier.**
+Tiberius can either be installed from source or run with a **Singularity** container.
+
+## Installation
-### Git Repositories
+### Installation with Singularity container
+Build the Singularity container with:
+```
+singularity build tiberius.sif docker://larsgabriel23/tiberius:latest
+```
 
-Clone the repository, including learnMSA as submodule:
+Run Tiberius with the Singularity container (use `--nv` for GPU support):
 ```
-git clone --recursive https://github.com/Gaius-Augustus/Tiberius
+singularity run --nv tiberius.sif tiberius.py [options]
 ```
-In case you cloned the repository without the submodule, you can load the submodule with:
+
+### Installation from Source
+#### Git Repositories
+
+Clone the repository:
 ```
-git submodule update --init --recursive
+git clone https://github.com/Gaius-Augustus/Tiberius
 ```
-Alternatively, you can clone the learnMSA repository separately:
-```shell
-git clone https://github.com/Gaius-Augustus/learnMSA
-cd learnMSA
-git checkout parallel
+Install [learnMSA](https://github.com/Gaius-Augustus/learnMSA) either from GitHub or with `pip`:
+```
+pip install learnMSA
+```
-Ensure that learnMSA was loaded and that is on the branch `parallel`.
 
-### Python Libraries
+#### Python Libraries
 
 The following Python libraries are required:
-- tensorflow==2.10.*
-- tensorflow_probability==0.18.0
-- transformers (optional)
 - pyBigWig
 - biopython
 - bcbio-gff
 - requests
 
 They can be installed with:
 ```
-pip install tensorflow_probability==0.18.0 transformers pyBigWig bio scikit-learn biopython bcbio-gff requests
+pip install pyBigWig bio scikit-learn biopython bcbio-gff requests
 ```
-Tensorflow should be installed with GPU support. If you are using conda, you can install it with these [instructions](docs/install_tensorflow.md).
+TensorFlow should be installed with GPU support. If you are using conda, you can install TensorFlow 2.10 with these [instructions](docs/install_tensorflow.md).
 
-Alternatively, you can install Tesorflow using pip:
+Tiberius also works with TensorFlow >2.10; however, **it will produce an error if you use a sequence length > 260,000 during inference!**
+You can install the current TensorFlow version with:
 ```shell
-pip install tensorflow-gpu==2.10.*
+python3 -m pip install tensorflow[and-cuda]
 ```
+
+If you want to use GPUs, verify that TensorFlow is installed correctly with GPU support:
+```shell
+python3 -c "import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))"
+```
 
 
 ## Running Tiberius for Gene Prediction
 
 To run Tiberius with `bin/tiberius.py`, you need to provide a FASTA file containing the genomic sequences. The sequences can either include repeat softmasking (recommended) or be unmasked. See [softmasking_workflow](docs/softmasking_workflow.md) for recommendations on how to mask repeats for Tiberius. Currently, we only provide weights for mammalian species; they will be downloaded automatically.
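A note on the sequence-length warning added above: a caller can guard against the limit explicitly. This is a minimal sketch, not part of the Tiberius codebase; the 260,000 threshold is quoted from the README text, and `check_seq_len` is a hypothetical helper name.

```python
# Sketch: fail fast when the TF > 2.10 sequence-length limit would be hit.
# check_seq_len is a hypothetical helper; 260,000 is the limit quoted above.
from packaging import version
import tensorflow as tf

MAX_SEQ_LEN_NEW_TF = 260_000  # inference limit stated for TensorFlow > 2.10

def check_seq_len(seq_len: int) -> None:
    if version.parse(tf.__version__) >= version.parse("2.11.0") and seq_len > MAX_SEQ_LEN_NEW_TF:
        raise ValueError(f"sequence length {seq_len} exceeds {MAX_SEQ_LEN_NEW_TF}, "
                         "which fails at inference with TensorFlow > 2.10")

check_seq_len(259_000)  # passes; 300_000 would raise under TF >= 2.11
```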
1 change: 1 addition & 0 deletions bin/__init__.py
@@ -0,0 +1 @@
from ._version import __version__
1 change: 1 addition & 0 deletions bin/_version.py
@@ -0,0 +1 @@
__version__ = "1.1.0"
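The two one-line files above implement a single-source version: `bin/_version.py` defines `__version__` and `bin/__init__.py` re-exports it. A minimal usage sketch, assuming the Tiberius repository root is on `sys.path` (e.g. the working directory):

```python
# Sketch: read the package version that bin/__init__.py re-exports.
import bin

print(bin.__version__)  # "1.1.0"
```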
63 changes: 63 additions & 0 deletions bin/convert_weights.py
@@ -0,0 +1,63 @@
import sys
sys.path.append("bin")
sys.path.append("../../learnMSA")
import json
import tensorflow as tf
from packaging import version
from models import custom_cce_f1_loss, lstm_model, add_hmm_layer


# This script must run under TF 2.10.x (conversion step 1) or TF 2.17.x (step 2).
tf_version_ok = (version.parse("2.10.0") <= version.parse(tf.__version__) < version.parse("2.11.0")
                 or version.parse(tf.__version__) >= version.parse("2.17.0"))
assert tf_version_ok, "Run this script with tensorflow 2.10.x or 2.17.x. Your TF: " + tf.__version__


# Loads the old Tiberius models and converts them: under TF 2.10.x the legacy
# SavedModels are exported to H5; under TF 2.17.x (Keras 3) the model is rebuilt
# from config.json, the H5 weights are loaded, and the model is saved as .keras.

model_path = "../model_weights/"

# if run under tf 2.10.x: convert the legacy SavedModels to H5
if version.parse("2.10.0") <= version.parse(tf.__version__) < version.parse("2.11.0"):
    for model_name in ["tiberius_weights", "tiberius_nosm_weights"]:
        model = tf.keras.models.load_model(model_path + model_name,
                    custom_objects={'custom_cce_f1_loss': custom_cce_f1_loss(2, 32),
                                    'loss_': custom_cce_f1_loss(2, 32)})
        model.save_weights(model_path + model_name + ".h5")
# else if run under tf 2.17.x: rebuild the model, load the H5 weights and save as .keras
elif version.parse(tf.__version__) >= version.parse("2.17.0"):
    for model_name in ["tiberius_weights"]:
        cfg_file = "../config.json"
        with open(cfg_file, 'r') as f:
            config = json.load(f)

        relevant_keys = ['units', 'filter_size', 'kernel_size',
                         'numb_conv', 'numb_lstm', 'dropout_rate',
                         'pool_size', 'stride', 'lstm_mask', 'clamsa',
                         'output_size', 'residual_conv', 'softmasking',
                         'clamsa_kernel', 'lru_layer']
        relevant_args = {key: config[key] for key in relevant_keys if key in config}
        model1 = lstm_model(**relevant_args)
        model1.summary()
        model = add_hmm_layer(model1, None,
                              dense_size=config['hmm_dense'],
                              pool_size=config['pool_size'],
                              output_size=config['output_size'],
                              num_hmm=config['num_hmm_layers'],
                              l2_lambda=config['l2_lambda'],
                              hmm_factor=config['hmm_factor'],
                              batch_size=config['batch_size'],
                              seq_len=config['w_size'],
                              initial_variance=config['initial_variance'],
                              temperature=config['temperature'],
                              emit_embeddings=config['hmm_emit_embeddings'],
                              share_intron_parameters=config['hmm_share_intron_parameters'],
                              trainable_nucleotides_at_exons=config['hmm_nucleotides_at_exons'],
                              trainable_emissions=config['hmm_trainable_emissions'],
                              trainable_transitions=config['hmm_trainable_transitions'],
                              trainable_starting_distribution=config['hmm_trainable_starting_distribution'],
                              use_border_hints=False,
                              include_lstm_in_output=config['multi_loss'],
                              neutral_hmm=config['neutral_hmm'])
        model.load_weights(model_path + model_name + ".h5")
        model.save(model_path + model_name + ".keras")
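As the two branches above imply, `convert_weights.py` is meant to be run twice, in two different environments: once under TF 2.10.x to export the legacy SavedModels to H5, and once under TF 2.17.x to rebuild the model from `config.json`, load the H5 weights, and write the Keras 3 `.keras` file. A minimal sketch of that dispatch (the printed phase labels are ours):

```python
# Sketch: the two-phase dispatch that convert_weights.py relies on.
from packaging import version
import tensorflow as tf

tf_ver = version.parse(tf.__version__)
if version.parse("2.10.0") <= tf_ver < version.parse("2.11.0"):
    print("phase 1: export legacy SavedModel weights to H5")
elif tf_ver >= version.parse("2.17.0"):
    print("phase 2: rebuild model, load H5 weights, save as .keras")
else:
    raise SystemExit("unsupported TensorFlow version: " + tf.__version__)
```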
2 changes: 2 additions & 0 deletions bin/create_tf_records.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 from genome_fasta import GenomeSequences
 from annotation_gtf import GeneStructure
 # import tensorflow records
57 changes: 29 additions & 28 deletions bin/data_generator.py
@@ -10,7 +10,7 @@
 import sys
 import tensorflow as tf
 import numpy as np
-from transformers import AutoTokenizer, TFAutoModelForMaskedLM, TFEsmForMaskedLM
+# from transformers import AutoTokenizer, TFAutoModelForMaskedLM, TFEsmForMaskedLM
 
 class DataGenerator:
     """DataGenerator class for reading and processing TFRecord files
@@ -22,8 +22,8 @@ class DataGenerator:
         shuffle (bool): Whether to shuffle the data.
         repeat (bool): Whether to repeat the data set.
         output_size (int): Number of class labels in training examples.
-        trans (bool): Whether the data should fit the transformer only model.
-        trans_lstm (bool): Whether the data should fit the transformer-LSTM hybrid model.
+        trans (bool): Whether the data should fit the transformer only model. (deprecated!!)
+        trans_lstm (bool): Whether the data should fit the transformer-LSTM hybrid model. (deprecated!!)
         seq_weights (int): Weight of positions around exon borders. They aren't used if 0.
         softmasking (bool): Whether softmasking track should be added to input.
         clamsa (bool): Whether Clamsa track should be prepared as additional input,
@@ -36,7 +36,7 @@ def __init__(self, file_path,
                  filter=False,
                  output_size=5,
                  hmm_factor=None,
-                 trans=False, trans_lstm=False,
+                 # trans=False, trans_lstm=False,
                  seq_weights=0, softmasking=True,
                  clamsa=False,
                  oracle=False):
@@ -48,8 +48,8 @@ def __init__(self, file_path,
         self.seq_weights = seq_weights
         self.output_size = output_size
         self.hmm_factor = hmm_factor
-        self.trans=trans
-        self.trans_lstm=trans_lstm
+        # self.trans=trans
+        # self.trans_lstm=trans_lstm
         self.softmasking=softmasking
         self.clamsa = clamsa
         self.oracle = oracle
@@ -273,29 +273,30 @@ def __next__(self):
             y_new[:,:,2] = np.sum(y_batch[:,:,[5, 8, 13]], axis=-1)
             y_new[:,:,3] = np.sum(y_batch[:,:,[6, 9, 11, 14]], axis=-1)
             y_batch = y_new
-        if self.trans_lstm:
-            # prepare tokens for transformer lstm as additional input for the model
-            y_batch = y_batch[:,:99036]
-            x_batch = x_batch[:,:99036]
-            tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref")
-            max_token_len = 5502
-            x_token = np.reshape(x_batch[:,:,:5], (-1, max_token_len, 5))
-            x_token = self.decode_one_hot(x_token)
-            x_token = tokenizer.batch_encode_plus(x_token, return_tensors="tf",
-                padding="max_length", max_length=max_token_len//6+1)
+        # if self.trans_lstm:
+        #     # prepare tokens for transformer lstm as additional input for the model
+        #     y_batch = y_batch[:,:99036]
+        #     x_batch = x_batch[:,:99036]
+        #     tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref")
+        #     max_token_len = 5502
+        #     x_token = np.reshape(x_batch[:,:,:5], (-1, max_token_len, 5))
+        #     x_token = self.decode_one_hot(x_token)
+        #     x_token = tokenizer.batch_encode_plus(x_token, return_tensors="tf",
+        #         padding="max_length", max_length=max_token_len//6+1)
 
-            X = [x_batch, x_token['input_ids'], x_token['attention_mask']]
-            Y = y_batch
-        elif self.trans:
-            # prepare tokens for transformer lstm as input for the model
-            tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref")
-            max_len = 5994
-            x_batch = self.decode_one_hot(x_batch[:,:,:5])
-            x_batch = tokenizer.batch_encode_plus(x_batch, return_tensors="tf",
-                padding="max_length", max_length=tokenizer.model_max_length)
-            X = [x_batch['input_ids'], x_batch['attention_mask']]
-            Y = y_batch
-        elif self.hmm_factor:
+        # X = [x_batch, x_token['input_ids'], x_token['attention_mask']]
+        # Y = y_batch
+        # elif self.trans:
+        #     # prepare tokens for transformer lstm as input for the model
+        #     tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref")
+        #     max_len = 5994
+        #     x_batch = self.decode_one_hot(x_batch[:,:,:5])
+        #     x_batch = tokenizer.batch_encode_plus(x_batch, return_tensors="tf",
+        #         padding="max_length", max_length=tokenizer.model_max_length)
+        #     X = [x_batch['input_ids'], x_batch['attention_mask']]
+        #     Y = y_batch
+        # elif self.hmm_factor:
+        if self.hmm_factor:
             # deprecated by the parallelization of the HMM
             step_width = y_batch.shape[1] // self.hmm_factor
             start = y_batch[:,::step_width,:] # shape (batch_size, hmm_factor, 5)