diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
index 3ea9f494..566ae4c9 100644
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -7,16 +7,16 @@ We love your input! We want to make contributing to this project as easy and tra
 - Proposing new features
 - Becoming a maintainer
 
-## We Develop with Github
-We use github to host code, to track issues and feature requests, as well as accept pull requests.
+## We Develop with GitHub
+We use GitHub to host code, to track issues and feature requests, as well as to accept pull requests.
 
-## We Use [Github Flow](https://guides.github.com/introduction/flow/index.html), So All Code Changes Happen Through Pull Requests
+## We Use [GitHub Flow](https://guides.github.com/introduction/flow/index.html), So All Code Changes Happen Through Pull Requests
 Pull requests are the best way to propose changes to the codebase. We actively welcome your pull requests:
 
 1. Fork the repo and create your branch from the **`dev` branch**.
 2. If you've added code that should be tested, you **must** ensure it is properly tested.
 3. If you've changed APIs, update the documentation.
-4. Ensure the Travis test suite passes.
+4. Ensure the CI/CD test suite passes.
 5. Make sure your code lints.
 6. Submit that pull request!
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 3d3d4a04..daa2de3a 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -21,6 +21,7 @@ jobs:
           python-version: ${{ matrix.python-version }}
       - name: Install dependencies
         run: |
+          python -m pip install --upgrade pip
           pip install -Ur requirements.txt
           pip install -Ur docs/requirements.txt
           pip install -e .
diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
index eab6483a..64bae30e 100644
--- a/.github/workflows/linting.yml
+++ b/.github/workflows/linting.yml
@@ -17,6 +17,7 @@ jobs:
           python-version: ${{ matrix.python-version }}
       - name: Install dependencies
         run: |
+          python -m pip install --upgrade pip
           pip install -Ur requirements.txt
           pip install -Ur styling_requirements.txt
           pip install -Ur tests/requirements.txt
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 81648578..641707e1 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -17,6 +17,7 @@ jobs:
           python-version: ${{ matrix.python-version }}
       - name: Install dependencies
         run: |
+          python -m pip install --upgrade pip
           pip install -Ur requirements.txt
           pip install -Ur tests/requirements.txt
           python setup.py develop
@@ -38,6 +39,7 @@ jobs:
           python-version: ${{ matrix.python-version }}
       - name: Install dependencies
         run: |
+          python -m pip install --upgrade pip
           pip install -Ur requirements.txt
           pip install -Ur tests/requirements.txt
           python setup.py develop
diff --git a/.gitignore b/.gitignore
index 25785c8e..b7bbdfbc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -144,3 +144,5 @@ deepparse/version.py
 *.ckpt
 *mlruns/
+
+*model/
\ No newline at end of file
diff --git a/.release/bpemb.version b/.release/bpemb.version
new file mode 100644
index 00000000..b31b8547
--- /dev/null
+++ b/.release/bpemb.version
@@ -0,0 +1 @@
+aa32fa918494b461202157c57734c374
diff --git a/.release/bpemb_attention.version b/.release/bpemb_attention.version
new file mode 100644
index 00000000..bcc9ea1f
--- /dev/null
+++ b/.release/bpemb_attention.version
@@ -0,0 +1 @@
+cfb190902476376573591c0ec6f91ece
diff --git a/.release/fasttext.version b/.release/fasttext.version
new file mode 100644
index 00000000..b19d26d5
--- /dev/null
+++ b/.release/fasttext.version
@@ -0,0 +1 @@
+f67a0517c70a314bdde0b8440f21139d
diff --git a/.release/fasttext_attention.version b/.release/fasttext_attention.version
new file mode 100644
index 00000000..12db9cc1
--- /dev/null
+++ b/.release/fasttext_attention.version
@@ -0,0 +1 @@
+a2b688bdfa2aa7c009bb7d980e352978
diff --git a/.release/model_version_release.md b/.release/model_version_release.md
new file mode 100644
index 00000000..46e9616e
--- /dev/null
+++ b/.release/model_version_release.md
@@ -0,0 +1,5 @@
+# How to Create a New Model's Version
+
+1. `md5sum model.ckpt > model.version`
+2. Remove the `model.ckpt` text in the `model.version` file
+3. Update the latest BPEmb and FastText hashes in `tests/test_tools.py`
\ No newline at end of file
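For illustration, the three steps above can also be scripted. The following is a minimal Python sketch (the filenames `model.ckpt` and `fasttext.version` are stand-ins for the actual checkpoint and version files; `hashlib.md5` computes the same digest as `md5sum`):

.. code-block:: python

    import hashlib

    def write_model_version(checkpoint_path: str, version_path: str) -> None:
        # Step 1: compute the MD5 checksum of the checkpoint, as `md5sum` would.
        with open(checkpoint_path, "rb") as checkpoint_file:
            md5_digest = hashlib.md5(checkpoint_file.read()).hexdigest()

        # Step 2: write only the hash, dropping the trailing filename that
        # `md5sum model.ckpt > model.version` leaves in the file.
        with open(version_path, "w", encoding="utf-8") as version_file:
            version_file.write(md5_digest)

    # Step 3 (updating the hashes in `tests/test_tools.py`) remains manual.
    write_model_version("model.ckpt", "fasttext.version")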
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 885630e8..6b9c8daa 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -313,4 +313,17 @@
 suggested in the [documentation](https://pytorch.org/tutorials//intermediate/torch_compile_tutorial.html).
 It increases the performance by about 1/100.
 
+## 0.9.7
+
+- New models release with more metadata.
+- Add a feature to use an AddressParser from a URI.
+- Add a feature to upload the trained model to a URI.
+- Add an example of how to load an AddressParser from a URI and how to upload a retrained model to a URI.
+- Improve error handling of `path_to_retrained_model`.
+- Bug-fix a pre-processor error.
+- Add a verbose override and improve verbosity handling in retrain.
+- Bug-fix the broken FastText installation by using `fasttext-wheel` instead of `fasttext`
+  (see [here](https://github.com/facebookresearch/fastText/issues/512#issuecomment-1534519551)
+  and [here](https://github.com/facebookresearch/fastText/pull/1292)).
+
 ## dev
diff --git a/deepparse/__init__.py b/deepparse/__init__.py
index 462804e9..dd64adac 100644
--- a/deepparse/__init__.py
+++ b/deepparse/__init__.py
@@ -2,4 +2,4 @@
 from .fasttext_tools import *
 from .tools import *
 from .version import __version__
-from .weights_init import *
+from .weights_tools import *
diff --git a/deepparse/cli/parse.py b/deepparse/cli/parse.py
index 39fc54f8..a075cf0a 100644
--- a/deepparse/cli/parse.py
+++ b/deepparse/cli/parse.py
@@ -50,7 +50,7 @@ def main(args=None) -> None:
 
     .. code-block:: sh
 
-        parse fasttext ./dataset.csv parsed_address.pckl --path_to_model_weights ./path
+        parse fasttext ./dataset.csv parsed_address.pckl --path_to_retrained_model ./path
 
     """
     if args is None:  # pragma: no cover
diff --git a/deepparse/cli/parser_arguments_adder.py b/deepparse/cli/parser_arguments_adder.py
index c50426f0..a398a9c8 100644
--- a/deepparse/cli/parser_arguments_adder.py
+++ b/deepparse/cli/parser_arguments_adder.py
@@ -108,7 +108,7 @@ def add_batch_size_arg(parser: ArgumentParser) -> None:
 def add_path_to_retrained_model_arg(parser: ArgumentParser) -> None:
     parser.add_argument(
         "--path_to_retrained_model",
-        help=wrap("A path to a retrained model to use for testing."),
+        help=wrap("A path to a retrained model to use. It can be an S3 URI."),
         type=str,
         default=None,
     )
diff --git a/deepparse/network/decoder.py b/deepparse/network/decoder.py
index 8d578069..f0c347ab 100644
--- a/deepparse/network/decoder.py
+++ b/deepparse/network/decoder.py
@@ -6,7 +6,7 @@
 import torch
 from torch import nn
 
-from ..weights_init import weights_init
+from .. import weights_init
 
 
 class Decoder(nn.Module):
diff --git a/deepparse/network/encoder.py b/deepparse/network/encoder.py
index 27d911f6..5fafb917 100644
--- a/deepparse/network/encoder.py
+++ b/deepparse/network/encoder.py
@@ -7,7 +7,7 @@
 from torch import nn
 from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
 
-from ..weights_init import weights_init
+from .. import weights_init
 
 
 class Encoder(nn.Module):
diff --git a/deepparse/network/seq2seq.py b/deepparse/network/seq2seq.py
index 38fc5b5a..5e05625b 100644
--- a/deepparse/network/seq2seq.py
+++ b/deepparse/network/seq2seq.py
@@ -4,7 +4,6 @@
 import random
 import warnings
 from abc import ABC
-from collections import OrderedDict
 from typing import Tuple, Union, List
 
 import torch
@@ -12,6 +11,7 @@
 from .decoder import Decoder
 from .encoder import Encoder
+from .. import handle_weights_upload
 from ..tools import download_weights, latest_version
@@ -113,20 +113,21 @@ def _load_pre_trained_weights(self, model_type: str, cache_dir: str, offline: bo
             )
             download_weights(model_type, cache_dir, verbose=self.verbose)
 
-        all_layers_params = torch.load(model_path, map_location=self.device)
-        self.load_state_dict(all_layers_params)
+        self._load_weights(path_to_model_torch_archive=model_path)
 
-    def _load_weights(self, path_to_retrained_model: str) -> None:
+    def _load_weights(self, path_to_model_torch_archive: str) -> None:
         """
         Method to load (into the network) the weights.
 
         Args:
-            path_to_retrained_model (str): The path to the fine-tuned model.
+            path_to_model_torch_archive (str): The path to the fine-tuned model Torch archive.
         """
-        all_layers_params = torch.load(path_to_retrained_model, map_location=self.device)
-        if isinstance(all_layers_params, dict) and not isinstance(all_layers_params, OrderedDict):
-            # Case where we have a retrained model with a different tagging space
-            all_layers_params = all_layers_params.get("address_tagger_model")
+        all_layers_params = handle_weights_upload(
+            path_to_model_to_upload=path_to_model_torch_archive, device=self.device
+        )
+
+        # Our Torch archives always include metadata along with the model weights.
+        all_layers_params = all_layers_params.get("address_tagger_model")
         self.load_state_dict(all_layers_params)
 
     def _encoder_step(self, to_predict: torch.Tensor, lengths: List, batch_size: int) -> Tuple:
diff --git a/deepparse/parser/address_parser.py b/deepparse/parser/address_parser.py
index 18718ae5..f824c134 100644
--- a/deepparse/parser/address_parser.py
+++ b/deepparse/parser/address_parser.py
@@ -14,6 +14,7 @@
 from typing import Dict, List, Tuple, Union, Callable
 
 import torch
+from cloudpathlib import CloudPath, S3Path
 from poutyne.framework import Experiment
 from torch.optim import SGD
 from torch.utils.data import DataLoader, Subset
@@ -43,6 +44,7 @@
 from ..pre_processing import trailing_whitespace_cleaning, double_whitespaces_cleaning
 from ..tools import CACHE_PATH, valid_poutyne_version
 from ..vectorizer import VectorizerFactory
+from ..weights_tools import handle_weights_upload
 
 _pre_trained_tags_to_idx = {
     "StreetNumber": 0,
@@ -86,7 +88,7 @@ class AddressParser:
         - ``"lightest"`` (the one using the less RAM and GPU usage) (equivalent to ``"fasttext-light"``),
         - ``"best"`` (the best accuracy performance) (equivalent to ``"bpemb"``).
 
         The default value is ``"best"`` for the most accurate model. Ignored if ``path_to_retrained_model`` is not
         ``None``. To further improve performance, consider using the models (fasttext or BPEmb) with their
         counterparts using an attention mechanism with the ``attention_mechanism`` flag.
         attention_mechanism (bool): Whether to use the model with an attention mechanism. The model will use an
@@ -102,10 +104,13 @@ class AddressParser:
             The default value is GPU with the index ``0`` if it exists. Otherwise, the value is ``CPU``.
         rounding (int): The rounding to use when asking the probability of the tags. The default value is four digits.
         verbose (bool): Turn on/off the verbosity of the model weights download and loading. The default value is True.
-        path_to_retrained_model (Union[str, None]): The path to the retrained model to use for prediction. We will
-            infer the ``model_type`` of the retrained model. The default value is ``None``, meaning we use our
+        path_to_retrained_model (Union[S3Path, str, None]): The path to the retrained model to use for prediction.
+            We will infer the ``model_type`` of the retrained model. The default value is ``None``, meaning we use our
             pretrained model. If the retrained model uses an attention mechanism, ``attention_mechanism`` needs to
-            be set to True.
+            be set to True. The ``path_to_retrained_model`` can also be an S3-like (Azure, AWS, Google) bucket URI
+            string path (e.g. ``"s3://path/to/aws/s3/bucket.ckpt"``), or an ``S3Path`` S3-like URI built with
+            `cloudpathlib`. See the `cloudpathlib` documentation for details on the supported S3 bucket providers
+            and URI conventions. The default value is ``None``.
         cache_dir (Union[str, None]): The path to the cached directory to use for downloading (and loading) the
             embeddings model and the model pretrained weights.
         offline (bool): Whether or not the model is an offline one, meaning you have already downloaded the pre-trained
@@ -117,7 +122,7 @@ class AddressParser:
     Note:
         For both networks, we will download the pretrained weights and embeddings in the ``.cache`` directory
         for the root user. The pretrained weights take at most 44 MB. The fastText embeddings take 6.8 GB,
-        the fastText-light embeddings take 3.3 GO and bpemb take 116 MB (in .cache/bpemb).
+        the fastText-light embeddings take 3.3 GB, and BPEmb takes 116 MB (in ``".cache/bpemb"``).
 
         Also, one can download all the dependencies of our pretrained model using our CLI
         (e.g. download_model fasttext) before sending it to a node without access to Internet.
@@ -164,15 +169,15 @@ class AddressParser:
 
         .. code-block:: python
 
             address_parser = AddressParser(model_type="fasttext",
-                                           path_to_retrained_model="/path_to_a_retrain_fasttext_model")
+                                           path_to_retrained_model="/path_to_a_retrain_fasttext_model.ckpt")
 
             parse_address = address_parser("350 rue des Lilas Ouest Quebec city Quebec G1L 1B6")
 
         Using a retrained model trained on different tags
 
         .. code-block:: python
 
             # We don't give the model_type since it's ignored when using path_to_retrained_model
-            address_parser = AddressParser(path_to_retrained_model="/path_to_a_retrain_fasttext_model")
+            address_parser = AddressParser(path_to_retrained_model="/path_to_a_retrain_fasttext_model.ckpt")
 
             parse_address = address_parser("350 rue des Lilas Ouest Quebec city Quebec G1L 1B6")
 
         Using a retrained model with attention
         .. code-block:: python
 
             address_parser = AddressParser(model_type="fasttext",
-                                           path_to_retrained_model="/path_to_a_retrain_fasttext_attention_model",
+                                           path_to_retrained_model="/path_to_a_retrain_fasttext_attention_model.ckpt",
                                            attention_mechanism=True)
 
             parse_address = address_parser("350 rue des Lilas Ouest Quebec city Quebec G1L 1B6")
 
@@ -193,6 +198,21 @@ class AddressParser:
                                            offline=True)
 
             parse_address = address_parser("350 rue des Lilas Ouest Quebec city Quebec G1L 1B6")
 
+        Using a retrained model in an S3-like bucket.
+
+        .. code-block:: python
+
+            address_parser = AddressParser(model_type="fasttext",
+                                           path_to_retrained_model="s3://path/to/bucket.ckpt")
+
+            parse_address = address_parser("350 rue des Lilas Ouest Quebec city Quebec G1L 1B6")
+
+        Using a retrained model in an S3-like bucket using CloudPathLib.
+
+        .. code-block:: python
+
+            address_parser = AddressParser(model_type="fasttext",
+                                           path_to_retrained_model=CloudPath("s3://path/to/bucket.ckpt"))
+
+            parse_address = address_parser("350 rue des Lilas Ouest Quebec city Quebec G1L 1B6")
     """
 
     def __init__(
@@ -202,7 +222,7 @@ def __init__(
         device: Union[int, str, torch.device] = 0,
         rounding: int = 4,
         verbose: bool = True,
-        path_to_retrained_model: Union[str, None] = None,
+        path_to_retrained_model: Union[S3Path, str, None] = None,
         cache_dir: Union[str, None] = None,
         offline: bool = False,
     ) -> None:
@@ -222,17 +242,21 @@ def __init__(
         seq2seq_kwargs = {}  # Empty for default settings
 
         if path_to_retrained_model is not None:
-            checkpoint_weights = torch.load(path_to_retrained_model, map_location="cpu")
+            checkpoint_weights = handle_weights_upload(path_to_model_to_upload=path_to_retrained_model)
             if checkpoint_weights.get("model_type") is None:
                 # Validate if we have the proper metadata, it has at least the parser model type
                 # if no other thing have been modified.
-                raise RuntimeError(
-                    "You are not using the proper retrained checkpoint. "
+                error_text = (
+                    "You are not using the proper retrained checkpoint for Deepparse, since we also export other "
+                    "information along with the model weights. "
                     "When we retrain an AddressParser, by default, we create a "
-                    "checkpoint name 'retrained_modeltype_address_parser.ckpt'. Be sure to use that"
-                    "checkpoint since it includes some metadata for the reloading."
+                    "checkpoint named 'retrained_modeltype_address_parser.ckpt', "
+                    "where 'modeltype' is the AddressParser model type (e.g. 'fasttext', 'bpemb'). "
+                    "The checkpoint name can also change if you give the retrained model a name. "
+                    "Be sure to use that checkpoint, since it includes some metadata for the reloading. "
+                    "See AddressParser.retrain for more details."
                 )
+                raise RuntimeError(error_text)
             if validate_if_new_seq2seq_params(checkpoint_weights):
                 seq2seq_kwargs = checkpoint_weights.get("seq2seq_params")
             if validate_if_new_prediction_tags(checkpoint_weights):
@@ -453,6 +477,7 @@ def retrain(
         seq2seq_params: Union[Dict, None] = None,
         layers_to_freeze: Union[str, None] = None,
         name_of_the_retrain_parser: Union[None, str] = None,
+        verbose: Union[None, bool] = None,
     ) -> List[Dict]:
         # pylint: disable=too-many-arguments, too-many-locals, too-many-branches, too-many-statements
@@ -497,6 +522,12 @@ def retrain(
             logging_path (str): The logging path for the checkpoints. Poutyne will use the best one and reload
                 the state if any checkpoints are there. Thus, an error will be raised if you change the model type.
                 For example, you retrain a FastText model and then retrain a BPEmb in the same logging path directory.
+                The logging_path can also be an S3-like (Azure, AWS, Google) bucket URI string path
+                (e.g. ``"s3://path/to/aws/s3/bucket.ckpt"``), or an ``S3Path`` S3-like URI built with `cloudpathlib`.
+                See the `cloudpathlib` documentation for details on the supported S3 bucket providers and URI
+                conventions.
+                If the logging_path is an S3 bucket, we will only save the best checkpoint to the S3 bucket at the
+                end of training.
                 By default, the path is ``./checkpoints``.
             disable_tensorboard (bool): To disable Poutyne automatic Tensorboard monitoring. By default, we
                 disable them (true).
@@ -542,6 +573,9 @@ def retrain(
                 - if prediction_tags is not ``None``, the following tag: ``ModifiedPredictionTags``,
                 - if seq2seq_params is not ``None``, the following tag: ``ModifiedSeq2SeqConfiguration``, and
                 - if layers_to_freeze is not ``None``, the following tag: ``FreezedLayer{portion}``.
+            verbose (Union[None, bool]): To override the AddressParser verbosity during the retraining. When set to
+                ``True`` or ``False``, it overrides the verbosity for the retraining only (it does not change the
+                AddressParser verbosity). If set to the default value ``None``, the AddressParser verbosity is used.
 
         Return:
@@ -742,6 +776,10 @@ def retrain(
             batch_metrics=[accuracy],
         )
 
+        # Handle the verbose override parameter
+        if verbose is None:
+            verbose = self.verbose
+
         try:
             with_capturing_context = False
             if not valid_poutyne_version(min_major=1, min_minor=8):
@@ -760,6 +798,7 @@ def retrain(
                 callbacks=callbacks,
                 disable_tensorboard=disable_tensorboard,
                 capturing_context=with_capturing_context,
+                verbose=verbose,
             )
         except RuntimeError as error:
             list_of_file_path = os.listdir(path=".")
@@ -797,6 +836,7 @@ def retrain(
             else f"retrained_{self.model_type}_address_parser.ckpt"
         )
         file_path = os.path.join(logging_path, file_name)
+
         torch_save = {
             "address_tagger_model": exp.model.network.state_dict(),
             "model_type": self.model_type,
@@ -817,7 +857,29 @@ def retrain(
             }
         )
 
-        torch.save(torch_save, file_path)
+        if isinstance(file_path, S3Path):
+            # To handle a cloudpathlib S3Path logging path
+            try:
+                with file_path.open("wb") as file:
+                    torch.save(torch_save, file)
+            except FileNotFoundError as error:
+                raise FileNotFoundError("The file in the S3 bucket was not found.") from error
+
+        elif "s3://" in file_path:
+            file_path = CloudPath(file_path)
+            try:
+                with file_path.open("wb") as file:
+                    torch.save(torch_save, file)
+            except FileNotFoundError as error:
+                raise FileNotFoundError("The file in the S3 bucket was not found.") from error
+        else:
+            try:
+                torch.save(torch_save, file_path)
+            except FileNotFoundError as error:
+                if "s3" in file_path or "//" in file_path or ":" in file_path:
+                    raise FileNotFoundError(
+                        "Are you trying to use an AWS S3 URI? If so, the path needs to start with 's3://'."
+                    ) from error
+                raise
         return train_res
 
     def test(
@@ -1114,8 +1176,8 @@ def _predict_pipeline(self, data: List) -> Tuple:
         """
         return self.processor.process_for_inference(data)
 
-    @staticmethod
     def _retrain(
+        self,
         experiment: Experiment,
         train_generator: DatasetContainer,
         valid_generator: DatasetContainer,
@@ -1124,6 +1186,7 @@ def _retrain(
         callbacks: List,
         disable_tensorboard: bool,
         capturing_context: bool,
+        verbose: Union[None, bool],
     ) -> List[Dict]:
         # pylint: disable=too-many-arguments
         # If Poutyne 1.7 and before, we capture poutyne print since it prints some exception.
@@ -1136,6 +1199,7 @@ def _retrain(
             seed=seed,
             callbacks=callbacks,
             disable_tensorboard=disable_tensorboard,
+            verbose=verbose,
         )
         return train_res
 
@@ -1250,9 +1314,12 @@ def _apply_pre_processors(self, addresses: List[str]) -> List[str]:
         res = []
 
         for address in addresses:
+            processed_address = address
+
             for pre_processor in self.pre_processors:
-                processed_address = pre_processor(address)
-            res.append(" ".join(processed_address.split()))
+                processed_address = pre_processor(processed_address)
+
+            res.append(" ".join(processed_address.split()))
         return res
 
     def is_same_model_type(self, other) -> bool:
diff --git a/deepparse/parser/tools.py b/deepparse/parser/tools.py
index 8e77afe9..e4a3b495 100644
--- a/deepparse/parser/tools.py
+++ b/deepparse/parser/tools.py
@@ -1,7 +1,7 @@
-import math
 import os
 from typing import List, OrderedDict, Tuple
 
+import math
 import numpy as np
 import torch
@@ -134,7 +134,10 @@ def infer_model_type(checkpoint_weights: OrderedDict, attention_mechanism: bool)
     else:
         model_type = "fasttext"
 
-    if "decoder.linear_attention_mechanism_encoder_outputs.weight" in checkpoint_weights.keys():
+    if (
+        "decoder.linear_attention_mechanism_encoder_outputs.weight"
+        in checkpoint_weights.get("address_tagger_model").keys()
+    ):
         attention_mechanism = True
 
     return model_type, attention_mechanism
diff --git a/deepparse/weights_init.py b/deepparse/weights_init.py
deleted file mode 100644
index 5e6b13b2..00000000
--- a/deepparse/weights_init.py
+++ /dev/null
@@ -1,21 +0,0 @@
-from torch import nn
-from torch.nn import init
-
-
-def weights_init(m: nn.Module) -> None:
-    """
-    Function to initialize the weights of a model layers.
-
-    Usage:
-        network = Model()
-        network.apply(weight_init)
-    """
-    if isinstance(m, nn.Linear):
-        init.xavier_normal_(m.weight.data)
-        init.normal_(m.bias.data)
-    elif isinstance(m, (nn.LSTM, nn.LSTMCell, nn.GRU, nn.GRUCell)):
-        for param in m.parameters():
-            if len(param.shape) >= 2:
-                init.orthogonal_(param.data)
-            else:
-                init.normal_(param.data)
diff --git a/deepparse/weights_tools.py b/deepparse/weights_tools.py
new file mode 100644
index 00000000..dd5831bd
--- /dev/null
+++ b/deepparse/weights_tools.py
@@ -0,0 +1,56 @@
+from typing import OrderedDict, Union
+
+import torch
+from cloudpathlib import CloudPath, S3Path
+from torch import nn
+from torch.nn import init
+
+
+def weights_init(m: nn.Module) -> None:
+    """
+    Function to initialize the weights of a model's layers.
+
+    Usage:
+        network = Model()
+        network.apply(weights_init)
+    """
+    if isinstance(m, nn.Linear):
+        init.xavier_normal_(m.weight.data)
+        init.normal_(m.bias.data)
+    elif isinstance(m, (nn.LSTM, nn.LSTMCell, nn.GRU, nn.GRUCell)):
+        for param in m.parameters():
+            if len(param.shape) >= 2:
+                init.orthogonal_(param.data)
+            else:
+                init.normal_(param.data)
+
+
+def handle_weights_upload(
+    path_to_model_to_upload: Union[str, S3Path], device: Union[str, torch.device] = "cpu"
+) -> OrderedDict:
+    if isinstance(path_to_model_to_upload, S3Path):
+        # To handle a cloudpathlib S3Path
+        try:
+            with path_to_model_to_upload.open("rb") as file:
+                checkpoint_weights = torch.load(file, map_location=device)
+        except FileNotFoundError as error:
+            raise FileNotFoundError("The file in the S3 bucket was not found.") from error
+    elif "s3://" in path_to_model_to_upload:
+        # To handle a str S3-like URI.
+        path_to_model_to_upload = CloudPath(path_to_model_to_upload)
+        try:
+            with path_to_model_to_upload.open("rb") as file:
+                checkpoint_weights = torch.load(file, map_location=device)
+        except FileNotFoundError as error:
+            raise FileNotFoundError("The file in the S3 bucket was not found.") from error
+    else:
+        # The path is a local one (or a wrongly written S3 URI).
+        try:
+            checkpoint_weights = torch.load(path_to_model_to_upload, map_location=device)
+        except FileNotFoundError as error:
+            if "s3" in path_to_model_to_upload or "//" in path_to_model_to_upload or ":" in path_to_model_to_upload:
+                raise FileNotFoundError(
+                    "Are you trying to use an AWS S3 URI? If so, the path needs to start with 's3://'."
+                ) from error
+            raise FileNotFoundError(f"The file {path_to_model_to_upload} was not found.") from error
+    return checkpoint_weights
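For clarity, here is a short usage sketch of the new ``handle_weights_upload`` helper (the bucket name and checkpoint filename are placeholders; the ``address_tagger_model`` and ``model_type`` keys come from the checkpoint dictionary that ``retrain`` saves, as shown above):

.. code-block:: python

    from cloudpathlib import CloudPath

    from deepparse.weights_tools import handle_weights_upload

    # A local checkpoint path.
    checkpoint = handle_weights_upload("retrained_fasttext_address_parser.ckpt")

    # An S3-like URI string, or an explicit cloudpathlib path object.
    checkpoint = handle_weights_upload("s3://a_bucket/retrained_fasttext_address_parser.ckpt")
    checkpoint = handle_weights_upload(CloudPath("s3://a_bucket/retrained_fasttext_address_parser.ckpt"))

    # The returned value is the saved checkpoint dictionary, weights plus metadata.
    state_dict = checkpoint.get("address_tagger_model")
    model_type = checkpoint.get("model_type")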
diff --git a/docs/source/examples/fine_tuning_uri.rst b/docs/source/examples/fine_tuning_uri.rst
new file mode 100644
index 00000000..c734f7ab
--- /dev/null
+++ b/docs/source/examples/fine_tuning_uri.rst
@@ -0,0 +1,63 @@
+.. role:: hidden
+    :class: hidden-section
+
+Retrain a Pretrained Model
+**************************
+
+.. code-block:: python
+
+    import os
+
+    import poutyne
+
+    from deepparse import download_from_public_repository
+    from deepparse.dataset_container import PickleDatasetContainer
+    from deepparse.parser import AddressParser
+
+
+First, let's download the train and test data from the public repository.
+
+.. code-block:: python
+
+    saving_dir = "./data"
+    file_extension = "p"
+    training_dataset_name = "sample_incomplete_data"
+    test_dataset_name = "test_sample_data"
+    download_from_public_repository(training_dataset_name, saving_dir, file_extension=file_extension)
+    download_from_public_repository(test_dataset_name, saving_dir, file_extension=file_extension)
+
+Now, let's create a training and a test container.
+
+.. code-block:: python
+
+    training_container = PickleDatasetContainer(os.path.join(saving_dir,
+                                                             training_dataset_name + "." + file_extension))
+    test_container = PickleDatasetContainer(os.path.join(saving_dir,
+                                                         test_dataset_name + "." + file_extension))
+
+We will retrain the ``FastText`` version of our pretrained model.
+
+.. code-block:: python
+
+    path_to_your_uri = "s3://<your_bucket_path>/fasttext.ckpt"
+    address_parser = AddressParser(model_type="fasttext", device=0, path_to_retrained_model=path_to_your_uri)
+
+Now, let's retrain for ``5`` epochs using a batch size of ``8``, since the dataset is really small for this example.
+Let's start with the default learning rate of ``0.01`` and use a learning rate scheduler to lower the learning rate
+as we progress.
+
+.. code-block:: python
+
+    # Reduce LR by a factor of 10 each epoch
+    lr_scheduler = poutyne.StepLR(step_size=1, gamma=0.1)
+
+The retrained model best checkpoint (ckpt) will be saved in the S3 Bucket
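A retrain call completing this example might look like the following sketch (the bucket path is a placeholder, and the ``train_ratio`` and ``num_workers`` values are illustrative; the epochs, batch size, and scheduler follow the text above):

.. code-block:: python

    address_parser.retrain(
        training_container,
        train_ratio=0.8,
        epochs=5,
        batch_size=8,
        num_workers=2,
        callbacks=[lr_scheduler],
        logging_path="s3://<your_bucket_path>/checkpoints",
    )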