diff --git a/qadence/backends/pytorch_wrapper.py b/qadence/backends/pytorch_wrapper.py
new file mode 100644
index 00000000..54b40975
--- /dev/null
+++ b/qadence/backends/pytorch_wrapper.py
@@ -0,0 +1,324 @@
+from __future__ import annotations
+
+from collections import Counter, OrderedDict
+from dataclasses import dataclass
+from functools import partial
+from typing import Any, Callable, Sequence
+
+import torch
+from torch import Tensor, nn
+from torch.autograd import Function
+
+from qadence.backend import Backend as QuantumBackend
+from qadence.backend import Converted, ConvertedCircuit, ConvertedObservable
+from qadence.backends.utils import param_dict
+from qadence.blocks import AbstractBlock, PrimitiveBlock
+from qadence.blocks.utils import uuid_to_block, uuid_to_eigen
+from qadence.circuit import QuantumCircuit
+from qadence.extensions import get_gpsr_fns
+from qadence.measurements import Measurements
+from qadence.ml_tools import promote_to_tensor
+from qadence.types import DiffMode, Endianness
+
+
+class PSRExpectation(Function):
+    """Overloads the PyTorch AD system to perform the parameter shift rule on quantum circuits."""
+
+    @staticmethod
+    def forward(
+        ctx: Any,
+        expectation_fn: Callable[[dict[str, Tensor]], Tensor],
+        param_psrs: Sequence[Callable],
+        param_keys: Sequence[str],
+        *param_values: Tensor,
+    ) -> Tensor:
+        for param in param_values:
+            param.detach()
+
+        ctx.expectation_fn = expectation_fn
+        ctx.param_psrs = param_psrs
+        ctx.param_keys = param_keys
+        ctx.save_for_backward(*param_values)
+
+        expectation_values = expectation_fn(param_values=param_dict(param_keys, param_values))  # type: ignore[call-arg] # noqa: E501
+        # Stack expectation values if they are returned as a list.
+        if isinstance(expectation_values, list):
+            return torch.stack(expectation_values)
+        else:
+            return expectation_values
+
+    @staticmethod
+    def backward(ctx: Any, grad_out: Tensor) -> tuple:
+        params = param_dict(ctx.param_keys, ctx.saved_tensors)
+
+        def expectation_fn(params: dict[str, Tensor]) -> Tensor:
+            return PSRExpectation.apply(
+                ctx.expectation_fn,
+                ctx.param_psrs,
+                params.keys(),
+                *params.values(),
+            )
+
+        def vjp(psr: Callable, name: str) -> Tensor:
+            return grad_out * psr(expectation_fn, params, name)
+
+        grads = [
+            vjp(psr, name) if needs_grad else None
+            for psr, name, needs_grad in zip(
+                ctx.param_psrs, ctx.param_keys, ctx.needs_input_grad[3:]
+            )
+        ]
+        return (None, None, None, *grads)
+
+
+@dataclass
+class DifferentiableExpectation:
+    """A handler for differentiating expectation estimation using various engines."""
+
+    backend: QuantumBackend
+    circuit: ConvertedCircuit
+    observable: list[ConvertedObservable] | ConvertedObservable
+    param_values: dict[str, Tensor]
+    state: Tensor | None = None
+    protocol: Measurements | None = None
+    endianness: Endianness = Endianness.BIG
+
+    def ad(self) -> Tensor:
+        self.observable = (
+            self.observable if isinstance(self.observable, list) else [self.observable]
+        )
+        if self.protocol:
+            expectation_fn = self.protocol.get_measurement_fn()
+            expectations = expectation_fn(
+                circuit=self.circuit.original,
+                observables=[obs.original for obs in self.observable],
+                param_values=self.param_values,
+                options=self.protocol.options,
+                state=self.state,
+                endianness=self.endianness,
+            )
+        else:
+            expectations = self.backend.expectation(
+                circuit=self.circuit,
+                observable=self.observable,
+                param_values=self.param_values,
+                state=self.state,
+                endianness=self.endianness,
+            )
+        return promote_to_tensor(
+            expectations if isinstance(expectations, Tensor) else torch.tensor(expectations)
+        )
+
+    def psr(self, psr_fn: Callable, **psr_args: int | float | None) -> Tensor:
+        # Wrapper which unpacks the parameters: PyTorch gradients can only be
+        # calculated w.r.t. tensors, so we unpack the params and pass the names
+        # in separately, as `apply` does not take keyword arguments.
+        # We also fold the observable into the backend, which makes life easier
+        # in the custom autodiff.
+        self.observable = (
+            self.observable if isinstance(self.observable, list) else [self.observable]
+        )
+
+        if self.protocol is not None:
+            expectation_fn = partial(
+                self.protocol.get_measurement_fn(),
+                circuit=self.circuit.original,
+                observables=[obs.original for obs in self.observable],
+                options=self.protocol.options,
+                state=self.state,
+                endianness=self.endianness,
+            )
+        else:
+            expectation_fn = partial(
+                self.backend.expectation,
+                circuit=self.circuit,
+                observable=self.observable,
+                state=self.state,
+                endianness=self.endianness,
+            )
+        # PSR only applies to parametric circuits.
+        if isinstance(self.observable, ConvertedObservable):
+            self.observable = [self.observable]
+        param_to_psr = self.construct_rules(
+            self.circuit.abstract, [o.abstract for o in self.observable], psr_fn, **psr_args
+        )
+
+        # Select the subset of all parameters to which PSR applies,
+        # i.e. those coming from the circuit only.
+        self.param_values = {k: self.param_values[k] for k in param_to_psr.keys()}
+
+        return PSRExpectation.apply(expectation_fn, param_to_psr.values(), self.param_values.keys(), *self.param_values.values())  # type: ignore # noqa: E501
+
+    # Make PSR construction a static method to avoid unhashability issues.
+    @staticmethod
+    def construct_rules(
+        circuit: QuantumCircuit,
+        observable: list[AbstractBlock],
+        psr_fn: Callable,
+        **psr_args: int | float | None,
+    ) -> dict[str, Callable]:
+        """Create a mapping between parameters and PSR functions."""
+
+        uuid_to_eigs = uuid_to_eigen(circuit.block)
+        # We currently rely on implicit ordering to match the PSR to the parameter,
+        # because we want to cache PSRs.
+
+        param_to_psr = OrderedDict()
+        for param_id, eigenvalues in uuid_to_eigs.items():
+            if eigenvalues is None:
+                raise ValueError(
+                    f"Eigenvalues are not defined for param_id {param_id}\n"
+                    # f"of type {type(block)}.\n"
+                    "PSR cannot be defined in that case."
+                )
+
+            param_to_psr[param_id] = psr_fn(eigenvalues, **psr_args)
+        for obs in observable:
+            for param_id, _ in uuid_to_eigen(obs).items():
+                # We need the embedded fixed params of the observable in the param_values dict
+                # to be able to call expectation. Since torch backward requires a list of
+                # param_ids and values of equal length, we need to pass them to PSR too.
+                # Since they are constants, their gradients are 0.
+                param_to_psr[param_id] = lambda x: torch.tensor([0.0], requires_grad=False)
+        return param_to_psr
+
+
+class DifferentiableBackend(nn.Module):
+    """A class to abstract the operations done by the autodiff engine.
+
+    Arguments:
+        backend: An instance of the QuantumBackend type to perform execution.
+        diff_mode: A differentiation mode supported by the differentiation engine.
+        **psr_args: Arguments that will be passed on to `DifferentiableExpectation`.
+    """
+
+    def __init__(
+        self,
+        backend: QuantumBackend,
+        diff_mode: DiffMode = DiffMode.AD,
+        **psr_args: int | float | None,
+    ) -> None:
+        super().__init__()
+
+        self.backend = backend
+        self.diff_mode = diff_mode
+        self.psr_args = psr_args
+        # TODO: Add differentiable overlap calculation
+        self._overlap: Callable = None  # type: ignore [assignment]
+
+    def run(
+        self,
+        circuit: ConvertedCircuit,
+        param_values: dict = {},
+        state: Tensor | None = None,
+        endianness: Endianness = Endianness.BIG,
+    ) -> Tensor:
+        """Run on the underlying backend."""
+        return self.backend.run(
+            circuit=circuit, param_values=param_values, state=state, endianness=endianness
+        )
+
+    def expectation(
+        self,
+        circuit: ConvertedCircuit,
+        observable: list[ConvertedObservable] | ConvertedObservable,
+        param_values: dict[str, Tensor] = {},
+        state: Tensor | None = None,
+        protocol: Measurements | None = None,
+        endianness: Endianness = Endianness.BIG,
+    ) -> Tensor:
+        """Compute the expectation value of a given observable.
+
+        Arguments:
+            circuit: A backend native quantum circuit to be executed.
+            observable: A backend native observable to compute the expectation value from.
+            param_values: A dict of values for symbolic substitution.
+            state: An initial state.
+            protocol: A shot-based measurement protocol.
+            endianness: Endianness of the state.
+
+        Returns:
+            A tensor of expectation values.
+        """
+        observable = observable if isinstance(observable, list) else [observable]
+        differentiable_expectation = DifferentiableExpectation(
+            backend=self.backend,
+            circuit=circuit,
+            observable=observable,
+            param_values=param_values,
+            state=state,
+            protocol=protocol,
+            endianness=endianness,
+        )
+
+        if self.diff_mode == DiffMode.AD:
+            expectation = differentiable_expectation.ad
+        else:
+            try:
+                fns = get_gpsr_fns()
+                psr_fn = fns[self.diff_mode]
+            except KeyError:
+                raise ValueError(f"{self.diff_mode} differentiation mode is not supported")
+            expectation = partial(differentiable_expectation.psr, psr_fn=psr_fn, **self.psr_args)
+        return expectation()
+
+    def sample(
+        self,
+        circuit: ConvertedCircuit,
+        param_values: dict[str, Tensor],
+        state: Tensor | None = None,
+        n_shots: int = 1,
+        endianness: Endianness = Endianness.BIG,
+    ) -> list[Counter]:
+        """Sample bitstrings from the registered circuit.
+
+        Arguments:
+            circuit: A backend native quantum circuit to be executed.
+            param_values: The values of the parameters after embedding.
+            n_shots: The number of shots. Defaults to 1.
+
+        Returns:
+            An iterable with all the sampled bitstrings.
+        """
+        with torch.no_grad():
+            return self.backend.sample(
+                circuit=circuit,
+                param_values=param_values,
+                state=state,
+                n_shots=n_shots,
+                endianness=endianness,
+            )
+
+    def circuit(self, circuit: QuantumCircuit) -> ConvertedCircuit:
+        parametrized_blocks = list(uuid_to_block(circuit.block).values())
+        non_prim_blocks = filter(lambda b: not isinstance(b, PrimitiveBlock), parametrized_blocks)
+        if len(list(non_prim_blocks)) > 0:
+            raise ValueError(
+                "The circuit contains non-primitive blocks that are currently not supported by the "
+                "PSR differentiable mode."
+            )
+        return self.backend.circuit(circuit)
+
+    def observable(self, observable: AbstractBlock, n_qubits: int) -> ConvertedObservable:
+        if observable is not None and observable.is_parametric:
+            raise ValueError("PSR cannot be applied to a parametric observable.")
+        return self.backend.observable(observable, n_qubits)
+
+    def convert(
+        self,
+        circuit: QuantumCircuit,
+        observable: list[AbstractBlock] | AbstractBlock | None = None,
+    ) -> Converted:
+        if self.diff_mode != DiffMode.AD and observable is not None:
+            if isinstance(observable, list):
+                for obs in observable:
+                    if obs.is_parametric:
+                        raise ValueError("PSR cannot be applied to a parametric observable.")
+            else:
+                if observable.is_parametric:
+                    raise ValueError("PSR cannot be applied to a parametric observable.")
+        return self.backend.convert(circuit, observable)
+
+    def assign_parameters(self, circuit: ConvertedCircuit, param_values: dict[str, Tensor]) -> Any:
+        return self.backend.assign_parameters(circuit, param_values)
diff --git a/qadence/blocks/embedding.py b/qadence/blocks/embedding.py
new file mode 100644
index 00000000..077ccb20
--- /dev/null
+++ b/qadence/blocks/embedding.py
@@ -0,0 +1,136 @@
+from __future__ import annotations
+
+from typing import Callable, Iterable, List
+
+import numpy as np
+import sympy
+import sympytorch  # type: ignore [import]
+import torch
+from torch import Tensor
+
+from qadence.blocks import (
+    AbstractBlock,
+)
+from qadence.blocks.utils import (
+    expressions,
+    parameters,
+    uuid_to_expression,
+)
+from qadence.parameters import evaluate, stringify, torchify
+
+StrTensorDict = dict[str, Tensor]
+
+
+def unique(x: Iterable) -> List:
+    return list(set(x))
+
+
+def embedding(
+    block: AbstractBlock, to_gate_params: bool = False
+) -> tuple[StrTensorDict, Callable[[StrTensorDict, StrTensorDict], StrTensorDict],]:
+    """Construct an embedding function which maps user-facing parameters to either
+    *expression-level* parameters or *gate-level* parameters. The constructed embedding
+    function has the signature:
+
+        embedding_fn(params: StrTensorDict, inputs: StrTensorDict) -> StrTensorDict
+
+    which means that it maps the *variational* parameter dict `params` and the *feature*
+    parameter dict `inputs` to one new parameter dict `embedded_dict` which holds all
+    parameters that are needed to execute a circuit on a given backend. There are two
+    different *modes* for this mapping:
+
+    - *Expression-level* parameters: For AD-based optimization. For every unique expression
+      we end up with one entry in the embedded dict:
+      `len(embedded_dict) == len(unique_parameter_expressions)`.
+    - *Gate-level* parameters: For PSR-based optimization or real devices. One parameter for
+      each gate parameter, regardless of whether they are based on the same expression:
+      `len(embedded_dict) == len(parametric_gates)`. This is needed because PSR requires
+      shifting the angles of **every** gate where the same parameter appears.
+
+    Arguments:
+        block: Parametrized block into which we want to embed parameters.
+        to_gate_params: A boolean flag indicating whether to generate gate-level parameters
+            or expression-level parameters.
+
+    Returns:
+        A tuple of the variational parameter dict and the embedding function.
+    """
+
+    unique_expressions = unique(expressions(block))
+    unique_symbols = [p for p in unique(parameters(block)) if not isinstance(p, sympy.Array)]
+    unique_const_matrices = [e for e in unique_expressions if isinstance(e, sympy.Array)]
+    unique_expressions = [e for e in unique_expressions if not isinstance(e, sympy.Array)]
+
+    # NOTE
+    # There are three kinds of parameters in qadence:
+    # - non-trainable: inputs for classical (feature) data
+    # - trainable: variational parameters to be optimized
+    # - fixed: non-trainable parameters with a fixed value (e.g. pi/2)
+    #
+    # Both non-trainable and trainable parameters can be assigned to several operations
+    # in the quantum circuit, e.g. the same parameter applied to multiple gates.
+    non_numeric_symbols = [p for p in unique_symbols if not p.is_number]
+    trainable_symbols = [p for p in non_numeric_symbols if p.trainable]
+    constant_expressions = [expr for expr in unique_expressions if expr.is_number]
+    # We don't need to care about constant symbols if they are contained in a symbolic
+    # expression; we only care about gate params which are ONLY a constant.
+
+    embeddings: dict[sympy.Expr, sympytorch.SymPyModule] = {
+        expr: torchify(expr) for expr in unique_expressions if not expr.is_number
+    }
+
+    uuid_to_expr = uuid_to_expression(block)
+
+    def embedding_fn(params: StrTensorDict, inputs: StrTensorDict) -> StrTensorDict:
+        embedded_params: dict[sympy.Expr, Tensor] = {}
+        for expr, fn in embeddings.items():
+            angle: Tensor
+            values = {}
+            for symbol in expr.free_symbols:
+                if symbol.name in inputs:
+                    value = inputs[symbol.name]
+                elif symbol.name in params:
+                    value = params[symbol.name]
+                else:
+                    msg_trainable = "Trainable" if symbol.trainable else "Non-trainable"
+                    raise KeyError(
+                        f"{msg_trainable} parameter '{symbol.name}' not found in the "
+                        f"inputs list: {list(inputs.keys())} nor the "
+                        f"params list: {list(params.keys())}."
+                    )
+                values[symbol.name] = value
+            angle = fn(**values)
+            # Do not reshape parameters which are multi-dimensional tensors,
+            # such as generator matrices.
+            if not len(angle.squeeze().shape) > 1:
+                angle = angle.reshape(-1)
+            embedded_params[expr] = angle
+
+        for e in constant_expressions + unique_const_matrices:
+            embedded_params[e] = params[stringify(e)]
+
+        if to_gate_params:
+            gate_lvl_params: StrTensorDict = {}
+            for uuid, e in uuid_to_expr.items():
+                gate_lvl_params[uuid] = embedded_params[e]
+            return gate_lvl_params
+        else:
+            return {stringify(k): v for k, v in embedded_params.items()}
+
+    params: StrTensorDict
+    params = {p.name: torch.tensor([p.value], requires_grad=True) for p in trainable_symbols}
+    params.update(
+        {
+            stringify(expr): torch.tensor([evaluate(expr)], requires_grad=False)
+            for expr in constant_expressions
+        }
+    )
+    params.update(
+        {
+            stringify(expr): torch.tensor(
+                np.array(expr.tolist(), dtype=np.cdouble), requires_grad=False
+            )
+            for expr in unique_const_matrices
+        }
+    )
+    return params, embedding_fn