diff --git a/docs/differentiation.md b/docs/differentiation.md
index 8e29f83c..25d71031 100644
--- a/docs/differentiation.md
+++ b/docs/differentiation.md
@@ -10,7 +10,32 @@ It uses the `torch` native automatic differentiation engine which tracks operati
 The [adjoint differentiation mode](https://arxiv.org/abs/2009.02823) computes first-order gradients by only requiring at most three states in memory in `O(P)` time where `P` is the number of parameters in a circuit.
 
 ### Generalized Parameter-Shift rules (DiffMode.GPSR)
-To be added.
+The Generalized parameter shift rule (GPSR mode) is an extension of the well known [parameter shift rule (PSR)](https://arxiv.org/abs/1811.11184) algorithm [to arbitrary quantum operations](https://arxiv.org/abs/2108.01218). Indeed, PSR only works for quantum operations whose generator has a single gap in its eigenvalue spectrum, GPSR extending to multi-gap.
+
+For this, we define the differentiable function as quantum expectation value
+
+$$
+f(x) = \left\langle 0\right|\hat{U}^{\dagger}(x)\hat{C}\hat{U}(x)\left|0\right\rangle
+$$
+
+where $\hat{U}(x)={\rm exp}{\left( -i\frac{x}{2}\hat{G}\right)}$ is the quantum evolution operator with generator $\hat{G}$ representing the structure of the underlying quantum circuit and $\hat{C}$ is the cost operator. Then using the eigenvalue spectrum $\left\{ \lambda_n\right\}$ of the generator $\hat{G}$ we calculate the full set of corresponding unique non-zero spectral gaps $\left\{ \Delta_s\right\}$ (differences between eigenvalues). It can be shown that the final expression of derivative of $f(x)$ is then given by the following expression:
+
+$\begin{equation}
+\frac{{\rm d}f\left(x\right)}{{\rm d}x}=\overset{S}{\underset{s=1}{\sum}}\Delta_{s}R_{s},
+\end{equation}$
+
+where $S$ is the number of unique non-zero spectral gaps and $R_s$ are real quantities that are solutions of a system of linear equations
+
+$\begin{equation}
+\begin{cases}
+F_{1} & =4\overset{S}{\underset{s=1}{\sum}}{\rm sin}\left(\frac{\delta_{1}\Delta_{s}}{2}\right)R_{s},\\
+F_{2} & =4\overset{S}{\underset{s=1}{\sum}}{\rm sin}\left(\frac{\delta_{2}\Delta_{s}}{2}\right)R_{s},\\
+ & ...\\
+F_{S} & =4\overset{S}{\underset{s=1}{\sum}}{\rm sin}\left(\frac{\delta_{M}\Delta_{s}}{2}\right)R_{s}.
+\end{cases}
+\end{equation}$
+
+Here $F_s=f(x+\delta_s)-f(x-\delta_s)$ denotes the difference between values of functions evaluated at shifted arguments $x\pm\delta_s$.
 
 ### Example
 ```python exec="on" source="material-block" html="1"
@@ -23,7 +48,7 @@ batch_size = 1
 
 ry = pyq.RY(0, param_name="x")
 cnot = pyq.CNOT(1, 2)
-ops = [ry, cnot]
+ops = [ry]
 n_qubits = 3
 circ = pyq.QuantumCircuit(n_qubits, ops)
 
@@ -32,8 +57,11 @@ state = pyq.zero_state(n_qubits)
 
 values_ad = {"x": torch.tensor([torch.pi / 2], requires_grad=True)}
 values_adjoint = {"x": torch.tensor([torch.pi / 2], requires_grad=True)}
+values_gpsr = {"x": torch.tensor([torch.pi / 2], requires_grad=True)}
+
 exp_ad = pyq.expectation(circ, state, values_ad, obs, DiffMode.AD)
 exp_adjoint = pyq.expectation(circ, state, values_adjoint, obs, DiffMode.ADJOINT)
+exp_gpsr = pyq.expectation(circ, state, values_gpsr, obs, DiffMode.GPSR)
 
 dfdx_ad = torch.autograd.grad(exp_ad, tuple(values_ad.values()), torch.ones_like(exp_ad))
 
@@ -41,7 +69,12 @@ dfdx_adjoint = torch.autograd.grad(
     exp_adjoint, tuple(values_adjoint.values()), torch.ones_like(exp_adjoint)
 )
 
-assert len(dfdx_ad) == len(dfdx_adjoint)
+dfdx_gpsr = torch.autograd.grad(
+    exp_gpsr, tuple(values_gpsr.values()), torch.ones_like(exp_gpsr)
+)
+
+assert len(dfdx_ad) == len(dfdx_adjoint) == len(dfdx_gpsr)
 for i in range(len(dfdx_ad)):
     assert torch.allclose(dfdx_ad[i], dfdx_adjoint[i])
+    assert torch.allclose(dfdx_ad[i], dfdx_gpsr[i])
 ```
diff --git a/pyqtorch/api.py b/pyqtorch/api.py
index 92f78a83..24af0ecb 100644
--- a/pyqtorch/api.py
+++ b/pyqtorch/api.py
@@ -8,6 +8,7 @@
 from pyqtorch.adjoint import AdjointExpectation
 from pyqtorch.analog import Observable
 from pyqtorch.circuit import QuantumCircuit
+from pyqtorch.gpsr import PSRExpectation
 from pyqtorch.utils import DiffMode, inner_prod
 
 logger = getLogger(__name__)
@@ -94,6 +95,8 @@ def expectation(
             circuit, observable, state, values.keys(), *values.values()
         )
     elif diff_mode == DiffMode.GPSR:
-        raise NotImplementedError("To be added.")
+        return PSRExpectation.apply(
+            circuit, observable, state, values.keys(), *values.values()
+        )
     else:
         logger.error(f"Requested diff_mode '{diff_mode}' not supported.")
diff --git a/pyqtorch/gpsr.py b/pyqtorch/gpsr.py
new file mode 100644
index 00000000..3e47c46f
--- /dev/null
+++ b/pyqtorch/gpsr.py
@@ -0,0 +1,152 @@
+from __future__ import annotations
+
+from logging import getLogger
+from typing import Any, Tuple
+
+import torch
+from torch import Tensor, no_grad
+from torch.autograd import Function
+
+import pyqtorch as pyq
+from pyqtorch.analog import HamiltonianEvolution, Observable, Scale
+from pyqtorch.circuit import QuantumCircuit
+from pyqtorch.parametric import Parametric
+from pyqtorch.utils import inner_prod, param_dict
+
+logger = getLogger(__name__)
+
+
+class PSRExpectation(Function):
+    r"""
+    Implementation of the generalized parameter shift rule.
+
+    Compared to the original parameter shift rule
+    which only works for quantum operations whose generator has a single gap
+    in its eigenvalue spectrum, GPSR works with arbitrary
+    generators of quantum operations.
+
+    For this, we define the differentiable function as quantum expectation value
+
+    $$
+    f(x) = \left\langle 0\right|\hat{U}^{\dagger}(x)\hat{C}\hat{U}(x)\left|0\right\rangle
+    $$
+
+    where $\hat{U}(x)={\rm exp}{\left( -i\frac{x}{2}\hat{G}\right)}$
+    is the quantum evolution operator with generator $\hat{G}$ representing the structure
+    of the underlying quantum circuit and $\hat{C}$ is the cost operator.
+    Then using the eigenvalue spectrum $\left\{ \lambda_n\right\}$ of the generator $\hat{G}$
+    we calculate the full set of corresponding unique non-zero spectral gaps
+    $\left\{ \Delta_s\right\}$ (differences between eigenvalues).
+    It can be shown that the final expression of derivative of $f(x)$
+    is then given by the following expression:
+
+    $\begin{equation}
+    \frac{{\rm d}f\left(x\right)}{{\rm d}x}=\overset{S}{\underset{s=1}{\sum}}\Delta_{s}R_{s},
+    \end{equation}$
+
+    where $S$ is the number of unique non-zero spectral gaps and $R_s$ are real quantities that
+    are solutions of a system of linear equations
+
+    $\begin{equation}
+    \begin{cases}
+    F_{1} & =4\overset{S}{\underset{s=1}{\sum}}{\rm sin}
+    \left(\frac{\delta_{1}\Delta_{s}}{2}\right)R_{s},\\
+    F_{2} & =4\overset{S}{\underset{s=1}{\sum}}{\rm sin}
+    \left(\frac{\delta_{2}\Delta_{s}}{2}\right)R_{s},\\
+    & ...\\
+    F_{S} & =4\overset{S}{\underset{s=1}{\sum}}{\rm sin}
+    \left(\frac{\delta_{M}\Delta_{s}}{2}\right)R_{s}.
+    \end{cases}
+    \end{equation}$
+
+    Here $F_s=f(x+\delta_s)-f(x-\delta_s)$ denotes the difference between values
+    of functions evaluated at shifted arguments $x\pm\delta_s$.
+
+    Arguments:
+        circuit: A QuantumCircuit instance
+        observable: A hamiltonian.
+        state: A state in the form of [2 * n_qubits + [batch_size]]
+        param_names: A list of parameter names.
+        *param_values: A unpacked tensor of values for each parameter.
+    """
+
+    @staticmethod
+    @no_grad()
+    def forward(
+        ctx: Any,
+        circuit: QuantumCircuit,
+        observable: Observable,
+        state: Tensor,
+        param_names: list[str],
+        *param_values: Tensor,
+    ) -> Tensor:
+        ctx.circuit = circuit
+        ctx.observable = observable
+        ctx.param_names = param_names
+        ctx.state = state
+        values = param_dict(param_names, param_values)
+        ctx.out_state = circuit.run(state, values)
+        ctx.projected_state = observable.run(ctx.out_state, values)
+        ctx.save_for_backward(*param_values)
+        return inner_prod(ctx.out_state, ctx.projected_state).real
+
+    @staticmethod
+    def backward(ctx: Any, grad_out: Tensor) -> Tuple[None, ...]:
+        check_support_psr(ctx.circuit)
+        param_values = ctx.saved_tensors
+        values = param_dict(ctx.param_names, param_values)
+        grads_dict = {k: None for k in values.keys()}
+        shift = torch.tensor(torch.pi) / 2.0
+
+        for op in ctx.circuit.flatten():
+            if isinstance(op, Parametric) and isinstance(op.param_name, str):
+                spectrum = torch.linalg.eigvalsh(op.pauli).reshape(-1, 1)
+                spectral_gap = torch.unique(
+                    torch.abs(torch.tril(spectrum - spectrum.T))
+                )
+                spectral_gap = spectral_gap[spectral_gap.nonzero()]
+                assert (
+                    len(spectral_gap) == 1
+                ), "PSRExpectation only works on single_gap for now."
+
+                if values[op.param_name].requires_grad:
+                    with no_grad():
+                        copied_values = values.copy()
+                        copied_values[op.param_name] += shift
+                        f_plus = pyq.expectation(
+                            ctx.circuit, ctx.state, copied_values, ctx.observable
+                        )
+                        copied_values[op.param_name] -= 2.0 * shift
+                        f_min = pyq.expectation(
+                            ctx.circuit, ctx.state, copied_values, ctx.observable
+                        )
+
+                    grad = (
+                        spectral_gap
+                        * (f_plus - f_min)
+                        / (4 * torch.sin(spectral_gap * shift / 2))
+                    )
+                    grad *= grad_out
+                if grads_dict[op.param_name] is not None:
+                    grads_dict[op.param_name] += grad
+                else:
+                    grads_dict[op.param_name] = grad
+            else:
+                logger.error(f"PSRExpectation does not support operation: {type(op)}.")
+        return (None, None, None, None, *grads_dict.values())
+
+
+def check_support_psr(circuit: QuantumCircuit):
+    """Checking that circuit has only compatible operations for PSR.
+
+    Args:
+        circuit (QuantumCircuit): Circuit to check.
+
+    Raises:
+        ValueError: When circuit contains Scale or HamiltonianEvolution.
+    """
+    for op in circuit.operations:
+        if isinstance(op, Scale) or isinstance(op, HamiltonianEvolution):
+            raise ValueError(
+                f"PSR is not applicable as circuit contains an operation of type: {type(op)}."
+            )
diff --git a/pyqtorch/utils.py b/pyqtorch/utils.py
index 5bdda042..140e57fa 100644
--- a/pyqtorch/utils.py
+++ b/pyqtorch/utils.py
@@ -87,6 +87,7 @@ class DiffMode(StrEnum):
     """An implementation of "Efficient calculation of gradients
                                        in classical simulations of variational quantum algorithms",
                                        Jones & Gacon, 2020"""
+
     GPSR = "gpsr"
     """To be added."""
 
diff --git a/tests/test_circuit.py b/tests/test_circuit.py
index 689034cc..5faf415c 100644
--- a/tests/test_circuit.py
+++ b/tests/test_circuit.py
@@ -13,16 +13,21 @@
 from pyqtorch.noise import Noise
 from pyqtorch.parametric import Parametric
 from pyqtorch.primitive import Primitive
-from pyqtorch.utils import GRADCHECK_ATOL, DensityMatrix, product_state
+from pyqtorch.utils import (
+    GRADCHECK_ATOL,
+    DensityMatrix,
+    product_state,
+)
 
 
-def test_adjoint_diff() -> None:
+# TODO add GPSR when multigap is implemented for this test
+@pytest.mark.parametrize("n_qubits", [3, 4, 5])
+def test_adjoint_diff(n_qubits: int) -> None:
     rx = pyq.RX(0, param_name="theta_0")
     cry = pyq.CPHASE(0, 1, param_name="theta_1")
     rz = pyq.RZ(2, param_name="theta_2")
     cnot = pyq.CNOT(1, 2)
     ops = [rx, cry, rz, cnot]
-    n_qubits = 3
     circ = pyq.QuantumCircuit(n_qubits, ops)
     obs = pyq.QuantumCircuit(n_qubits, [pyq.Z(0)])
 
@@ -60,7 +65,7 @@ def test_adjoint_diff() -> None:
 
     assert len(grad_ad) == len(grad_adjoint)
     for i in range(len(grad_ad)):
-        assert torch.allclose(grad_ad[i], grad_adjoint[i])
+        assert torch.allclose(grad_ad[i], grad_adjoint[i], atol=GRADCHECK_ATOL)
 
 
 @pytest.mark.parametrize("dtype", [torch.complex64, torch.complex128])
@@ -155,7 +160,7 @@ def test_adjoint_duplicate_params() -> None:
     grad_adjoint = torch.autograd.grad(
         exp_adjoint, tuple(values.values()), torch.ones_like(exp_adjoint)
     )[0]
-    assert torch.allclose(grad_ad, grad_adjoint)
+    assert torch.allclose(grad_ad, grad_adjoint, atol=GRADCHECK_ATOL)
 
 
 @pytest.mark.parametrize("fn", [pyq.X, pyq.Z, pyq.Y])
@@ -310,3 +315,74 @@ def test_sample_run() -> None:
     assert torch.allclose(wf, product_state("1100"))
     assert torch.allclose(pyq.QuantumCircuit(4, [pyq.I(0)]).run("1100"), wf)
     assert "1100" in samples[0]
+
+
+# TODO delete this test when first one up is
+@pytest.mark.parametrize("n_qubits", [3, 4, 5])
+def test_all_diff(n_qubits: int) -> None:
+    rx = pyq.RX(0, param_name="theta_0")
+    rz = pyq.RZ(2, param_name="theta_1")
+    cnot = pyq.CNOT(1, 2)
+    ops = [rx, rz, cnot]
+    circ = pyq.QuantumCircuit(n_qubits, ops)
+    obs = pyq.QuantumCircuit(n_qubits, [pyq.Z(0)])
+
+    theta_0_value = torch.pi / 2
+    theta_1_value = torch.pi
+
+    state = pyq.zero_state(n_qubits)
+
+    theta_0 = torch.tensor([theta_0_value], requires_grad=True)
+
+    theta_1 = torch.tensor([theta_1_value], requires_grad=True)
+
+    values = {"theta_0": theta_0, "theta_1": theta_1}
+
+    exp_ad = expectation(circ, state, values, obs, DiffMode.AD)
+    exp_adjoint = expectation(circ, state, values, obs, DiffMode.ADJOINT)
+    exp_gpsr = expectation(circ, state, values, obs, DiffMode.GPSR)
+
+    grad_ad = torch.autograd.grad(
+        exp_ad, tuple(values.values()), torch.ones_like(exp_ad)
+    )
+
+    grad_adjoint = torch.autograd.grad(
+        exp_adjoint, tuple(values.values()), torch.ones_like(exp_adjoint)
+    )
+
+    grad_gpsr = torch.autograd.grad(
+        exp_gpsr, tuple(values.values()), torch.ones_like(exp_gpsr)
+    )
+
+    assert len(grad_ad) == len(grad_adjoint) == len(grad_gpsr)
+
+    for i in range(len(grad_ad)):
+        assert torch.allclose(
+            grad_ad[i], grad_adjoint[i], atol=GRADCHECK_ATOL
+        ) and torch.allclose(grad_ad[i], grad_gpsr[i], atol=GRADCHECK_ATOL)
+
+
+@pytest.mark.xfail(raises=ValueError)
+@pytest.mark.parametrize("gate_type", ["scale", "hamevo"])
+def test_compatibility_gpsr(gate_type: str) -> None:
+
+    if gate_type == "scale":
+        seq_gate = pyq.Sequence([pyq.X(0)])
+        scale = pyq.Scale(seq_gate, "theta_0")
+        ops = [scale]
+    else:
+        hamevo = pyq.HamiltonianEvolution(pyq.Sequence([pyq.X(0)]), "theta_0", (0,))
+
+        ops = [hamevo]
+
+    circ = pyq.QuantumCircuit(1, ops)
+    obs = pyq.QuantumCircuit(1, [pyq.Z(0)])
+    state = pyq.zero_state(1)
+
+    param_value = torch.pi / 2
+    values = {"theta_0": torch.tensor([param_value], requires_grad=True)}
+    exp_gpsr = expectation(circ, state, values, obs, DiffMode.GPSR)
+
+    grad_gpsr = torch.autograd.grad(
+        exp_gpsr, tuple(values.values()), torch.ones_like(exp_gpsr)
+    )