diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
new file mode 100644
index 00000000..4a68ca76
--- /dev/null
+++ b/.buildkite/pipeline.yml
@@ -0,0 +1,18 @@
+steps:
+  - label: "HierarchicalEOM_CUDAExt"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1"
+      - JuliaCI/julia-test#v1:
+          test_args: "--quickfail"
+          coverage: false # 1000x slowdown
+    agents:
+      queue: "juliagpu"
+      cuda: "*"
+    env:
+      GROUP: "HierarchicalEOM_CUDAExt"
+      JULIA_PKG_SERVER: "" # it often struggles with our large artifacts
+      # SECRET_CODECOV_TOKEN: "..."
+    timeout_in_minutes: 30
+    # Don't run Buildkite if the commit message includes the text [skip tests]
+    if: build.message !~ /\[skip tests\]/
\ No newline at end of file
diff --git a/Project.toml b/Project.toml
index 7e854328..68dd0e57 100644
--- a/Project.toml
+++ b/Project.toml
@@ -19,9 +19,11 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 SteadyStateDiffEq = "9672c7b4-1e72-59bd-8a11-6ac3964bc41f"
 
 [weakdeps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 QuantumOptics = "6e0679c1-51ea-5a7c-ac74-d61b76210b0c"
 
 [extensions]
+HierarchicalEOM_CUDAExt = "CUDA"
 HierarchicalEOM_QOExt = "QuantumOptics"
 
 [compat]
@@ -43,9 +45,10 @@ Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 LaTeXStrings = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
 Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306"
 Pardiso = "46dd5b70-b6fb-5a00-ae2d-e8fea33afaf2"
+Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 QuantumOptics = "6e0679c1-51ea-5a7c-ac74-d61b76210b0c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["BenchmarkTools", "Documenter", "LaTeXStrings", "Literate", "Pardiso", "Plots", "QuantumOptics", "Test"]
+test = ["BenchmarkTools", "Documenter", "LaTeXStrings", "Literate", "Pardiso", "Pkg", "Plots", "QuantumOptics", "Test"]
diff --git a/docs/make.jl b/docs/make.jl
index 733983da..cf2d1570 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -77,7 +77,8 @@ const PAGES = Any[
         "Examples" => EX_output_files,
         "Benchmark Solvers" => BM_output_files,
         "Extensions" => Any[
-            "QuantumOptics.jl" => "extensions/QuantumOptics.md"
+            "QuantumOptics.jl" => "extensions/QuantumOptics.md",
+            "CUDA.jl" => "extensions/CUDA.md"
         ]
     ],
     "Library API" => "libraryAPI.md"
diff --git a/docs/src/extensions/CUDA.md b/docs/src/extensions/CUDA.md
new file mode 100644
index 00000000..d9e2cb8e
--- /dev/null
+++ b/docs/src/extensions/CUDA.md
@@ -0,0 +1,95 @@
+# [Extension for CUDA.jl](@id doc-ext-CUDA)
+
+This is an extension to support GPU ([`CUDA.jl`](https://github.com/JuliaGPU/CUDA.jl)) acceleration for solving the [time evolution](@ref doc-Time-Evolution) and [spectrum](@ref doc-Spectrum). This improves the execution time and memory usage especially when the HEOMLS matrix is super large.
+
+!!! compat "Compat"
+    The described feature requires `Julia 1.9+`.
+
+The functions [`evolution`](@ref doc-Time-Evolution) (only supports ODE method with time-independent system Hamiltonian) and [`spectrum`](@ref doc-Spectrum) will automatically choose to solve on CPU or GPU depend on the type of the sparse matrix in `M::AbstractHEOMLSMatrix` objects (i.e., the type of the field `M.data`). 
+
+```julia
+typeof(M.data) <:   SparseMatrixCSC # solve on CPU
+typeof(M.data) <: CuSparseMatrixCSC # solve on GPU
+```
+
+Therefore, we wrapped several functions in `CUDA` and `CUDA.CUSPARSE` in order to return a new HEOMLS-matrix-type object with `M.data` is in the type of `CuSparseMatrix`, and also change the element type into `ComplexF32` and `Int32` (since GPU performs better in this type). The functions are listed as follows:
+- `cu(M::AbstractHEOMLSMatrix)` : Translate `M.data` into the type `CuSparseMatrixCSC{ComplexF32, Int32}`
+- `CuSparseMatrixCSC(M::AbstractHEOMLSMatrix)` : Translate `M.data` into the type `CuSparseMatrixCSC{ComplexF32, Int32}`
+
+### Demonstration
+
+The extension will be automatically loaded if user imports the package `CUDA.jl` :
+
+```julia
+using CUDA
+using HierarchicalEOM
+using LinearSolve # to change the solver for better GPU performance
+```
+
+### Setup
+
+Here, we demonstrate this extension by using the example of [the single-impurity Anderson model](@ref exp-SIAM). 
+
+```julia
+ϵ  = -5
+U  = 10
+Γ  = 2
+μ  = 0
+W  = 10
+kT = 0.5
+N  = 5
+tier  = 3
+
+tlist = 0f0:1f-1:10f0  # same as 0:0.1:10 but in the type of `Float32`
+ωlist = -10f0:1f0:10f0 # same as -10:1:10 but in the type of `Float32`
+
+σm = [0 1; 0  0]
+σz = [1 0; 0 -1]
+II = [1 0; 0  1]
+d_up = kron(     σm, II)
+d_dn = kron(-1 * σz, σm)
+ρ0   = kron([1 0; 0 0], [1 0; 0 0])
+Hsys = ϵ * (d_up' * d_up + d_dn' * d_dn) + U * (d_up' * d_up * d_dn' * d_dn)
+
+bath_up = Fermion_Lorentz_Pade(d_up, Γ, μ, W, kT, N)
+bath_dn = Fermion_Lorentz_Pade(d_dn, Γ, μ, W, kT, N)
+bath_list = [bath_up, bath_dn]
+
+# even HEOMLS matrix
+M_even_cpu = M_Fermion(Hsys, tier, bath_list; verbose=false)
+M_even_gpu = cu(M_even_cpu)
+
+# odd HEOMLS matrix
+M_odd_cpu  = M_Fermion(Hsys, tier, bath_list, ODD; verbose=false)
+M_odd_gpu  = cu(M_odd_cpu)
+
+# solve steady state with CPU
+ados_ss = SteadyState(M_even_cpu);
+```
+
+!!! note "Note"
+    This extension does not support for solving [`SteadyState`](@ref doc-Stationary-State) on GPU since it is not efficient and might get wrong solutions. If you really want to obtain the stationary state with GPU, you can repeatedly solve the [`evolution`](@ref doc-Time-Evolution) until you find it.
+
+### Solving time evolution with CPU
+
+```julia
+ados_list_cpu = evolution(M_even_cpu, ρ0, tlist; verbose=false)
+```
+
+### Solving time evolution with GPU
+
+```julia
+ados_list_gpu = evolution(M_even_gpu, ρ0, tlist; verbose=false)
+```
+
+### Solving Spectrum with CPU
+
+```julia
+dos_cpu = spectrum(M_odd_cpu, ados_ss, d_up, ωlist; verbose=false)
+```
+
+### Solving Spectrum with GPU
+
+```julia
+dos_gpu = spectrum(M_odd_gpu, ados_ss, d_up, ωlist; solver=KrylovJL_BICGSTAB(rtol=1f-10, atol=1f-12), verbose=false)
+```
\ No newline at end of file
diff --git a/docs/src/spectrum.md b/docs/src/spectrum.md
index d47edb64..ccee275a 100644
--- a/docs/src/spectrum.md
+++ b/docs/src/spectrum.md
@@ -13,6 +13,9 @@ The function [`spectrum`](@ref) will automatically detect the [parity](@ref doc-
 
 `HierarchicalEOM.jl` wraps some of the functions in [LinearSolve.jl](http://linearsolve.sciml.ai/stable/), which is a very rich numerical library for solving the linear problems and provides many solvers. It offers quite a few options for the user to tailor the solver to their specific needs. The default solver (and its corresponding settings) are chosen to suit commonly encountered problems and should work fine for most of the cases. If you require more specialized methods, such as the choice of algorithm, please refer to [benchmark for LinearSolve solvers](@ref benchmark-LS-solvers) and also the documentation of [LinearSolve.jl](http://linearsolve.sciml.ai/stable/).
 
+!!! compat "Extension for CUDA.jl"
+    `HierarchicalEOM.jl` provides an extension to support GPU ([`CUDA.jl`](https://github.com/JuliaGPU/CUDA.jl)) acceleration for solving the spectrum, but this feature requires `Julia 1.9+` and `HierarchicalEOM 1.1+`. See [here](@ref doc-ext-CUDA) for more details.
+
 ## [Power Spectral Density](@id doc-PSD)
 Start from the spectrum for bosonic systems (power spectral density) in the time-domain. We write the system two-time correlation function in terms of the propagator ``\hat{\mathcal{G}}(t)=\exp(\hat{\mathcal{M}} t)`` for ``t>0``. The power spectral density ``S(\omega)`` can be obtained as
 ```math
diff --git a/docs/src/time_evolution.md b/docs/src/time_evolution.md
index 45f19ad0..8c6f9a9a 100644
--- a/docs/src/time_evolution.md
+++ b/docs/src/time_evolution.md
@@ -53,6 +53,10 @@ end
 ## Ordinary Differential Equation Method
 The first method is implemented by solving the ordinary differential equation (ODE) as shown above. `HierarchicalEOM.jl` wraps some of the functions in [`DifferentialEquations.jl`](https://diffeq.sciml.ai/stable/), which is a very rich numerical library for solving the differential equations and provides many ODE solvers. It offers quite a few options for the user to tailor the solver to their specific needs. The default solver (and its corresponding settings) are chosen to suit commonly encountered problems and should work fine for most of the cases. If you require more specialized methods, such as the choice of algorithm, please refer to [benchmarks for DifferentialEquations solvers](@ref benchmark-ODE-solvers) and also the documentation of [`DifferentialEquations.jl`](https://diffeq.sciml.ai/stable/).
 
+!!! compat "Extension for CUDA.jl"
+    `HierarchicalEOM.jl` provides an extension to support GPU ([`CUDA.jl`](https://github.com/JuliaGPU/CUDA.jl)) acceleration for solving the time evolution (only for ODE method with time-independent system Hamiltonian), but this feature requires `Julia 1.9+` and `HierarchicalEOM 1.1+`. See [here](@ref doc-ext-CUDA) for more details.
+
+
 ### Given the initial state as Density Operator (`AbstractMatrix` type)
 
 See the docstring of this method:  
diff --git a/ext/HierarchicalEOM_CUDAExt.jl b/ext/HierarchicalEOM_CUDAExt.jl
new file mode 100644
index 00000000..f428a247
--- /dev/null
+++ b/ext/HierarchicalEOM_CUDAExt.jl
@@ -0,0 +1,73 @@
+module HierarchicalEOM_CUDAExt
+
+using HierarchicalEOM
+import HierarchicalEOM.HeomAPI:  _HandleVectorType, _HandleSteadyStateMatrix
+import HierarchicalEOM.Spectrum: _HandleIdentityType
+import CUDA
+import CUDA: cu, CuArray
+import CUDA.CUSPARSE: CuSparseMatrixCSC
+import SparseArrays: sparse, SparseVector, SparseMatrixCSC
+
+@doc raw"""
+    cu(M::AbstractHEOMLSMatrix)
+Return a new HEOMLS-matrix-type object with `M.data` is in the type of `CuSparseMatrixCSC{ComplexF32, Int32}` for gpu calculations.
+"""
+cu(M::AbstractHEOMLSMatrix) = CuSparseMatrixCSC(M)
+
+@doc raw"""
+    CuSparseMatrixCSC(M::AbstractHEOMLSMatrix)
+Return a new HEOMLS-matrix-type object with `M.data` is in the type of `CuSparseMatrixCSC{ComplexF32, Int32}` for gpu calculations.
+"""
+function CuSparseMatrixCSC(M::T) where T <: AbstractHEOMLSMatrix
+    A = M.data
+    if typeof(A) <: CuSparseMatrixCSC
+        return M
+    else
+        colptr = CuArray{Int32}(A.colptr)
+        rowval = CuArray{Int32}(A.rowval)
+        nzval  = CuArray{ComplexF32}(A.nzval)
+        A_gpu  = CuSparseMatrixCSC{ComplexF32, Int32}(colptr, rowval, nzval, size(A))
+        if T <: M_S
+            return M_S(A_gpu, M.tier, M.dim, M.N, M.sup_dim, M.parity)
+        elseif T <: M_Boson
+            return M_Boson(A_gpu, M.tier, M.dim, M.N, M.sup_dim, M.parity, M.bath, M.hierarchy)
+        elseif T <: M_Fermion
+            return M_Fermion(A_gpu, M.tier, M.dim, M.N, M.sup_dim, M.parity, M.bath, M.hierarchy)
+        else
+            return M_Boson_Fermion(A_gpu, M.Btier, M.Ftier, M.dim, M.N, M.sup_dim, M.parity, M.Bbath, M.Fbath, M.hierarchy)
+        end
+    end
+end
+
+# for changing a `CuArray` back to `ADOs`
+function _HandleVectorType(V::T, cp::Bool=false) where T <: CuArray
+    return Vector{ComplexF64}(V)
+end
+
+# for changing the type of `ADOs` to match the type of HEOMLS matrix 
+function _HandleVectorType(MatrixType::Type{TM}, V::SparseVector) where TM <: CuSparseMatrixCSC
+    TE = eltype(MatrixType)
+    return CuArray{TE}(V)
+end
+
+##### We first remove this part because there are errors when solveing steady states using GPU
+# function _HandleSteadyStateMatrix(MatrixType::Type{TM}, M::AbstractHEOMLSMatrix, S::Int) where TM <: CuSparseMatrixCSC
+#     colptr = Vector{Int32}(M.data.colPtr)
+#     rowval = Vector{Int32}(M.data.rowVal)
+#     nzval  = Vector{ComplexF32}(M.data.nzVal)
+#     A = SparseMatrixCSC{ComplexF32, Int32}(S, S, colptr, rowval, nzval)
+#     A[1,1:S] .= 0f0
+#     
+#     # sparse(row_idx, col_idx, values, row_dims, col_dims)
+#     A += sparse(ones(Int32, M.dim), [Int32((n - 1) * (M.dim + 1) + 1) for n in 1:(M.dim)], ones(ComplexF32, M.dim), S, S)
+#     return CuSparseMatrixCSC(A)
+# end
+
+function _HandleIdentityType(MatrixType::Type{TM}, S::Int) where TM <: CuSparseMatrixCSC
+    colptr = CuArray{Int32}(Int32(1):Int32(S+1))
+    rowval = CuArray{Int32}(Int32(1):Int32(S))
+    nzval  = CUDA.ones(ComplexF32, S)
+    return CuSparseMatrixCSC{ComplexF32, Int32}(colptr, rowval, nzval, (S, S))
+end
+
+end
\ No newline at end of file
diff --git a/src/ADOs.jl b/src/ADOs.jl
index 256d995d..64a8cbaa 100644
--- a/src/ADOs.jl
+++ b/src/ADOs.jl
@@ -106,7 +106,7 @@ iterate(A::ADOs, ::Nothing) = nothing
 
 function show(io::IO, A::ADOs)
     print(io, 
-        "Auxiliary Density Operators with $(A.parity), (system) dim = $(A.dim), number N = $(A.N)\n"
+        "$(A.N) Auxiliary Density Operators with $(A.parity) and (system) dim = $(A.dim)\n"
     )
 end
 function show(io::IO, m::MIME"text/plain", A::ADOs) show(io, A) end
@@ -206,4 +206,19 @@ function Expect(op, ados_list::Vector{ADOs}; take_real=true)
     else
         return exp_val
     end
+end
+
+# for changing a `Vector` back to `ADOs`
+function _HandleVectorType(V::T, cp::Bool=true) where T <: Vector
+    if cp
+        return Vector{ComplexF64}(V)
+    else
+        return V
+    end
+end
+
+# for changing the type of `ADOs` to match the type of HEOMLS matrix 
+function _HandleVectorType(MatrixType::Type{TM}, V::SparseVector) where TM <: SparseMatrixCSC
+    TE = eltype(MatrixType)
+    return Vector{TE}(V)
 end
\ No newline at end of file
diff --git a/src/HierarchicalEOM.jl b/src/HierarchicalEOM.jl
index a7c04dbb..5d59ee5e 100644
--- a/src/HierarchicalEOM.jl
+++ b/src/HierarchicalEOM.jl
@@ -80,9 +80,9 @@ module HierarchicalEOM
 
     # sub-module Spectrum for HierarchicalEOM
     module Spectrum
-        import ..HeomAPI: AbstractHEOMLSMatrix, OddParity, ADOs, spre
+        import ..HeomAPI: AbstractHEOMLSMatrix, OddParity, ADOs, spre, _HandleVectorType
         import LinearAlgebra: I, kron
-        import SparseArrays: sparse, sparsevec
+        import SparseArrays: sparse, sparsevec, SparseMatrixCSC
         import LinearSolve: LinearProblem, init, solve!, UMFPACKFactorization
         import ProgressMeter: Progress, next!        
         import ..HeomBase: PROGBAR_OPTIONS, HandleMatrixType
diff --git a/src/Spectrum.jl b/src/Spectrum.jl
index 4ad92f37..9cf83c11 100644
--- a/src/Spectrum.jl
+++ b/src/Spectrum.jl
@@ -107,16 +107,16 @@ end
     end
 
     Size = size(M, 1)
-    I_total = sparse(I, Size, Size)
-    I_heom  = sparse(I, M.N, M.N)
+    I_heom  = sparse(one(ComplexF64) * I, M.N, M.N)
+    I_total = _HandleIdentityType(typeof(M.data), Size)
 
     # equal to : transpose(sparse(vec(system_identity_matrix)))
-    I_dual_vec = transpose(sparsevec([1 + n * (M.dim + 1) for n in 0:(M.dim - 1)], ones(M.dim), M.sup_dim))
+    I_dual_vec = transpose(sparsevec([1 + n * (M.dim + 1) for n in 0:(M.dim - 1)], ones(ComplexF64, M.dim), M.sup_dim))
 
     # operator for calculating two-time correlation functions in frequency domain
     a_normal = kron(I_heom, spre(op))
     a_dagger = kron(I_heom, spre(op'))
-    local X::Vector{ComplexF64} = a_normal * ados_vec
+    X = _HandleVectorType(typeof(M.data), a_normal * ados_vec)
 
     Length = length(ω_list)
     Sω = Vector{Float64}(undef, Length)
@@ -137,7 +137,7 @@ end
         end
 
         # trace over the Hilbert space of system (expectation value)
-        Sω[i] = -1 * real(I_dual_vec * (a_dagger * sol.u)[1:(M.sup_dim)])
+        Sω[i] = -1 * real(I_dual_vec * (a_dagger * _HandleVectorType(sol.u, false))[1:(M.sup_dim)])
 
         if SAVE
             open(FILENAME, "a") do file
@@ -176,17 +176,17 @@ end
     end
 
     Size = size(M, 1)
-    I_total = sparse(I, Size, Size)
-    I_heom  = sparse(I, M.N, M.N)
+    I_heom  = sparse(one(ComplexF64) * I, M.N, M.N)
+    I_total = _HandleIdentityType(typeof(M.data), Size)
 
     # equal to : transpose(sparse(vec(system_identity_matrix)))
-    I_dual_vec = transpose(sparsevec([1 + n * (M.dim + 1) for n in 0:(M.dim - 1)], ones(M.dim), M.sup_dim))
+    I_dual_vec = transpose(sparsevec([1 + n * (M.dim + 1) for n in 0:(M.dim - 1)], ones(ComplexF64, M.dim), M.sup_dim))
 
     # operators for calculating two-time correlation functions in frequency domain
     d_normal = kron(I_heom, spre(op))
     d_dagger = kron(I_heom, spre(op'))
-    local X_m::Vector{ComplexF64} = d_normal * ados_vec
-    local X_p::Vector{ComplexF64} = d_dagger * ados_vec
+    X_m = _HandleVectorType(typeof(M.data), d_normal * ados_vec)
+    X_p = _HandleVectorType(typeof(M.data), d_dagger * ados_vec)
 
     Length = length(ω_list)
     Aω = Vector{Float64}(undef, Length)
@@ -211,8 +211,8 @@ end
             cache_p.A = M.data + Iω
             sol_p = solve!(cache_p)
         end
-        Cω_m = d_dagger * sol_m.u
-        Cω_p = d_normal * sol_p.u
+        Cω_m = d_dagger * _HandleVectorType(sol_m.u, false)
+        Cω_p = d_normal * _HandleVectorType(sol_p.u, false)
         
         # trace over the Hilbert space of system (expectation value)
         Aω[i] = -1 * (
@@ -235,4 +235,9 @@ end
     end
 
     return Aω
+end
+
+function _HandleIdentityType(MatrixType::Type{TM}, S::Int) where TM <: SparseMatrixCSC
+    ElType = eltype(MatrixType)
+    return sparse(one(ElType) * I, S, S)
 end
\ No newline at end of file
diff --git a/src/SteadyState.jl b/src/SteadyState.jl
index e0cea1a7..29d2821c 100644
--- a/src/SteadyState.jl
+++ b/src/SteadyState.jl
@@ -19,13 +19,8 @@ For more details about solvers and extra options, please refer to [`LinearSolve.
         error("The parity of M should be \"EVEN\".")
     end    
 
-    A = copy(M.data)
-    S = size(A, 1)
-    A[1,1:S] .= 0
-    
-    # sparse(row_idx, col_idx, values, row_dims, col_dims)
-    A += sparse(fill(1, M.dim), [(n - 1) * (M.dim + 1) + 1 for n in 1:(M.dim)], fill(1, M.dim), S, S)
-    
+    S = size(M, 1)
+    A = _HandleSteadyStateMatrix(typeof(M.data), M, S)
     b = sparsevec([1], [1. + 0.0im], S)
     
     # solving x where A * x = b
@@ -33,14 +28,14 @@ For more details about solvers and extra options, please refer to [`LinearSolve.
         print("Solving steady state for auxiliary density operators...")
         flush(stdout)
     end
-    cache = init(LinearProblem(A, Vector(b)), solver, SOLVEROptions...)
+    cache = init(LinearProblem(A, _HandleVectorType(typeof(M.data), b)), solver, SOLVEROptions...)
     sol = solve!(cache)
     if verbose
         println("[DONE]")
         flush(stdout)
     end
     
-    return ADOs(sol.u, M.dim, M.N, M.parity)
+    return ADOs(_HandleVectorType(sol.u, false), M.dim, M.N, M.parity)
 end
 
 @doc raw"""
@@ -145,7 +140,9 @@ For more details about solvers and extra options, please refer to [`Differential
 
     # problem: dρ(t)/dt = L * ρ(t)
     L = MatrixOperator(M.data)
-    prob = ODEProblem(L, Vector(ados.data), (0, Inf))
+    ElType = eltype(M)
+    tspan  = Tuple{ElType, ElType}((0, Inf))
+    prob   = ODEProblem(L, _HandleVectorType(typeof(M.data), ados.data), tspan)
 
     # solving steady state of the ODE problem
     if verbose
@@ -165,5 +162,15 @@ For more details about solvers and extra options, please refer to [`Differential
         flush(stdout)
     end
 
-    return ADOs(sol.u, M.dim, M.N, M.parity)
+    return ADOs(_HandleVectorType(sol.u, false), M.dim, M.N, M.parity)
+end
+
+function _HandleSteadyStateMatrix(MatrixType::Type{TM}, M::AbstractHEOMLSMatrix, S::Int) where TM <: SparseMatrixCSC
+    ElType = eltype(M)
+    A = copy(M.data)
+    A[1,1:S] .= 0
+    
+    # sparse(row_idx, col_idx, values, row_dims, col_dims)
+    A += sparse(ones(ElType, M.dim), [(n - 1) * (M.dim + 1) + 1 for n in 1:(M.dim)], ones(ElType, M.dim), S, S)
+    return A
 end
\ No newline at end of file
diff --git a/src/evolution.jl b/src/evolution.jl
index 19a31a2e..335800de 100644
--- a/src/evolution.jl
+++ b/src/evolution.jl
@@ -268,7 +268,7 @@ For more details about solvers and extra options, please refer to [`Differential
 
     # problem: dρ/dt = L * ρ(0)
     L = MatrixOperator(M.data)
-    prob = ODEProblem(L, Vector(ados.data), (tlist[1], tlist[end]))
+    prob  = ODEProblem(L, _HandleVectorType(typeof(M.data), ados.data), (tlist[1], tlist[end]))
 
     # setup integrator
     integrator = init(
@@ -294,7 +294,7 @@ For more details about solvers and extra options, please refer to [`Differential
         step!(integrator, dt, true)
         
         # save the ADOs
-        ados = ADOs(copy(integrator.u), M.dim, M.N, M.parity)
+        ados = ADOs(_HandleVectorType(integrator.u), M.dim, M.N, M.parity)
         push!(ADOs_list, ados)
         
         if SAVE
@@ -446,7 +446,7 @@ For more details about solvers and extra options, please refer to [`Differential
     
     # problem: dρ/dt = L(t) * ρ(0)
     ## M.dim will check whether the returned time-dependent Hamiltonian has the correct dimension
-    prob = ODEProblem(L, Vector(ados.data), (tlist[1], tlist[end]), (M, H, param))
+    prob = ODEProblem(L, _HandleVectorType(typeof(M.data), ados.data), (tlist[1], tlist[end]), (M, H, param))
 
     # setup integrator
     integrator = init(
@@ -472,7 +472,7 @@ For more details about solvers and extra options, please refer to [`Differential
         step!(integrator, dt, true)
         
         # save the ADOs
-        ados = ADOs(copy(integrator.u), M.dim, M.N, M.parity)
+        ados = ADOs(_HandleVectorType(integrator.u), M.dim, M.N, M.parity)
         push!(ADOs_list, ados)
         
         if SAVE
diff --git a/src/heom_matrices/M_Boson.jl b/src/heom_matrices/M_Boson.jl
index 5a8bb5ca..bbef6141 100644
--- a/src/heom_matrices/M_Boson.jl
+++ b/src/heom_matrices/M_Boson.jl
@@ -56,7 +56,7 @@ Note that the parity only need to be set as `ODD` when the system contains fermi
     _Hsys = HandleMatrixType(Hsys, 0, "Hsys (system Hamiltonian)")
     Nsys    = size(_Hsys, 1)
     sup_dim = Nsys ^ 2
-    I_sup   = sparse(I, sup_dim, sup_dim)
+    I_sup   = sparse(one(ComplexF64) * I, sup_dim, sup_dim)
 
     # the Liouvillian operator for free Hamiltonian term
     Lsys = -1im * (spre(_Hsys) - spost(_Hsys))
diff --git a/src/heom_matrices/M_Boson_Fermion.jl b/src/heom_matrices/M_Boson_Fermion.jl
index 12650d57..5232d1d9 100644
--- a/src/heom_matrices/M_Boson_Fermion.jl
+++ b/src/heom_matrices/M_Boson_Fermion.jl
@@ -73,7 +73,7 @@ Note that the parity only need to be set as `ODD` when the system contains fermi
     _Hsys = HandleMatrixType(Hsys, 0, "Hsys (system Hamiltonian)")
     Nsys    = size(_Hsys, 1)
     sup_dim = Nsys ^ 2
-    I_sup   = sparse(I, sup_dim, sup_dim)
+    I_sup   = sparse(one(ComplexF64) * I, sup_dim, sup_dim)
 
     # the Liouvillian operator for free Hamiltonian term
     Lsys = -1im * (spre(_Hsys) - spost(_Hsys))
diff --git a/src/heom_matrices/M_Fermion.jl b/src/heom_matrices/M_Fermion.jl
index 553443e0..a70246dc 100644
--- a/src/heom_matrices/M_Fermion.jl
+++ b/src/heom_matrices/M_Fermion.jl
@@ -54,7 +54,7 @@ Generate the fermion-type HEOM Liouvillian superoperator matrix
     _Hsys = HandleMatrixType(Hsys, 0, "Hsys (system Hamiltonian)")
     Nsys    = size(_Hsys, 1)
     sup_dim = Nsys ^ 2
-    I_sup   = sparse(I, sup_dim, sup_dim)
+    I_sup   = sparse(one(ComplexF64) * I, sup_dim, sup_dim)
 
     # the Liouvillian operator for free Hamiltonian term
     Lsys = -1im * (spre(_Hsys) - spost(_Hsys))
diff --git a/src/heom_matrices/heom_matrix_base.jl b/src/heom_matrices/heom_matrix_base.jl
index 178a6821..bec30bdd 100644
--- a/src/heom_matrices/heom_matrix_base.jl
+++ b/src/heom_matrices/heom_matrix_base.jl
@@ -68,7 +68,8 @@ function show(io::IO, M::AbstractHEOMLSMatrix)
     end
 
     print(io, 
-        type, " type HEOMLS matrix with system dim = $(M.dim) and parity = $(M.parity)\n",
+        type, " type HEOMLS matrix acting on $(M.parity) ADOs\n",
+        "system dim = $(M.dim)\n",
         "number of ADOs N = $(M.N)\n",
         "data =\n"
     )
@@ -131,11 +132,11 @@ function addBosonDissipator(M::T, jumpOP::Vector=[]) where T <: AbstractHEOMLSMa
         if T <: M_S
             return M_S(M.data + L, M.tier, M.dim, M.N, M.sup_dim, M.parity)
         elseif T <: M_Boson
-            return M_Boson(M.data + kron(sparse(I, M.N, M.N), L), M.tier, M.dim, M.N, M.sup_dim, M.parity, M.bath, M.hierarchy)
+            return M_Boson(M.data + kron(sparse(one(ComplexF64) * I, M.N, M.N), L), M.tier, M.dim, M.N, M.sup_dim, M.parity, M.bath, M.hierarchy)
         elseif T <: M_Fermion
-            return M_Fermion(M.data + kron(sparse(I, M.N, M.N), L), M.tier, M.dim, M.N, M.sup_dim, M.parity, M.bath, M.hierarchy)
+            return M_Fermion(M.data + kron(sparse(one(ComplexF64) * I, M.N, M.N), L), M.tier, M.dim, M.N, M.sup_dim, M.parity, M.bath, M.hierarchy)
         else
-            return M_Boson_Fermion(M.data + kron(sparse(I, M.N, M.N), L), M.Btier, M.Ftier, M.dim, M.N, M.sup_dim, M.parity, M.Bbath, M.Fbath, M.hierarchy)
+            return M_Boson_Fermion(M.data + kron(sparse(one(ComplexF64) * I, M.N, M.N), L), M.Btier, M.Ftier, M.dim, M.N, M.sup_dim, M.parity, M.Bbath, M.Fbath, M.hierarchy)
         end
     end
 end
@@ -175,11 +176,11 @@ function addFermionDissipator(M::T, jumpOP::Vector=[]) where T <: AbstractHEOMLS
         if T <: M_S
             return M_S(M.data + L, M.tier, M.dim, M.N, M.sup_dim, M.parity)
         elseif T <: M_Boson
-            return M_Boson(M.data + kron(sparse(I, M.N, M.N), L), M.tier, M.dim, M.N, M.sup_dim, M.parity, M.bath, M.hierarchy)
+            return M_Boson(M.data + kron(sparse(one(ComplexF64) * I, M.N, M.N), L), M.tier, M.dim, M.N, M.sup_dim, M.parity, M.bath, M.hierarchy)
         elseif T <: M_Fermion
-            return M_Fermion(M.data + kron(sparse(I, M.N, M.N), L), M.tier, M.dim, M.N, M.sup_dim, M.parity, M.bath, M.hierarchy)
+            return M_Fermion(M.data + kron(sparse(one(ComplexF64) * I, M.N, M.N), L), M.tier, M.dim, M.N, M.sup_dim, M.parity, M.bath, M.hierarchy)
         else
-            return M_Boson_Fermion(M.data + kron(sparse(I, M.N, M.N), L), M.Btier, M.Ftier, M.dim, M.N, M.sup_dim, M.parity, M.Bbath, M.Fbath, M.hierarchy)
+            return M_Boson_Fermion(M.data + kron(sparse(one(ComplexF64) * I, M.N, M.N), L), M.Btier, M.Ftier, M.dim, M.N, M.sup_dim, M.parity, M.Bbath, M.Fbath, M.hierarchy)
         end
     end
 end
@@ -230,11 +231,11 @@ function addTerminator(M::Mtype, Bath::Union{BosonBath, FermionBath}) where Mtyp
         )
 
         if Mtype <: M_Boson
-            return M_Boson(M.data + kron(sparse(I, M.N, M.N), L), M.tier, M.dim, M.N, M.sup_dim, M.parity, M.bath, M.hierarchy)
+            return M_Boson(M.data + kron(sparse(one(ComplexF64) * I, M.N, M.N), L), M.tier, M.dim, M.N, M.sup_dim, M.parity, M.bath, M.hierarchy)
         elseif Mtype <: M_Fermion
-            return M_Fermion(M.data + kron(sparse(I, M.N, M.N), L), M.tier, M.dim, M.N, M.sup_dim, M.parity, M.bath, M.hierarchy)
+            return M_Fermion(M.data + kron(sparse(one(ComplexF64) * I, M.N, M.N), L), M.tier, M.dim, M.N, M.sup_dim, M.parity, M.bath, M.hierarchy)
         else
-            return M_Boson_Fermion(M.data + kron(sparse(I, M.N, M.N), L), M.Btier, M.Ftier, M.dim, M.N, M.sup_dim, M.parity, M.Bbath, M.Fbath, M.hierarchy)
+            return M_Boson_Fermion(M.data + kron(sparse(one(ComplexF64) * I, M.N, M.N), L), M.Btier, M.Ftier, M.dim, M.N, M.sup_dim, M.parity, M.Bbath, M.Fbath, M.hierarchy)
         end
     end
 end
diff --git a/test/CUDA/CUDAExt.jl b/test/CUDA/CUDAExt.jl
new file mode 100644
index 00000000..41acd0f0
--- /dev/null
+++ b/test/CUDA/CUDAExt.jl
@@ -0,0 +1,99 @@
+using HierarchicalEOM
+using CUDA
+using LinearSolve
+
+# re-define the bath (make the matrix smaller)
+λ  = 0.01
+W  = 0.5
+kT = 0.5
+μ  = 0
+N  = 3
+tier = 3
+
+# System Hamiltonian
+Hsys = [
+    0 0;
+    0 0
+]
+
+# system-bath coupling operator
+Qb = [
+    0 1; 
+    1 0
+]
+Qf = [
+    0 1; 
+    0 0
+]
+
+# initial state
+ρ0 = [
+    0 0; 
+    0 1
+]
+
+Bbath = Boson_DrudeLorentz_Pade(Qb, λ, W, kT, N)
+Fbath = Fermion_Lorentz_Pade(Qf, λ, μ, W, kT, N)
+
+# Solving time Evolution
+## Schrodinger HEOMLS
+L_cpu = M_S(Hsys; verbose=false)
+L_gpu = cu(L_cpu)
+ados_cpu = evolution(L_cpu, ρ0, [0f0, 10f0]; verbose=false)
+ados_gpu = evolution(L_gpu, ρ0, [0f0, 10f0]; verbose=false)
+@test _is_Matrix_approx(getRho(ados_cpu[end]), getRho(ados_cpu[end]))
+
+## Boson HEOMLS
+L_cpu = M_Boson(Hsys, tier, Bbath; verbose=false)
+L_gpu = cu(L_cpu)
+ados_cpu = evolution(L_cpu, ρ0, [0f0, 10f0]; verbose=false)
+ados_gpu = evolution(L_gpu, ρ0, [0f0, 10f0]; verbose=false)
+@test _is_Matrix_approx(getRho(ados_cpu[end]), getRho(ados_cpu[end]))
+
+## Boson Fermion HEOMLS
+L_cpu = M_Fermion(Hsys, tier, Fbath; verbose=false)
+L_gpu = cu(L_cpu)
+ados_cpu = evolution(L_cpu, ρ0, [0f0, 10f0]; verbose=false)
+ados_gpu = evolution(L_gpu, ρ0, [0f0, 10f0]; verbose=false)
+@test _is_Matrix_approx(getRho(ados_cpu[end]), getRho(ados_cpu[end]))
+
+## Boson Fermion HEOMLS
+L_cpu = M_Boson_Fermion(Hsys, tier, tier, Bbath, Fbath; verbose=false)
+L_gpu = cu(L_cpu)
+tlist = 0f0:1f0:10f0
+ados_cpu = evolution(L_cpu, ρ0, tlist; verbose=false)
+ados_gpu = evolution(L_gpu, ρ0, tlist; verbose=false)
+for i in 1:length(tlist)
+    @test _is_Matrix_approx(getRho(ados_cpu[i]), getRho(ados_cpu[i]))
+end
+
+# SIAM
+ϵ = -5
+U = 10
+σm = [0 1; 0  0] ## σ-
+σz = [1 0; 0 -1] ## σz
+II = [1 0; 0  1] ## identity matrix
+d_up = kron(     σm, II)
+d_dn = kron(-1 * σz, σm)
+Hsys = ϵ * (d_up' * d_up + d_dn' * d_dn) + U * (d_up' * d_up * d_dn' * d_dn)
+Γ  = 2
+μ  = 0
+W  = 10
+kT = 0.5
+N  = 5
+tier = 3
+bath_up = Fermion_Lorentz_Pade(d_up, Γ, μ, W, kT, N)
+bath_dn = Fermion_Lorentz_Pade(d_dn, Γ, μ, W, kT, N)
+bath_list = [bath_up, bath_dn]
+
+## solve density of states
+ωlist = -10:1:10
+L_cpu     = M_Fermion(Hsys, tier, bath_list; verbose=false)
+L_odd_cpu = M_Fermion(Hsys, tier, bath_list, ODD; verbose=false)
+L_odd_gpu = cu(L_odd_cpu)
+ados_cpu  = SteadyState(L_cpu; verbose=false)
+dos_cpu   = spectrum(L_odd_cpu, ados_cpu, d_up, ωlist; verbose=false)
+dos_gpu   = spectrum(L_odd_gpu, ados_cpu, d_up, ωlist; solver=KrylovJL_BICGSTAB(rtol=1f-10, atol=1f-12), verbose=false)
+for (i, ω) in enumerate(ωlist)
+    @test dos_cpu[i] ≈ dos_gpu[i]  atol = 1e-6
+end
\ No newline at end of file
diff --git a/test/CUDA/Project.toml b/test/CUDA/Project.toml
new file mode 100644
index 00000000..6674b9ae
--- /dev/null
+++ b/test/CUDA/Project.toml
@@ -0,0 +1,4 @@
+[deps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+HierarchicalEOM = "a62dbcb7-80f5-4d31-9a88-8b19fd92b128"
+LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae"
\ No newline at end of file
diff --git a/test/runtests.jl b/test/runtests.jl
index 22b7cc9b..063e001b 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,6 @@
+using Test, Pkg
 using QuantumOptics
 using HierarchicalEOM
-using Test
 using SparseArrays
 using LinearAlgebra
 
@@ -28,6 +28,15 @@ if GROUP == "All" || GROUP == "Core"
     include("phys_analysis.jl")
 end
 
+if GROUP == "HierarchicalEOM_CUDAExt"
+    Pkg.activate("CUDA")
+    Pkg.develop(PackageSpec(path = dirname(@__DIR__)))
+    Pkg.instantiate()
+    @testset "CUDA Extension" begin
+        include("CUDA/CUDAExt.jl")
+    end
+end
+
 if (GROUP == "All" || GROUP == "HierarchicalEOM_QOExt") && HAS_EXTENSIONS
     @testset "QuantumOptics Extension" begin
         include("QOExt.jl")