From 7cf88f1a5243541d9d86fae42c34b518a0cb02ed Mon Sep 17 00:00:00 2001
From: Nick Papior <nickpapior@gmail.com>
Date: Wed, 20 Dec 2023 14:05:41 +0100
Subject: [PATCH] smallish refactor of ufunc handling

This should fix lots of corner cases of ufuncs
handling stuff unexpectedly.

- added _rows_and_cols which returns a DOK format
  of the rows and columns
- added documentation about creating a sparse matrix
  from data, indices and pointers.
- much faster creation of diagonal matrices
  And handles correctly offsets, out-of-bounds
  will be removed from the SparseCSR
- added SparseCSR.toarray
- added SparseCSR.__array__ to allow numpy handling
- fixed SparseCSR dtype handling in ufuncs.
  The ufuncs result-type handling will be used.
  In some corner cases could the casting be wrong, now
  the dtype is explicitly created from the ufunc.resolve_dtypes
- added more SparseCSR tests which should cover cases
  of non-sorted, sorted, and non-finalized matrices.
- major clean-up of SparseCSR tests, removed setup and instantiated
  specific matrix fixtures.
- lots of minor bug-fixes related to in-place operations

Fixes #661

Signed-off-by: Nick Papior <nickpapior@gmail.com>
---
 CHANGELOG.md                            |   8 +
 src/sisl/_help.py                       |  26 +-
 src/sisl/_sparse.pyx                    |   3 +-
 src/sisl/geometry.py                    |   2 +-
 src/sisl/io/siesta/_help.py             |  23 +-
 src/sisl/io/siesta/binaries.py          |  27 +-
 src/sisl/io/siesta/tests/test_siesta.py |   6 +-
 src/sisl/sparse.py                      | 241 +++++++-----
 src/sisl/tests/test_help.py             |   2 +
 src/sisl/tests/test_sparse.py           | 497 ++++++++++++++----------
 10 files changed, 506 insertions(+), 329 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0beeed6752..59307e9ac3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ we hit release version 1.0.0.
 ## [0.14.4] - YYYY-MM-DD
 
 ### Added
+- `SparseCSR.toarray` to comply with array handling (equivalent to `todense`)
 - enabled `Grid.to|new` with the most basic stuff
   str|Path|Grid|pyamg
 - `Shape.translate`, to easily translate entire shape constructs, #655
@@ -18,6 +19,11 @@ we hit release version 1.0.0.
 - added `offset` argument in `Geometry.add_vacuum` to enable shifting atomic coordinates
 
 ### Fixed
+- `SparseCSR` ufunc handling, in some corner cases could the dtype casting do things
+  wrongly.
+- fixed corner cases where the `SparseCSR.diags(offsets=)` would add elements
+  in non-existing elements
+- some cases of writing orthogonal matrices to TSHS/nc file formats #661
 - `BDOS` from TBtrans calculations now returns the full DOS of all (Bloch-expanded)
   atoms
 - `Lattice` objects now issues a warning when created with 0-length vectors
@@ -30,6 +36,8 @@ we hit release version 1.0.0.
 
 ### Changed
 - `vacuum` is now an optional parameter for all ribbon structures
+- enabled `array_fill_repeat` with custom axis, to tile along specific
+  dimensions
 
 
 ## [0.14.3] - 2023-11-07
diff --git a/src/sisl/_help.py b/src/sisl/_help.py
index 615ad40043..61361fcd8a 100644
--- a/src/sisl/_help.py
+++ b/src/sisl/_help.py
@@ -36,32 +36,32 @@
     from xml.etree.ElementTree import parse as xml_parse
 
 
-def array_fill_repeat(array, size, cls=None):
+def array_fill_repeat(array, size, axis=-1, cls=None):
     """
     This repeats an array along the zeroth axis until it
     has size `size`. Note that initial size of `array` has
     to be an integer part of `size`.
     """
+    array = np.asarray(array, dtype=cls)
     try:
-        reps = size // len(array)
+        reps = size // array.shape[axis]
     except Exception:
-        array = [array]
-        reps = size // len(array)
-    if size % len(array) != 0:
+        # likely a scalar
+        array = np.expand_dims(array, axis=0)
+        reps = size // array.shape[axis]
+    if size % array.shape[axis] != 0:
         # We do not have it correctly formatted (either an integer
         # repeatable part, full, or a single)
         raise ValueError(
             "Repetition of or array is not divisible with actual length. "
             "Hence we cannot create a repeated size."
         )
-    if cls is None:
-        if reps > 1:
-            return np.tile(array, reps)
-        return array
-    else:
-        if reps > 1:
-            return np.tile(np.array(array, dtype=cls), reps)
-        return np.array(array, dtype=cls)
+
+    if reps > 1:
+        tile_reps = list(1 for _ in array.shape)
+        tile_reps[axis] = reps
+        return np.tile(array, tile_reps)
+    return array
 
 
 @set_module("sisl")
diff --git a/src/sisl/_sparse.pyx b/src/sisl/_sparse.pyx
index 70f73d1ee9..efc962430d 100644
--- a/src/sisl/_sparse.pyx
+++ b/src/sisl/_sparse.pyx
@@ -251,12 +251,13 @@ def _sparse_dense(shape,
 
     cdef Py_ssize_t nr = ncol.shape[0]
     cdef V = np.zeros(shape, dtype=dtype)
+    cdef VV = V[:, ::1]
     cdef Py_ssize_t r, ind, ix, s2
 
     s2 = shape[2]
     for r in range(nr):
         for ind in range(ptr[r], ptr[r] + ncol[r]):
             for ix in range(s2):
-                V[r, col[ind], ix] += D[ind, ix]
+                VV[r, col[ind], ix] += D[ind, ix]
 
     return V
diff --git a/src/sisl/geometry.py b/src/sisl/geometry.py
index 927ecd8375..829c8e0416 100644
--- a/src/sisl/geometry.py
+++ b/src/sisl/geometry.py
@@ -373,7 +373,7 @@ def _sanitize_atoms(self, atoms) -> ndarray:
         atoms = _a.asarray(atoms)
         if atoms.size == 0:
             return _a.asarrayl([])
-        elif atoms.dtype == bool_:
+        if atoms.dtype == bool_:
             return atoms.nonzero()[0]
         return atoms
 
diff --git a/src/sisl/io/siesta/_help.py b/src/sisl/io/siesta/_help.py
index 7a5862ef93..a1b60edfff 100644
--- a/src/sisl/io/siesta/_help.py
+++ b/src/sisl/io/siesta/_help.py
@@ -4,6 +4,8 @@
 import numpy as np
 
 import sisl._array as _a
+from sisl.sparse import _rows_and_cols
+from sisl.messages import warn
 from sisl import SislError
 
 try:
@@ -25,21 +27,12 @@ def _ensure_diagonal(csr):
     since missing items will be set to 0 which should be 1 in
     non-orthogonal basis sets.
     """
-    # Create index arrays
-    row = (csr.ncol > 0).nonzero()[0]
-    row = np.repeat(row, csr.ncol[row])
-    # Ensure we only take those elements that are in play (i.e. this
-    # will work regardless of the csr being finalized or not)
-    col = csr.col[_a.array_arange(csr.ptr[:-1], n=csr.ncol, dtype=np.int32)]
-
-    # figure out where they are the same (diagonals)
-    present_diags = row[row == col]
-
-    # Now figure out which elements are missing
-    missing_diags = np.delete(np.arange(csr.shape[0]), present_diags)
-
-    for row in missing_diags:
-        csr[row, row] = 0.0
+    old_nnz = csr.nnz
+    csr += csr.diags(0, dim=1)
+    if csr.nnz != old_nnz:
+        warn(
+            "ensuring the sparse matrix having diagonal elements changed the sparsity pattern."
+        )
 
 
 def _csr_from_siesta(geom, csr):
diff --git a/src/sisl/io/siesta/binaries.py b/src/sisl/io/siesta/binaries.py
index 9c68cbfa74..c60fb48080 100644
--- a/src/sisl/io/siesta/binaries.py
+++ b/src/sisl/io/siesta/binaries.py
@@ -403,9 +403,11 @@ def write_hamiltonian(self, H, **kwargs):
                 "a zero element sparse matrix!"
             )
 
+        sort = kwargs.get("sort", True)
+
         # Convert to siesta CSR
-        _csr_to_siesta(H.geometry, csr)
-        csr.finalize(sort=kwargs.get("sort", True))
+        _csr_to_siesta(H.geometry, csr, diag=True)
+        csr.finalize(sort=sort)
         _mat_spin_convert(csr, H.spin)
 
         # Extract the data to pass to the fortran routine
@@ -414,17 +416,22 @@ def write_hamiltonian(self, H, **kwargs):
 
         # Get H and S
         if H.orthogonal:
-            h = csr._D
-            s = csr.diags(1.0, dim=1)
-            # Ensure all data is correctly formatted (i.e. have the same sparsity pattern)
-            s.align(csr)
-            s.finalize(sort=kwargs.get("sort", True))
-            if s.nnz != len(h):
+            s = csr.copy(dims=0)
+            s.empty(keep_nnz=True)
+            s += s.diags(1)
+            missing_diags = s.nnz - csr.nnz
+            if missing_diags:
+                # This should never happen as _csr_to_siesta should ensure
+                # that the diagonal entries always exists.
+                # Hence it gets changed before writing.
+                # Not compeletely optimal, but for now this is OK
                 raise SislError(
+                    f"{self.__class__.__name__}.write_hamiltonian "
                     "The diagonal elements of your orthogonal Hamiltonian "
-                    "have not been defined, this is a requirement."
+                    f"have not been defined. Got {len(csr) - missing_diags} elements, expected {len(csr)}."
                 )
-            s = s._D[:, 0]
+            h = csr._D
+            s = s._D
         else:
             h = csr._D[:, : H.S_idx]
             s = csr._D[:, H.S_idx]
diff --git a/src/sisl/io/siesta/tests/test_siesta.py b/src/sisl/io/siesta/tests/test_siesta.py
index 0ce0b448d5..0b6df9c0bb 100644
--- a/src/sisl/io/siesta/tests/test_siesta.py
+++ b/src/sisl/io/siesta/tests/test_siesta.py
@@ -96,11 +96,7 @@ def test_nc_multiple_checks(sisl_tmp, sisl_system, sort):
             shuffle(edges)
             DM[io, edges] = 2.0
 
-        if not sort:
-            with pytest.raises(ValueError):
-                DM.write(sile, sort=sort)
-        else:
-            DM.write(sile, sort=sort)
+        DM.write(sile, sort=sort)
 
 
 def test_nc_overlap(sisl_tmp, sisl_system):
diff --git a/src/sisl/sparse.py b/src/sisl/sparse.py
index 36099990cb..5e5d81921b 100644
--- a/src/sisl/sparse.py
+++ b/src/sisl/sparse.py
@@ -43,7 +43,7 @@
 from scipy.sparse import csr_matrix, issparse
 
 from . import _array as _a
-from ._array import array_arange, arrayi, asarrayi, fulli
+from ._array import array_arange
 from ._help import array_fill_repeat, isiterable
 from ._indices import indices, indices_only
 from ._internal import set_module
@@ -57,6 +57,27 @@
 __all__ = ["SparseCSR", "ispmatrix", "ispmatrixd"]
 
 
+def _rows_and_cols(csr, rows=None):
+    """Retrieve the sparse patterns rows and columns from the sparse matrix
+
+    Possibly only retrieving it for a subset of rows.
+    """
+    ptr = csr.ptr
+    ncol = csr.ncol
+    col = csr.col
+    if rows is None:
+        cols = col[array_arange(ptr[:-1], n=ncol, dtype=int32)]
+        idx = (ncol > 0).nonzero()[0]
+        rows = repeat(idx.astype(int32, copy=False), ncol[idx])
+    else:
+        rows = csr._sanitize(rows).ravel()
+        ncol = ncol[rows]
+        cols = col[array_arange(ptr[rows], n=ncol, dtype=int32)]
+        idx = (ncol > 0).nonzero()[0]
+        rows = repeat(rows[idx].astype(int32, copy=False), ncol[idx])
+    return rows, cols
+
+
 def _ncol_to_indptr(ncol):
     """Convert the ncol array into a pointer array"""
     ptr = _a.emptyi(ncol.size + 1)
@@ -99,6 +120,9 @@ class SparseCSR(NDArrayOperatorsMixin):
     - ``SparseCSR((M,N,K)[, dtype])``
       creating a sparse matrix with ``M`` rows, ``N`` columns
       and ``K`` elements per sparse element.
+    - ``SparseCSR((data, ptr, indices), [shape, dtype])``
+      creating a sparse matrix with specific data as would
+      be used when creating `scipy.sparse.csr_matrix`.
 
     Additionally these parameters control the
     creation of the sparse matrix.
@@ -131,6 +155,7 @@ def __init__(self, arg1, dim=1, dtype=None, nnzpr=20, nnz=None, **kwargs):
         # a non-zero element, the # of elements
         # for the insert row is increased at least by this number
         self._ns = 10
+        self._finalized = False
 
         if issparse(arg1):
             # This is a sparse matrix
@@ -193,6 +218,8 @@ def __init__(self, arg1, dim=1, dtype=None, nnzpr=20, nnz=None, **kwargs):
                     self._D[:, :] = arg1[0]
                 else:
                     self._D[:, 0] = arg1[0]
+                if np.all(self.ncol <= 1):
+                    self._finalized = True
 
     def __init_shape(self, arg1, dim=1, dtype=None, nnzpr=20, nnz=None, **kwargs):
         # The shape of the data...
@@ -241,7 +268,7 @@ def __init_shape(self, arg1, dim=1, dtype=None, nnzpr=20, nnz=None, **kwargs):
         # in the sparsity pattern
         self.ncol = _a.zerosi([M])
         # Create pointer array
-        self.ptr = _a.cumsumi(fulli(M + 1, nnzpr)) - nnzpr
+        self.ptr = _a.cumsumi(_a.fulli(M + 1, nnzpr)) - nnzpr
         # Create column array
         self.col = _a.fulli(nnz, -1)
         # Store current number of non-zero elements
@@ -252,13 +279,12 @@ def __init_shape(self, arg1, dim=1, dtype=None, nnzpr=20, nnz=None, **kwargs):
         # thus automatically zeroing the other dimensions.
         self._D = zeros([nnz, K], dtype)
 
-        # Denote that this sparsity pattern hasn't been finalized
-        self._finalized = False
-
     @classmethod
     def sparsity_union(cls, *spmats, dtype=None, dim=None, value=0):
         """Create a SparseCSR with constant fill value in all places that `spmats` have nonzeros
 
+        By default the returned matrix will be sorted.
+
         Parameters
         ----------
         spmats : SparseCSR or csr_matrix
@@ -316,20 +342,14 @@ def diagonal(self):
         # get the diagonal components
         diag = np.zeros([self.shape[0], self.shape[2]], dtype=self.dtype)
 
-        ptr = self.ptr
-        ncol = self.ncol
-        col = self.col
-        D = self._D
+        rows, cols = _rows_and_cols(self)
 
         # Now retrieve rows and cols
-        idx = (ncol > 0).nonzero()[0]
-        row = repeat(idx.astype(int32, copy=False), ncol[idx])
-        idx = array_arange(ptr[:-1], n=ncol, dtype=int32)
-        col = col[idx]
+        idx = array_arange(self.ptr[:-1], n=self.ncol, dtype=int32)
         # figure out the indices where we have a diagonal index
-        diag_idx = np.equal(row, col)
+        diag_idx = np.equal(rows, cols)
         idx = idx[diag_idx]
-        diag[row[diag_idx]] = D[idx]
+        diag[rows[diag_idx]] = self._D[idx]
         if self.shape[2] == 1:
             return diag.ravel()
         return diag
@@ -352,23 +372,33 @@ def diags(self, diagonals, offsets=0, dim=None, dtype=None):
         """
         if dim is None:
             dim = self.shape[2]
+        diagonals = np.asarray(diagonals)
         if dtype is None:
-            dtype = self.dtype
+            dtype = np.result_type(self.dtype, diagonals.dtype)
 
         # Now create the sparse matrix
         shape = list(self.shape)
         shape[2] = dim
         shape = tuple(shape)
 
-        # Delete the last entry, regardless of the size, the diagonal
-        D = self.__class__(shape, dtype=dtype)
+        offsets = array_fill_repeat(offsets, shape[0], cls=dtype)
 
-        diagonals = array_fill_repeat(diagonals, D.shape[0], cls=dtype)
-        offsets = array_fill_repeat(offsets, D.shape[0], cls=dtype)
+        # Create the index-pointer, data and values
+        data = array_fill_repeat(diagonals, shape[0], axis=0, cls=dtype)
+        indices = _a.arangei(shape[0]) + offsets
 
-        # Create diagonal elements
-        for i in range(D.shape[0]):
-            D[i, i + offsets[i]] = diagonals[i]
+        # create the pointer.
+        idx_ok = np.logical_and(0 <= indices, indices < shape[1])
+        data = data[idx_ok]
+        ptr1 = _a.onesi(shape[0])
+        ptr1[~idx_ok] = 0
+        indices = indices[idx_ok]
+        ptr = _a.emptyi(shape[0] + 1)
+        ptr[0] = 0
+        ptr[1:] = np.cumsum(ptr1)
+
+        # Delete the last entry, regardless of the size, the diagonal
+        D = self.__class__((data, indices, ptr), shape=shape, dtype=dtype)
 
         return D
 
@@ -523,7 +553,7 @@ def _sanitize(self, idx, axis=0) -> ndarray:
         idx = _a.asarrayi(idx)
         if idx.size == 0:
             return _a.asarrayi([])
-        elif idx.dtype == bool_:
+        if idx.dtype == bool_:
             return idx.nonzero()[0].astype(np.int32)
         return idx
 
@@ -782,6 +812,10 @@ def scale_columns(self, cols, scale, rows=None):
         # Scale values where columns coincide with scaling factor
         self._D[idx[scale_idx]] *= scale
 
+    def toarray(self):
+        """Return a dense `numpy.ndarray` which has 3 dimensions (self.shape)"""
+        return sparse_dense(self)
+
     def todense(self):
         """Return a dense `numpy.ndarray` which has 3 dimensions (self.shape)"""
         return sparse_dense(self)
@@ -935,11 +969,16 @@ def _extend(self, i, j, ret_indices=True):
             for indices out of bounds
         """
         i = self._sanitize(i)
-        if asarray(i).size == 0:
-            return arrayi([])
+        if i.size == 0:
+            return _a.arrayi([])
+        if i.size > 1:
+            raise ValueError(
+                "extending the sparse matrix is only allowed for single rows at a time"
+            )
         if i < 0 or i >= self.shape[0]:
             raise IndexError(f"row index is out-of-bounds {i} : {self.shape[0]}")
-        i1 = int(i) + 1
+        i1 = i + 1
+
         # We skip this check and let sisl die if wrong input is given...
         # if not isinstance(i, Integral):
         #    raise ValueError("Retrieving/Setting elements in a sparse matrix"
@@ -948,7 +987,7 @@ def _extend(self, i, j, ret_indices=True):
         # Ensure flattened array...
         j = self._sanitize(j, axis=1).ravel()
         if len(j) == 0:
-            return arrayi([])
+            return _a.arrayi([])
         if np_any(j < 0) or np_any(j >= self.shape[1]):
             raise IndexError(f"column index is out-of-bounds {j} : {self.shape[1]}")
 
@@ -985,7 +1024,6 @@ def _extend(self, i, j, ret_indices=True):
         new_nnz = new_n - int(ptr[i1]) + ncol_ptr_i
 
         if new_nnz > 0:
-            # print(f"new_nnz {i} : {new_nnz}")
             # Ensure that it is not-set as finalized
             # There is no need to set it all the time.
             # Simply because the first call to finalize
@@ -1058,7 +1096,7 @@ def _extend_empty(self, i, n):
             raise IndexError("row index is out-of-bounds")
 
         # fast reference
-        i1 = int(i) + 1
+        i1 = i + 1
 
         # Ensure that it is not-set as finalized
         # There is no need to set it all the time.
@@ -1159,8 +1197,10 @@ def __delitem__(self, key):
 
         # Get original values
         sl = slice(ptr[i], ptr[i] + ncol[i], None)
-        oC = self.col[sl]
-        oD = self._D[sl, :]
+        oC = self.col[sl].copy()
+        self.col[sl] = -1
+        oD = self._D[sl, :].copy()
+        self._D[sl, :] = 0
 
         # Now create the compressed data...
         index -= ptr[i]
@@ -1448,10 +1488,9 @@ def copy(self, dims=None, dtype=None):
         new = self.__class__(shape, dtype=dtype, nnz=1)
 
         # The default sizes are not passed
-        # Hence we *must* copy the arrays
-        # directly
-        copyto(new.ptr, self.ptr, casting="same_kind")
-        copyto(new.ncol, self.ncol, casting="same_kind")
+        # Hence we *must* copy the arrays directly
+        new.ptr[:] = self.ptr[:]
+        new.ncol[:] = self.ncol[:]
         new.col = self.col.copy()
         new._nnz = self.nnz
 
@@ -1730,7 +1769,6 @@ def transpose(self, sort=True):
             D = self._D.copy()
         else:
             idx = array_arange(self.ptr[:-1], n=ncol, dtype=int32)
-            # ptr = _ncol_to_indptr(ncol)
             col = self.col[idx]
             D = self._D[idx, :].copy()
             del idx
@@ -1787,30 +1825,61 @@ def __repr__(self):
     # numpy dispatch methods
     __array_priority__ = 14
 
+    def __array__(self, dtype=None):
+        out = self.toarray()
+        if dtype is None:
+            return out
+        return out.astype(dtype, copy=False)
+
     def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
-        # print(f"{self.__class__.__name__}.__array_ufunc__ :", ufunc, method)
         out = kwargs.pop("out", None)
 
         if getattr(ufunc, "signature", None) is not None:
             # The signature is not a scalar operation
             return NotImplemented
 
+        dtypes = []
         if out is not None:
             (out,) = out
             kwargs["dtype"] = out.dtype
+        else:
+            for arg in inputs:
+                # this exercise is required to ensure that we don't
+                # prematurely promote dtypes.
+                # For instance,
+                #    SparseCSR(dtype=np.complex64) * 1j
+                # should remain np.complex64.
+                # Yet, if one does np.asarray(1j) you'll get
+                # np.complex128.
+                # This means that we should let the ufunc do any
+                # promotions necessary.
+                if isscalar(arg):
+                    pass
+                elif isinstance(arg, (tuple, list, ndarray)):
+                    arg = arg[0]
+                try:
+                    dtypes.append(arg.dtype)
+                except AttributeError:
+                    dtypes.append(type(arg))
 
         if method == "__call__":
+            if dtypes:
+                dtypes.append(None)
+                kwargs["dtype"] = ufunc.resolve_dtypes(tuple(dtypes))[-1]
             result = _ufunc_call(ufunc, *inputs, **kwargs)
         elif method == "reduce":
+            if dtypes:
+                dtypes = [None] + dtypes + [None]
+                kwargs["dtype"] = ufunc.resolve_dtypes(tuple(dtypes), reduction=True)[
+                    -1
+                ]
             result = _ufunc_reduce(ufunc, *inputs, **kwargs)
         elif method == "outer":
-            # print("running outer")
             # Currently I don't know what to do here
             # We don't have multidimensional sparse matrices,
             # but perhaps that could be needed later?
             return NotImplemented
         else:
-            # print("running method = ", method)
             return NotImplemented
 
         if out is None:
@@ -1827,8 +1896,10 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
             out.ncol[:] = result.ncol[:]
             out.ptr[:] = result.ptr[:]
             # this will copy
-            out.col = result.col
-            out._D = result._D.astype(kwargs.get("dtype", out.dtype))
+            out.col = result.col.copy()
+            out._D = result._D.astype(out.dtype)
+            out._nnz = result.nnz
+            del result
         else:
             out = NotImplemented
         return out
@@ -1860,8 +1931,16 @@ def __setstate__(self, state):
             self.ptr = state["ptr"]
 
 
-def _get_reduced_shape(shape):
-    return tuple(s for s in shape[::-1] if s > 1)[::-1]
+def _get_reduced_shape(arr):
+    # return the reduced shape by removing any dimensions with length 1
+    if isscalar(arr):
+        return tuple()
+    if isinstance(arr, (tuple, list)):
+        n = len(arr)
+        if n > 1:
+            return (n,) + _get_reduced_shape(arr[0])
+        return tuple()
+    return np.squeeze(arr).shape
 
 
 def _ufunc(ufunc, a, b, **kwargs):
@@ -1869,46 +1948,49 @@ def _ufunc(ufunc, a, b, **kwargs):
         if issparse(b) or isinstance(b, (SparseCSR, tuple)):
             return _ufunc_sp_sp(ufunc, a, b, **kwargs)
         return _ufunc_sp_ndarray(ufunc, a, b, **kwargs)
-    elif isinstance(b, SparseCSR):
+    if isinstance(b, SparseCSR):
         return _ufunc_ndarray_sp(ufunc, a, b, **kwargs)
     return ufunc(a, b, **kwargs)
 
 
 def _ufunc_sp_ndarray(ufunc, a, b, **kwargs):
-    if len(_get_reduced_shape(b.shape)) > 1:
+    if len(_get_reduced_shape(b)) > 1:
         # there are shapes for individiual
         # we will now calculate a full matrix
-        return ufunc(a.todense(), b, **kwargs)
+        return ufunc(a.toarray(), b, **kwargs)
 
     # create a copy
-    out = a.copy(dtype=kwargs.get("dtype", a.dtype))
+    out = a.copy(dtype=kwargs["dtype"])
     if out.ptr[-1] == out.nnz:
-        ufunc(a._D, b, **kwargs, out=out._D)
+        out._D = ufunc(a._D, b, **kwargs)
     else:
         # limit the values
         # since slicing non-uniform ranges does not return
         # a view, we can't use
         #   ufunc(..., out=out._D[idx, :])
         idx = array_arange(a.ptr[:-1], n=a.ncol)
-        out._D[idx, :] = ufunc(a._D[idx, :], b, **kwargs)
+        out._D[idx] = ufunc(a._D[idx, :], b, **kwargs)
         del idx
     return out
 
 
 def _ufunc_ndarray_sp(ufunc, a, b, **kwargs):
-    if len(_get_reduced_shape(a.shape)) > 1:
+    if len(_get_reduced_shape(a)) > 1:
         # there are shapes for individiual
         # we will now calculate a full matrix
-        return ufunc(a, b.todense(), **kwargs)
+        return ufunc(a, b.toarray(), **kwargs)
 
     # create a copy
-    out = b.copy(dtype=kwargs.get("dtype", b.dtype))
+    out = b.copy(dtype=kwargs["dtype"])
     if out.ptr[-1] == out.nnz:
-        ufunc(a, b._D, **kwargs, out=out._D)
+        out._D = ufunc(a, b._D, **kwargs)
     else:
         # limit the values
+        # since slicing non-uniform ranges does not return
+        # a view, we can't use
+        #   ufunc(..., out=out._D[idx, :])
         idx = array_arange(b.ptr[:-1], n=b.ncol)
-        out._D[idx, :] = ufunc(a, b._D[idx, :], **kwargs)
+        out._D[idx] = ufunc(a, b._D[idx, :], **kwargs)
         del idx
     return out
 
@@ -1927,7 +2009,13 @@ def rowslice(r):
                 return slice(mat.ptr[r], mat.ptr[r] + mat.ncol[r])
 
             accessors = mat.dim, mat.col, mat._D, rowslice
-            issorted = mat.finalized
+            # check whether they are actually sorted
+            if mat.finalized:
+                rows, cols = _rows_and_cols(mat)
+                rows, cols = np.diff(rows), np.diff(cols)
+                issorted = np.all(cols[rows == 0] > 0)
+            else:
+                issorted = False
         else:
             # makes this work for all matrices
             # and csr_matrix.tocsr is a no-op
@@ -1956,7 +2044,7 @@ def rowslice(r):
         raise ValueError(f"could not broadcast sparse matrices {a.shape} and {b.shape}")
 
     # create union of the sparsity pattern
-    out = SparseCSR.sparsity_union(a, b, dim=max(adim, bdim))
+    out = SparseCSR.sparsity_union(a, b, dim=max(adim, bdim), dtype=kwargs["dtype"])
 
     for r in range(out.shape[0]):
         offset = out.ptr[r]
@@ -1964,11 +2052,11 @@ def rowslice(r):
 
         asl = arow(r)
         aidx = afindidx(ocol, acol[asl], offset)
-        asl = arange(asl.start, asl.stop)
+        asl = _a.arangei(asl.start, asl.stop)
 
         bsl = brow(r)
         bidx = bfindidx(ocol, bcol[bsl], offset)
-        bsl = arange(bsl.start, bsl.stop)
+        bsl = _a.arangei(bsl.start, bsl.stop)
 
         # Common indices
         iover, aover, bover, iaonly, ibonly = intersect_and_diff_sets(aidx, bidx)
@@ -2000,39 +2088,22 @@ def _ufunc_call(ufunc, *in_args, **kwargs):
         if isinstance(arg, SparseCSR):
             args.append(arg)
         elif isscalar(arg) or isinstance(arg, (tuple, list, ndarray)):
-            args.append(asarray(arg))
+            args.append(arg)
         elif issparse(arg):
             args.append(arg)
         else:
             return
 
-    if "dtype" not in kwargs:
-        kwargs["dtype"] = np.result_type(*args)
-
-    # Input arguments are corrected
-    # Get resulting shape
-    # Currently we don't check shapes and output,
-    # however this function fails in case the shapes
-    # are not b-castable.
-    def spshape(arg):
-        if issparse(arg):
-            # spmatrices can only ever have 2 dimensions
-            # but SparseCSR always have 3, so we pad with ones.
-            return arg.shape + (1,)
-        return arg.shape
-
-    # shape = _get_bcast_shape(*tuple(spshape(arg) for arg in args))
-
     if len(args) == 1:
         a = args[0]
         # create a copy
-        out = a.copy(dtype=kwargs.get("dtype", a.dtype))
+        out = a.copy(dtype=kwargs["dtype"])
         if out.ptr[-1] == out.nnz:
-            ufunc(a._D[:, :], **kwargs, out=out._D)
+            out._D = ufunc(a._D, **kwargs)
         else:
             # limit the values
             idx = array_arange(a.ptr[:-1], n=a.ncol)
-            out._D[idx, :] = ufunc(a._D[idx, :], **kwargs)
+            out._D[idx] = ufunc(a._D[idx, :], **kwargs)
             del idx
         return out
 
@@ -2043,9 +2114,6 @@ def _(a, b):
 
 
 def _ufunc_reduce(ufunc, array, axis=0, *args, **kwargs):
-    if "dtype" not in kwargs:
-        kwargs["dtype"] = np.result_type(array, *args)
-
     # currently the initial argument does not work properly if the
     # size isn't correct
     if np.asarray(kwargs.get("initial", 0.0)).ndim > 1:
@@ -2082,7 +2150,7 @@ def wrap_axis(axis):
     elif axis == 1:
         pass
     elif axis == 2:
-        out = array.copy(dims=range(array.shape[2] - 1), dtype=kwargs.get("dtype"))
+        out = array.copy(dims=range(array.shape[2] - 1), dtype=kwargs["dtype"])
         out._D[:, 0] = ufunc.reduce(array._D, axis=1, *args, **kwargs)
         return out
     else:
@@ -2090,9 +2158,8 @@ def wrap_axis(axis):
             f"Unknown axis argument in ufunc.reduce call on {array.__class__.__name__}"
         )
 
-    ret = empty(
-        [array.shape[0], array.shape[2]], dtype=kwargs.get("dtype", array.dtype)
-    )
+    ret = empty([array.shape[0], array.shape[2]], dtype=kwargs["dtype"])
+
     # Now do ufunc calculations, note that initial gets passed directly
     ptr = array.ptr
     ncol = array.ncol
diff --git a/src/sisl/tests/test_help.py b/src/sisl/tests/test_help.py
index e5e0f6ce7f..f5f39b10f2 100644
--- a/src/sisl/tests/test_help.py
+++ b/src/sisl/tests/test_help.py
@@ -21,6 +21,8 @@ def test_array_fill_repeat1():
     assert array_fill_repeat([1], 20).shape[0] == 20
     assert array_fill_repeat([1, 2], 20).shape[0] == 20
     assert array_fill_repeat(1, 20).shape[0] == 20
+    assert array_fill_repeat([[1]], 20, axis=0).shape == (20, 1)
+    assert array_fill_repeat([[1, 2]], 20, axis=0).shape == (20, 2)
 
 
 def test_array_fill_repeat2():
diff --git a/src/sisl/tests/test_sparse.py b/src/sisl/tests/test_sparse.py
index 40a933f1df..464d4488bd 100644
--- a/src/sisl/tests/test_sparse.py
+++ b/src/sisl/tests/test_sparse.py
@@ -2,6 +2,7 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at https://mozilla.org/MPL/2.0/.
 import math as m
+import operator
 import sys
 
 import numpy as np
@@ -19,14 +20,18 @@
 
 
 @pytest.fixture
-def setup():
-    class t:
-        def __init__(self):
-            self.s1 = SparseCSR((10, 100), dtype=np.int32)
-            self.s1d = SparseCSR((10, 100))
-            self.s2 = SparseCSR((10, 100, 2))
+def s1():
+    return SparseCSR((10, 100), dtype=np.int32)
 
-    return t()
+
+@pytest.fixture
+def s1d():
+    return SparseCSR((10, 100))
+
+
+@pytest.fixture
+def s2():
+    return SparseCSR((10, 100, 2))
 
 
 def test_indices():
@@ -62,6 +67,13 @@ def test_fail_init2():
         SparseCSR((data, indices, indptr, indptr), shape=(100, 20, 20))
 
 
+def test_init_csr_inputs():
+    data = np.empty([2, 2], np.float64)
+    indices = np.arange(2)
+    indptr = np.arange(3)
+    SparseCSR((data, indices, indptr))
+
+
 def test_fail_align1():
     s1 = SparseCSR((10, 100), dtype=np.int32)
     s2 = SparseCSR((20, 100), dtype=np.int32)
@@ -80,12 +92,11 @@ def test_set_get1():
     assert np.all(s1[1, [1, 6], 1] == [2, 0])
 
 
-def test_init1(setup):
-    str(setup.s1)
-    assert setup.s1.dtype == np.int32
-    assert setup.s2.dtype == np.float64
-    assert np.allclose(setup.s1.data, setup.s1.data)
-    assert np.allclose(setup.s2.data, setup.s2.data)
+def test_init1(s1, s2, s1d):
+    str(s1)
+    assert s1.dtype == np.int32
+    assert s1d.dtype == np.float64
+    assert s2.dtype == np.float64
 
 
 def test_init2():
@@ -180,43 +191,56 @@ def test_extend1():
     assert np.allclose(csr[0, [0, 1, 2]], [0, 1, 2])
 
 
-def test_diag1():
+def test_diags_1():
     csr = SparseCSR((10, 10), nnzpr=1, dtype=np.int32)
     csr1 = csr.diags(10)
     assert csr1.shape[0] == 10
     assert csr1.shape[1] == 10
     assert csr1.shape[2] == 1
     assert csr1.nnz == 10
+    assert np.allclose(csr1.toarray()[..., 0], np.diag(np.full(10, 10)))
     csr2 = csr.diags(10)
     csr3 = csr.diags(np.zeros(10, np.int32))
     assert csr2.spsame(csr3)
 
 
-def test_diag2():
+def test_diags_2():
     csr = SparseCSR((10, 10), dim=3, nnzpr=1, dtype=np.int32)
     csr1 = csr.diags(10, dim=1)
-    csr2 = csr.diags(10, dim=2)
+    csr2 = csr.diags(10, dim=2, dtype=np.int16)
     assert csr1.shape[2] == 1
     assert csr2.shape[2] == 2
     assert csr1.nnz == 10
     assert csr2.nnz == 10
     assert csr1.spsame(csr2)
-
-
-def test_diag3():
-    csr = SparseCSR((10, 10), dim=3, nnzpr=1, dtype=np.int32)
-    csr1 = csr.diags(10, dim=1, dtype=np.int16)
-    csr2 = csr.diags(10, dim=2, dtype=np.float64)
-    assert csr1.shape[2] == 1
-    assert csr2.shape[2] == 2
-    assert csr1.nnz == 10
-    assert csr2.nnz == 10
-    assert csr1.dtype == np.int16
-    assert csr2.dtype == np.float64
-
-
-def test_create1(setup):
-    s1d = setup.s1d
+    # TODO fix diags such that it won't affect
+    assert csr1.dtype == np.int64
+    assert csr2.dtype == np.int16
+
+
+def test_diags_offsets():
+    csr = SparseCSR((10, 10), dtype=np.int32)
+    m = csr.diags(1)
+    assert m.nnz == 10
+    assert np.sum(m) == 10
+    m = csr.diags(1, offsets=1)
+    assert m.nnz == 9
+    assert np.sum(m) == 9
+    m = csr.diags(1, offsets=2)
+    assert m.nnz == 8
+    assert np.sum(m) == 8
+    m = csr.diags(1, offsets=-2)
+    assert m.nnz == 8
+    assert np.sum(m) == 8
+
+
+def test_diags_multiple_diagonals():
+    csr = SparseCSR((10, 10), dim=2, dtype=np.int32)
+    m = csr.diags([[1, 2]])
+    assert np.allclose(m[0, 0], [1, 2])
+
+
+def test_create1(s1d):
     s1d[0, [1, 2, 3]] = 1
     assert s1d.nnz == 3
     s1d[2, [1, 2, 3]] = 1
@@ -231,8 +255,7 @@ def test_create1(setup):
     s1d.empty()
 
 
-def test_create2(setup):
-    s1 = setup.s1
+def test_create2(s1):
     assert len(s1) == s1.shape[0]
     for i in range(10):
         j = range(i * 4, i * 4 + 3)
@@ -241,11 +264,9 @@ def test_create2(setup):
         for jj in j:
             assert s1[0, jj] == i
             assert s1[1, jj] == 0
-    s1.empty()
 
 
-def test_create3(setup):
-    s1 = setup.s1
+def test_create3(s1):
     for i in range(10):
         j = range(i * 4, i * 4 + 3)
         s1[0, j] = i
@@ -255,90 +276,91 @@ def test_create3(setup):
         for jj in j:
             assert s1[0, jj] == i
             assert s1[1, jj] == 0
-    s1.empty()
 
 
-def test_create_1d_bcasting_data_1d(setup):
-    s1 = setup.s1.copy()
+def test_create_1d_bcasting_data_1d(s1):
+    s2 = s1.copy()
     for i in range(10):
         j = range(i * 4, i * 4 + 3)
-        s1[0, j] = i
-        s1[1, j] = i
-        s1[2, j] = i
-        s1[3, j] = i
+        s2[0, j] = i
+        s2[1, j] = i
+        s2[2, j] = i
+        s2[3, j] = i
 
-    s2 = setup.s1.copy()
+    s3 = s1.copy()
     for i in range(10):
         j = np.arange(i * 4, i * 4 + 3).reshape(1, -1)
-        s2[np.arange(4).reshape(-1, 1), j] = i
+        s3[np.arange(4).reshape(-1, 1), j] = i
 
-    assert s1.spsame(s2)
-    assert np.sum(s1 - s2) == 0
+    assert s2.spsame(s3)
+    assert np.sum(s2 - s3) == 0
 
 
 @pytest.mark.xfail(
     sys.platform.startswith("win"), reason="Unknown windows error in b-casting"
 )
-def test_create_1d_bcasting_data_2d(setup):
-    s1 = setup.s1.copy()
+def test_create_1d_bcasting_data_2d(s1):
+    s2 = s1.copy()
     data = np.random.randint(1, 100, (4, 3))
     for i in range(10):
         j = range(i * 4, i * 4 + 3)
-        s1[0, j] = data[0, :]
-        s1[1, j] = data[1, :]
-        s1[2, j] = data[2, :]
-        s1[3, j] = data[3, :]
+        s2[0, j] = data[0, :]
+        s2[1, j] = data[1, :]
+        s2[2, j] = data[2, :]
+        s2[3, j] = data[3, :]
 
-    s2 = setup.s1.copy()
+    s3 = s1.copy()
     for i in range(10):
         j = np.arange(i * 4, i * 4 + 3).reshape(1, -1)
-        s2[np.arange(4).reshape(-1, 1), j] = data
+        s3[np.arange(4).reshape(-1, 1), j] = data
 
-    assert s1.spsame(s2)
-    assert np.sum(s1 - s2) == 0
+    assert s2.spsame(s3)
+    assert np.sum(s2 - s3) == 0
 
 
-def test_create_1d_diag(setup):
-    s1 = setup.s1.copy()
+def test_create_1d_diag(s1):
+    s2 = s1.copy()
     data = np.random.randint(1, 100, len(s1))
     d = np.arange(len(s1))
     for i in d:
-        s1[i, i] = data[i]
+        s2[i, i] = data[i]
 
-    s2 = setup.s1.copy()
-    s2[d, d] = data
+    s3 = s1.copy()
+    s3[d, d] = data
 
-    assert s1.spsame(s2)
-    assert np.sum(s1 - s2) == 0
+    assert s2.spsame(s3)
+    assert np.sum(s2 - s3) == 0
 
 
-def test_create_2d_diag_0d(setup):
-    s1 = setup.s2.copy()
+def test_create_2d_diag_0d(s2):
+    S1 = s2.copy()
     data = 1
-    d = np.arange(len(s1))
+    d = np.arange(len(s2))
     for i in d:
-        s1[i, i] = data
-    s2 = setup.s2.copy()
-    s2[d, d] = data
-    s3 = setup.s2.copy()
-    s3[d, d, 0] = data
-    s3[d, d, 1] = data
+        S1[i, i] = data
+    S2 = s2.copy()
+    S2[d, d] = data
+
+    S3 = s2.copy()
+    S3[d, d, 0] = data
+    S3[d, d, 1] = data
+
+    assert S1.spsame(S2)
+    assert np.sum(S1 - S2) == 0
+    assert S1.spsame(S3)
+    assert np.sum(S1 - S3) == 0
 
-    assert s1.spsame(s2)
-    assert np.sum(s1 - s2) == 0
-    assert s1.spsame(s3)
-    assert np.sum(s1 - s3) == 0
 
+def test_create_2d_diag_1d(s2):
+    s1 = s2.copy()
+    s3 = s2.copy()
 
-def test_create_2d_diag_1d(setup):
-    s1 = setup.s2.copy()
     data = np.random.randint(1, 100, len(s1))
     d = np.arange(len(s1))
     for i in d:
         s1[i, i] = data[i]
-    s2 = setup.s2.copy()
+
     s2[d, d] = data
-    s3 = setup.s2.copy()
     s3[d, d, 0] = data
     s3[d, d, 1] = data
 
@@ -348,15 +370,16 @@ def test_create_2d_diag_1d(setup):
     assert np.sum(s1 - s3) == 0
 
 
-def test_create_2d_diag_2d(setup):
-    s1 = setup.s2.copy()
+def test_create_2d_diag_2d(s2):
+    s1 = s2.copy()
+    s3 = s2.copy()
+
     data = np.random.randint(1, 100, len(s1) * 2).reshape(-1, 2)
     d = np.arange(len(s1))
     for i in d:
         s1[i, i] = data[i]
-    s2 = setup.s2.copy()
+
     s2[d, d] = data
-    s3 = setup.s2.copy()
     s3[d, d, 0] = data[:, 0]
     s3[d, d, 1] = data[:, 1]
 
@@ -366,8 +389,10 @@ def test_create_2d_diag_2d(setup):
     assert np.sum(s1 - s3) == 0
 
 
-def test_create_2d_data_2d(setup):
-    s1 = setup.s2.copy()
+def test_create_2d_data_2d(s2):
+    s1 = s2.copy()
+    s3 = s2.copy()
+
     # matrix assignment
     I = np.arange(len(s1) // 2)
     data = np.random.randint(1, 100, I.size**2).reshape(I.size, I.size)
@@ -376,11 +401,10 @@ def test_create_2d_data_2d(setup):
         s1[i, I, 1] = data[i]
 
     I.shape = (-1, 1)
-    s2 = setup.s2.copy()
+
     s2[I, I.T, 0] = data
     s2[I, I.T, 1] = data
 
-    s3 = setup.s2.copy()
     s3[I, I.T] = data[:, :, None]
 
     assert s1.spsame(s2)
@@ -389,8 +413,9 @@ def test_create_2d_data_2d(setup):
     assert np.sum(s1 - s3) == 0
 
 
-def test_create_2d_data_3d(setup):
-    s1 = setup.s2.copy()
+def test_create_2d_data_3d(s2):
+    s1 = s2.copy()
+
     # matrix assignment
     I = np.arange(len(s1) // 2)
     data = np.random.randint(1, 100, I.size**2 * 2).reshape(I.size, I.size, 2)
@@ -398,56 +423,49 @@ def test_create_2d_data_3d(setup):
         s1[i, I] = data[i]
 
     I.shape = (-1, 1)
-    s2 = setup.s2.copy()
     s2[I, I.T] = data
 
     assert s1.spsame(s2)
     assert np.sum(s1 - s2) == 0
 
 
-def test_copy_dims(setup):
-    s1 = setup.s2.copy(dims=0)
-    assert np.allclose(setup.s2._D[:, 0], s1._D[:, 0])
-    s1 = setup.s2.copy(dims=1)
-    assert np.allclose(setup.s2._D[:, 1], s1._D[:, 0])
+def test_copy_dims(s2):
+    s2[2, 2] = [2, 3]
+    s1 = s2.copy(dims=0)
+    assert np.allclose(s2._D[:, 0], s1._D[:, 0])
+    s1 = s2.copy(dims=1)
+    assert np.allclose(s2._D[:, 1], s1._D[:, 0])
 
-    s1 = setup.s2.copy(dims=[1, 0])
-    assert np.allclose(setup.s2._D[:, 1], s1._D[:, 0])
-    assert np.allclose(setup.s2._D[:, 0], s1._D[:, 1])
+    s1 = s2.copy(dims=[1, 0])
+    assert np.allclose(s2._D[:, 1], s1._D[:, 0])
+    assert np.allclose(s2._D[:, 0], s1._D[:, 1])
 
 
-def test_fail_data_3d_to_1d(setup):
-    s2 = setup.s2
+def test_fail_data_3d_to_1d(s2):
     # matrix assignment
     I = np.arange(len(s2) // 2).reshape(-1, 1)
     data = np.random.randint(1, 100, I.size * 2).reshape(I.size, 1, 2)
     with pytest.raises(ValueError):
         s2[I, I.T] = data
-    s2.empty()
 
 
-def test_fail_data_2d_to_2d(setup):
-    s2 = setup.s2
+def test_fail_data_2d_to_2d(s2):
     # matrix assignment
     I = np.arange(len(s2) // 2).reshape(-1, 1)
     data = np.random.randint(1, 100, I.size**2).reshape(I.size, I.size)
     with pytest.raises(ValueError):
         s2[I, I.T] = data
-    s2.empty()
 
 
-def test_fail_data_2d_to_3d(setup):
-    s2 = setup.s2
+def test_fail_data_2d_to_3d(s2):
     # matrix assignment
     I = np.arange(len(s2) // 2).reshape(-1, 1)
     data = np.random.randint(1, 100, I.size**2).reshape(I.size, I.size)
     with pytest.raises(ValueError):
         s2[I, I.T, [0, 1]] = data
-    s2.empty()
 
 
-def test_finalize1(setup):
-    s1 = setup.s1
+def test_finalize1(s1):
     s1[0, [1, 2, 3]] = 1
     s1[2, [1, 2, 3]] = 1.0
     s1[1, [3, 2, 1]] = 1.0
@@ -467,8 +485,7 @@ def test_finalize1(setup):
     assert not s1.finalized
 
 
-def test_finalize2(setup):
-    s1 = setup.s1
+def test_finalize2(s1):
     s1[0, [1, 2, 3]] = 1
     s1[2, [1, 2, 3]] = 1.0
     s1[1, [3, 2, 1]] = 1.0
@@ -483,11 +500,9 @@ def test_finalize2(setup):
     assert np.allclose(s1.col[p[1] : p[1] + n[1]], [3, 2, 1])
     assert not s1.finalized
     assert len(s1.col) == 9
-    s1.empty()
 
 
-def test_iterator1(setup):
-    s1 = setup.s1
+def test_iterator1(s1):
     s1[0, [1, 2, 3]] = 1
     s1[2, [1, 2, 4]] = 1.0
     e = [[1, 2, 3], [], [1, 2, 4]]
@@ -513,11 +528,8 @@ def test_iterator1(setup):
         assert j in e[i]
         assert d == 1.0
 
-    s1.empty()
 
-
-def test_iterator2(setup):
-    s1 = setup.s1
+def test_iterator2(s1):
     e = [[1, 2, 3], [], [1, 2, 4]]
     s1[0, [1, 2, 3]] = 1
     s1[2, [1, 2, 4]] = 1.0
@@ -535,11 +547,8 @@ def test_iterator2(setup):
             assert c in e[r]
             assert d == 1.0
 
-    s1.empty()
-
 
-def test_iterator3(setup):
-    s1 = setup.s1
+def test_iterator3(s1):
     e = [[1, 2, 3], [], [1, 2, 4]]
     s1[0, [1, 2, 3]] = 1
     s1[2, [1, 2, 4]] = 1.0
@@ -560,11 +569,9 @@ def test_iterator3(setup):
             assert c in [0, 1]
             n += 1
         assert n == nvals
-    s1.empty()
 
 
-def test_delitem1(setup):
-    s1 = setup.s1
+def test_delitem1(s1):
     s1[0, [1, 2, 3]] = 1
     assert s1.nnz == 3
     del s1[0, 1]
@@ -584,42 +591,34 @@ def test_delitem1(setup):
         assert s1[i, 0] == 0
     del s1[range(2), range(3), 0]
     assert s1.nnz == 0
-    s1.empty()
 
 
-def test_contains1(setup):
-    s1 = setup.s1
+def test_contains1(s1):
     s1[0, [1, 2, 3]] = 1
     assert s1.nnz == 3
     assert [0, 1] in s1
     assert [0, [1, 3]] in s1
-    s1.empty()
 
 
-def test_sub1(setup):
-    s1 = setup.s1
+def test_sub1(s1):
     s1[0, [1, 2, 3]] = 1
     assert s1.nnz == 3
     s1 = s1.sub([0, 1])
     assert s1.nnz == 1
     assert s1.shape[0] == 2
     assert s1.shape[1] == 2
-    s1.empty()
 
 
-def test_remove1(setup):
-    s1 = setup.s1
+def test_remove1(s1):
     s1[0, [1, 2, 3]] = 1
     assert s1.nnz == 3
     s2 = s1.remove([1])
     assert s2.nnz == 2
     assert s2.shape[0] == s1.shape[0] - 1
     assert s2.shape[1] == s1.shape[1] - 1
-    s1.empty()
 
 
-def test_eliminate_zeros1(setup):
-    s1 = setup.s1
+def test_eliminate_zeros1(s1):
     s1[0, [1, 2, 3]] = 1
     s1[1, [1, 2, 3]] = 0
     assert s1.nnz == 6
@@ -628,11 +627,9 @@ def test_eliminate_zeros1(setup):
     assert s1[1, 1] == 0
     assert s1[1, 2] == 0
     assert s1[1, 3] == 0
-    s1.empty()
 
 
-def test_eliminate_zeros_tolerance(setup):
-    s1 = setup.s1
+def test_eliminate_zeros_tolerance(s1):
     s1[0, [1, 2, 3]] = 1
     s1[1, [1, 2, 3]] = 2
     assert s1.nnz == 6
@@ -643,10 +640,9 @@ def test_eliminate_zeros_tolerance(setup):
     assert s1[0, 1] == 0
     assert s1[0, 2] == 0
     assert s1[0, 3] == 0
-    s1.empty()
 
 
-def test_eliminate_zeros_tolerance_ndim(setup):
+def test_eliminate_zeros_tolerance_ndim():
     s = SparseCSR((3, 3, 3))
     s[1, [0, 1, 2]] = 0.1
     s[1, [0, 1, 2], 1] = 0.05
@@ -662,9 +658,7 @@ def test_eliminate_zeros_tolerance_ndim(setup):
     assert s.nnz == 0
 
 
-def test_spsame(setup):
-    s1 = setup.s1
-    s2 = setup.s2
+def test_spsame(s1, s2):
     s1[0, [1, 2, 3]] = 1
     s2[0, [1, 2, 3]] = (1, 1)
     assert s1.spsame(s2)
@@ -672,20 +666,15 @@ def test_spsame(setup):
     assert not s1.spsame(s2)
     s1.align(s2)
     assert s1.spsame(s2)
-    s1.empty()
-    s2.empty()
 
 
 @pytest.mark.xfail(reason="same index assignment in single statement")
-def test_set_same_index(setup):
-    s1 = setup.s1
+def test_set_same_index(s1):
     s1[0, [1, 1]] = 1
     assert s1.nnz == 1
-    s1.empty()
 
 
-def test_delete_col1(setup):
-    s1 = setup.s1
+def test_delete_col1(s1):
     nc = s1.shape[1]
     s1[0, [1, 3]] = 1
     s1[1, [1, 2, 3]] = 1
@@ -702,11 +691,9 @@ def test_delete_col1(setup):
     s1.delete_columns(100000, True)
     assert s1.nnz == 2
     assert s1.shape[1] == nc - 1
-    s1.empty()
 
 
-def test_delete_col2(setup):
-    s1 = setup.s1
+def test_delete_col2(s1):
     nc = s1.shape[1]
     s1[1, [1, 2, 3]] = 1
     assert s1.nnz == 3
@@ -716,12 +703,10 @@ def test_delete_col2(setup):
     s1.delete_columns(2, True)
     assert s1.nnz == 1
     assert s1.shape[1] == nc - 1
-    s1.empty()
 
 
-def test_delete_col3(setup):
-    s1 = setup.s1.copy()
-    s2 = setup.s1.copy()
+def test_delete_col3(s1):
+    s2 = s1.copy()
     nc = s1.shape[1]
     for i in range(10):
         s1[i, [3, 2, 1]] = 1
@@ -753,8 +738,7 @@ def test_delete_col4():
     assert s1.spsame(s2)
 
 
-def test_delete_col5(setup):
-    s1 = setup.s1
+def test_delete_col5(s1):
     nc = s1.shape[1]
     s1[1, [1, 2, 3]] = 1
     assert s1.nnz == 3
@@ -770,11 +754,9 @@ def test_delete_col5(setup):
     s1.delete_columns(100000, True)
     assert s1.nnz == 1
     assert s1.shape[1] == nc - 1
-    s1.empty()
 
 
-def test_delete_col6(setup):
-    s1 = setup.s1
+def test_delete_col6(s1):
     nc = s1.shape[1]
     for i in range(3):
         s1[i, [1, 2, 3]] = 1
@@ -782,11 +764,9 @@ def test_delete_col6(setup):
     s1.delete_columns(2)
     assert s1.nnz == 6
     assert s1.shape[1] == nc - 1
-    s1.empty()
 
 
-def test_translate_col1(setup):
-    s1 = setup.s1
+def test_translate_col1(s1):
     s1[1, 1] = 1
     s1[1, 2] = 2
     s1[1, 3] = 3
@@ -795,11 +775,9 @@ def test_translate_col1(setup):
     assert s1.nnz == 3
     assert s1[1, 1] == 3
     assert s1[1, 3] == 1
-    s1.empty()
 
 
-def test_translate_col2(setup):
-    s1 = setup.s1
+def test_translate_col2(s1):
     s1[1, 1] = 1
     s1[1, 2] = 2
     s1[1, 3] = 3
@@ -808,11 +786,9 @@ def test_translate_col2(setup):
     assert s1.nnz == 2
     assert s1[1, 3] == 1
     assert s1[1, 1] == 0
-    s1.empty()
 
 
-def test_translate_col3(setup):
-    s1 = setup.s1
+def test_translate_col3(s1):
     for i in range(3):
         s1[i, 1] = 1
         s1[i, 2] = 2
@@ -828,11 +804,9 @@ def test_translate_col3(setup):
     for i in range(3):
         assert s1[i, 1] == 1
         assert s1[i, 3] == 3
-    s1.empty()
 
 
-def test_translate_col4(setup):
-    s1 = setup.s1
+def test_translate_col4(s1):
     nc = s1.shape[1]
     for i in range(3):
         s1[i, 1] = 1
@@ -851,11 +825,9 @@ def test_translate_col4(setup):
     for i in range(3):
         assert s1[i, 1] == 0
         assert s1[i, 3] == 3
-    s1.empty()
 
 
-def test_edges1(setup):
-    s1 = setup.s1
+def test_edges1(s1):
     s1[1, 1] = 1
     s1[1, 2] = 2
     s1[1, 3] = 3
@@ -863,11 +835,9 @@ def test_edges1(setup):
     assert np.all(s1.edges(1, exclude=[1]) == [2, 3])
     assert np.all(s1.edges(1, exclude=2) == [1, 3])
     assert len(s1.edges(2)) == 0
-    s1.empty()
 
 
-def test_nonzero1(setup):
-    s1 = setup.s1
+def test_nonzero1(s1):
     s1[2, 1] = 1
     s1[1, 1] = 1
     s1[1, 2] = 2
@@ -887,11 +857,9 @@ def test_nonzero1(setup):
     r, c = s1.nonzero(rows=[0, 1])
     assert np.all(r == [1, 1, 1])
     assert np.all(c == [1, 2, 3])
-    s1.empty()
 
 
-def test_op1(setup):
-    s1 = setup.s1
+def test_op1(s1):
     for i in range(10):
         j = range(i * 4, i * 4 + 3)
         s1[0, j] = i
@@ -925,11 +893,9 @@ def test_op1(setup):
         for jj in j:
             assert s1[0, jj] == i**2
             assert s1[1, jj] == 0
-    s1.empty()
 
 
-def test_op2(setup):
-    s1 = setup.s1
+def test_op2(s1):
     for i in range(10):
         j = range(i * 4, i * 4 + 3)
         s1[0, j] = i
@@ -996,12 +962,10 @@ def test_op2(setup):
             assert s[0, jj], 2 ** s1[0 == jj]
             assert s1[0, jj] == i
             assert s[1, jj] == 0
-    s1.empty()
 
 
-def test_op_csr(setup):
+def test_op_csr(s1):
     csr = sc.sparse.csr_matrix((10, 100), dtype=np.int32)
-    s1 = setup.s1
     for i in range(10):
         j = range(i + 2)
         s1[0, j] = i
@@ -1063,7 +1027,6 @@ def test_op_csr(setup):
             assert s1[0, jj] == i
             assert s[1, jj] == 0
         assert s[0, 0] == i**2
-    s1.empty()
 
 
 def test_op3():
@@ -1076,6 +1039,7 @@ def test_op3():
     for op in ["add", "sub", "mul", "pow"]:
         func = getattr(S, f"__{op}__")
         s = func(1)
+        # 1 will promote to whatever numpy will cast it to
         assert s.dtype == np.int32
         s = func(1.0)
         assert s.dtype == np.float64
@@ -1142,6 +1106,141 @@ def test_op4():
     assert s.dtype == np.complex128
 
 
+def binary():
+    op = operator
+    ops = [
+        op.mod,
+        op.mul,
+        op.add,
+        op.sub,
+        op.floordiv,
+        op.truediv,
+    ]
+    return ops
+
+
+def binary_int():
+    op = operator
+    return [op.pow]
+
+
+def unary():
+    op = operator
+    ops = [
+        op.abs,
+        op.neg,
+        op.pos,
+    ]
+    return ops
+
+
+@pytest.fixture(scope="module")
+def matrix_sisl_csr():
+    matrix = []
+
+    def add3(m):
+        matrix.append(m)
+        m = m.copy()
+        m.finalize(sort=False)
+        matrix.append(m)
+        m = m.copy()
+        m.finalize(sort=True)
+        matrix.append(m)
+
+    # diagonal
+    m = SparseCSR((10, 80), dtype=np.int32)
+    for i in range(10):
+        m[i, i] = 1
+    add3(m)
+
+    # not completely full (some empty ncol)
+
+    m = SparseCSR((10, 80), dtype=np.int32)
+    m[0, [1, 0]] = [12, 2]
+    m[2, 2] = 1
+    m[4, [4, 0]] = [2, 3]
+    m[5, [5, 0]] = [3, 5]
+    add3(m)
+
+    # all more than 1 coupling, and not sorted
+    m = SparseCSR((10, 80), dtype=np.int32)
+    m[0, [1, 0]] = [11, 0]
+    m[1, [10, 50, 20]] = [14, 20, 43]
+    m[2, [4, 7, 3, 1]] = [2, 5, 4, 10]
+    m[3, [4, 7, 3, 1]] = [2, 5, 4, 10]
+    m[4, [1, 5, 3, 21]] = [2, 5, 4, 10]
+    m[5, [53, 52, 3, 21]] = [31, 6, 7, 12]
+    m[6, [43, 32, 1, 6]] = [3, 6, 7, 16]
+    m[7, [65, 44, 3, 2]] = [3, 6, 73, 31]
+    m[8, [66, 45, 6, 8]] = [4, 3, 12, 357]
+    m[9, [55, 44, 33, 22]] = [4, 3, 11, 27]
+    add3(m)
+    return matrix
+
+
+@pytest.fixture(scope="module")
+def matrix_csr_matrix():
+    matrix = []
+
+    csr_matrix = sc.sparse.csr_matrix
+
+    def add3(m):
+        matrix.append(m)
+        m = m.copy()
+        m.sort_indices()
+        matrix.append(m)
+
+    # diagonal
+    m = csr_matrix((10, 80), dtype=np.int32)
+    for i in range(10):
+        m[i, i] = 1
+    add3(m)
+
+    # not completely full (some empty ncol)
+
+    m = csr_matrix((10, 80), dtype=np.int32)
+    m[0, [1, 0]] = [11, 3]
+    m[2, 2] = 10
+    m[4, [4, 0]] = [22, 40]
+    m[5, [5, 0]] = [33, 51]
+    add3(m)
+
+    # all more than 1 coupling, and not sorted
+    m = csr_matrix((10, 80), dtype=np.int32)
+    m[0, [1, 0]] = [12, 4]
+    m[1, [10, 50, 20]] = [11, 20, 401]
+    m[2, [4, 7, 3, 1]] = [2, 5, 4, 10]
+    m[3, [4, 7, 3, 1]] = [2, 5, 4, 10]
+    m[4, [1, 5, 3, 21]] = [2, 5, 4, 10]
+    m[5, [53, 52, 3, 21]] = [3, 6, 7, 14]
+    m[6, [43, 32, 1, 6]] = [3, 6, 7, 11]
+    m[7, [65, 44, 3, 2]] = [3, 6, 7, 12]
+    m[8, [66, 45, 6, 8]] = [4, 3, 15, 7]
+    m[9, [55, 44, 33, 22]] = [4, 3, 14, 7]
+    add3(m)
+    return matrix
+
+
+@pytest.mark.parametrize("op", binary())
+def test_op_binary(matrix_sisl_csr, matrix_csr_matrix, op):
+    for m in matrix_sisl_csr:
+        mD = m.toarray()[..., 0]
+        if op not in (operator.add, operator.sub):
+            v = op(m, 3)
+            assert np.allclose(op(mD, 3), v.toarray()[..., 0])
+
+        if op not in (operator.truediv, operator.floordiv):
+            for m2 in matrix_sisl_csr:
+                m2D = m2.toarray()[..., 0]
+                v = op(m, m2)
+                assert np.allclose(op(mD, m2D), v.toarray()[..., 0])
+
+            for m2 in matrix_csr_matrix:
+                m2D = m2.toarray()
+                v = op(m, m2)
+                assert np.allclose(op(mD, m2D), v.toarray()[..., 0])
+
+
 def test_op5():
     S1 = SparseCSR((10, 100), dtype=np.int32)
     S2 = SparseCSR((10, 100), dtype=np.int32)
@@ -1232,9 +1331,13 @@ def test_op_numpy_scalar():
     assert isinstance(s, SparseCSR)
     assert s.dtype == np.complex64
 
-    s = np.exp(1j * S)
+    s = 1j * S
     assert isinstance(s, SparseCSR)
-    assert s.dtype == np.complex64
+    assert s.dtype == (1j * np.array([1j], dtype=np.complex64)).dtype
+
+    s = np.exp(s)
+    assert isinstance(s, SparseCSR)
+    assert s.dtype == np.exp(1j * np.array([1j], dtype=np.complex64)).dtype
 
 
 def test_op_sparse_dim():