From d084aebfe043d9a8ef20923ab806017ca36937d5 Mon Sep 17 00:00:00 2001
From: Jake Lishman
Date: Fri, 26 Apr 2024 16:30:07 +0100
Subject: [PATCH] Add Rust-based `SparsePauliOp.to_matrix` and Miri tests (#11388)

* Add Rust-based `SparsePauliOp.to_matrix`

This rewrites the numerical version of `SparsePauliOp.to_matrix` to be written in parallelised Rust, building up the matrices row-by-row rather than converting each contained operator to a matrix individually and summing them.

The new algorithms are completely row-based, which is embarrassingly parallel for dense matrices, and parallelisable with additional copies and cumulative sums in the CSR case.

The two new algorithms are an asymptotic complexity improvement for both dense and sparse matrices over the "sum the individual matrices" version. In the dense case, the cost goes from

    O(4 ** num_qubits * num_ops)

to

    O(4 ** num_qubits + (2 ** num_qubits) * reduced_num_ops)

where the first term is from the zeroing of the matrix, and the second is from updating the elements in-place. `reduced_num_ops` is the number of explicitly stored operations after Pauli-operator uniqueness compaction, so is upper-bounded as `4 ** num_qubits`. (The Pauli compaction goes as `O(num_ops)`, so is irrelevant to the complexity discussion.) The CSR form of the algorithm goes as

    O(2 ** num_qubits * reduced_num_ops * lg(reduced_num_ops))

which (I think! - I didn't fully calculate it) is asymptotically the same as before, but in practice the constant factors and intermediate memory use are *dramatically* reduced, and the new algorithm is threadable with an additional `O(2 ** num_qubits * reduced_num_ops)` intermediate memory overhead (the serial form has only `O(reduced_num_ops)` memory overhead).

The `object`-coefficients form is left as-is to avoid exploding the complexity in Rust space; these objects are already slow and unsuited for high-performance code, so the effects should be minimal.

* Add non-blocking Miri to CI

As we begin to include more `unsafe` code in the Rust-accelerated components, it is becoming more important for us to test these in an undefined-behaviour sanitiser. This is done in a separate CI job because:

- we do not yet know how stable Miri will be, so we don't want to block on it.
- some dependencies need their version-resolution patching to Miri-compatible versions, but we want to run our regular test suites with the same versions of packages we will be building against.

* Parallelise cumulative nnz sum

This parallelises the previously serial-only cumulative sum of the `indptr` array of number of non-zero entries at the end. In practice, I didn't actually see any change in performance from this, but philosophically it feels like the right thing to do.
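To make the row-based construction described above concrete, the dense rule amounts to the following (a rough NumPy sketch of the idea only, using the bit-packed `x_like`/`z_like`/phase-folded `coeffs` form introduced later in the patch, not the actual Rust implementation):

    import numpy as np

    def dense_from_compressed(num_qubits, x_like, z_like, coeffs):
        side = 1 << num_qubits
        out = np.zeros((side, side), dtype=complex)
        for row in range(side):  # embarrassingly parallel over rows
            for x, z, coeff in zip(x_like, z_like, coeffs):
                sign = -1.0 if bin(row & z).count("1") % 2 else 1.0
                out[row, row ^ x] += sign * coeff  # column = row XOR x_like
        return out

Each row touches each stored operator exactly once, which is where the `O((2 ** num_qubits) * reduced_num_ops)` term comes from.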
* Update Miri pin to later version of crossbeam-epohc * Improve error handling and messages * Simplify unnecessary match * Add link to environment variable configuration * Add link to Rayon plumbing README * Add explicit test of serial and parallel modes --- .github/workflows/miri.yml | 46 ++ CONTRIBUTING.md | 52 ++ crates/accelerate/src/lib.rs | 4 + crates/accelerate/src/rayon_ext.rs | 171 +++++ crates/accelerate/src/sparse_pauli_op.rs | 617 +++++++++++++++++- crates/accelerate/src/test.rs | 24 + .../operators/symplectic/sparse_pauli_op.py | 43 +- .../notes/spo-to-matrix-26445a791e24f62a.yaml | 8 + .../symplectic/test_sparse_pauli_op.py | 32 + 9 files changed, 983 insertions(+), 14 deletions(-) create mode 100644 .github/workflows/miri.yml create mode 100644 crates/accelerate/src/rayon_ext.rs create mode 100644 crates/accelerate/src/test.rs create mode 100644 releasenotes/notes/spo-to-matrix-26445a791e24f62a.yaml diff --git a/.github/workflows/miri.yml b/.github/workflows/miri.yml new file mode 100644 index 000000000000..bdceb20c3008 --- /dev/null +++ b/.github/workflows/miri.yml @@ -0,0 +1,46 @@ +name: Miri +on: + push: + pull_request: +concurrency: + group: ${{ github.repository }}-${{ github.ref }}-${{ github.head_ref }}-${{ github.workflow }} + # Only cancel in PR mode. In push mode, don't cancel so we don't see spurious test "failures", + # and we get coverage reports on Coveralls for every push. + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + +jobs: + miri: + if: github.repository_owner == 'Qiskit' + name: Miri + runs-on: ubuntu-latest + env: + RUSTUP_TOOLCHAIN: nightly + + steps: + - uses: actions/checkout@v4 + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@nightly + with: + components: miri + + - name: Prepare Miri + run: | + set -e + # Some of our dependencies aren't Miri-safe with their current release versions. These + # need overriding with known-good versions to run against until the Miri-safe versions are + # released and updated in our Cargo.lock. + cat >>Cargo.toml < bool { let parallel_context = env::var("QISKIT_IN_PARALLEL") diff --git a/crates/accelerate/src/rayon_ext.rs b/crates/accelerate/src/rayon_ext.rs new file mode 100644 index 000000000000..af914a86d414 --- /dev/null +++ b/crates/accelerate/src/rayon_ext.rs @@ -0,0 +1,171 @@ +// This code is part of Qiskit. +// +// (C) Copyright IBM 2023 +// +// This code is licensed under the Apache License, Version 2.0. You may +// obtain a copy of this license in the LICENSE.txt file in the root directory +// of this source tree or at http://www.apache.org/licenses/LICENSE-2.0. +// +// Any modifications or derivative works of this code must retain this +// copyright notice, and modified files need to carry a notice indicating +// that they have been altered from the originals. + +//! Extension structs for use with Rayon parallelism. + +// See https://github.com/rayon-rs/rayon/blob/v1.10.0/src/iter/plumbing/README.md (or a newer +// version) for more of an explanation of how Rayon's plumbing works. + +use rayon::iter::plumbing::*; +use rayon::prelude::*; + +pub trait ParallelSliceMutExt: ParallelSliceMut { + /// Create a parallel iterator over mutable chunks of uneven lengths for this iterator. + /// + /// # Panics + /// + /// Panics if the sums of the given lengths do not add up to the length of the slice. 
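+    ///
+    /// # Example
+    ///
+    /// A rough usage sketch (with this trait in scope; `rayon::prelude` provides the parallel
+    /// iterator methods), splitting a six-element slice into chunks of lengths 2, 1 and 3:
+    ///
+    /// ```ignore
+    /// use rayon::prelude::*;
+    /// let mut data = vec![0_u32; 6];
+    /// data.par_uneven_chunks_mut(&[2, 1, 3])
+    ///     .enumerate()
+    ///     .for_each(|(i, chunk)| chunk.fill(i as u32));
+    /// assert_eq!(data, [0, 0, 1, 2, 2, 2]);
+    /// ```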
+ #[track_caller] + fn par_uneven_chunks_mut<'len, 'data>( + &'data mut self, + chunk_lengths: &'len [usize], + ) -> ParUnevenChunksMut<'len, 'data, T> { + let mut_slice = self.as_parallel_slice_mut(); + let chunk_sum = chunk_lengths.iter().sum::(); + let slice_len = mut_slice.len(); + if chunk_sum != slice_len { + panic!("given slices of total size {chunk_sum} for a chunk of length {slice_len}"); + } + ParUnevenChunksMut { + chunk_lengths, + data: mut_slice, + } + } +} + +impl ParallelSliceMutExt for S where S: ParallelSliceMut {} + +/// Very similar to Rayon's [rayon::slice::ChunksMut], except that the lengths of the individual +/// chunks are arbitrary, provided they sum to the total length of the slice. +#[derive(Debug)] +pub struct ParUnevenChunksMut<'len, 'data, T> { + chunk_lengths: &'len [usize], + data: &'data mut [T], +} + +impl<'len, 'data, T: Send + 'data> ParallelIterator for ParUnevenChunksMut<'len, 'data, T> { + type Item = &'data mut [T]; + + #[track_caller] + fn drive_unindexed>(self, consumer: C) -> C::Result { + bridge(self, consumer) + } +} + +impl<'len, 'data, T: Send + 'data> IndexedParallelIterator for ParUnevenChunksMut<'len, 'data, T> { + #[track_caller] + fn drive>(self, consumer: C) -> C::Result { + bridge(self, consumer) + } + + fn len(&self) -> usize { + self.chunk_lengths.len() + } + + #[track_caller] + fn with_producer>(self, callback: CB) -> CB::Output { + callback.callback(UnevenChunksMutProducer { + chunk_lengths: self.chunk_lengths, + data: self.data, + }) + } +} + +struct UnevenChunksMutProducer<'len, 'data, T: Send> { + chunk_lengths: &'len [usize], + data: &'data mut [T], +} + +impl<'len, 'data, T: Send + 'data> Producer for UnevenChunksMutProducer<'len, 'data, T> { + type Item = &'data mut [T]; + type IntoIter = UnevenChunksMutIter<'len, 'data, T>; + + fn into_iter(self) -> Self::IntoIter { + Self::IntoIter::new(self.chunk_lengths, self.data) + } + + #[track_caller] + fn split_at(self, index: usize) -> (Self, Self) { + // Technically quadratic for a full-depth split, but let's worry about that later if needed. + let data_mid = self.chunk_lengths[..index].iter().sum(); + let (chunks_left, chunks_right) = self.chunk_lengths.split_at(index); + let (data_left, data_right) = self.data.split_at_mut(data_mid); + ( + Self { + chunk_lengths: chunks_left, + data: data_left, + }, + Self { + chunk_lengths: chunks_right, + data: data_right, + }, + ) + } +} + +#[must_use = "iterators do nothing unless consumed"] +struct UnevenChunksMutIter<'len, 'data, T> { + chunk_lengths: &'len [usize], + // The extra `Option` wrapper here is to satisfy the borrow checker while we're splitting the + // `data` reference. We need to consume `self`'s reference during the split before replacing + // it, which means we need to temporarily set the `data` ref to some unowned value. + // `Option<&mut [T]>` means we can replace it temporarily with the null reference, ensuring the + // mutable aliasing rules are always upheld. 
+ data: Option<&'data mut [T]>, +} + +impl<'len, 'data, T> UnevenChunksMutIter<'len, 'data, T> { + fn new(chunk_lengths: &'len [usize], data: &'data mut [T]) -> Self { + Self { + chunk_lengths, + data: Some(data), + } + } +} + +impl<'len, 'data, T> Iterator for UnevenChunksMutIter<'len, 'data, T> { + type Item = &'data mut [T]; + + #[track_caller] + fn next(&mut self) -> Option { + if self.chunk_lengths.is_empty() { + return None; + } + let (out_data, rem_data) = self + .data + .take() + .unwrap() + .split_at_mut(self.chunk_lengths[0]); + self.chunk_lengths = &self.chunk_lengths[1..]; + self.data = Some(rem_data); + Some(out_data) + } + + fn size_hint(&self) -> (usize, Option) { + (self.chunk_lengths.len(), Some(self.chunk_lengths.len())) + } +} +impl<'len, 'data, T> ExactSizeIterator for UnevenChunksMutIter<'len, 'data, T> {} +impl<'len, 'data, T> DoubleEndedIterator for UnevenChunksMutIter<'len, 'data, T> { + #[track_caller] + fn next_back(&mut self) -> Option { + if self.chunk_lengths.is_empty() { + return None; + } + let pos = self.chunk_lengths.len() - 1; + let data_pos = self.data.as_ref().map(|x| x.len()).unwrap() - self.chunk_lengths[pos]; + let (rem_data, out_data) = self.data.take().unwrap().split_at_mut(data_pos); + self.chunk_lengths = &self.chunk_lengths[..pos]; + self.data = Some(rem_data); + Some(out_data) + } +} diff --git a/crates/accelerate/src/sparse_pauli_op.rs b/crates/accelerate/src/sparse_pauli_op.rs index c8e0f0fb9316..5d6a82df7940 100644 --- a/crates/accelerate/src/sparse_pauli_op.rs +++ b/crates/accelerate/src/sparse_pauli_op.rs @@ -10,16 +10,22 @@ // copyright notice, and modified files need to carry a notice indicating // that they have been altered from the originals. -use pyo3::exceptions::PyValueError; +use pyo3::exceptions::{PyRuntimeError, PyValueError}; use pyo3::prelude::*; +use pyo3::types::PyTuple; use pyo3::wrap_pyfunction; use pyo3::Python; +use numpy::prelude::*; +use numpy::{PyArray1, PyArray2, PyReadonlyArray1, PyReadonlyArray2, PyUntypedArrayMethods}; + use hashbrown::HashMap; use ndarray::{s, Array1, Array2, ArrayView1, ArrayView2, Axis}; use num_complex::Complex64; use num_traits::Zero; -use numpy::{IntoPyArray, PyArray1, PyArray2, PyReadonlyArray2, PyUntypedArrayMethods}; +use rayon::prelude::*; + +use crate::rayon_ext::*; /// Find the unique elements of an array. /// @@ -71,9 +77,49 @@ enum Pauli { Z, } +/// Pack a 2D array of Booleans into a given width. Returns an error if the input array is +/// too large to be packed into u64. +fn pack_bits(bool_arr: ArrayView2) -> Result, ()> { + let num_qubits = bool_arr.shape()[1]; + if num_qubits > (u64::BITS as usize) { + return Err(()); + } + let slack = num_qubits % 8; + let pack_row = |row: ArrayView1| -> u64 { + let mut val: u64 = 0; + let mut shift = 0; + for chunk in row.exact_chunks(8) { + val |= ((chunk[0] as u8 + | ((chunk[1] as u8) << 1) + | ((chunk[2] as u8) << 2) + | ((chunk[3] as u8) << 3) + | ((chunk[4] as u8) << 4) + | ((chunk[5] as u8) << 5) + | ((chunk[6] as u8) << 6) + | ((chunk[7] as u8) << 7)) as u64) + << shift; + shift += 8; + } + if slack > 0 { + for (i, b) in row + .slice(s![num_qubits - slack..num_qubits]) + .iter() + .enumerate() + { + val |= (*b as u64) << (shift + i); + } + } + val + }; + Ok(bool_arr + .axis_iter(Axis(0)) + .map(pack_row) + .collect::>()) +} + /// A complete ZX-convention representation of a Pauli decomposition. This is all the components /// necessary to construct a Qiskit-space :class:`.SparsePauliOp`, where :attr:`phases` is in the -/// ZX convention. 
+/// ZX convention. This class is just meant for interoperation between Rust and Python. #[pyclass(module = "qiskit._accelerate.sparse_pauli_op")] pub struct ZXPaulis { #[pyo3(get)] @@ -86,6 +132,196 @@ pub struct ZXPaulis { pub coeffs: Py>, } +#[pymethods] +impl ZXPaulis { + #[new] + fn __new__( + x: &Bound>, + z: &Bound>, + phases: &Bound>, + coeffs: &Bound>, + ) -> PyResult { + let &[num_ops, num_qubits] = x.shape() else { unreachable!("PyArray2 must be 2D") }; + if z.shape() != [num_ops, num_qubits] { + return Err(PyValueError::new_err(format!( + "'x' and 'z' have different shapes: {:?} and {:?}", + [num_ops, num_qubits], + z.shape() + ))); + } + if phases.len() != num_ops || coeffs.len() != num_ops { + return Err(PyValueError::new_err(format!( + "mismatched dimensions: 'x' and 'z' have {} operator(s), 'phase' has {} and 'coeffs' has {}", + num_ops, + phases.len(), + coeffs.len(), + ))); + } + + Ok(Self { + x: x.to_owned().unbind(), + z: z.to_owned().unbind(), + phases: phases.to_owned().unbind(), + coeffs: coeffs.to_owned().unbind(), + }) + } +} + +impl ZXPaulis { + /// Attempt to acquire a Rust-enforced Rust-only immutable borrow onto the underlying + /// Python-space data. This returns `None` if any of the underlying arrays already has a + /// mutable borrow taken out onto it. + pub fn try_readonly<'a, 'py>(&'a self, py: Python<'py>) -> Option> + where + 'a: 'py, + { + Some(ZXPaulisReadonly { + x: self.x.bind(py).try_readonly().ok()?, + z: self.z.bind(py).try_readonly().ok()?, + phases: self.phases.bind(py).try_readonly().ok()?, + coeffs: self.coeffs.bind(py).try_readonly().ok()?, + }) + } +} + +/// Intermediate structure that represents readonly views onto the Python-space sparse Pauli data. +/// This is used in the chained methods so that the syntactical temporary lifetime extension can +/// occur; we can't have the readonly array temporaries only live within a method that returns +/// [ZXPaulisView], because otherwise the lifetimes of the [PyReadonlyArray] elements will be too +/// short. +pub struct ZXPaulisReadonly<'a> { + x: PyReadonlyArray2<'a, bool>, + z: PyReadonlyArray2<'a, bool>, + phases: PyReadonlyArray1<'a, u8>, + coeffs: PyReadonlyArray1<'a, Complex64>, +} + +impl ZXPaulisReadonly<'_> { + /// Get a [ndarray] view of the data of these [rust-numpy] objects. + fn as_array(&self) -> ZXPaulisView { + ZXPaulisView { + x: self.x.as_array(), + z: self.z.as_array(), + phases: self.phases.as_array(), + coeffs: self.coeffs.as_array(), + } + } +} + +/// Intermediate structure that represents [ndarray] views onto the Python-space sparse Pauli data +/// in the ZX convention. This can be used directly by Rust methods if desired, or bit-packed into +/// a matrix-representation format [MatrixCompressedPaulis] using the [compress] method. +pub struct ZXPaulisView<'py> { + x: ArrayView2<'py, bool>, + z: ArrayView2<'py, bool>, + phases: ArrayView1<'py, u8>, + coeffs: ArrayView1<'py, Complex64>, +} + +impl<'py> ZXPaulisView<'py> { + /// The number of qubits this operator acts on. + pub fn num_qubits(&self) -> usize { + self.x.shape()[1] + } + + /// Convert the ZX representation into a bitpacked internal representation. See the + /// documentation of [MatrixCompressedPaulis] for details of the changes to the Pauli + /// convention and representation. 
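+    ///
+    /// Equivalently, each coefficient is multiplied by `(-i) ** ((phase + num_y) % 4)`, where
+    /// `num_y` counts the qubits with both `x` and `z` set.  As a quick illustrative check: a
+    /// single-qubit term `c * Y` has `x = 1`, `z = 1` and (group) `phase = 0`, so its stored
+    /// coefficient becomes `-i * c`; the placement rule documented on [MatrixCompressedPaulis]
+    /// then puts `-i * c` at `(0, 1)` and `+i * c` at `(1, 0)`, which is exactly `c * Y`.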
+ pub fn matrix_compress(&self) -> PyResult { + let num_qubits = self.num_qubits(); + // This is obviously way too big for a dense operator, and SciPy limits us to using `i64` + // for the index and indptr types, so (except for some synthetic cases), it's not possible + // for us to work with a larger matrix than this. + if num_qubits > 63 { + return Err(PyValueError::new_err(format!( + "{num_qubits} is too many qubits to convert to a matrix" + ))); + } + if num_qubits == 0 { + return Ok(MatrixCompressedPaulis { + num_qubits: 0, + x_like: Vec::new(), + z_like: Vec::new(), + coeffs: self.coeffs.to_vec(), + }); + } + let x_like = pack_bits(self.x).expect("x should already be validated as <64 qubits"); + let z_like = pack_bits(self.z).expect("z should already be validated as <64 qubits"); + let coeffs = x_like + .iter() + .zip(z_like.iter()) + .zip(self.phases.iter().zip(self.coeffs.iter())) + .map(|((xs, zs), (&phase, &coeff))| { + let ys = (xs & zs).count_ones(); + match (phase as u32 + ys) % 4 { + 0 => coeff, + 1 => Complex64::new(coeff.im, -coeff.re), + 2 => Complex64::new(-coeff.re, -coeff.im), + 3 => Complex64::new(-coeff.im, coeff.re), + _ => unreachable!(), + } + }) + .collect::>(); + Ok(MatrixCompressedPaulis { + num_qubits: num_qubits as u8, + x_like, + z_like, + coeffs, + }) + } +} + +/// Temporary bit-compressed storage of the Pauli string. The [coeffs] are reinterpreted to +/// include the old `phase` component in them directly, plus the factors of `-i` stemming from `Y` +/// components. The result is that the [coeffs] now more directly represent entries in a matrix, +/// while [x_like] and [z_like] are no longer direct measures of `X` and `Z` elements (as in the ZX +/// convention), but are instead only a marker of the column and parity respectively. +/// +/// In other words, `row_num ^ x_like` gives the column number of an element, while +/// `(row_num & z_like).count_ones()` counts multiplicative factors of `-1` to be applied to +/// `coeff` when placing it at `(row_num, col_num)` in an output matrix. +pub struct MatrixCompressedPaulis { + num_qubits: u8, + x_like: Vec, + z_like: Vec, + coeffs: Vec, +} + +impl MatrixCompressedPaulis { + /// The number of qubits this operator acts on. + pub fn num_qubits(&self) -> usize { + self.num_qubits as usize + } + + /// The number of explicitly stored operators in the sum. + pub fn num_ops(&self) -> usize { + self.coeffs.len() + } + + /// Sum coefficients that correspond to the same Pauli operator; this reduces the number of + /// explicitly stored operations, if there are duplicates. After the summation, any terms that + /// have become zero are dropped. + pub fn combine(&mut self) { + let mut hash_table = HashMap::<(u64, u64), Complex64>::with_capacity(self.coeffs.len()); + for (key, coeff) in self + .x_like + .drain(..) + .zip(self.z_like.drain(..)) + .zip(self.coeffs.drain(..)) + { + *hash_table.entry(key).or_insert(Complex64::new(0.0, 0.0)) += coeff; + } + for ((x, z), coeff) in hash_table { + if coeff == Complex64::new(0.0, 0.0) { + continue; + } + self.x_like.push(x); + self.z_like.push(z); + self.coeffs.push(coeff); + } + } +} + /// Decompose a dense complex operator into the symplectic Pauli representation in the /// ZX-convention. /// @@ -257,10 +493,385 @@ fn decompose_dense_inner( ); } +/// Convert the given [ZXPaulis] object to a dense 2D Numpy matrix. 
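+///
+/// The output has shape `(2 ** num_qubits, 2 ** num_qubits)` in C order.  Unless `force_serial`
+/// is set, the row construction is parallelised according to
+/// [crate::getenv_use_multiple_threads].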
+#[pyfunction] +#[pyo3(signature = (/, paulis, force_serial=false))] +pub fn to_matrix_dense<'py>( + py: Python<'py>, + paulis: &ZXPaulis, + force_serial: bool, +) -> PyResult>> { + let paulis_readonly = paulis + .try_readonly(py) + .ok_or_else(|| PyRuntimeError::new_err("could not produce a safe view onto the data"))?; + let mut paulis = paulis_readonly.as_array().matrix_compress()?; + paulis.combine(); + let side = 1usize << paulis.num_qubits(); + let parallel = !force_serial && crate::getenv_use_multiple_threads(); + let out = to_matrix_dense_inner(&paulis, parallel); + PyArray1::from_vec_bound(py, out).reshape([side, side]) +} + +/// Inner worker of the Python-exposed [to_matrix_dense]. This is separate primarily to allow +/// Rust-space unit testing even if Python isn't available for execution. This returns a C-ordered +/// [Vec] of the 2D matrix. +fn to_matrix_dense_inner(paulis: &MatrixCompressedPaulis, parallel: bool) -> Vec { + let side = 1usize << paulis.num_qubits(); + #[allow(clippy::uninit_vec)] + let mut out = { + let mut out = Vec::with_capacity(side * side); + // SAFETY: we iterate through the vec in chunks of `side`, and start each row by filling it + // with zeros before ever reading from it. It's fine to overwrite the uninitialised memory + // because `Complex64: !Drop`. + unsafe { out.set_len(side * side) }; + out + }; + let write_row = |(i_row, row): (usize, &mut [Complex64])| { + // Doing the initialisation here means that when we're in parallel contexts, we do the + // zeroing across the whole threadpool. This also seems to give a speed-up in serial + // contexts, but I don't understand that. ---Jake + row.fill(Complex64::new(0.0, 0.0)); + for ((&x_like, &z_like), &coeff) in paulis + .x_like + .iter() + .zip(paulis.z_like.iter()) + .zip(paulis.coeffs.iter()) + { + // Technically this discards part of the storable data, but in practice, a dense + // operator with more than 32 qubits needs in the region of 1 ZiB memory. We still use + // `u64` to help sparse-matrix construction, though. + let coeff = if (i_row as u32 & z_like as u32).count_ones() % 2 == 0 { + coeff + } else { + -coeff + }; + row[i_row ^ (x_like as usize)] += coeff; + } + }; + if parallel { + out.par_chunks_mut(side).enumerate().for_each(write_row); + } else { + out.chunks_mut(side).enumerate().for_each(write_row); + } + out +} + +type CSRData = (Vec, Vec, Vec); +type ToCSRData = fn(&MatrixCompressedPaulis) -> CSRData; + +/// Convert the given [ZXPaulis] object to the three-array CSR form. The output type of the +/// `indices` and `indptr` matrices will be `i32` if that type is guaranteed to be able to hold the +/// number of non-zeros, otherwise it will be `i64`. Signed types are used to match Scipy. `i32` +/// is preferentially returned, because Scipy will downcast to this on `csr_matrix` construction if +/// all array elements would fit. For large operators with significant cancellation, it is +/// possible that `i64` will be returned when `i32` would suffice, but this will not cause +/// unsoundness, just a copy overhead when constructing the Scipy matrix. 
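+///
+/// As a rough worked example (illustrative numbers only): for 16 qubits and 100 stored operators
+/// the bound is `min(100, 2 ** 15) * 2 ** 16 = 6_553_600`, comfortably within `i32::MAX`, so
+/// 32-bit indices are returned; the same 100 operators on 28 qubits would exceed the bound and
+/// fall back to `i64`.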
+#[pyfunction] +#[pyo3(signature = (/, paulis, force_serial=false))] +pub fn to_matrix_sparse( + py: Python, + paulis: &ZXPaulis, + force_serial: bool, +) -> PyResult> { + let paulis_readonly = paulis + .try_readonly(py) + .ok_or_else(|| PyRuntimeError::new_err("could not produce a safe view onto the data"))?; + let mut paulis = paulis_readonly.as_array().matrix_compress()?; + paulis.combine(); + + // This deliberately erases the Rust types in the output so we can return either 32- or 64-bit + // indices as appropriate without breaking Rust's typing. + fn to_py_tuple(py: Python, csr_data: CSRData) -> Py + where + T: numpy::Element, + { + let (values, indices, indptr) = csr_data; + ( + PyArray1::from_vec_bound(py, values), + PyArray1::from_vec_bound(py, indices), + PyArray1::from_vec_bound(py, indptr), + ) + .into_py(py) + } + + // Pessimistic estimation of whether we can fit in `i32`. If there's any risk of overflowing + // `i32`, we use `i64`, but Scipy will always try to downcast to `i32`, so we try to match it. + let max_entries_per_row = (paulis.num_ops() as u64).min(1u64 << (paulis.num_qubits() - 1)); + let use_32_bit = + max_entries_per_row.saturating_mul(1u64 << paulis.num_qubits()) <= (i32::MAX as u64); + if use_32_bit { + let to_sparse: ToCSRData = if crate::getenv_use_multiple_threads() && !force_serial { + to_matrix_sparse_parallel_32 + } else { + to_matrix_sparse_serial_32 + }; + Ok(to_py_tuple(py, to_sparse(&paulis))) + } else { + let to_sparse: ToCSRData = if crate::getenv_use_multiple_threads() && !force_serial { + to_matrix_sparse_parallel_64 + } else { + to_matrix_sparse_serial_64 + }; + Ok(to_py_tuple(py, to_sparse(&paulis))) + } +} + +/// Copy several slices into a single flat vec, in parallel. Allocates a temporary `Vec` of +/// the same length as the input slice to track the chunking. +fn copy_flat_parallel(slices: &[U]) -> Vec +where + T: Copy + Send + Sync, + U: AsRef<[T]> + Sync, +{ + let lens = slices + .iter() + .map(|slice| slice.as_ref().len()) + .collect::>(); + let size = lens.iter().sum(); + #[allow(clippy::uninit_vec)] + let mut out = { + let mut out = Vec::with_capacity(size); + // SAFETY: we've just calculated that the lengths of the given vecs add up to the right + // thing, and we're about to copy in the data from each of them into this uninitialised + // array. It's guaranteed safe to write `T` to the uninitialised space, because `Copy` + // implies `!Drop`. + unsafe { out.set_len(size) }; + out + }; + out.par_uneven_chunks_mut(&lens) + .zip(slices.par_iter().map(|x| x.as_ref())) + .for_each(|(out_slice, in_slice)| out_slice.copy_from_slice(in_slice)); + out +} + +macro_rules! impl_to_matrix_sparse { + ($serial_fn:ident, $parallel_fn:ident, $int_ty:ty, $uint_ty:ty $(,)?) => { + /// Build CSR data arrays for the matrix-compressed set of the Pauli operators, using a + /// completely serial strategy. 
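+        ///
+        /// Per row, the strategy below is: sort the terms by their target column
+        /// (`i_row ^ x_like`), then sweep them once with a running sum, emitting a value/index
+        /// pair whenever the column changes.  Each row's entries therefore come out already in
+        /// ascending column order, matching the canonical CSR layout.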
+ fn $serial_fn(paulis: &MatrixCompressedPaulis) -> CSRData<$int_ty> { + let side = 1 << paulis.num_qubits(); + let num_ops = paulis.num_ops(); + if num_ops == 0 { + return (vec![], vec![], vec![0; side + 1]); + } + + let mut order = (0..num_ops).collect::>(); + let mut values = Vec::::with_capacity(side * (num_ops + 1) / 2); + let mut indices = Vec::<$int_ty>::with_capacity(side * (num_ops + 1) / 2); + let mut indptr: Vec<$int_ty> = vec![0; side + 1]; + let mut nnz = 0; + for i_row in 0..side { + order.sort_unstable_by(|&a, &b| { + ((i_row as $uint_ty) ^ (paulis.x_like[a] as $uint_ty)) + .cmp(&((i_row as $uint_ty) ^ (paulis.x_like[b] as $uint_ty))) + }); + let mut running = Complex64::new(0.0, 0.0); + let mut prev_index = i_row ^ (paulis.x_like[order[0]] as usize); + for (x_like, z_like, coeff) in order + .iter() + .map(|&i| (paulis.x_like[i], paulis.z_like[i], paulis.coeffs[i])) + { + let coeff = + if ((i_row as $uint_ty) & (z_like as $uint_ty)).count_ones() % 2 == 0 { + coeff + } else { + -coeff + }; + let index = i_row ^ (x_like as usize); + if index == prev_index { + running += coeff; + } else { + nnz += 1; + values.push(running); + indices.push(prev_index as $int_ty); + running = coeff; + prev_index = index; + } + } + nnz += 1; + values.push(running); + indices.push(prev_index as $int_ty); + indptr[i_row + 1] = nnz; + } + (values, indices, indptr) + } + + /// Build CSR data arrays for the matrix-compressed set of the Pauli operators, using a + /// parallel strategy. This involves more data copying than the serial form, so there is a + /// nontrivial amount of parallel overhead. + fn $parallel_fn(paulis: &MatrixCompressedPaulis) -> CSRData<$int_ty> { + let side = 1 << paulis.num_qubits(); + let num_ops = paulis.num_ops(); + if num_ops == 0 { + return (vec![], vec![], vec![0; side + 1]); + } + + let mut indptr = Vec::<$int_ty>::with_capacity(side + 1); + indptr.push(0); + // SAFETY: we allocate the space for the `indptr` array here, then each thread writes + // in the number of nonzero entries for each row it was responsible for. We know ahead + // of time exactly how many entries we need (one per row, plus an explicit 0 to start). + // It's also important that `$int_ty` does not implement `Drop`, since otherwise it + // will be called on uninitialised memory (all primitive int types satisfy this). + unsafe { + indptr.set_len(side + 1); + } + + // The parallel overhead from splitting a subtask is fairly high (allocating and + // potentially growing a couple of vecs), so we're trading off some of Rayon's ability + // to keep threads busy by subdivision with minimising overhead; we're setting the + // chunk size such that the iterator will have as many elements as there are threads. + let num_threads = rayon::current_num_threads(); + let chunk_size = (side + num_threads - 1) / num_threads; + let mut values_chunks = Vec::with_capacity(num_threads); + let mut indices_chunks = Vec::with_capacity(num_threads); + // SAFETY: the slice here is uninitialised data; it must not be read. + indptr[1..] + .par_chunks_mut(chunk_size) + .enumerate() + .map(|(i, indptr_chunk)| { + let start = chunk_size * i; + let end = (chunk_size * (i + 1)).min(side); + let mut order = (0..num_ops).collect::>(); + // Since we compressed the Paulis by summing equal elements, we're + // lower-bounded on the number of elements per row by this value, up to + // cancellations. This should be a reasonable trade-off between sometimes + // expandin the vector and overallocation. 
+ let mut values = + Vec::::with_capacity(chunk_size * (num_ops + 1) / 2); + let mut indices = Vec::<$int_ty>::with_capacity(chunk_size * (num_ops + 1) / 2); + let mut nnz = 0; + for i_row in start..end { + order.sort_unstable_by(|&a, &b| { + (i_row as $uint_ty ^ paulis.x_like[a] as $uint_ty) + .cmp(&(i_row as $uint_ty ^ paulis.x_like[b] as $uint_ty)) + }); + let mut running = Complex64::new(0.0, 0.0); + let mut prev_index = i_row ^ (paulis.x_like[order[0]] as usize); + for (x_like, z_like, coeff) in order + .iter() + .map(|&i| (paulis.x_like[i], paulis.z_like[i], paulis.coeffs[i])) + { + let coeff = + if (i_row as $uint_ty & z_like as $uint_ty).count_ones() % 2 == 0 { + coeff + } else { + -coeff + }; + let index = i_row ^ (x_like as usize); + if index == prev_index { + running += coeff; + } else { + nnz += 1; + values.push(running); + indices.push(prev_index as $int_ty); + running = coeff; + prev_index = index; + } + } + nnz += 1; + values.push(running); + indices.push(prev_index as $int_ty); + // When we write it, this is a cumulative `nnz` _within the chunk_. We + // turn that into a proper cumulative sum in serial afterwards. + indptr_chunk[i_row - start] = nnz; + } + (values, indices) + }) + .unzip_into_vecs(&mut values_chunks, &mut indices_chunks); + // Turn the chunkwise nnz counts into absolute nnz counts. + let mut start_nnz = 0usize; + let chunk_nnz = values_chunks + .iter() + .map(|chunk| { + let prev = start_nnz; + start_nnz += chunk.len(); + prev as $int_ty + }) + .collect::>(); + indptr[1..] + .par_chunks_mut(chunk_size) + .zip(chunk_nnz) + .for_each(|(indptr_chunk, start_nnz)| { + indptr_chunk.iter_mut().for_each(|nnz| *nnz += start_nnz); + }); + // Concatenate the chunkwise values and indices togther. + let values = copy_flat_parallel(&values_chunks); + let indices = copy_flat_parallel(&indices_chunks); + (values, indices, indptr) + } + }; +} + +impl_to_matrix_sparse!( + to_matrix_sparse_serial_32, + to_matrix_sparse_parallel_32, + i32, + u32 +); +impl_to_matrix_sparse!( + to_matrix_sparse_serial_64, + to_matrix_sparse_parallel_64, + i64, + u64 +); + #[pymodule] pub fn sparse_pauli_op(m: &Bound) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(unordered_unique))?; m.add_wrapped(wrap_pyfunction!(decompose_dense))?; + m.add_wrapped(wrap_pyfunction!(to_matrix_dense))?; + m.add_wrapped(wrap_pyfunction!(to_matrix_sparse))?; m.add_class::()?; Ok(()) } + +#[cfg(test)] +mod tests { + use super::*; + use crate::test::*; + + // The purpose of these tests is more about exercising the `unsafe` code; we test for full + // correctness from Python space. + + fn example_paulis() -> MatrixCompressedPaulis { + MatrixCompressedPaulis { + num_qubits: 4, + x_like: vec![0b0000, 0b0001, 0b0010, 0b1100, 0b1010, 0b0000], + z_like: vec![0b1000, 0b0110, 0b1001, 0b0100, 0b1010, 0b1000], + // Deliberately using multiples of small powers of two so the floating-point addition + // of them is associative. 
+ coeffs: vec![ + Complex64::new(0.25, 0.5), + Complex64::new(0.125, 0.25), + Complex64::new(0.375, 0.125), + Complex64::new(-0.375, 0.0625), + Complex64::new(-0.5, -0.25), + ], + } + } + + #[test] + fn dense_threaded_and_serial_equal() { + let paulis = example_paulis(); + let parallel = in_scoped_thread_pool(|| to_matrix_dense_inner(&paulis, true)).unwrap(); + let serial = to_matrix_dense_inner(&paulis, false); + assert_eq!(parallel, serial); + } + + #[test] + fn sparse_threaded_and_serial_equal_32() { + let paulis = example_paulis(); + let parallel = in_scoped_thread_pool(|| to_matrix_sparse_parallel_32(&paulis)).unwrap(); + let serial = to_matrix_sparse_serial_32(&paulis); + assert_eq!(parallel, serial); + } + + #[test] + fn sparse_threaded_and_serial_equal_64() { + let paulis = example_paulis(); + let parallel = in_scoped_thread_pool(|| to_matrix_sparse_parallel_64(&paulis)).unwrap(); + let serial = to_matrix_sparse_serial_64(&paulis); + assert_eq!(parallel, serial); + } +} diff --git a/crates/accelerate/src/test.rs b/crates/accelerate/src/test.rs new file mode 100644 index 000000000000..dac51499202b --- /dev/null +++ b/crates/accelerate/src/test.rs @@ -0,0 +1,24 @@ +// This code is part of Qiskit. +// +// (C) Copyright IBM 2023 +// +// This code is licensed under the Apache License, Version 2.0. You may +// obtain a copy of this license in the LICENSE.txt file in the root directory +// of this source tree or at http://www.apache.org/licenses/LICENSE-2.0. +// +// Any modifications or derivative works of this code must retain this +// copyright notice, and modified files need to carry a notice indicating +// that they have been altered from the originals. + +/// Helper for tests that involve calling Rayon code from within Miri. This runs the given +/// function in a scoped threadpool, which is then immediately dropped. This means that Miri will +/// not complain about the global (static) threads that are not joined when the process exits, +/// which is deliberate. +pub fn in_scoped_thread_pool(worker: F) -> Result +where + T: Send, + F: FnOnce() -> T + Send, +{ + ::rayon::ThreadPoolBuilder::new() + .build_scoped(::rayon::ThreadBuilder::run, |pool| pool.install(worker)) +} diff --git a/qiskit/quantum_info/operators/symplectic/sparse_pauli_op.py b/qiskit/quantum_info/operators/symplectic/sparse_pauli_op.py index dc445509b0e3..dffe5b2396b2 100644 --- a/qiskit/quantum_info/operators/symplectic/sparse_pauli_op.py +++ b/qiskit/quantum_info/operators/symplectic/sparse_pauli_op.py @@ -23,7 +23,13 @@ import numpy as np import rustworkx as rx -from qiskit._accelerate.sparse_pauli_op import unordered_unique, decompose_dense +from qiskit._accelerate.sparse_pauli_op import ( + ZXPaulis, + decompose_dense, + to_matrix_dense, + to_matrix_sparse, + unordered_unique, +) from qiskit.circuit.parameter import Parameter from qiskit.circuit.parameterexpression import ParameterExpression from qiskit.circuit.parametertable import ParameterView @@ -925,24 +931,39 @@ def to_list(self, array: bool = False): return labels return labels.tolist() - def to_matrix(self, sparse: bool = False) -> np.ndarray: + def to_matrix(self, sparse: bool = False, force_serial: bool = False) -> np.ndarray: """Convert to a dense or sparse matrix. Args: - sparse (bool): if True return a sparse CSR matrix, otherwise - return dense Numpy array (Default: False). + sparse: if ``True`` return a sparse CSR matrix, otherwise return dense Numpy + array (the default). 
+ force_serial: if ``True``, use an unthreaded implementation, regardless of the state of + the `Qiskit threading-control environment variables + `__. + By default, this will use threaded parallelism over the available CPUs. Returns: array: A dense matrix if `sparse=False`. csr_matrix: A sparse matrix in CSR format if `sparse=True`. """ - mat = None - for i in self.matrix_iter(sparse=sparse): - if mat is None: - mat = i - else: - mat += i - return mat + if self.coeffs.dtype == object: + # Fallback to slow Python-space method. + return sum(self.matrix_iter(sparse=sparse)) + + pauli_list = self.paulis + zx = ZXPaulis( + pauli_list.x.astype(np.bool_), + pauli_list.z.astype(np.bool_), + pauli_list.phase.astype(np.uint8), + self.coeffs.astype(np.complex128), + ) + if sparse: + from scipy.sparse import csr_matrix + + data, indices, indptr = to_matrix_sparse(zx, force_serial=force_serial) + side = 1 << self.num_qubits + return csr_matrix((data, indices, indptr), shape=(side, side)) + return to_matrix_dense(zx, force_serial=force_serial) def to_operator(self) -> Operator: """Convert to a matrix Operator object""" diff --git a/releasenotes/notes/spo-to-matrix-26445a791e24f62a.yaml b/releasenotes/notes/spo-to-matrix-26445a791e24f62a.yaml new file mode 100644 index 000000000000..135e83ef99b1 --- /dev/null +++ b/releasenotes/notes/spo-to-matrix-26445a791e24f62a.yaml @@ -0,0 +1,8 @@ +--- +features: + - | + The performance of :meth:`.SparsePauliOp.to_matrix` has been greatly improved for both dense and + sparse forms. By default, both will now take advantage of threaded parallelism available on + your system, subject to the ``RAYON_NUM_THREADS`` environment variable. You can temporarily + force serial execution using the new ``force_serial`` Boolean argument to + :meth:`~.SparsePauliOp.to_matrix`. diff --git a/test/python/quantum_info/operators/symplectic/test_sparse_pauli_op.py b/test/python/quantum_info/operators/symplectic/test_sparse_pauli_op.py index fdbfb4d4201d..330fd53bc35d 100644 --- a/test/python/quantum_info/operators/symplectic/test_sparse_pauli_op.py +++ b/test/python/quantum_info/operators/symplectic/test_sparse_pauli_op.py @@ -15,9 +15,11 @@ import itertools as it import unittest import numpy as np +import scipy.sparse import rustworkx as rx from ddt import ddt + from qiskit import QiskitError from qiskit.circuit import ParameterExpression, Parameter, ParameterVector from qiskit.circuit.parametertable import ParameterView @@ -259,6 +261,36 @@ def test_to_matrix_large(self): np.testing.assert_array_equal(spp_op.to_matrix(), target) np.testing.assert_array_equal(spp_op.to_matrix(sparse=True).toarray(), target) + def test_to_matrix_zero(self): + """Test `to_matrix` with a zero operator.""" + num_qubits = 4 + zero_numpy = np.zeros((2**num_qubits, 2**num_qubits), dtype=np.complex128) + zero = SparsePauliOp.from_list([], num_qubits=num_qubits) + + zero_dense = zero.to_matrix(sparse=False) + np.testing.assert_array_equal(zero_dense, zero_numpy) + + zero_sparse = zero.to_matrix(sparse=True) + self.assertIsInstance(zero_sparse, scipy.sparse.csr_matrix) + np.testing.assert_array_equal(zero_sparse.A, zero_numpy) + + def test_to_matrix_parallel_vs_serial(self): + """Parallel execution should produce the same results as serial execution up to + floating-point associativity effects.""" + # Using powers-of-two coefficients to make floating-point arithmetic associative so we can + # do bit-for-bit assertions. Choose labels that have at least few overlapping locations. 
+ labels = ["XZIXYX", "YIIYXY", "ZZZIIZ", "IIIIII"] + coeffs = [0.25, 0.125j, 0.5 - 0.25j, -0.125 + 0.5j] + op = SparsePauliOp(labels, coeffs) + np.testing.assert_array_equal( + op.to_matrix(sparse=True, force_serial=False).toarray(), + op.to_matrix(sparse=True, force_serial=True).toarray(), + ) + np.testing.assert_array_equal( + op.to_matrix(sparse=False, force_serial=False), + op.to_matrix(sparse=False, force_serial=True), + ) + def test_to_matrix_parameters(self): """Test to_matrix method for parameterized SparsePauliOp.""" labels = ["XI", "YZ", "YY", "ZZ"]
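For completeness, a minimal usage sketch of the interface added by this patch (illustrative operator and coefficients):

    import numpy as np
    from qiskit.quantum_info import SparsePauliOp

    op = SparsePauliOp(["XZ", "YY", "II"], coeffs=[0.25, 0.5j, -0.125])
    dense = op.to_matrix()                      # threaded by default
    sparse = op.to_matrix(sparse=True)          # scipy.sparse.csr_matrix
    serial = op.to_matrix(force_serial=True)    # skip the Rayon threadpool
    np.testing.assert_allclose(dense, serial)
    np.testing.assert_allclose(sparse.toarray(), dense)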