diff --git a/.github/workflows/miri.yml b/.github/workflows/miri.yml new file mode 100644 index 000000000000..bdceb20c3008 --- /dev/null +++ b/.github/workflows/miri.yml @@ -0,0 +1,46 @@ +name: Miri +on: + push: + pull_request: +concurrency: + group: ${{ github.repository }}-${{ github.ref }}-${{ github.head_ref }}-${{ github.workflow }} + # Only cancel in PR mode. In push mode, don't cancel so we don't see spurious test "failures", + # and we get coverage reports on Coveralls for every push. + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + +jobs: + miri: + if: github.repository_owner == 'Qiskit' + name: Miri + runs-on: ubuntu-latest + env: + RUSTUP_TOOLCHAIN: nightly + + steps: + - uses: actions/checkout@v4 + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@nightly + with: + components: miri + + - name: Prepare Miri + run: | + set -e + # Some of our dependencies aren't Miri-safe with their current release versions. These + # need overriding with known-good versions to run against until the Miri-safe versions are + # released and updated in our Cargo.lock. + cat >>Cargo.toml < bool { let parallel_context = env::var("QISKIT_IN_PARALLEL") diff --git a/crates/accelerate/src/rayon_ext.rs b/crates/accelerate/src/rayon_ext.rs new file mode 100644 index 000000000000..af914a86d414 --- /dev/null +++ b/crates/accelerate/src/rayon_ext.rs @@ -0,0 +1,171 @@ +// This code is part of Qiskit. +// +// (C) Copyright IBM 2023 +// +// This code is licensed under the Apache License, Version 2.0. You may +// obtain a copy of this license in the LICENSE.txt file in the root directory +// of this source tree or at http://www.apache.org/licenses/LICENSE-2.0. +// +// Any modifications or derivative works of this code must retain this +// copyright notice, and modified files need to carry a notice indicating +// that they have been altered from the originals. + +//! Extension structs for use with Rayon parallelism. + +// See https://github.com/rayon-rs/rayon/blob/v1.10.0/src/iter/plumbing/README.md (or a newer +// version) for more of an explanation of how Rayon's plumbing works. + +use rayon::iter::plumbing::*; +use rayon::prelude::*; + +pub trait ParallelSliceMutExt: ParallelSliceMut { + /// Create a parallel iterator over mutable chunks of uneven lengths for this iterator. + /// + /// # Panics + /// + /// Panics if the sums of the given lengths do not add up to the length of the slice. + #[track_caller] + fn par_uneven_chunks_mut<'len, 'data>( + &'data mut self, + chunk_lengths: &'len [usize], + ) -> ParUnevenChunksMut<'len, 'data, T> { + let mut_slice = self.as_parallel_slice_mut(); + let chunk_sum = chunk_lengths.iter().sum::(); + let slice_len = mut_slice.len(); + if chunk_sum != slice_len { + panic!("given slices of total size {chunk_sum} for a chunk of length {slice_len}"); + } + ParUnevenChunksMut { + chunk_lengths, + data: mut_slice, + } + } +} + +impl ParallelSliceMutExt for S where S: ParallelSliceMut {} + +/// Very similar to Rayon's [rayon::slice::ChunksMut], except that the lengths of the individual +/// chunks are arbitrary, provided they sum to the total length of the slice. 
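+///
+/// An illustrative usage sketch (hypothetical data; the import path is assumed and the block is
+/// not compiled as a doctest):
+///
+/// ```ignore
+/// use rayon::prelude::*;
+/// use crate::rayon_ext::ParallelSliceMutExt;
+///
+/// let mut data = vec![0_u32; 6];
+/// let chunk_lengths = [1, 2, 3];
+/// data.par_uneven_chunks_mut(&chunk_lengths)
+///     .enumerate()
+///     .for_each(|(i, chunk)| chunk.fill(i as u32));
+/// assert_eq!(data, [0_u32, 1, 1, 2, 2, 2]);
+/// ```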
+#[derive(Debug)] +pub struct ParUnevenChunksMut<'len, 'data, T> { + chunk_lengths: &'len [usize], + data: &'data mut [T], +} + +impl<'len, 'data, T: Send + 'data> ParallelIterator for ParUnevenChunksMut<'len, 'data, T> { + type Item = &'data mut [T]; + + #[track_caller] + fn drive_unindexed>(self, consumer: C) -> C::Result { + bridge(self, consumer) + } +} + +impl<'len, 'data, T: Send + 'data> IndexedParallelIterator for ParUnevenChunksMut<'len, 'data, T> { + #[track_caller] + fn drive>(self, consumer: C) -> C::Result { + bridge(self, consumer) + } + + fn len(&self) -> usize { + self.chunk_lengths.len() + } + + #[track_caller] + fn with_producer>(self, callback: CB) -> CB::Output { + callback.callback(UnevenChunksMutProducer { + chunk_lengths: self.chunk_lengths, + data: self.data, + }) + } +} + +struct UnevenChunksMutProducer<'len, 'data, T: Send> { + chunk_lengths: &'len [usize], + data: &'data mut [T], +} + +impl<'len, 'data, T: Send + 'data> Producer for UnevenChunksMutProducer<'len, 'data, T> { + type Item = &'data mut [T]; + type IntoIter = UnevenChunksMutIter<'len, 'data, T>; + + fn into_iter(self) -> Self::IntoIter { + Self::IntoIter::new(self.chunk_lengths, self.data) + } + + #[track_caller] + fn split_at(self, index: usize) -> (Self, Self) { + // Technically quadratic for a full-depth split, but let's worry about that later if needed. + let data_mid = self.chunk_lengths[..index].iter().sum(); + let (chunks_left, chunks_right) = self.chunk_lengths.split_at(index); + let (data_left, data_right) = self.data.split_at_mut(data_mid); + ( + Self { + chunk_lengths: chunks_left, + data: data_left, + }, + Self { + chunk_lengths: chunks_right, + data: data_right, + }, + ) + } +} + +#[must_use = "iterators do nothing unless consumed"] +struct UnevenChunksMutIter<'len, 'data, T> { + chunk_lengths: &'len [usize], + // The extra `Option` wrapper here is to satisfy the borrow checker while we're splitting the + // `data` reference. We need to consume `self`'s reference during the split before replacing + // it, which means we need to temporarily set the `data` ref to some unowned value. + // `Option<&mut [T]>` means we can replace it temporarily with the null reference, ensuring the + // mutable aliasing rules are always upheld. 
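+    // Concretely, `next` does (in sketch form; the real code also advances `chunk_lengths`):
+    //
+    //     let rest = self.data.take().unwrap();                        // leaves `None` behind
+    //     let (head, tail) = rest.split_at_mut(self.chunk_lengths[0]);
+    //     self.data = Some(tail);                                      // restore the remainder
+    //     Some(head)                                                   // hand out a unique &mut chunk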
+ data: Option<&'data mut [T]>, +} + +impl<'len, 'data, T> UnevenChunksMutIter<'len, 'data, T> { + fn new(chunk_lengths: &'len [usize], data: &'data mut [T]) -> Self { + Self { + chunk_lengths, + data: Some(data), + } + } +} + +impl<'len, 'data, T> Iterator for UnevenChunksMutIter<'len, 'data, T> { + type Item = &'data mut [T]; + + #[track_caller] + fn next(&mut self) -> Option { + if self.chunk_lengths.is_empty() { + return None; + } + let (out_data, rem_data) = self + .data + .take() + .unwrap() + .split_at_mut(self.chunk_lengths[0]); + self.chunk_lengths = &self.chunk_lengths[1..]; + self.data = Some(rem_data); + Some(out_data) + } + + fn size_hint(&self) -> (usize, Option) { + (self.chunk_lengths.len(), Some(self.chunk_lengths.len())) + } +} +impl<'len, 'data, T> ExactSizeIterator for UnevenChunksMutIter<'len, 'data, T> {} +impl<'len, 'data, T> DoubleEndedIterator for UnevenChunksMutIter<'len, 'data, T> { + #[track_caller] + fn next_back(&mut self) -> Option { + if self.chunk_lengths.is_empty() { + return None; + } + let pos = self.chunk_lengths.len() - 1; + let data_pos = self.data.as_ref().map(|x| x.len()).unwrap() - self.chunk_lengths[pos]; + let (rem_data, out_data) = self.data.take().unwrap().split_at_mut(data_pos); + self.chunk_lengths = &self.chunk_lengths[..pos]; + self.data = Some(rem_data); + Some(out_data) + } +} diff --git a/crates/accelerate/src/sparse_pauli_op.rs b/crates/accelerate/src/sparse_pauli_op.rs index c8e0f0fb9316..5d6a82df7940 100644 --- a/crates/accelerate/src/sparse_pauli_op.rs +++ b/crates/accelerate/src/sparse_pauli_op.rs @@ -10,16 +10,22 @@ // copyright notice, and modified files need to carry a notice indicating // that they have been altered from the originals. -use pyo3::exceptions::PyValueError; +use pyo3::exceptions::{PyRuntimeError, PyValueError}; use pyo3::prelude::*; +use pyo3::types::PyTuple; use pyo3::wrap_pyfunction; use pyo3::Python; +use numpy::prelude::*; +use numpy::{PyArray1, PyArray2, PyReadonlyArray1, PyReadonlyArray2, PyUntypedArrayMethods}; + use hashbrown::HashMap; use ndarray::{s, Array1, Array2, ArrayView1, ArrayView2, Axis}; use num_complex::Complex64; use num_traits::Zero; -use numpy::{IntoPyArray, PyArray1, PyArray2, PyReadonlyArray2, PyUntypedArrayMethods}; +use rayon::prelude::*; + +use crate::rayon_ext::*; /// Find the unique elements of an array. /// @@ -71,9 +77,49 @@ enum Pauli { Z, } +/// Pack a 2D array of Booleans into a given width. Returns an error if the input array is +/// too large to be packed into u64. +fn pack_bits(bool_arr: ArrayView2) -> Result, ()> { + let num_qubits = bool_arr.shape()[1]; + if num_qubits > (u64::BITS as usize) { + return Err(()); + } + let slack = num_qubits % 8; + let pack_row = |row: ArrayView1| -> u64 { + let mut val: u64 = 0; + let mut shift = 0; + for chunk in row.exact_chunks(8) { + val |= ((chunk[0] as u8 + | ((chunk[1] as u8) << 1) + | ((chunk[2] as u8) << 2) + | ((chunk[3] as u8) << 3) + | ((chunk[4] as u8) << 4) + | ((chunk[5] as u8) << 5) + | ((chunk[6] as u8) << 6) + | ((chunk[7] as u8) << 7)) as u64) + << shift; + shift += 8; + } + if slack > 0 { + for (i, b) in row + .slice(s![num_qubits - slack..num_qubits]) + .iter() + .enumerate() + { + val |= (*b as u64) << (shift + i); + } + } + val + }; + Ok(bool_arr + .axis_iter(Axis(0)) + .map(pack_row) + .collect::>()) +} + /// A complete ZX-convention representation of a Pauli decomposition. This is all the components /// necessary to construct a Qiskit-space :class:`.SparsePauliOp`, where :attr:`phases` is in the -/// ZX convention. 
+/// ZX convention. This class is just meant for interoperation between Rust and Python. #[pyclass(module = "qiskit._accelerate.sparse_pauli_op")] pub struct ZXPaulis { #[pyo3(get)] @@ -86,6 +132,196 @@ pub struct ZXPaulis { pub coeffs: Py>, } +#[pymethods] +impl ZXPaulis { + #[new] + fn __new__( + x: &Bound>, + z: &Bound>, + phases: &Bound>, + coeffs: &Bound>, + ) -> PyResult { + let &[num_ops, num_qubits] = x.shape() else { unreachable!("PyArray2 must be 2D") }; + if z.shape() != [num_ops, num_qubits] { + return Err(PyValueError::new_err(format!( + "'x' and 'z' have different shapes: {:?} and {:?}", + [num_ops, num_qubits], + z.shape() + ))); + } + if phases.len() != num_ops || coeffs.len() != num_ops { + return Err(PyValueError::new_err(format!( + "mismatched dimensions: 'x' and 'z' have {} operator(s), 'phase' has {} and 'coeffs' has {}", + num_ops, + phases.len(), + coeffs.len(), + ))); + } + + Ok(Self { + x: x.to_owned().unbind(), + z: z.to_owned().unbind(), + phases: phases.to_owned().unbind(), + coeffs: coeffs.to_owned().unbind(), + }) + } +} + +impl ZXPaulis { + /// Attempt to acquire a Rust-enforced Rust-only immutable borrow onto the underlying + /// Python-space data. This returns `None` if any of the underlying arrays already has a + /// mutable borrow taken out onto it. + pub fn try_readonly<'a, 'py>(&'a self, py: Python<'py>) -> Option> + where + 'a: 'py, + { + Some(ZXPaulisReadonly { + x: self.x.bind(py).try_readonly().ok()?, + z: self.z.bind(py).try_readonly().ok()?, + phases: self.phases.bind(py).try_readonly().ok()?, + coeffs: self.coeffs.bind(py).try_readonly().ok()?, + }) + } +} + +/// Intermediate structure that represents readonly views onto the Python-space sparse Pauli data. +/// This is used in the chained methods so that the syntactical temporary lifetime extension can +/// occur; we can't have the readonly array temporaries only live within a method that returns +/// [ZXPaulisView], because otherwise the lifetimes of the [PyReadonlyArray] elements will be too +/// short. +pub struct ZXPaulisReadonly<'a> { + x: PyReadonlyArray2<'a, bool>, + z: PyReadonlyArray2<'a, bool>, + phases: PyReadonlyArray1<'a, u8>, + coeffs: PyReadonlyArray1<'a, Complex64>, +} + +impl ZXPaulisReadonly<'_> { + /// Get a [ndarray] view of the data of these [rust-numpy] objects. + fn as_array(&self) -> ZXPaulisView { + ZXPaulisView { + x: self.x.as_array(), + z: self.z.as_array(), + phases: self.phases.as_array(), + coeffs: self.coeffs.as_array(), + } + } +} + +/// Intermediate structure that represents [ndarray] views onto the Python-space sparse Pauli data +/// in the ZX convention. This can be used directly by Rust methods if desired, or bit-packed into +/// a matrix-representation format [MatrixCompressedPaulis] using the [compress] method. +pub struct ZXPaulisView<'py> { + x: ArrayView2<'py, bool>, + z: ArrayView2<'py, bool>, + phases: ArrayView1<'py, u8>, + coeffs: ArrayView1<'py, Complex64>, +} + +impl<'py> ZXPaulisView<'py> { + /// The number of qubits this operator acts on. + pub fn num_qubits(&self) -> usize { + self.x.shape()[1] + } + + /// Convert the ZX representation into a bitpacked internal representation. See the + /// documentation of [MatrixCompressedPaulis] for details of the changes to the Pauli + /// convention and representation. 
+ pub fn matrix_compress(&self) -> PyResult { + let num_qubits = self.num_qubits(); + // This is obviously way too big for a dense operator, and SciPy limits us to using `i64` + // for the index and indptr types, so (except for some synthetic cases), it's not possible + // for us to work with a larger matrix than this. + if num_qubits > 63 { + return Err(PyValueError::new_err(format!( + "{num_qubits} is too many qubits to convert to a matrix" + ))); + } + if num_qubits == 0 { + return Ok(MatrixCompressedPaulis { + num_qubits: 0, + x_like: Vec::new(), + z_like: Vec::new(), + coeffs: self.coeffs.to_vec(), + }); + } + let x_like = pack_bits(self.x).expect("x should already be validated as <64 qubits"); + let z_like = pack_bits(self.z).expect("z should already be validated as <64 qubits"); + let coeffs = x_like + .iter() + .zip(z_like.iter()) + .zip(self.phases.iter().zip(self.coeffs.iter())) + .map(|((xs, zs), (&phase, &coeff))| { + let ys = (xs & zs).count_ones(); + match (phase as u32 + ys) % 4 { + 0 => coeff, + 1 => Complex64::new(coeff.im, -coeff.re), + 2 => Complex64::new(-coeff.re, -coeff.im), + 3 => Complex64::new(-coeff.im, coeff.re), + _ => unreachable!(), + } + }) + .collect::>(); + Ok(MatrixCompressedPaulis { + num_qubits: num_qubits as u8, + x_like, + z_like, + coeffs, + }) + } +} + +/// Temporary bit-compressed storage of the Pauli string. The [coeffs] are reinterpreted to +/// include the old `phase` component in them directly, plus the factors of `-i` stemming from `Y` +/// components. The result is that the [coeffs] now more directly represent entries in a matrix, +/// while [x_like] and [z_like] are no longer direct measures of `X` and `Z` elements (as in the ZX +/// convention), but are instead only a marker of the column and parity respectively. +/// +/// In other words, `row_num ^ x_like` gives the column number of an element, while +/// `(row_num & z_like).count_ones()` counts multiplicative factors of `-1` to be applied to +/// `coeff` when placing it at `(row_num, col_num)` in an output matrix. +pub struct MatrixCompressedPaulis { + num_qubits: u8, + x_like: Vec, + z_like: Vec, + coeffs: Vec, +} + +impl MatrixCompressedPaulis { + /// The number of qubits this operator acts on. + pub fn num_qubits(&self) -> usize { + self.num_qubits as usize + } + + /// The number of explicitly stored operators in the sum. + pub fn num_ops(&self) -> usize { + self.coeffs.len() + } + + /// Sum coefficients that correspond to the same Pauli operator; this reduces the number of + /// explicitly stored operations, if there are duplicates. After the summation, any terms that + /// have become zero are dropped. + pub fn combine(&mut self) { + let mut hash_table = HashMap::<(u64, u64), Complex64>::with_capacity(self.coeffs.len()); + for (key, coeff) in self + .x_like + .drain(..) + .zip(self.z_like.drain(..)) + .zip(self.coeffs.drain(..)) + { + *hash_table.entry(key).or_insert(Complex64::new(0.0, 0.0)) += coeff; + } + for ((x, z), coeff) in hash_table { + if coeff == Complex64::new(0.0, 0.0) { + continue; + } + self.x_like.push(x); + self.z_like.push(z); + self.coeffs.push(coeff); + } + } +} + /// Decompose a dense complex operator into the symplectic Pauli representation in the /// ZX-convention. /// @@ -257,10 +493,385 @@ fn decompose_dense_inner( ); } +/// Convert the given [ZXPaulis] object to a dense 2D Numpy matrix. 
+#[pyfunction] +#[pyo3(signature = (/, paulis, force_serial=false))] +pub fn to_matrix_dense<'py>( + py: Python<'py>, + paulis: &ZXPaulis, + force_serial: bool, +) -> PyResult>> { + let paulis_readonly = paulis + .try_readonly(py) + .ok_or_else(|| PyRuntimeError::new_err("could not produce a safe view onto the data"))?; + let mut paulis = paulis_readonly.as_array().matrix_compress()?; + paulis.combine(); + let side = 1usize << paulis.num_qubits(); + let parallel = !force_serial && crate::getenv_use_multiple_threads(); + let out = to_matrix_dense_inner(&paulis, parallel); + PyArray1::from_vec_bound(py, out).reshape([side, side]) +} + +/// Inner worker of the Python-exposed [to_matrix_dense]. This is separate primarily to allow +/// Rust-space unit testing even if Python isn't available for execution. This returns a C-ordered +/// [Vec] of the 2D matrix. +fn to_matrix_dense_inner(paulis: &MatrixCompressedPaulis, parallel: bool) -> Vec { + let side = 1usize << paulis.num_qubits(); + #[allow(clippy::uninit_vec)] + let mut out = { + let mut out = Vec::with_capacity(side * side); + // SAFETY: we iterate through the vec in chunks of `side`, and start each row by filling it + // with zeros before ever reading from it. It's fine to overwrite the uninitialised memory + // because `Complex64: !Drop`. + unsafe { out.set_len(side * side) }; + out + }; + let write_row = |(i_row, row): (usize, &mut [Complex64])| { + // Doing the initialisation here means that when we're in parallel contexts, we do the + // zeroing across the whole threadpool. This also seems to give a speed-up in serial + // contexts, but I don't understand that. ---Jake + row.fill(Complex64::new(0.0, 0.0)); + for ((&x_like, &z_like), &coeff) in paulis + .x_like + .iter() + .zip(paulis.z_like.iter()) + .zip(paulis.coeffs.iter()) + { + // Technically this discards part of the storable data, but in practice, a dense + // operator with more than 32 qubits needs in the region of 1 ZiB memory. We still use + // `u64` to help sparse-matrix construction, though. + let coeff = if (i_row as u32 & z_like as u32).count_ones() % 2 == 0 { + coeff + } else { + -coeff + }; + row[i_row ^ (x_like as usize)] += coeff; + } + }; + if parallel { + out.par_chunks_mut(side).enumerate().for_each(write_row); + } else { + out.chunks_mut(side).enumerate().for_each(write_row); + } + out +} + +type CSRData = (Vec, Vec, Vec); +type ToCSRData = fn(&MatrixCompressedPaulis) -> CSRData; + +/// Convert the given [ZXPaulis] object to the three-array CSR form. The output type of the +/// `indices` and `indptr` matrices will be `i32` if that type is guaranteed to be able to hold the +/// number of non-zeros, otherwise it will be `i64`. Signed types are used to match Scipy. `i32` +/// is preferentially returned, because Scipy will downcast to this on `csr_matrix` construction if +/// all array elements would fit. For large operators with significant cancellation, it is +/// possible that `i64` will be returned when `i32` would suffice, but this will not cause +/// unsoundness, just a copy overhead when constructing the Scipy matrix. 
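+///
+/// As a hand-worked illustration: the single-qubit operator `Y` compresses to one term with
+/// coefficient `-i`, and its CSR form is `values = [-i, i]`, `indices = [1, 0]`,
+/// `indptr = [0, 1, 2]`.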
+#[pyfunction] +#[pyo3(signature = (/, paulis, force_serial=false))] +pub fn to_matrix_sparse( + py: Python, + paulis: &ZXPaulis, + force_serial: bool, +) -> PyResult> { + let paulis_readonly = paulis + .try_readonly(py) + .ok_or_else(|| PyRuntimeError::new_err("could not produce a safe view onto the data"))?; + let mut paulis = paulis_readonly.as_array().matrix_compress()?; + paulis.combine(); + + // This deliberately erases the Rust types in the output so we can return either 32- or 64-bit + // indices as appropriate without breaking Rust's typing. + fn to_py_tuple(py: Python, csr_data: CSRData) -> Py + where + T: numpy::Element, + { + let (values, indices, indptr) = csr_data; + ( + PyArray1::from_vec_bound(py, values), + PyArray1::from_vec_bound(py, indices), + PyArray1::from_vec_bound(py, indptr), + ) + .into_py(py) + } + + // Pessimistic estimation of whether we can fit in `i32`. If there's any risk of overflowing + // `i32`, we use `i64`, but Scipy will always try to downcast to `i32`, so we try to match it. + let max_entries_per_row = (paulis.num_ops() as u64).min(1u64 << (paulis.num_qubits() - 1)); + let use_32_bit = + max_entries_per_row.saturating_mul(1u64 << paulis.num_qubits()) <= (i32::MAX as u64); + if use_32_bit { + let to_sparse: ToCSRData = if crate::getenv_use_multiple_threads() && !force_serial { + to_matrix_sparse_parallel_32 + } else { + to_matrix_sparse_serial_32 + }; + Ok(to_py_tuple(py, to_sparse(&paulis))) + } else { + let to_sparse: ToCSRData = if crate::getenv_use_multiple_threads() && !force_serial { + to_matrix_sparse_parallel_64 + } else { + to_matrix_sparse_serial_64 + }; + Ok(to_py_tuple(py, to_sparse(&paulis))) + } +} + +/// Copy several slices into a single flat vec, in parallel. Allocates a temporary `Vec` of +/// the same length as the input slice to track the chunking. +fn copy_flat_parallel(slices: &[U]) -> Vec +where + T: Copy + Send + Sync, + U: AsRef<[T]> + Sync, +{ + let lens = slices + .iter() + .map(|slice| slice.as_ref().len()) + .collect::>(); + let size = lens.iter().sum(); + #[allow(clippy::uninit_vec)] + let mut out = { + let mut out = Vec::with_capacity(size); + // SAFETY: we've just calculated that the lengths of the given vecs add up to the right + // thing, and we're about to copy in the data from each of them into this uninitialised + // array. It's guaranteed safe to write `T` to the uninitialised space, because `Copy` + // implies `!Drop`. + unsafe { out.set_len(size) }; + out + }; + out.par_uneven_chunks_mut(&lens) + .zip(slices.par_iter().map(|x| x.as_ref())) + .for_each(|(out_slice, in_slice)| out_slice.copy_from_slice(in_slice)); + out +} + +macro_rules! impl_to_matrix_sparse { + ($serial_fn:ident, $parallel_fn:ident, $int_ty:ty, $uint_ty:ty $(,)?) => { + /// Build CSR data arrays for the matrix-compressed set of the Pauli operators, using a + /// completely serial strategy. 
+ fn $serial_fn(paulis: &MatrixCompressedPaulis) -> CSRData<$int_ty> { + let side = 1 << paulis.num_qubits(); + let num_ops = paulis.num_ops(); + if num_ops == 0 { + return (vec![], vec![], vec![0; side + 1]); + } + + let mut order = (0..num_ops).collect::>(); + let mut values = Vec::::with_capacity(side * (num_ops + 1) / 2); + let mut indices = Vec::<$int_ty>::with_capacity(side * (num_ops + 1) / 2); + let mut indptr: Vec<$int_ty> = vec![0; side + 1]; + let mut nnz = 0; + for i_row in 0..side { + order.sort_unstable_by(|&a, &b| { + ((i_row as $uint_ty) ^ (paulis.x_like[a] as $uint_ty)) + .cmp(&((i_row as $uint_ty) ^ (paulis.x_like[b] as $uint_ty))) + }); + let mut running = Complex64::new(0.0, 0.0); + let mut prev_index = i_row ^ (paulis.x_like[order[0]] as usize); + for (x_like, z_like, coeff) in order + .iter() + .map(|&i| (paulis.x_like[i], paulis.z_like[i], paulis.coeffs[i])) + { + let coeff = + if ((i_row as $uint_ty) & (z_like as $uint_ty)).count_ones() % 2 == 0 { + coeff + } else { + -coeff + }; + let index = i_row ^ (x_like as usize); + if index == prev_index { + running += coeff; + } else { + nnz += 1; + values.push(running); + indices.push(prev_index as $int_ty); + running = coeff; + prev_index = index; + } + } + nnz += 1; + values.push(running); + indices.push(prev_index as $int_ty); + indptr[i_row + 1] = nnz; + } + (values, indices, indptr) + } + + /// Build CSR data arrays for the matrix-compressed set of the Pauli operators, using a + /// parallel strategy. This involves more data copying than the serial form, so there is a + /// nontrivial amount of parallel overhead. + fn $parallel_fn(paulis: &MatrixCompressedPaulis) -> CSRData<$int_ty> { + let side = 1 << paulis.num_qubits(); + let num_ops = paulis.num_ops(); + if num_ops == 0 { + return (vec![], vec![], vec![0; side + 1]); + } + + let mut indptr = Vec::<$int_ty>::with_capacity(side + 1); + indptr.push(0); + // SAFETY: we allocate the space for the `indptr` array here, then each thread writes + // in the number of nonzero entries for each row it was responsible for. We know ahead + // of time exactly how many entries we need (one per row, plus an explicit 0 to start). + // It's also important that `$int_ty` does not implement `Drop`, since otherwise it + // will be called on uninitialised memory (all primitive int types satisfy this). + unsafe { + indptr.set_len(side + 1); + } + + // The parallel overhead from splitting a subtask is fairly high (allocating and + // potentially growing a couple of vecs), so we're trading off some of Rayon's ability + // to keep threads busy by subdivision with minimising overhead; we're setting the + // chunk size such that the iterator will have as many elements as there are threads. + let num_threads = rayon::current_num_threads(); + let chunk_size = (side + num_threads - 1) / num_threads; + let mut values_chunks = Vec::with_capacity(num_threads); + let mut indices_chunks = Vec::with_capacity(num_threads); + // SAFETY: the slice here is uninitialised data; it must not be read. + indptr[1..] + .par_chunks_mut(chunk_size) + .enumerate() + .map(|(i, indptr_chunk)| { + let start = chunk_size * i; + let end = (chunk_size * (i + 1)).min(side); + let mut order = (0..num_ops).collect::>(); + // Since we compressed the Paulis by summing equal elements, we're + // lower-bounded on the number of elements per row by this value, up to + // cancellations. This should be a reasonable trade-off between sometimes + // expandin the vector and overallocation. 
+ let mut values = + Vec::::with_capacity(chunk_size * (num_ops + 1) / 2); + let mut indices = Vec::<$int_ty>::with_capacity(chunk_size * (num_ops + 1) / 2); + let mut nnz = 0; + for i_row in start..end { + order.sort_unstable_by(|&a, &b| { + (i_row as $uint_ty ^ paulis.x_like[a] as $uint_ty) + .cmp(&(i_row as $uint_ty ^ paulis.x_like[b] as $uint_ty)) + }); + let mut running = Complex64::new(0.0, 0.0); + let mut prev_index = i_row ^ (paulis.x_like[order[0]] as usize); + for (x_like, z_like, coeff) in order + .iter() + .map(|&i| (paulis.x_like[i], paulis.z_like[i], paulis.coeffs[i])) + { + let coeff = + if (i_row as $uint_ty & z_like as $uint_ty).count_ones() % 2 == 0 { + coeff + } else { + -coeff + }; + let index = i_row ^ (x_like as usize); + if index == prev_index { + running += coeff; + } else { + nnz += 1; + values.push(running); + indices.push(prev_index as $int_ty); + running = coeff; + prev_index = index; + } + } + nnz += 1; + values.push(running); + indices.push(prev_index as $int_ty); + // When we write it, this is a cumulative `nnz` _within the chunk_. We + // turn that into a proper cumulative sum in serial afterwards. + indptr_chunk[i_row - start] = nnz; + } + (values, indices) + }) + .unzip_into_vecs(&mut values_chunks, &mut indices_chunks); + // Turn the chunkwise nnz counts into absolute nnz counts. + let mut start_nnz = 0usize; + let chunk_nnz = values_chunks + .iter() + .map(|chunk| { + let prev = start_nnz; + start_nnz += chunk.len(); + prev as $int_ty + }) + .collect::>(); + indptr[1..] + .par_chunks_mut(chunk_size) + .zip(chunk_nnz) + .for_each(|(indptr_chunk, start_nnz)| { + indptr_chunk.iter_mut().for_each(|nnz| *nnz += start_nnz); + }); + // Concatenate the chunkwise values and indices togther. + let values = copy_flat_parallel(&values_chunks); + let indices = copy_flat_parallel(&indices_chunks); + (values, indices, indptr) + } + }; +} + +impl_to_matrix_sparse!( + to_matrix_sparse_serial_32, + to_matrix_sparse_parallel_32, + i32, + u32 +); +impl_to_matrix_sparse!( + to_matrix_sparse_serial_64, + to_matrix_sparse_parallel_64, + i64, + u64 +); + #[pymodule] pub fn sparse_pauli_op(m: &Bound) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(unordered_unique))?; m.add_wrapped(wrap_pyfunction!(decompose_dense))?; + m.add_wrapped(wrap_pyfunction!(to_matrix_dense))?; + m.add_wrapped(wrap_pyfunction!(to_matrix_sparse))?; m.add_class::()?; Ok(()) } + +#[cfg(test)] +mod tests { + use super::*; + use crate::test::*; + + // The purpose of these tests is more about exercising the `unsafe` code; we test for full + // correctness from Python space. + + fn example_paulis() -> MatrixCompressedPaulis { + MatrixCompressedPaulis { + num_qubits: 4, + x_like: vec![0b0000, 0b0001, 0b0010, 0b1100, 0b1010, 0b0000], + z_like: vec![0b1000, 0b0110, 0b1001, 0b0100, 0b1010, 0b1000], + // Deliberately using multiples of small powers of two so the floating-point addition + // of them is associative. 
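+            // The first and last terms have identical `x_like`/`z_like` pairs, so `combine()`
+            // has a duplicate to merge.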
+ coeffs: vec![ + Complex64::new(0.25, 0.5), + Complex64::new(0.125, 0.25), + Complex64::new(0.375, 0.125), + Complex64::new(-0.375, 0.0625), + Complex64::new(-0.5, -0.25), + ], + } + } + + #[test] + fn dense_threaded_and_serial_equal() { + let paulis = example_paulis(); + let parallel = in_scoped_thread_pool(|| to_matrix_dense_inner(&paulis, true)).unwrap(); + let serial = to_matrix_dense_inner(&paulis, false); + assert_eq!(parallel, serial); + } + + #[test] + fn sparse_threaded_and_serial_equal_32() { + let paulis = example_paulis(); + let parallel = in_scoped_thread_pool(|| to_matrix_sparse_parallel_32(&paulis)).unwrap(); + let serial = to_matrix_sparse_serial_32(&paulis); + assert_eq!(parallel, serial); + } + + #[test] + fn sparse_threaded_and_serial_equal_64() { + let paulis = example_paulis(); + let parallel = in_scoped_thread_pool(|| to_matrix_sparse_parallel_64(&paulis)).unwrap(); + let serial = to_matrix_sparse_serial_64(&paulis); + assert_eq!(parallel, serial); + } +} diff --git a/crates/accelerate/src/test.rs b/crates/accelerate/src/test.rs new file mode 100644 index 000000000000..dac51499202b --- /dev/null +++ b/crates/accelerate/src/test.rs @@ -0,0 +1,24 @@ +// This code is part of Qiskit. +// +// (C) Copyright IBM 2023 +// +// This code is licensed under the Apache License, Version 2.0. You may +// obtain a copy of this license in the LICENSE.txt file in the root directory +// of this source tree or at http://www.apache.org/licenses/LICENSE-2.0. +// +// Any modifications or derivative works of this code must retain this +// copyright notice, and modified files need to carry a notice indicating +// that they have been altered from the originals. + +/// Helper for tests that involve calling Rayon code from within Miri. This runs the given +/// function in a scoped threadpool, which is then immediately dropped. This means that Miri will +/// not complain about the global (static) threads that are not joined when the process exits, +/// which is deliberate. +pub fn in_scoped_thread_pool(worker: F) -> Result +where + T: Send, + F: FnOnce() -> T + Send, +{ + ::rayon::ThreadPoolBuilder::new() + .build_scoped(::rayon::ThreadBuilder::run, |pool| pool.install(worker)) +} diff --git a/qiskit/quantum_info/operators/symplectic/sparse_pauli_op.py b/qiskit/quantum_info/operators/symplectic/sparse_pauli_op.py index 0d29cd098dc3..8cf4f97c730b 100644 --- a/qiskit/quantum_info/operators/symplectic/sparse_pauli_op.py +++ b/qiskit/quantum_info/operators/symplectic/sparse_pauli_op.py @@ -23,7 +23,13 @@ import numpy as np import rustworkx as rx -from qiskit._accelerate.sparse_pauli_op import unordered_unique, decompose_dense +from qiskit._accelerate.sparse_pauli_op import ( + ZXPaulis, + decompose_dense, + to_matrix_dense, + to_matrix_sparse, + unordered_unique, +) from qiskit.circuit.parameter import Parameter from qiskit.circuit.parameterexpression import ParameterExpression from qiskit.circuit.parametertable import ParameterView @@ -919,24 +925,39 @@ def to_list(self, array: bool = False): return labels return labels.tolist() - def to_matrix(self, sparse: bool = False) -> np.ndarray: + def to_matrix(self, sparse: bool = False, force_serial: bool = False) -> np.ndarray: """Convert to a dense or sparse matrix. Args: - sparse (bool): if True return a sparse CSR matrix, otherwise - return dense Numpy array (Default: False). + sparse: if ``True`` return a sparse CSR matrix, otherwise return dense Numpy + array (the default). 
+ force_serial: if ``True``, use an unthreaded implementation, regardless of the state of + the `Qiskit threading-control environment variables + `__. + By default, this will use threaded parallelism over the available CPUs. Returns: array: A dense matrix if `sparse=False`. csr_matrix: A sparse matrix in CSR format if `sparse=True`. """ - mat = None - for i in self.matrix_iter(sparse=sparse): - if mat is None: - mat = i - else: - mat += i - return mat + if self.coeffs.dtype == object: + # Fallback to slow Python-space method. + return sum(self.matrix_iter(sparse=sparse)) + + pauli_list = self.paulis + zx = ZXPaulis( + pauli_list.x.astype(np.bool_), + pauli_list.z.astype(np.bool_), + pauli_list.phase.astype(np.uint8), + self.coeffs.astype(np.complex128), + ) + if sparse: + from scipy.sparse import csr_matrix + + data, indices, indptr = to_matrix_sparse(zx, force_serial=force_serial) + side = 1 << self.num_qubits + return csr_matrix((data, indices, indptr), shape=(side, side)) + return to_matrix_dense(zx, force_serial=force_serial) def to_operator(self) -> Operator: """Convert to a matrix Operator object""" diff --git a/releasenotes/notes/spo-to-matrix-26445a791e24f62a.yaml b/releasenotes/notes/spo-to-matrix-26445a791e24f62a.yaml new file mode 100644 index 000000000000..135e83ef99b1 --- /dev/null +++ b/releasenotes/notes/spo-to-matrix-26445a791e24f62a.yaml @@ -0,0 +1,8 @@ +--- +features: + - | + The performance of :meth:`.SparsePauliOp.to_matrix` has been greatly improved for both dense and + sparse forms. By default, both will now take advantage of threaded parallelism available on + your system, subject to the ``RAYON_NUM_THREADS`` environment variable. You can temporarily + force serial execution using the new ``force_serial`` Boolean argument to + :meth:`~.SparsePauliOp.to_matrix`. diff --git a/test/python/quantum_info/operators/symplectic/test_sparse_pauli_op.py b/test/python/quantum_info/operators/symplectic/test_sparse_pauli_op.py index fdbfb4d4201d..330fd53bc35d 100644 --- a/test/python/quantum_info/operators/symplectic/test_sparse_pauli_op.py +++ b/test/python/quantum_info/operators/symplectic/test_sparse_pauli_op.py @@ -15,9 +15,11 @@ import itertools as it import unittest import numpy as np +import scipy.sparse import rustworkx as rx from ddt import ddt + from qiskit import QiskitError from qiskit.circuit import ParameterExpression, Parameter, ParameterVector from qiskit.circuit.parametertable import ParameterView @@ -259,6 +261,36 @@ def test_to_matrix_large(self): np.testing.assert_array_equal(spp_op.to_matrix(), target) np.testing.assert_array_equal(spp_op.to_matrix(sparse=True).toarray(), target) + def test_to_matrix_zero(self): + """Test `to_matrix` with a zero operator.""" + num_qubits = 4 + zero_numpy = np.zeros((2**num_qubits, 2**num_qubits), dtype=np.complex128) + zero = SparsePauliOp.from_list([], num_qubits=num_qubits) + + zero_dense = zero.to_matrix(sparse=False) + np.testing.assert_array_equal(zero_dense, zero_numpy) + + zero_sparse = zero.to_matrix(sparse=True) + self.assertIsInstance(zero_sparse, scipy.sparse.csr_matrix) + np.testing.assert_array_equal(zero_sparse.A, zero_numpy) + + def test_to_matrix_parallel_vs_serial(self): + """Parallel execution should produce the same results as serial execution up to + floating-point associativity effects.""" + # Using powers-of-two coefficients to make floating-point arithmetic associative so we can + # do bit-for-bit assertions. Choose labels that have at least few overlapping locations. 
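+        # All coefficient components are small dyadic fractions, so every partial sum is exact
+        # in binary floating point and bit-for-bit equality between the two paths is meaningful.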
+ labels = ["XZIXYX", "YIIYXY", "ZZZIIZ", "IIIIII"] + coeffs = [0.25, 0.125j, 0.5 - 0.25j, -0.125 + 0.5j] + op = SparsePauliOp(labels, coeffs) + np.testing.assert_array_equal( + op.to_matrix(sparse=True, force_serial=False).toarray(), + op.to_matrix(sparse=True, force_serial=True).toarray(), + ) + np.testing.assert_array_equal( + op.to_matrix(sparse=False, force_serial=False), + op.to_matrix(sparse=False, force_serial=True), + ) + def test_to_matrix_parameters(self): """Test to_matrix method for parameterized SparsePauliOp.""" labels = ["XI", "YZ", "YY", "ZZ"]
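A short usage sketch of the new Python-facing behaviour (illustrative; the operator and values here are made up, not taken from the test suite):

```python
import numpy as np
import scipy.sparse

from qiskit.quantum_info import SparsePauliOp

op = SparsePauliOp(["XI", "IY"], [1.0, 1.0j])

# Dense conversion is threaded by default; ``force_serial=True`` pins it to one thread.
dense = op.to_matrix()
np.testing.assert_array_equal(dense, op.to_matrix(force_serial=True))

# Sparse conversion returns a SciPy CSR matrix with the same values.
sparse = op.to_matrix(sparse=True)
assert isinstance(sparse, scipy.sparse.csr_matrix)
np.testing.assert_array_equal(sparse.toarray(), dense)
```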