From ef7c22a67062a4804565323ecbed0e167315e9db Mon Sep 17 00:00:00 2001
From: Jae Yoo
Date: Thu, 5 May 2022 07:52:57 +0900
Subject: [PATCH] Add initial qsim CUDA version expectation op

---
 tensorflow_quantum/core/ops/gpu/BUILD | 95 +++++
 .../tfq_simulate_expectation_op_cuda.cu.cc | 403 ++++++++++++++++++
 .../core/ops/gpu/tfq_simulate_ops_cuda.py | 61 +++
 .../ops/gpu/tfq_simulate_ops_cuda_test.py | 130 ++++++
 4 files changed, 689 insertions(+)
 create mode 100644 tensorflow_quantum/core/ops/gpu/BUILD
 create mode 100644 tensorflow_quantum/core/ops/gpu/tfq_simulate_expectation_op_cuda.cu.cc
 create mode 100644 tensorflow_quantum/core/ops/gpu/tfq_simulate_ops_cuda.py
 create mode 100644 tensorflow_quantum/core/ops/gpu/tfq_simulate_ops_cuda_test.py

diff --git a/tensorflow_quantum/core/ops/gpu/BUILD b/tensorflow_quantum/core/ops/gpu/BUILD
new file mode 100644
index 000000000..4a5cd0a30
--- /dev/null
+++ b/tensorflow_quantum/core/ops/gpu/BUILD
@@ -0,0 +1,95 @@
+load("//tensorflow:tensorflow.bzl", "tf_cuda_library")  # NOTE(review): label has no repository prefix - confirm it resolves in this workspace.
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])
+
+# Export for the PIP package.
+exports_files(["__init__.py"])
+
+config_setting(
+    name = "windows",
+    constraint_values = ["@bazel_tools//platforms:windows"],
+)
+
+tf_cuda_library(
+    name = "_tfq_simulate_ops_cuda.so",
+    srcs = [
+        "tfq_simulate_expectation_op_cuda.cu.cc",
+    ],
+    copts = select({
+        ":windows": [
+            "/D__CLANG_SUPPORT_DYN_ANNOTATION__",
+            "/D_USE_MATH_DEFINES",
+            "/DEIGEN_MPL2_ONLY",
+            "/DEIGEN_MAX_ALIGN_BYTES=64",
+            "/DEIGEN_HAS_TYPE_TRAITS=0",
+            "/DTF_USE_SNAPPY",
+            "/showIncludes",
+            "/MD",
+            "/O2",
+            "/DNDEBUG",
+            "/w",
+            "-DWIN32_LEAN_AND_MEAN",
+            "-DNOGDI",
+            "/d2ReducedOptimizeHugeFunctions",
+            "/arch:AVX",
+            "/std:c++14",
+            "-DTENSORFLOW_MONOLITHIC_BUILD",
+            "/DPLATFORM_WINDOWS",
+            "/DEIGEN_HAS_C99_MATH",
+            "/DTENSORFLOW_USE_EIGEN_THREADPOOL",
+            "/DEIGEN_AVOID_STL_ARRAY",
+            "/Iexternal/gemmlowp",
+            "/wd4018",
+            "/wd4577",
+            "/DNOGDI",  # NOTE(review): NOGDI is already defined by "-DNOGDI" earlier in this list.
+            "/UTF_COMPILE_LIBRARY",
+        ],
+        "//conditions:default": [
+            "-pthread",
+            "-std=c++14",
+            "-D_GLIBCXX_USE_CXX11_ABI=0",
+            "-O3",
+            "-DNV_CUDNN_DISABLE_EXCEPTION",
+        ],
+    }),
+    features = select({
+        ":windows": ["windows_export_all_symbols"],
+        "//conditions:default": [],
+    }),
+    linkshared = 1,  # Build the target as a shared library.
+    deps = [
+        # cirq cc proto
+        "//tensorflow_quantum/core/ops:parse_context",
+        "//tensorflow_quantum/core/ops:tfq_simulate_utils",
+        "//tensorflow_quantum/core/proto:pauli_sum_cc_proto",
+        "//tensorflow_quantum/core/proto:program_cc_proto",
+        "//tensorflow_quantum/core/src:circuit_parser_qsim",
+        "//tensorflow_quantum/core/src:util_qsim",
+        "@qsim/lib:qsim_cuda_lib",
+        "@eigen//:eigen3",
+        # tensorflow core framework
+        # tensorflow core lib
+        # tensorflow core protos
+    ],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_tfq_simulate_ops_cuda_py",
+    out = "gen_tfq_simulate_ops_cuda.py",
+    deps = [":_tfq_simulate_ops_cuda.so"],
+)
+
+py_binary(
+    name = "gpu_benchmark",
+    srcs = ["gpu_benchmark.py"],  # NOTE(review): gpu_benchmark.py is not among the files this patch creates - confirm it exists.
+    python_version = "PY3",
+    deps = [
+        "//tensorflow_quantum/core/ops:tfq_simulate_ops_cuda_py",  # NOTE(review): target is not defined by this patch - verify it exists.
+        "//tensorflow_quantum/core/ops:tfq_simulate_ops_py",
+        "//tensorflow_quantum/core/serialize:serializer",
+        "//tensorflow_quantum/python:util",
+    ],
+)
diff --git a/tensorflow_quantum/core/ops/gpu/tfq_simulate_expectation_op_cuda.cu.cc b/tensorflow_quantum/core/ops/gpu/tfq_simulate_expectation_op_cuda.cu.cc
new file mode 100644
index 000000000..ca81c8e00
--- /dev/null
+++ b/tensorflow_quantum/core/ops/gpu/tfq_simulate_expectation_op_cuda.cu.cc
@@ -0,0 +1,403 @@
+/* Copyright 2020 The TensorFlow Quantum Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include
+#include
+
+#include
+
+#include "../qsim/lib/circuit.h"
+#include "../qsim/lib/gate_appl.h"
+#include "../qsim/lib/gates_cirq.h"
+#include "../qsim/lib/gates_qsim.h"
+#include "../qsim/lib/seqfor.h"
+#include "../qsim/lib/simulator_cuda.h"
+#include "../qsim/lib/statespace_cuda.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow_quantum/core/ops/parse_context.h"
+#include "tensorflow_quantum/core/proto/pauli_sum.pb.h"
+#include "tensorflow_quantum/core/proto/program.pb.h"
+#include "tensorflow_quantum/core/src/util_qsim.h"
+
+namespace tfq {
+
+using ::tensorflow::Status;
+using ::tfq::proto::PauliSum;
+using ::tfq::proto::Program;
+
+typedef qsim::Cirq::GateCirq QsimGate;
+typedef qsim::Circuit QsimCircuit;
+
+// Allocates the four qsim scratch buffers as temp tensors; returns the first allocation error.
+Status AllocateQsimTempTensors(
+    tensorflow::OpKernelContext* context, tensorflow::Tensor* d_wf_tensor,
+    tensorflow::Tensor* d_idx_tensor, tensorflow::Tensor* d_ms_tensor,
+    tensorflow::Tensor* d_xss_tensor) {
+  tensorflow::AllocatorAttributes alloc_attr;
+  alloc_attr.set_on_host(false);
+  alloc_attr.set_gpu_compatible(true);
+  TF_RETURN_IF_ERROR(context->allocate_temp(
+      tensorflow::DataType::DT_FLOAT,
+      tensorflow::TensorShape({131072}),  // TensorShape dims are element counts, not bytes.
+      d_wf_tensor, alloc_attr));
+  TF_RETURN_IF_ERROR(context->allocate_temp(
+      tensorflow::DataType::DT_UINT32,
+      tensorflow::TensorShape({992}),  // 992 uint32 elements.
+      d_idx_tensor, alloc_attr));
+  TF_RETURN_IF_ERROR(context->allocate_temp(
+      tensorflow::DataType::DT_UINT64,
+      tensorflow::TensorShape({7}),  // 7 uint64 elements.
+      d_ms_tensor, alloc_attr));
+  TF_RETURN_IF_ERROR(context->allocate_temp(
+      tensorflow::DataType::DT_UINT64,
+      tensorflow::TensorShape({64}),  // 64 uint64 elements.
+      d_xss_tensor, alloc_attr));
+  return Status::OK();
+}
+
+// __global__ void ComputeSmallCudaKernel(const int total_size,
+//     int output_dim_op_size, int* num_qubits,
+//     const thrust::host_vector>>& fused_circuits,
+//     const thrust::host_vector>& pauli_sums,
+//     float* out) {
+//   int old_batch_index = -2;
+//   int cur_batch_index = -1;
+//   int largest_nq = 1;
+//   int cur_op_index;
+//   auto sv = ss.Create(largest_nq);
+//   auto scratch = ss.Create(largest_nq);
+
+//   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < total_size;
+//        i += blockDim.x * gridDim.x) {
+//     cur_batch_index = i / output_dim_op_size;
+//     cur_op_index = i % output_dim_op_size;
+
+//     const int nq = num_qubits[cur_batch_index];
+//     // (#679) Just ignore empty program
+//     if (fused_circuits[cur_batch_index].size() == 0) {
+//       out[i] = -2.0;
+//       continue;
+//     }
+//     if (cur_batch_index != old_batch_index) {
+//       // We've run into a new state vector we must compute.
+//       // Only compute a new state vector when we have to.
+//       if (nq > largest_nq) {
+//         largest_nq = nq;
+//         sv = ss.Create(largest_nq);
+//         scratch = ss.Create(largest_nq);
+//       }
+//       // no need to update scratch_state since ComputeExpectation
+//       // will take care of things for us.
+//       ss.SetStateZero(sv);
+//       for (int j = 0; j < fused_circuits[cur_batch_index].size(); j++) {
+//         qsim::ApplyFusedGate(sim, fused_circuits[cur_batch_index][j], sv);
+//       }
+//     }
+
+//     float exp_v = 0.0;
+//     ComputeExpectationQsim(pauli_sums[cur_batch_index][cur_op_index],
+//                           sim, ss, sv, scratch, &exp_v),
+//     out[i] = exp_v;
+//     old_batch_index = cur_batch_index;
+//   }
+// }
+
+// OpKernel that computes expectation values with qsim's SimulatorCUDA; registered on DEVICE_CPU.
+class TfqSimulateExpectationOpGpuCpu : public tensorflow::OpKernel {
+ public:
+  explicit TfqSimulateExpectationOpGpuCpu(tensorflow::OpKernelConstruction* context)
+      : OpKernel(context) {
+    // Read "cpu_cycle": the cost value ComputeSmall passes to ParallelFor.
+    OP_REQUIRES_OK(context, context->GetAttr("cpu_cycle", &cpu_cycle_));
+
+    // Get the number of threads in SimulatorCUDA via attributes.
+    OP_REQUIRES_OK(context, context->GetAttr("num_threads_in_sim",
+                                             &num_threads_in_sim_));
+
+    // Get the number of blocks & threads in StateSpaceCUDA.
+    OP_REQUIRES_OK(context, context->GetAttr("block_count", &block_count_));
+    OP_REQUIRES_OK(context, context->GetAttr("thread_per_block",
+                                             &thread_per_block_));
+  }
+
+  void Compute(tensorflow::OpKernelContext* context) override {
+    // TODO (mbbrough): add more dimension checks for other inputs here.
+    const int num_inputs = context->num_inputs();
+    OP_REQUIRES(context, num_inputs == 4,
+                tensorflow::errors::InvalidArgument(absl::StrCat(
+                    "Expected 4 inputs, got ", num_inputs, " inputs.")));
+
+    // Create the output Tensor: rows = dim 0 of input 0, columns = dim 1 of input 3.
+    const int output_dim_batch_size = context->input(0).dim_size(0);
+    const int output_dim_op_size = context->input(3).dim_size(1);
+    tensorflow::TensorShape output_shape;
+    output_shape.AddDim(output_dim_batch_size);
+    output_shape.AddDim(output_dim_op_size);
+
+    tensorflow::Tensor* output = nullptr;
+    tensorflow::AllocatorAttributes alloc_attr;
+    alloc_attr.set_on_host(true);
+    alloc_attr.set_gpu_compatible(true);
+    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output,
+                                                     alloc_attr));
+    auto output_tensor = output->matrix();
+    // Parse program protos.
+    std::vector programs;
+    std::vector num_qubits;
+    std::vector> pauli_sums;
+    OP_REQUIRES_OK(context, GetProgramsAndNumQubits(context, &programs,
+                                                    &num_qubits, &pauli_sums));
+
+    std::vector maps;
+    OP_REQUIRES_OK(context, GetSymbolMaps(context, &maps));
+
+    OP_REQUIRES(context, programs.size() == maps.size(),
+                tensorflow::errors::InvalidArgument(absl::StrCat(
+                    "Number of circuits and symbol_values do not match. Got ",
+                    programs.size(), " circuits and ", maps.size(),
+                    " symbol values.")));
+
+    // Construct qsim circuits (in parallel on the CPU worker pool).
+    std::vector qsim_circuits(programs.size(), QsimCircuit());
+    std::vector>> fused_circuits(
+        programs.size(), std::vector>({}));
+
+    Status parse_status = Status::OK();
+    auto p_lock = tensorflow::mutex();
+    auto construct_f = [&](int start, int end) {
+      for (int i = start; i < end; i++) {
+        Status local =
+            QsimCircuitFromProgram(programs[i], maps[i], num_qubits[i],
+                                   &qsim_circuits[i], &fused_circuits[i]);
+        NESTED_FN_STATUS_SYNC(parse_status, local, p_lock);
+      }
+    };
+
+    const int num_cycles = 1000;
+    context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor(
+        programs.size(), num_cycles, construct_f);
+    OP_REQUIRES_OK(context, parse_status);
+
+    int max_num_qubits = 0;
+    for (const int num : num_qubits) {
+      max_num_qubits = std::max(max_num_qubits, num);
+    }
+
+    if (max_num_qubits >= 26 || programs.size() == 1) {  // sequential path
+      tensorflow::Tensor d_wf_tensor;
+      tensorflow::Tensor d_idx_tensor;
+      tensorflow::Tensor d_ms_tensor;
+      tensorflow::Tensor d_xss_tensor;
+      OP_REQUIRES_OK(context, AllocateQsimTempTensors(
+          context, &d_wf_tensor, &d_idx_tensor, &d_ms_tensor, &d_xss_tensor));
+      ComputeLarge(num_qubits, fused_circuits, pauli_sums, context,
+                   &output_tensor, d_wf_tensor.flat().data(),
+                   d_idx_tensor.flat().data(),
+                   d_ms_tensor.flat().data(),
+                   d_xss_tensor.flat().data());
+    } else {
+      ComputeSmall(num_qubits, max_num_qubits, fused_circuits, pauli_sums,
+                   context, &output_tensor);
+    }
+  }
+
+ private:
+  int num_threads_in_sim_;
+  int thread_per_block_;
+  int block_count_;
+  int cpu_cycle_;
+
+  // Simulates circuits one by one on a single Simulator built on the caller-provided buffers.
+  void ComputeLarge(
+      const std::vector& num_qubits,
+      const std::vector>>& fused_circuits,
+      const std::vector>& pauli_sums,
+      tensorflow::OpKernelContext* context,
+      tensorflow::TTypes::Matrix* output_tensor,
+      float* d_wf, unsigned* d_idx, uint64_t* d_ms, uint64_t* d_xss) {
+    // Instantiate qsim objects.
+    using Simulator = qsim::SimulatorCUDA;
+    using StateSpace = Simulator::StateSpace;
+    // Fill the simulator and state-space parameters from the op
+    // attributes read in the constructor.
+    Simulator::Parameter param_sim;
+    param_sim.num_threads = num_threads_in_sim_;
+
+    StateSpace::Parameter param_ss;
+    param_ss.num_threads = thread_per_block_;
+    param_ss.num_dblocks = block_count_;
+
+    // Begin simulation.
+    int largest_nq = 1;
+    Simulator sim = Simulator(param_sim, d_wf, d_idx, d_ms, d_xss);
+    StateSpace ss = StateSpace(param_ss);
+    auto sv = ss.Create(largest_nq);
+    auto scratch = ss.Create(largest_nq);
+
+    // Simulate programs one by one. Parallelizing over state vectors
+    // we no longer parallelize over circuits. Each time we encounter
+    // a larger circuit we will grow the Statevector as necessary.
+    for (int i = 0; i < fused_circuits.size(); i++) {
+      int nq = num_qubits[i];
+
+      if (nq > largest_nq) {
+        // need to switch to larger statespace.
+        largest_nq = nq;
+        sv = ss.Create(largest_nq);
+        scratch = ss.Create(largest_nq);
+      }
+      // TODO: add heuristic here so that we do not always recompute
+      //  the state if there is a possibility that circuit[i] and
+      //  circuit[i + 1] produce the same state.
+      ss.SetStateZero(sv);
+      for (int j = 0; j < fused_circuits[i].size(); j++) {
+        qsim::ApplyFusedGate(sim, fused_circuits[i][j], sv);
+      }
+      for (int j = 0; j < pauli_sums[i].size(); j++) {
+        // (#679) Empty programs are not evaluated; -2.0 is written instead.
+        if (fused_circuits[i].size() == 0) {
+          (*output_tensor)(i, j) = -2.0;
+          continue;
+        }
+        float exp_v = 0.0;
+        OP_REQUIRES_OK(context,
+                       ComputeExpectationQsim(pauli_sums[i][j], sim, ss, sv,
+                                              scratch, &exp_v));
+        (*output_tensor)(i, j) = exp_v;
+      }
+    }
+  }
+
+  void ComputeSmall(
+      const std::vector& num_qubits, const int max_num_qubits,
+      const std::vector>>& fused_circuits,
+      const std::vector>& pauli_sums,
+      tensorflow::OpKernelContext* context,
+      tensorflow::TTypes::Matrix* output_tensor) {
+    using Simulator = qsim::SimulatorCUDA;
+    using StateSpace = Simulator::StateSpace;
+    // Parameters shared by the Simulator/StateSpace that each DoWork call creates.
+    Simulator::Parameter param_sim;
+    param_sim.num_threads = num_threads_in_sim_;
+
+    StateSpace::Parameter param_ss;
+    param_ss.num_threads = thread_per_block_;
+    param_ss.num_dblocks = block_count_;
+
+    const int output_dim_op_size = output_tensor->dimension(1);
+
+    Status compute_status = Status::OK();
+    auto c_lock = tensorflow::mutex();
+    auto DoWork = [&](int start, int end) {
+      int old_batch_index = -2;
+      int cur_batch_index = -1;
+      int largest_nq = 1;
+      int cur_op_index;
+
+      // Begin simulation.
+      // TODO: consider passing preallocated d_wf, d_idx, d_ms, d_xss as ComputeLarge does.
+      auto sim = Simulator(param_sim);
+      auto ss = StateSpace(param_ss);
+      auto sv = ss.Create(largest_nq);
+      auto scratch = ss.Create(largest_nq);
+      for (int i = start; i < end; i++) {
+        cur_batch_index = i / output_dim_op_size;
+        cur_op_index = i % output_dim_op_size;
+
+        const int nq = num_qubits[cur_batch_index];
+
+        // (#679) Empty programs are not evaluated; -2.0 is written instead.
+        if (fused_circuits[cur_batch_index].size() == 0) {
+          (*output_tensor)(cur_batch_index, cur_op_index) = -2.0;
+          continue;
+        }
+
+        if (cur_batch_index != old_batch_index) {
+          // We've run into a new state vector we must compute.
+          // Only compute a new state vector when we have to.
+          if (nq > largest_nq) {
+            largest_nq = nq;
+            sv = ss.Create(largest_nq);
+            scratch = ss.Create(largest_nq);
+          }
+          // no need to update scratch_state since ComputeExpectation
+          // will take care of things for us.
+          ss.SetStateZero(sv);
+          for (int j = 0; j < fused_circuits[cur_batch_index].size(); j++) {
+            qsim::ApplyFusedGate(sim, fused_circuits[cur_batch_index][j], sv);
+          }
+        }
+
+        float exp_v = 0.0;
+        NESTED_FN_STATUS_SYNC(
+            compute_status,
+            ComputeExpectationQsim(pauli_sums[cur_batch_index][cur_op_index],
+                                   sim, ss, sv, scratch, &exp_v),
+            c_lock);
+        (*output_tensor)(cur_batch_index, cur_op_index) = exp_v;
+        old_batch_index = cur_batch_index;
+      }
+    };
+
+    const int64_t num_cycles = static_cast(cpu_cycle_);
+    context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor(
+        fused_circuits.size() * output_dim_op_size, num_cycles, DoWork);
+    OP_REQUIRES_OK(context, compute_status);
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("TfqSimulateExpectationGpuCpu").Device(tensorflow::DEVICE_CPU),
+    TfqSimulateExpectationOpGpuCpu);
+
+REGISTER_OP("TfqSimulateExpectationGpuCpu")
+    .Input("programs: string")
+    .Input("symbol_names: string")
+    .Input("symbol_values: float")
+    .Input("pauli_sums: string")
+    .Output("expectations: float")
+    .Attr("num_threads_in_sim: int >= 32 = 32")
+    .Attr("block_count: int >= 2 = 2")
+    .Attr("thread_per_block: int >= 32 = 32")
+    .Attr("cpu_cycle: int >= 1 = 1")
+    .SetShapeFn([](tensorflow::shape_inference::InferenceContext* c) {
+      tensorflow::shape_inference::ShapeHandle programs_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &programs_shape));
+
+      tensorflow::shape_inference::ShapeHandle symbol_names_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &symbol_names_shape));
+
+      tensorflow::shape_inference::ShapeHandle symbol_values_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &symbol_values_shape));
+
+      tensorflow::shape_inference::ShapeHandle
pauli_sums_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 2, &pauli_sums_shape));
+
+      tensorflow::shape_inference::DimensionHandle output_rows =
+          c->Dim(programs_shape, 0);
+      tensorflow::shape_inference::DimensionHandle output_cols =
+          c->Dim(pauli_sums_shape, 1);
+      c->set_output(0, c->Matrix(output_rows, output_cols));
+
+      return tensorflow::Status::OK();
+    });
+
+}  // namespace tfq
diff --git a/tensorflow_quantum/core/ops/gpu/tfq_simulate_ops_cuda.py b/tensorflow_quantum/core/ops/gpu/tfq_simulate_ops_cuda.py
new file mode 100644
index 000000000..6f3ac1784
--- /dev/null
+++ b/tensorflow_quantum/core/ops/gpu/tfq_simulate_ops_cuda.py
@@ -0,0 +1,61 @@
+# Copyright 2020 The TensorFlow Quantum Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Module to register cuda simulation python op."""
+import os
+import tensorflow as tf
+from tensorflow_quantum.core.ops.load_module import load_module
+
+SIM_OP_MODULE = load_module(os.path.join("gpu", "_tfq_simulate_ops_cuda.so"))
+
+
+def tfq_simulate_expectation(programs, symbol_names, symbol_values, pauli_sums,
+                             num_threads_in_sim=32, block_count=2,
+                             thread_per_block=32, cpu_cycle=200):
+    """Calculates the expectation value of circuits wrt some operator(s).
+
+    Args:
+        programs: `tf.Tensor` of strings with shape [batch_size] containing
+            the string representations of the circuits to be executed.
+        symbol_names: `tf.Tensor` of strings with shape [n_params], which
+            is used to specify the order in which the values in
+            `symbol_values` should be placed inside of the circuits in
+            `programs`.
+        symbol_values: `tf.Tensor` of real numbers with shape
+            [batch_size, n_params] specifying parameter values to resolve
+            into the circuits specified by programs, following the ordering
+            dictated by `symbol_names`. Cast to `tf.float32` before the call.
+        pauli_sums: `tf.Tensor` of strings with shape [batch_size, n_ops]
+            containing the string representation of the operators that will
+            be used on all of the circuits in the expectation calculations.
+        num_threads_in_sim: Python integer to specify the number of threads in
+            QSim SimulatorCUDA. One of [32, 64, 128, 256]
+        block_count: Python integer for the number of blocks in QSim
+            StateSpaceCUDA. One of [2, 16]
+        thread_per_block: Python integer to specify the number of threads per
+            block in StateSpaceCUDA. One of [32, 64, 128, 256, 512, 1024]
+        cpu_cycle: Python integer to specify the number of CPU cycles; the
+            kernel forwards it as the cost argument of its `ParallelFor` call.
+
+    Returns:
+        `tf.Tensor` with shape [batch_size, n_ops] that holds the
+            expectation value for each circuit with each op applied to it
+            (after resolving the corresponding parameters in).
+    """
+    return SIM_OP_MODULE.tfq_simulate_expectation_gpu_cpu(
+        programs, symbol_names, tf.cast(symbol_values, tf.float32), pauli_sums,
+        num_threads_in_sim=num_threads_in_sim,
+        block_count=block_count,
+        thread_per_block=thread_per_block,
+        cpu_cycle=cpu_cycle)
diff --git a/tensorflow_quantum/core/ops/gpu/tfq_simulate_ops_cuda_test.py b/tensorflow_quantum/core/ops/gpu/tfq_simulate_ops_cuda_test.py
new file mode 100644
index 000000000..5626823ce
--- /dev/null
+++ b/tensorflow_quantum/core/ops/gpu/tfq_simulate_ops_cuda_test.py
@@ -0,0 +1,130 @@
+# Copyright 2020 The TensorFlow Quantum Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests that specifically target tfq_simulate_ops_cuda."""
+import time
+import numpy as np
+from absl.testing import parameterized
+import tensorflow as tf
+import cirq
+
+from tensorflow_quantum.core.ops import tfq_simulate_ops
+from tensorflow_quantum.core.ops.gpu import tfq_simulate_ops_cuda
+from tensorflow_quantum.python import util
+
+
+class SimulateExpectationTest(tf.test.TestCase):
+    """Tests tfq_simulate_expectation."""
+
+    def test_simulate_expectation_diff(self):
+        """Make sure that cpu & gpu ops have the same results."""
+        # Skip unless TF 2 reports at least one physical GPU.
+        gpus = tf.config.list_physical_devices('GPU')
+        if not gpus:
+            self.skipTest("Expected at least 1 GPU but found {} GPUs".format(
+                len(gpus)))
+        n_qubits = 20
+        batch_size = 5
+        symbol_names = ['alpha']
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+            util.random_symbol_circuit_resolver_batch(
+                qubits, symbol_names, batch_size)
+
+        circuit_batch_tensor = util.convert_to_tensor(circuit_batch)
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        pauli_sums = util.random_pauli_sums(qubits, 3, batch_size)
+        pauli_sums_tensor = util.convert_to_tensor([[x] for x in pauli_sums])
+
+        cpu_avg_time = []
+        for _ in range(10):
+            cpu_time = time.time()
+            res_cpu = tfq_simulate_ops.tfq_simulate_expectation(
+                circuit_batch_tensor,
+                symbol_names, symbol_values_array.astype(np.float64),
+                pauli_sums_tensor)
+            cpu_time = time.time() - cpu_time
+            cpu_avg_time.append(cpu_time)
+        cpu_avg_time = sum(cpu_avg_time) / 10.0
+        print("\n\tCPU time: ", cpu_avg_time, "\n")
+
+        avg_cpu_with_gpu_time = []
+        for _ in range(10):
+            cpu_with_gpu_time = time.time()
+            with tf.device("/device:GPU:0"):
+                res_cpu_with_gpu = tfq_simulate_ops.tfq_simulate_expectation(
+                    circuit_batch_tensor,
+                    symbol_names, symbol_values_array.astype(np.float64),
+                    pauli_sums_tensor)
+            cpu_with_gpu_time = time.time() - cpu_with_gpu_time
+            avg_cpu_with_gpu_time.append(cpu_with_gpu_time)
+        avg_cpu_with_gpu_time = sum(avg_cpu_with_gpu_time) / 10.0
+
+        # Both are CPU devices.
+        self.assertEqual(res_cpu.device, res_cpu_with_gpu.device)
+        np.testing.assert_allclose(res_cpu, res_cpu_with_gpu)
+        print("\n\tCPU with GPU device time: ", avg_cpu_with_gpu_time, "\n")
+
+        @tf.function
+        def cpu_with_gpu_fn():
+            with tf.device("/device:GPU:0"):
+                return tfq_simulate_ops.tfq_simulate_expectation(
+                    circuit_batch_tensor,
+                    symbol_names, symbol_values_array.astype(np.float64),
+                    pauli_sums_tensor)
+
+        avg_fn_cpu_with_gpu_time = []
+        for _ in range(10):
+            fn_cpu_with_gpu_time = time.time()
+            res_fn_cpu_with_gpu = cpu_with_gpu_fn()
+            fn_cpu_with_gpu_time = time.time() - fn_cpu_with_gpu_time
+            avg_fn_cpu_with_gpu_time.append(fn_cpu_with_gpu_time)
+        avg_fn_cpu_with_gpu_time = sum(avg_fn_cpu_with_gpu_time) / 10.0
+
+        # CPU & GPU devices.
+        self.assertNotEqual(res_cpu.device, res_fn_cpu_with_gpu.device)
+        np.testing.assert_allclose(res_cpu, res_fn_cpu_with_gpu)
+        print("\n\ttf.function, CPU with GPU device time: ",
+              avg_fn_cpu_with_gpu_time, "\n")
+
+        avg_gpu_time = []
+        for _ in range(10):
+            gpu_time = time.time()
+            res_gpu = tfq_simulate_ops_cuda.tfq_simulate_expectation(
+                circuit_batch_tensor,
+                symbol_names, symbol_values_array.astype(np.float64),
+                pauli_sums_tensor)
+            gpu_time = time.time() - gpu_time
+            avg_gpu_time.append(gpu_time)
+        avg_gpu_time = sum(avg_gpu_time) / 10.0
+        print("\n\tGPU version time: ", avg_gpu_time, "\n")
+
+
+        # This guarantees that both tensors are not in the same devices
+        # (e.g. CPU vs GPU)
+        # self.assertNotEqual(res.device, res_gpu.device)
+        # -> this doesn't work anymore because TFQ op itself is in CPU.
+        # only qsim::SimulatorCUDA is in GPU
+        np.testing.assert_allclose(res_cpu, res_gpu)
+        # NOTE(review): wall-clock comparison; may be flaky on slow or shared GPUs.
+        self.assertGreater(cpu_avg_time, avg_gpu_time)
+
+
+if __name__ == "__main__":
+    tf.test.main()