From ef7c22a67062a4804565323ecbed0e167315e9db Mon Sep 17 00:00:00 2001
From: Jae Yoo <jaeyoo@google.com>
Date: Thu, 5 May 2022 07:52:57 +0900
Subject: [PATCH] Add initial qsim CUDA version expectation op

---
 tensorflow_quantum/core/ops/gpu/BUILD         |  95 +++++
 .../tfq_simulate_expectation_op_cuda.cu.cc    | 403 ++++++++++++++++++
 .../core/ops/gpu/tfq_simulate_ops_cuda.py     |  61 +++
 .../ops/gpu/tfq_simulate_ops_cuda_test.py     | 130 ++++++
 4 files changed, 689 insertions(+)
 create mode 100644 tensorflow_quantum/core/ops/gpu/BUILD
 create mode 100644 tensorflow_quantum/core/ops/gpu/tfq_simulate_expectation_op_cuda.cu.cc
 create mode 100644 tensorflow_quantum/core/ops/gpu/tfq_simulate_ops_cuda.py
 create mode 100644 tensorflow_quantum/core/ops/gpu/tfq_simulate_ops_cuda_test.py

diff --git a/tensorflow_quantum/core/ops/gpu/BUILD b/tensorflow_quantum/core/ops/gpu/BUILD
new file mode 100644
index 000000000..4a5cd0a30
--- /dev/null
+++ b/tensorflow_quantum/core/ops/gpu/BUILD
@@ -0,0 +1,95 @@
+load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])
+
+# Make __init__.py referenceable from other packages (used for the PIP package).
+exports_files(["__init__.py"])
+
+config_setting(
+    name = "windows",  # Referenced as ":windows" by the select() calls below.
+    constraint_values = ["@bazel_tools//platforms:windows"],
+)
+
+# Shared library holding the qsim CUDA expectation op kernel and registration.
+tf_cuda_library(
+    name = "_tfq_simulate_ops_cuda.so",
+    srcs = [
+        "tfq_simulate_expectation_op_cuda.cu.cc",
+    ],
+    copts = select({
+        ":windows": [
+            "/D__CLANG_SUPPORT_DYN_ANNOTATION__",
+            "/D_USE_MATH_DEFINES",
+            "/DEIGEN_MPL2_ONLY",
+            "/DEIGEN_MAX_ALIGN_BYTES=64",
+            "/DEIGEN_HAS_TYPE_TRAITS=0",
+            "/DTF_USE_SNAPPY",
+            "/showIncludes",
+            "/MD",
+            "/O2",
+            "/DNDEBUG",
+            "/w",
+            "-DWIN32_LEAN_AND_MEAN",
+            "-DNOGDI",
+            "/d2ReducedOptimizeHugeFunctions",
+            "/arch:AVX",
+            "/std:c++14",
+            "-DTENSORFLOW_MONOLITHIC_BUILD",
+            "/DPLATFORM_WINDOWS",
+            "/DEIGEN_HAS_C99_MATH",
+            "/DTENSORFLOW_USE_EIGEN_THREADPOOL",
+            "/DEIGEN_AVOID_STL_ARRAY",
+            "/Iexternal/gemmlowp",
+            "/wd4018",
+            "/wd4577",
+            "/UTF_COMPILE_LIBRARY",
+        ],
+        "//conditions:default": [
+            "-pthread",
+            "-std=c++14",
+            "-D_GLIBCXX_USE_CXX11_ABI=0",
+            "-O3",
+            "-DNV_CUDNN_DISABLE_EXCEPTION",
+        ],
+    }),
+    features = select({
+        ":windows": ["windows_export_all_symbols"],
+        "//conditions:default": [],
+    }),
+    linkshared = 1,  # NOTE(review): looks like a cc_binary attribute; confirm tf_cuda_library accepts it.
+    deps = [
+        # cirq cc proto
+        "//tensorflow_quantum/core/ops:parse_context",
+        "//tensorflow_quantum/core/ops:tfq_simulate_utils",
+        "//tensorflow_quantum/core/proto:pauli_sum_cc_proto",
+        "//tensorflow_quantum/core/proto:program_cc_proto",
+        "//tensorflow_quantum/core/src:circuit_parser_qsim",
+        "//tensorflow_quantum/core/src:util_qsim",
+        "@qsim//lib:qsim_cuda_lib",  # External-repo labels take the form @repo//package:target.
+        "@eigen//:eigen3",
+        # tensorflow core framework
+        # tensorflow core lib
+        # tensorflow core protos
+    ],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_tfq_simulate_ops_cuda_py",
+    out = "gen_tfq_simulate_ops_cuda.py",  # File name of the generated Python wrappers.
+    deps = [":_tfq_simulate_ops_cuda.so"],  # The CUDA op library defined above.
+)
+
+py_binary(
+    name = "gpu_benchmark",
+    srcs = ["gpu_benchmark.py"],  # NOTE(review): not among the files this patch creates; confirm it exists.
+    python_version = "PY3",
+    deps = [
+        "//tensorflow_quantum/core/ops:tfq_simulate_ops_cuda_py",  # NOTE(review): parent package, not this one; confirm the target.
+        "//tensorflow_quantum/core/ops:tfq_simulate_ops_py",
+        "//tensorflow_quantum/core/serialize:serializer",
+        "//tensorflow_quantum/python:util",
+    ],
+)
diff --git a/tensorflow_quantum/core/ops/gpu/tfq_simulate_expectation_op_cuda.cu.cc b/tensorflow_quantum/core/ops/gpu/tfq_simulate_expectation_op_cuda.cu.cc
new file mode 100644
index 000000000..ca81c8e00
--- /dev/null
+++ b/tensorflow_quantum/core/ops/gpu/tfq_simulate_expectation_op_cuda.cu.cc
@@ -0,0 +1,403 @@
+/* Copyright 2020 The TensorFlow Quantum Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <vector>
+
+#include <chrono>
+
+#include "../qsim/lib/circuit.h"
+#include "../qsim/lib/gate_appl.h"
+#include "../qsim/lib/gates_cirq.h"
+#include "../qsim/lib/gates_qsim.h"
+#include "../qsim/lib/seqfor.h"
+#include "../qsim/lib/simulator_cuda.h"
+#include "../qsim/lib/statespace_cuda.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow_quantum/core/ops/parse_context.h"
+#include "tensorflow_quantum/core/proto/pauli_sum.pb.h"
+#include "tensorflow_quantum/core/proto/program.pb.h"
+#include "tensorflow_quantum/core/src/util_qsim.h"
+
+namespace tfq {
+
+using ::tensorflow::Status;
+using ::tfq::proto::PauliSum;
+using ::tfq::proto::Program;
+
+using QsimGate = qsim::Cirq::GateCirq<float>;  // Cirq gate type, float precision
+using QsimCircuit = qsim::Circuit<QsimGate>;  // circuit built from QsimGate
+
+
+// Allocates the scratch tensors whose buffers Compute() passes to the qsim
+// simulator (d_wf, d_idx, d_ms, d_xss). TensorShape dimensions are element
+// counts, so the sizes are given in elements of each dtype, not in bytes.
+// Returns the first allocation failure, or Status::OK().
+Status AllocateQsimTempTensors(
+    tensorflow::OpKernelContext* context, tensorflow::Tensor* d_wf_tensor,
+    tensorflow::Tensor* d_idx_tensor, tensorflow::Tensor* d_ms_tensor,
+    tensorflow::Tensor* d_xss_tensor) {
+  tensorflow::AllocatorAttributes alloc_attr;
+  alloc_attr.set_on_host(false);
+  alloc_attr.set_gpu_compatible(true);
+  TF_RETURN_IF_ERROR(context->allocate_temp(
+      tensorflow::DataType::DT_FLOAT, tensorflow::TensorShape({131072}),
+      d_wf_tensor, alloc_attr));
+  TF_RETURN_IF_ERROR(context->allocate_temp(
+      tensorflow::DataType::DT_UINT32, tensorflow::TensorShape({992}),
+      d_idx_tensor, alloc_attr));
+  TF_RETURN_IF_ERROR(context->allocate_temp(
+      tensorflow::DataType::DT_UINT64, tensorflow::TensorShape({7}),
+      d_ms_tensor, alloc_attr));
+  TF_RETURN_IF_ERROR(context->allocate_temp(
+      tensorflow::DataType::DT_UINT64, tensorflow::TensorShape({64}),
+      d_xss_tensor, alloc_attr));
+  return Status::OK();
+}
+
+// __global__ void ComputeSmallCudaKernel(const int total_size,
+//     int output_dim_op_size, int* num_qubits,
+//     const thrust::host_vector<thrust::host_vector<qsim::GateFused<QsimGate>>>& fused_circuits,
+//     const thrust::host_vector<thrust::host_vector<PauliSum>>& pauli_sums,
+//     float* out) {
+//   int old_batch_index = -2;
+//   int cur_batch_index = -1;
+//   int largest_nq = 1;
+//   int cur_op_index;
+//   auto sv = ss.Create(largest_nq);
+//   auto scratch = ss.Create(largest_nq);
+
+//   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < total_size;
+//        i += blockDim.x * gridDim.x) {
+//     cur_batch_index = i / output_dim_op_size;
+//     cur_op_index = i % output_dim_op_size;
+
+//     const int nq = num_qubits[cur_batch_index];
+//     // (#679) Just ignore empty program
+//     if (fused_circuits[cur_batch_index].size() == 0) {
+//       out[i] = -2.0;
+//       continue;
+//     }
+//     if (cur_batch_index != old_batch_index) {
+//       // We've run into a new state vector we must compute.
+//       // Only compute a new state vector when we have to.
+//       if (nq > largest_nq) {
+//         largest_nq = nq;
+//         sv = ss.Create(largest_nq);
+//         scratch = ss.Create(largest_nq);
+//       }
+//       // no need to update scratch_state since ComputeExpectation
+//       // will take care of things for us.
+//       ss.SetStateZero(sv);
+//       for (int j = 0; j < fused_circuits[cur_batch_index].size(); j++) {
+//         qsim::ApplyFusedGate(sim, fused_circuits[cur_batch_index][j], sv);
+//       }
+//     }
+
+//     float exp_v = 0.0;
+//     ComputeExpectationQsim(pauli_sums[cur_batch_index][cur_op_index],
+//                            sim, ss, sv, scratch, &exp_v),
+//     out[i] = exp_v;
+//     old_batch_index = cur_batch_index;
+//   }
+// }
+
+// OpKernel computing Pauli-sum expectations with qsim's SimulatorCUDA.
+class TfqSimulateExpectationOpGpuCpu : public tensorflow::OpKernel {
+ public:
+  explicit TfqSimulateExpectationOpGpuCpu(tensorflow::OpKernelConstruction* context)
+      : OpKernel(context) {
+    // "cpu_cycle" is the cost value ComputeSmall passes to ParallelFor.
+    OP_REQUIRES_OK(context, context->GetAttr("cpu_cycle", &cpu_cycle_));
+
+    // Thread count forwarded to Simulator::Parameter::num_threads.
+    OP_REQUIRES_OK(context, context->GetAttr("num_threads_in_sim",
+                                             &num_threads_in_sim_));
+
+    // Block and thread counts forwarded to StateSpace::Parameter.
+    OP_REQUIRES_OK(context, context->GetAttr("block_count", &block_count_));
+    OP_REQUIRES_OK(context, context->GetAttr("thread_per_block",
+                                             &thread_per_block_));
+  }
+
+  void Compute(tensorflow::OpKernelContext* context) override {
+    // TODO (mbbrough): add more dimension checks for other inputs here.
+    const int num_inputs = context->num_inputs();
+    OP_REQUIRES(context, num_inputs == 4,
+                tensorflow::errors::InvalidArgument(absl::StrCat(
+                    "Expected 4 inputs, got ", num_inputs, " inputs.")));
+
+    // Allocate the output: rows from input 0, columns from input 3.
+    const int output_dim_batch_size = context->input(0).dim_size(0);
+    const int output_dim_op_size = context->input(3).dim_size(1);
+    tensorflow::TensorShape output_shape;
+    output_shape.AddDim(output_dim_batch_size);
+    output_shape.AddDim(output_dim_op_size);
+
+    tensorflow::Tensor* output = nullptr;
+    tensorflow::AllocatorAttributes alloc_attr;
+    alloc_attr.set_on_host(true);
+    alloc_attr.set_gpu_compatible(true);
+    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output,
+                                                     alloc_attr));
+    auto output_tensor = output->matrix<float>();
+    // Parse program protos.
+    std::vector<Program> programs;
+    std::vector<int> num_qubits;
+    std::vector<std::vector<PauliSum>> pauli_sums;
+    OP_REQUIRES_OK(context, GetProgramsAndNumQubits(context, &programs,
+                                                    &num_qubits, &pauli_sums));
+
+    std::vector<SymbolMap> maps;
+    OP_REQUIRES_OK(context, GetSymbolMaps(context, &maps));
+
+    OP_REQUIRES(context, programs.size() == maps.size(),
+                tensorflow::errors::InvalidArgument(absl::StrCat(
+                    "Number of circuits and symbol_values do not match. Got ",
+                    programs.size(), " circuits and ", maps.size(),
+                    " symbol values.")));
+
+    // Build one qsim circuit (plus its fused gates) per program, in parallel.
+    std::vector<QsimCircuit> qsim_circuits(programs.size(), QsimCircuit());
+    std::vector<std::vector<qsim::GateFused<QsimGate>>> fused_circuits(
+        programs.size(), std::vector<qsim::GateFused<QsimGate>>({}));
+
+    Status parse_status = Status::OK();
+    auto p_lock = tensorflow::mutex();
+    auto construct_f = [&](int start, int end) {
+      for (int i = start; i < end; i++) {
+        Status local =
+            QsimCircuitFromProgram(programs[i], maps[i], num_qubits[i],
+                                   &qsim_circuits[i], &fused_circuits[i]);
+        NESTED_FN_STATUS_SYNC(parse_status, local, p_lock);
+      }
+    };
+
+    const int num_cycles = 1000;  // cost value handed to ParallelFor below
+    context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor(
+        programs.size(), num_cycles, construct_f);
+    OP_REQUIRES_OK(context, parse_status);
+
+    int max_num_qubits = 0;
+    for (const int num : num_qubits) {
+      max_num_qubits = std::max(max_num_qubits, num);
+    }
+
+    if (max_num_qubits >= 26 || programs.size() == 1) {  // sequential path
+      tensorflow::Tensor d_wf_tensor;
+      tensorflow::Tensor d_idx_tensor;
+      tensorflow::Tensor d_ms_tensor;
+      tensorflow::Tensor d_xss_tensor;
+      OP_REQUIRES_OK(context, AllocateQsimTempTensors(
+          context, &d_wf_tensor, &d_idx_tensor, &d_ms_tensor, &d_xss_tensor));
+      ComputeLarge(num_qubits, fused_circuits, pauli_sums, context,
+                   &output_tensor, d_wf_tensor.flat<float>().data(),
+                   d_idx_tensor.flat<unsigned>().data(),
+                   d_ms_tensor.flat<uint64_t>().data(),
+                   d_xss_tensor.flat<uint64_t>().data());
+    } else {  // otherwise shard the work across the CPU worker threads
+      ComputeSmall(num_qubits, max_num_qubits, fused_circuits, pauli_sums,
+                   context, &output_tensor);
+    }
+  }
+
+ private:
+  int num_threads_in_sim_;  // attr "num_threads_in_sim"
+  int thread_per_block_;  // attr "thread_per_block"
+  int block_count_;  // attr "block_count"
+  int cpu_cycle_;  // attr "cpu_cycle"
+
+  // Runs every circuit sequentially on one Simulator/StateSpace pair.
+  void ComputeLarge(
+      const std::vector<int>& num_qubits,
+      const std::vector<std::vector<qsim::GateFused<QsimGate>>>& fused_circuits,
+      const std::vector<std::vector<PauliSum>>& pauli_sums,
+      tensorflow::OpKernelContext* context,
+      tensorflow::TTypes<float, 1>::Matrix* output_tensor,
+      float* d_wf, unsigned* d_idx, uint64_t* d_ms, uint64_t* d_xss) {
+    // Instantiate qsim objects.
+    using Simulator = qsim::SimulatorCUDA<float>;
+    using StateSpace = Simulator::StateSpace;
+    // Copy the op attributes into the qsim simulator and
+    // state-space parameter structs.
+    Simulator::Parameter param_sim;
+    param_sim.num_threads = num_threads_in_sim_;
+
+    StateSpace::Parameter param_ss;
+    param_ss.num_threads = thread_per_block_;
+    param_ss.num_dblocks = block_count_;
+
+    // Begin simulation.
+    int largest_nq = 1;
+    Simulator sim = Simulator(param_sim, d_wf, d_idx, d_ms, d_xss);
+    StateSpace ss = StateSpace(param_ss);
+    auto sv = ss.Create(largest_nq);
+    auto scratch = ss.Create(largest_nq);
+
+    // Simulate programs one by one. Work is parallelized over the state
+    // vector, so we no longer parallelize over circuits. Each time we
+    // encounter a larger circuit we grow the state vector as necessary.
+    for (int i = 0; i < fused_circuits.size(); i++) {
+      int nq = num_qubits[i];
+
+      if (nq > largest_nq) {
+        // need to switch to larger statespace.
+        largest_nq = nq;
+        sv = ss.Create(largest_nq);
+        scratch = ss.Create(largest_nq);
+      }
+      // TODO: add heuristic here so that we do not always recompute
+      //  the state if there is a possibility that circuit[i] and
+      //  circuit[i + 1] produce the same state.
+      ss.SetStateZero(sv);
+      for (int j = 0; j < fused_circuits[i].size(); j++) {
+        qsim::ApplyFusedGate(sim, fused_circuits[i][j], sv);
+      }
+      for (int j = 0; j < pauli_sums[i].size(); j++) {
+        // (#679) Just ignore empty program
+        if (fused_circuits[i].size() == 0) {
+          (*output_tensor)(i, j) = -2.0;
+          continue;
+        }
+        float exp_v = 0.0;
+        OP_REQUIRES_OK(context,
+                       ComputeExpectationQsim(pauli_sums[i][j], sim, ss, sv,
+                                              scratch, &exp_v));
+        (*output_tensor)(i, j) = exp_v;
+      }
+    }
+  }
+
+  void ComputeSmall(  // shards (circuit, op) pairs across CPU worker threads
+      const std::vector<int>& num_qubits, const int max_num_qubits,
+      const std::vector<std::vector<qsim::GateFused<QsimGate>>>& fused_circuits,
+      const std::vector<std::vector<PauliSum>>& pauli_sums,
+      tensorflow::OpKernelContext* context,
+      tensorflow::TTypes<float, 1>::Matrix* output_tensor) {
+    using Simulator = qsim::SimulatorCUDA<float>;
+    using StateSpace = Simulator::StateSpace;
+    // Same attribute-driven qsim parameters as in ComputeLarge.
+    Simulator::Parameter param_sim;
+    param_sim.num_threads = num_threads_in_sim_;
+
+    StateSpace::Parameter param_ss;
+    param_ss.num_threads = thread_per_block_;
+    param_ss.num_dblocks = block_count_;
+
+    const int output_dim_op_size = output_tensor->dimension(1);
+
+    Status compute_status = Status::OK();
+    auto c_lock = tensorflow::mutex();
+    auto DoWork = [&](int start, int end) {
+      int old_batch_index = -2;
+      int cur_batch_index = -1;
+      int largest_nq = 1;
+      int cur_op_index;
+
+      // Each shard builds its own simulator and state space.
+      // TODO: consider passing d_wf, d_idx, d_ms, d_xss here as well.
+      auto sim = Simulator(param_sim);
+      auto ss = StateSpace(param_ss);
+      auto sv = ss.Create(largest_nq);
+      auto scratch = ss.Create(largest_nq);
+      for (int i = start; i < end; i++) {
+        cur_batch_index = i / output_dim_op_size;  // output row (circuit)
+        cur_op_index = i % output_dim_op_size;  // output column (operator)
+
+        const int nq = num_qubits[cur_batch_index];
+
+        // (#679) Just ignore empty program
+        if (fused_circuits[cur_batch_index].size() == 0) {
+          (*output_tensor)(cur_batch_index, cur_op_index) = -2.0;
+          continue;
+        }
+
+        if (cur_batch_index != old_batch_index) {
+          // We've run into a new state vector we must compute.
+          // Only compute a new state vector when we have to.
+          if (nq > largest_nq) {
+            largest_nq = nq;
+            sv = ss.Create(largest_nq);
+            scratch = ss.Create(largest_nq);
+          }
+          // no need to update scratch_state since ComputeExpectation
+          // will take care of things for us.
+          ss.SetStateZero(sv);
+          for (int j = 0; j < fused_circuits[cur_batch_index].size(); j++) {
+            qsim::ApplyFusedGate(sim, fused_circuits[cur_batch_index][j], sv);
+          }
+        }
+
+        float exp_v = 0.0;
+        NESTED_FN_STATUS_SYNC(
+            compute_status,
+            ComputeExpectationQsim(pauli_sums[cur_batch_index][cur_op_index],
+                                   sim, ss, sv, scratch, &exp_v),
+            c_lock);
+        (*output_tensor)(cur_batch_index, cur_op_index) = exp_v;
+        old_batch_index = cur_batch_index;
+      }
+    };
+
+    const int64_t num_cycles = static_cast<int64_t>(cpu_cycle_);
+    context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor(
+        fused_circuits.size() * output_dim_op_size, num_cycles, DoWork);
+    OP_REQUIRES_OK(context, compute_status);
+  }
+};
+
+REGISTER_KERNEL_BUILDER(  // the kernel is registered for the CPU device
+    Name("TfqSimulateExpectationGpuCpu").Device(tensorflow::DEVICE_CPU),
+    TfqSimulateExpectationOpGpuCpu);
+
+// Op interface: programs, symbol names/values and Pauli sums in;
+// a float matrix of expectation values out.
+REGISTER_OP("TfqSimulateExpectationGpuCpu")
+    .Input("programs: string")
+    .Input("symbol_names: string")
+    .Input("symbol_values: float")
+    .Input("pauli_sums: string")
+    .Output("expectations: float")
+    .Attr("num_threads_in_sim: int >= 32 = 32")
+    .Attr("block_count: int >= 2 = 2")
+    .Attr("thread_per_block: int >= 32 = 32")
+    .Attr("cpu_cycle: int >= 1 = 1")
+    .SetShapeFn([](tensorflow::shape_inference::InferenceContext* c) {
+      namespace shape_inf = tensorflow::shape_inference;
+
+      // Inputs 0 and 1 must be rank 1; inputs 2 and 3 must be rank 2.
+      shape_inf::ShapeHandle programs;
+      shape_inf::ShapeHandle names;
+      shape_inf::ShapeHandle values;
+      shape_inf::ShapeHandle operators;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &programs));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &names));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &values));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 2, &operators));
+
+      // One output row per program, one column per operator.
+      const auto rows = c->Dim(programs, 0);
+      const auto cols = c->Dim(operators, 1);
+      c->set_output(0, c->Matrix(rows, cols));
+      return tensorflow::Status::OK();
+    });
+
+}  // namespace tfq
diff --git a/tensorflow_quantum/core/ops/gpu/tfq_simulate_ops_cuda.py b/tensorflow_quantum/core/ops/gpu/tfq_simulate_ops_cuda.py
new file mode 100644
index 000000000..6f3ac1784
--- /dev/null
+++ b/tensorflow_quantum/core/ops/gpu/tfq_simulate_ops_cuda.py
@@ -0,0 +1,61 @@
+# Copyright 2020 The TensorFlow Quantum Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Module to register cuda simulation python op."""
+import os
+import tensorflow as tf
+from tensorflow_quantum.core.ops.load_module import load_module
+
+SIM_OP_MODULE = load_module(os.path.join("gpu", "_tfq_simulate_ops_gpu.so"))
+
+
+def tfq_simulate_expectation(programs, symbol_names, symbol_values, pauli_sums,
+                             num_threads_in_sim=32, block_count=2,
+                             thread_per_block=32, cpu_cycle=200):
+    """Calculates the expectation value of circuits wrt some operator(s).
+
+    Args:
+        programs: `tf.Tensor` of strings with shape [batch_size] containing
+            the string representations of the circuits to be executed.
+        symbol_names: `tf.Tensor` of strings with shape [n_params], which
+            is used to specify the order in which the values in
+            `symbol_values` should be placed inside of the circuits in
+            `programs`.
+        symbol_values: `tf.Tensor` of real numbers with shape
+            [batch_size, n_params] specifying parameter values to resolve
+            into the circuits specificed by programs, following the ordering
+            dictated by `symbol_names`.
+        pauli_sums: `tf.Tensor` of strings with shape [batch_size, n_ops]
+            containing the string representation of the operators that will
+            be used on all of the circuits in the expectation calculations.
+        num_threads_in_sim: Python integer to specify the number of threads in
+            QSim SimulatorCUDA. One of [32, 64, 128, 256]
+        block_count: Python integer for the number of blocks in QSim
+            StateSpaceCUDA. One of [2, 16]
+        thread_per_block: Python integer to specify the number of threads per
+            block in StateSpaceCUDA. One of [32, 64, 128, 256, 512, 1024]
+        cpu_cycle: Python integer to specify the number of CPU cycles.
+
+    Returns:
+        `tf.Tensor` with shape [batch_size, n_ops] that holds the
+            expectation value for each circuit with each op applied to it
+            (after resolving the corresponding parameters in).
+    """
+    return SIM_OP_MODULE.tfq_simulate_expectation_gpu_cpu(
+        programs, symbol_names, tf.cast(symbol_values, tf.float32), pauli_sums,
+        num_threads_in_sim=num_threads_in_sim,
+        block_count=block_count,
+        thread_per_block=thread_per_block,
+        cpu_cycle=cpu_cycle)
+
diff --git a/tensorflow_quantum/core/ops/gpu/tfq_simulate_ops_cuda_test.py b/tensorflow_quantum/core/ops/gpu/tfq_simulate_ops_cuda_test.py
new file mode 100644
index 000000000..5626823ce
--- /dev/null
+++ b/tensorflow_quantum/core/ops/gpu/tfq_simulate_ops_cuda_test.py
@@ -0,0 +1,130 @@
+# Copyright 2020 The TensorFlow Quantum Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests that specifically target tfq_simulate_ops_cuda."""
+import time
+import numpy as np
+from absl.testing import parameterized
+import tensorflow as tf
+import cirq
+
+from tensorflow_quantum.core.ops import tfq_simulate_ops
+from tensorflow_quantum.core.ops import tfq_simulate_ops_gpu
+from tensorflow_quantum.core.ops import tfq_simulate_ops_gpu_cpu
+from tensorflow_quantum.python import util
+
+
+class SimulateExpectationTest(tf.test.TestCase):
+    """Compares the CUDA expectation op against the CPU reference op."""
+
+    def test_simulate_expectation_diff(self):
+        """Checks CPU/GPU results agree and that the GPU op runs faster."""
+        # Skip unless TensorFlow (TF 2 API) reports at least one physical GPU.
+        gpus = tf.config.list_physical_devices('GPU')
+        if len(gpus) < 1:
+            self.skipTest("Expected at least 1 GPU but found {} GPUs".format(
+                len(gpus)))
+        n_qubits = 20
+        batch_size = 5
+        symbol_names = ['alpha']
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+            util.random_symbol_circuit_resolver_batch(
+                qubits, symbol_names, batch_size)
+
+        circuit_batch_tensor = util.convert_to_tensor(circuit_batch)
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        pauli_sums = util.random_pauli_sums(qubits, 3, batch_size)
+        pauli_sums_tensor = util.convert_to_tensor([[x] for x in pauli_sums])
+
+        cpu_avg_time = []  # wall-clock seconds per run, averaged below
+        for _ in range(10):
+            cpu_time = time.time()
+            res_cpu = tfq_simulate_ops.tfq_simulate_expectation(
+                circuit_batch_tensor,
+                symbol_names, symbol_values_array.astype(np.float64),
+                pauli_sums_tensor)
+            cpu_time = time.time() - cpu_time
+            cpu_avg_time.append(cpu_time)
+        cpu_avg_time = sum(cpu_avg_time) / 10.0
+        print("\n\tCPU time: ", cpu_avg_time, "\n")
+
+        avg_cpu_with_gpu_time = []  # same CPU op, run under a GPU device scope
+        for _ in range(10):
+            cpu_with_gpu_time = time.time()
+            with tf.device("/device:GPU:0"):
+                res_cpu_with_gpu = tfq_simulate_ops.tfq_simulate_expectation(
+                    circuit_batch_tensor,
+                    symbol_names, symbol_values_array.astype(np.float64),
+                    pauli_sums_tensor)
+            cpu_with_gpu_time = time.time() - cpu_with_gpu_time
+            avg_cpu_with_gpu_time.append(cpu_with_gpu_time)
+        avg_cpu_with_gpu_time = sum(avg_cpu_with_gpu_time) / 10.0
+
+        # Both are CPU devices.
+        self.assertEqual(res_cpu.device, res_cpu_with_gpu.device)
+        np.testing.assert_allclose(res_cpu, res_cpu_with_gpu)
+        print("\n\tCPU with GPU device time: ", avg_cpu_with_gpu_time, "\n")
+
+        @tf.function
+        def cpu_with_gpu_fn():
+            with tf.device("/device:GPU:0"):
+                return tfq_simulate_ops.tfq_simulate_expectation(
+                    circuit_batch_tensor,
+                    symbol_names, symbol_values_array.astype(np.float64),
+                    pauli_sums_tensor)
+
+        avg_fn_cpu_with_gpu_time = []  # same CPU op, wrapped in tf.function
+        for _ in range(10):
+            fn_cpu_with_gpu_time = time.time()
+            res_fn_cpu_with_gpu = cpu_with_gpu_fn()
+            fn_cpu_with_gpu_time = time.time() - fn_cpu_with_gpu_time
+            avg_fn_cpu_with_gpu_time.append(fn_cpu_with_gpu_time)
+        avg_fn_cpu_with_gpu_time = sum(avg_fn_cpu_with_gpu_time) / 10.0
+
+        # CPU & GPU devices.
+        self.assertNotEqual(res_cpu.device, res_fn_cpu_with_gpu.device)
+        np.testing.assert_allclose(res_cpu, res_fn_cpu_with_gpu)
+        print("\n\ttf.function, CPU with GPU device time: ",
+              avg_fn_cpu_with_gpu_time, "\n")
+
+        avg_gpu_time = []  # the op variant under test
+        for _ in range(10):
+            gpu_time = time.time()
+            res_gpu = tfq_simulate_ops_gpu_cpu.tfq_simulate_expectation(
+                circuit_batch_tensor,
+                symbol_names, symbol_values_array.astype(np.float64),
+                pauli_sums_tensor)
+            gpu_time = time.time() - gpu_time
+            avg_gpu_time.append(gpu_time)
+        avg_gpu_time = sum(avg_gpu_time) / 10.0
+        print("\n\tGPU version time: ", avg_gpu_time, "\n")
+
+
+        # Asserting that the two results live on different devices, e.g.
+        #   self.assertNotEqual(res.device, res_gpu.device)
+        # no longer works because the TFQ op itself is on the CPU; only
+        # qsim::SimulatorCUDA is on the GPU.
+        # NOTE: the timing assertion below depends on the machine's hardware.
+        np.testing.assert_allclose(res_cpu, res_gpu)
+        self.assertGreater(cpu_avg_time, avg_gpu_time)
+
+
+if __name__ == "__main__":  # run the tests when executed as a script
+    tf.test.main()