From 45be2eab777fb58256c0f730d11cec8f0436f067 Mon Sep 17 00:00:00 2001 From: Po-Hsien Chu Date: Wed, 29 Sep 2021 01:38:45 +0800 Subject: [PATCH] Add Pixel backend (#77) * Add Pixel backend * add license header to resize_argmax_op* --- Makefile | 7 +- android/java/org/mlperf/inference/BUILD | 42 +- .../org/mlperf/inference/Backends.java.in | 2 +- mobile_back_pixel/BUILD | 9 + mobile_back_pixel/Makefile | 111 ++++ mobile_back_pixel/README.md | 14 + mobile_back_pixel/cpp/BUILD | 18 + mobile_back_pixel/cpp/backend_tflite/BUILD | 84 +++ .../cpp/backend_tflite/resize_argmax_op.cc | 362 +++++++++++++ .../cpp/backend_tflite/resize_argmax_op.h | 22 + .../cpp/backend_tflite/tflite_pixel.cc | 484 ++++++++++++++++++ .../backend_tflite/tflite_settings_pixel.h | 134 +++++ .../cpp/backend_tflite/thread_pool.h | 110 ++++ mobile_back_pixel/pixel_backend.mk | 21 + mobile_back_pixel/third_party/BUILD.bazel | 19 + .../tf_gpu_delegate_fix_from_tf_master.diff | 37 ++ .../third_party/tf_grappler_cost.diff | 19 + .../cpp/backend_tflite/tflite_settings.h | 4 +- 18 files changed, 1480 insertions(+), 19 deletions(-) create mode 100644 mobile_back_pixel/BUILD create mode 100644 mobile_back_pixel/Makefile create mode 100644 mobile_back_pixel/README.md create mode 100644 mobile_back_pixel/cpp/BUILD create mode 100644 mobile_back_pixel/cpp/backend_tflite/BUILD create mode 100644 mobile_back_pixel/cpp/backend_tflite/resize_argmax_op.cc create mode 100644 mobile_back_pixel/cpp/backend_tflite/resize_argmax_op.h create mode 100644 mobile_back_pixel/cpp/backend_tflite/tflite_pixel.cc create mode 100644 mobile_back_pixel/cpp/backend_tflite/tflite_settings_pixel.h create mode 100644 mobile_back_pixel/cpp/backend_tflite/thread_pool.h create mode 100644 mobile_back_pixel/pixel_backend.mk create mode 100644 mobile_back_pixel/third_party/BUILD.bazel create mode 100644 mobile_back_pixel/third_party/tf_gpu_delegate_fix_from_tf_master.diff create mode 100644 mobile_back_pixel/third_party/tf_grappler_cost.diff diff --git a/Makefile b/Makefile index ed3db4953..945ae3103 100644 --- a/Makefile +++ b/Makefile @@ -22,6 +22,7 @@ SAMSUNG_BACKEND= all: app include mobile_back_tflite/tflite_backend.mk +include mobile_back_pixel/pixel_backend.mk include mobile_back_qti/make/qti_backend.mk @@ -95,7 +96,7 @@ app: output/mlperf_mobile_docker_1_0.stamp ${QTI_DEPS} @mkdir -p output/home/mlperf/cache && chmod 777 output/home/mlperf/cache @docker run \ ${COMMON_DOCKER_FLAGS} \ - ${QTI_BACKEND} ${SAMSUNG_BACKEND} ${MEDIATEK_BACKEND} \ + ${QTI_BACKEND} ${SAMSUNG_BACKEND} ${MEDIATEK_BACKEND} ${PIXEL_BACKEND} \ --fat_apk_cpu=arm64-v8a \ //android/java/org/mlperf/inference:mlperf_app @cp output/`readlink bazel-bin`/android/java/org/mlperf/inference/mlperf_app.apk output/mlperf_app.apk @@ -106,7 +107,7 @@ app_x86_64: output/mlperf_mobile_docker_1_0.stamp @mkdir -p output/home/mlperf/cache && chmod 777 output/home/mlperf/cache @docker run \ ${COMMON_DOCKER_FLAGS} \ - ${QTI_BACKEND} ${SAMSUNG_BACKEND} ${MEDIATEK_BACKEND} \ + ${QTI_BACKEND} ${SAMSUNG_BACKEND} ${MEDIATEK_BACKEND} ${PIXEL_BACKEND} \ --fat_apk_cpu=x86_64 \ //android/java/org/mlperf/inference:mlperf_app @cp output/`readlink bazel-bin`/android/java/org/mlperf/inference/mlperf_app.apk output/mlperf_app_x86_64.apk @@ -117,7 +118,7 @@ test_app: output/mlperf_mobile_docker_1_0.stamp @mkdir -p output/home/mlperf/cache && chmod 777 output/home/mlperf/cache @docker run \ ${COMMON_DOCKER_FLAGS} \ - ${QTI_BACKEND} ${SAMSUNG_BACKEND} ${MEDIATEK_BACKEND} \ + ${QTI_BACKEND} ${SAMSUNG_BACKEND} 
${MEDIATEK_BACKEND} ${PIXEL_BACKEND} \ --fat_apk_cpu=x86_64,arm64-v8a \ //androidTest:mlperf_test_app @cp output/`readlink bazel-bin`/android/androidTest/mlperf_test_app.apk output/mlperf_test_app.apk diff --git a/android/java/org/mlperf/inference/BUILD b/android/java/org/mlperf/inference/BUILD index 490a2f004..3d3c28001 100644 --- a/android/java/org/mlperf/inference/BUILD +++ b/android/java/org/mlperf/inference/BUILD @@ -58,6 +58,18 @@ config_setting( }, ) +string_flag( + name = "with_pixel", + build_setting_default = "0", +) + +config_setting( + name = "use_pixel", + flag_values = { + ":with_pixel": "1", + }, +) + java_lite_proto_library( name = "mlperf_task_java_proto_lite", deps = ["//android/cpp/proto:mlperf_task_proto"], @@ -72,16 +84,19 @@ genrule( "Backends.java", ], cmd = "cat $(location Backends.java.in) " + - select({ - ":use_qti": "| sed -e 's/QTI/\"qti\",/' ", - "//conditions:default": "| sed -e 's/QTI//' ", - }) + select({ - ":use_samsung": "| sed -e 's/SAMSUNG/\"samsung\",/' ", - "//conditions:default": "| sed -e 's/SAMSUNG//'", - }) + select({ - ":use_mediatek": "| sed -e 's/MEDIATEK/\"tfliteneuron\",/' ", - "//conditions:default": "| sed -e 's/MEDIATEK//' ", - }) + " > $(@)", + select({ + ":use_pixel": "| sed -e 's/PIXEL/\"tflitepixel\",/' ", + "//conditions:default": "| sed -e 's/PIXEL//' ", + }) + select({ + ":use_qti": "| sed -e 's/QTI/\"qti\",/' ", + "//conditions:default": "| sed -e 's/QTI//' ", + }) + select({ + ":use_samsung": "| sed -e 's/SAMSUNG/\"samsung\",/' ", + "//conditions:default": "| sed -e 's/SAMSUNG//'", + }) + select({ + ":use_mediatek": "| sed -e 's/MEDIATEK/\"tfliteneuron\",/' ", + "//conditions:default": "| sed -e 's/MEDIATEK//' ", + }) + " > $(@)", ) android_library( @@ -182,13 +197,14 @@ cc_library( deps = [ "//mobile_back_tflite:tflitebackend", ] + select({ - ":use_samsung": [ - "//mobile_back_samsung:samsungbackend", - ], + ":use_samsung": ["//mobile_back_samsung:samsungbackend"], "//conditions:default": [], }) + select({ ":use_mediatek": ["//mobile_back_tflite:tfliteneuronbackend"], "//conditions:default": [], + }) + select({ + ":use_pixel": ["//mobile_back_pixel:tflitepixelbackend"], + "//conditions:default": [], }), ) diff --git a/android/java/org/mlperf/inference/Backends.java.in b/android/java/org/mlperf/inference/Backends.java.in index 5258258d3..953eac354 100644 --- a/android/java/org/mlperf/inference/Backends.java.in +++ b/android/java/org/mlperf/inference/Backends.java.in @@ -5,5 +5,5 @@ public final class Backends { private Backends() { } - public static final String[] BACKEND_LIST = {QTI SAMSUNG MEDIATEK "tflite"}; + public static final String[] BACKEND_LIST = {QTI SAMSUNG MEDIATEK PIXEL "tflite"}; } diff --git a/mobile_back_pixel/BUILD b/mobile_back_pixel/BUILD new file mode 100644 index 000000000..70c579e08 --- /dev/null +++ b/mobile_back_pixel/BUILD @@ -0,0 +1,9 @@ +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) + +cc_library( + name = "tflitepixelbackend", + srcs = ["//mobile_back_pixel/cpp/backend_tflite:libtflitepixelbackend.so"], +) diff --git a/mobile_back_pixel/Makefile b/mobile_back_pixel/Makefile new file mode 100644 index 000000000..52e7cb201 --- /dev/null +++ b/mobile_back_pixel/Makefile @@ -0,0 +1,111 @@ +# Copyright 2020-2021 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +########################################################################## + +this_mkfile := $(abspath $(lastword $(MAKEFILE_LIST))) +TOPDIR := $(shell dirname ${this_mkfile}) +BUILDDIR=${TOPDIR}/build + +USERID=$(shell id -u) +GROUPID=$(shell id -g) + +SRCFILES=\ +Makefile \ +cpp/BUILD \ +cpp/c/BUILD \ +cpp/c/version_script.lds \ +cpp/c/backend_c.h \ +cpp/c/type.h \ +cpp/backend_tflite/tflite_c.cc \ +cpp/backend_tflite/tflite_settings.h \ +cpp/backend_tflite/utils.h \ +cpp/backend_tflite/BUILD + +ifeq (${USE_PROXY_WORKAROUND},1) + export PROXY_WORKAROUND1=\ + -v /etc/ssl/certs:/etc/ssl/certs \ + -v /usr/share/ca-certificates:/usr/share/ca-certificates \ + -v /usr/share/ca-certificates-java:/usr/share/ca-certificates-java + + export PROXY_WORKAROUND2=--host_jvm_args -Djavax.net.ssl.trustStore=/etc/ssl/certs/java/cacerts +else + export PROXY_WORKAROUND1= + export PROXY_WORKAROUND2= +endif + +all: ${BUILDDIR}/lib_x86_64/libtflitebackend.so ${BUILDDIR}/lib_arm64/libtflitebackend.so + +.PHONY: clean docker_image + +USER_ID=$(shell id -u) + +${BUILDDIR}/mlperf_mobile_image.stamp: ${TOPDIR}/docker/mlperf_mobile/Dockerfile + @mkdir -p ${BUILDDIR} + @docker image build -t mlcommons/mlperf_mobile:1.0 docker/mlperf_mobile + @touch $@ + +docker_image: ${BUILDDIR}/mlperf_mobile_image.stamp + +${BUILDDIR}/lib_arm64/libtflitebackend.so: ${SRCFILES} docker_image + @echo "Building arm64 libtflitebackend.so" + @mkdir -p ${BUILDDIR}/lib_arm64 + @mkdir -p ${BUILDDIR}/cache + @docker run \ + -e USER=mlperf \ + ${PROXY_WORKAROUND1} \ + -v $(CURDIR):/tflite_backend \ + -v ${BUILDDIR}/cache:/cache \ + -w /tflite_backend \ + -u ${USERID}:${GROUPID} \ + mlcommons/mlperf_mobile:1.0 bazel-3.7.2 ${PROXY_WORKAROUND2} \ + --output_user_root=/cache/bazel build \ + -c opt --cxxopt='--std=c++14' \ + --cxxopt='-Wno-deprecated-declarations' \ + --cxxopt='-Wno-unknown-attributes' \ + --host_cxxopt='--std=c++14' \ + --host_cxxopt='-Wno-deprecated-declarations' \ + --host_cxxopt='-Wno-class-memaccess' \ + --config=android_arm64 \ + :tflitebackend + @cp build/`readlink bazel-out`/arm64-v8a-opt/bin/cpp/backend_tflite/libtflitebackend.so $@ + @chmod 777 $@ + +${BUILDDIR}/lib_x86_64/libtflitebackend.so: ${SRCFILES} docker_image + @echo "Building x86_64 libtflitebackend.so" + @mkdir -p ${BUILDDIR}/lib_x86_64 + @mkdir -p ${BUILDDIR}/cache + docker run \ + -e USER=mlperf \ + ${PROXY_WORKAROUND1} \ + -v $(CURDIR):/tflite_backend \ + -v ${BUILDDIR}/cache:/cache \ + -w /tflite_backend \ + -u ${USERID}:${GROUPID} \ + mlcommons/mlperf_mobile:1.0 bazel-3.7.2 ${PROXY_WORKAROUND2} \ + --output_user_root=/cache/bazel build \ + -c opt --cxxopt='--std=c++14' \ + --cxxopt='-Wno-deprecated-declarations' \ + --cxxopt='-Wno-unknown-attributes' \ + --host_cxxopt='--std=c++14' \ + --host_cxxopt='-Wno-deprecated-declarations' \ + --host_cxxopt='-Wno-class-memaccess' \ + --config=android_x86_64 \ + :tflitebackend + @cp build/`readlink bazel-out`/x86_64-opt/bin/cpp/backend_tflite/libtflitebackend.so $@ + @chmod 777 $@ + +clean: + @rm -rf bazel-bin bazel-testlogs bazel-out bazel-tflite_backend + @rm -rf ${BUILDDIR} + diff --git 
a/mobile_back_pixel/README.md b/mobile_back_pixel/README.md
new file mode 100644
index 000000000..6063383b9
--- /dev/null
+++ b/mobile_back_pixel/README.md
@@ -0,0 +1,14 @@
+# TFLite backend for Pixel
+
+Build the shared library for the selected architecture:
+```bash
+bazel build -c opt \
+  --cxxopt='--std=c++14' \
+  --host_cxxopt='--std=c++14' \
+  --host_cxxopt='-Wno-deprecated-declarations' \
+  --host_cxxopt='-Wno-class-memaccess' \
+  --cxxopt='-Wno-deprecated-declarations' \
+  --cxxopt='-Wno-unknown-attributes' \
+  --fat_apk_cpu={x86_64|arm64-v8a|armeabi-v7a} \
+  //cpp/backend_tflite:libtflitepixelbackend.so
+```
diff --git a/mobile_back_pixel/cpp/BUILD b/mobile_back_pixel/cpp/BUILD
new file mode 100644
index 000000000..cb9c72e13
--- /dev/null
+++ b/mobile_back_pixel/cpp/BUILD
@@ -0,0 +1,18 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+package(
+    default_visibility = ["//visibility:public"],
+    licenses = ["notice"],  # Apache 2.0
+)
diff --git a/mobile_back_pixel/cpp/backend_tflite/BUILD b/mobile_back_pixel/cpp/backend_tflite/BUILD
new file mode 100644
index 000000000..874b71441
--- /dev/null
+++ b/mobile_back_pixel/cpp/backend_tflite/BUILD
@@ -0,0 +1,84 @@
+# Copyright 2019-2021 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================== +load( + "@org_tensorflow//tensorflow/lite:build_def.bzl", + "tflite_copts", + "tflite_jni_binary", +) + +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) + +cc_library( + name = "resize_bilinear_op", + srcs = [ + "resize_argmax_op.cc", + ], + hdrs = [ + "resize_argmax_op.h", + ], + deps = [ + "@org_tensorflow//tensorflow/lite/c:common", + "@org_tensorflow//tensorflow/lite/kernels:kernel_util", + "@org_tensorflow//tensorflow/lite/core/api", + "@org_tensorflow//tensorflow/lite/kernels:cpu_backend_context", + "@org_tensorflow//tensorflow/lite/kernels:cpu_backend_threadpool", + ], +) + +cc_library( + name = "tflite_pixel", + srcs = [ + "tflite_pixel.cc", + ], + hdrs = [ + "tflite_settings_pixel.h", + "thread_pool.h", + ], + copts = tflite_copts() + select({ + "//android/commonlibs:use_asan": [ + "-fsanitize=address", + "-g", + "-O1", + "-fno-omit-frame-pointer", + ], + "//conditions:default": [], + }), + deps = [ + ":resize_bilinear_op", + "//android/cpp/c:headers", + "@org_tensorflow//tensorflow/lite/c:c_api", + "@org_tensorflow//tensorflow/lite/c:c_api_experimental", + "@org_tensorflow//tensorflow/lite/c:common", + "@org_tensorflow//tensorflow/core:tflite_portable_logging", + ] + select({ + "@org_tensorflow//tensorflow:android": [ + "@org_tensorflow//tensorflow/lite/delegates/gpu:delegate", + ], + "//conditions:default": [], + }), + alwayslink = 1, +) + +tflite_jni_binary( + name = "libtflitepixelbackend.so", + exported_symbols = "//android/cpp/c:exported_symbols.lds", + linkscript = "//android/cpp/c:version_script.lds", + deps = [ + ":tflite_pixel", + ], +) diff --git a/mobile_back_pixel/cpp/backend_tflite/resize_argmax_op.cc b/mobile_back_pixel/cpp/backend_tflite/resize_argmax_op.cc new file mode 100644 index 000000000..f18578e5a --- /dev/null +++ b/mobile_back_pixel/cpp/backend_tflite/resize_argmax_op.cc @@ -0,0 +1,362 @@ +/* Copyright 2021 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+#include <string.h>
+
+#include <algorithm>
+#include <atomic>
+#include <vector>
+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#endif
+
+#include "resize_argmax_op.h"
+#include "tensorflow/lite/core/api/profiler.h"
+#include "tensorflow/lite/kernels/cpu_backend_context.h"
+#include "tensorflow/lite/kernels/cpu_backend_threadpool.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+constexpr int kInputTensor = 0;
+constexpr int kSizeTensor = 1;
+constexpr int kAxis = 2;
+constexpr int kOutputTensor = 0;
+
+using namespace tflite;
+
+namespace {
+
+inline int ArgMaxVector(const uint8_t* input_data, int size) {
+  int32_t max_index = 0;
+  uint8_t max_value = input_data[0];
+  int32_t i = 0;
+#ifdef __ARM_NEON
+  constexpr int VECTOR_SIZE = 16;
+  if (size >= VECTOR_SIZE) {
+    uint8x16_t max_value_u8x16;
+    for (; i <= size - VECTOR_SIZE; i += VECTOR_SIZE) {
+      max_value_u8x16 = vld1q_u8(input_data + i);
+      uint8_t max_from_vec;
+      max_from_vec = vmaxvq_u8(max_value_u8x16);
+      if (max_from_vec > max_value) {
+        max_value = max_from_vec;
+        max_index = i;
+      }
+    }
+    // max_index marks the 16-lane block holding the maximum; scan that block
+    // for the exact element index.
+    for (int start_idx = max_index; start_idx < max_index + VECTOR_SIZE;
+         start_idx++) {
+      if (input_data[start_idx] == max_value) {
+        max_index = start_idx;
+        break;
+      }
+    }
+  }
+#endif  // __ARM_NEON
+  // Leftover loop.
+  for (; i < size; ++i) {
+    const uint8_t curr_value = input_data[i];
+    if (curr_value > max_value) {
+      max_value = curr_value;
+      max_index = i;
+    }
+  }
+
+  return max_index;
+}
+
+inline void fill_bilinear_row(uint32_t* row, uint32_t left_val,
+                              uint32_t right_val) {
+  row[0] = left_val;
+  row[1] = (left_val * 3 + right_val) / 4;
+  row[2] = (left_val + right_val) / 2;
+  row[3] = (left_val + right_val * 3) / 4;
+}
+
+#ifdef __ARM_NEON
+constexpr uint32_t left_multipliers[] = {4, 3, 2, 1};
+constexpr uint32_t right_multipliers[] = {0, 1, 2, 3};
+
+inline uint32x4_t fill_bilinear_row_simd(uint32_t left_val,
+                                         uint32_t right_val) {
+  const uint32x4_t left_multipliers_simd = vld1q_u32(left_multipliers);
+  const uint32x4_t right_multipliers_simd = vld1q_u32(right_multipliers);
+
+  uint32x4_t left_top_simd = vmulq_n_u32(left_multipliers_simd, left_val);
+  uint32x4_t top_row_simd =
+      vmlaq_n_u32(left_top_simd, right_multipliers_simd, right_val);
+  // shift by 2 = divide by 4
+  return vshrq_n_u32(top_row_simd, 2);
+}
+
+#endif
+
+void resize_argmax_task_4x(uint8_t* input, int input_width, int input_height,
+                           int input_argmax_depth, int32_t* output,
+                           int output_width, int output_height, int start_row,
+                           int row_count, int move,
+                           tflite::Profiler* profiler) {
+  TFLITE_SCOPED_TAGGED_DEFAULT_PROFILE(profiler, "resize_argmax_task_4x");
+  uint8_t* in_ptr = input + start_row * input_width * input_argmax_depth;
+  int32_t* out_ptr = output + start_row * 4 * output_width;
+  int cnt = 0;
+  for (int row = start_row; row < start_row + row_count; row++) {
+    for (int col = 0; col < input_width; col++) {
+      int top_left;
+      int top_right;
+      int bottom_left;
+      int bottom_right;
+      top_left = *out_ptr;
+      if (row != input_height - 1 && col != input_width - 1) {
+        if ((row + 1 % move) > 0) {
+          if (col == 0) {
+            *(out_ptr + output_width * 4) = ArgMaxVector(
+                in_ptr + input_width * input_argmax_depth, input_argmax_depth);
+          }
+          *(out_ptr + output_width * 4 + 4) = ArgMaxVector(
+              in_ptr + input_width * input_argmax_depth + input_argmax_depth,
+              input_argmax_depth);
+        }
+
+        top_right = *(out_ptr + 4);
+        bottom_left = *(out_ptr + output_width * 4);
+        bottom_right = *(out_ptr + output_width * 4 + 4);
+      }
+      if ((row == input_height - 1 || col
== input_width - 1) || + (top_left == top_right && top_right == bottom_left && + bottom_left == bottom_right)) { + cnt++; +#ifdef __ARM_NEON + int32_t* temp_out = out_ptr; + int32x4_t f = vdupq_n_s32(top_left); + for (int square_row = 0; square_row < 4; square_row++) { + vst1q_s32(temp_out, f); + temp_out += output_width; + } +#else + for (int i = 1; i <= 3; ++i) { + out_ptr[i] = top_left; + } + for (int i = 1; i <= 3; ++i) { + memcpy(out_ptr + i * output_width, out_ptr, sizeof(int32_t) * 4); + } +#endif + } else { + uint8_t* top_right_col = in_ptr + input_argmax_depth; + uint8_t* top_left_col = in_ptr; + uint8_t* bot_right_col = + in_ptr + input_width * input_argmax_depth + input_argmax_depth; + uint8_t* bot_left_col = in_ptr + input_width * input_argmax_depth; + +#ifdef __ARM_NEON + uint32x4_t max_values[4]; + for (int i = 0; i < 4; i++) { + max_values[i] = vdupq_n_u32(0); + } + int32x4_t max_indices[4]; + for (int i = 0; i < 4; i++) { + max_indices[i] = vdupq_n_s32(0); + } +#else + uint32_t max_values[16] = {0}; + int32_t max_indices[16] = {0}; +#endif + for (int depth = 0; depth < input_argmax_depth; depth++) { +#ifdef __ARM_NEON + uint32x4_t simd_rows[5]; + simd_rows[0] = fill_bilinear_row_simd(*top_left_col, *top_right_col); + simd_rows[4] = fill_bilinear_row_simd(*bot_left_col, *bot_right_col); + + uint32x4_t row2 = vmlaq_n_u32(simd_rows[4], simd_rows[0], 3); + simd_rows[1] = vshrq_n_u32(row2, 2); + + uint32x4_t row3 = vaddq_u32(simd_rows[0], simd_rows[4]); + simd_rows[2] = vshrq_n_u32(row3, 1); + + uint32x4_t row4 = vmlaq_n_u32(simd_rows[0], simd_rows[4], 3); + simd_rows[3] = vshrq_n_u32(row4, 2); + + int32x4_t depth_simd = vdupq_n_s32(depth); + for (int i = 0; i < 4; i++) { + uint32x4_t mask = vcgtq_u32(simd_rows[i], max_values[i]); + max_values[i] = vbslq_u32(mask, simd_rows[i], max_values[i]); + max_indices[i] = vbslq_s32(mask, depth_simd, max_indices[i]); + } +#else + uint32_t bilinear_values[20]; + fill_bilinear_row(bilinear_values, *top_left_col, *top_right_col); + fill_bilinear_row(bilinear_values + 16, *bot_left_col, + *bot_right_col); + + for (int i = 0; i < 4; i++) { + bilinear_values[i + 4] = + (bilinear_values[i] * 3 + bilinear_values[i + 16]) / 4; + bilinear_values[i + 8] = + (bilinear_values[i] + bilinear_values[i + 16]) / 2; + bilinear_values[i + 12] = + (bilinear_values[i] + bilinear_values[i + 16] * 3) / 4; + } + + for (int i = 0; i < 16; i++) { + if (bilinear_values[i] > max_values[i]) { + max_indices[i] = depth; + max_values[i] = bilinear_values[i]; + } + } +#endif + top_right_col++; + top_left_col++; + bot_right_col++; + bot_left_col++; + } // for depth +#ifdef __ARM_NEON + int32_t* temp_out_ptr = out_ptr; + for (int square_row = 0; square_row < 4; square_row++) { + vst1q_s32(temp_out_ptr, max_indices[square_row]); + temp_out_ptr += output_width; + } +#else + int32_t* temp_out_ptr = out_ptr; + int index = 0; + for (int square_row = 0; square_row < 4; square_row++) { + for (int square_col = 0; square_col < 4; square_col++) { + temp_out_ptr[square_col] = max_indices[index]; + index++; + } + temp_out_ptr += output_width; + } +#endif + } // end regular bilinear resize + + in_ptr += input_argmax_depth; + out_ptr += 4; + } // for col + out_ptr += 3 * output_width; + } // for row +} + +} // namespace + +TfLiteStatus ResizeArgmax_Prepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE_EQ(context, NumInputs(node), 3); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + + const TfLiteTensor* input; + TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 
kInputTensor, &input));
+  const TfLiteTensor* size;
+  TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kSizeTensor, &size));
+  const TfLiteTensor* axis;
+  TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kAxis, &axis));
+  TF_LITE_ENSURE_EQ(context, *axis->data.i32, input->dims->size - 1);
+  TfLiteTensor* output;
+  TF_LITE_ENSURE_OK(context,
+                    GetOutputSafe(context, node, kOutputTensor, &output));
+
+  return kTfLiteOk;
+}
+
+struct ResizeArgmaxTask : cpu_backend_threadpool::Task {
+  ResizeArgmaxTask(uint8_t* input, int input_width, int input_height,
+                   int input_argmax_depth, int32_t* output, int output_width,
+                   int output_height, std::atomic<int>& start_row,
+                   int row_count, int move, tflite::Profiler* profiler)
+      : input_(input),
+        input_width_(input_width),
+        input_height_(input_height),
+        input_argmax_depth_(input_argmax_depth),
+        output_(output),
+        output_width_(output_width),
+        output_height_(output_height),
+        start_row_(start_row),
+        row_count_(row_count),
+        move_(move),
+        profiler_(profiler) {}
+
+  void Run() override {
+    int start_row;
+    while ((start_row = start_row_ += move_) - move_ < row_count_) {
+      resize_argmax_task_4x(input_, input_width_, input_height_,
+                            input_argmax_depth_, output_, output_width_,
+                            output_height_, start_row - move_, move_, move_,
+                            profiler_);
+    }
+  }
+
+  uint8_t* input_;
+  int input_width_;
+  int input_height_;
+  int input_argmax_depth_;
+  int32_t* output_;
+  int output_width_;
+  int output_height_;
+  std::atomic<int>& start_row_;
+  int row_count_;
+  int move_;
+  tflite::Profiler* profiler_;
+};
+
+TfLiteStatus ResizeArgmax_Invoke(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input;
+  TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input));
+  TfLiteTensor* output;
+  TF_LITE_ENSURE_OK(context,
+                    GetOutputSafe(context, node, kOutputTensor, &output));
+  int input_width = input->dims->data[1];
+  int input_height = input->dims->data[2];
+  int input_argmax_depth = input->dims->data[3];
+
+  int output_width = output->dims->data[1];
+  int output_height = output->dims->data[2];
+  tflite::CpuBackendContext* cpu_backend_context =
+      tflite::CpuBackendContext::GetFromContext(context);
+  const int thread_count = std::min(2, cpu_backend_context->max_num_threads());
+
+  int move = (thread_count == 1) ? 1 : 16;
+  uint8_t* argmax_input_ptr = input->data.uint8;
+  int32_t* argmax_output_ptr = output->data.i32;
+  for (int row = 0; row < input_height; row += move) {
+    for (int col = 0; col < input_width; col++) {
+      *argmax_output_ptr = ArgMaxVector(argmax_input_ptr, input_argmax_depth);
+      argmax_input_ptr += input_argmax_depth;
+      argmax_output_ptr += 4;
+    }
+    argmax_input_ptr += input_argmax_depth * input_width * (move - 1);
+    argmax_output_ptr += output_width * (4 * move - 1);
+  }
+  if (thread_count == 1) {
+    resize_argmax_task_4x(input->data.uint8, input_width, input_height,
+                          input_argmax_depth, output->data.i32, output_width,
+                          output_height, 0, input_height, 1,
+                          (tflite::Profiler*)context->profiler);
+  } else {
+    std::vector<ResizeArgmaxTask> tasks;
+    tasks.reserve(thread_count);
+    std::atomic<int> start_row;
+    start_row = 0;
+    for (int i = 0; i < thread_count; i++) {
+      tasks.emplace_back(input->data.uint8, input_width, input_height,
+                         input_argmax_depth, output->data.i32, output_width,
+                         output_height, start_row, input->dims->data[2], move,
+                         (tflite::Profiler*)context->profiler);
+    }
+    cpu_backend_threadpool::Execute(tasks.size(), tasks.data(),
+                                    cpu_backend_context);
+  }
+
+  return kTfLiteOk;
+}
+
+TfLiteRegistration* Register_ResizeArgmax() {
+  static TfLiteRegistration r = {nullptr, nullptr, ResizeArgmax_Prepare,
+                                 ResizeArgmax_Invoke};
+  return &r;
+}
diff --git a/mobile_back_pixel/cpp/backend_tflite/resize_argmax_op.h b/mobile_back_pixel/cpp/backend_tflite/resize_argmax_op.h
new file mode 100644
index 000000000..9c6e139ba
--- /dev/null
+++ b/mobile_back_pixel/cpp/backend_tflite/resize_argmax_op.h
@@ -0,0 +1,22 @@
+/* Copyright 2021 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef RESIZE_ARGMAX_OP_H_
+#define RESIZE_ARGMAX_OP_H_
+
+#include "tensorflow/lite/c/common.h"
+
+TfLiteRegistration* Register_ResizeArgmax();
+
+#endif
\ No newline at end of file
diff --git a/mobile_back_pixel/cpp/backend_tflite/tflite_pixel.cc b/mobile_back_pixel/cpp/backend_tflite/tflite_pixel.cc
new file mode 100644
index 000000000..e78835316
--- /dev/null
+++ b/mobile_back_pixel/cpp/backend_tflite/tflite_pixel.cc
@@ -0,0 +1,484 @@
+/* Copyright 2021 The MLPerf Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "android/cpp/c/backend_c.h"
+#include "android/cpp/c/type.h"
+#include "tensorflow/lite/c/c_api.h"
+#include "tensorflow/lite/c/c_api_experimental.h"
+#include "tensorflow/lite/c/common.h"
+#if __ANDROID__
+#include <sys/system_properties.h>
+
+#include "tensorflow/lite/delegates/gpu/delegate.h"
+#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"
+#include "tensorflow/core/platform/logging.h"
+#endif
+#include "tflite_settings_pixel.h"
+#include "resize_argmax_op.h"
+#include "thread_pool.h"
+
+#define N_OFFLINE_INTEPRETERS 8
+
+struct TFLiteBackendData {
+  const char* name = "TFLite";
+  const char* vendor = "Google";
+  TfLiteModel* model{nullptr};
+  TfLiteInterpreterOptions* options[N_OFFLINE_INTEPRETERS] = {};
+  TfLiteInterpreter* interpreter[N_OFFLINE_INTEPRETERS] = {};
+  TfLiteInterpreter* interpreter8[N_OFFLINE_INTEPRETERS] = {};
+  uint32_t batch_size = 64;
+  int32_t input_tensor_count;
+  void** acc_data[N_OFFLINE_INTEPRETERS] = {};
+  std::unique_ptr<Threadpool> executer;
+  bool use_shard = false;
+  bool has_temp_data = false;
+  int32_t original_tensor_size = 0;
+
+  std::future<TfLiteStatus> status[N_OFFLINE_INTEPRETERS];
+};
+
+static bool backendExists = false;
+
+inline mlperf_data_t::Type TfType2Type(TfLiteType type) {
+  switch (type) {
+    case kTfLiteFloat32:
+      return mlperf_data_t::Float32;
+    case kTfLiteUInt8:
+      return mlperf_data_t::Uint8;
+    case kTfLiteInt8:
+      return mlperf_data_t::Int8;
+    case kTfLiteFloat16:
+      return mlperf_data_t::Float16;
+    case kTfLiteInt32:
+      return mlperf_data_t::Int32;
+    case kTfLiteInt64:
+      return mlperf_data_t::Int64;
+    default:
+      printf("TfLiteType %d not supported", type);
+      return mlperf_data_t::Float32;
+  }
+}
+
+size_t TFLiteNumElements(const TfLiteTensor* tensor) {
+  size_t result = 1;
+  for (int i = 0; i < TfLiteTensorNumDims(tensor); ++i) {
+    result *= TfLiteTensorDim(tensor, i);
+  }
+  return result;
+}
+
+// TFLite is the standard backend for all hardware.
+bool mlperf_backend_matches_hardware(const char** not_allowed_message,
+                                     const char** settings,
+                                     const mlperf_device_info_t* device_info) {
+  *not_allowed_message = nullptr;
+  *settings = tflite_settings.c_str();
+  printf("TFLite backend matches hardware");
+  return true;
+}
+
+#if __ANDROID__
+bool is_emulator() {
+  char ro_build_characteristics[PROP_VALUE_MAX + 1];
+  if (__system_property_get("ro.build.characteristics",
+                            ro_build_characteristics)) {
+    char* ptr;
+    ptr = strstr(ro_build_characteristics, "emulator");
+    if (ptr) return true;
+  }
+  return false;
+}
+#endif
+
+// Create a new backend and return the pointer to it.
+mlperf_backend_ptr_t mlperf_backend_create(
+    const char* model_path, mlperf_backend_configuration_t* configs,
+    const char* native_lib_path) {
+  // Verify only one instance of the backend exists at any time
+  if (backendExists) {
+    printf("Error: Only one backend instance should exist at a time");
+    return nullptr;
+  }
+
+  TFLiteBackendData* backend_data = new TFLiteBackendData();
+
+  backendExists = true;
+
+  // Load the model.
+  backend_data->model = TfLiteModelCreateFromFile(model_path);
+  if (!backend_data->model) {
+    printf("Failed to load model: %s", model_path);
+    mlperf_backend_delete(backend_data);
+    return nullptr;
+  }
+
+  backend_data->executer =
+      std::unique_ptr<Threadpool>(new Threadpool(N_OFFLINE_INTEPRETERS));
+
+  // Create interpreter options.
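+  // The lambda below builds the per-shard TfLiteInterpreterOptions: it
+  // registers the custom ResizeArgmax op, applies the num_threads value from
+  // the benchmark configs, and, on Android hardware (not the emulator),
+  // attaches the GPU or NNAPI delegate selected by configs->accelerator.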
+ auto create_option = [&](TfLiteInterpreterOptions*& option_ptr) -> void { + option_ptr = TfLiteInterpreterOptionsCreate(); + TfLiteInterpreterOptionsAddCustomOp(option_ptr, + "ResizeArgmax", + Register_ResizeArgmax(), + 1, + 999); + TfLiteDelegate* delegate = nullptr; + + for (int i = 0; i < configs->count; ++i) { + if (strcmp(configs->keys[i], "num_threads") == 0) { + TfLiteInterpreterOptionsSetNumThreads(option_ptr, + atoi(configs->values[i])); + } + #if __ANDROID__ + if (!is_emulator() && ((strcmp(configs->accelerator, "gpu_f16") == 0) || + (strcmp(configs->accelerator, "gpu") == 0))) { + auto options = TfLiteGpuDelegateOptionsV2Default(); + if (strcmp(configs->accelerator, "gpu_f16") == 0) + options.inference_priority1 = + TFLITE_GPU_INFERENCE_PRIORITY_MIN_LATENCY; + delegate = TfLiteGpuDelegateV2Create(&options); + } else if (strcmp(configs->accelerator, "nnapi") == 0) { + auto options = tflite::StatefulNnApiDelegate::Options(); + options.allow_fp16 = true; + options.disallow_nnapi_cpu = true; + options.accelerator_name = "google-edgetpu"; + options.use_burst_computation = true; + delegate = new tflite::StatefulNnApiDelegate(options); + } + if (delegate != nullptr) { + TfLiteInterpreterOptionsAddDelegate(option_ptr, delegate); + } + #endif + } + }; + + for (int k = 0; k < N_OFFLINE_INTEPRETERS; k++) { + // Create Backend Option + create_option(backend_data->options[k]); + + // Create the interpreter. + backend_data->interpreter[k] = + TfLiteInterpreterCreate(backend_data->model, backend_data->options[k]); + if (!backend_data->interpreter[k]) { + // create a vanilla interpreter + backend_data->interpreter[k] = TfLiteInterpreterCreate( + backend_data->model, TfLiteInterpreterOptionsCreate()); + if (!backend_data->interpreter[k]) { + printf("Failed to create the interpreter"); + mlperf_backend_delete(backend_data); + return nullptr; + } + } + + // Create the interpreter. 
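+    // interpreter8[k] mirrors interpreter[k] (same model and options) but is
+    // resized further below to a batch of 8 per shard for the offline
+    // scenario.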
+    backend_data->interpreter8[k] =
+        TfLiteInterpreterCreate(backend_data->model, backend_data->options[k]);
+    if (!backend_data->interpreter8[k]) {
+      // create a vanilla interpreter
+      backend_data->interpreter8[k] = TfLiteInterpreterCreate(
+          backend_data->model, TfLiteInterpreterOptionsCreate());
+      if (!backend_data->interpreter8[k]) {
+        printf("Failed to create the interpreter");
+        mlperf_backend_delete(backend_data);
+        return nullptr;
+      }
+    }
+  }
+
+  backend_data->input_tensor_count =
+      TfLiteInterpreterGetInputTensorCount(backend_data->interpreter[0]);
+  for (int i = 0; i < N_OFFLINE_INTEPRETERS; i++) {
+    backend_data->acc_data[i] =
+        (void**)malloc(sizeof(void*) * backend_data->input_tensor_count);
+    memset(backend_data->acc_data[i], 0,
+           sizeof(void*) * backend_data->input_tensor_count);
+  }
+
+  for (int k = 0; k < N_OFFLINE_INTEPRETERS; k++) {
+    for (int i = 0; i < backend_data->input_tensor_count; ++i) {
+      TfLiteTensor* tensor =
+          TfLiteInterpreterGetInputTensor(backend_data->interpreter[k], i);
+      int32_t* dims = (int32_t*)malloc(sizeof(int32_t) * tensor->dims->size);
+      dims[0] = 1;  // batch size 1 for the single-stream interpreter
+      for (int d = 1; d < tensor->dims->size; d++) {
+        dims[d] = tensor->dims->data[d];
+      }
+      TfLiteInterpreterResizeInputTensor(backend_data->interpreter[k], i,
+                                         dims, tensor->dims->size);
+      free(dims);
+    }
+    if (kTfLiteOk !=
+        TfLiteInterpreterAllocateTensors(backend_data->interpreter[k])) {
+      printf("Failed to allocate tensors");
+      return nullptr;
+    }
+  }
+
+  for (int k = 0; k < N_OFFLINE_INTEPRETERS; k++) {
+    for (int i = 0; i < backend_data->input_tensor_count; ++i) {
+      TfLiteTensor* tensor =
+          TfLiteInterpreterGetInputTensor(backend_data->interpreter8[k], i);
+      int32_t* dims = (int32_t*)malloc(sizeof(int32_t) * tensor->dims->size);
+      dims[0] = 8;  // batch of 8 per shard for the offline scenario
+      for (int d = 1; d < tensor->dims->size; d++) {
+        dims[d] = tensor->dims->data[d];
+      }
+      TfLiteInterpreterResizeInputTensor(backend_data->interpreter8[k], i,
+                                         dims, tensor->dims->size);
+      free(dims);
+    }
+    if (kTfLiteOk !=
+        TfLiteInterpreterAllocateTensors(backend_data->interpreter8[k])) {
+      printf("Failed to allocate tensors");
+      break;
+    }
+  }
+
+  return backend_data;
+}
+
+// Name of the vendor who created this backend.
+const char* mlperf_backend_vendor_name(mlperf_backend_ptr_t backend_ptr) {
+  TFLiteBackendData* backend_data = (TFLiteBackendData*)backend_ptr;
+  return backend_data->vendor;
+}
+
+// Return the name of this backend.
+const char* mlperf_backend_name(mlperf_backend_ptr_t backend_ptr) {
+  TFLiteBackendData* backend_data = (TFLiteBackendData*)backend_ptr;
+  return backend_data->name;
+}
+
+// Destroy the backend pointer and its data.
+void mlperf_backend_delete(mlperf_backend_ptr_t backend_ptr) {
+  TFLiteBackendData* backend_data = (TFLiteBackendData*)backend_ptr;
+  for (int i = 0; i < N_OFFLINE_INTEPRETERS; i++) {
+    if (backend_data->use_shard) {
+      free(backend_data->acc_data[i]);
+    }
+    backend_data->acc_data[i] = nullptr;
+  }
+  TfLiteModelDelete(backend_data->model);
+  for (int i = 0; i < N_OFFLINE_INTEPRETERS; i++) {
+    TfLiteInterpreterOptionsDelete(backend_data->options[i]);
+    TfLiteInterpreterDelete(backend_data->interpreter[i]);
+    TfLiteInterpreterDelete(backend_data->interpreter8[i]);
+  }
+  delete backend_data;
+  backendExists = false;
+}
+
+// Run the inference for a sample.
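+// Note: in the sharded offline path the per-shard invocations are dispatched
+// from mlperf_backend_set_input once a shard's batch is fully staged, so this
+// entry point only runs interpreter[0], pinned to the two big cores.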
+mlperf_status_t mlperf_backend_issue_query(mlperf_backend_ptr_t backend_ptr) { + TFLiteBackendData* backend_data = (TFLiteBackendData*)backend_ptr; + + // main thread for batch_size == 1 + if (!backend_data->use_shard) { + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(6, &cpuset); + CPU_SET(7, &cpuset); + sched_setaffinity(0, sizeof(cpu_set_t), &cpuset); + if (TfLiteInterpreterInvoke(backend_data->interpreter[0]) != kTfLiteOk) { + printf("Failed to run the inference"); + return MLPERF_FAILURE; + } + } + + return MLPERF_SUCCESS; +} + +// Flush the staged queries immediately. +mlperf_status_t mlperf_backend_flush_queries(mlperf_backend_ptr_t backend_ptr) { + return MLPERF_SUCCESS; +} + +// Return the number of inputs of the model. +int32_t mlperf_backend_get_input_count(mlperf_backend_ptr_t backend_ptr) { + TFLiteBackendData* backend_data = (TFLiteBackendData*)backend_ptr; + return TfLiteInterpreterGetInputTensorCount(backend_data->interpreter[0]); +} + +// Return the type of the ith input. +mlperf_data_t mlperf_backend_get_input_type(mlperf_backend_ptr_t backend_ptr, + int32_t i) { + TFLiteBackendData* backend_data = (TFLiteBackendData*)backend_ptr; + const TfLiteTensor* tensor = + TfLiteInterpreterGetInputTensor(backend_data->interpreter[0], i); + mlperf_data_t type; + type.type = TfType2Type(TfLiteTensorType(tensor)); + type.size = TFLiteNumElements(tensor); + return type; +} + +// Set the data for ith input. +mlperf_status_t mlperf_backend_set_input(mlperf_backend_ptr_t backend_ptr, + int32_t batch_index, int32_t i, + void* data) { + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(6, &cpuset); + CPU_SET(7, &cpuset); + sched_setaffinity(0, sizeof(cpu_set_t), &cpuset); + + TFLiteBackendData* backend_data = (TFLiteBackendData*)backend_ptr; + const int real_batch_size = backend_data->batch_size / N_OFFLINE_INTEPRETERS; + const int shard = batch_index / (real_batch_size); + if (shard == 0 && batch_index == 0 && backend_data->use_shard == false) { + backend_data->use_shard = false; + } else { + backend_data->use_shard = true; + } + int real_batch_index = batch_index % real_batch_size; + + TfLiteTensor* tensor = nullptr; + if (backend_data->use_shard) { + tensor = TfLiteInterpreterGetInputTensor(backend_data->interpreter8[shard], i); + } else { + tensor = TfLiteInterpreterGetInputTensor(backend_data->interpreter[shard], i); + } + if (real_batch_index == 0 && backend_data->use_shard == false) { + if (backend_data->original_tensor_size == 0) { + backend_data->original_tensor_size = tensor->bytes; + memcpy((char*)tensor->data.raw, (char*)data, backend_data->original_tensor_size); + backend_data->has_temp_data = true; + } else { + tensor->data.raw = (char*)data; + } + } + + if (backend_data->use_shard) { + if (backend_data->acc_data[shard][i] == nullptr) { + backend_data->acc_data[shard][i] = malloc(real_batch_size * backend_data->original_tensor_size); + } + if (backend_data->has_temp_data) { + TfLiteTensor* tensor = TfLiteInterpreterGetInputTensor(backend_data->interpreter[shard], i); + memcpy( + (char*)backend_data->acc_data[shard][i], + (char*)tensor->data.raw, backend_data->original_tensor_size); + backend_data->has_temp_data = false; + } + memcpy( + ((char*)backend_data->acc_data[shard][i] + + ((batch_index % real_batch_size) * backend_data->original_tensor_size)), + data, backend_data->original_tensor_size); + if (real_batch_index == (real_batch_size - 1)) { + tensor->data.raw = (char*)backend_data->acc_data[shard][i]; + } + } + + // Allocate tensors. 
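+  // Once the last input tensor of the shard's batch has been staged, submit
+  // that shard's invocation to the thread pool.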
+ if (((batch_index+1) % real_batch_size) == 0 && i == (backend_data->input_tensor_count - 1)) { + auto task = [](TFLiteBackendData* backend_data, int index) -> TfLiteStatus { + if (backend_data->use_shard) { + return TfLiteInterpreterInvoke(backend_data->interpreter8[index]); + } else { + return TfLiteInterpreterInvoke(backend_data->interpreter[index]); + } + }; + + // dispatch workers + if (backend_data->use_shard) { + backend_data->status[shard] = backend_data->executer->submit(task, backend_data, shard); + } + } + + return MLPERF_SUCCESS; +} + +// Return the number of outputs for the model. +int32_t mlperf_backend_get_output_count(mlperf_backend_ptr_t backend_ptr) { + TFLiteBackendData* backend_data = (TFLiteBackendData*)backend_ptr; + if (backend_data->use_shard) { + return TfLiteInterpreterGetOutputTensorCount(backend_data->interpreter8[0]); + } else { + return TfLiteInterpreterGetOutputTensorCount(backend_data->interpreter[0]); + } +} + +// Return the type of ith output. +mlperf_data_t mlperf_backend_get_output_type(mlperf_backend_ptr_t backend_ptr, + int32_t i) { + TFLiteBackendData* backend_data = (TFLiteBackendData*)backend_ptr; + const TfLiteTensor* tensor; + if (backend_data->use_shard) { + tensor = TfLiteInterpreterGetOutputTensor(backend_data->interpreter8[0], i); + } else { + tensor = TfLiteInterpreterGetOutputTensor(backend_data->interpreter[0], i); + } + mlperf_data_t type; + type.type = TfType2Type(TfLiteTensorType(tensor)); + type.size = TFLiteNumElements(tensor); + return type; +} + +// Get the data from ith output. +mlperf_status_t mlperf_backend_get_output(mlperf_backend_ptr_t backend_ptr, + uint32_t batch_index, int32_t i, + void** data) { + TFLiteBackendData* backend_data = (TFLiteBackendData*)backend_ptr; + const int real_batch_size = + (backend_data->use_shard) ? 
backend_data->batch_size / N_OFFLINE_INTEPRETERS : 1;
+  const int shard = batch_index / (real_batch_size);
+
+  if (backend_data->use_shard) {
+    if (backend_data->status[shard].valid()) {
+      if (backend_data->status[shard].get() != kTfLiteOk) {
+        printf("Failed to get output: %d", shard);
+        return MLPERF_FAILURE;
+      }
+    }
+  }
+
+  const TfLiteTensor* output_tensor;
+  if (backend_data->use_shard) {
+    output_tensor =
+        TfLiteInterpreterGetOutputTensor(backend_data->interpreter8[shard], i);
+  } else {
+    output_tensor =
+        TfLiteInterpreterGetOutputTensor(backend_data->interpreter[shard], i);
+  }
+  batch_index %= (real_batch_size);
+  int non_batch_size = 1;
+  for (int d = 1; d < output_tensor->dims->size; d++) {
+    non_batch_size *= output_tensor->dims->data[d];
+  }
+  switch (output_tensor->type) {
+    case kTfLiteFloat32:
+      *data = (output_tensor->data.f + (batch_index * non_batch_size));
+      break;
+    case kTfLiteUInt8:
+      *data = (output_tensor->data.uint8 + (batch_index * non_batch_size));
+      break;
+    case kTfLiteInt8:
+      *data = (output_tensor->data.int8 + (batch_index * non_batch_size));
+      break;
+    case kTfLiteFloat16:
+      *data = (output_tensor->data.f16 + (batch_index * non_batch_size));
+      break;
+    case kTfLiteInt32:
+      *data = (output_tensor->data.i32 + (batch_index * non_batch_size));
+      break;
+    case kTfLiteInt64:
+      *data = (output_tensor->data.i64 + (batch_index * non_batch_size));
+      break;
+    default:
+      printf("Data type not yet supported");
+      return MLPERF_FAILURE;
+  }
+  return MLPERF_SUCCESS;
+}
diff --git a/mobile_back_pixel/cpp/backend_tflite/tflite_settings_pixel.h b/mobile_back_pixel/cpp/backend_tflite/tflite_settings_pixel.h
new file mode 100644
index 000000000..297f39921
--- /dev/null
+++ b/mobile_back_pixel/cpp/backend_tflite/tflite_settings_pixel.h
@@ -0,0 +1,134 @@
+/* Copyright 2020-2021 The MLPerf Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string>
+
+#ifndef TFLITE_SETTINGS_H
+#define TFLITE_SETTINGS_H
+
+const std::string tflite_settings = R"SETTINGS(
+common_setting {
+  id: "num_threads"
+  name: "Number of threads"
+  value {
+    value: "2"
+    name: "2 threads"
+  }
+  acceptable_value {
+    value: "1"
+    name: "Single thread"
+  }
+  acceptable_value {
+    value: "2"
+    name: "2 threads"
+  }
+  acceptable_value {
+    value: "4"
+    name: "4 threads"
+  }
+  acceptable_value {
+    value: "8"
+    name: "8 threads"
+  }
+  acceptable_value {
+    value: "16"
+    name: "16 threads"
+  }
+}
+
+common_setting {
+  id: "configuration"
+  name: "Configuration"
+  value {
+    value: "TFLite backend using NNAPI, GPU delegate or CPU delegate."
+ name: "Default" + } +} + +common_setting { + id: "share_results" + name: "Share results" + value { + value: "0" + name: "false" + } + acceptable_value { + value: "1" + name: "true" + } + acceptable_value { + value: "0" + name: "false" + } +} + +common_setting { + id: "cooldown" + name: "Cooldown" + value { + value: "0" + name: "false" + } + acceptable_value { + value: "1" + name: "true" + } + acceptable_value { + value: "0" + name: "false" + } +} + +benchmark_setting { + benchmark_id: "IC_tpu_uint8" + accelerator: "nnapi" + accelerator_desc: "NNAPI" + configuration: "TFLite" + src: "https://github.com/mlcommons/mobile_models/raw/main/v0_7/tflite/mobilenet_edgetpu_224_1.0_uint8.tflite" +} + +benchmark_setting { + benchmark_id: "IC_tpu_uint8_offline" + accelerator: "nnapi" + accelerator_desc: "NNAPI" + configuration: "TFLite" + batch_size: 64 + src: "https://github.com/mlcommons/mobile_models/raw/main/v0_7/tflite/mobilenet_edgetpu_224_1.0_uint8.tflite" +} + +benchmark_setting { + benchmark_id: "OD_uint8" + accelerator: "nnapi" + accelerator_desc: "NNAPI" + configuration: "TFLite" + src: "https://github.com/mlcommons/mobile_models/raw/main/v1_0/tflite/mobiledet_qat.tflite" +} + +benchmark_setting { + benchmark_id: "LU_float32" + accelerator: "nnapi" + accelerator_desc: "NNAPI" + configuration: "TFLite" + src: "https://github.com/mlcommons/mobile_models/raw/main/v0_7/tflite/mobilebert_int8_384_nnapi.tflite" +} + +benchmark_setting { + benchmark_id: "IS_uint8" + accelerator: "nnapi" + accelerator_desc: "NNAPI" + configuration: "TFLite" + src: "https://github.com/mlcommons/mobile_models/raw/Google/v1_0/Google/deeplabv3.tflite" +})SETTINGS"; + +#endif diff --git a/mobile_back_pixel/cpp/backend_tflite/thread_pool.h b/mobile_back_pixel/cpp/backend_tflite/thread_pool.h new file mode 100644 index 000000000..dd1693693 --- /dev/null +++ b/mobile_back_pixel/cpp/backend_tflite/thread_pool.h @@ -0,0 +1,110 @@ +/* Copyright 2021 The MLPerf Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +/* Copyright (c) 2012 Jakob Progsch, Václav Zeman + +This software is provided 'as-is', without any express or implied +warranty. In no event will the authors be held liable for any damages +arising from the use of this software. + +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it +freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + + 3. This notice may not be removed or altered from any source + distribution. 
+==============================================================================*/
+// reference:
+// https://github.com/progschj/ThreadPool
+
+#ifndef THREAD_POOL_H_
+#define THREAD_POOL_H_
+
+#include <atomic>
+#include <cassert>
+#include <condition_variable>
+#include <functional>
+#include <future>
+#include <iostream>
+#include <memory>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <vector>
+
+class Threadpool {
+ private:
+  std::vector<std::thread> pool;
+  std::queue<std::function<void()>> tasks;
+  // synchronization variable
+  std::mutex m_lock;
+  std::condition_variable cond_var;
+  std::atomic<bool> available;
+
+ public:
+  inline Threadpool(size_t thread_num) : available(true) {
+    for (size_t i = 0; i < thread_num; i++) {
+      pool.emplace_back([this] {
+        while (available.load()) {
+          std::function<void()> picked_task;
+          // consumer in critical section
+          {
+            std::unique_lock<std::mutex> lock(this->m_lock);
+            auto wait_until = [this]() -> bool {
+              return !this->available.load() || !this->tasks.empty();
+            };
+            this->cond_var.wait(lock, wait_until);
+            if (!this->available && this->tasks.empty()) return;
+            picked_task = std::move(this->tasks.front());
+            this->tasks.pop();
+          }
+          // invoke picked task
+          picked_task();
+        }
+      });
+    }
+  }
+  ~Threadpool() {
+    available.store(false);
+    cond_var.notify_all();
+    for (auto& worker : pool) {
+      if (worker.joinable()) worker.join();
+    }
+  }
+
+  template <typename Fn, typename... Args>
+  auto submit(Fn&& fn, Args&&... args) -> std::future<decltype(fn(args...))> {
+    if (!available.load()) {
+      std::cerr << "don't accept new tasks after stop." << std::endl;
+      assert(false);
+    }
+    using FnReturnType = decltype(fn(args...));
+    auto enq_task = std::make_shared<std::packaged_task<FnReturnType()>>(
+        std::bind(std::forward<Fn>(fn), std::forward<Args>(args)...));
+    std::future<FnReturnType> ret = enq_task->get_future();
+    // producer in critical section
+    {
+      std::lock_guard<std::mutex> lock(m_lock);
+      tasks.emplace([enq_task]() { (*enq_task)(); });
+    }
+    cond_var.notify_one();
+    return ret;
+  }
+};
+#endif  // THREAD_POOL_H_
diff --git a/mobile_back_pixel/pixel_backend.mk b/mobile_back_pixel/pixel_backend.mk
new file mode 100644
index 000000000..0cfeeb75b
--- /dev/null
+++ b/mobile_back_pixel/pixel_backend.mk
@@ -0,0 +1,21 @@
+# Copyright 2021 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+##########################################################################
+
+# Set the included backends
+WITH_PIXEL=0
+
+ifeq (${WITH_PIXEL},1)
+  PIXEL_BACKEND=--//android/java/org/mlperf/inference:with_pixel="1"
+endif
diff --git a/mobile_back_pixel/third_party/BUILD.bazel b/mobile_back_pixel/third_party/BUILD.bazel
new file mode 100644
index 000000000..ab723dce9
--- /dev/null
+++ b/mobile_back_pixel/third_party/BUILD.bazel
@@ -0,0 +1,19 @@
+# Copyright 2020 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. +########################################################################## + +licenses(["notice"]) # Apache 2.0 + +package(default_visibility = ["//visibility:public"]) + diff --git a/mobile_back_pixel/third_party/tf_gpu_delegate_fix_from_tf_master.diff b/mobile_back_pixel/third_party/tf_gpu_delegate_fix_from_tf_master.diff new file mode 100644 index 000000000..05a6002d1 --- /dev/null +++ b/mobile_back_pixel/third_party/tf_gpu_delegate_fix_from_tf_master.diff @@ -0,0 +1,37 @@ +diff --git a/tensorflow/lite/delegates/gpu/cl/BUILD b/tensorflow/lite/delegates/gpu/cl/BUILD +index b37629a97aa..4d5e9682789 100644 +--- a/tensorflow/lite/delegates/gpu/cl/BUILD ++++ b/tensorflow/lite/delegates/gpu/cl/BUILD +@@ -3,6 +3,7 @@ load( + "//tensorflow/core/platform:build_config_root.bzl", + "tf_gpu_tests_tags", + ) ++load("//tensorflow:tensorflow.bzl", "workspace_root") + + package( + default_visibility = ["//visibility:public"], +@@ -490,7 +491,7 @@ flatbuffer_cc_library( + srcs = ["serialization.fbs"], + flatc_args = [ + "--scoped-enums", +- "-I ./", ++ "-I ./" + workspace_root, + ], + includes = [ + "//tensorflow/lite/delegates/gpu/common/task:serialization_base_cc_fbs_includes", +diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl +index a9da708bb53..d130169d7ad 100644 +--- a/tensorflow/tensorflow.bzl ++++ b/tensorflow/tensorflow.bzl +@@ -52,6 +52,11 @@ VERSION = "2.5.0" + VERSION_MAJOR = VERSION.split(".")[0] + two_gpu_tags = ["requires-gpu-nvidia:2", "notap", "manual", "no_pip"] + ++# The workspace root, to be used to set workspace 'include' paths in a way that ++# will still work correctly when TensorFlow is included as a dependency of an ++# external project. ++workspace_root = Label("//:WORKSPACE").workspace_root or "." ++ + def clean_dep(target): + """Returns string to 'target' in @org_tensorflow repository. + diff --git a/mobile_back_pixel/third_party/tf_grappler_cost.diff b/mobile_back_pixel/third_party/tf_grappler_cost.diff new file mode 100644 index 000000000..58391ea3e --- /dev/null +++ b/mobile_back_pixel/third_party/tf_grappler_cost.diff @@ -0,0 +1,19 @@ +diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD +index b205f5c3e5..b64f4c7dc9 100644 +--- a/tensorflow/core/grappler/costs/BUILD ++++ b/tensorflow/core/grappler/costs/BUILD +@@ -31,7 +31,13 @@ tf_proto_library( + srcs = ["op_performance_data.proto"], + cc_api_version = 2, + make_default_target_header_only = True, +- protodeps = tf_additional_all_protos(), ++ protodeps = [ ++ "//tensorflow/core/framework:attr_value_proto", ++ "//tensorflow/core/framework:resource_handle_proto", ++ "//tensorflow/core/framework:tensor_proto", ++ "//tensorflow/core/framework:tensor_shape_proto", ++ "//tensorflow/core/protobuf:for_core_protos", ++ ], + visibility = ["//visibility:public"], + ) + diff --git a/mobile_back_tflite/cpp/backend_tflite/tflite_settings.h b/mobile_back_tflite/cpp/backend_tflite/tflite_settings.h index 3e7586866..da812a93d 100644 --- a/mobile_back_tflite/cpp/backend_tflite/tflite_settings.h +++ b/mobile_back_tflite/cpp/backend_tflite/tflite_settings.h @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/
-#include <string>
-
 #ifndef TFLITE_SETTINGS_H
 #define TFLITE_SETTINGS_H
 
+#include <string>
+
 const std::string tflite_settings = R"SETTINGS(
 common_setting {
   id: "num_threads"
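
---

The backend stays off by default: `pixel_backend.mk` sets `WITH_PIXEL=0`, and only `WITH_PIXEL=1` adds the `--//android/java/org/mlperf/inference:with_pixel="1"` flag to the bazel invocation. A minimal sketch of building the app with the Pixel backend enabled, assuming the Docker-based `app` target from the top-level Makefile shown above:

```bash
# Command-line variables override the WITH_PIXEL=0 default in pixel_backend.mk,
# so this selects the use_pixel config_setting, links
# //mobile_back_pixel:tflitepixelbackend into the app, and adds "tflitepixel"
# to BACKEND_LIST in the generated Backends.java.
make WITH_PIXEL=1 app
```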