From c9a81961ca6771a981ae5d8149d93fb2d7b4eb18 Mon Sep 17 00:00:00 2001 From: erikmchut Date: Tue, 20 Oct 2020 13:32:48 -0700 Subject: [PATCH] Update tensorflow+deps to remove -pthread from wasm --- tensorflow/tensorflow.bzl | 1 + tensorflow/workspace.bzl | 44 +- third_party/FP16/workspace.bzl | 8 +- third_party/FXdiv/BUILD | 1 - third_party/FXdiv/BUILD.bazel | 15 - third_party/FXdiv/workspace.bzl | 15 - ..._google_absl_fix_wasm_config_setting.patch | 44 + third_party/pthreadpool.BUILD | 307 ++++ third_party/pthreadpool/BUILD | 1 - third_party/pthreadpool/BUILD.bazel | 32 - third_party/pthreadpool/workspace.bzl | 15 - third_party/xnnpack.BUILD | 1559 +++++++++-------- 12 files changed, 1221 insertions(+), 821 deletions(-) delete mode 100644 third_party/FXdiv/BUILD delete mode 100644 third_party/FXdiv/BUILD.bazel delete mode 100644 third_party/FXdiv/workspace.bzl create mode 100644 third_party/com_google_absl_fix_wasm_config_setting.patch create mode 100644 third_party/pthreadpool.BUILD delete mode 100644 third_party/pthreadpool/BUILD delete mode 100644 third_party/pthreadpool/BUILD.bazel delete mode 100644 third_party/pthreadpool/workspace.bzl diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 0626d145dd9e24..110ebbbaa1170c 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -324,6 +324,7 @@ def tf_copts( }) + select({ clean_dep("//tensorflow:android"): android_copts, + clean_dep("//tensorflow:emscripten"): [], clean_dep("//tensorflow:macos"): [], clean_dep("//tensorflow:windows"): get_win_copts(is_external), clean_dep("//tensorflow:ios"): [], diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 014d25e865f15f..929fa955605a8c 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -22,7 +22,6 @@ load( "def_file_filter_configure", ) load("//third_party/FP16:workspace.bzl", FP16 = "repo") -load("//third_party/FXdiv:workspace.bzl", FXdiv = "repo") load("//third_party/aws:workspace.bzl", aws = 
"repo") load("//third_party/clog:workspace.bzl", clog = "repo") load("//third_party/cpuinfo:workspace.bzl", cpuinfo = "repo") @@ -38,14 +37,12 @@ load("//third_party/opencl_headers:workspace.bzl", opencl_headers = "repo") load("//third_party/kissfft:workspace.bzl", kissfft = "repo") load("//third_party/pasta:workspace.bzl", pasta = "repo") load("//third_party/psimd:workspace.bzl", psimd = "repo") -load("//third_party/pthreadpool:workspace.bzl", pthreadpool = "repo") load("//third_party/sobol_data:workspace.bzl", sobol_data = "repo") load("//third_party/vulkan_headers:workspace.bzl", vulkan_headers = "repo") def initialize_third_party(): """ Load third party repositories. See above load() statements. """ FP16() - FXdiv() aws() clog() cpuinfo() @@ -61,7 +58,6 @@ def initialize_third_party(): opencl_headers() pasta() psimd() - pthreadpool() sobol_data() vulkan_headers() @@ -145,11 +141,32 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "XNNPACK", build_file = clean_dep("//third_party:xnnpack.BUILD"), - sha256 = "190e61e50af3497bb46b8d936bd2d2d551a9aeedb02ff66388918408a54e216a", - strip_prefix = "XNNPACK-b18783570f0643560be641b193367d3906955141", + sha256 = "246aa56afc5263f1d41fc4a3437ecd51b56f78e16421818961cf79e39431c1df", + strip_prefix = "XNNPACK-b9d07cfa38af15c2abf564c980e00c965857ba21", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/XNNPACK/archive/b18783570f0643560be641b193367d3906955141.zip", - "https://github.com/google/XNNPACK/archive/b18783570f0643560be641b193367d3906955141.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/XNNPACK/archive/b9d07cfa38af15c2abf564c980e00c965857ba21.zip", + "https://github.com/google/XNNPACK/archive/b9d07cfa38af15c2abf564c980e00c965857ba21.zip", + ], + ) + + tf_http_archive( + name = "FXdiv", + sha256 = "8224ff187cdfa178b8c54d36eea70520391781eda16d13a418ab5ae53289e1ab", + strip_prefix = 
"FXdiv-561254d968e5679460e6a0a743206410284d9f46", + urls = [ + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/Maratyszcza/FXdiv/archive/561254d968e5679460e6a0a743206410284d9f46.zip", + "https://github.com/Maratyszcza/FXdiv/archive/561254d968e5679460e6a0a743206410284d9f46.zip", + ], + ) + + tf_http_archive( + name = "pthreadpool", + build_file = clean_dep("//third_party:pthreadpool.BUILD"), + sha256 = "f894d845cefc091291329712deec85ce7020546f6eaff200b690ae04b6094535", + strip_prefix = "pthreadpool-bfa3b9ce6cb71dc8b792e39d24717320a4f92572", + urls = [ + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/Maratyszcza/pthreadpool/archive/bfa3b9ce6cb71dc8b792e39d24717320a4f92572.zip", + "https://github.com/Maratyszcza/pthreadpool/archive/bfa3b9ce6cb71dc8b792e39d24717320a4f92572.zip", ], ) @@ -185,12 +202,13 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): build_file = clean_dep("//third_party:com_google_absl.BUILD"), # TODO: Remove the patch when https://github.com/abseil/abseil-cpp/issues/326 is resolved # and when TensorFlow is build against CUDA 10.2 - patch_file = clean_dep("//third_party:com_google_absl_fix_mac_and_nvcc_build.patch"), - sha256 = "acd93f6baaedc4414ebd08b33bebca7c7a46888916101d8c0b8083573526d070", # SHARED_ABSL_SHA - strip_prefix = "abseil-cpp-43ef2148c0936ebf7cb4be6b19927a9d9d145b8f", + # Added wasm config patch for code8 + patch_file = clean_dep("//third_party:com_google_absl_fix_wasm_config_setting.patch"), + sha256 = "ec8ef47335310cc3382bdc0d0cc1097a001e67dc83fcba807845aa5696e7e1e4", # SHARED_ABSL_SHA + strip_prefix = "abseil-cpp-302b250e1d917ede77b5ff00a6fd9f28430f1563", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/abseil/abseil-cpp/archive/43ef2148c0936ebf7cb4be6b19927a9d9d145b8f.tar.gz", - "https://github.com/abseil/abseil-cpp/archive/43ef2148c0936ebf7cb4be6b19927a9d9d145b8f.tar.gz", + 
"https://storage.googleapis.com/mirror.tensorflow.org/github.com/abseil/abseil-cpp/archive/302b250e1d917ede77b5ff00a6fd9f28430f1563.tar.gz", + "https://github.com/abseil/abseil-cpp/archive/302b250e1d917ede77b5ff00a6fd9f28430f1563.tar.gz", ], ) diff --git a/third_party/FP16/workspace.bzl b/third_party/FP16/workspace.bzl index f17352eade3cb4..441ef6b15e1a62 100644 --- a/third_party/FP16/workspace.bzl +++ b/third_party/FP16/workspace.bzl @@ -5,11 +5,11 @@ load("//third_party:repo.bzl", "third_party_http_archive") def repo(): third_party_http_archive( name = "FP16", - strip_prefix = "FP16-febbb1c163726b5db24bed55cc9dc42529068997", - sha256 = "3e71681e0a67cd28552aa0bbb78ec6a6bd238216df15336dc1326280f7958de2", + strip_prefix = "FP16-3c54eacb74f6f5e39077300c5564156c424d77ba", + sha256 = "0d56bb92f649ec294dbccb13e04865e3c82933b6f6735d1d7145de45da700156", urls = [ - "https://mirror.bazel.build/github.com/Maratyszcza/FP16/archive/febbb1c163726b5db24bed55cc9dc42529068997.tar.gz", - "https://github.com/Maratyszcza/FP16/archive/febbb1c163726b5db24bed55cc9dc42529068997.tar.gz", + "https://mirror.bazel.build/github.com/Maratyszcza/FP16/archive/3c54eacb74f6f5e39077300c5564156c424d77ba.zip", + "https://github.com/Maratyszcza/FP16/archive/3c54eacb74f6f5e39077300c5564156c424d77ba.zip", ], build_file = "//third_party/FP16:BUILD.bazel", ) diff --git a/third_party/FXdiv/BUILD b/third_party/FXdiv/BUILD deleted file mode 100644 index 82bab3ffd96463..00000000000000 --- a/third_party/FXdiv/BUILD +++ /dev/null @@ -1 +0,0 @@ -# This empty BUILD file is required to make Bazel treat this directory as a package. 
diff --git a/third_party/FXdiv/BUILD.bazel b/third_party/FXdiv/BUILD.bazel deleted file mode 100644 index ef2853b7ceb8fb..00000000000000 --- a/third_party/FXdiv/BUILD.bazel +++ /dev/null @@ -1,15 +0,0 @@ -# Description: -# C99/C++ library for division via fixed-point multiplication by inverse - -package(default_visibility = ["//visibility:public"]) - -licenses(["notice"]) - -exports_files(["LICENSE"]) - -cc_library( - name = "FXdiv", - hdrs = glob(["include/fxdiv.h"]), - includes = ["include"], - strip_include_prefix = "include", -) diff --git a/third_party/FXdiv/workspace.bzl b/third_party/FXdiv/workspace.bzl deleted file mode 100644 index 91a6a6ce86051a..00000000000000 --- a/third_party/FXdiv/workspace.bzl +++ /dev/null @@ -1,15 +0,0 @@ -"""Loads the FXdiv library, used by XNNPACK & pthreadpool.""" - -load("//third_party:repo.bzl", "third_party_http_archive") - -def repo(): - third_party_http_archive( - name = "FXdiv", - strip_prefix = "FXdiv-f8c5354679ec2597792bc70a9e06eff50c508b9a", - sha256 = "7d3215bea832fe77091ec5666200b91156df6724da1e348205078346325fc45e", - urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/Maratyszcza/FXdiv/archive/f8c5354679ec2597792bc70a9e06eff50c508b9a.tar.gz", - "https://github.com/Maratyszcza/FXdiv/archive/f8c5354679ec2597792bc70a9e06eff50c508b9a.tar.gz", - ], - build_file = "//third_party/FXdiv:BUILD.bazel", - ) diff --git a/third_party/com_google_absl_fix_wasm_config_setting.patch b/third_party/com_google_absl_fix_wasm_config_setting.patch new file mode 100644 index 00000000000000..845b3ece17aab9 --- /dev/null +++ b/third_party/com_google_absl_fix_wasm_config_setting.patch @@ -0,0 +1,44 @@ +--- ./absl/BUILD.bazel 2020-07-12 14:21:35.000000000 -0700 ++++ ./absl/BUILD.bazel 2020-10-20 12:17:37.000000000 -0700 +@@ -61,5 +61,5 @@ + + config_setting( + name = "wasm", +- values = {"cpu": "wasm32"}, ++ values = {"cpu": "js"}, + ) +--- ./absl/time/internal/cctz/BUILD.bazel 2019-09-23 13:20:52.000000000 -0700 ++++ 
./absl/time/internal/cctz/BUILD.bazel.fixed 2019-09-23 13:20:48.000000000 -0700 +@@ -76,15 +76,6 @@ + "include/cctz/time_zone.h", + "include/cctz/zone_info_source.h", + ], +- linkopts = select({ +- ":osx": [ +- "-framework Foundation", +- ], +- ":ios": [ +- "-framework Foundation", +- ], +- "//conditions:default": [], +- }), + visibility = ["//visibility:public"], + deps = [":civil_time"], + ) +--- ./absl/strings/string_view.h 2019-09-23 13:20:52.000000000 -0700 ++++ ./absl/strings/string_view.h.fixed 2019-09-23 13:20:48.000000000 -0700 +@@ -520,7 +520,14 @@ + (std::numeric_limits<difference_type>::max)(); + + static constexpr size_type CheckLengthInternal(size_type len) { ++#if defined(__NVCC__) && (__CUDACC_VER_MAJOR__<10 || (__CUDACC_VER_MAJOR__==10 && __CUDACC_VER_MINOR__<2)) && !defined(NDEBUG) ++ // An nvcc bug treats the original return expression as a non-constant, ++ // which is not allowed in a constexpr function. This only happens when ++ // NDEBUG is not defined. This will be fixed in the CUDA 10.2 release. 
++ return len; ++#else + return ABSL_HARDENING_ASSERT(len <= kMaxSize), len; ++#endif + } + + const char* ptr_; diff --git a/third_party/pthreadpool.BUILD b/third_party/pthreadpool.BUILD new file mode 100644 index 00000000000000..ab683f6d210a54 --- /dev/null +++ b/third_party/pthreadpool.BUILD @@ -0,0 +1,307 @@ +load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_test") + +licenses(["notice"]) + +############################## pthreadpool library ############################# + +INTERNAL_HDRS = [ + "src/threadpool-atomics.h", + "src/threadpool-common.h", + "src/threadpool-object.h", + "src/threadpool-utils.h", +] + +PORTABLE_SRCS = [ + "src/memory.c", + "src/portable-api.c", +] + +PTHREADS_IMPL_SRCS = PORTABLE_SRCS + ["src/pthreads.c"] + +GCD_IMPL_SRCS = PORTABLE_SRCS + ["src/gcd.c"] + +SHIM_IMPL_SRCS = ["src/shim.c"] + +INTERNAL_HDRS = [ + "src/threadpool-atomics.h", + "src/threadpool-common.h", + "src/threadpool-object.h", + "src/threadpool-utils.h", +] + +PORTABLE_SRCS = [ + "src/memory.c", + "src/portable-api.c", +] + +PTHREADS_IMPL_SRCS = PORTABLE_SRCS + ["src/pthreads.c"] + +GCD_IMPL_SRCS = PORTABLE_SRCS + ["src/gcd.c"] + +WINDOWS_IMPL_SRCS = PORTABLE_SRCS + ["src/windows.c"] + +SHIM_IMPL_SRCS = ["src/shim.c"] + +cc_library( + name = "pthreadpool", + srcs = select({ + ":pthreadpool_sync_primitive_explicit_condvar": INTERNAL_HDRS + PTHREADS_IMPL_SRCS, + ":pthreadpool_sync_primitive_explicit_futex": INTERNAL_HDRS + PTHREADS_IMPL_SRCS, + ":pthreadpool_sync_primitive_explicit_gcd": INTERNAL_HDRS + GCD_IMPL_SRCS, + ":pthreadpool_sync_primitive_explicit_event": INTERNAL_HDRS + WINDOWS_IMPL_SRCS, + ":emscripten_with_threads": INTERNAL_HDRS + PTHREADS_IMPL_SRCS, + ":emscripten": INTERNAL_HDRS + SHIM_IMPL_SRCS, + ":macos_x86": INTERNAL_HDRS + GCD_IMPL_SRCS, + ":macos_x86_64": INTERNAL_HDRS + GCD_IMPL_SRCS, + ":ios": INTERNAL_HDRS + GCD_IMPL_SRCS, + ":windows_x86_64": INTERNAL_HDRS + WINDOWS_IMPL_SRCS, + ":windows_x86_64_msvc": INTERNAL_HDRS + 
WINDOWS_IMPL_SRCS, + "//conditions:default": INTERNAL_HDRS + PTHREADS_IMPL_SRCS, + }), + copts = [ + "-std=gnu11", + ] + select({ + ":optimized_build": ["-O2"], + "//conditions:default": [], + }) + select({ + ":linux_arm": ["-DPTHREADPOOL_USE_CPUINFO=1"], + ":linux_armhf": ["-DPTHREADPOOL_USE_CPUINFO=1"], + ":linux_aarch64": ["-DPTHREADPOOL_USE_CPUINFO=1"], + ":android_armv7": ["-DPTHREADPOOL_USE_CPUINFO=1"], + ":android_arm64": ["-DPTHREADPOOL_USE_CPUINFO=1"], + "//conditions:default": ["-DPTHREADPOOL_USE_CPUINFO=0"], + }) + select({ + ":pthreadpool_sync_primitive_explicit_condvar": [ + "-DPTHREADPOOL_USE_CONDVAR=1", + "-DPTHREADPOOL_USE_FUTEX=0", + "-DPTHREADPOOL_USE_GCD=0", + "-DPTHREADPOOL_USE_EVENT=0", + ], + ":pthreadpool_sync_primitive_explicit_futex": [ + "-DPTHREADPOOL_USE_CONDVAR=0", + "-DPTHREADPOOL_USE_FUTEX=1", + "-DPTHREADPOOL_USE_GCD=0", + "-DPTHREADPOOL_USE_EVENT=0", + ], + ":pthreadpool_sync_primitive_explicit_gcd": [ + "-DPTHREADPOOL_USE_CONDVAR=0", + "-DPTHREADPOOL_USE_FUTEX=0", + "-DPTHREADPOOL_USE_GCD=1", + "-DPTHREADPOOL_USE_EVENT=0", + ], + ":pthreadpool_sync_primitive_explicit_event": [ + "-DPTHREADPOOL_USE_CONDVAR=0", + "-DPTHREADPOOL_USE_FUTEX=0", + "-DPTHREADPOOL_USE_GCD=0", + "-DPTHREADPOOL_USE_EVENT=1", + ], + "//conditions:default": [], + }), + hdrs = [ + "include/pthreadpool.h", + ], + defines = [ + "PTHREADPOOL_NO_DEPRECATED_API", + ], + includes = [ + "include", + ], + linkopts = select({ + ":emscripten_with_threads": [ + "-s ALLOW_BLOCKING_ON_MAIN_THREAD=1", + "-s PTHREAD_POOL_SIZE=8", + ], + "//conditions:default": [], + }), + strip_include_prefix = "include", + deps = [ + "@FXdiv", + ] + select({ + ":linux_arm": ["@cpuinfo"], + ":linux_armhf": ["@cpuinfo"], + ":linux_aarch64": ["@cpuinfo"], + ":android_armv7": ["@cpuinfo"], + ":android_arm64": ["@cpuinfo"], + "//conditions:default": [], + }), + visibility = ["//visibility:public"], +) + +################################## Unit tests ################################## + 
+EMSCRIPTEN_TEST_LINKOPTS = [ + "-s ASSERTIONS=2", + "-s ERROR_ON_UNDEFINED_SYMBOLS=1", + "-s DEMANGLE_SUPPORT=1", + "-s EXIT_RUNTIME=1", + "-s ALLOW_MEMORY_GROWTH=0", + "-s TOTAL_MEMORY=67108864", # 64M +] + +cc_test( + name = "pthreadpool_test", + srcs = ["test/pthreadpool.cc"], + linkopts = select({ + ":emscripten": EMSCRIPTEN_TEST_LINKOPTS, + "//conditions:default": [], + }), + deps = [ + ":pthreadpool", + "@com_google_googletest//:gtest_main", + ], +) + +################################## Benchmarks ################################## + +EMSCRIPTEN_BENCHMARK_LINKOPTS = [ + "-s ASSERTIONS=1", + "-s ERROR_ON_UNDEFINED_SYMBOLS=1", + "-s EXIT_RUNTIME=1", + "-s ALLOW_MEMORY_GROWTH=0", +] + +cc_binary( + name = "latency_bench", + srcs = ["bench/latency.cc"], + linkopts = select({ + ":emscripten": EMSCRIPTEN_BENCHMARK_LINKOPTS, + "//conditions:default": [], + }), + deps = [ + ":pthreadpool", + "@com_google_benchmark//:benchmark", + ], +) + +cc_binary( + name = "throughput_bench", + srcs = ["bench/throughput.cc"], + linkopts = select({ + ":emscripten": EMSCRIPTEN_BENCHMARK_LINKOPTS, + "//conditions:default": [], + }), + deps = [ + ":pthreadpool", + "@com_google_benchmark//:benchmark", + ], +) + +############################# Build configurations ############################# + +# Synchronize workers using pthreads condition variable. +config_setting( + name = "pthreadpool_sync_primitive_explicit_condvar", + define_values = {"pthreadpool_sync_primitive": "condvar"}, +) + +# Synchronize workers using futex. +config_setting( + name = "pthreadpool_sync_primitive_explicit_futex", + define_values = {"pthreadpool_sync_primitive": "futex"}, +) + +# Synchronize workers using Grand Central Dispatch. +config_setting( + name = "pthreadpool_sync_primitive_explicit_gcd", + define_values = {"pthreadpool_sync_primitive": "gcd"}, +) + +# Synchronize workers using WinAPI event. 
+config_setting( + name = "pthreadpool_sync_primitive_explicit_event", + define_values = {"pthreadpool_sync_primitive": "event"}, +) + +config_setting( + name = "optimized_build", + values = { + "compilation_mode": "opt", + }, +) + +config_setting( + name = "linux_arm", + values = {"cpu": "arm"}, +) + +config_setting( + name = "linux_armhf", + values = {"cpu": "armhf"}, +) + +config_setting( + name = "linux_aarch64", + values = {"cpu": "aarch64"}, +) + +config_setting( + name = "android_armv7", + values = { + "crosstool_top": "//external:android/crosstool", + "cpu": "armeabi-v7a", + }, +) + +config_setting( + name = "android_arm64", + values = { + "crosstool_top": "//external:android/crosstool", + "cpu": "arm64-v8a", + }, +) + +# Note: we need to individually match x86 and x86-64 macOS rather than use +# catch-all "apple_platform_type": "macos" because that option defaults to +# "macos" even when building on Linux! +config_setting( + name = "macos_x86", + values = { + "apple_platform_type": "macos", + "cpu": "darwin", + }, +) + +config_setting( + name = "macos_x86_64", + values = { + "apple_platform_type": "macos", + "cpu": "darwin_x86_64", + }, +) + +config_setting( + name = "ios", + values = { + "crosstool_top": "@bazel_tools//tools/cpp:toolchain", + "apple_platform_type": "ios", + }, +) + +config_setting( + name = "windows_x86_64", + values = { + "cpu": "x64_windows", + }, +) + +config_setting( + name = "windows_x86_64_msvc", + values = { + "cpu": "x64_windows_msvc", + }, +) + +config_setting( + name = "emscripten", + values = { + "cpu": "js", + } +) + +config_setting( + name = "emscripten_with_threads", + values = { + "crosstool_top": "//toolchain:emscripten", + "copt": "-pthread", + } +) diff --git a/third_party/pthreadpool/BUILD b/third_party/pthreadpool/BUILD deleted file mode 100644 index 82bab3ffd96463..00000000000000 --- a/third_party/pthreadpool/BUILD +++ /dev/null @@ -1 +0,0 @@ -# This empty BUILD file is required to make Bazel treat this directory as a 
package. diff --git a/third_party/pthreadpool/BUILD.bazel b/third_party/pthreadpool/BUILD.bazel deleted file mode 100644 index 1267e4f37368e4..00000000000000 --- a/third_party/pthreadpool/BUILD.bazel +++ /dev/null @@ -1,32 +0,0 @@ -# Description: -# Portable pthread-based thread pool for C and C++ - -package(default_visibility = ["//visibility:public"]) - -licenses(["notice"]) - -exports_files(["LICENSE"]) - -cc_library( - name = "pthreadpool", - srcs = [ - "src/threadpool-pthreads.c", - "src/threadpool-utils.h", - ], - hdrs = [ - "include/pthreadpool.h", - ], - copts = [ - "-O2", - ], - defines = [ - "PTHREADPOOL_NO_DEPRECATED_API", - ], - includes = [ - "include", - ], - strip_include_prefix = "include", - deps = [ - "@FXdiv", - ], -) diff --git a/third_party/pthreadpool/workspace.bzl b/third_party/pthreadpool/workspace.bzl deleted file mode 100644 index b21c9ca12f8bc3..00000000000000 --- a/third_party/pthreadpool/workspace.bzl +++ /dev/null @@ -1,15 +0,0 @@ -"""Loads the pthreadpool library, used by XNNPACK.""" - -load("//third_party:repo.bzl", "third_party_http_archive") - -def repo(): - third_party_http_archive( - name = "pthreadpool", - strip_prefix = "pthreadpool-7ad026703b3109907ad124025918da15cfd3f100", - sha256 = "96eb4256fc438b7b8cab40541d383efaf546fae7bad380c24ea601c326c5f685", - urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/Maratyszcza/pthreadpool/archive/7ad026703b3109907ad124025918da15cfd3f100.tar.gz", - "https://github.com/Maratyszcza/pthreadpool/archive/7ad026703b3109907ad124025918da15cfd3f100.tar.gz", - ], - build_file = "//third_party/pthreadpool:BUILD.bazel", - ) diff --git a/third_party/xnnpack.BUILD b/third_party/xnnpack.BUILD index 2e33465804c645..c0fce42965b012 100644 --- a/third_party/xnnpack.BUILD +++ b/third_party/xnnpack.BUILD @@ -82,8 +82,8 @@ SCALAR_UKERNELS = [ "src/f32-argmaxpool/4x-scalar-c1.c", "src/f32-argmaxpool/9p8x-scalar-c1.c", "src/f32-argmaxpool/9x-scalar-c1.c", - 
"src/f32-avgpool/9p8x-scalar-c1.c", - "src/f32-avgpool/9x-scalar-c1.c", + "src/f32-avgpool/9p8x-minmax-scalar-c1.c", + "src/f32-avgpool/9x-minmax-scalar-c1.c", "src/f32-clamp/gen/scalar-x1.c", "src/f32-clamp/gen/scalar-x2.c", "src/f32-clamp/gen/scalar-x4.c", @@ -94,28 +94,44 @@ SCALAR_UKERNELS = [ "src/f32-dwconv-spchw/3x3s2p1-scalar.c", "src/f32-dwconv-spchw/5x5p2-scalar.c", "src/f32-dwconv-spchw/5x5s2p2-scalar.c", - "src/f32-dwconv/gen/up1x25-scalar-acc2.c", - "src/f32-dwconv/gen/up1x25-scalar.c", "src/f32-dwconv/gen/up1x4-scalar-acc2.c", "src/f32-dwconv/gen/up1x4-scalar.c", "src/f32-dwconv/gen/up1x9-scalar-acc2.c", "src/f32-dwconv/gen/up1x9-scalar.c", - "src/f32-dwconv/gen/up2x25-scalar-acc2.c", - "src/f32-dwconv/gen/up2x25-scalar.c", + "src/f32-dwconv/gen/up1x25-scalar-acc2.c", + "src/f32-dwconv/gen/up1x25-scalar.c", "src/f32-dwconv/gen/up2x4-scalar-acc2.c", "src/f32-dwconv/gen/up2x4-scalar.c", "src/f32-dwconv/gen/up2x9-scalar-acc2.c", "src/f32-dwconv/gen/up2x9-scalar.c", + "src/f32-dwconv/gen/up2x25-scalar-acc2.c", + "src/f32-dwconv/gen/up2x25-scalar.c", + "src/f32-dwconv/gen/up1x4-minmax-scalar-acc2.c", + "src/f32-dwconv/gen/up1x4-minmax-scalar.c", + "src/f32-dwconv/gen/up1x9-minmax-scalar-acc2.c", + "src/f32-dwconv/gen/up1x9-minmax-scalar.c", + "src/f32-dwconv/gen/up1x25-minmax-scalar-acc2.c", + "src/f32-dwconv/gen/up1x25-minmax-scalar.c", + "src/f32-dwconv/gen/up2x4-minmax-scalar-acc2.c", + "src/f32-dwconv/gen/up2x4-minmax-scalar.c", + "src/f32-dwconv/gen/up2x9-minmax-scalar-acc2.c", + "src/f32-dwconv/gen/up2x9-minmax-scalar.c", + "src/f32-dwconv/gen/up2x25-minmax-scalar-acc2.c", + "src/f32-dwconv/gen/up2x25-minmax-scalar.c", "src/f32-gavgpool-spchw/scalar-x1.c", - "src/f32-gavgpool/7p7x-scalar-c1.c", - "src/f32-gavgpool/7x-scalar-c1.c", - "src/f32-gemm/gen-inc/1x4-scalar.c", - "src/f32-gemm/gen-inc/2x4-scalar.c", - "src/f32-gemm/gen-inc/4x4-scalar.c", + "src/f32-gavgpool/7p7x-minmax-scalar-c1.c", + "src/f32-gavgpool/7x-minmax-scalar-c1.c", + 
"src/f32-gemm/gen-inc/1x4inc-minmax-scalar.c", + "src/f32-gemm/gen-inc/2x4inc-minmax-scalar.c", + "src/f32-gemm/gen-inc/4x4inc-minmax-scalar.c", "src/f32-gemm/gen/1x4-scalar.c", "src/f32-gemm/gen/2x4-scalar.c", "src/f32-gemm/gen/4x2-scalar.c", "src/f32-gemm/gen/4x4-scalar.c", + "src/f32-gemm/gen/1x4-minmax-scalar.c", + "src/f32-gemm/gen/2x4-minmax-scalar.c", + "src/f32-gemm/gen/4x2-minmax-scalar.c", + "src/f32-gemm/gen/4x4-minmax-scalar.c", "src/f32-hswish/gen/scalar-x1.c", "src/f32-hswish/gen/scalar-x2.c", "src/f32-hswish/gen/scalar-x4.c", @@ -126,13 +142,17 @@ SCALAR_UKERNELS = [ "src/f32-igemm/gen/2x4-scalar.c", "src/f32-igemm/gen/4x2-scalar.c", "src/f32-igemm/gen/4x4-scalar.c", - "src/f32-maxpool/9p8x-scalar-c1.c", - "src/f32-pavgpool/9p8x-scalar-c1.c", - "src/f32-pavgpool/9x-scalar-c1.c", - "src/f32-ppmm/gen/2x4-scalar.c", - "src/f32-ppmm/gen/3x3-scalar.c", - "src/f32-ppmm/gen/4x2-scalar.c", - "src/f32-ppmm/gen/4x4-scalar.c", + "src/f32-igemm/gen/1x4-minmax-scalar.c", + "src/f32-igemm/gen/2x4-minmax-scalar.c", + "src/f32-igemm/gen/4x2-minmax-scalar.c", + "src/f32-igemm/gen/4x4-minmax-scalar.c", + "src/f32-maxpool/9p8x-minmax-scalar-c1.c", + "src/f32-pavgpool/9p8x-minmax-scalar-c1.c", + "src/f32-pavgpool/9x-minmax-scalar-c1.c", + "src/f32-ppmm/gen/2x4-minmax-scalar.c", + "src/f32-ppmm/gen/3x3-minmax-scalar.c", + "src/f32-ppmm/gen/4x2-minmax-scalar.c", + "src/f32-ppmm/gen/4x4-minmax-scalar.c", "src/f32-prelu/gen/scalar-2x1.c", "src/f32-prelu/gen/scalar-2x4.c", "src/f32-raddstoreexpminusmax/gen/scalar-lut64-p2-x1.c", @@ -157,28 +177,28 @@ SCALAR_UKERNELS = [ "src/f32-sigmoid/gen/scalar-p5-div-x1.c", "src/f32-sigmoid/gen/scalar-p5-div-x2.c", "src/f32-sigmoid/gen/scalar-p5-div-x4.c", - "src/f32-spmm/gen/1x1-scalar-pipelined.c", - "src/f32-spmm/gen/1x1-scalar.c", - "src/f32-spmm/gen/2x1-scalar-pipelined.c", - "src/f32-spmm/gen/2x1-scalar.c", - "src/f32-spmm/gen/4x1-scalar-pipelined.c", - "src/f32-spmm/gen/4x1-scalar.c", - "src/f32-spmm/gen/8x1-scalar-pipelined.c", - 
"src/f32-spmm/gen/8x1-scalar.c", - "src/f32-spmm/gen/8x2-scalar.c", - "src/f32-spmm/gen/8x4-scalar.c", - "src/f32-vbinary/gen/vadd-scalar-x1.c", - "src/f32-vbinary/gen/vadd-scalar-x2.c", - "src/f32-vbinary/gen/vadd-scalar-x4.c", - "src/f32-vbinary/gen/vaddc-scalar-x1.c", - "src/f32-vbinary/gen/vaddc-scalar-x2.c", - "src/f32-vbinary/gen/vaddc-scalar-x4.c", - "src/f32-vbinary/gen/vdiv-scalar-x1.c", - "src/f32-vbinary/gen/vdiv-scalar-x2.c", - "src/f32-vbinary/gen/vdiv-scalar-x4.c", - "src/f32-vbinary/gen/vdivc-scalar-x1.c", - "src/f32-vbinary/gen/vdivc-scalar-x2.c", - "src/f32-vbinary/gen/vdivc-scalar-x4.c", + "src/f32-spmm/gen/1x1-minmax-scalar-pipelined.c", + "src/f32-spmm/gen/1x1-minmax-scalar.c", + "src/f32-spmm/gen/2x1-minmax-scalar-pipelined.c", + "src/f32-spmm/gen/2x1-minmax-scalar.c", + "src/f32-spmm/gen/4x1-minmax-scalar-pipelined.c", + "src/f32-spmm/gen/4x1-minmax-scalar.c", + "src/f32-spmm/gen/8x1-minmax-scalar-pipelined.c", + "src/f32-spmm/gen/8x1-minmax-scalar.c", + "src/f32-spmm/gen/8x2-minmax-scalar.c", + "src/f32-spmm/gen/8x4-minmax-scalar.c", + "src/f32-vbinary/gen/vadd-minmax-scalar-x1.c", + "src/f32-vbinary/gen/vadd-minmax-scalar-x2.c", + "src/f32-vbinary/gen/vadd-minmax-scalar-x4.c", + "src/f32-vbinary/gen/vaddc-minmax-scalar-x1.c", + "src/f32-vbinary/gen/vaddc-minmax-scalar-x2.c", + "src/f32-vbinary/gen/vaddc-minmax-scalar-x4.c", + "src/f32-vbinary/gen/vdiv-minmax-scalar-x1.c", + "src/f32-vbinary/gen/vdiv-minmax-scalar-x2.c", + "src/f32-vbinary/gen/vdiv-minmax-scalar-x4.c", + "src/f32-vbinary/gen/vdivc-minmax-scalar-x1.c", + "src/f32-vbinary/gen/vdivc-minmax-scalar-x2.c", + "src/f32-vbinary/gen/vdivc-minmax-scalar-x4.c", "src/f32-vbinary/gen/vmax-scalar-x1.c", "src/f32-vbinary/gen/vmax-scalar-x2.c", "src/f32-vbinary/gen/vmax-scalar-x4.c", @@ -191,44 +211,44 @@ SCALAR_UKERNELS = [ "src/f32-vbinary/gen/vminc-scalar-x1.c", "src/f32-vbinary/gen/vminc-scalar-x2.c", "src/f32-vbinary/gen/vminc-scalar-x4.c", - "src/f32-vbinary/gen/vmul-scalar-x1.c", - 
"src/f32-vbinary/gen/vmul-scalar-x2.c", - "src/f32-vbinary/gen/vmul-scalar-x4.c", - "src/f32-vbinary/gen/vmulc-scalar-x1.c", - "src/f32-vbinary/gen/vmulc-scalar-x2.c", - "src/f32-vbinary/gen/vmulc-scalar-x4.c", - "src/f32-vbinary/gen/vrdivc-scalar-x1.c", - "src/f32-vbinary/gen/vrdivc-scalar-x2.c", - "src/f32-vbinary/gen/vrdivc-scalar-x4.c", - "src/f32-vbinary/gen/vrsubc-scalar-x1.c", - "src/f32-vbinary/gen/vrsubc-scalar-x2.c", - "src/f32-vbinary/gen/vrsubc-scalar-x4.c", - "src/f32-vbinary/gen/vsub-scalar-x1.c", - "src/f32-vbinary/gen/vsub-scalar-x2.c", - "src/f32-vbinary/gen/vsub-scalar-x4.c", - "src/f32-vbinary/gen/vsubc-scalar-x1.c", - "src/f32-vbinary/gen/vsubc-scalar-x2.c", - "src/f32-vbinary/gen/vsubc-scalar-x4.c", - "src/f32-vmulcaddc/gen/c1-scalar-2x.c", - "src/f32-vmulcaddc/gen/c2-scalar-2x.c", - "src/f32-vmulcaddc/gen/c4-scalar-2x.c", + "src/f32-vbinary/gen/vmul-minmax-scalar-x1.c", + "src/f32-vbinary/gen/vmul-minmax-scalar-x2.c", + "src/f32-vbinary/gen/vmul-minmax-scalar-x4.c", + "src/f32-vbinary/gen/vmulc-minmax-scalar-x1.c", + "src/f32-vbinary/gen/vmulc-minmax-scalar-x2.c", + "src/f32-vbinary/gen/vmulc-minmax-scalar-x4.c", + "src/f32-vbinary/gen/vrdivc-minmax-scalar-x1.c", + "src/f32-vbinary/gen/vrdivc-minmax-scalar-x2.c", + "src/f32-vbinary/gen/vrdivc-minmax-scalar-x4.c", + "src/f32-vbinary/gen/vrsubc-minmax-scalar-x1.c", + "src/f32-vbinary/gen/vrsubc-minmax-scalar-x2.c", + "src/f32-vbinary/gen/vrsubc-minmax-scalar-x4.c", + "src/f32-vbinary/gen/vsub-minmax-scalar-x1.c", + "src/f32-vbinary/gen/vsub-minmax-scalar-x2.c", + "src/f32-vbinary/gen/vsub-minmax-scalar-x4.c", + "src/f32-vbinary/gen/vsubc-minmax-scalar-x1.c", + "src/f32-vbinary/gen/vsubc-minmax-scalar-x2.c", + "src/f32-vbinary/gen/vsubc-minmax-scalar-x4.c", + "src/f32-vmulcaddc/gen/c1-minmax-scalar-2x.c", + "src/f32-vmulcaddc/gen/c2-minmax-scalar-2x.c", + "src/f32-vmulcaddc/gen/c4-minmax-scalar-2x.c", "src/math/expminus-scalar-lut2048-p1.c", "src/math/expminus-scalar-lut64-p2.c", 
"src/math/expminus-scalar-p5.c", "src/math/sigmoid-scalar-lut2048-p1-div.c", "src/math/sigmoid-scalar-lut64-p2-div.c", "src/math/sigmoid-scalar-p5-div.c", - "src/q8-avgpool/9p8x-scalar-c1.c", - "src/q8-avgpool/9x-scalar-c1.c", - "src/q8-dwconv/up1x9-scalar.c", - "src/q8-gavgpool/7p7x-scalar-c1.c", - "src/q8-gavgpool/7x-scalar-c1.c", - "src/q8-gemm/2x2-scalar.c", - "src/q8-igemm/2x2-scalar.c", - "src/q8-vadd/scalar.c", + "src/q8-avgpool/9p8x-minmax-scalar-c1.c", + "src/q8-avgpool/9x-minmax-scalar-c1.c", + "src/q8-dwconv/up1x9-minmax-scalar.c", + "src/q8-gavgpool/7p7x-minmax-scalar-c1.c", + "src/q8-gavgpool/7x-minmax-scalar-c1.c", + "src/q8-gemm/2x2-minmax-scalar.c", + "src/q8-igemm/2x2-minmax-scalar.c", + "src/q8-vadd/minmax-scalar.c", "src/u8-clamp/scalar-x4.c", "src/u8-lut32norm/scalar.c", - "src/u8-maxpool/9p8x-scalar-c1.c", + "src/u8-maxpool/9p8x-minmax-scalar-c1.c", "src/u8-rmax/scalar.c", "src/x32-packx/x2-scalar.c", "src/x32-packx/x3-scalar.c", @@ -251,32 +271,48 @@ SCALAR_UKERNELS = [ ] WASM_UKERNELS = [ - "src/f32-avgpool/9p8x-wasm-c1.c", - "src/f32-avgpool/9x-wasm-c1.c", + "src/f32-avgpool/9p8x-minmax-wasm-c1.c", + "src/f32-avgpool/9x-minmax-wasm-c1.c", "src/f32-clamp/gen/wasm-x1.c", "src/f32-clamp/gen/wasm-x2.c", "src/f32-clamp/gen/wasm-x4.c", - "src/f32-dwconv/gen/up1x25-wasm-acc2.c", - "src/f32-dwconv/gen/up1x25-wasm.c", "src/f32-dwconv/gen/up1x4-wasm-acc2.c", "src/f32-dwconv/gen/up1x4-wasm.c", "src/f32-dwconv/gen/up1x9-wasm-acc2.c", "src/f32-dwconv/gen/up1x9-wasm.c", - "src/f32-dwconv/gen/up2x25-wasm-acc2.c", - "src/f32-dwconv/gen/up2x25-wasm.c", + "src/f32-dwconv/gen/up1x25-wasm-acc2.c", + "src/f32-dwconv/gen/up1x25-wasm.c", "src/f32-dwconv/gen/up2x4-wasm-acc2.c", "src/f32-dwconv/gen/up2x4-wasm.c", "src/f32-dwconv/gen/up2x9-wasm-acc2.c", "src/f32-dwconv/gen/up2x9-wasm.c", - "src/f32-gavgpool/7p7x-wasm-c1.c", - "src/f32-gavgpool/7x-wasm-c1.c", - "src/f32-gemm/gen-inc/1x4-wasm.c", - "src/f32-gemm/gen-inc/2x4-wasm.c", - "src/f32-gemm/gen-inc/4x4-wasm.c", 
+ "src/f32-dwconv/gen/up2x25-wasm-acc2.c", + "src/f32-dwconv/gen/up2x25-wasm.c", + "src/f32-dwconv/gen/up1x4-minmax-wasm-acc2.c", + "src/f32-dwconv/gen/up1x4-minmax-wasm.c", + "src/f32-dwconv/gen/up1x9-minmax-wasm-acc2.c", + "src/f32-dwconv/gen/up1x9-minmax-wasm.c", + "src/f32-dwconv/gen/up1x25-minmax-wasm-acc2.c", + "src/f32-dwconv/gen/up1x25-minmax-wasm.c", + "src/f32-dwconv/gen/up2x4-minmax-wasm-acc2.c", + "src/f32-dwconv/gen/up2x4-minmax-wasm.c", + "src/f32-dwconv/gen/up2x9-minmax-wasm-acc2.c", + "src/f32-dwconv/gen/up2x9-minmax-wasm.c", + "src/f32-dwconv/gen/up2x25-minmax-wasm-acc2.c", + "src/f32-dwconv/gen/up2x25-minmax-wasm.c", + "src/f32-gavgpool/7p7x-minmax-wasm-c1.c", + "src/f32-gavgpool/7x-minmax-wasm-c1.c", + "src/f32-gemm/gen-inc/1x4inc-minmax-wasm.c", + "src/f32-gemm/gen-inc/2x4inc-minmax-wasm.c", + "src/f32-gemm/gen-inc/4x4inc-minmax-wasm.c", "src/f32-gemm/gen/1x4-wasm.c", "src/f32-gemm/gen/2x4-wasm.c", "src/f32-gemm/gen/4x2-wasm.c", "src/f32-gemm/gen/4x4-wasm.c", + "src/f32-gemm/gen/1x4-minmax-wasm.c", + "src/f32-gemm/gen/2x4-minmax-wasm.c", + "src/f32-gemm/gen/4x2-minmax-wasm.c", + "src/f32-gemm/gen/4x4-minmax-wasm.c", "src/f32-hswish/gen/wasm-x1.c", "src/f32-hswish/gen/wasm-x2.c", "src/f32-hswish/gen/wasm-x4.c", @@ -284,21 +320,25 @@ WASM_UKERNELS = [ "src/f32-igemm/gen/2x4-wasm.c", "src/f32-igemm/gen/4x2-wasm.c", "src/f32-igemm/gen/4x4-wasm.c", - "src/f32-maxpool/9p8x-wasm-c1.c", - "src/f32-pavgpool/9p8x-wasm-c1.c", - "src/f32-pavgpool/9x-wasm-c1.c", - "src/f32-vbinary/gen/vadd-wasm-x1.c", - "src/f32-vbinary/gen/vadd-wasm-x2.c", - "src/f32-vbinary/gen/vadd-wasm-x4.c", - "src/f32-vbinary/gen/vaddc-wasm-x1.c", - "src/f32-vbinary/gen/vaddc-wasm-x2.c", - "src/f32-vbinary/gen/vaddc-wasm-x4.c", - "src/f32-vbinary/gen/vdiv-wasm-x1.c", - "src/f32-vbinary/gen/vdiv-wasm-x2.c", - "src/f32-vbinary/gen/vdiv-wasm-x4.c", - "src/f32-vbinary/gen/vdivc-wasm-x1.c", - "src/f32-vbinary/gen/vdivc-wasm-x2.c", - "src/f32-vbinary/gen/vdivc-wasm-x4.c", + 
"src/f32-igemm/gen/1x4-minmax-wasm.c", + "src/f32-igemm/gen/2x4-minmax-wasm.c", + "src/f32-igemm/gen/4x2-minmax-wasm.c", + "src/f32-igemm/gen/4x4-minmax-wasm.c", + "src/f32-maxpool/9p8x-minmax-wasm-c1.c", + "src/f32-pavgpool/9p8x-minmax-wasm-c1.c", + "src/f32-pavgpool/9x-minmax-wasm-c1.c", + "src/f32-vbinary/gen/vadd-minmax-wasm-x1.c", + "src/f32-vbinary/gen/vadd-minmax-wasm-x2.c", + "src/f32-vbinary/gen/vadd-minmax-wasm-x4.c", + "src/f32-vbinary/gen/vaddc-minmax-wasm-x1.c", + "src/f32-vbinary/gen/vaddc-minmax-wasm-x2.c", + "src/f32-vbinary/gen/vaddc-minmax-wasm-x4.c", + "src/f32-vbinary/gen/vdiv-minmax-wasm-x1.c", + "src/f32-vbinary/gen/vdiv-minmax-wasm-x2.c", + "src/f32-vbinary/gen/vdiv-minmax-wasm-x4.c", + "src/f32-vbinary/gen/vdivc-minmax-wasm-x1.c", + "src/f32-vbinary/gen/vdivc-minmax-wasm-x2.c", + "src/f32-vbinary/gen/vdivc-minmax-wasm-x4.c", "src/f32-vbinary/gen/vmax-wasm-x1.c", "src/f32-vbinary/gen/vmax-wasm-x2.c", "src/f32-vbinary/gen/vmax-wasm-x4.c", @@ -311,99 +351,99 @@ WASM_UKERNELS = [ "src/f32-vbinary/gen/vminc-wasm-x1.c", "src/f32-vbinary/gen/vminc-wasm-x2.c", "src/f32-vbinary/gen/vminc-wasm-x4.c", - "src/f32-vbinary/gen/vmul-wasm-x1.c", - "src/f32-vbinary/gen/vmul-wasm-x2.c", - "src/f32-vbinary/gen/vmul-wasm-x4.c", - "src/f32-vbinary/gen/vmulc-wasm-x1.c", - "src/f32-vbinary/gen/vmulc-wasm-x2.c", - "src/f32-vbinary/gen/vmulc-wasm-x4.c", - "src/f32-vbinary/gen/vrdivc-wasm-x1.c", - "src/f32-vbinary/gen/vrdivc-wasm-x2.c", - "src/f32-vbinary/gen/vrdivc-wasm-x4.c", - "src/f32-vbinary/gen/vrsubc-wasm-x1.c", - "src/f32-vbinary/gen/vrsubc-wasm-x2.c", - "src/f32-vbinary/gen/vrsubc-wasm-x4.c", - "src/f32-vbinary/gen/vsub-wasm-x1.c", - "src/f32-vbinary/gen/vsub-wasm-x2.c", - "src/f32-vbinary/gen/vsub-wasm-x4.c", - "src/f32-vbinary/gen/vsubc-wasm-x1.c", - "src/f32-vbinary/gen/vsubc-wasm-x2.c", - "src/f32-vbinary/gen/vsubc-wasm-x4.c", - "src/f32-vmulcaddc/gen/c1-wasm-2x.c", - "src/f32-vmulcaddc/gen/c2-wasm-2x.c", - "src/f32-vmulcaddc/gen/c4-wasm-2x.c", + 
"src/f32-vbinary/gen/vmul-minmax-wasm-x1.c", + "src/f32-vbinary/gen/vmul-minmax-wasm-x2.c", + "src/f32-vbinary/gen/vmul-minmax-wasm-x4.c", + "src/f32-vbinary/gen/vmulc-minmax-wasm-x1.c", + "src/f32-vbinary/gen/vmulc-minmax-wasm-x2.c", + "src/f32-vbinary/gen/vmulc-minmax-wasm-x4.c", + "src/f32-vbinary/gen/vrdivc-minmax-wasm-x1.c", + "src/f32-vbinary/gen/vrdivc-minmax-wasm-x2.c", + "src/f32-vbinary/gen/vrdivc-minmax-wasm-x4.c", + "src/f32-vbinary/gen/vrsubc-minmax-wasm-x1.c", + "src/f32-vbinary/gen/vrsubc-minmax-wasm-x2.c", + "src/f32-vbinary/gen/vrsubc-minmax-wasm-x4.c", + "src/f32-vbinary/gen/vsub-minmax-wasm-x1.c", + "src/f32-vbinary/gen/vsub-minmax-wasm-x2.c", + "src/f32-vbinary/gen/vsub-minmax-wasm-x4.c", + "src/f32-vbinary/gen/vsubc-minmax-wasm-x1.c", + "src/f32-vbinary/gen/vsubc-minmax-wasm-x2.c", + "src/f32-vbinary/gen/vsubc-minmax-wasm-x4.c", + "src/f32-vmulcaddc/gen/c1-minmax-wasm-2x.c", + "src/f32-vmulcaddc/gen/c2-minmax-wasm-2x.c", + "src/f32-vmulcaddc/gen/c4-minmax-wasm-2x.c", ] PSIMD_FASTMATH_UKERNELS = [ "src/f32-argmaxpool/4x-psimd-c4.c", "src/f32-argmaxpool/9p8x-psimd-c4.c", "src/f32-argmaxpool/9x-psimd-c4.c", - "src/f32-avgpool/9p8x-psimd-c4.c", - "src/f32-avgpool/9x-psimd-c4.c", + "src/f32-avgpool/9p8x-minmax-psimd-c4.c", + "src/f32-avgpool/9x-minmax-psimd-c4.c", "src/f32-clamp/gen/psimd-x4.c", "src/f32-clamp/gen/psimd-x8.c", - "src/f32-dwconv/gen/up4x25-psimd-acc2.c", - "src/f32-dwconv/gen/up4x25-psimd.c", - "src/f32-dwconv/gen/up4x4-psimd-acc2.c", - "src/f32-dwconv/gen/up4x4-psimd.c", - "src/f32-dwconv/gen/up4x9-psimd-acc2.c", - "src/f32-dwconv/gen/up4x9-psimd.c", - "src/f32-dwconv/gen/up8x25-psimd-acc2.c", - "src/f32-dwconv/gen/up8x25-psimd.c", - "src/f32-dwconv/gen/up8x4-psimd-acc2.c", - "src/f32-dwconv/gen/up8x4-psimd.c", - "src/f32-dwconv/gen/up8x9-psimd-acc2.c", - "src/f32-dwconv/gen/up8x9-psimd.c", - "src/f32-gavgpool/7p7x-psimd-c4.c", - "src/f32-gavgpool/7x-psimd-c4.c", - "src/f32-gemm/gen/1x8-psimd-loadsplat.c", - 
"src/f32-gemm/gen/1x8-psimd-splat.c", - "src/f32-gemm/gen/1x8s4-psimd.c", - "src/f32-gemm/gen/4x2c4-psimd.c", - "src/f32-gemm/gen/4x8-psimd-loadsplat.c", - "src/f32-gemm/gen/4x8-psimd-splat.c", - "src/f32-gemm/gen/4x8s4-psimd.c", - "src/f32-gemm/gen/6x8-psimd-loadsplat.c", - "src/f32-gemm/gen/6x8-psimd-splat.c", - "src/f32-gemm/gen/6x8s4-psimd.c", - "src/f32-gemm/gen-inc/1x8-psimd-loadsplat.c", - "src/f32-gemm/gen-inc/1x8-psimd-splat.c", - "src/f32-gemm/gen-inc/1x8s4-psimd.c", - "src/f32-gemm/gen-inc/4x8-psimd-loadsplat.c", - "src/f32-gemm/gen-inc/4x8-psimd-splat.c", - "src/f32-gemm/gen-inc/4x8s4-psimd.c", - "src/f32-gemm/gen-inc/6x8-psimd-loadsplat.c", - "src/f32-gemm/gen-inc/6x8-psimd-splat.c", - "src/f32-gemm/gen-inc/6x8s4-psimd.c", + "src/f32-dwconv/gen/up4x25-minmax-psimd-acc2.c", + "src/f32-dwconv/gen/up4x25-minmax-psimd.c", + "src/f32-dwconv/gen/up4x4-minmax-psimd-acc2.c", + "src/f32-dwconv/gen/up4x4-minmax-psimd.c", + "src/f32-dwconv/gen/up4x9-minmax-psimd-acc2.c", + "src/f32-dwconv/gen/up4x9-minmax-psimd.c", + "src/f32-dwconv/gen/up8x25-minmax-psimd-acc2.c", + "src/f32-dwconv/gen/up8x25-minmax-psimd.c", + "src/f32-dwconv/gen/up8x4-minmax-psimd-acc2.c", + "src/f32-dwconv/gen/up8x4-minmax-psimd.c", + "src/f32-dwconv/gen/up8x9-minmax-psimd-acc2.c", + "src/f32-dwconv/gen/up8x9-minmax-psimd.c", + "src/f32-gavgpool/7p7x-minmax-psimd-c4.c", + "src/f32-gavgpool/7x-minmax-psimd-c4.c", + "src/f32-gemm/gen/1x8-minmax-psimd-loadsplat.c", + "src/f32-gemm/gen/1x8-minmax-psimd-splat.c", + "src/f32-gemm/gen/1x8s4-minmax-psimd.c", + "src/f32-gemm/gen/4x2c4-minmax-psimd.c", + "src/f32-gemm/gen/4x8-minmax-psimd-loadsplat.c", + "src/f32-gemm/gen/4x8-minmax-psimd-splat.c", + "src/f32-gemm/gen/4x8s4-minmax-psimd.c", + "src/f32-gemm/gen/6x8-minmax-psimd-loadsplat.c", + "src/f32-gemm/gen/6x8-minmax-psimd-splat.c", + "src/f32-gemm/gen/6x8s4-minmax-psimd.c", + "src/f32-gemm/gen-inc/1x8inc-minmax-psimd-loadsplat.c", + "src/f32-gemm/gen-inc/1x8inc-minmax-psimd-splat.c", + 
"src/f32-gemm/gen-inc/1x8s4inc-minmax-psimd.c", + "src/f32-gemm/gen-inc/4x8inc-minmax-psimd-loadsplat.c", + "src/f32-gemm/gen-inc/4x8inc-minmax-psimd-splat.c", + "src/f32-gemm/gen-inc/4x8s4inc-minmax-psimd.c", + "src/f32-gemm/gen-inc/6x8inc-minmax-psimd-loadsplat.c", + "src/f32-gemm/gen-inc/6x8inc-minmax-psimd-splat.c", + "src/f32-gemm/gen-inc/6x8s4inc-minmax-psimd.c", "src/f32-hswish/gen/psimd-x4.c", "src/f32-hswish/gen/psimd-x8.c", "src/f32-ibilinear/gen/psimd-c4.c", "src/f32-ibilinear/gen/psimd-c8.c", - "src/f32-igemm/gen/1x8-psimd-loadsplat.c", - "src/f32-igemm/gen/1x8-psimd-splat.c", - "src/f32-igemm/gen/1x8s4-psimd.c", - "src/f32-igemm/gen/4x2c4-psimd.c", - "src/f32-igemm/gen/4x8-psimd-loadsplat.c", - "src/f32-igemm/gen/4x8-psimd-splat.c", - "src/f32-igemm/gen/4x8s4-psimd.c", - "src/f32-igemm/gen/6x8-psimd-loadsplat.c", - "src/f32-igemm/gen/6x8-psimd-splat.c", - "src/f32-igemm/gen/6x8s4-psimd.c", - "src/f32-maxpool/9p8x-psimd-c4.c", - "src/f32-pavgpool/9p8x-psimd-c4.c", - "src/f32-pavgpool/9x-psimd-c4.c", - "src/f32-ppmm/gen/4x8-psimd.c", + "src/f32-igemm/gen/1x8-minmax-psimd-loadsplat.c", + "src/f32-igemm/gen/1x8-minmax-psimd-splat.c", + "src/f32-igemm/gen/1x8s4-minmax-psimd.c", + "src/f32-igemm/gen/4x2c4-minmax-psimd.c", + "src/f32-igemm/gen/4x8-minmax-psimd-loadsplat.c", + "src/f32-igemm/gen/4x8-minmax-psimd-splat.c", + "src/f32-igemm/gen/4x8s4-minmax-psimd.c", + "src/f32-igemm/gen/6x8-minmax-psimd-loadsplat.c", + "src/f32-igemm/gen/6x8-minmax-psimd-splat.c", + "src/f32-igemm/gen/6x8s4-minmax-psimd.c", + "src/f32-maxpool/9p8x-minmax-psimd-c4.c", + "src/f32-pavgpool/9p8x-minmax-psimd-c4.c", + "src/f32-pavgpool/9x-minmax-psimd-c4.c", + "src/f32-ppmm/gen/4x8-minmax-psimd.c", "src/f32-prelu/gen/psimd-2x4.c", "src/f32-prelu/gen/psimd-2x8.c", "src/f32-rmax/psimd.c", - "src/f32-vbinary/gen/vadd-psimd-x4.c", - "src/f32-vbinary/gen/vadd-psimd-x8.c", - "src/f32-vbinary/gen/vaddc-psimd-x4.c", - "src/f32-vbinary/gen/vaddc-psimd-x8.c", - 
"src/f32-vbinary/gen/vdiv-psimd-x4.c", - "src/f32-vbinary/gen/vdiv-psimd-x8.c", - "src/f32-vbinary/gen/vdivc-psimd-x4.c", - "src/f32-vbinary/gen/vdivc-psimd-x8.c", + "src/f32-vbinary/gen/vadd-minmax-psimd-x4.c", + "src/f32-vbinary/gen/vadd-minmax-psimd-x8.c", + "src/f32-vbinary/gen/vaddc-minmax-psimd-x4.c", + "src/f32-vbinary/gen/vaddc-minmax-psimd-x8.c", + "src/f32-vbinary/gen/vdiv-minmax-psimd-x4.c", + "src/f32-vbinary/gen/vdiv-minmax-psimd-x8.c", + "src/f32-vbinary/gen/vdivc-minmax-psimd-x4.c", + "src/f32-vbinary/gen/vdivc-minmax-psimd-x8.c", "src/f32-vbinary/gen/vmax-psimd-x4.c", "src/f32-vbinary/gen/vmax-psimd-x8.c", "src/f32-vbinary/gen/vmaxc-psimd-x4.c", @@ -412,20 +452,20 @@ PSIMD_FASTMATH_UKERNELS = [ "src/f32-vbinary/gen/vmin-psimd-x8.c", "src/f32-vbinary/gen/vminc-psimd-x4.c", "src/f32-vbinary/gen/vminc-psimd-x8.c", - "src/f32-vbinary/gen/vmul-psimd-x4.c", - "src/f32-vbinary/gen/vmul-psimd-x8.c", - "src/f32-vbinary/gen/vmulc-psimd-x4.c", - "src/f32-vbinary/gen/vmulc-psimd-x8.c", - "src/f32-vbinary/gen/vrdivc-psimd-x4.c", - "src/f32-vbinary/gen/vrdivc-psimd-x8.c", - "src/f32-vbinary/gen/vrsubc-psimd-x4.c", - "src/f32-vbinary/gen/vrsubc-psimd-x8.c", - "src/f32-vbinary/gen/vsub-psimd-x4.c", - "src/f32-vbinary/gen/vsub-psimd-x8.c", - "src/f32-vbinary/gen/vsubc-psimd-x4.c", - "src/f32-vbinary/gen/vsubc-psimd-x8.c", - "src/f32-vmulcaddc/gen/c4-psimd-2x.c", - "src/f32-vmulcaddc/gen/c8-psimd-2x.c", + "src/f32-vbinary/gen/vmul-minmax-psimd-x4.c", + "src/f32-vbinary/gen/vmul-minmax-psimd-x8.c", + "src/f32-vbinary/gen/vmulc-minmax-psimd-x4.c", + "src/f32-vbinary/gen/vmulc-minmax-psimd-x8.c", + "src/f32-vbinary/gen/vrdivc-minmax-psimd-x4.c", + "src/f32-vbinary/gen/vrdivc-minmax-psimd-x8.c", + "src/f32-vbinary/gen/vrsubc-minmax-psimd-x4.c", + "src/f32-vbinary/gen/vrsubc-minmax-psimd-x8.c", + "src/f32-vbinary/gen/vsub-minmax-psimd-x4.c", + "src/f32-vbinary/gen/vsub-minmax-psimd-x8.c", + "src/f32-vbinary/gen/vsubc-minmax-psimd-x4.c", + 
"src/f32-vbinary/gen/vsubc-minmax-psimd-x8.c", + "src/f32-vmulcaddc/gen/c4-minmax-psimd-2x.c", + "src/f32-vmulcaddc/gen/c8-minmax-psimd-2x.c", "src/x32-packx/x4-psimd.c", "src/x32-pad/x2-psimd.c", "src/x32-unpool/psimd.c", @@ -461,73 +501,73 @@ PSIMD_ACCMATH_UKERNELS = [ # ISA-specific micro-kernels NEON_UKERNELS = [ - "src/f32-avgpool/9p8x-neon-c4.c", - "src/f32-avgpool/9x-neon-c4.c", + "src/f32-avgpool/9p8x-minmax-neon-c4.c", + "src/f32-avgpool/9x-minmax-neon-c4.c", "src/f32-clamp/gen/neon-x4.c", "src/f32-clamp/gen/neon-x8.c", - "src/f32-dwconv/gen/up4x9-neon.c", - "src/f32-dwconv/gen/up4x9-neon-acc2.c", - "src/f32-dwconv/gen/up8x9-neon.c", - "src/f32-dwconv/gen/up8x9-neon-acc2.c", + "src/f32-dwconv/gen/up4x9-minmax-neon.c", + "src/f32-dwconv/gen/up4x9-minmax-neon-acc2.c", + "src/f32-dwconv/gen/up8x9-minmax-neon.c", + "src/f32-dwconv/gen/up8x9-minmax-neon-acc2.c", "src/f32-gavgpool-spchw/neon-x4.c", - "src/f32-gavgpool/7p7x-neon-c4.c", - "src/f32-gavgpool/7x-neon-c4.c", - "src/f32-gemm/gen/1x8-neon-lane-ld64.c", - "src/f32-gemm/gen/4x2-neon-lane-ld64.c", - "src/f32-gemm/gen/4x8-neon-lane-ld128.c", - "src/f32-gemm/gen/4x8-neon-lane-ld64.c", - "src/f32-gemm/gen/5x8-neon-lane-ld64.c", - "src/f32-gemm/gen/6x8-neon-lane-ld64.c", - "src/f32-gemm/gen/6x8-neon-lane-ld128.c", - "src/f32-gemm/gen/1x8-neon-dup-ld64.c", - "src/f32-gemm/gen/4x8-neon-dup-ld128.c", - "src/f32-gemm/gen/4x8-neon-dup-ld64.c", - "src/f32-gemm/gen/6x8-neon-dup-ld64.c", - "src/f32-gemm/gen/6x8-neon-dup-ld128.c", - "src/f32-gemm/gen/1x8s4-neon.c", - "src/f32-gemm/gen/4x8s4-neon.c", - "src/f32-gemm/gen/6x8s4-neon.c", - "src/f32-gemm/gen/8x8s4-neon.c", - "src/f32-gemm/gen-inc/1x8-neon-lane-ld64.c", - "src/f32-gemm/gen-inc/4x8-neon-lane-ld128.c", - "src/f32-gemm/gen-inc/4x8-neon-lane-ld64.c", - "src/f32-gemm/gen-inc/5x8-neon-lane-ld64.c", - "src/f32-gemm/gen-inc/6x8-neon-lane-ld64.c", - "src/f32-gemm/gen-inc/6x8-neon-lane-ld128.c", - "src/f32-gemm/gen-inc/1x8-neon-dup-ld64.c", - 
"src/f32-gemm/gen-inc/4x8-neon-dup-ld128.c", - "src/f32-gemm/gen-inc/4x8-neon-dup-ld64.c", - "src/f32-gemm/gen-inc/6x8-neon-dup-ld64.c", - "src/f32-gemm/gen-inc/6x8-neon-dup-ld128.c", - "src/f32-gemm/gen-inc/1x8s4-neon.c", - "src/f32-gemm/gen-inc/4x8s4-neon.c", - "src/f32-gemm/gen-inc/6x8s4-neon.c", - "src/f32-gemm/gen-inc/8x8s4-neon.c", + "src/f32-gavgpool/7p7x-minmax-neon-c4.c", + "src/f32-gavgpool/7x-minmax-neon-c4.c", + "src/f32-gemm/gen/1x8-minmax-neon-lane-ld64.c", + "src/f32-gemm/gen/4x2-minmax-neon-lane-ld64.c", + "src/f32-gemm/gen/4x8-minmax-neon-lane-ld128.c", + "src/f32-gemm/gen/4x8-minmax-neon-lane-ld64.c", + "src/f32-gemm/gen/5x8-minmax-neon-lane-ld64.c", + "src/f32-gemm/gen/6x8-minmax-neon-lane-ld64.c", + "src/f32-gemm/gen/6x8-minmax-neon-lane-ld128.c", + "src/f32-gemm/gen/1x8-minmax-neon-dup-ld64.c", + "src/f32-gemm/gen/4x8-minmax-neon-dup-ld128.c", + "src/f32-gemm/gen/4x8-minmax-neon-dup-ld64.c", + "src/f32-gemm/gen/6x8-minmax-neon-dup-ld64.c", + "src/f32-gemm/gen/6x8-minmax-neon-dup-ld128.c", + "src/f32-gemm/gen/1x8s4-minmax-neon.c", + "src/f32-gemm/gen/4x8s4-minmax-neon.c", + "src/f32-gemm/gen/6x8s4-minmax-neon.c", + "src/f32-gemm/gen/8x8s4-minmax-neon.c", + "src/f32-gemm/gen-inc/1x8inc-minmax-neon-lane-ld64.c", + "src/f32-gemm/gen-inc/4x8inc-minmax-neon-lane-ld128.c", + "src/f32-gemm/gen-inc/4x8inc-minmax-neon-lane-ld64.c", + "src/f32-gemm/gen-inc/5x8inc-minmax-neon-lane-ld64.c", + "src/f32-gemm/gen-inc/6x8inc-minmax-neon-lane-ld64.c", + "src/f32-gemm/gen-inc/6x8inc-minmax-neon-lane-ld128.c", + "src/f32-gemm/gen-inc/1x8inc-minmax-neon-dup-ld64.c", + "src/f32-gemm/gen-inc/4x8inc-minmax-neon-dup-ld128.c", + "src/f32-gemm/gen-inc/4x8inc-minmax-neon-dup-ld64.c", + "src/f32-gemm/gen-inc/6x8inc-minmax-neon-dup-ld64.c", + "src/f32-gemm/gen-inc/6x8inc-minmax-neon-dup-ld128.c", + "src/f32-gemm/gen-inc/1x8s4inc-minmax-neon.c", + "src/f32-gemm/gen-inc/4x8s4inc-minmax-neon.c", + "src/f32-gemm/gen-inc/6x8s4inc-minmax-neon.c", + 
"src/f32-gemm/gen-inc/8x8s4inc-minmax-neon.c", "src/f32-hswish/gen/neon-x4.c", "src/f32-hswish/gen/neon-x8.c", "src/f32-ibilinear/gen/neon-c4.c", "src/f32-ibilinear/gen/neon-c8.c", - "src/f32-igemm/gen/1x8-neon-lane-ld64.c", - "src/f32-igemm/gen/4x2-neon-lane-ld64.c", - "src/f32-igemm/gen/4x4-neon-lane-ld64.c", - "src/f32-igemm/gen/4x8-neon-lane-ld128.c", - "src/f32-igemm/gen/4x8-neon-lane-ld64.c", - "src/f32-igemm/gen/6x8-neon-lane-ld64.c", - "src/f32-igemm/gen/6x8-neon-lane-ld128.c", - "src/f32-igemm/gen/1x8-neon-dup-ld64.c", - "src/f32-igemm/gen/4x8-neon-dup-ld128.c", - "src/f32-igemm/gen/4x8-neon-dup-ld64.c", - "src/f32-igemm/gen/6x8-neon-dup-ld64.c", - "src/f32-igemm/gen/6x8-neon-dup-ld128.c", - "src/f32-igemm/gen/1x8s4-neon.c", - "src/f32-igemm/gen/4x8s4-neon.c", - "src/f32-igemm/gen/6x8s4-neon.c", - "src/f32-igemm/gen/8x8s4-neon.c", - "src/f32-maxpool/9p8x-neon-c4.c", - "src/f32-pavgpool/9p8x-neon-c4.c", - "src/f32-pavgpool/9x-neon-c4.c", - "src/f32-ppmm/gen/4x8-neon.c", - "src/f32-ppmm/gen/8x8-neon.c", + "src/f32-igemm/gen/1x8-minmax-neon-lane-ld64.c", + "src/f32-igemm/gen/4x2-minmax-neon-lane-ld64.c", + "src/f32-igemm/gen/4x4-minmax-neon-lane-ld64.c", + "src/f32-igemm/gen/4x8-minmax-neon-lane-ld128.c", + "src/f32-igemm/gen/4x8-minmax-neon-lane-ld64.c", + "src/f32-igemm/gen/6x8-minmax-neon-lane-ld64.c", + "src/f32-igemm/gen/6x8-minmax-neon-lane-ld128.c", + "src/f32-igemm/gen/1x8-minmax-neon-dup-ld64.c", + "src/f32-igemm/gen/4x8-minmax-neon-dup-ld128.c", + "src/f32-igemm/gen/4x8-minmax-neon-dup-ld64.c", + "src/f32-igemm/gen/6x8-minmax-neon-dup-ld64.c", + "src/f32-igemm/gen/6x8-minmax-neon-dup-ld128.c", + "src/f32-igemm/gen/1x8s4-minmax-neon.c", + "src/f32-igemm/gen/4x8s4-minmax-neon.c", + "src/f32-igemm/gen/6x8s4-minmax-neon.c", + "src/f32-igemm/gen/8x8s4-minmax-neon.c", + "src/f32-maxpool/9p8x-minmax-neon-c4.c", + "src/f32-pavgpool/9p8x-minmax-neon-c4.c", + "src/f32-pavgpool/9x-minmax-neon-c4.c", + "src/f32-ppmm/gen/4x8-minmax-neon.c", + 
"src/f32-ppmm/gen/8x8-minmax-neon.c", "src/f32-prelu/gen/neon-2x4.c", "src/f32-prelu/gen/neon-2x8.c", "src/f32-raddstoreexpminusmax/gen/neon-p5-x4.c", @@ -574,10 +614,10 @@ NEON_UKERNELS = [ "src/f32-sigmoid/gen/neon-rr2-lut2048-p1-nr2recps-x16.c", "src/f32-sigmoid/gen/neon-rr2-lut2048-p1-nr2recps-x20.c", "src/f32-sigmoid/gen/neon-rr2-lut2048-p1-nr2recps-x24.c", - "src/f32-vbinary/gen/vadd-neon-x4.c", - "src/f32-vbinary/gen/vadd-neon-x8.c", - "src/f32-vbinary/gen/vaddc-neon-x4.c", - "src/f32-vbinary/gen/vaddc-neon-x8.c", + "src/f32-vbinary/gen/vadd-minmax-neon-x4.c", + "src/f32-vbinary/gen/vadd-minmax-neon-x8.c", + "src/f32-vbinary/gen/vaddc-minmax-neon-x4.c", + "src/f32-vbinary/gen/vaddc-minmax-neon-x8.c", "src/f32-vbinary/gen/vmax-neon-x4.c", "src/f32-vbinary/gen/vmax-neon-x8.c", "src/f32-vbinary/gen/vmaxc-neon-x4.c", @@ -586,30 +626,30 @@ NEON_UKERNELS = [ "src/f32-vbinary/gen/vmin-neon-x8.c", "src/f32-vbinary/gen/vminc-neon-x4.c", "src/f32-vbinary/gen/vminc-neon-x8.c", - "src/f32-vbinary/gen/vmul-neon-x4.c", - "src/f32-vbinary/gen/vmul-neon-x8.c", - "src/f32-vbinary/gen/vmulc-neon-x4.c", - "src/f32-vbinary/gen/vmulc-neon-x8.c", - "src/f32-vbinary/gen/vrsubc-neon-x4.c", - "src/f32-vbinary/gen/vrsubc-neon-x8.c", - "src/f32-vbinary/gen/vsub-neon-x4.c", - "src/f32-vbinary/gen/vsub-neon-x8.c", - "src/f32-vbinary/gen/vsubc-neon-x4.c", - "src/f32-vbinary/gen/vsubc-neon-x8.c", - "src/f32-vmulcaddc/gen/c4-neon-2x.c", - "src/f32-vmulcaddc/gen/c8-neon-2x.c", - "src/q8-avgpool/9p8x-neon-c8.c", - "src/q8-avgpool/9x-neon-c8.c", - "src/q8-dwconv/up8x9-neon.c", - "src/q8-gavgpool/7p7x-neon-c8.c", - "src/q8-gavgpool/7x-neon-c8.c", - "src/q8-gemm/4x8-neon.c", - "src/q8-gemm/8x8-neon.c", - "src/q8-igemm/4x8-neon.c", - "src/q8-igemm/8x8-neon.c", - "src/q8-vadd/neon.c", + "src/f32-vbinary/gen/vmul-minmax-neon-x4.c", + "src/f32-vbinary/gen/vmul-minmax-neon-x8.c", + "src/f32-vbinary/gen/vmulc-minmax-neon-x4.c", + "src/f32-vbinary/gen/vmulc-minmax-neon-x8.c", + 
"src/f32-vbinary/gen/vrsubc-minmax-neon-x4.c", + "src/f32-vbinary/gen/vrsubc-minmax-neon-x8.c", + "src/f32-vbinary/gen/vsub-minmax-neon-x4.c", + "src/f32-vbinary/gen/vsub-minmax-neon-x8.c", + "src/f32-vbinary/gen/vsubc-minmax-neon-x4.c", + "src/f32-vbinary/gen/vsubc-minmax-neon-x8.c", + "src/f32-vmulcaddc/gen/c4-minmax-neon-2x.c", + "src/f32-vmulcaddc/gen/c8-minmax-neon-2x.c", + "src/q8-avgpool/9p8x-minmax-neon-c8.c", + "src/q8-avgpool/9x-minmax-neon-c8.c", + "src/q8-dwconv/up8x9-minmax-neon.c", + "src/q8-gavgpool/7p7x-minmax-neon-c8.c", + "src/q8-gavgpool/7x-minmax-neon-c8.c", + "src/q8-gemm/4x8-minmax-neon.c", + "src/q8-gemm/8x8-minmax-neon.c", + "src/q8-igemm/4x8-minmax-neon.c", + "src/q8-igemm/8x8-minmax-neon.c", + "src/q8-vadd/minmax-neon.c", "src/u8-clamp/neon-x64.c", - "src/u8-maxpool/9p8x-neon-c16.c", + "src/u8-maxpool/9p8x-minmax-neon-c16.c", "src/u8-rmax/neon.c", "src/x32-packx/x4-neon-st4.c", "src/x32-pad/x2-neon.c", @@ -637,41 +677,41 @@ NEON_UKERNELS = [ NEONFMA_UKERNELS = [ "src/f32-ibilinear/gen/neonfma-c4.c", "src/f32-ibilinear/gen/neonfma-c8.c", - "src/f32-igemm/gen/1x8-neonfma-dup-ld64.c", - "src/f32-igemm/gen/4x8-neonfma-dup-ld128.c", - "src/f32-igemm/gen/4x8-neonfma-dup-ld64.c", - "src/f32-igemm/gen/6x8-neonfma-dup-ld64.c", - "src/f32-igemm/gen/6x8-neonfma-dup-ld128.c", - "src/f32-igemm/gen/1x8s4-neonfma.c", - "src/f32-igemm/gen/4x8s4-neonfma.c", - "src/f32-igemm/gen/6x8s4-neonfma.c", - "src/f32-igemm/gen/8x8s4-neonfma.c", - "src/f32-dwconv/gen/up4x9-neonfma.c", - "src/f32-dwconv/gen/up4x9-neonfma-acc2.c", - "src/f32-dwconv/gen/up8x9-neonfma.c", - "src/f32-dwconv/gen/up8x9-neonfma-acc2.c", - "src/f32-gemm/gen/1x8-neonfma-dup-ld64.c", - "src/f32-gemm/gen/4x8-neonfma-dup-ld128.c", - "src/f32-gemm/gen/4x8-neonfma-dup-ld64.c", - "src/f32-gemm/gen/6x8-neonfma-dup-ld64.c", - "src/f32-gemm/gen/6x8-neonfma-dup-ld128.c", - "src/f32-gemm/gen/1x8s4-neonfma.c", - "src/f32-gemm/gen/4x8s4-neonfma.c", - "src/f32-gemm/gen/6x8s4-neonfma.c", - 
"src/f32-gemm/gen/8x8s4-neonfma.c", - "src/f32-gemm/gen-inc/1x8-neonfma-dup-ld64.c", - "src/f32-gemm/gen-inc/4x8-neonfma-dup-ld128.c", - "src/f32-gemm/gen-inc/4x8-neonfma-dup-ld64.c", - "src/f32-gemm/gen-inc/6x8-neonfma-dup-ld64.c", - "src/f32-gemm/gen-inc/6x8-neonfma-dup-ld128.c", - "src/f32-gemm/gen-inc/1x8s4-neonfma.c", - "src/f32-gemm/gen-inc/4x8s4-neonfma.c", - "src/f32-gemm/gen-inc/6x8s4-neonfma.c", - "src/f32-gemm/gen-inc/8x8s4-neonfma.c", + "src/f32-igemm/gen/1x8-minmax-neonfma-dup-ld64.c", + "src/f32-igemm/gen/4x8-minmax-neonfma-dup-ld128.c", + "src/f32-igemm/gen/4x8-minmax-neonfma-dup-ld64.c", + "src/f32-igemm/gen/6x8-minmax-neonfma-dup-ld64.c", + "src/f32-igemm/gen/6x8-minmax-neonfma-dup-ld128.c", + "src/f32-igemm/gen/1x8s4-minmax-neonfma.c", + "src/f32-igemm/gen/4x8s4-minmax-neonfma.c", + "src/f32-igemm/gen/6x8s4-minmax-neonfma.c", + "src/f32-igemm/gen/8x8s4-minmax-neonfma.c", + "src/f32-dwconv/gen/up4x9-minmax-neonfma.c", + "src/f32-dwconv/gen/up4x9-minmax-neonfma-acc2.c", + "src/f32-dwconv/gen/up8x9-minmax-neonfma.c", + "src/f32-dwconv/gen/up8x9-minmax-neonfma-acc2.c", + "src/f32-gemm/gen/1x8-minmax-neonfma-dup-ld64.c", + "src/f32-gemm/gen/4x8-minmax-neonfma-dup-ld128.c", + "src/f32-gemm/gen/4x8-minmax-neonfma-dup-ld64.c", + "src/f32-gemm/gen/6x8-minmax-neonfma-dup-ld64.c", + "src/f32-gemm/gen/6x8-minmax-neonfma-dup-ld128.c", + "src/f32-gemm/gen/1x8s4-minmax-neonfma.c", + "src/f32-gemm/gen/4x8s4-minmax-neonfma.c", + "src/f32-gemm/gen/6x8s4-minmax-neonfma.c", + "src/f32-gemm/gen/8x8s4-minmax-neonfma.c", + "src/f32-gemm/gen-inc/1x8inc-minmax-neonfma-dup-ld64.c", + "src/f32-gemm/gen-inc/4x8inc-minmax-neonfma-dup-ld128.c", + "src/f32-gemm/gen-inc/4x8inc-minmax-neonfma-dup-ld64.c", + "src/f32-gemm/gen-inc/6x8inc-minmax-neonfma-dup-ld64.c", + "src/f32-gemm/gen-inc/6x8inc-minmax-neonfma-dup-ld128.c", + "src/f32-gemm/gen-inc/1x8s4inc-minmax-neonfma.c", + "src/f32-gemm/gen-inc/4x8s4inc-minmax-neonfma.c", + "src/f32-gemm/gen-inc/6x8s4inc-minmax-neonfma.c", + 
"src/f32-gemm/gen-inc/8x8s4inc-minmax-neonfma.c", "src/f32-hswish/gen/neonfma-x4.c", "src/f32-hswish/gen/neonfma-x8.c", - "src/f32-ppmm/gen/4x8-neonfma.c", - "src/f32-ppmm/gen/8x8-neonfma.c", + "src/f32-ppmm/gen/4x8-minmax-neonfma.c", + "src/f32-ppmm/gen/8x8-minmax-neonfma.c", "src/f32-raddstoreexpminusmax/gen/neonfma-p5-x4.c", "src/f32-raddstoreexpminusmax/gen/neonfma-p5-x8.c", "src/f32-raddstoreexpminusmax/gen/neonfma-p5-x8-acc2.c", @@ -750,8 +790,8 @@ NEONFMA_UKERNELS = [ "src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-nr2recps-x16.c", "src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-nr2recps-x20.c", "src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-nr2recps-x24.c", - "src/f32-vmulcaddc/gen/c4-neonfma-2x.c", - "src/f32-vmulcaddc/gen/c8-neonfma-2x.c", + "src/f32-vmulcaddc/gen/c4-minmax-neonfma-2x.c", + "src/f32-vmulcaddc/gen/c8-minmax-neonfma-2x.c", "src/math/exp-neonfma-lut64-p2.c", "src/math/exp-neonfma-p5.c", "src/math/expminus-neonfma-lut2048-p1.c", @@ -778,32 +818,32 @@ NEONFMA_UKERNELS = [ ] AARCH64_NEONFMA_UKERNELS = [ - "src/f32-vbinary/gen/vdiv-neon-x4.c", - "src/f32-vbinary/gen/vdiv-neon-x8.c", - "src/f32-vbinary/gen/vdivc-neon-x4.c", - "src/f32-vbinary/gen/vdivc-neon-x8.c", - "src/f32-vbinary/gen/vrdivc-neon-x4.c", - "src/f32-vbinary/gen/vrdivc-neon-x8.c", - "src/f32-gemm/gen/1x8-neonfma-lane-ld64.c", - "src/f32-gemm/gen/4x2-neonfma-lane-ld64.c", - "src/f32-gemm/gen/4x8-neonfma-lane-ld128.c", - "src/f32-gemm/gen/4x8-neonfma-lane-ld64.c", - "src/f32-gemm/gen/5x8-neonfma-lane-ld64.c", - "src/f32-gemm/gen/6x8-neonfma-lane-ld64.c", - "src/f32-gemm/gen/6x8-neonfma-lane-ld128.c", - "src/f32-gemm/gen-inc/1x8-neonfma-lane-ld64.c", - "src/f32-gemm/gen-inc/4x8-neonfma-lane-ld128.c", - "src/f32-gemm/gen-inc/4x8-neonfma-lane-ld64.c", - "src/f32-gemm/gen-inc/5x8-neonfma-lane-ld64.c", - "src/f32-gemm/gen-inc/6x8-neonfma-lane-ld64.c", - "src/f32-gemm/gen-inc/6x8-neonfma-lane-ld128.c", - "src/f32-igemm/gen/1x8-neonfma-lane-ld64.c", - "src/f32-igemm/gen/4x2-neonfma-lane-ld64.c", - 
"src/f32-igemm/gen/4x4-neonfma-lane-ld64.c", - "src/f32-igemm/gen/4x8-neonfma-lane-ld128.c", - "src/f32-igemm/gen/4x8-neonfma-lane-ld64.c", - "src/f32-igemm/gen/6x8-neonfma-lane-ld64.c", - "src/f32-igemm/gen/6x8-neonfma-lane-ld128.c", + "src/f32-vbinary/gen/vdiv-minmax-neon-x4.c", + "src/f32-vbinary/gen/vdiv-minmax-neon-x8.c", + "src/f32-vbinary/gen/vdivc-minmax-neon-x4.c", + "src/f32-vbinary/gen/vdivc-minmax-neon-x8.c", + "src/f32-vbinary/gen/vrdivc-minmax-neon-x4.c", + "src/f32-vbinary/gen/vrdivc-minmax-neon-x8.c", + "src/f32-gemm/gen/1x8-minmax-neonfma-lane-ld64.c", + "src/f32-gemm/gen/4x2-minmax-neonfma-lane-ld64.c", + "src/f32-gemm/gen/4x8-minmax-neonfma-lane-ld128.c", + "src/f32-gemm/gen/4x8-minmax-neonfma-lane-ld64.c", + "src/f32-gemm/gen/5x8-minmax-neonfma-lane-ld64.c", + "src/f32-gemm/gen/6x8-minmax-neonfma-lane-ld64.c", + "src/f32-gemm/gen/6x8-minmax-neonfma-lane-ld128.c", + "src/f32-gemm/gen-inc/1x8inc-minmax-neonfma-lane-ld64.c", + "src/f32-gemm/gen-inc/4x8inc-minmax-neonfma-lane-ld128.c", + "src/f32-gemm/gen-inc/4x8inc-minmax-neonfma-lane-ld64.c", + "src/f32-gemm/gen-inc/5x8inc-minmax-neonfma-lane-ld64.c", + "src/f32-gemm/gen-inc/6x8inc-minmax-neonfma-lane-ld64.c", + "src/f32-gemm/gen-inc/6x8inc-minmax-neonfma-lane-ld128.c", + "src/f32-igemm/gen/1x8-minmax-neonfma-lane-ld64.c", + "src/f32-igemm/gen/4x2-minmax-neonfma-lane-ld64.c", + "src/f32-igemm/gen/4x4-minmax-neonfma-lane-ld64.c", + "src/f32-igemm/gen/4x8-minmax-neonfma-lane-ld128.c", + "src/f32-igemm/gen/4x8-minmax-neonfma-lane-ld64.c", + "src/f32-igemm/gen/6x8-minmax-neonfma-lane-ld64.c", + "src/f32-igemm/gen/6x8-minmax-neonfma-lane-ld128.c", "src/f32-conv-hwc/3x3s2p1c3x4-neonfma-2x2.c", "src/f32-conv-hwc/3x3s2p1c3x8-neonfma-2x2.c", "src/f32-conv-hwc2spchw/3x3s2p1c3x4-neonfma-2x2.c", @@ -829,24 +869,24 @@ AARCH64_NEONFMA_UKERNELS = [ "src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-div-x16.c", "src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-div-x20.c", 
"src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-div-x24.c", - "src/f32-spmm/gen/12x1-neonfma.c", - "src/f32-spmm/gen/12x2-neonfma.c", - "src/f32-spmm/gen/12x4-neonfma.c", - "src/f32-spmm/gen/16x1-neonfma-pipelined.c", - "src/f32-spmm/gen/16x1-neonfma-unroll2.c", - "src/f32-spmm/gen/16x1-neonfma.c", - "src/f32-spmm/gen/16x2-neonfma.c", - "src/f32-spmm/gen/16x4-neonfma.c", - "src/f32-spmm/gen/4x1-neonfma-pipelined.c", - "src/f32-spmm/gen/4x1-neonfma-unroll2.c", - "src/f32-spmm/gen/4x1-neonfma.c", - "src/f32-spmm/gen/4x2-neonfma.c", - "src/f32-spmm/gen/4x4-neonfma.c", - "src/f32-spmm/gen/8x1-neonfma-pipelined.c", - "src/f32-spmm/gen/8x1-neonfma-unroll2.c", - "src/f32-spmm/gen/8x1-neonfma.c", - "src/f32-spmm/gen/8x2-neonfma.c", - "src/f32-spmm/gen/8x4-neonfma.c", + "src/f32-spmm/gen/12x1-minmax-neonfma.c", + "src/f32-spmm/gen/12x2-minmax-neonfma.c", + "src/f32-spmm/gen/12x4-minmax-neonfma.c", + "src/f32-spmm/gen/16x1-minmax-neonfma-pipelined.c", + "src/f32-spmm/gen/16x1-minmax-neonfma-unroll2.c", + "src/f32-spmm/gen/16x1-minmax-neonfma.c", + "src/f32-spmm/gen/16x2-minmax-neonfma.c", + "src/f32-spmm/gen/16x4-minmax-neonfma.c", + "src/f32-spmm/gen/4x1-minmax-neonfma-pipelined.c", + "src/f32-spmm/gen/4x1-minmax-neonfma-unroll2.c", + "src/f32-spmm/gen/4x1-minmax-neonfma.c", + "src/f32-spmm/gen/4x2-minmax-neonfma.c", + "src/f32-spmm/gen/4x4-minmax-neonfma.c", + "src/f32-spmm/gen/8x1-minmax-neonfma-pipelined.c", + "src/f32-spmm/gen/8x1-minmax-neonfma-unroll2.c", + "src/f32-spmm/gen/8x1-minmax-neonfma.c", + "src/f32-spmm/gen/8x2-minmax-neonfma.c", + "src/f32-spmm/gen/8x4-minmax-neonfma.c", "src/math/sigmoid-neonfma-rr1-lut2048-p1-div.c", "src/math/sigmoid-neonfma-rr1-lut64-p2-div.c", "src/math/sigmoid-neonfma-rr1-p5-div.c", @@ -859,77 +899,77 @@ AARCH64_NEONFP16ARITH_UKERNELS = [ "src/f16-gemm/gen/4x8-neonfp16arith-ld64.c", "src/f16-gemm/gen/6x8-neonfp16arith-ld64.c", "src/f16-gemm/gen/8x8-neonfp16arith-ld64.c", - "src/f16-spmm/gen/8x1-neonfp16arith.c", - 
"src/f16-spmm/gen/8x1-neonfp16arith-unroll2.c", - "src/f16-spmm/gen/16x1-neonfp16arith.c", - "src/f16-spmm/gen/16x1-neonfp16arith-unroll2.c", - "src/f16-spmm/gen/24x1-neonfp16arith.c", - "src/f16-spmm/gen/24x1-neonfp16arith-unroll2.c", - "src/f16-spmm/gen/32x1-neonfp16arith.c", - "src/f16-spmm/gen/32x1-neonfp16arith-unroll2.c", + "src/f16-spmm/gen/8x1-minmax-neonfp16arith.c", + "src/f16-spmm/gen/8x1-minmax-neonfp16arith-unroll2.c", + "src/f16-spmm/gen/16x1-minmax-neonfp16arith.c", + "src/f16-spmm/gen/16x1-minmax-neonfp16arith-unroll2.c", + "src/f16-spmm/gen/24x1-minmax-neonfp16arith.c", + "src/f16-spmm/gen/24x1-minmax-neonfp16arith-unroll2.c", + "src/f16-spmm/gen/32x1-minmax-neonfp16arith.c", + "src/f16-spmm/gen/32x1-minmax-neonfp16arith-unroll2.c", ] SSE_UKERNELS = [ - "src/f32-avgpool/9p8x-sse-c4.c", - "src/f32-avgpool/9x-sse-c4.c", + "src/f32-avgpool/9p8x-minmax-sse-c4.c", + "src/f32-avgpool/9x-minmax-sse-c4.c", "src/f32-clamp/gen/sse-x4.c", "src/f32-clamp/gen/sse-x8.c", "src/f32-dwconv-spchw/3x3p1-sse.c", "src/f32-dwconv-spchw/3x3s2p1-sse.c", - "src/f32-dwconv/gen/up4x25-sse-acc2.c", - "src/f32-dwconv/gen/up4x25-sse.c", - "src/f32-dwconv/gen/up4x4-sse-acc2.c", - "src/f32-dwconv/gen/up4x4-sse.c", - "src/f32-dwconv/gen/up4x9-sse-acc2.c", - "src/f32-dwconv/gen/up4x9-sse.c", - "src/f32-dwconv/gen/up8x25-sse-acc2.c", - "src/f32-dwconv/gen/up8x25-sse.c", - "src/f32-dwconv/gen/up8x4-sse-acc2.c", - "src/f32-dwconv/gen/up8x4-sse.c", - "src/f32-dwconv/gen/up8x9-sse-acc2.c", - "src/f32-dwconv/gen/up8x9-sse.c", + "src/f32-dwconv/gen/up4x25-minmax-sse-acc2.c", + "src/f32-dwconv/gen/up4x25-minmax-sse.c", + "src/f32-dwconv/gen/up4x4-minmax-sse-acc2.c", + "src/f32-dwconv/gen/up4x4-minmax-sse.c", + "src/f32-dwconv/gen/up4x9-minmax-sse-acc2.c", + "src/f32-dwconv/gen/up4x9-minmax-sse.c", + "src/f32-dwconv/gen/up8x25-minmax-sse-acc2.c", + "src/f32-dwconv/gen/up8x25-minmax-sse.c", + "src/f32-dwconv/gen/up8x4-minmax-sse-acc2.c", + "src/f32-dwconv/gen/up8x4-minmax-sse.c", + 
"src/f32-dwconv/gen/up8x9-minmax-sse-acc2.c", + "src/f32-dwconv/gen/up8x9-minmax-sse.c", "src/f32-gavgpool-spchw/sse-x4.c", - "src/f32-gavgpool/7p7x-sse-c4.c", - "src/f32-gavgpool/7x-sse-c4.c", - "src/f32-gemm/gen/1x8-sse-dup.c", - "src/f32-gemm/gen/1x8-sse-load1.c", - "src/f32-gemm/gen/1x8s4-sse.c", - "src/f32-gemm/gen/4x2c4-sse.c", - "src/f32-gemm/gen/4x8-sse-dup.c", - "src/f32-gemm/gen/4x8-sse-load1.c", - "src/f32-gemm/gen/4x8s4-sse.c", - "src/f32-gemm/gen-inc/1x8-sse-dup.c", - "src/f32-gemm/gen-inc/1x8-sse-load1.c", - "src/f32-gemm/gen-inc/1x8s4-sse.c", - "src/f32-gemm/gen-inc/4x8-sse-dup.c", - "src/f32-gemm/gen-inc/4x8-sse-load1.c", - "src/f32-gemm/gen-inc/4x8s4-sse.c", + "src/f32-gavgpool/7p7x-minmax-sse-c4.c", + "src/f32-gavgpool/7x-minmax-sse-c4.c", + "src/f32-gemm/gen/1x8-minmax-sse-dup.c", + "src/f32-gemm/gen/1x8-minmax-sse-load1.c", + "src/f32-gemm/gen/1x8s4-minmax-sse.c", + "src/f32-gemm/gen/4x2c4-minmax-sse.c", + "src/f32-gemm/gen/4x8-minmax-sse-dup.c", + "src/f32-gemm/gen/4x8-minmax-sse-load1.c", + "src/f32-gemm/gen/4x8s4-minmax-sse.c", + "src/f32-gemm/gen-inc/1x8inc-minmax-sse-dup.c", + "src/f32-gemm/gen-inc/1x8inc-minmax-sse-load1.c", + "src/f32-gemm/gen-inc/1x8s4inc-minmax-sse.c", + "src/f32-gemm/gen-inc/4x8inc-minmax-sse-dup.c", + "src/f32-gemm/gen-inc/4x8inc-minmax-sse-load1.c", + "src/f32-gemm/gen-inc/4x8s4inc-minmax-sse.c", "src/f32-hswish/gen/sse-x4.c", "src/f32-hswish/gen/sse-x8.c", "src/f32-ibilinear/gen/sse-c4.c", "src/f32-ibilinear/gen/sse-c8.c", - "src/f32-igemm/gen/1x8-sse-dup.c", - "src/f32-igemm/gen/1x8-sse-load1.c", - "src/f32-igemm/gen/1x8s4-sse.c", - "src/f32-igemm/gen/4x2c4-sse.c", - "src/f32-igemm/gen/4x8-sse-dup.c", - "src/f32-igemm/gen/4x8-sse-load1.c", - "src/f32-igemm/gen/4x8s4-sse.c", - "src/f32-maxpool/9p8x-sse-c4.c", - "src/f32-pavgpool/9p8x-sse-c4.c", - "src/f32-pavgpool/9x-sse-c4.c", - "src/f32-ppmm/gen/4x8-sse.c", + "src/f32-igemm/gen/1x8-minmax-sse-dup.c", + "src/f32-igemm/gen/1x8-minmax-sse-load1.c", + 
"src/f32-igemm/gen/1x8s4-minmax-sse.c", + "src/f32-igemm/gen/4x2c4-minmax-sse.c", + "src/f32-igemm/gen/4x8-minmax-sse-dup.c", + "src/f32-igemm/gen/4x8-minmax-sse-load1.c", + "src/f32-igemm/gen/4x8s4-minmax-sse.c", + "src/f32-maxpool/9p8x-minmax-sse-c4.c", + "src/f32-pavgpool/9p8x-minmax-sse-c4.c", + "src/f32-pavgpool/9x-minmax-sse-c4.c", + "src/f32-ppmm/gen/4x8-minmax-sse.c", "src/f32-rmax/sse.c", - "src/f32-spmm/gen/4x1-sse.c", - "src/f32-spmm/gen/8x1-sse.c", - "src/f32-vbinary/gen/vadd-sse-x4.c", - "src/f32-vbinary/gen/vadd-sse-x8.c", - "src/f32-vbinary/gen/vaddc-sse-x4.c", - "src/f32-vbinary/gen/vaddc-sse-x8.c", - "src/f32-vbinary/gen/vdiv-sse-x4.c", - "src/f32-vbinary/gen/vdiv-sse-x8.c", - "src/f32-vbinary/gen/vdivc-sse-x4.c", - "src/f32-vbinary/gen/vdivc-sse-x8.c", + "src/f32-spmm/gen/4x1-minmax-sse.c", + "src/f32-spmm/gen/8x1-minmax-sse.c", + "src/f32-vbinary/gen/vadd-minmax-sse-x4.c", + "src/f32-vbinary/gen/vadd-minmax-sse-x8.c", + "src/f32-vbinary/gen/vaddc-minmax-sse-x4.c", + "src/f32-vbinary/gen/vaddc-minmax-sse-x8.c", + "src/f32-vbinary/gen/vdiv-minmax-sse-x4.c", + "src/f32-vbinary/gen/vdiv-minmax-sse-x8.c", + "src/f32-vbinary/gen/vdivc-minmax-sse-x4.c", + "src/f32-vbinary/gen/vdivc-minmax-sse-x8.c", "src/f32-vbinary/gen/vmax-sse-x4.c", "src/f32-vbinary/gen/vmax-sse-x8.c", "src/f32-vbinary/gen/vmaxc-sse-x4.c", @@ -938,20 +978,20 @@ SSE_UKERNELS = [ "src/f32-vbinary/gen/vmin-sse-x8.c", "src/f32-vbinary/gen/vminc-sse-x4.c", "src/f32-vbinary/gen/vminc-sse-x8.c", - "src/f32-vbinary/gen/vmul-sse-x4.c", - "src/f32-vbinary/gen/vmul-sse-x8.c", - "src/f32-vbinary/gen/vmulc-sse-x4.c", - "src/f32-vbinary/gen/vmulc-sse-x8.c", - "src/f32-vbinary/gen/vrdivc-sse-x4.c", - "src/f32-vbinary/gen/vrdivc-sse-x8.c", - "src/f32-vbinary/gen/vrsubc-sse-x4.c", - "src/f32-vbinary/gen/vrsubc-sse-x8.c", - "src/f32-vbinary/gen/vsub-sse-x4.c", - "src/f32-vbinary/gen/vsub-sse-x8.c", - "src/f32-vbinary/gen/vsubc-sse-x4.c", - "src/f32-vbinary/gen/vsubc-sse-x8.c", - 
"src/f32-vmulcaddc/gen/c4-sse-2x.c", - "src/f32-vmulcaddc/gen/c8-sse-2x.c", + "src/f32-vbinary/gen/vmul-minmax-sse-x4.c", + "src/f32-vbinary/gen/vmul-minmax-sse-x8.c", + "src/f32-vbinary/gen/vmulc-minmax-sse-x4.c", + "src/f32-vbinary/gen/vmulc-minmax-sse-x8.c", + "src/f32-vbinary/gen/vrdivc-minmax-sse-x4.c", + "src/f32-vbinary/gen/vrdivc-minmax-sse-x8.c", + "src/f32-vbinary/gen/vrsubc-minmax-sse-x4.c", + "src/f32-vbinary/gen/vrsubc-minmax-sse-x8.c", + "src/f32-vbinary/gen/vsub-minmax-sse-x4.c", + "src/f32-vbinary/gen/vsub-minmax-sse-x8.c", + "src/f32-vbinary/gen/vsubc-minmax-sse-x4.c", + "src/f32-vbinary/gen/vsubc-minmax-sse-x8.c", + "src/f32-vmulcaddc/gen/c4-minmax-sse-2x.c", + "src/f32-vmulcaddc/gen/c8-minmax-sse-2x.c", "src/x32-packx/x4-sse.c", ] @@ -979,17 +1019,17 @@ SSE2_UKERNELS = [ "src/f32-sigmoid/gen/sse2-p5-div-x16.c", "src/f32-sigmoid/gen/sse2-p5-div-x20.c", "src/f32-sigmoid/gen/sse2-p5-div-x24.c", - "src/q8-avgpool/9p8x-sse2-c8.c", - "src/q8-avgpool/9x-sse2-c8.c", - "src/q8-igemm/4x4c2-sse2.c", - "src/q8-dwconv/up8x9-sse2.c", - "src/q8-gavgpool/7p7x-sse2-c8.c", - "src/q8-gavgpool/7x-sse2-c8.c", - "src/q8-gemm/2x4c8-sse2.c", - "src/q8-gemm/4x4c2-sse2.c", - "src/q8-vadd/sse2.c", + "src/q8-avgpool/9p8x-minmax-sse2-c8.c", + "src/q8-avgpool/9x-minmax-sse2-c8.c", + "src/q8-igemm/4x4c2-minmax-sse2.c", + "src/q8-dwconv/up8x9-minmax-sse2.c", + "src/q8-gavgpool/7p7x-minmax-sse2-c8.c", + "src/q8-gavgpool/7x-minmax-sse2-c8.c", + "src/q8-gemm/2x4c8-minmax-sse2.c", + "src/q8-gemm/4x4c2-minmax-sse2.c", + "src/q8-vadd/minmax-sse2.c", "src/u8-clamp/sse2-x64.c", - "src/u8-maxpool/9p8x-sse2-c16.c", + "src/u8-maxpool/9p8x-minmax-sse2-c16.c", "src/u8-rmax/sse2.c", "src/x32-pad/x2-sse2.c", "src/x32-zip/x2-sse2.c", @@ -1032,58 +1072,58 @@ SSE41_UKERNELS = [ AVX_UKERNELS = [ "src/f32-clamp/gen/avx-x8.c", "src/f32-clamp/gen/avx-x16.c", - "src/f32-dwconv/gen/up16x4-avx-acc2.c", - "src/f32-dwconv/gen/up16x4-avx.c", - "src/f32-dwconv/gen/up8x4-avx-acc2.c", - 
"src/f32-dwconv/gen/up8x4-avx.c", - "src/f32-dwconv/gen/up16x9-avx-acc2.c", - "src/f32-dwconv/gen/up16x9-avx.c", - "src/f32-dwconv/gen/up8x9-avx-acc2.c", - "src/f32-dwconv/gen/up8x9-avx.c", - "src/f32-dwconv/gen/up16x25-avx-acc2.c", - "src/f32-dwconv/gen/up16x25-avx.c", - "src/f32-dwconv/gen/up8x25-avx-acc2.c", - "src/f32-dwconv/gen/up8x25-avx.c", - "src/f32-gemm/gen/1x8-avx-broadcast.c", - "src/f32-gemm/gen/4x8-avx-broadcast.c", - "src/f32-gemm/gen/5x8-avx-broadcast.c", - "src/f32-gemm/gen/6x8-avx-broadcast.c", - "src/f32-gemm/gen/7x8-avx-broadcast.c", - "src/f32-gemm/gen/1x16-avx-broadcast.c", - "src/f32-gemm/gen/3x16-avx-broadcast.c", - "src/f32-gemm/gen/4x16-avx-broadcast.c", - "src/f32-gemm/gen/5x16-avx-broadcast.c", - "src/f32-gemm/gen-inc/1x8-avx-broadcast.c", - "src/f32-gemm/gen-inc/4x8-avx-broadcast.c", - "src/f32-gemm/gen-inc/5x8-avx-broadcast.c", - "src/f32-gemm/gen-inc/6x8-avx-broadcast.c", - "src/f32-gemm/gen-inc/7x8-avx-broadcast.c", - "src/f32-gemm/gen-inc/1x16-avx-broadcast.c", - "src/f32-gemm/gen-inc/3x16-avx-broadcast.c", - "src/f32-gemm/gen-inc/4x16-avx-broadcast.c", - "src/f32-gemm/gen-inc/5x16-avx-broadcast.c", + "src/f32-dwconv/gen/up16x4-minmax-avx-acc2.c", + "src/f32-dwconv/gen/up16x4-minmax-avx.c", + "src/f32-dwconv/gen/up8x4-minmax-avx-acc2.c", + "src/f32-dwconv/gen/up8x4-minmax-avx.c", + "src/f32-dwconv/gen/up16x9-minmax-avx-acc2.c", + "src/f32-dwconv/gen/up16x9-minmax-avx.c", + "src/f32-dwconv/gen/up8x9-minmax-avx-acc2.c", + "src/f32-dwconv/gen/up8x9-minmax-avx.c", + "src/f32-dwconv/gen/up16x25-minmax-avx-acc2.c", + "src/f32-dwconv/gen/up16x25-minmax-avx.c", + "src/f32-dwconv/gen/up8x25-minmax-avx-acc2.c", + "src/f32-dwconv/gen/up8x25-minmax-avx.c", + "src/f32-gemm/gen/1x8-minmax-avx-broadcast.c", + "src/f32-gemm/gen/4x8-minmax-avx-broadcast.c", + "src/f32-gemm/gen/5x8-minmax-avx-broadcast.c", + "src/f32-gemm/gen/6x8-minmax-avx-broadcast.c", + "src/f32-gemm/gen/7x8-minmax-avx-broadcast.c", + 
"src/f32-gemm/gen/1x16-minmax-avx-broadcast.c", + "src/f32-gemm/gen/3x16-minmax-avx-broadcast.c", + "src/f32-gemm/gen/4x16-minmax-avx-broadcast.c", + "src/f32-gemm/gen/5x16-minmax-avx-broadcast.c", + "src/f32-gemm/gen-inc/1x8inc-minmax-avx-broadcast.c", + "src/f32-gemm/gen-inc/4x8inc-minmax-avx-broadcast.c", + "src/f32-gemm/gen-inc/5x8inc-minmax-avx-broadcast.c", + "src/f32-gemm/gen-inc/6x8inc-minmax-avx-broadcast.c", + "src/f32-gemm/gen-inc/7x8inc-minmax-avx-broadcast.c", + "src/f32-gemm/gen-inc/1x16inc-minmax-avx-broadcast.c", + "src/f32-gemm/gen-inc/3x16inc-minmax-avx-broadcast.c", + "src/f32-gemm/gen-inc/4x16inc-minmax-avx-broadcast.c", + "src/f32-gemm/gen-inc/5x16inc-minmax-avx-broadcast.c", "src/f32-hswish/gen/avx-x8.c", "src/f32-hswish/gen/avx-x16.c", - "src/f32-igemm/gen/1x8-avx-broadcast.c", - "src/f32-igemm/gen/4x8-avx-broadcast.c", - "src/f32-igemm/gen/5x8-avx-broadcast.c", - "src/f32-igemm/gen/6x8-avx-broadcast.c", - "src/f32-igemm/gen/7x8-avx-broadcast.c", - "src/f32-igemm/gen/1x16-avx-broadcast.c", - "src/f32-igemm/gen/3x16-avx-broadcast.c", - "src/f32-igemm/gen/4x16-avx-broadcast.c", - "src/f32-igemm/gen/5x16-avx-broadcast.c", + "src/f32-igemm/gen/1x8-minmax-avx-broadcast.c", + "src/f32-igemm/gen/4x8-minmax-avx-broadcast.c", + "src/f32-igemm/gen/5x8-minmax-avx-broadcast.c", + "src/f32-igemm/gen/6x8-minmax-avx-broadcast.c", + "src/f32-igemm/gen/7x8-minmax-avx-broadcast.c", + "src/f32-igemm/gen/1x16-minmax-avx-broadcast.c", + "src/f32-igemm/gen/3x16-minmax-avx-broadcast.c", + "src/f32-igemm/gen/4x16-minmax-avx-broadcast.c", + "src/f32-igemm/gen/5x16-minmax-avx-broadcast.c", "src/f32-prelu/gen/avx-2x8.c", "src/f32-prelu/gen/avx-2x16.c", "src/f32-rmax/avx.c", - "src/f32-vbinary/gen/vadd-avx-x8.c", - "src/f32-vbinary/gen/vadd-avx-x16.c", - "src/f32-vbinary/gen/vaddc-avx-x8.c", - "src/f32-vbinary/gen/vaddc-avx-x16.c", - "src/f32-vbinary/gen/vdiv-avx-x8.c", - "src/f32-vbinary/gen/vdiv-avx-x16.c", - "src/f32-vbinary/gen/vdivc-avx-x8.c", - 
"src/f32-vbinary/gen/vdivc-avx-x16.c", + "src/f32-vbinary/gen/vadd-minmax-avx-x8.c", + "src/f32-vbinary/gen/vadd-minmax-avx-x16.c", + "src/f32-vbinary/gen/vaddc-minmax-avx-x8.c", + "src/f32-vbinary/gen/vaddc-minmax-avx-x16.c", + "src/f32-vbinary/gen/vdiv-minmax-avx-x8.c", + "src/f32-vbinary/gen/vdiv-minmax-avx-x16.c", + "src/f32-vbinary/gen/vdivc-minmax-avx-x8.c", + "src/f32-vbinary/gen/vdivc-minmax-avx-x16.c", "src/f32-vbinary/gen/vmax-avx-x8.c", "src/f32-vbinary/gen/vmax-avx-x16.c", "src/f32-vbinary/gen/vmaxc-avx-x8.c", @@ -1092,78 +1132,78 @@ AVX_UKERNELS = [ "src/f32-vbinary/gen/vmin-avx-x16.c", "src/f32-vbinary/gen/vminc-avx-x8.c", "src/f32-vbinary/gen/vminc-avx-x16.c", - "src/f32-vbinary/gen/vmul-avx-x8.c", - "src/f32-vbinary/gen/vmul-avx-x16.c", - "src/f32-vbinary/gen/vmulc-avx-x8.c", - "src/f32-vbinary/gen/vmulc-avx-x16.c", - "src/f32-vbinary/gen/vrdivc-avx-x8.c", - "src/f32-vbinary/gen/vrdivc-avx-x16.c", - "src/f32-vbinary/gen/vrsubc-avx-x8.c", - "src/f32-vbinary/gen/vrsubc-avx-x16.c", - "src/f32-vbinary/gen/vsub-avx-x8.c", - "src/f32-vbinary/gen/vsub-avx-x16.c", - "src/f32-vbinary/gen/vsubc-avx-x8.c", - "src/f32-vbinary/gen/vsubc-avx-x16.c", + "src/f32-vbinary/gen/vmul-minmax-avx-x8.c", + "src/f32-vbinary/gen/vmul-minmax-avx-x16.c", + "src/f32-vbinary/gen/vmulc-minmax-avx-x8.c", + "src/f32-vbinary/gen/vmulc-minmax-avx-x16.c", + "src/f32-vbinary/gen/vrdivc-minmax-avx-x8.c", + "src/f32-vbinary/gen/vrdivc-minmax-avx-x16.c", + "src/f32-vbinary/gen/vrsubc-minmax-avx-x8.c", + "src/f32-vbinary/gen/vrsubc-minmax-avx-x16.c", + "src/f32-vbinary/gen/vsub-minmax-avx-x8.c", + "src/f32-vbinary/gen/vsub-minmax-avx-x16.c", + "src/f32-vbinary/gen/vsubc-minmax-avx-x8.c", + "src/f32-vbinary/gen/vsubc-minmax-avx-x16.c", "src/f32-vscale/avx-unroll32.c", ] FMA3_UKERNELS = [ - "src/f32-dwconv/gen/up16x4-fma3-acc2.c", - "src/f32-dwconv/gen/up16x4-fma3.c", - "src/f32-dwconv/gen/up8x4-fma3-acc2.c", - "src/f32-dwconv/gen/up8x4-fma3.c", - "src/f32-dwconv/gen/up16x9-fma3-acc2.c", - 
"src/f32-dwconv/gen/up16x9-fma3.c", - "src/f32-dwconv/gen/up8x9-fma3-acc2.c", - "src/f32-dwconv/gen/up8x9-fma3.c", - "src/f32-dwconv/gen/up16x25-fma3-acc2.c", - "src/f32-dwconv/gen/up16x25-fma3.c", - "src/f32-dwconv/gen/up8x25-fma3-acc2.c", - "src/f32-dwconv/gen/up8x25-fma3.c", - "src/f32-gemm/gen/1x8-fma3-broadcast.c", - "src/f32-gemm/gen/4x8-fma3-broadcast.c", - "src/f32-gemm/gen/5x8-fma3-broadcast.c", - "src/f32-gemm/gen/6x8-fma3-broadcast.c", - "src/f32-gemm/gen/7x8-fma3-broadcast.c", - "src/f32-gemm/gen/8x8-fma3-broadcast.c", - "src/f32-gemm/gen/1x16-fma3-broadcast.c", - "src/f32-gemm/gen/3x16-fma3-broadcast.c", - "src/f32-gemm/gen/4x16-fma3-broadcast.c", - "src/f32-gemm/gen/5x16-fma3-broadcast.c", - "src/f32-gemm/gen/1x16s4-fma3-broadcast.c", - "src/f32-gemm/gen/3x16s4-fma3-broadcast.c", - "src/f32-gemm/gen/4x16s4-fma3-broadcast.c", - "src/f32-gemm/gen/5x16s4-fma3-broadcast.c", - "src/f32-gemm/gen-inc/1x8-fma3-broadcast.c", - "src/f32-gemm/gen-inc/4x8-fma3-broadcast.c", - "src/f32-gemm/gen-inc/5x8-fma3-broadcast.c", - "src/f32-gemm/gen-inc/6x8-fma3-broadcast.c", - "src/f32-gemm/gen-inc/7x8-fma3-broadcast.c", - "src/f32-gemm/gen-inc/8x8-fma3-broadcast.c", - "src/f32-gemm/gen-inc/1x16-fma3-broadcast.c", - "src/f32-gemm/gen-inc/3x16-fma3-broadcast.c", - "src/f32-gemm/gen-inc/4x16-fma3-broadcast.c", - "src/f32-gemm/gen-inc/5x16-fma3-broadcast.c", - "src/f32-gemm/gen-inc/1x16s4-fma3-broadcast.c", - "src/f32-gemm/gen-inc/3x16s4-fma3-broadcast.c", - "src/f32-gemm/gen-inc/4x16s4-fma3-broadcast.c", - "src/f32-gemm/gen-inc/5x16s4-fma3-broadcast.c", + "src/f32-dwconv/gen/up16x4-minmax-fma3-acc2.c", + "src/f32-dwconv/gen/up16x4-minmax-fma3.c", + "src/f32-dwconv/gen/up8x4-minmax-fma3-acc2.c", + "src/f32-dwconv/gen/up8x4-minmax-fma3.c", + "src/f32-dwconv/gen/up16x9-minmax-fma3-acc2.c", + "src/f32-dwconv/gen/up16x9-minmax-fma3.c", + "src/f32-dwconv/gen/up8x9-minmax-fma3-acc2.c", + "src/f32-dwconv/gen/up8x9-minmax-fma3.c", + "src/f32-dwconv/gen/up16x25-minmax-fma3-acc2.c", + 
"src/f32-dwconv/gen/up16x25-minmax-fma3.c", + "src/f32-dwconv/gen/up8x25-minmax-fma3-acc2.c", + "src/f32-dwconv/gen/up8x25-minmax-fma3.c", + "src/f32-gemm/gen/1x8-minmax-fma3-broadcast.c", + "src/f32-gemm/gen/4x8-minmax-fma3-broadcast.c", + "src/f32-gemm/gen/5x8-minmax-fma3-broadcast.c", + "src/f32-gemm/gen/6x8-minmax-fma3-broadcast.c", + "src/f32-gemm/gen/7x8-minmax-fma3-broadcast.c", + "src/f32-gemm/gen/8x8-minmax-fma3-broadcast.c", + "src/f32-gemm/gen/1x16-minmax-fma3-broadcast.c", + "src/f32-gemm/gen/3x16-minmax-fma3-broadcast.c", + "src/f32-gemm/gen/4x16-minmax-fma3-broadcast.c", + "src/f32-gemm/gen/5x16-minmax-fma3-broadcast.c", + "src/f32-gemm/gen/1x16s4-minmax-fma3-broadcast.c", + "src/f32-gemm/gen/3x16s4-minmax-fma3-broadcast.c", + "src/f32-gemm/gen/4x16s4-minmax-fma3-broadcast.c", + "src/f32-gemm/gen/5x16s4-minmax-fma3-broadcast.c", + "src/f32-gemm/gen-inc/1x8inc-minmax-fma3-broadcast.c", + "src/f32-gemm/gen-inc/4x8inc-minmax-fma3-broadcast.c", + "src/f32-gemm/gen-inc/5x8inc-minmax-fma3-broadcast.c", + "src/f32-gemm/gen-inc/6x8inc-minmax-fma3-broadcast.c", + "src/f32-gemm/gen-inc/7x8inc-minmax-fma3-broadcast.c", + "src/f32-gemm/gen-inc/8x8inc-minmax-fma3-broadcast.c", + "src/f32-gemm/gen-inc/1x16inc-minmax-fma3-broadcast.c", + "src/f32-gemm/gen-inc/3x16inc-minmax-fma3-broadcast.c", + "src/f32-gemm/gen-inc/4x16inc-minmax-fma3-broadcast.c", + "src/f32-gemm/gen-inc/5x16inc-minmax-fma3-broadcast.c", + "src/f32-gemm/gen-inc/1x16s4inc-minmax-fma3-broadcast.c", + "src/f32-gemm/gen-inc/3x16s4inc-minmax-fma3-broadcast.c", + "src/f32-gemm/gen-inc/4x16s4inc-minmax-fma3-broadcast.c", + "src/f32-gemm/gen-inc/5x16s4inc-minmax-fma3-broadcast.c", "src/f32-hswish/gen/fma3-x8.c", "src/f32-hswish/gen/fma3-x16.c", - "src/f32-igemm/gen/1x8-fma3-broadcast.c", - "src/f32-igemm/gen/4x8-fma3-broadcast.c", - "src/f32-igemm/gen/5x8-fma3-broadcast.c", - "src/f32-igemm/gen/6x8-fma3-broadcast.c", - "src/f32-igemm/gen/7x8-fma3-broadcast.c", - "src/f32-igemm/gen/8x8-fma3-broadcast.c", - 
"src/f32-igemm/gen/1x16-fma3-broadcast.c", - "src/f32-igemm/gen/3x16-fma3-broadcast.c", - "src/f32-igemm/gen/4x16-fma3-broadcast.c", - "src/f32-igemm/gen/5x16-fma3-broadcast.c", - "src/f32-igemm/gen/1x16s4-fma3-broadcast.c", - "src/f32-igemm/gen/3x16s4-fma3-broadcast.c", - "src/f32-igemm/gen/4x16s4-fma3-broadcast.c", - "src/f32-igemm/gen/5x16s4-fma3-broadcast.c", + "src/f32-igemm/gen/1x8-minmax-fma3-broadcast.c", + "src/f32-igemm/gen/4x8-minmax-fma3-broadcast.c", + "src/f32-igemm/gen/5x8-minmax-fma3-broadcast.c", + "src/f32-igemm/gen/6x8-minmax-fma3-broadcast.c", + "src/f32-igemm/gen/7x8-minmax-fma3-broadcast.c", + "src/f32-igemm/gen/8x8-minmax-fma3-broadcast.c", + "src/f32-igemm/gen/1x16-minmax-fma3-broadcast.c", + "src/f32-igemm/gen/3x16-minmax-fma3-broadcast.c", + "src/f32-igemm/gen/4x16-minmax-fma3-broadcast.c", + "src/f32-igemm/gen/5x16-minmax-fma3-broadcast.c", + "src/f32-igemm/gen/1x16s4-minmax-fma3-broadcast.c", + "src/f32-igemm/gen/3x16s4-minmax-fma3-broadcast.c", + "src/f32-igemm/gen/4x16s4-minmax-fma3-broadcast.c", + "src/f32-igemm/gen/5x16s4-minmax-fma3-broadcast.c", ] AVX2_UKERNELS = [ @@ -1273,38 +1313,38 @@ AVX2_UKERNELS = [ AVX512F_UKERNELS = [ "src/f32-clamp/gen/avx512f-x16.c", "src/f32-clamp/gen/avx512f-x32.c", - "src/f32-dwconv/gen/up32x4-avx512f-acc2.c", - "src/f32-dwconv/gen/up32x4-avx512f.c", - "src/f32-dwconv/gen/up16x4-avx512f-acc2.c", - "src/f32-dwconv/gen/up16x4-avx512f.c", - "src/f32-dwconv/gen/up32x9-avx512f-acc2.c", - "src/f32-dwconv/gen/up32x9-avx512f.c", - "src/f32-dwconv/gen/up16x9-avx512f-acc2.c", - "src/f32-dwconv/gen/up16x9-avx512f.c", - "src/f32-dwconv/gen/up32x25-avx512f-acc2.c", - "src/f32-dwconv/gen/up32x25-avx512f.c", - "src/f32-dwconv/gen/up16x25-avx512f-acc2.c", - "src/f32-dwconv/gen/up16x25-avx512f.c", - "src/f32-gemm/gen/1x16-avx512f-broadcast.c", - "src/f32-gemm/gen/4x16-avx512f-broadcast.c", - "src/f32-gemm/gen/5x16-avx512f-broadcast.c", - "src/f32-gemm/gen/6x16-avx512f-broadcast.c", - 
"src/f32-gemm/gen/7x16-avx512f-broadcast.c", - "src/f32-gemm/gen/8x16-avx512f-broadcast.c", - "src/f32-gemm/gen-inc/1x16-avx512f-broadcast.c", - "src/f32-gemm/gen-inc/4x16-avx512f-broadcast.c", - "src/f32-gemm/gen-inc/5x16-avx512f-broadcast.c", - "src/f32-gemm/gen-inc/6x16-avx512f-broadcast.c", - "src/f32-gemm/gen-inc/7x16-avx512f-broadcast.c", - "src/f32-gemm/gen-inc/8x16-avx512f-broadcast.c", + "src/f32-dwconv/gen/up32x4-minmax-avx512f-acc2.c", + "src/f32-dwconv/gen/up32x4-minmax-avx512f.c", + "src/f32-dwconv/gen/up16x4-minmax-avx512f-acc2.c", + "src/f32-dwconv/gen/up16x4-minmax-avx512f.c", + "src/f32-dwconv/gen/up32x9-minmax-avx512f-acc2.c", + "src/f32-dwconv/gen/up32x9-minmax-avx512f.c", + "src/f32-dwconv/gen/up16x9-minmax-avx512f-acc2.c", + "src/f32-dwconv/gen/up16x9-minmax-avx512f.c", + "src/f32-dwconv/gen/up32x25-minmax-avx512f-acc2.c", + "src/f32-dwconv/gen/up32x25-minmax-avx512f.c", + "src/f32-dwconv/gen/up16x25-minmax-avx512f-acc2.c", + "src/f32-dwconv/gen/up16x25-minmax-avx512f.c", + "src/f32-gemm/gen/1x16-minmax-avx512f-broadcast.c", + "src/f32-gemm/gen/4x16-minmax-avx512f-broadcast.c", + "src/f32-gemm/gen/5x16-minmax-avx512f-broadcast.c", + "src/f32-gemm/gen/6x16-minmax-avx512f-broadcast.c", + "src/f32-gemm/gen/7x16-minmax-avx512f-broadcast.c", + "src/f32-gemm/gen/8x16-minmax-avx512f-broadcast.c", + "src/f32-gemm/gen-inc/1x16inc-minmax-avx512f-broadcast.c", + "src/f32-gemm/gen-inc/4x16inc-minmax-avx512f-broadcast.c", + "src/f32-gemm/gen-inc/5x16inc-minmax-avx512f-broadcast.c", + "src/f32-gemm/gen-inc/6x16inc-minmax-avx512f-broadcast.c", + "src/f32-gemm/gen-inc/7x16inc-minmax-avx512f-broadcast.c", + "src/f32-gemm/gen-inc/8x16inc-minmax-avx512f-broadcast.c", "src/f32-hswish/gen/avx512f-x16.c", "src/f32-hswish/gen/avx512f-x32.c", - "src/f32-igemm/gen/1x16-avx512f-broadcast.c", - "src/f32-igemm/gen/4x16-avx512f-broadcast.c", - "src/f32-igemm/gen/5x16-avx512f-broadcast.c", - "src/f32-igemm/gen/6x16-avx512f-broadcast.c", - 
"src/f32-igemm/gen/7x16-avx512f-broadcast.c", - "src/f32-igemm/gen/8x16-avx512f-broadcast.c", + "src/f32-igemm/gen/1x16-minmax-avx512f-broadcast.c", + "src/f32-igemm/gen/4x16-minmax-avx512f-broadcast.c", + "src/f32-igemm/gen/5x16-minmax-avx512f-broadcast.c", + "src/f32-igemm/gen/6x16-minmax-avx512f-broadcast.c", + "src/f32-igemm/gen/7x16-minmax-avx512f-broadcast.c", + "src/f32-igemm/gen/8x16-minmax-avx512f-broadcast.c", "src/f32-prelu/gen/avx512f-2x16.c", "src/f32-prelu/gen/avx512f-2x32.c", "src/f32-raddexpminusmax/gen/avx512f-p5-scalef-x128.c", @@ -1344,14 +1384,14 @@ AVX512F_UKERNELS = [ "src/f32-raddstoreexpminusmax/gen/avx512f-p5-scalef-x192-acc3.c", "src/f32-raddstoreexpminusmax/gen/avx512f-p5-scalef-x192-acc6.c", "src/f32-rmax/avx512f.c", - "src/f32-vbinary/gen/vadd-avx512f-x16.c", - "src/f32-vbinary/gen/vadd-avx512f-x32.c", - "src/f32-vbinary/gen/vaddc-avx512f-x16.c", - "src/f32-vbinary/gen/vaddc-avx512f-x32.c", - "src/f32-vbinary/gen/vdiv-avx512f-x16.c", - "src/f32-vbinary/gen/vdiv-avx512f-x32.c", - "src/f32-vbinary/gen/vdivc-avx512f-x16.c", - "src/f32-vbinary/gen/vdivc-avx512f-x32.c", + "src/f32-vbinary/gen/vadd-minmax-avx512f-x16.c", + "src/f32-vbinary/gen/vadd-minmax-avx512f-x32.c", + "src/f32-vbinary/gen/vaddc-minmax-avx512f-x16.c", + "src/f32-vbinary/gen/vaddc-minmax-avx512f-x32.c", + "src/f32-vbinary/gen/vdiv-minmax-avx512f-x16.c", + "src/f32-vbinary/gen/vdiv-minmax-avx512f-x32.c", + "src/f32-vbinary/gen/vdivc-minmax-avx512f-x16.c", + "src/f32-vbinary/gen/vdivc-minmax-avx512f-x32.c", "src/f32-vbinary/gen/vmax-avx512f-x16.c", "src/f32-vbinary/gen/vmax-avx512f-x32.c", "src/f32-vbinary/gen/vmaxc-avx512f-x16.c", @@ -1360,18 +1400,18 @@ AVX512F_UKERNELS = [ "src/f32-vbinary/gen/vmin-avx512f-x32.c", "src/f32-vbinary/gen/vminc-avx512f-x16.c", "src/f32-vbinary/gen/vminc-avx512f-x32.c", - "src/f32-vbinary/gen/vmul-avx512f-x16.c", - "src/f32-vbinary/gen/vmul-avx512f-x32.c", - "src/f32-vbinary/gen/vmulc-avx512f-x16.c", - 
"src/f32-vbinary/gen/vmulc-avx512f-x32.c", - "src/f32-vbinary/gen/vrdivc-avx512f-x16.c", - "src/f32-vbinary/gen/vrdivc-avx512f-x32.c", - "src/f32-vbinary/gen/vrsubc-avx512f-x16.c", - "src/f32-vbinary/gen/vrsubc-avx512f-x32.c", - "src/f32-vbinary/gen/vsub-avx512f-x16.c", - "src/f32-vbinary/gen/vsub-avx512f-x32.c", - "src/f32-vbinary/gen/vsubc-avx512f-x16.c", - "src/f32-vbinary/gen/vsubc-avx512f-x32.c", + "src/f32-vbinary/gen/vmul-minmax-avx512f-x16.c", + "src/f32-vbinary/gen/vmul-minmax-avx512f-x32.c", + "src/f32-vbinary/gen/vmulc-minmax-avx512f-x16.c", + "src/f32-vbinary/gen/vmulc-minmax-avx512f-x32.c", + "src/f32-vbinary/gen/vrdivc-minmax-avx512f-x16.c", + "src/f32-vbinary/gen/vrdivc-minmax-avx512f-x32.c", + "src/f32-vbinary/gen/vrsubc-minmax-avx512f-x16.c", + "src/f32-vbinary/gen/vrsubc-minmax-avx512f-x32.c", + "src/f32-vbinary/gen/vsub-minmax-avx512f-x16.c", + "src/f32-vbinary/gen/vsub-minmax-avx512f-x32.c", + "src/f32-vbinary/gen/vsubc-minmax-avx512f-x16.c", + "src/f32-vbinary/gen/vsubc-minmax-avx512f-x32.c", "src/f32-vscale/avx512f-unroll64.c", "src/f32-vscaleexpminusmax/gen/avx512f-p5-scalef-x16.c", "src/f32-vscaleexpminusmax/gen/avx512f-p5-scalef-x32.c", @@ -1405,78 +1445,89 @@ AVX512F_UKERNELS = [ ] AARCH32_ASM_UKERNELS = [ - "src/q8-dwconv/up8x9-aarch32-neon.S", - "src/f32-gemm/4x8-aarch32-neon-cortex-a53.S", - "src/f32-gemm/4x8-aarch32-neon-cortex-a55.S", - "src/f32-gemm/gen/4x8-aarch32-neon-cortex-a75.S", - "src/f32-gemm/gen/4x8-aarch32-neon-pld-cortex-a75.S", - "src/f32-gemm/4x8-aarch32-neon-ld64.S", - "src/f32-igemm/4x8-aarch32-neon-ld64.S", - "src/f32-igemm/gen/4x8-aarch32-neon-cortex-a75.S", - "src/f32-igemm/gen/4x8-aarch32-neon-pld-cortex-a75.S", - "src/f32-igemm/4x8-aarch32-neon-cortex-a53.S", - "src/f32-igemm/4x8-aarch32-neon-cortex-a55.S", + "src/q8-dwconv/up8x9-minmax-aarch32-neon.S", + "src/f32-gemm/4x8-minmax-aarch32-neon-cortex-a53.S", + "src/f32-gemm/4x8-minmax-aarch32-neon-cortex-a55.S", + 
"src/f32-gemm/gen/4x8-minmax-aarch32-neon-cortex-a75.S", + "src/f32-gemm/gen/4x8-minmax-aarch32-neon-pld-cortex-a75.S", + "src/f32-gemm/4x8-minmax-aarch32-neon-ld64.S", + "src/f32-igemm/4x8-minmax-aarch32-neon-ld64.S", + "src/f32-igemm/gen/4x8-minmax-aarch32-neon-cortex-a75.S", + "src/f32-igemm/gen/4x8-minmax-aarch32-neon-pld-cortex-a75.S", + "src/f32-igemm/4x8-minmax-aarch32-neon-cortex-a53.S", + "src/f32-igemm/4x8-minmax-aarch32-neon-cortex-a55.S", ] AARCH64_ASM_UKERNELS = [ - "src/f32-dwconv/up4x9-aarch64-neonfma-cortex-a55.S", - "src/f32-dwconv/up4x9-aarch64-neonfma.S", - "src/f32-gemm/gen/1x12-aarch64-neonfma-cortex-a53.S", - "src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a53.S", - "src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a57.S", - "src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a75.S", - "src/f32-gemm/gen/4x12-aarch64-neonfma-cortex-a53.S", - "src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a53.S", - "src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a55.S", - "src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a57.S", - "src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a75.S", - "src/f32-gemm/gen/4x8-aarch64-neonfma-ld128.S", - "src/f32-gemm/gen/4x8-aarch64-neonfma-ld64.S", - "src/f32-gemm/gen/5x8-aarch64-neonfma-cortex-a57.S", - "src/f32-gemm/gen/5x8-aarch64-neonfma-cortex-a75.S", - "src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a53.S", - "src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a55.S", - "src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a73.S", - "src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a57.S", - "src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a75.S", - "src/f32-gemm/gen/6x8-aarch64-neonfma-ld128.S", - "src/f32-gemm/gen/6x8-aarch64-neonfma-ld64.S", - "src/f32-gemm/gen-inc/1x12-aarch64-neonfma-cortex-a53.S", - "src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a53.S", - "src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a57.S", - "src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a75.S", - "src/f32-gemm/gen-inc/4x12-aarch64-neonfma-cortex-a53.S", - 
"src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a53.S", - "src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a55.S", - "src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a57.S", - "src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a75.S", - "src/f32-gemm/gen-inc/4x8-aarch64-neonfma-ld128.S", - "src/f32-gemm/gen-inc/4x8-aarch64-neonfma-ld64.S", - "src/f32-gemm/gen-inc/5x8-aarch64-neonfma-cortex-a57.S", - "src/f32-gemm/gen-inc/5x8-aarch64-neonfma-cortex-a75.S", - "src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a53.S", - "src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a55.S", - "src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a73.S", - "src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a57.S", - "src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a75.S", - "src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld128.S", - "src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld64.S", - "src/f32-igemm/1x12-aarch64-neonfma-cortex-a53.S", - "src/f32-igemm/1x8-aarch64-neonfma-cortex-a53.S", - "src/f32-igemm/gen/1x8-aarch64-neonfma-cortex-a57.S", - "src/f32-igemm/gen/1x8-aarch64-neonfma-cortex-a75.S", - "src/f32-igemm/4x12-aarch64-neonfma-cortex-a53.S", - "src/f32-igemm/4x8-aarch64-neonfma-cortex-a53.S", - "src/f32-igemm/4x8-aarch64-neonfma-cortex-a55.S", - "src/f32-igemm/gen/4x8-aarch64-neonfma-cortex-a57.S", - "src/f32-igemm/gen/4x8-aarch64-neonfma-cortex-a75.S", - "src/f32-igemm/gen/5x8-aarch64-neonfma-cortex-a57.S", - "src/f32-igemm/gen/5x8-aarch64-neonfma-cortex-a75.S", - "src/f32-igemm/6x8-aarch64-neonfma-cortex-a53.S", - "src/f32-igemm/6x8-aarch64-neonfma-cortex-a55.S", - "src/f32-igemm/6x8-aarch64-neonfma-cortex-a73.S", - "src/f32-igemm/gen/6x8-aarch64-neonfma-cortex-a57.S", - "src/f32-igemm/gen/6x8-aarch64-neonfma-cortex-a75.S", + "src/f16-gemm/gen/1x16-minmax-aarch64-neonfp16arith-ld32.S", + "src/f16-gemm/gen/4x16-minmax-aarch64-neonfp16arith-ld32.S", + "src/f16-gemm/gen/6x16-minmax-aarch64-neonfp16arith-ld32.S", + "src/f16-gemm/gen-inc/1x16inc-minmax-aarch64-neonfp16arith-ld32.S", + 
"src/f16-gemm/gen-inc/4x16inc-minmax-aarch64-neonfp16arith-ld32.S", + "src/f16-gemm/gen-inc/6x16inc-minmax-aarch64-neonfp16arith-ld32.S", + "src/f32-dwconv/up4x9-minmax-aarch64-neonfma-cortex-a55.S", + "src/f32-dwconv/up4x9-minmax-aarch64-neonfma.S", + "src/f32-gemm/gen/1x8-minmax-aarch64-neonfma-ld64.S", + "src/f32-gemm/gen/1x12-minmax-aarch64-neonfma-cortex-a53.S", + "src/f32-gemm/gen/1x8-minmax-aarch64-neonfma-cortex-a53.S", + "src/f32-gemm/gen/1x8-minmax-aarch64-neonfma-cortex-a57.S", + "src/f32-gemm/gen/1x8-minmax-aarch64-neonfma-cortex-a75.S", + "src/f32-gemm/gen/4x12-minmax-aarch64-neonfma-cortex-a53.S", + "src/f32-gemm/gen/4x8-minmax-aarch64-neonfma-cortex-a53.S", + "src/f32-gemm/gen/4x8-minmax-aarch64-neonfma-cortex-a55.S", + "src/f32-gemm/gen/4x8-minmax-aarch64-neonfma-cortex-a57.S", + "src/f32-gemm/gen/4x8-minmax-aarch64-neonfma-cortex-a75.S", + "src/f32-gemm/gen/4x8-minmax-aarch64-neonfma-ld128.S", + "src/f32-gemm/gen/4x8-minmax-aarch64-neonfma-ld64.S", + "src/f32-gemm/gen/5x8-minmax-aarch64-neonfma-cortex-a57.S", + "src/f32-gemm/gen/5x8-minmax-aarch64-neonfma-cortex-a75.S", + "src/f32-gemm/gen/6x8-minmax-aarch64-neonfma-cortex-a53.S", + "src/f32-gemm/gen/6x8-minmax-aarch64-neonfma-cortex-a55.S", + "src/f32-gemm/gen/6x8-minmax-aarch64-neonfma-cortex-a73.S", + "src/f32-gemm/gen/6x8-minmax-aarch64-neonfma-cortex-a57.S", + "src/f32-gemm/gen/6x8-minmax-aarch64-neonfma-cortex-a75.S", + "src/f32-gemm/gen/6x8-minmax-aarch64-neonfma-ios.S", + "src/f32-gemm/gen/6x8-minmax-aarch64-neonfma-ld128.S", + "src/f32-gemm/gen/6x8-minmax-aarch64-neonfma-ld64.S", + "src/f32-gemm/gen-inc/1x8inc-minmax-aarch64-neonfma-ld64.S", + "src/f32-gemm/gen-inc/1x12inc-minmax-aarch64-neonfma-cortex-a53.S", + "src/f32-gemm/gen-inc/1x8inc-minmax-aarch64-neonfma-cortex-a53.S", + "src/f32-gemm/gen-inc/1x8inc-minmax-aarch64-neonfma-cortex-a57.S", + "src/f32-gemm/gen-inc/1x8inc-minmax-aarch64-neonfma-cortex-a75.S", + "src/f32-gemm/gen-inc/4x12inc-minmax-aarch64-neonfma-cortex-a53.S", + 
"src/f32-gemm/gen-inc/4x8inc-minmax-aarch64-neonfma-cortex-a53.S", + "src/f32-gemm/gen-inc/4x8inc-minmax-aarch64-neonfma-cortex-a55.S", + "src/f32-gemm/gen-inc/4x8inc-minmax-aarch64-neonfma-cortex-a57.S", + "src/f32-gemm/gen-inc/4x8inc-minmax-aarch64-neonfma-cortex-a75.S", + "src/f32-gemm/gen-inc/4x8inc-minmax-aarch64-neonfma-ld128.S", + "src/f32-gemm/gen-inc/4x8inc-minmax-aarch64-neonfma-ld64.S", + "src/f32-gemm/gen-inc/5x8inc-minmax-aarch64-neonfma-cortex-a57.S", + "src/f32-gemm/gen-inc/5x8inc-minmax-aarch64-neonfma-cortex-a75.S", + "src/f32-gemm/gen-inc/6x8inc-minmax-aarch64-neonfma-cortex-a53.S", + "src/f32-gemm/gen-inc/6x8inc-minmax-aarch64-neonfma-cortex-a55.S", + "src/f32-gemm/gen-inc/6x8inc-minmax-aarch64-neonfma-cortex-a73.S", + "src/f32-gemm/gen-inc/6x8inc-minmax-aarch64-neonfma-cortex-a57.S", + "src/f32-gemm/gen-inc/6x8inc-minmax-aarch64-neonfma-cortex-a75.S", + "src/f32-gemm/gen-inc/6x8inc-minmax-aarch64-neonfma-ios.S", + "src/f32-gemm/gen-inc/6x8inc-minmax-aarch64-neonfma-ld128.S", + "src/f32-gemm/gen-inc/6x8inc-minmax-aarch64-neonfma-ld64.S", + "src/f32-igemm/1x12-minmax-aarch64-neonfma-cortex-a53.S", + "src/f32-igemm/1x8-minmax-aarch64-neonfma-cortex-a53.S", + "src/f32-igemm/gen/1x8-minmax-aarch64-neonfma-cortex-a57.S", + "src/f32-igemm/gen/1x8-minmax-aarch64-neonfma-cortex-a75.S", + "src/f32-igemm/4x12-minmax-aarch64-neonfma-cortex-a53.S", + "src/f32-igemm/4x8-minmax-aarch64-neonfma-cortex-a53.S", + "src/f32-igemm/4x8-minmax-aarch64-neonfma-cortex-a55.S", + "src/f32-igemm/gen/4x8-minmax-aarch64-neonfma-cortex-a57.S", + "src/f32-igemm/gen/4x8-minmax-aarch64-neonfma-cortex-a75.S", + "src/f32-igemm/gen/5x8-minmax-aarch64-neonfma-cortex-a57.S", + "src/f32-igemm/gen/5x8-minmax-aarch64-neonfma-cortex-a75.S", + "src/f32-igemm/6x8-minmax-aarch64-neonfma-cortex-a53.S", + "src/f32-igemm/6x8-minmax-aarch64-neonfma-cortex-a55.S", + "src/f32-igemm/6x8-minmax-aarch64-neonfma-cortex-a73.S", + "src/f32-igemm/gen/6x8-minmax-aarch64-neonfma-cortex-a57.S", + 
"src/f32-igemm/gen/6x8-minmax-aarch64-neonfma-cortex-a75.S", + "src/f32-igemm/gen/6x8-minmax-aarch64-neonfma-ios.S", ] INTERNAL_MICROKERNEL_HDRS = [ @@ -1797,6 +1848,7 @@ xnnpack_cc_library( name = "asm_ukernels", hdrs = ["src/xnnpack/assembly.h"], aarch32_srcs = AARCH32_ASM_UKERNELS, + aarch64_copts = ["-march=armv8.2-a+fp16"], aarch64_srcs = AARCH64_ASM_UKERNELS, ) @@ -1868,7 +1920,10 @@ xnnpack_cc_library( copts = xnnpack_std_copts() + LOGGING_COPTS + [ # Wrappers for multi-pass microkernels use VLAs for temporary buffers. "-Wno-vla", - ], + ] + select({ + ":xnn_enable_hmp_explicit_false": ["-DXNN_MAX_UARCH_TYPES=1"], + "//conditions:default": [], + }), deps = [ "@FP16", "@FXdiv", @@ -1899,6 +1954,9 @@ xnnpack_cc_library( ] + select({ ":debug_build": [], "//conditions:default": xnnpack_min_size_copts(), + }) + select({ + ":xnn_enable_hmp_explicit_false": ["-DXNN_MAX_UARCH_TYPES=1"], + "//conditions:default": [], }), wasm_srcs = ["src/wasm-stubs.c"], wasmsimd_srcs = ["src/wasm-stubs.c"], @@ -1925,6 +1983,9 @@ cc_library( ] + select({ ":debug_build": [], "//conditions:default": xnnpack_min_size_copts(), + }) + select({ + ":xnn_enable_hmp_explicit_false": ["-DXNN_MAX_UARCH_TYPES=1"], + "//conditions:default": [], }), includes = ["include"], linkstatic = True, @@ -1954,6 +2015,9 @@ cc_library( ] + select({ ":debug_build": [], "//conditions:default": xnnpack_min_size_copts(), + }) + select({ + ":xnn_enable_hmp_explicit_false": ["-DXNN_MAX_UARCH_TYPES=1"], + "//conditions:default": [], }), defines = [ "XNN_NO_Q8_OPERATORS", @@ -2397,9 +2461,9 @@ xnnpack_benchmark( ######################### Unit tests for micro-kernels ######################### xnnpack_unit_test( - name = "f16_gemm_test", + name = "f16_gemm_minmax_test", srcs = [ - "test/f16-gemm.cc", + "test/f16-gemm-minmax.cc", "test/gemm-microkernel-tester.h", "src/xnnpack/AlignedAllocator.h", ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS, @@ -2407,9 +2471,9 @@ xnnpack_unit_test( ) xnnpack_unit_test( - name = 
"f16_spmm_test", + name = "f16_spmm_minmax_test", srcs = [ - "test/f16-spmm.cc", + "test/f16-spmm-minmax.cc", "test/spmm-microkernel-tester.h", "src/xnnpack/AlignedAllocator.h", ] + MICROKERNEL_TEST_HDRS, @@ -2427,9 +2491,9 @@ xnnpack_unit_test( ) xnnpack_unit_test( - name = "f32_avgpool_test", + name = "f32_avgpool_minmax_test", srcs = [ - "test/f32-avgpool.cc", + "test/f32-avgpool-minmax.cc", "test/avgpool-microkernel-tester.h", "src/xnnpack/AlignedAllocator.h", ] + MICROKERNEL_TEST_HDRS, @@ -2465,6 +2529,16 @@ xnnpack_unit_test( deps = MICROKERNEL_TEST_DEPS, ) +xnnpack_unit_test( + name = "f32_igemm_minmax_test", + srcs = [ + "test/f32-igemm-minmax.cc", + "test/gemm-microkernel-tester.h", + "src/xnnpack/AlignedAllocator.h", + ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS, + deps = MICROKERNEL_TEST_DEPS, +) + xnnpack_unit_test( name = "f32_conv_hwc_test", srcs = [ @@ -2495,6 +2569,16 @@ xnnpack_unit_test( deps = MICROKERNEL_TEST_DEPS, ) +xnnpack_unit_test( + name = "f32_dwconv_minmax_test", + srcs = [ + "test/f32-dwconv-minmax.cc", + "test/dwconv-microkernel-tester.h", + "src/xnnpack/AlignedAllocator.h", + ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS, + deps = MICROKERNEL_TEST_DEPS, +) + xnnpack_unit_test( name = "f32_dwconv_spchw_test", srcs = [ @@ -2506,9 +2590,9 @@ xnnpack_unit_test( ) xnnpack_unit_test( - name = "f32_gavgpool_test", + name = "f32_gavgpool_minmax_test", srcs = [ - "test/f32-gavgpool.cc", + "test/f32-gavgpool-minmax.cc", "test/gavgpool-microkernel-tester.h", "src/xnnpack/AlignedAllocator.h", ] + MICROKERNEL_TEST_HDRS, @@ -2536,9 +2620,19 @@ xnnpack_unit_test( ) xnnpack_unit_test( - name = "f32_gemminc_test", + name = "f32_gemm_minmax_test", + srcs = [ + "test/f32-gemm-minmax.cc", + "test/gemm-microkernel-tester.h", + "src/xnnpack/AlignedAllocator.h", + ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS, + deps = MICROKERNEL_TEST_DEPS, +) + +xnnpack_unit_test( + name = "f32_gemminc_minmax_test", srcs = [ - "test/f32-gemminc.cc", + 
"test/f32-gemminc-minmax.cc", "test/gemm-microkernel-tester.h", "src/xnnpack/AlignedAllocator.h", ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS, @@ -2555,18 +2649,18 @@ xnnpack_unit_test( ) xnnpack_unit_test( - name = "f32_maxpool_test", + name = "f32_maxpool_minmax_test", srcs = [ - "test/f32-maxpool.cc", + "test/f32-maxpool-minmax.cc", "test/maxpool-microkernel-tester.h", ] + MICROKERNEL_TEST_HDRS, deps = MICROKERNEL_TEST_DEPS, ) xnnpack_unit_test( - name = "f32_pavgpool_test", + name = "f32_pavgpool_minmax_test", srcs = [ - "test/f32-pavgpool.cc", + "test/f32-pavgpool-minmax.cc", "test/avgpool-microkernel-tester.h", "src/xnnpack/AlignedAllocator.h", ] + MICROKERNEL_TEST_HDRS, @@ -2574,9 +2668,9 @@ xnnpack_unit_test( ) xnnpack_unit_test( - name = "f32_ppmm_test", + name = "f32_ppmm_minmax_test", srcs = [ - "test/f32-ppmm.cc", + "test/f32-ppmm-minmax.cc", "test/gemm-microkernel-tester.h", "src/xnnpack/AlignedAllocator.h", ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS, @@ -2639,9 +2733,9 @@ xnnpack_unit_test( ) xnnpack_unit_test( - name = "f32_spmm_test", + name = "f32_spmm_minmax_test", srcs = [ - "test/f32-spmm.cc", + "test/f32-spmm-minmax.cc", "test/spmm-microkernel-tester.h", "src/xnnpack/AlignedAllocator.h", ] + MICROKERNEL_TEST_HDRS, @@ -2649,45 +2743,45 @@ xnnpack_unit_test( ) xnnpack_unit_test( - name = "f32_vadd_test", + name = "f32_vadd_minmax_test", srcs = [ - "test/f32-vadd.cc", + "test/f32-vadd-minmax.cc", "test/vbinary-microkernel-tester.h", ] + MICROKERNEL_TEST_HDRS, deps = MICROKERNEL_TEST_DEPS, ) xnnpack_unit_test( - name = "f32_vaddc_test", + name = "f32_vaddc_minmax_test", srcs = [ - "test/f32-vaddc.cc", + "test/f32-vaddc-minmax.cc", "test/vbinaryc-microkernel-tester.h", ] + MICROKERNEL_TEST_HDRS, deps = MICROKERNEL_TEST_DEPS, ) xnnpack_unit_test( - name = "f32_vdiv_test", + name = "f32_vdiv_minmax_test", srcs = [ - "test/f32-vdiv.cc", + "test/f32-vdiv-minmax.cc", "test/vbinary-microkernel-tester.h", ] + MICROKERNEL_TEST_HDRS, deps = 
MICROKERNEL_TEST_DEPS, ) xnnpack_unit_test( - name = "f32_vdivc_test", + name = "f32_vdivc_minmax_test", srcs = [ - "test/f32-vdivc.cc", + "test/f32-vdivc-minmax.cc", "test/vbinaryc-microkernel-tester.h", ] + MICROKERNEL_TEST_HDRS, deps = MICROKERNEL_TEST_DEPS, ) xnnpack_unit_test( - name = "f32_vrdivc_test", + name = "f32_vrdivc_minmax_test", srcs = [ - "test/f32-vrdivc.cc", + "test/f32-vrdivc-minmax.cc", "test/vbinaryc-microkernel-tester.h", ] + MICROKERNEL_TEST_HDRS, deps = MICROKERNEL_TEST_DEPS, @@ -2730,27 +2824,27 @@ xnnpack_unit_test( ) xnnpack_unit_test( - name = "f32_vmul_test", + name = "f32_vmul_minmax_test", srcs = [ - "test/f32-vmul.cc", + "test/f32-vmul-minmax.cc", "test/vbinary-microkernel-tester.h", ] + MICROKERNEL_TEST_HDRS, deps = MICROKERNEL_TEST_DEPS, ) xnnpack_unit_test( - name = "f32_vmulc_test", + name = "f32_vmulc_minmax_test", srcs = [ - "test/f32-vmulc.cc", + "test/f32-vmulc-minmax.cc", "test/vbinaryc-microkernel-tester.h", ] + MICROKERNEL_TEST_HDRS, deps = MICROKERNEL_TEST_DEPS, ) xnnpack_unit_test( - name = "f32_vmulcaddc_test", + name = "f32_vmulcaddc_minmax_test", srcs = [ - "test/f32-vmulcaddc.cc", + "test/f32-vmulcaddc-minmax.cc", "test/vmulcaddc-microkernel-tester.h", "src/xnnpack/AlignedAllocator.h", ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS, @@ -2785,36 +2879,36 @@ xnnpack_unit_test( ) xnnpack_unit_test( - name = "f32_vsub_test", + name = "f32_vsub_minmax_test", srcs = [ - "test/f32-vsub.cc", + "test/f32-vsub-minmax.cc", "test/vbinary-microkernel-tester.h", ] + MICROKERNEL_TEST_HDRS, deps = MICROKERNEL_TEST_DEPS, ) xnnpack_unit_test( - name = "f32_vsubc_test", + name = "f32_vsubc_minmax_test", srcs = [ - "test/f32-vsubc.cc", + "test/f32-vsubc-minmax.cc", "test/vbinaryc-microkernel-tester.h", ] + MICROKERNEL_TEST_HDRS, deps = MICROKERNEL_TEST_DEPS, ) xnnpack_unit_test( - name = "f32_vrsubc_test", + name = "f32_vrsubc_minmax_test", srcs = [ - "test/f32-vrsubc.cc", + "test/f32-vrsubc-minmax.cc", 
"test/vbinaryc-microkernel-tester.h", ] + MICROKERNEL_TEST_HDRS, deps = MICROKERNEL_TEST_DEPS, ) xnnpack_unit_test( - name = "q8_avgpool_test", + name = "q8_avgpool_minmax_test", srcs = [ - "test/q8-avgpool.cc", + "test/q8-avgpool-minmax.cc", "test/avgpool-microkernel-tester.h", "src/xnnpack/AlignedAllocator.h", ] + MICROKERNEL_TEST_HDRS, @@ -2822,9 +2916,9 @@ xnnpack_unit_test( ) xnnpack_unit_test( - name = "q8_igemm_test", + name = "q8_igemm_minmax_test", srcs = [ - "test/q8-igemm.cc", + "test/q8-igemm-minmax.cc", "test/gemm-microkernel-tester.h", "src/xnnpack/AlignedAllocator.h", ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS, @@ -2832,9 +2926,9 @@ xnnpack_unit_test( ) xnnpack_unit_test( - name = "q8_dwconv_test", + name = "q8_dwconv_minmax_test", srcs = [ - "test/q8-dwconv.cc", + "test/q8-dwconv-minmax.cc", "test/dwconv-microkernel-tester.h", "src/xnnpack/AlignedAllocator.h", ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS, @@ -2842,9 +2936,9 @@ xnnpack_unit_test( ) xnnpack_unit_test( - name = "q8_gavgpool_test", + name = "q8_gavgpool_minmax_test", srcs = [ - "test/q8-gavgpool.cc", + "test/q8-gavgpool-minmax.cc", "test/gavgpool-microkernel-tester.h", "src/xnnpack/AlignedAllocator.h", ] + MICROKERNEL_TEST_HDRS, @@ -2852,9 +2946,9 @@ xnnpack_unit_test( ) xnnpack_unit_test( - name = "q8_gemm_test", + name = "q8_gemm_minmax_test", srcs = [ - "test/q8-gemm.cc", + "test/q8-gemm-minmax.cc", "test/gemm-microkernel-tester.h", "src/xnnpack/AlignedAllocator.h", ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS, @@ -2862,9 +2956,9 @@ xnnpack_unit_test( ) xnnpack_unit_test( - name = "q8_vadd_test", + name = "q8_vadd_minmax_test", srcs = [ - "test/q8-vadd.cc", + "test/q8-vadd-minmax.cc", "test/vadd-microkernel-tester.h", ] + MICROKERNEL_TEST_HDRS, deps = MICROKERNEL_TEST_DEPS, @@ -2889,9 +2983,9 @@ xnnpack_unit_test( ) xnnpack_unit_test( - name = "u8_maxpool_test", + name = "u8_maxpool_minmax_test", srcs = [ - "test/u8-maxpool.cc", + "test/u8-maxpool-minmax.cc", 
"test/maxpool-microkernel-tester.h", ] + MICROKERNEL_TEST_HDRS, deps = MICROKERNEL_TEST_DEPS, @@ -3235,6 +3329,12 @@ config_setting( define_values = {"xnn_enable_assembly": "false"}, ) +# Disables usage of HMP-aware optimizations. +config_setting( + name = "xnn_enable_hmp_explicit_false", + define_values = {"xnn_enable_hmp": "false"}, +) + # Builds with -c dbg config_setting( name = "debug_build", @@ -3298,6 +3398,16 @@ config_setting( }, ) +config_setting( + name = "windows_x86", + values = {"cpu": "win_x86"}, +) + +config_setting( + name = "windows_x86_64", + values = {"cpu": "win_x64"}, +) + config_setting( name = "macos_x86_64", values = { @@ -3308,7 +3418,7 @@ config_setting( config_setting( name = "emscripten", - values = {"crosstool_top": "//toolchain:emscripten"}, + values = {"cpu": "js"}, ) config_setting( @@ -3329,7 +3439,6 @@ config_setting( config_setting( name = "emscripten_asmjs", values = { - "crosstool_top": "//toolchain:emscripten", "cpu": "asmjs", }, )