Skip to content

Commit

Permalink
Squashed commit of the following:
Browse files Browse the repository at this point in the history
commit 300751d
Author: ddavis-2015 <[email protected]>
Date:   Wed Nov 20 12:03:16 2024 -0800

    Update to latest Cadence code.  Int8 any bitwidth on normal quant axis updated.

commit 0a49b2a
Author: ddavis-2015 <[email protected]>
Date:   Wed Nov 20 10:50:30 2024 -0800

    Add input tensor CRC to Generic Benchmark application.
    Only use -O3 -LNO:simd with the xtensa decompress.cc target.

commit 83dafce
Author: ddavis-2015 <[email protected]>
Date:   Mon Nov 18 17:23:58 2024 -0800

    cleanup

commit 2f8cead
Author: ddavis-2015 <[email protected]>
Date:   Mon Nov 18 13:29:33 2024 -0800

    Revert FakeMicroContext changes for alternate profiler.
    Add default alternate profiler implementation to MicroContext.

commit fddf003
Author: ddavis-2015 <[email protected]>
Date:   Mon Nov 18 12:29:26 2024 -0800

    Fix typo.

commit ae6a207
Author: ddavis-2015 <[email protected]>
Date:   Mon Nov 18 12:29:06 2024 -0800

    Implement alternate profiler for MicroInterpreter.
    Enable use of alternate profiler by decompression code.
    Enable use of alternate profiler by Generic Benchmark application.

commit 5e1a1c9
Author: ddavis-2015 <[email protected]>
Date:   Sun Nov 10 18:24:02 2024 -0800

    changes to make the memory planner debug output easier to interpret

commit f651c88
Author: ddavis-2015 <[email protected]>
Date:   Sun Nov 10 04:27:29 2024 -0800

    single pending ops queue
    process pending ops recursively

commit cfd9890
Author: ddavis-2015 <[email protected]>
Date:   Sun Nov 10 00:26:14 2024 -0800

    expand model_facade
    redo var handle tracking

commit 7776cda
Author: ddavis-2015 <[email protected]>
Date:   Tue Nov 5 13:24:58 2024 -0800

    remove [[maybe_unused]]

commit 40e7530
Author: ddavis-2015 <[email protected]>
Date:   Mon Nov 4 11:17:43 2024 -0800

    fix arena

commit 0d889e0
Author: ddavis-2015 <[email protected]>
Date:   Sat Nov 2 14:18:55 2024 -0700

    Fix MicroProfiler bug with ClearEvents().
    Add pre-inference profiling to the Generic Benchmark.
  • Loading branch information
ddavis-2015 committed Nov 22, 2024
1 parent bd04eb2 commit 1110543
Show file tree
Hide file tree
Showing 16 changed files with 147 additions and 28 deletions.
3 changes: 0 additions & 3 deletions tensorflow/lite/micro/compression.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ enum class CompressionScheme : uint8_t {
kBinQuant,
};

// TODO(ddavis-2015): pack struct
struct LookupTableData {
static constexpr size_t kMaxBitWidth = 7;
static constexpr size_t kMaxValueTableChannelStride = 128;
Expand All @@ -51,13 +50,11 @@ union CompressionData {
LookupTableData* lut_data;
};

// TODO(ddavis-2015): pack struct
// Per-tensor compression descriptor: pairs the compression scheme with the
// scheme-specific payload (CompressionData is a union; for kBinQuant the
// active member is lut_data).
struct CompressionTensorData {
CompressionScheme scheme;
CompressionData data;
};

// TODO(ddavis-2015): pack struct
struct CompressedTensorList {
// Sparsely populated array with the same number of elements as there are
// tensors in the Subgraph. An alternative would include a tensor index in
Expand Down
2 changes: 1 addition & 1 deletion tensorflow/lite/micro/docs/compression.md
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ bazel run --cache_test_results=no --test_output=all -s tensorflow/lite/micro/to
The Generic Benchmark Application can be used to see the size of the model, the
amount of arena memory used, and the size of the interpreter data structures
including those involved with tensor conpression.
including those involved with tensor compression.
The benchmark also reports total inference time, as well as time taken for
tensor decompression. Timing data may be either wall-clock time or processor
Expand Down
2 changes: 0 additions & 2 deletions tensorflow/lite/micro/kernels/decompress.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@ limitations under the License.

#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/micro/micro_common.h"
#include "tensorflow/lite/micro/micro_log.h"
#include "tensorflow/lite/micro/micro_profiler.h"

namespace tflite {

Expand Down
4 changes: 2 additions & 2 deletions tensorflow/lite/micro/kernels/decompress.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ struct DecompressionState {
const size_t count_indices,
const CompressionTensorData& comp_data,
const size_t num_channels,
MicroProfiler* profiler = nullptr)
MicroProfilerInterface* profiler = nullptr)
: compressed_indices_(compressed_indices),
count_indices_(count_indices),
comp_data_(comp_data),
Expand Down Expand Up @@ -79,7 +79,7 @@ struct DecompressionState {
comp_data_.data.lut_data->use_alternate_axis
? 1
: count_indices_ / num_channels_;
MicroProfiler* micro_profiler_;
MicroProfilerInterface* micro_profiler_;
};

#endif // USE_TFLM_COMPRESSION
Expand Down
23 changes: 21 additions & 2 deletions tensorflow/lite/micro/kernels/xtensa/decompress.cc
Original file line number Diff line number Diff line change
Expand Up @@ -385,14 +385,33 @@ void DecompressionStateXtensa::DecompressToBufferWidthAnyInt8_Xtensa(
}
} else {
int elements_per_channel_t = elements_per_channel_;
uint32_t index_1, index_2;
uint32_t mask_bits = (1 << compressed_bit_width_) - 1;

for (int i = 0; i < num_channels_t; i++) {
for (int j = 0; j < elements_per_channel_t; j++) {
elements_per_channel_t = elements_per_channel_;
/* if output pointer is not 2 byte aligned */
if ((unsigned int)p_out_tmp & 0x1) {
AE_LB_DB_IP((unsigned short*)p_stream, index, bw);
ae_int8x8 d_tmp = AE_L8_X((const ae_int8*)value_table, index);
AE_S8_0_IP(d_tmp, p_out_tmp, 1);
elements_per_channel_t = elements_per_channel_t - 1;
}
for (int j = 0; j < (elements_per_channel_t >> 1); j++) {
AE_LB_DB_IP((unsigned short*)p_stream, index, 2 * bw);
index_1 = (index >> compressed_bit_width_) & mask_bits;
index_2 = (index)&mask_bits;
ae_int8x8 d_tmp1 = AE_L8_X((const ae_int8*)value_table, index_1);
ae_int8x8 d_tmp2 = AE_L8_X((const ae_int8*)value_table, index_2);
ae_int16x4 d_tmp =
AE_MOVINT16X4_FROMINT8X8(AE_SEL8X8I(d_tmp2, d_tmp1, 21));
AE_S16_0_IP(d_tmp, (ae_int16*)p_out_tmp, 2);
}
if (elements_per_channel_t & 0x1) {
AE_LB_DB_IP((unsigned short*)p_stream, index, bw);
ae_int8x8 d_tmp = AE_L8_X((const ae_int8*)value_table, index);
AE_S8_0_IP(d_tmp, p_out_tmp, 1);
}

value_table += stride;
}
}
Expand Down
20 changes: 15 additions & 5 deletions tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ char GetOrdinalCharacter(int i) {
} else if (i < 62) {
return 'A' + (i - 36);
}
return '*';
return GetOrdinalCharacter(i % 62);
}

} // namespace
Expand Down Expand Up @@ -335,9 +335,14 @@ void GreedyMemoryPlanner::PrintMemoryPlan() {
CalculateOffsetsIfNeeded();

for (int i = 0; i < buffer_count_; ++i) {
MicroPrintf("%c (id=%d): size=%d, offset=%d, first_used=%d last_used=%d",
GetOrdinalCharacter(i), i, requirements_[i].size,
buffer_offsets_[i], requirements_[i].first_time_used,
char c = '*';
if (requirements_[i].first_time_used != requirements_[i].last_time_used) {
// not a scratch buffer nor subgraph output tensor
c = GetOrdinalCharacter(i);
}
MicroPrintf("%c (id=%d): size=%d, offset=%d, first_used=%d last_used=%d", c,
i, requirements_[i].size, buffer_offsets_[i],
requirements_[i].first_time_used,
requirements_[i].last_time_used);
}

Expand Down Expand Up @@ -379,7 +384,12 @@ void GreedyMemoryPlanner::PrintMemoryPlan() {
const int line_end = ((offset + size) * kLineWidth) / max_size;
for (int n = line_start; n < line_end; ++n) {
if (line[n] == '.') {
line[n] = GetOrdinalCharacter(i);
if (requirements->first_time_used == requirements->last_time_used) {
// scratch buffer or subgraph output tensor
line[n] = '*';
} else {
line[n] = GetOrdinalCharacter(i);
}
} else {
line[n] = '!';
}
Expand Down
3 changes: 1 addition & 2 deletions tensorflow/lite/micro/micro_context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,7 @@ void* MicroContext::DecompressTensorToBuffer(
}

DecompressionState ds(static_cast<uint8_t*>(tensor.data.data), count,
compression_data, num_channels,
static_cast<MicroProfiler*>(external_context()));
compression_data, num_channels, GetAlternateProfiler());

switch (tensor.type) {
case kTfLiteBool: {
Expand Down
20 changes: 20 additions & 0 deletions tensorflow/lite/micro/micro_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ limitations under the License.

#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/micro/micro_graph.h"
#include "tensorflow/lite/micro/micro_profiler_interface.h"

#ifdef USE_TFLM_COMPRESSION

Expand Down Expand Up @@ -125,6 +126,25 @@ class MicroContext {

#endif // USE_TFLM_COMPRESSION

// Set the alternate MicroProfilerInterface.
// This can be used to profile subsystems simultaneously with the profiling
// of kernels during the Eval phase. See (b/379584353).
// The alternate MicroProfilerInterface is currently used by the tensor
// decompression subsystem.
// Default implementation rejects the request with kTfLiteError; subclasses
// that support an alternate profiler override this to store the pointer.
virtual TfLiteStatus SetAlternateProfiler(
MicroProfilerInterface* alt_profiler) {
return kTfLiteError;
}

// Get the alternate MicroProfilerInterface.
// This can be used to profile subsystems simultaneously with the profiling
// of kernels during the Eval phase. See (b/379584353).
// The alternate MicroProfilerInterface is currently used by the tensor
// decompression subsystem.
// Default implementation returns nullptr (no alternate profiler available).
virtual MicroProfilerInterface* GetAlternateProfiler() const {
return nullptr;
}

private:
TF_LITE_REMOVE_VIRTUAL_DELETE
};
Expand Down
5 changes: 5 additions & 0 deletions tensorflow/lite/micro/micro_interpreter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -334,4 +334,9 @@ TfLiteStatus MicroInterpreter::SetMicroExternalContext(
return micro_context_.set_external_context(external_context_payload);
}

// Sets the alternate profiler used for subsystem-level profiling (e.g.
// tensor decompression) by forwarding to the interpreter's MicroContext.
// Returns the context's status (kTfLiteError if the context does not
// support an alternate profiler).
TfLiteStatus MicroInterpreter::SetAlternateProfiler(
MicroProfilerInterface* alt_profiler) {
return micro_context_.SetAlternateProfiler(alt_profiler);
}

} // namespace tflite
8 changes: 8 additions & 0 deletions tensorflow/lite/micro/micro_interpreter.h
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,14 @@ class MicroInterpreter {
return allocator_.preserves_all_tensor();
}

// Set the alternate MicroProfilerInterface.
// This value is passed through to the MicroContext.
// This can be used to profile subsystems simultaneously with the profiling
// of kernels during the Eval phase. See (b/379584353).
// The alternate MicroProfilerInterface is currently used by the tensor
// decompression subsystem.
TfLiteStatus SetAlternateProfiler(MicroProfilerInterface* alt_profiler);

protected:
const MicroAllocator& allocator() const { return allocator_; }
const TfLiteContext& context() const { return context_; }
Expand Down
10 changes: 10 additions & 0 deletions tensorflow/lite/micro/micro_interpreter_context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -206,4 +206,14 @@ void* MicroInterpreterContext::DecompressTensorToBuffer(

#endif // USE_TFLM_COMPRESSION

// Stores the alternate profiler pointer for later retrieval via
// GetAlternateProfiler(). Passing nullptr clears it. Always succeeds.
TfLiteStatus MicroInterpreterContext::SetAlternateProfiler(
tflite::MicroProfilerInterface* alt_profiler) {
alt_profiler_ = alt_profiler;
return kTfLiteOk;
}

// Returns the profiler most recently set via SetAlternateProfiler(), or
// nullptr if none has been set.
MicroProfilerInterface* MicroInterpreterContext::GetAlternateProfiler() const {
return alt_profiler_;
}

} // namespace tflite
16 changes: 16 additions & 0 deletions tensorflow/lite/micro/micro_interpreter_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,21 @@ class MicroInterpreterContext : public MicroContext {

#endif // USE_TFLM_COMPRESSION

// Set the alternate MicroProfilerInterface.
// This can be used to profile subsystems simultaneously with the profiling
// of kernels during the Eval phase. See (b/379584353).
// The alternate MicroProfilerInterface is currently used by the tensor
// decompression subsystem.
TfLiteStatus SetAlternateProfiler(
MicroProfilerInterface* alt_profiler) override;

// Get the alternate MicroProfilerInterface.
// This can be used to profile subsystems simultaneously with the profiling
// of kernels during the Eval phase. See (b/379584353).
// The alternate MicroProfilerInterface is currently used by the tensor
// decompression subsystem.
MicroProfilerInterface* GetAlternateProfiler() const override;

private:
MicroAllocator& allocator_;
MicroInterpreterGraph& graph_;
Expand All @@ -138,6 +153,7 @@ class MicroInterpreterContext : public MicroContext {

ScratchBufferHandle* scratch_buffer_handles_ = nullptr;
void* external_context_payload_ = nullptr;
MicroProfilerInterface* alt_profiler_ = nullptr;

TF_LITE_REMOVE_VIRTUAL_DELETE
};
Expand Down
19 changes: 14 additions & 5 deletions tensorflow/lite/micro/micro_profiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -86,14 +86,14 @@ void MicroProfiler::LogTicksPerTagCsv() {
TFLITE_DCHECK(tags_[i] != nullptr);
int position = FindExistingOrNextPosition(tags_[i]);
TFLITE_DCHECK(position >= 0);
total_ticks_per_tag[position].tag = tags_[i];
total_ticks_per_tag[position].ticks =
total_ticks_per_tag[position].ticks + ticks;
total_ticks_per_tag_[position].tag = tags_[i];
total_ticks_per_tag_[position].ticks =
total_ticks_per_tag_[position].ticks + ticks;
total_ticks += ticks;
}

for (int i = 0; i < num_events_; ++i) {
TicksPerTag each_tag_entry = total_ticks_per_tag[i];
TicksPerTag each_tag_entry = total_ticks_per_tag_[i];
if (each_tag_entry.tag == nullptr) {
break;
}
Expand All @@ -112,12 +112,21 @@ void MicroProfiler::LogTicksPerTagCsv() {
int MicroProfiler::FindExistingOrNextPosition(const char* tag_name) {
int pos = 0;
for (; pos < num_events_; pos++) {
TicksPerTag each_tag_entry = total_ticks_per_tag[pos];
TicksPerTag each_tag_entry = total_ticks_per_tag_[pos];
if (each_tag_entry.tag == nullptr ||
strcmp(each_tag_entry.tag, tag_name) == 0) {
return pos;
}
}
return pos < num_events_ ? pos : -1;
}

// Discards all recorded events and per-tag tick totals so a new profiling
// session starts from a clean state. Clearing the tag pointers matters:
// FindExistingOrNextPosition() treats a nullptr tag as a free slot.
void MicroProfiler::ClearEvents() {
int slot = 0;
while (slot < num_events_) {
total_ticks_per_tag_[slot].tag = nullptr;
++slot;
}
num_events_ = 0;
}

} // namespace tflite
6 changes: 3 additions & 3 deletions tensorflow/lite/micro/micro_profiler.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -45,7 +45,7 @@ class MicroProfiler : public MicroProfilerInterface {
virtual void EndEvent(uint32_t event_handle) override;

// Clears all the events that have been currently profiled.
void ClearEvents() { num_events_ = 0; }
void ClearEvents();

// Returns the sum of the ticks taken across all the events. This number
// is only meaningful if all of the events are disjoint (the end time of
Expand Down Expand Up @@ -83,7 +83,7 @@ class MicroProfiler : public MicroProfilerInterface {
// In practice, the number of tags will be much lower than the number of
// events. But it is theoretically possible that each event to be unique and
// hence we allow total_ticks_per_tag to have kMaxEvents entries.
TicksPerTag total_ticks_per_tag[kMaxEvents] = {};
TicksPerTag total_ticks_per_tag_[kMaxEvents] = {};

int FindExistingOrNextPosition(const char* tag_name);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,16 @@ void ShowOutputCRC32(tflite::MicroInterpreter* interpreter) {
}
}

// Logs a CRC32 checksum of every model input tensor's raw bytes, one line
// per input, so benchmark runs can verify that input data is reproducible.
void ShowInputCRC32(tflite::MicroInterpreter* interpreter) {
GenCRC32Table();
const size_t input_count = interpreter->inputs_size();
for (size_t idx = 0; idx < input_count; ++idx) {
TfLiteTensor* tensor = interpreter->input_tensor(idx);
uint8_t* raw_bytes = tflite::GetTensorData<uint8_t>(tensor);
const uint32_t checksum = ComputeCRC32(raw_bytes, tensor->bytes);
MicroPrintf("Input CRC32: 0x%X", checksum);
}
}

int Benchmark(const uint8_t* model_data, tflite::PrettyPrintType print_type) {
static Profiler profiler;
static Profiler profiler2;
Expand All @@ -184,26 +194,35 @@ int Benchmark(const uint8_t* model_data, tflite::PrettyPrintType print_type) {

alignas(16) static uint8_t tensor_arena[kTensorArenaSize];

uint32_t event_handle = profiler.BeginEvent("TfliteGetModel");
uint32_t event_handle = profiler.BeginEvent("tflite::GetModel");
const tflite::Model* model = tflite::GetModel(model_data);
profiler.EndEvent(event_handle);

event_handle = profiler.BeginEvent("tflite::CreateOpResolver");
TflmOpResolver op_resolver;
TF_LITE_ENSURE_STATUS(CreateOpResolver(op_resolver));
profiler.EndEvent(event_handle);

event_handle = profiler.BeginEvent("tflite::RecordingMicroAllocator::Create");
tflite::RecordingMicroAllocator* allocator(
tflite::RecordingMicroAllocator::Create(tensor_arena, kTensorArenaSize));
profiler.EndEvent(event_handle);
event_handle = profiler.BeginEvent("tflite::MicroInterpreter instantiation");
tflite::RecordingMicroInterpreter interpreter(
model, op_resolver, allocator,
tflite::MicroResourceVariables::Create(allocator, kNumResourceVariable),
&profiler);
profiler.EndEvent(event_handle);
event_handle =
profiler.BeginEvent("tflite::MicroInterpreter::AllocateTensors");
TF_LITE_ENSURE_STATUS(interpreter.AllocateTensors());
profiler.EndEvent(event_handle);

profiler.Log();
profiler.LogTicksPerTagCsv();
profiler.ClearEvents();

if (using_compression) {
TF_LITE_ENSURE_STATUS(interpreter.SetMicroExternalContext(&profiler2));
TF_LITE_ENSURE_STATUS(interpreter.SetAlternateProfiler(&profiler2));
}

MicroPrintf(""); // null MicroPrintf serves as a newline.
Expand All @@ -216,6 +235,9 @@ int Benchmark(const uint8_t* model_data, tflite::PrettyPrintType print_type) {
uint32_t seed = kRandomSeed;
while (true) {
SetRandomInput(seed++, interpreter);
ShowInputCRC32(&interpreter);
MicroPrintf(""); // null MicroPrintf serves as a newline.

TfLiteStatus status = interpreter.Invoke();
if ((status != kTfLiteOk) && (static_cast<int>(status) != kTfLiteAbort)) {
MicroPrintf("Model interpreter invocation failed: %d\n", status);
Expand Down
6 changes: 6 additions & 0 deletions tensorflow/lite/micro/tools/make/targets/xtensa_makefile.inc
Original file line number Diff line number Diff line change
Expand Up @@ -97,4 +97,10 @@ ifeq ($(OPTIMIZED_KERNEL_DIR), xtensa)
$(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/lstm_eval.cc \
$(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/lstm_eval_hifi.cc \
$(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/unidirectional_sequence_lstm.cc

# override KERNEL_OPTIMIZATION_LEVEL to enable higher performance
# Xtensa intrinsics.
$(KERNEL_OBJDIR)$(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/decompress.o: $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/decompress.cc
@mkdir -p $(dir $@)
$(CXX) $(CXXFLAGS) -O3 -LNO:simd $(INCLUDES) -c $< -o $@
endif

0 comments on commit 1110543

Please sign in to comment.