Skip to content

Commit

Permalink
Squashed commit of the following:
Browse files Browse the repository at this point in the history
commit 300751d
Author: ddavis-2015 <[email protected]>
Date:   Wed Nov 20 12:03:16 2024 -0800

    Update to latest Cadence code.  Int8 any bitwidth on normal quant axis updated.

commit 0a49b2a
Author: ddavis-2015 <[email protected]>
Date:   Wed Nov 20 10:50:30 2024 -0800

    Add input tensor CRC to Generic Benchmark application.
    Only use -O3 -LNO:simd with the xtensa decompress.cc target.

commit 83dafce
Author: ddavis-2015 <[email protected]>
Date:   Mon Nov 18 17:23:58 2024 -0800

    cleanup

commit 2f8cead
Author: ddavis-2015 <[email protected]>
Date:   Mon Nov 18 13:29:33 2024 -0800

    Revert FakeMicroContext changes for alternate profiler.
    Add default alternate profiler implementation to MicroContext.

commit fddf003
Author: ddavis-2015 <[email protected]>
Date:   Mon Nov 18 12:29:26 2024 -0800

    Fix typo.

commit ae6a207
Author: ddavis-2015 <[email protected]>
Date:   Mon Nov 18 12:29:06 2024 -0800

    Implement alternate profiler for MicroInterpreter.
    Enable use of alternate profiler by decompression code.
    Enable use of alternate profiler by Generic Benchmark application.

commit 5e1a1c9
Author: ddavis-2015 <[email protected]>
Date:   Sun Nov 10 18:24:02 2024 -0800

    changes to make the memory planner debug output easier to interpret

commit f651c88
Author: ddavis-2015 <[email protected]>
Date:   Sun Nov 10 04:27:29 2024 -0800

    single pending ops queue
    process pending ops recursively

commit cfd9890
Author: ddavis-2015 <[email protected]>
Date:   Sun Nov 10 00:26:14 2024 -0800

    expand model_facade
    redo var handle tracking

commit 7776cda
Author: ddavis-2015 <[email protected]>
Date:   Tue Nov 5 13:24:58 2024 -0800

    remove [[maybe_unused]]

commit 40e7530
Author: ddavis-2015 <[email protected]>
Date:   Mon Nov 4 11:17:43 2024 -0800

    fix arena

commit 0d889e0
Author: ddavis-2015 <[email protected]>
Date:   Sat Nov 2 14:18:55 2024 -0700

    Fix MicroProfiler bug with ClearEvents().
    Add pre-inference profiling to the Generic Benchmark.
  • Loading branch information
ddavis-2015 committed Nov 22, 2024
1 parent bd04eb2 commit 1110543
Show file tree
Hide file tree
Showing 16 changed files with 147 additions and 28 deletions.
3 changes: 0 additions & 3 deletions tensorflow/lite/micro/compression.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ enum class CompressionScheme : uint8_t {
kBinQuant,
};

// TODO(ddavis-2015): pack struct
struct LookupTableData {
static constexpr size_t kMaxBitWidth = 7;
static constexpr size_t kMaxValueTableChannelStride = 128;
Expand All @@ -51,13 +50,11 @@ union CompressionData {
LookupTableData* lut_data;
};

// TODO(ddavis-2015): pack struct
// Per-tensor compression descriptor: pairs the compression scheme with the
// scheme-specific payload (CompressionData is a union; for kBinQuant the
// active member is lut_data).
struct CompressionTensorData {
CompressionScheme scheme;
CompressionData data;
};

// TODO(ddavis-2015): pack struct
struct CompressedTensorList {
// Sparsely populated array with the same number of elements as there are
// tensors in the Subgraph. An alternative would include a tensor index in
Expand Down
2 changes: 1 addition & 1 deletion tensorflow/lite/micro/docs/compression.md
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ bazel run --cache_test_results=no --test_output=all -s tensorflow/lite/micro/to
The Generic Benchmark Application can be used to see the size of the model, the
amount of arena memory used, and the size of the interpreter data structures
including those involved with tensor conpression.
including those involved with tensor compression.
The benchmark also reports total inference time, as well as time taken for
tensor decompression. Timing data may be either wall-clock time or processor
Expand Down
2 changes: 0 additions & 2 deletions tensorflow/lite/micro/kernels/decompress.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@ limitations under the License.

#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/micro/micro_common.h"
#include "tensorflow/lite/micro/micro_log.h"
#include "tensorflow/lite/micro/micro_profiler.h"

namespace tflite {

Expand Down
4 changes: 2 additions & 2 deletions tensorflow/lite/micro/kernels/decompress.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ struct DecompressionState {
const size_t count_indices,
const CompressionTensorData& comp_data,
const size_t num_channels,
MicroProfiler* profiler = nullptr)
MicroProfilerInterface* profiler = nullptr)
: compressed_indices_(compressed_indices),
count_indices_(count_indices),
comp_data_(comp_data),
Expand Down Expand Up @@ -79,7 +79,7 @@ struct DecompressionState {
comp_data_.data.lut_data->use_alternate_axis
? 1
: count_indices_ / num_channels_;
MicroProfiler* micro_profiler_;
MicroProfilerInterface* micro_profiler_;
};

#endif // USE_TFLM_COMPRESSION
Expand Down
23 changes: 21 additions & 2 deletions tensorflow/lite/micro/kernels/xtensa/decompress.cc
Original file line number Diff line number Diff line change
Expand Up @@ -385,14 +385,33 @@ void DecompressionStateXtensa::DecompressToBufferWidthAnyInt8_Xtensa(
}
} else {
int elements_per_channel_t = elements_per_channel_;
uint32_t index_1, index_2;
uint32_t mask_bits = (1 << compressed_bit_width_) - 1;

for (int i = 0; i < num_channels_t; i++) {
for (int j = 0; j < elements_per_channel_t; j++) {
elements_per_channel_t = elements_per_channel_;
/* if output pointer is not 2 byte aligned */
if ((unsigned int)p_out_tmp & 0x1) {
AE_LB_DB_IP((unsigned short*)p_stream, index, bw);
ae_int8x8 d_tmp = AE_L8_X((const ae_int8*)value_table, index);
AE_S8_0_IP(d_tmp, p_out_tmp, 1);
elements_per_channel_t = elements_per_channel_t - 1;
}
for (int j = 0; j < (elements_per_channel_t >> 1); j++) {
AE_LB_DB_IP((unsigned short*)p_stream, index, 2 * bw);
index_1 = (index >> compressed_bit_width_) & mask_bits;
index_2 = (index)&mask_bits;
ae_int8x8 d_tmp1 = AE_L8_X((const ae_int8*)value_table, index_1);
ae_int8x8 d_tmp2 = AE_L8_X((const ae_int8*)value_table, index_2);
ae_int16x4 d_tmp =
AE_MOVINT16X4_FROMINT8X8(AE_SEL8X8I(d_tmp2, d_tmp1, 21));
AE_S16_0_IP(d_tmp, (ae_int16*)p_out_tmp, 2);
}
if (elements_per_channel_t & 0x1) {
AE_LB_DB_IP((unsigned short*)p_stream, index, bw);
ae_int8x8 d_tmp = AE_L8_X((const ae_int8*)value_table, index);
AE_S8_0_IP(d_tmp, p_out_tmp, 1);
}

value_table += stride;
}
}
Expand Down
20 changes: 15 additions & 5 deletions tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ char GetOrdinalCharacter(int i) {
} else if (i < 62) {
return 'A' + (i - 36);
}
return '*';
return GetOrdinalCharacter(i % 62);
}

} // namespace
Expand Down Expand Up @@ -335,9 +335,14 @@ void GreedyMemoryPlanner::PrintMemoryPlan() {
CalculateOffsetsIfNeeded();

for (int i = 0; i < buffer_count_; ++i) {
MicroPrintf("%c (id=%d): size=%d, offset=%d, first_used=%d last_used=%d",
GetOrdinalCharacter(i), i, requirements_[i].size,
buffer_offsets_[i], requirements_[i].first_time_used,
char c = '*';
if (requirements_[i].first_time_used != requirements_[i].last_time_used) {
// not a scratch buffer nor subgraph output tensor
c = GetOrdinalCharacter(i);
}
MicroPrintf("%c (id=%d): size=%d, offset=%d, first_used=%d last_used=%d", c,
i, requirements_[i].size, buffer_offsets_[i],
requirements_[i].first_time_used,
requirements_[i].last_time_used);
}

Expand Down Expand Up @@ -379,7 +384,12 @@ void GreedyMemoryPlanner::PrintMemoryPlan() {
const int line_end = ((offset + size) * kLineWidth) / max_size;
for (int n = line_start; n < line_end; ++n) {
if (line[n] == '.') {
line[n] = GetOrdinalCharacter(i);
if (requirements->first_time_used == requirements->last_time_used) {
// scratch buffer or subgraph output tensor
line[n] = '*';
} else {
line[n] = GetOrdinalCharacter(i);
}
} else {
line[n] = '!';
}
Expand Down
3 changes: 1 addition & 2 deletions tensorflow/lite/micro/micro_context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,7 @@ void* MicroContext::DecompressTensorToBuffer(
}

DecompressionState ds(static_cast<uint8_t*>(tensor.data.data), count,
compression_data, num_channels,
static_cast<MicroProfiler*>(external_context()));
compression_data, num_channels, GetAlternateProfiler());

switch (tensor.type) {
case kTfLiteBool: {
Expand Down
20 changes: 20 additions & 0 deletions tensorflow/lite/micro/micro_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ limitations under the License.

#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/micro/micro_graph.h"
#include "tensorflow/lite/micro/micro_profiler_interface.h"

#ifdef USE_TFLM_COMPRESSION

Expand Down Expand Up @@ -125,6 +126,25 @@ class MicroContext {

#endif // USE_TFLM_COMPRESSION

// Set the alternate MicroProfilerInterface.
// This can be used to profile subsystems simultaneously with the profiling
// of kernels during the Eval phase. See (b/379584353).
// The alternate MicroProfilerInterface is currently used by the tensor
// decompression subsystem.
// Default implementation rejects the request with kTfLiteError; subclasses
// that support an alternate profiler override this to store the pointer.
virtual TfLiteStatus SetAlternateProfiler(
MicroProfilerInterface* alt_profiler) {
return kTfLiteError;
}

// Get the alternate MicroProfilerInterface.
// This can be used to profile subsystems simultaneously with the profiling
// of kernels during the Eval phase. See (b/379584353).
// The alternate MicroProfilerInterface is currently used by the tensor
// decompression subsystem.
// Default implementation returns nullptr (no alternate profiler available).
virtual MicroProfilerInterface* GetAlternateProfiler() const {
return nullptr;
}

private:
TF_LITE_REMOVE_VIRTUAL_DELETE
};
Expand Down
5 changes: 5 additions & 0 deletions tensorflow/lite/micro/micro_interpreter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -334,4 +334,9 @@ TfLiteStatus MicroInterpreter::SetMicroExternalContext(
return micro_context_.set_external_context(external_context_payload);
}

// Sets the alternate profiler used for subsystem-level profiling (e.g.
// tensor decompression) by forwarding to the interpreter's MicroContext.
// Returns the context's status (kTfLiteError if the context does not
// support an alternate profiler).
TfLiteStatus MicroInterpreter::SetAlternateProfiler(
MicroProfilerInterface* alt_profiler) {
return micro_context_.SetAlternateProfiler(alt_profiler);
}

} // namespace tflite
8 changes: 8 additions & 0 deletions tensorflow/lite/micro/micro_interpreter.h
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,14 @@ class MicroInterpreter {
return allocator_.preserves_all_tensor();
}

// Set the alternate MicroProfilerInterface.
// This value is passed through to the MicroContext.
// This can be used to profile subsystems simultaneously with the profiling
// of kernels during the Eval phase. See (b/379584353).
// The alternate MicroProfilerInterface is currently used by the tensor
// decompression subsystem.
TfLiteStatus SetAlternateProfiler(MicroProfilerInterface* alt_profiler);

protected:
const MicroAllocator& allocator() const { return allocator_; }
const TfLiteContext& context() const { return context_; }
Expand Down
10 changes: 10 additions & 0 deletions tensorflow/lite/micro/micro_interpreter_context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -206,4 +206,14 @@ void* MicroInterpreterContext::DecompressTensorToBuffer(

#endif // USE_TFLM_COMPRESSION

// Stores the alternate profiler pointer for later retrieval via
// GetAlternateProfiler(). Passing nullptr clears it. Always succeeds.
TfLiteStatus MicroInterpreterContext::SetAlternateProfiler(
tflite::MicroProfilerInterface* alt_profiler) {
alt_profiler_ = alt_profiler;
return kTfLiteOk;
}

// Returns the profiler most recently set via SetAlternateProfiler(), or
// nullptr if none has been set.
MicroProfilerInterface* MicroInterpreterContext::GetAlternateProfiler() const {
return alt_profiler_;
}

} // namespace tflite
16 changes: 16 additions & 0 deletions tensorflow/lite/micro/micro_interpreter_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,21 @@ class MicroInterpreterContext : public MicroContext {

#endif // USE_TFLM_COMPRESSION

// Set the alternate MicroProfilerInterface.
// This can be used to profile subsystems simultaneously with the profiling
// of kernels during the Eval phase. See (b/379584353).
// The alternate MicroProfilerInterface is currently used by the tensor
// decompression subsystem.
TfLiteStatus SetAlternateProfiler(
MicroProfilerInterface* alt_profiler) override;

// Get the alternate MicroProfilerInterface.
// This can be used to profile subsystems simultaneously with the profiling
// of kernels during the Eval phase. See (b/379584353).
// The alternate MicroProfilerInterface is currently used by the tensor
// decompression subsystem.
MicroProfilerInterface* GetAlternateProfiler() const override;

private:
MicroAllocator& allocator_;
MicroInterpreterGraph& graph_;
Expand All @@ -138,6 +153,7 @@ class MicroInterpreterContext : public MicroContext {

ScratchBufferHandle* scratch_buffer_handles_ = nullptr;
void* external_context_payload_ = nullptr;
MicroProfilerInterface* alt_profiler_ = nullptr;

TF_LITE_REMOVE_VIRTUAL_DELETE
};
Expand Down
19 changes: 14 additions & 5 deletions tensorflow/lite/micro/micro_profiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -86,14 +86,14 @@ void MicroProfiler::LogTicksPerTagCsv() {
TFLITE_DCHECK(tags_[i] != nullptr);
int position = FindExistingOrNextPosition(tags_[i]);
TFLITE_DCHECK(position >= 0);
total_ticks_per_tag[position].tag = tags_[i];
total_ticks_per_tag[position].ticks =
total_ticks_per_tag[position].ticks + ticks;
total_ticks_per_tag_[position].tag = tags_[i];
total_ticks_per_tag_[position].ticks =
total_ticks_per_tag_[position].ticks + ticks;
total_ticks += ticks;
}

for (int i = 0; i < num_events_; ++i) {
TicksPerTag each_tag_entry = total_ticks_per_tag[i];
TicksPerTag each_tag_entry = total_ticks_per_tag_[i];
if (each_tag_entry.tag == nullptr) {
break;
}
Expand All @@ -112,12 +112,21 @@ void MicroProfiler::LogTicksPerTagCsv() {
int MicroProfiler::FindExistingOrNextPosition(const char* tag_name) {
int pos = 0;
for (; pos < num_events_; pos++) {
TicksPerTag each_tag_entry = total_ticks_per_tag[pos];
TicksPerTag each_tag_entry = total_ticks_per_tag_[pos];
if (each_tag_entry.tag == nullptr ||
strcmp(each_tag_entry.tag, tag_name) == 0) {
return pos;
}
}
return pos < num_events_ ? pos : -1;
}

// Discards all recorded events and per-tag tick totals so a new profiling
// session starts from a clean state. Clearing the tag pointers matters:
// FindExistingOrNextPosition() treats a nullptr tag as a free slot.
void MicroProfiler::ClearEvents() {
int slot = 0;
while (slot < num_events_) {
total_ticks_per_tag_[slot].tag = nullptr;
++slot;
}
num_events_ = 0;
}

} // namespace tflite
6 changes: 3 additions & 3 deletions tensorflow/lite/micro/micro_profiler.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -45,7 +45,7 @@ class MicroProfiler : public MicroProfilerInterface {
virtual void EndEvent(uint32_t event_handle) override;

// Clears all the events that have been currently profiled.
void ClearEvents() { num_events_ = 0; }
void ClearEvents();

// Returns the sum of the ticks taken across all the events. This number
// is only meaningful if all of the events are disjoint (the end time of
Expand Down Expand Up @@ -83,7 +83,7 @@ class MicroProfiler : public MicroProfilerInterface {
// In practice, the number of tags will be much lower than the number of
// events. But it is theoretically possible that each event to be unique and
// hence we allow total_ticks_per_tag to have kMaxEvents entries.
TicksPerTag total_ticks_per_tag[kMaxEvents] = {};
TicksPerTag total_ticks_per_tag_[kMaxEvents] = {};

int FindExistingOrNextPosition(const char* tag_name);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,16 @@ void ShowOutputCRC32(tflite::MicroInterpreter* interpreter) {
}
}

// Logs a CRC32 checksum of every model input tensor's raw bytes, one line
// per input, so benchmark runs can verify that input data is reproducible.
void ShowInputCRC32(tflite::MicroInterpreter* interpreter) {
GenCRC32Table();
const size_t input_count = interpreter->inputs_size();
for (size_t idx = 0; idx < input_count; ++idx) {
TfLiteTensor* tensor = interpreter->input_tensor(idx);
uint8_t* raw_bytes = tflite::GetTensorData<uint8_t>(tensor);
const uint32_t checksum = ComputeCRC32(raw_bytes, tensor->bytes);
MicroPrintf("Input CRC32: 0x%X", checksum);
}
}

int Benchmark(const uint8_t* model_data, tflite::PrettyPrintType print_type) {
static Profiler profiler;
static Profiler profiler2;
Expand All @@ -184,26 +194,35 @@ int Benchmark(const uint8_t* model_data, tflite::PrettyPrintType print_type) {

alignas(16) static uint8_t tensor_arena[kTensorArenaSize];

uint32_t event_handle = profiler.BeginEvent("TfliteGetModel");
uint32_t event_handle = profiler.BeginEvent("tflite::GetModel");
const tflite::Model* model = tflite::GetModel(model_data);
profiler.EndEvent(event_handle);

event_handle = profiler.BeginEvent("tflite::CreateOpResolver");
TflmOpResolver op_resolver;
TF_LITE_ENSURE_STATUS(CreateOpResolver(op_resolver));
profiler.EndEvent(event_handle);

event_handle = profiler.BeginEvent("tflite::RecordingMicroAllocator::Create");
tflite::RecordingMicroAllocator* allocator(
tflite::RecordingMicroAllocator::Create(tensor_arena, kTensorArenaSize));
profiler.EndEvent(event_handle);
event_handle = profiler.BeginEvent("tflite::MicroInterpreter instantiation");
tflite::RecordingMicroInterpreter interpreter(
model, op_resolver, allocator,
tflite::MicroResourceVariables::Create(allocator, kNumResourceVariable),
&profiler);
profiler.EndEvent(event_handle);
event_handle =
profiler.BeginEvent("tflite::MicroInterpreter::AllocateTensors");
TF_LITE_ENSURE_STATUS(interpreter.AllocateTensors());
profiler.EndEvent(event_handle);

profiler.Log();
profiler.LogTicksPerTagCsv();
profiler.ClearEvents();

if (using_compression) {
TF_LITE_ENSURE_STATUS(interpreter.SetMicroExternalContext(&profiler2));
TF_LITE_ENSURE_STATUS(interpreter.SetAlternateProfiler(&profiler2));
}

MicroPrintf(""); // null MicroPrintf serves as a newline.
Expand All @@ -216,6 +235,9 @@ int Benchmark(const uint8_t* model_data, tflite::PrettyPrintType print_type) {
uint32_t seed = kRandomSeed;
while (true) {
SetRandomInput(seed++, interpreter);
ShowInputCRC32(&interpreter);
MicroPrintf(""); // null MicroPrintf serves as a newline.

TfLiteStatus status = interpreter.Invoke();
if ((status != kTfLiteOk) && (static_cast<int>(status) != kTfLiteAbort)) {
MicroPrintf("Model interpreter invocation failed: %d\n", status);
Expand Down
6 changes: 6 additions & 0 deletions tensorflow/lite/micro/tools/make/targets/xtensa_makefile.inc
Original file line number Diff line number Diff line change
Expand Up @@ -97,4 +97,10 @@ ifeq ($(OPTIMIZED_KERNEL_DIR), xtensa)
$(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/lstm_eval.cc \
$(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/lstm_eval_hifi.cc \
$(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/unidirectional_sequence_lstm.cc

# override KERNEL_OPTIMIZATION_LEVEL to enable higher performance
# Xtensa intrinsics.
$(KERNEL_OBJDIR)$(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/decompress.o: $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/decompress.cc
@mkdir -p $(dir $@)
$(CXX) $(CXXFLAGS) -O3 -LNO:simd $(INCLUDES) -c $< -o $@
endif

0 comments on commit 1110543

Please sign in to comment.