Floating Point to Fixed Point Converter Module (#123)

* float to fixed converter module with documentation
intel · Nov 5, 2024 · 5bf9681 · 5bf9681
1 parent a66bcda
commit 5bf9681
Show file tree

Hide file tree

Showing 8 changed files with 260 additions and 1 deletion.
diff --git a/doc/components/fixed_point.md b/doc/components/fixed_point.md
@@ -13,3 +13,11 @@ The `FixedPoint` type is an extension of `Logic` with additional attributes (sig
 ## FixedToFloat
 
 This component converts a fixed-point signal to a floating point signal specified by exponent and mantissa width. The output is rounded to nearest even when applicable and set to infinity if the input exceed the representable range.
+
+## FloatToFixed
+
+This component converts a floating-point signal to a signed fixed-point signal. Infinities and NaN's are not supported. The integer and fraction widths are auto-calculated to achieve lossless conversion.
+
+## Float8ToFixed
+
+This component converts an 8-bit floating-point (FP8) representation (E5M2 or E4M3) to a signed fixed-point representation. It allows the same hardware to be used for both FP8 formats. Therefore, both input and output are of type `Logic` and can be cast from/to floating point/fixed point by the producer/consumer based on the selected `mode`. Infinities and NaN's are not supported. The output width is 33 to accommodate E5M2 without loss.
diff --git a/lib/src/arithmetic/arithmetic.dart b/lib/src/arithmetic/arithmetic.dart
@@ -7,6 +7,7 @@ export 'carry_save_mutiplier.dart';
 export 'compound_adder.dart';
 export 'divider.dart';
 export 'fixed_to_float.dart';
+export 'float_to_fixed.dart';
 export 'floating_point/floating_point.dart';
 export 'multiplier.dart';
 export 'multiplier_lib.dart';

diff --git a/lib/src/arithmetic/float_to_fixed.dart b/lib/src/arithmetic/float_to_fixed.dart
@@ -0,0 +1,118 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: BSD-3-Clause
+//
+// float_to_fixed.dart
+// Transform floating point input signals to fixed point signals.
+//
+// 2024 November 1
+// Author: Soner Yaldiz <[email protected]>
+
+import 'package:rohd/rohd.dart';
+import 'package:rohd_hcl/rohd_hcl.dart';
+
+/// [FloatToFixed] converts a floating point input to a signed
+/// fixed-point output following Q notation (Qm.n format) as introduced by
+/// [Texas Instruments](https://www.ti.com/lit/ug/spru565b/spru565b.pdf).
+/// Infinities and NaN's are not supported. Conversion is lossless.
+/// The output is in two's complement and in Qm.n format where:
+/// - m = e_max - bias + 1
+/// - n = mantissa + bias - 1
+///
+/// The output port is `1 + m + n` bits wide: one sign bit, [m] integer bits
+/// and [n] fractional bits.
+class FloatToFixed extends Module {
+  /// Width of output integer part.
+  late final int m;
+
+  /// Width of output fractional part.
+  late final int n;
+
+  /// Internal representation of the output port.
+  ///
+  /// Created lazily so that [m] and [n] are already assigned when it is
+  /// first read inside the constructor.
+  late final FixedPoint _fixed = FixedPoint(signed: true, m: m, n: n);
+
+  /// Output fixed point port.
+  late final FixedPoint fixed = _fixed.clone()..gets(output('fixed'));
+
+  /// Builds a converter for [float], sizing the output from its exponent and
+  /// mantissa widths.
+  FloatToFixed(FloatingPoint float, {super.name = 'FloatToFixed'}) {
+    float = float.clone()..gets(addInput('float', float, width: float.width));
+
+    final bias = FloatingPointValue.computeBias(float.exponent.width);
+    // E4M3 expands the max exponent by 1.
+    final isE4M3 = float.exponent.width == 4 && float.mantissa.width == 3;
+    // The largest shift below is `exponent - 1`, which moves the implicit bit
+    // up to bit `mantissa + e_max - 1`. The integer part therefore needs
+    // `bias + 1` bits (one more for E4M3) so that this bit never lands on the
+    // sign bit. This yields Q16.16 for E5M2 and Q9.9 for E4M3.
+    m = isE4M3 ? bias + 2 : bias + 1;
+    n = bias + float.mantissa.width - 1;
+    final outputWidth = m + n + 1;
+
+    // Implicit leading mantissa bit: 1 for normal numbers, 0 otherwise.
+    final jBit = Logic(name: 'jBit')..gets(float.isNormal());
+    // Normal numbers are shifted by `exponent - 1`; others are not shifted.
+    final shift = Logic(name: 'shift', width: float.exponent.width)
+      ..gets(
+          mux(jBit, float.exponent - 1, Const(0, width: float.exponent.width)));
+
+    // Magnitude: {jBit, mantissa} zero-padded to the output width, then
+    // aligned by the exponent-derived shift.
+    final number = Logic(name: 'number', width: outputWidth)
+      ..gets([
+            Const(0, width: outputWidth - float.mantissa.width - 1),
+            jBit,
+            float.mantissa
+          ].swizzle() <<
+          shift);
+
+    // Two's complement negation when the sign bit is set.
+    _fixed <= mux(float.sign, ~number + 1, number);
+    addOutput('fixed', width: outputWidth) <= _fixed;
+  }
+}
+
+/// [Float8ToFixed] converts an 8-bit floating point (FP8) input
+/// to a signed fixed-point output following Q notation (Qm.n) as introduced by
+/// [Texas Instruments](https://www.ti.com/lit/ug/spru565b/spru565b.pdf).
+/// FP8 input must follow E4M3 or E5M2 as described in
+/// [FP8 formats for deep learning](https://arxiv.org/pdf/2209.05433).
+/// This component offers re-using the same hardware for both FP8 formats.
+/// Infinities and NaN's are not supported.
+/// The output is a 33-bit [Logic] in two's complement.
+/// It can be cast to a [FixedPoint] by the consumer based on the mode.
+///
+/// If `mode` is true:
+/// - Input is treated as E4M3 and converted to Q9.9. The upper output bits
+///   repeat the sign, so the full port can also be read as Q23.9
+///   (see [q23p9]).
+/// - `fixed[17:9]` contains integer part
+/// - `fixed[8:0]` contains fractional part
+///
+/// Otherwise:
+/// - Input is treated as E5M2 and converted to Q16.16 (see [q16p16]).
+/// - `fixed[31:16]` contains integer part
+/// - `fixed[15:0]` contains fractional part
+class Float8ToFixed extends Module {
+  /// Output port [fixed]
+  Logic get fixed => output('fixed');
+
+  /// The [fixed] output viewed as signed Q23.9 (for E4M3 inputs).
+  FixedPoint get q23p9 => FixedPoint.of(fixed, signed: true, m: 23, n: 9);
+
+  /// The [fixed] output viewed as signed Q16.16 (for E5M2 inputs).
+  FixedPoint get q16p16 => FixedPoint.of(fixed, signed: true, m: 16, n: 16);
+
+  /// Builds the converter for the 8-bit [float] input, where [mode] selects
+  /// E4M3 (true) or E5M2 (false).
+  ///
+  /// Throws a [RohdHclException] if [float] is not 8 bits wide.
+  Float8ToFixed(Logic float, Logic mode, {super.name = 'Float8ToFixed'}) {
+    // Reject a wrongly sized input before any port is created.
+    if (float.width != 8) {
+      throw RohdHclException('Input width must be 8.');
+    }
+
+    float = addInput('float', float, width: float.width);
+    mode = addInput('mode', mode);
+    addOutput('fixed', width: 33);
+
+    // Exponent field widened to 5 bits: E4M3 uses bits [6:3] with a leading
+    // zero, E5M2 uses bits [6:2].
+    final exponent = Logic(name: 'exponent', width: 5)
+      ..gets(mux(
+          mode, [Const(0), float.slice(6, 3)].swizzle(), float.slice(6, 2)));
+
+    // Implicit leading mantissa bit: set whenever the exponent is non-zero.
+    final jBit = Logic(name: 'jBit')..gets(exponent.or());
+
+    // 4-bit significand: {jBit, m2, m1, m0} for E4M3, {0, jBit, m1, m0} for
+    // E5M2.
+    final mantissa = Logic(name: 'mantissa', width: 4)
+      ..gets(mux(mode, [jBit, float.slice(2, 0)].swizzle(),
+          [Const(0), jBit, float.slice(1, 0)].swizzle()));
+
+    // Non-zero exponents shift by `exponent - 1`; a zero exponent does not
+    // shift.
+    final shift = Logic(name: 'shift', width: exponent.width)
+      ..gets(mux(jBit, exponent - 1, Const(0, width: exponent.width)));
+
+    // Magnitude: significand zero-padded to 33 bits, aligned by the shift.
+    final number = Logic(name: 'number', width: 33)
+      ..gets([Const(0, width: 29), mantissa].swizzle() << shift);
+
+    // Two's complement negation when the FP8 sign bit (MSB) is set.
+    fixed <= mux(float[float.width - 1], ~number + 1, number);
+  }
+}
diff --git a/lib/src/component_config/components/component_registry.dart b/lib/src/component_config/components/component_registry.dart
@@ -30,5 +30,7 @@ List<Configurator> get componentRegistry => [
       CompressionTreeMultiplierConfigurator(),
       ExtremaConfigurator(),
       CompoundAdderConfigurator(),
-      FixedToFloatConfigurator()
+      FixedToFloatConfigurator(),
+      FloatToFixedConfigurator(),
+      Float8ToFixedConfigurator()
     ];
diff --git a/lib/src/component_config/components/components.dart b/lib/src/component_config/components/components.dart
@@ -10,6 +10,8 @@ export 'config_extrema.dart';
 export 'config_fifo.dart';
 export 'config_find.dart';
 export 'config_fixed_to_float.dart';
+export 'config_float8_to_fixed.dart';
+export 'config_float_to_fixed.dart';
 export 'config_floating_point_adder_round.dart';
 export 'config_one_hot.dart';
 export 'config_parallel_prefix_adder.dart';

diff --git a/lib/src/component_config/components/config_float8_to_fixed.dart b/lib/src/component_config/components/config_float8_to_fixed.dart
@@ -0,0 +1,25 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: BSD-3-Clause
+//
+// config_float8_to_fixed.dart
+// Configurator for a Float8ToFixed converter.
+//
+// 2024 November 1
+// Author: Soner Yaldiz <[email protected]>
+
+import 'dart:collection';
+
+import 'package:rohd/rohd.dart';
+import 'package:rohd_hcl/rohd_hcl.dart';
+
+/// A [Configurator] that builds a [Float8ToFixed] converter.
+///
+/// The generated module takes no parameters, so [knobs] is empty.
+class Float8ToFixedConfigurator extends Configurator {
+  @override
+  late final Map<String, ConfigKnob<dynamic>> knobs = UnmodifiableMapView({});
+
+  @override
+  final String name = 'FP8 To Fixed Converter';
+
+  @override
+  Module createModule() {
+    // An 8-bit FP8 input and a single-bit format-select signal.
+    final float = Logic(width: 8);
+    final mode = Logic();
+    return Float8ToFixed(float, mode);
+  }
+}
diff --git a/lib/src/component_config/components/config_float_to_fixed.dart b/lib/src/component_config/components/config_float_to_fixed.dart
@@ -0,0 +1,36 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: BSD-3-Clause
+//
+// config_float_to_fixed.dart
+// Configurator for a FloatToFixed converter.
+//
+// 2024 November 1
+// Author: Soner Yaldiz <[email protected]>
+
+import 'dart:collection';
+
+import 'package:rohd/rohd.dart';
+import 'package:rohd_hcl/rohd_hcl.dart';
+
+/// A [Configurator] that builds a [FloatToFixed] converter from the
+/// exponent and mantissa widths chosen through its knobs.
+class FloatToFixedConfigurator extends Configurator {
+  @override
+  final String name = 'Float To Fixed Converter';
+
+  /// Width of the input exponent (defaults to 8), must be greater than 0.
+  final IntConfigKnob exponentWidthKnob = IntConfigKnob(value: 8);
+
+  /// Width of the input mantissa (defaults to 23), must be greater than 0.
+  final IntConfigKnob mantissaWidthKnob = IntConfigKnob(value: 23);
+
+  @override
+  late final Map<String, ConfigKnob<dynamic>> knobs = UnmodifiableMapView({
+    'Input exponent width': exponentWidthKnob,
+    'Input mantissa width': mantissaWidthKnob,
+  });
+
+  @override
+  Module createModule() {
+    // The floating-point input is shaped by the current knob values.
+    final float = FloatingPoint(
+        exponentWidth: exponentWidthKnob.value,
+        mantissaWidth: mantissaWidthKnob.value);
+    return FloatToFixed(float);
+  }
+}
diff --git a/test/arithmetic/float_to_fixed_test.dart b/test/arithmetic/float_to_fixed_test.dart
@@ -0,0 +1,67 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: BSD-3-Clause
+//
+// float_to_fixed_test.dart
+// Test floating point to fixed point conversion.
+//
+// 2024 November 1
+// Author: Soner Yaldiz <[email protected]>
+
+import 'dart:math';
+import 'package:rohd/rohd.dart';
+import 'package:rohd_hcl/rohd_hcl.dart';
+import 'package:test/test.dart';
+
+void main() async {
+  // Number of distinct 8-bit encodings visited by each exhaustive sweep.
+  final encodings = pow(2, 8);
+
+  // Sweeps every E5M2 encoding through FloatToFixed and compares the output
+  // bits with a reference built from the converter's own m/n widths.
+  test('E5M2 to Q16.16 exhaustive', () async {
+    final float = FloatingPoint(exponentWidth: 5, mantissaWidth: 2);
+    final dut = FloatToFixed(float);
+    await dut.build();
+    for (var bits = 0; bits < encodings; bits++) {
+      final fpv = FloatingPointValue.ofLogicValue(
+          5, 2, LogicValue.ofInt(bits, float.width));
+      // Encodings reported as NaN are not supported by the converter.
+      if (fpv.isNaN()) {
+        continue;
+      }
+      float.put(fpv);
+      final expected = FixedPointValue.ofDouble(fpv.toDouble(),
+          signed: true, m: dut.m, n: dut.n);
+      expect(dut.fixed.value.bitString, expected.value.bitString);
+    }
+  });
+
+  // Sweeps every encoding through Float8ToFixed in both modes, checking the
+  // raw output and the matching FixedPoint view.
+  test('FP8toINT: exhaustive', () async {
+    final float = Logic(width: 8);
+    final mode = Logic();
+    final dut = Float8ToFixed(float, mode);
+    await dut.build();
+
+    // E4M3
+    mode.put(1);
+    for (var bits = 0; bits < encodings; bits++) {
+      final fp8 = FloatingPointValue.ofLogicValue(
+          4, 3, LogicValue.ofInt(bits, float.width));
+      if (fp8.isNaN()) {
+        continue;
+      }
+      float.put(fp8.value);
+      final expected =
+          FixedPointValue.ofDouble(fp8.toDouble(), signed: true, m: 23, n: 9);
+      expect(dut.fixed.value.bitString, expected.value.bitString);
+      expect(dut.q23p9.value, expected.value);
+    }
+
+    // E5M2
+    mode.put(0);
+    for (var bits = 0; bits < encodings; bits++) {
+      final fp8 = FloatingPointValue.ofLogicValue(
+          5, 2, LogicValue.ofInt(bits, float.width));
+      if (fp8.isNaN()) {
+        continue;
+      }
+      float.put(fp8.value);
+      final expected = FixedPointValue.ofDouble(fp8.toDouble(),
+          signed: true, m: 16, n: 16);
+      expect(dut.fixed.value.bitString, expected.value.bitString);
+      expect(dut.q16p16.value, expected.value);
+    }
+  });
+}