Merge pull request #24 from Dequino/pulp-trainlib-dev

Pulp trainlib dev
pulp-platform · Jan 11, 2024 · b30b419 · b30b419
2 parents e519411 + 8a2e2e1
commit b30b419
Show file tree

Hide file tree

Showing 26 changed files with 2,673 additions and 74 deletions.
diff --git a/lib/include/pulp_act_fp32.h b/lib/include/pulp_act_fp32.h
@@ -20,7 +20,8 @@
 
 /**
  * Activation functions configuration structure
- */
+ */  
+
 
 /**
  * @brief Structure for activation functions
@@ -46,6 +47,8 @@ struct softmax_args{
   int n_heads;
   float * global_max;
   float * partial_exp_sum;
+  float * maxes;
+  float * sums;
 };
 
 
@@ -85,19 +88,33 @@ void pulp_softmax_fp32_fw_cl( void * act_args );
 void pulp_softmax_fp32_bw_cl( void * act_args );
 
 /**
- * @brief Forward pass function, second version using partial algorithm.
+ * @brief Forward pass function, second version using partial algorithm
  * @param input Input for softmax.
  * @param output Output of softmax.
 */
 void pulp_partial_softmax_fp32_fw_cl( void * act_args );
 
+/**
+ * @brief Forward pass function, second version using partial algorithm
+ * @param input Input for softmax.
+ * @param output Output of softmax.
+*/
+void pulp_partial_softmax_simple_fp32_fw_cl( void * act_args );
+
 /**
  * @brief Forward pass function, second version using partial algorithm.
  * @param input Input for softmax.
  * @param output Output of softmax.
 */
 void pulp_partial_softmax_shift_fp32_fw_cl( void * act_args );
 
+/**
+ * @brief Forward pass function, third version using partial algorithm and Taylor approximation.
+ * @param input Input for softmax.
+ * @param output Output of softmax.
+*/
+void pulp_partial_softmax_approximate_fp32_fw_cl(void * act_args);
+
 /**
  * @brief Forward pass function that parallelizes the fastertanh function (below).
  * @param pointer to a tanh_args struct

diff --git a/lib/include/pulp_mhsa_fp32.h b/lib/include/pulp_mhsa_fp32.h
@@ -60,6 +60,8 @@ struct Mhsa_args {
     struct blob * softmax_buffer;
     float * global_max;
     float * partial_exp_sum;
+    float * maxes;
+    float * sums;
 };
 
 
@@ -85,6 +87,13 @@ void pulp_mhsa_fp32_fw_cl(void * Mhsa_args);
 void pulp_mhsa_fp32_fw_cl_2(void * Mhsa_args);
 
 
+/**
+ * @brief Forward pass function, forked on PULP cluster, using partial softmax.
+ * @param Mhsa_args structure configuring the MHSA layer.
+ */
+void pulp_mhsa_fp32_fw_cl_3(void * Mhsa_args);
+
+
 // BACKWARD FUNCTIONS
 
 /**

diff --git a/lib/include/pulp_train_utils_fp32.h b/lib/include/pulp_train_utils_fp32.h
@@ -21,6 +21,21 @@
 #include "pmsis.h"
 #include "pulp_train_defines.h"
 
+/**
+ * Constants for the Taylor expansion of 1/2^x
+ */
+
+#define LOG2    0.6931471805599453f
+#define LOG2_2  0.4804530139182014f
+#define LOG2_3  0.3330246519889294f
+#define LOG2_4  0.2308350985830834f
+#define LOG2_5  0.1600026977571413f
+#define T1      1.0f
+#define T2      0.5f
+#define T3      0.16f
+#define T4      0.0416f
+#define T5      0.008f   
+
 /**
  * =====> BACKEND STRUCTURES <=====
  */
@@ -305,11 +320,13 @@ struct update_weight_args{
  * @param input   input vector on which we want to find the max
  * @param maxes   vector on which each core saves the max they have found
  * @param dim     dimension of input
+ * @param dim2    dimension of input^2
 */
 struct max_args{
   float* input;
   float* maxes;
   int dim;
+  int dim2;
 };
 
 /**
@@ -328,6 +345,24 @@ struct exp_sum_args{
   float max;
 };
 
+/**
+ * @brief Arguments for implementing parallelized exponential and sum on an input vector
+ * @param input   input vector on which we want to calculate the exponential and summatory
+ * @param sums    vector on which each core saves their sum
+ * @param output  vector where the exponential is saved
+ * @param dim     dimension of input
+ * @param dim2    dimension of input^2
+ * @param maxes   maximum value for each row of the input map
+*/
+struct shift_sum_args{
+  float* input;
+  float* sums;
+  float* output;
+  int dim;
+  int dim2;
+  float* maxes;
+};
+
 /**
  * @brief Arguments for implementing parallelized division of an input vector and a scalar
  * @param input   input vector we want to divide
@@ -340,6 +375,20 @@ struct div_args{
   int dim;
 };
 
+/**
+ * @brief Arguments for implementing parallelized division of an input vector and a vector
+ * @param input   input vector we want to divide
+ * @param sums    values we want to divide the vector with
+ * @param dim     dimension of input
+ * @param dim2    dimension of input^2
+*/
+struct row_div_args{
+  float* input;
+  float* sums;
+  int dim;
+  int dim2;
+};
+
 /**
  * @brief Arguments for implementing parallelized multiplication of an input vector and a scalar
  * @param input   input vector we want to multiply
@@ -440,26 +489,57 @@ void softmax (void * void_args);
  */
 void pulp_max_fp32_cl(void * void_args);
 
+/**
+ * @brief Calculate the maxes for each row of a square matrix in parallelized fashion
+ * @param (void *)  (struct max_args void_args)
+ */
+void pulp_row_max_fp32_cl(void * void_args);
+
 /**
  * @brief Calculate the exponential of each element and sum them
  * @param (void *)  (struct exp_sum_args void_args)
  */
 void pulp_exp_sum_fp32_cl(void* void_args);
 
+/**
+ * @brief Calculate the 1/2^diff of each element and sum them
+ * @param (void *)  (struct shift_sum_args void_args)
+ */
+void pulp_shift_sum_fp32_cl(void* void_args);
+
 /**
  * @brief Element-wise division of vector with a single constant
  * @param (void *)  (struct div_args void_args)
  */
 void pulp_div_fp32_cl(void* void_args);
 
+/**
+ * @brief Element-wise division of a vector by the values obtained by shift_sum
+ * @param (void *)  (struct row_div_args void_args)
+ */
+void pulp_row_div_fp32_cl(void* void_args);
+
 /**
  * @brief Element-wise multiplication of vector with a single constant
  * @param (void *)  (struct scalar_mul_args void_args)
  */
 void pulp_scalar_mul_fp32_cl(void* void_args);
 
+float threshold(float x);
+
 static inline float
 fasterexp (float p);
 
 static inline float
 fasterpow2 (float p);
+
+#define LOG2    0.6931471805599453f
+#define LOG2_2  0.4804530139182014f
+#define LOG2_3  0.3330246519889294f
+#define LOG2_4  0.2308350985830834f
+#define LOG2_5  0.1600026977571413f
+#define T1      1.0f
+#define T2      0.5f
+#define T3      0.16f
+#define T4      0.0416f
+#define T5      0.008f   
diff --git a/lib/sources/pulp_act_fp16.c b/lib/sources/pulp_act_fp16.c
@@ -15,7 +15,7 @@
  */
 
 /**
- * Authors: Davide Nadalini, Leonardo Ravaglia
+ * Authors: Davide Nadalini, Leonardo Ravaglia, Alberto Dequino
 */ 
 
 #include "pulp_train_utils_fp16.h"
@@ -62,11 +62,14 @@ void pulp_softmax_fp16_fw_cl( void * act_args_fp16 )
   const int stop = start + blockSize > args_tanh->dim ? args_tanh->dim : start+blockSize;
   */
 
-  fp16 sum = 0.0;
-  fp16 sum2 = 0.0;
-  fp16 max = 0.0;
-  fp16 maxes[NUM_CORES] = {0.0};
-  fp16 sums[NUM_CORES] = {0.0};
+  short s = 0;
+  fp16 zero = (fp16) s;
+
+  fp16 sum = zero;
+  fp16 sum2 = zero;
+  fp16 max = zero;
+  fp16 maxes[NUM_CORES] = {zero};
+  fp16 sums[NUM_CORES] = {zero};
 
 
   struct max_args_fp16 m_args;
@@ -109,10 +112,14 @@ void pulp_softmax_fp16_bw_cl( void * act_args_fp16 )
   fp16* inDiff = args->input->diff;
   fp16* outData = args->output->data;
   fp16* outDiff = args->output->diff;
-  fp16 sum = 0.0;
+
+  short s = 0;
+  fp16 zero = (fp16) s;
+
+  fp16 sum = zero;
 
   for(int j = 0; j < dim; j++){ // Cycle over the elements of the i-th head buffer
-      fp16 sum = 0.0;
+      fp16 sum = zero;
       const fp16 neg_sft_j  =  -(outData)[j]; 
       for(int z = 0; z < dim; ++z){ // Softmax involves all the elements of the i-th head buffer
           fp16 mul =  (outDiff)[z] * (outData)[z] * neg_sft_j;