Skip to content

Commit

Permalink
Merge pull request #24 from Dequino/pulp-trainlib-dev
Browse files Browse the repository at this point in the history
Pulp trainlib dev
  • Loading branch information
dnadalini authored Jan 11, 2024
2 parents e519411 + 8a2e2e1 commit b30b419
Show file tree
Hide file tree
Showing 26 changed files with 2,673 additions and 74 deletions.
21 changes: 19 additions & 2 deletions lib/include/pulp_act_fp32.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@

/**
* Activation functions configuration structure
*/
*/


/**
* @brief Structure for activation functions
Expand All @@ -46,6 +47,8 @@ struct softmax_args{
int n_heads;
float * global_max;
float * partial_exp_sum;
float * maxes;
float * sums;
};


Expand Down Expand Up @@ -85,19 +88,33 @@ void pulp_softmax_fp32_fw_cl( void * act_args );
void pulp_softmax_fp32_bw_cl( void * act_args );

/**
* @brief Forward pass function, second version using partial algorithm.
* @brief Forward pass function, second version using partial algorithm
* @param input Input for softmax.
* @param output Output of softmax.
*/
void pulp_partial_softmax_fp32_fw_cl( void * act_args );

/**
* @brief Forward pass function, second version using partial algorithm
* @param input Input for softmax.
* @param output Output of softmax.
*/
void pulp_partial_softmax_simple_fp32_fw_cl( void * act_args );

/**
* @brief Forward pass function, second version using partial algorithm.
* @param input Input for softmax.
* @param output Output of softmax.
*/
void pulp_partial_softmax_shift_fp32_fw_cl( void * act_args );

/**
* @brief Forward pass function, third version using the partial algorithm and a Taylor approximation.
* @param input Input for softmax.
* @param output Output of softmax.
*/
void pulp_partial_softmax_approximate_fp32_fw_cl(void * act_args);

/**
* @brief Forward pass function that parallelizes the fastertanh function (below).
* @param pointer to a tanh_args struct
Expand Down
9 changes: 9 additions & 0 deletions lib/include/pulp_mhsa_fp32.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ struct Mhsa_args {
struct blob * softmax_buffer;
float * global_max;
float * partial_exp_sum;
float * maxes;
float * sums;
};


Expand All @@ -85,6 +87,13 @@ void pulp_mhsa_fp32_fw_cl(void * Mhsa_args);
void pulp_mhsa_fp32_fw_cl_2(void * Mhsa_args);


/**
* @brief Forward pass function, forked on PULP cluster, using partial softmax.
* @param Mhsa_args structure configuring the MHSA layer.
*/
void pulp_mhsa_fp32_fw_cl_3(void * Mhsa_args);


// BACKWARD FUNCTIONS

/**
Expand Down
80 changes: 80 additions & 0 deletions lib/include/pulp_train_utils_fp32.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,21 @@
#include "pmsis.h"
#include "pulp_train_defines.h"

/**
 * Constants for the truncated Taylor expansion of 1/2^x.
 *
 * LOG2 is ln(2) and LOG2_k is ln(2)^k.
 * T1..T5 look like the 1/k! series weights cut to a few digits
 * (e.g. T3 = 0.16 vs 1/6, T5 = 0.008 vs 1/120) -- NOTE(review): confirm the
 * truncation is intentional before "correcting" them.
 */

#define LOG2    0.6931471805599453f   /* ln(2)   */
#define LOG2_2  0.4804530139182014f   /* ln(2)^2 */
#define LOG2_3  0.3330246519889294f   /* ln(2)^3 */
#define LOG2_4  0.2308350985830834f   /* ln(2)^4 */
#define LOG2_5  0.1600026977571413f   /* ln(2)^5 */
#define T1      1.0f                  /* 1/1!          */
#define T2      0.5f                  /* 1/2!          */
#define T3      0.16f                 /* ~1/3!, rounded */
#define T4      0.0416f               /* ~1/4!, rounded */
#define T5      0.008f                /* ~1/5!, rounded */

/**
* =====> BACKEND STRUCTURES <=====
*/
Expand Down Expand Up @@ -305,11 +320,13 @@ struct update_weight_args{
* @param input input vector on which we want to find the max
* @param maxes vector on which each core saves the max they have found
* @param dim dimension of input
* @param dim2 dimension of input^2
*/
struct max_args {
    float *input;   /* vector in which the maximum is searched */
    float *maxes;   /* vector where each core saves the maximum it found */
    int    dim;     /* dimension of input */
    int    dim2;    /* "dimension of input^2" -- presumably dim * dim, confirm with callers */
};

/**
Expand All @@ -328,6 +345,24 @@ struct exp_sum_args{
float max;
};

/**
 * @brief Argument bundle for the parallelized exponential-and-sum step on an input vector
 * @param input   input vector whose elements are exponentiated and summed
 * @param sums    vector in which each core stores its own sum
 * @param output  vector that receives the exponentials
 * @param dim     dimension of input
 * @param dim2    dimension of input^2
 * @param maxes   maximum value for each row of the input map
 */
struct shift_sum_args {
    float *input;
    float *sums;
    float *output;
    int    dim;
    int    dim2;
    float *maxes;
};

/**
* @brief Arguments for implementing parallelized division of an input vector and a scalar
* @param input input vector we want to divide
Expand All @@ -340,6 +375,20 @@ struct div_args{
int dim;
};

/**
 * @brief Argument bundle for the parallelized division of an input vector by a vector of divisors
 * @param input  input vector to be divided
 * @param sums   values the input vector is divided by
 * @param dim    dimension of input
 * @param dim2   dimension of input^2
 */
struct row_div_args {
    float *input;
    float *sums;
    int    dim;
    int    dim2;
};

/**
* @brief Arguments for implementing parallelized multiplication of an input vector and a scalar
* @param input input vector we want to multiply
Expand Down Expand Up @@ -440,26 +489,57 @@ void softmax (void * void_args);
*/
void pulp_max_fp32_cl(void * void_args);

/**
* @brief Calculate the maxes for each row of a square matrix in parallelized fashion
* @param (void *) (struct max_args void_args)
*/
void pulp_row_max_fp32_cl(void * void_args);

/**
* @brief Calculate the exponential of each element and sum them
* @param (void *) (struct exp_sum_args void_args)
*/
void pulp_exp_sum_fp32_cl(void* void_args);

/**
* @brief Calculate the 1/2^diff of each element and sum them
* @param (void *) (struct shift_sum_args void_args)
*/
void pulp_shift_sum_fp32_cl(void* void_args);

/**
* @brief Element-wise division of vector with a single constant
* @param (void *) (struct div_args void_args)
*/
void pulp_div_fp32_cl(void* void_args);

/**
* @brief Element-wise division of vector with values obtained by shift_sum
* @param (void *) (struct row_div_args void_args)
*/
void pulp_row_div_fp32_cl(void* void_args);

/**
* @brief Element-wise multiplication of vector with a single constant
* @param (void *) (struct scalar_mul_args void_args)
*/
void pulp_scalar_mul_fp32_cl(void* void_args);

float threshold(float x);

static inline float
fasterexp (float p);

static inline float
fasterpow2 (float p);

/*
 * The Taylor-expansion constants (LOG2, LOG2_2..LOG2_5, T1..T5) are defined
 * once, right after the includes at the top of this header. They are
 * deliberately not repeated here: a second copy compiles only while it stays
 * token-identical to the first, so any later edit to one copy would trigger
 * macro-redefinition diagnostics in every translation unit.
 */
23 changes: 15 additions & 8 deletions lib/sources/pulp_act_fp16.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
*/

/**
* Authors: Davide Nadalini, Leonardo Ravaglia
* Authors: Davide Nadalini, Leonardo Ravaglia, Alberto Dequino
*/

#include "pulp_train_utils_fp16.h"
Expand Down Expand Up @@ -62,11 +62,14 @@ void pulp_softmax_fp16_fw_cl( void * act_args_fp16 )
const int stop = start + blockSize > args_tanh->dim ? args_tanh->dim : start+blockSize;
*/

fp16 sum = 0.0;
fp16 sum2 = 0.0;
fp16 max = 0.0;
fp16 maxes[NUM_CORES] = {0.0};
fp16 sums[NUM_CORES] = {0.0};
short s = 0;
fp16 zero = (fp16) s;

fp16 sum = zero;
fp16 sum2 = zero;
fp16 max = zero;
fp16 maxes[NUM_CORES] = {zero};
fp16 sums[NUM_CORES] = {zero};


struct max_args_fp16 m_args;
Expand Down Expand Up @@ -109,10 +112,14 @@ void pulp_softmax_fp16_bw_cl( void * act_args_fp16 )
fp16* inDiff = args->input->diff;
fp16* outData = args->output->data;
fp16* outDiff = args->output->diff;
fp16 sum = 0.0;

short s = 0;
fp16 zero = (fp16) s;

fp16 sum = zero;

for(int j = 0; j < dim; j++){ // Cycle over the elements of the i-th head buffer
fp16 sum = 0.0;
fp16 sum = zero;
const fp16 neg_sft_j = -(outData)[j];
for(int z = 0; z < dim; ++z){ // Softmax involves all the elements of the i-th head buffer
fp16 mul = (outDiff)[z] * (outData)[z] * neg_sft_j;
Expand Down
Loading

0 comments on commit b30b419

Please sign in to comment.