From 3934d10f219187fdee3e45ac6970baddec85b6cd Mon Sep 17 00:00:00 2001 From: mrwgx3 <23343823+mrwgx3@users.noreply.github.com> Date: Mon, 12 Mar 2018 12:28:04 -0600 Subject: [PATCH] Correct millis() drift, issue #3078 (#4264) * Correct millis() drift, issue 3078 * Add 'test_millis_mm.ino' runtime benchmark * Eliminate 'punning' warning Add union 'acc' in millis() to eliminate 'punning' compiler warning * Correct minor typo * Eliminate 'punning' warning Add union 'acc' to eliminate 'punning' compiler warning in 'millis_test'_DEBUG() and 'millis_test()' --- cores/esp8266/core_esp8266_wiring.c | 132 ++++- .../device/test_millis_mm/test_millis_mm.ino | 491 ++++++++++++++++++ 2 files changed, 618 insertions(+), 5 deletions(-) create mode 100644 tests/device/test_millis_mm/test_millis_mm.ino diff --git a/cores/esp8266/core_esp8266_wiring.c b/cores/esp8266/core_esp8266_wiring.c index 4b55a711e0..3d91043d34 100644 --- a/cores/esp8266/core_esp8266_wiring.c +++ b/cores/esp8266/core_esp8266_wiring.c @@ -61,11 +61,133 @@ void micros_overflow_tick(void* arg) { micros_at_last_overflow_tick = m; } -unsigned long ICACHE_RAM_ATTR millis() { - uint32_t m = system_get_time(); - uint32_t c = micros_overflow_count + ((m < micros_at_last_overflow_tick) ? 1 : 0); - return c * 4294967 + m / 1000; -} +//--------------------------------------------------------------------------- +// millis() 'magic multiplier' approximation +// +// This function corrects the cumlative (296us / usec overflow) drift +// seen in the orignal 'millis()' function. +// +// Input: +// 'm' - 32-bit usec counter, 0 <= m <= 0xFFFFFFFF +// 'c' - 32-bit usec overflow counter 0 <= c < 0x00400000 +// Output: +// Returns milliseconds in modulo 0x1,0000,0000 (0 to 0xFFFFFFFF) +// +// Notes: +// +// 1) This routine approximates the 64-bit integer division, +// +// quotient = ( 2^32 c + m ) / 1000, +// +// through the use of 'magic' multipliers. A slow division is replaced by +// a faster multiply using a scaled multiplicative inverse of the divisor: +// +// quotient =~ ( 2^32 c + m ) * k, where k = Ceiling[ 2^n / 1000 ] +// +// The precision difference between multiplier and divisor sets the +// upper-bound of the dividend which can be successfully divided. +// +// For this application, n = 64, and the divisor (1000) has 10-bits of +// precision. This sets the dividend upper-bound to (64 - 10) = 54 bits, +// and that of 'c' to (54 - 32) = 22 bits. This corresponds to a value +// for 'c' = 0x0040,0000 , or +570 years of usec counter overflows. +// +// 2) A distributed multiply with offset-summing is used find k( 2^32 c + m ): +// +// prd = (2^32 kh + kl) * ( 2^32 c + m ) +// = 2^64 kh c + 2^32 kl c + 2^32 kh m + kl m +// (d) (c) (b) (a) +// +// Graphically, the offset-sums align in little endian like this: +// LS -> MS +// 32 64 96 128 +// | a[-1] | a[0] | a[1] | a[2] | +// | m kl | 0 | 0 | a[-1] not needed +// | | m kh | | +// | | c kl | | a[1] holds the result +// | | | c kh | a[2] can be discarded +// +// As only the high-word of 'm kl' and low-word of 'c kh' contribute to the +// overall result, only (2) 32-bit words are needed for the accumulator. +// +// 3) As C++ does not intrinsically test for addition overflows, one must +// code specifically to detect them. This approximation skips these +// overflow checks for speed, hence the sum, +// +// highword( m kl ) + m kh + c kl < (2^64-1), MUST NOT OVERFLOW. +// +// To meet this criteria, not only do we have to pick 'k' to achieve our +// desired precision, we also have to split 'k' appropriately to avoid +// any addition overflows. +// +// 'k' should be also chosen to align the various products on byte +// boundaries to avoid any 64-bit shifts before additions, as they incur +// major time penalties. The 'k' chosen for this specific division by 1000 +// was picked primarily to avoid shifts as well as for precision. +// +// For the reasons list above, this routine is NOT a general one. +// Changing divisors could break the overflow requirement and force +// picking a 'k' split which requires shifts before additions. +// +// ** Test THOROUGHLY after making changes ** +// +// 4) Results of time benchmarks run on an ESP8266 Huzzah feather are: +// +// usec x Orig Comment +// Orig: 3.18 1.00 Original code +// Corr: 13.21 4.15 64-bit reference code +// Test: 4.60 1.45 64-bit magic multiply, 4x32 +// +// The magic multiplier routine runs ~3x faster than the reference. Execution +// times can vary considerably with the numbers being multiplied, so one +// should derate this factor to around 2x, worst case. +// +// Reference function: corrected millis(), 64-bit arithmetic, +// truncated to 32-bits by return +// unsigned long ICACHE_RAM_ATTR millis_corr_DEBUG( void ) +// { +// // Get usec system time, usec overflow conter +// ...... +// return ( (c * 4294967296 + m) / 1000 ); // 64-bit division is SLOW +// } //millis_corr +// +// 5) See this link for a good discussion on magic multipliers: +// http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html +// + +#define MAGIC_1E3_wLO 0x4bc6a7f0 // LS part +#define MAGIC_1E3_wHI 0x00418937 // MS part, magic multiplier + +unsigned long ICACHE_RAM_ATTR millis() +{ + union { + uint64_t q; // Accumulator, 64-bit, little endian + uint32_t a[2]; // ..........., 32-bit segments + } acc; + acc.a[1] = 0; // Zero high-acc + + // Get usec system time, usec overflow counter + uint32_t m = system_get_time(); + uint32_t c = micros_overflow_count + + ((m < micros_at_last_overflow_tick) ? 1 : 0); + + // (a) Init. low-acc with high-word of 1st product. The right-shift + // falls on a byte boundary, hence is relatively quick. + + acc.q = ( (uint64_t)( m * (uint64_t)MAGIC_1E3_wLO ) >> 32 ); + + // (b) Offset sum, low-acc + acc.q += ( m * (uint64_t)MAGIC_1E3_wHI ); + + // (c) Offset sum, low-acc + acc.q += ( c * (uint64_t)MAGIC_1E3_wLO ); + + // (d) Truncated sum, high-acc + acc.a[1] += (uint32_t)( c * (uint64_t)MAGIC_1E3_wHI ); + + return ( acc.a[1] ); // Extract result, high-acc + +} //millis unsigned long ICACHE_RAM_ATTR micros() { return system_get_time(); diff --git a/tests/device/test_millis_mm/test_millis_mm.ino b/tests/device/test_millis_mm/test_millis_mm.ino new file mode 100644 index 0000000000..4bc4c78e2e --- /dev/null +++ b/tests/device/test_millis_mm/test_millis_mm.ino @@ -0,0 +1,491 @@ +// +// Millis() Runtime Benchmarke +// +// Code to determine the runtime in 'usec' of various millis() +// functions. +// + +#include +#include +#include +#include + +// Include API-Headers +extern "C" { // SDK functions for Arduino IDE access +#include "osapi.h" +#include "user_interface.h" +#include "espconn.h" +} //end, 'extern C' + +//--------------------------------------------------------------------------- +// Globals +//--------------------------------------------------------------------------- + +// Here // In 'core_esp8266_wiring.c' +static os_timer_t us_ovf_timer; // 'micros_overflow_timer' +static uint32_t us_cnt = 0; // 'm' in 'micros_overflow_tick()' +static uint32_t us_ovflow = 0; // 'micros_overflow_count' +static uint32_t us_at_last_ovf = 0; // 'micros_at_last_overflow_tick' + +static bool fixed_systime = false; // Testing vars +static bool debugf = false; +static uint32_t us_systime = 0; +static float nsf = 0; // Normalization factor +static uint32_t cntref = 0; // Ref. comparision count + +//--------------------------------------------------------------------------- +// Interrupt code lifted directly from "cores/core_esp8266_wiring.c", +// with some variable renaming +//--------------------------------------------------------------------------- + +// Callback for usec counter overflow timer interrupt +void us_overflow_tick ( void* arg ) +{ + us_cnt = system_get_time(); + + // Check for usec counter overflow + if ( us_cnt < us_at_last_ovf ) { + ++us_ovflow; + } //end-if + us_at_last_ovf = us_cnt; + +} //us_overflow_tick + +//--------------------------------------------------------------------------- +#define REPEAT 1 +#define ONCE 0 + +void us_count_init ( void ) +{ + os_timer_setfn( &us_ovf_timer, (os_timer_func_t*)&us_overflow_tick, 0 ); + os_timer_arm( &us_ovf_timer, 60000, REPEAT ); + +} //us_count_init + +//--------------------------------------------------------------------------- +// Wrapper(s) for our benchmark +//--------------------------------------------------------------------------- +// Set a fixed value of usec system time +void set_systime ( uint32_t usec ) +{ + us_systime = usec; +} //set_systime + +//--------------------------------------------------------------------------- +// Wrapper to return a fixed system time +uint32_t system_get_timeA ( void ) +{ + return ( fixed_systime ? us_systime : system_get_time() ); +} //system_get_timeA + +//--------------------------------------------------------------------------- +// Functions to be tested +//--------------------------------------------------------------------------- +// Print integer list as hex +void viewhex( uint16_t *p, uint8_t n ) +{ + Serial.print( "0x" ); + for ( uint8_t i = 0; i < n; i++ ) + { + Serial.printf( "%04X ", p[ (n - 1) - i ] ); + } + +} //viewhex + +//--------------------------------------------------------------------------- +// Support routine for 'millis_test_DEBUG()' +// Print accumulator value along interm summed into it +void view_accsum ( const char *desc, uint16_t *acc, uint16_t *itrm ) +{ + Serial.print( "acc: " ); + viewhex( acc, 4 ); + Serial.printf( " %s = ", desc ); + viewhex( itrm, 4 ); + Serial.println(); + +} //view_accsum + +//--------------------------------------------------------------------------- +// FOR BENCHTEST +// Original millis() function +unsigned long ICACHE_RAM_ATTR millis_orig ( void ) +{ + // Get usec system time, usec overflow conter + uint32_t m = system_get_time(); + uint32_t c = us_ovflow + ((m < us_at_last_ovf) ? 1 : 0); + + return ( (c * 4294967) + m / 1000 ); + +} //millis_orig + +//--------------------------------------------------------------------------- +// FOR DEBUG +// Corrected millis(), 64-bit arithmetic gold standard +// truncated to 32-bits by return +unsigned long ICACHE_RAM_ATTR millis_corr_DEBUG( void ) +{ + // Get usec system time, usec overflow conter + uint32_t m = system_get_timeA(); // DEBUG + uint32_t c = us_ovflow + ((m < us_at_last_ovf) ? 1 : 0); + + return ( (c * 4294967296 + m) / 1000 ); + +} //millis_corr_DEBUG + +//--------------------------------------------------------------------------- +// FOR BENCHMARK +unsigned long ICACHE_RAM_ATTR millis_corr ( void ) +{ + // Get usec system time, usec overflow conter + uint32_t m = system_get_time(); + uint32_t c = us_ovflow + ((m < us_at_last_ovf) ? 1 : 0); + + return ( (c * 4294967296 + m) / 1000 ); + +} //millis_corr + +//--------------------------------------------------------------------------- +// FOR DEBUG +// millis() 'magic multiplier' approximation +// +// This function corrects the cumlative (296us / usec overflow) drift +// seen in the orignal 'millis()' function. +// +// Input: +// 'm' - 32-bit usec counter, 0 <= m <= 0xFFFFFFFF +// 'c' - 32-bit usec overflow counter 0 <= c < 0x00400000 +// Output: +// Returns milliseconds in modulo 0x1,0000,0000 (0 to 0xFFFFFFFF) +// +// Notes: +// +// 1) This routine approximates the 64-bit integer division, +// +// quotient = ( 2^32 c + m ) / 1000, +// +// through the use of 'magic' multipliers. A slow division is replaced by +// a faster multiply using a scaled multiplicative inverse of the divisor: +// +// quotient =~ ( 2^32 c + m ) * k, where k = Ceiling[ 2^n / 1000 ] +// +// The precision difference between multiplier and divisor sets the +// upper-bound of the dividend which can be successfully divided. +// +// For this application, n = 64, and the divisor (1000) has 10-bits of +// precision. This sets the dividend upper-bound to (64 - 10) = 54 bits, +// and that of 'c' to (54 - 32) = 22 bits. This corresponds to a value +// for 'c' = 0x0040,0000 , or +570 years of usec counter overflows. +// +// 2) A distributed multiply with offset-summing is used find k( 2^32 c + m ): +// +// prd = (2^32 kh + kl) * ( 2^32 c + m ) +// = 2^64 kh c + 2^32 kl c + 2^32 kh m + kl m +// (d) (c) (b) (a) +// +// Graphically, the offset-sums align in little endian like this: +// LS -> MS +// 32 64 96 128 +// | a[-1] | a[0] | a[1] | a[2] | +// | m kl | 0 | 0 | a[-1] not needed +// | | m kh | | +// | | c kl | | a[1] holds the result +// | | | c kh | a[2] can be discarded +// +// As only the high-word of 'm kl' and low-word of 'c kh' contribute to the +// overall result, only (2) 32-bit words are needed for the accumulator. +// +// 3) As C++ does not intrinsically test for addition overflows, one must +// code specifically to detect them. This approximation skips these +// overflow checks for speed, hence the sum, +// +// highword( m kl ) + m kh + c kl < (2^64-1), MUST NOT OVERFLOW. +// +// To meet this criteria, not only do we have to pick 'k' to achieve our +// desired precision, we also have to split 'k' appropriately to avoid +// any addition overflows. +// +// 'k' should be also chosen to align the various products on byte +// boundaries to avoid any 64-bit shifts before additions, as they incur +// major time penalties. The 'k' chosen for this specific division by 1000 +// was picked primarily to avoid shifts as well as for precision. +// +// For the reasons list above, this routine is NOT a general one. +// Changing divisors could break the overflow requirement and force +// picking a 'k' split which requires shifts before additions. +// +// ** Test THOROUGHLY after making changes ** +// +// 4) Results of time benchmarks run on an ESP8266 Huzzah feather are: +// +// usec x Orig Comment +// Orig: 3.18 1.00 Original code +// Corr: 13.21 4.15 64-bit reference code +// Test: 4.60 1.45 64-bit magic multiply, 4x32 +// +// The magic multiplier routine runs ~3x faster than the reference. Execution +// times can vary considerably with the numbers being multiplied, so one +// should derate this factor to around 2x, worst case. +// +// Reference function: corrected millis(), 64-bit arithmetic, +// truncated to 32-bits by return +// unsigned long ICACHE_RAM_ATTR millis_corr_DEBUG( void ) +// { +// // Get usec system time, usec overflow conter +// ...... +// return ( (c * 4294967296 + m) / 1000 ); // 64-bit division is SLOW +// } //millis_corr +// +// 5) See this link for a good discussion on magic multipliers: +// http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html +// + +#define M_USEC_MAX 0xFFFFFFFF +#define C_COUNT_MAX 0x00400000 + +#define MAGIC_1E3_wLO 0x4bc6a7f0 // LS part +#define MAGIC_1E3_wHI 0x00418937 // MS part, magic multiplier + +unsigned long ICACHE_RAM_ATTR millis_test_DEBUG ( void ) +{ + union { + uint64_t q; // Accumulator, 64-bit, little endian + uint32_t a[2]; // ..........., 32-bit segments + } acc; + acc.a[1] = 0; // Zero high-acc + + uint64_t prd; // Interm product + + // Get usec system time, usec overflow counter + uint32_t m = system_get_timeA(); + uint32_t c = us_ovflow + ((m < us_at_last_ovf) ? 1 : 0); + + // DEBUG: Show input vars + if ( debugf ) + Serial.printf( "Test m: 0x%08X c: 0x%08X\n", m, c ); + + // (a) Init. low-acc with high-word of 1st product. The right-shift + // falls on a byte boundary, hence is relatively quick. + + acc.q = ( (prd = (uint64_t)( m * (uint64_t)MAGIC_1E3_wLO )) >> 32 ); + + // DEBUG: Show both accumulator and interm product + if( debugf ) + view_accsum( "m kl", (uint16_t *)&acc.q, (uint16_t *)&prd ); + + // (b) Offset sum, low-acc + acc.q += ( prd = ( m * (uint64_t)MAGIC_1E3_wHI ) ); + + // DEBUG: Show both accumulator and interm product + if( debugf ) + view_accsum( "m kh", (uint16_t *)&acc.q, (uint16_t *)&prd ); + + // (c) Offset sum, low-acc + acc.q += ( prd = ( c * (uint64_t)MAGIC_1E3_wLO ) ); + + // DEBUG: Show both accumulator and interm product + if( debugf ) + view_accsum( "c kl", (uint16_t *)&acc.q, (uint16_t *)&prd ); + + // (d) Truncated sum, high-acc + acc.a[1] += (uint32_t)( prd = ( c * (uint64_t)MAGIC_1E3_wHI ) ); + + // DEBUG: Show both accumulator and interm product + if( debugf ) + view_accsum( "c kh", (uint16_t *)&acc.q, (uint16_t *)&prd ); + + return ( acc.a[1] ); // Extract result, high-acc + +} //millis_test_DEBUG + +//--------------------------------------------------------------------------- +// FOR BENCHTEST +unsigned long ICACHE_RAM_ATTR millis_test ( void ) +{ + union { + uint64_t q; // Accumulator, 64-bit, little endian + uint32_t a[2]; // ..........., 32-bit segments + } acc; + acc.a[1] = 0; // Zero high-acc + + // Get usec system time, usec overflow counter + uint32_t m = system_get_time(); + uint32_t c = us_ovflow + ((m < us_at_last_ovf) ? 1 : 0); + + // (a) Init. low-acc with high-word of 1st product. The right-shift + // falls on a byte boundary, hence is relatively quick. + + acc.q = ( (uint64_t)( m * (uint64_t)MAGIC_1E3_wLO ) >> 32 ); + + // (b) Offset sum, low-acc + acc.q += ( m * (uint64_t)MAGIC_1E3_wHI ); + + // (c) Offset sum, low-acc + acc.q += ( c * (uint64_t)MAGIC_1E3_wLO ); + + // (d) Truncated sum, high-acc + acc.a[1] += (uint32_t)( c * (uint64_t)MAGIC_1E3_wHI ); + + return ( acc.a[1] ); // Extract result, high-acc + +} //millis_test + +//--------------------------------------------------------------------------- +// Execution time benchmark +//--------------------------------------------------------------------------- + +// Print benchmark result +void millis_rtms_print ( const char *pream, uint32_t cntx, const char *desc ) +{ + Serial.print( pream ); + Serial.print( nsf * (float)cntx ); + Serial.print( " " ); + Serial.print( (float)cntx / (float)cntref ); + Serial.print( " " ); + Serial.println( desc ); + +} //millis_rtms_print + +//--------------------------------------------------------------------------- +void Millis_RunTimes ( void ) +{ + Serial.println(); + Serial.println( "Millis() RunTime Benchmarks" ); + + uint32_t lc = 100000; // Samples + nsf = 1 / float(lc); // Normalization (global) + + uint32_t bgn; + uint32_t cnto = 0, cntv = 0; + uint32_t cntcx = 0, cntc = 0; + uint32_t cntfx = 0, cntf = 0; + + // Setup timer values + fixed_systime = true; + us_ovflow = C_COUNT_MAX; + us_at_last_ovf = (M_USEC_MAX - (20 * 1000000)); // Max. less 20 sec + +// No printing, systime active + debugf = false; fixed_systime = false; + + for (uint32_t i = 0; i < lc; i++ ) + { + bgn = system_get_time(); + millis_orig(); + cnto += system_get_time() - bgn; + + bgn = system_get_time(); + millis(); + cntv += system_get_time() - bgn; + + bgn = system_get_time(); + millis_corr_DEBUG(); + cntcx += system_get_time() - bgn; + + bgn = system_get_time(); + millis_corr(); + cntc += system_get_time() - bgn; + + bgn = system_get_time(); + millis_test_DEBUG(); + cntfx += system_get_time() - bgn; + + bgn = system_get_time(); + millis_test(); + cntf += system_get_time() - bgn; + + yield(); + } //end-for + + cntref = cnto; // Set global ref. count + + Serial.println(); + Serial.println( " usec x Orig Comment" ); + + millis_rtms_print( " Orig: ", cntref, "Original code" ); + millis_rtms_print( " Core: ", cntv, "Current core" ); + Serial.println(); + + millis_rtms_print( " Ref: ", cntcx, "64-bit reference code, DEBUG" ); + millis_rtms_print( " Test: ", cntfx, "64-bit magic multiply, 4x32 DEBUG" ); + Serial.println(); + + millis_rtms_print( " Ref: ", cntc, "64-bit reference code" ); + millis_rtms_print( " Test: ", cntf, "64-bit magic multiply, 4x32" ); + Serial.println(); + + Serial.println( F("*** End, Bench Test ***") ); + Serial.println(); + +} //Millis_RunTimes + +//--------------------------------------------------------------------------- +// Debug millis_test() +//--------------------------------------------------------------------------- + +bool Debug_Millis_Test ( void ) +{ + uint32_t m, msc, mstx; + int32_t diff; + +// Switch over to fixed system time, enable printing + fixed_systime = true; debugf = true; + + us_ovflow = C_COUNT_MAX; + m = (M_USEC_MAX - (0 * 1000000)); + set_systime( m ); + us_at_last_ovf = m - 1; // Disables 'c' bump + +// Millis() comparison, test vs. reference + Serial.println(); + mstx = millis_test_DEBUG(); + msc = millis_corr_DEBUG(); + diff = (int32_t)(mstx - msc); + + Serial.println(); + Serial.println( " m Test Reference Difference" ); + Serial.printf( "0x%08x 0x%08x 0x%08X %9d\n", m, mstx, msc, diff ); + Serial.println(); + +// No printing, variable systime + debugf = false; fixed_systime = false; + + return( (bool)( diff == 0 ) ); // Good test, matches reference + +} //Debug_Millis_Test + +//--------------------------------------------------------------------------- +//--------------------------------------------------------------------------- + +BS_ENV_DECLARE(); + +void setup () +{ + Serial.begin(115200); + WiFi.mode( WIFI_OFF ); + us_count_init(); // Start up timer overflow sampling + BS_RUN(Serial); + +} //setup + +//--------------------------------------------------------------------------- +void loop(void) +{ + yield(); +} //loop + +//--------------------------------------------------------------------------- +// Test cases +//--------------------------------------------------------------------------- +TEST_CASE( "Millis RunTime Benchmarks", "[bs]" ) +{ + Millis_RunTimes(); +} //testcase1 + +TEST_CASE( "Debug 'millis_test()' Code", "[bs]" ) +{ + bool ok = Debug_Millis_Test(); + CHECK( ok ); +} //testcase2 + +//--------------------------------------------------------------------------- +//---------------------------------------------------------------------------