oPossum 1,083 Posted June 14, 2015 Share Posted June 14, 2015 This code will print a 64 bit unsigned integer in 5 characters using 3 significant digits and and an SI prefix. Several revisions are shown beginning with the obvious implementation and then optimizing it to improve speed and decrease code size. Compilers used are TI 4.4.4 and GCC 4.9.1 Test hardware is the F5529 Launchpad running at the default clock of 1.016 MHz. The general strategy of the code is: - Count the number of digits in the decimal representation - Divide the value to reduce it to 3 digits. - Print the 3 digits with proper formatting and SI prefix. - Handle the special case of values of 99 or less that must be printed with 2 or 1 digits and no decimal. A series of values from zero to the maximum possible 64 bit value are used to test the performance and correct operation of the code... static const uint64_t td[] = { 0ULL, 1ULL, 12ULL, 123ULL, 1234ULL, 12345ULL, 123456ULL, 1234567ULL, 12345678ULL, 123456789ULL, 1234567890ULL, 12345678901ULL, 123456789012ULL, 1234567890123ULL, 12345678901234ULL, 123456789012345ULL, 1234567890123456ULL, 12345678901234567ULL, 123456789012345678ULL, 1234567890123456789ULL, 12345678901234567890ULL, 18446744073709551615ULL }; The first version uses C standard library functions for a very simple implementation. Decimal digits are counted using log10(). Dividing the digit count by three using the div() function provides values for formatting, division, and the SI prefix. The divisor needed to reduce the value to three digits is calculated with pow(). The special case of values of 99 or less is handled by using a fixed digit count for values less than 1000 rather than the actual base ten digit count. Conversion from uint64_t to double will have a loss of precision for large values due to float having a 52 bit significand. This is not a problem for this code because only three significant digits are printed.Code size for the TI compiler is 20,636 bytes and the execution time for the test case is 1.22 seconds. The GCC compiler does not produce working code. static void sprint_f3d(char * s, double f) { static char const * const fmt[3] = { "%1.2f%c", "%2.1f%c", "%4.0f%c" }; div_t const d = div((f < 1000.0) ? 2 : (int)log10(f), 3); sprintf(s, fmt[d.rem], f / pow(1000.0, d.quot), " kMGTPEZY"[d.quot]); } The C standard library does not have integer versions of log10() and pow(), so another strategy will be used. The value is divided by 10 until it is less than 1000. The number of divisions is counted to determine the number of base ten digits. Stopping the division when the value is less than 1000 handles the special case for values of 99 or less by limiting the digit count to 3 or more. The resulting 3 digit value will be split as necessary for formatting using the div() library function.Code size for the TI compiler is 5,346 bytes and the execution time for the test case is 3.24 seconds. Code size for the GCC compiler is 36,040 bytes and the execution time for the test case is 981 milliseconds. The poor performance of the TI compiler is due the an inefficient intrinsic 64 bit unsigned divide. static void sprint_u64_a(char *s, uint64_t n) { unsigned d = 2; while(n >= 1000) n /= 10, ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } Using this 64 bit unsigned divide provides much better performance on the TI compiler, but much worse on the GCC compiler.Code size for the TI compiler is 5,666 bytes and the execution time for the test case is 396 milliseconds. Code size for the GCC compiler is 36,116 bytes and the execution time for the test case is 4.06 seconds. Future revisions will use intrinsic divide for GCC and this divide function for TI. static uint64_t divu64(uint64_t n, uint64_t d) { if((n < d) || (!d)) return 0; uint64_t register b = 1; if(((uint16_t)(n >> 48)) & 0x8000) { while(!(((uint16_t)(d >> 48)) & 0x8000)) d <<= 1, b <<= 1; if(n < d) d >>= 1, b >>= 1; } else { d <<= 1; while(n >= d) d <<= 1, b <<= 1; d >>= 1; } uint64_t q = b; n -= d; while(!(b & 1)) { d >>= 1; b >>= 1; if(n >= d) n -= d, q |= b; } return q; } The number of 64 bit divides can be reduced by using constants of 10 to the power of 2 to the power of N rather than iterative division by 10.Code size for the TI compiler is 5,886 bytes and the execution time for the test case is 160 milliseconds. Code size for the GCC compiler is 36,256 bytes and the execution time for the test case is 223 milliseconds. unsigned d = 2; if(n >= 1000000000000000000ULL) n = divu64(n, 10000000000000000ULL), d += 16; if(n >= 10000000000ULL) n = divu64(n, 100000000ULL), d += 8; if(n >= 1000000ULL) n = divu64(n, 10000ULL), d += 4; if(n >= 10000ULL) n = divu64(n, 100ULL), d += 2; if(n >= 1000ULL) n = divu64(n, 10ULL), ++d; The number of 64 bit divides can be reduced to one by using a table to determine the number of base ten digits. The table is searched for the highest value that is less than or equal to the value to be printed. The value is then divided by the table entry that is 1/100th of the value representing the base ten digit count to reduce the value to 3 digits.Code size for the TI compiler is 5,886 bytes and the execution time for the test case is 143 milliseconds. Code size for the GCC compiler is 36,096 bytes and the execution time for the test case is 143 milliseconds. static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; unsigned d = 2; if(n >= 1000) { d = 19; uint64_t const *p = pt; while (n < *p) ++p, --d; n = divu64(n, p[2]); } The sprintf() function takes significant space, so it is replaced with decimal to ASCII conversion using the div() library function.Code size for the TI compiler is 1,660 bytes and the execution time for the test case is 25.8 milliseconds. Code size for the GCC compiler is 3,384 bytes and the execution time for the test case is 82.0 milliseconds. div_t qr; qr.rem = (int)n; div_t const dp = div(d, 3); if(dp.rem == 2) *s++ = ' '; if(qr.rem < 100) { *s++ = ' '; qr.quot = 0; } else { qr = div(qr.rem, 100); *s++ = '0' + qr.quot; } if(dp.rem == 0) *s++ = '.'; if((!qr.quot) && (qr.rem < 10)) { *s++ = ' '; } else { qr = div(qr.rem, 10); *s++ = '0' + qr.quot; } if(dp.rem == 1) *s++ = '.'; *s++ = '0' + qr.rem; *s++ = " kMGTPE"[dp.quot]; *s = 0; The final optimization removes all division. The base ten tables values are used with iterative subtraction to create the ASCII string.Code size for the TI compiler is 1,368 bytes and the execution time for the test case is 10.8 milliseconds. Code size for the GCC compiler is 2,452 bytes and the execution time for the test case is 19.9 milliseconds. uint64_t const *p = pt + 17; if(n >= 1000) { p = pt; while (n < *p) ++p; } // n += (p[2] >> 1); // optional rounding int dp = p - pt; while (dp > 2) dp -= 3; if (dp == 2) *s++ = ' '; char c; if(n < 100) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *s = c; ++p; if(dp == 1) *++s = '.'; if((*s == ' ') && (n < 10)) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *++s = c; ++p; if(dp == 0) *++s = '.'; c = '0'; while (n >= *p) n -= *p, ++c; *++s = c; *++s = " EEPPPTTTGGGMMMkkk "[p - pt]; *++s = 0; Performance summary function TI GCC ---------------------------------------------- sprint_u64 1368 10.8 ms 2452 19.9 ms sprint_u64_e 1660 25.8 ms 3384 82.0 ms sprint_u64_d 5886 143 ms 36096 143 ms sprint_u64_c 5886 160 ms 36256 223 ms sprint_u64_b 5666 396 ms 36116 4.06 s sprint_u64_a 5346 3.24 s 36040 981 ms sprint_f3d 20636 1.22 s non-functional Network bandwidth display (lower right) using 3 digits. Complete code #include <msp430.h> #include <stdint.h> #include <stdlib.h> #include <string.h> #include <math.h> #include <stdio.h> #define PU64 sprint_u64 //#define PU64 sprint_u64_e //#define PU64 sprint_u64_d //#define PU64 sprint_u64_c //#define PU64 sprint_u64_b //#define USE_DIV_FUNC //#define PU64 sprint_u64_a //#define PU64 sprint_f3d //#define PU64 sprint_u64_null static void sprint_u64(char *s, uint64_t n) { static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; uint64_t const *p = pt + 17; if(n >= 1000) { p = pt; while (n < *p) ++p; } // n += (p[2] >> 1); // optional rounding int dp = p - pt; while (dp > 2) dp -= 3; if (dp == 2) *s++ = ' '; char c; if(n < 100) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *s = c; ++p; if(dp == 1) *++s = '.'; if((*s == ' ') && (n < 10)) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *++s = c; ++p; if(dp == 0) *++s = '.'; c = '0'; while (n >= *p) n -= *p, ++c; *++s = c; *++s = " EEPPPTTTGGGMMMkkk "[p - pt]; *++s = 0; } static uint64_t divu64(uint64_t n, uint64_t d) { #if defined( __GNUC__) & !defined(USE_DIV_FUNC) return n / d; #else if((n < d) || (!d)) return 0; uint64_t register b = 1; if(((uint16_t)(n >> 48)) & 0x8000) { while(!(((uint16_t)(d >> 48)) & 0x8000)) d <<= 1, b <<= 1; if(n < d) d >>= 1, b >>= 1; } else { d <<= 1; while(n >= d) d <<= 1, b <<= 1; d >>= 1; } uint64_t q = b; n -= d; while(!(b & 1)) { d >>= 1; b >>= 1; if(n >= d) n -= d, q |= b; } return q; #endif } static void sprint_u64_e(char *s, uint64_t n) { static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; unsigned d = 2; if(n >= 1000) { d = 19; uint64_t const *p = pt; while (n < *p) ++p, --d; n = divu64(n, p[2]); } div_t qr; qr.rem = (int)n; div_t const dp = div(d, 3); if(dp.rem == 2) *s++ = ' '; if(qr.rem < 100) { *s++ = ' '; qr.quot = 0; } else { qr = div(qr.rem, 100); *s++ = '0' + qr.quot; } if(dp.rem == 0) *s++ = '.'; if((!qr.quot) && (qr.rem < 10)) { *s++ = ' '; } else { qr = div(qr.rem, 10); *s++ = '0' + qr.quot; } if(dp.rem == 1) *s++ = '.'; *s++ = '0' + qr.rem; *s++ = " kMGTPE"[dp.quot]; *s = 0; } static void sprint_u64_d(char *s, uint64_t n) { static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; unsigned d = 2; if(n >= 1000) { d = 19; uint64_t const *p = pt; while (n < *p) ++p, --d; n = divu64(n, p[2]); } div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_u64_c(char *s, uint64_t n) { unsigned d = 2; if(n >= 1000000000000000000ULL) n = divu64(n, 10000000000000000ULL), d += 16; if(n >= 10000000000ULL) n = divu64(n, 100000000ULL), d += 8; if(n >= 1000000ULL) n = divu64(n, 10000ULL), d += 4; if(n >= 10000ULL) n = divu64(n, 100ULL), d += 2; if(n >= 1000ULL) n = divu64(n, 10ULL), ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_u64_b(char *s, uint64_t n) { unsigned d = 2; while(n >= 1000) n = divu64(n, 10), ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_u64_a(char *s, uint64_t n) { unsigned d = 2; while(n >= 1000) n /= 10, ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_f3d(char * s, double f) { static char const * const fmt[3] = { "%1.2f%c", "%2.1f%c", "%4.0f%c" }; div_t const d = div((f < 1000.0) ? 2 : (int)log10(f), 3); sprintf(s, fmt[d.rem], f / pow(1000.0, d.quot), " kMGTPEZY"[d.quot]); } static void sprint_u64_null(char *s, uint64_t n) { *s = 0; } static void print(char const *s) { while(*s) { while(!(UCA1IFG & UCTXIFG)); UCA1TXBUF = *s++; } } #define smclk_freq (32768UL * 31UL) // SMCLK frequency in hertz #define bps (9600UL) // Async serial bit rate int main(void) { WDTCTL = WDTPW | WDTHOLD; // Stop watchdog timer // P4SEL = BIT4 | BIT5; // Enable UART pins P4DIR = BIT4 | BIT5; // // // Initialize UART UCA1CTL1 = UCSWRST; // Hold USCI in reset to allow configuration UCA1CTL0 = 0; // No parity, LSB first, 8 bits, one stop bit, UART (async) const unsigned long brd = (smclk_freq + (bps >> 1)) / bps; // Bit rate divisor UCA1BR1 = (brd >> 12) & 0xFF; // High byte of whole divisor UCA1BR0 = (brd >> 4) & 0xFF; // Low byte of whole divisor UCA1MCTL = ((brd << 4) & 0xF0) | UCOS16; // Fractional divisor, oversampling mode UCA1CTL1 = UCSSEL_2; // Use SMCLK for bit rate generator, release reset static const uint64_t td[] = { 0ULL, 1ULL, 12ULL, 123ULL, 1234ULL, 12345ULL, 123456ULL, 1234567ULL, 12345678ULL, 123456789ULL, 1234567890ULL, 12345678901ULL, 123456789012ULL, 1234567890123ULL, 12345678901234ULL, 123456789012345ULL, 1234567890123456ULL, 12345678901234567ULL, 123456789012345678ULL, 1234567890123456789ULL, 12345678901234567890ULL, 18446744073709551615ULL }; char s[32]; int n; uint64_t const *p; // Print test array to application UART print("\r\n"); n = sizeof(td) / sizeof(td[0]); p = td; do { PU64(s, *p++); print(s); print("\r\n"); } while(--n); // Time the test array - make string only - do not print TA0EX0 = 7; TA0CTL = TASSEL_2 | ID_3 | MC_2; TA0CTL |= TACLR; n = sizeof(td) / sizeof(td[0]); p = td; do { PU64(s, *p++); } while(--n); uint64_t et = TA0R * 63ULL; // Elapsed time in microseconds PU64(s, et); print(s); print(" us\r\n"); for(;; return 0; } timotet, dubnet, pine and 6 others 9 Quote Link to post Share on other sites
roadrunner84 466 Posted June 15, 2015 Share Posted June 15, 2015 Sweet! Maybe make it easier to replace the " kMGTPE" string with something else, so you can have a value in millivolts or the like "m kMGTP", or nanoamperes "num kMG" Quote Link to post Share on other sites
bluehash 1,581 Posted June 17, 2015 Share Posted June 17, 2015 Nice work @@oPossum. Took it all the way from 3.24seconds to 10ms for the TI compiler - that is a 99.67% decrease in processing time. Quote Link to post Share on other sites
spirilis 1,265 Posted June 17, 2015 Share Posted June 17, 2015 Just the marsupial working his magic as usual! Quote Link to post Share on other sites
Recommended Posts
Join the conversation
You can post now and register later. If you have an account, sign in now to post with your account.