Jump to content
43oh

oPossum

Members
  • Content Count

    925
  • Joined

  • Last visited

  • Days Won

    103

Reputation Activity

  1. Like
    oPossum got a reaction from Jake in What to use?? power operated gate control interface   
    // Timer A0 interrupt service routine
    #pragma vector=TIMER0_A0_VECTOR
    __interrupt void Timer_A (void)
    {
    } // <<<---- missing closing brace <<<----
     
    // Port 1 interrupt service routine
    #pragma vector=PORT1_VECTOR
    __interrupt void Port_1(void)
    {
    P2OUT ^= (BYPASS & STATUS_LED); // MAKE BYPASS AND LED HIGH
    P2IFG &= ~POSITION; // RESET INT UPON POSITION HIGH
    }

  2. Like
    oPossum got a reaction from agaelema in Tiny printf() - C version   
    This is a tiny printf() function that can be used with the chips that come with the Launchpad. Code size is about 640 bytes with CCS.
     
    There are 7 format specifiers:
    %c - Character
    %s - String
    %i - signed Integer (16 bit)
    %u - Unsigned integer (16 bit)
    %l - signed Long (32 bit)
    %n - uNsigned loNg (32 bit)
    %x - heXadecimal (16 bit)
     
    Field width, floating point and other standard printf() features are not supported.
     
    printf() code

    #include "msp430g2231.h" #include "stdarg.h" void putc(unsigned); void puts(char *); static const unsigned long dv[] = { // 4294967296 // 32 bit unsigned max 1000000000, // +0 100000000, // +1 10000000, // +2 1000000, // +3 100000, // +4 // 65535 // 16 bit unsigned max 10000, // +5 1000, // +6 100, // +7 10, // +8 1, // +9 }; static void xtoa(unsigned long x, const unsigned long *dp) { char c; unsigned long d; if(x) { while(x < *dp) ++dp; do { d = *dp++; c = '0'; while(x >= d) ++c, x -= d; putc(c); } while(!(d & 1)); } else putc('0'); } static void puth(unsigned n) { static const char hex[16] = { '0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F'}; putc(hex[n & 15]); } void printf(char *format, ...) { char c; int i; long n; va_list a; va_start(a, format); while(c = *format++) { if(c == '%') { switch(c = *format++) { case 's': // String puts(va_arg(a, char*)); break; case 'c': // Char putc(va_arg(a, char)); break; case 'i': // 16 bit Integer case 'u': // 16 bit Unsigned i = va_arg(a, int); if(c == 'i' && i < 0) i = -i, putc('-'); xtoa((unsigned)i, dv + 5); break; case 'l': // 32 bit Long case 'n': // 32 bit uNsigned loNg n = va_arg(a, long); if(c == 'l' && n < 0) n = -n, putc('-'); xtoa((unsigned long)n, dv); break; case 'x': // 16 bit heXadecimal i = va_arg(a, int); puth(i >> 12); puth(i >> 8); puth(i >> 4); puth(i); break; case 0: return; default: goto bad_fmt; } } else bad_fmt: putc(c); } va_end(a); }
     
    test code

    #include "msp430g2231.h" void serial_setup(unsigned out_mask, unsigned in_mask, unsigned duration); void printf(char *, ...); void main(void) { char *s; char c; int i; unsigned u; long int l; long unsigned n; unsigned x; // Disable watchdog WDTCTL = WDTPW + WDTHOLD; // Use 1 MHz DCO factory calibration DCOCTL = 0; BCSCTL1 = CALBC1_1MHZ; DCOCTL = CALDCO_1MHZ; // Setup the serial port // Serial out: P1.1 (BIT1) // Serial in: P1.2 (BIT2) // Bit rate: 9600 (CPU freq / bit rate) serial_setup(BIT1, BIT2, 1000000 / 9600); printf("%s", "\r\n*** printf() test ***\r\n"); s = "test"; c = 'X'; i = -12345; u = 12345; l = -1234567890; n = 1234567890; x = 0xABCD; printf("String %s\r\n", s); printf("Char %c\r\n", c); printf("Integer %i\r\n", i); printf("Unsigned %u\r\n", u); printf("Long %l\r\n", l); printf("uNsigned loNg %n\r\n", n); printf("heX %x\r\n", x); printf("multiple args %s %c %i %u %l %n %x\r\n", s, c, i, u, l, n, x); printf("\r\n*** Done ***\r\n"); for(;; }

  3. Like
    oPossum got a reaction from Antscran in MSP430G Launchpad Frequency Counter   
    The itoa() function probably takes a signed integer. On the MSP430 that has a range of -32768 to +32767. You need something that takes an unsigned integer that has a range of 0 to 65535 to get beyond the 32767 limit. The next step would be using unsigned long to go all the way to 4,294,967,295.
     
    A general purpose unsigned / unsigned long to ascii conversion can be found here:
    http://forum.43oh.com/topic/1289-tiny-printf-c-version/
     
    Here is a frequency counter that uses unsigned long to go up to 16 Mhz:
    http://forum.43oh.com/topic/1913-frequency-counter-using-launchpad-nokia-5110-lcd/
  4. Like
    oPossum got a reaction from Fmilburn in MSP430G Launchpad Frequency Counter   
    The itoa() function probably takes a signed integer. On the MSP430 that has a range of -32768 to +32767. You need something that takes an unsigned integer that has a range of 0 to 65535 to get beyond the 32767 limit. The next step would be using unsigned long to go all the way to 4,294,967,295.
     
    A general purpose unsigned / unsigned long to ascii conversion can be found here:
    http://forum.43oh.com/topic/1289-tiny-printf-c-version/
     
    Here is a frequency counter that uses unsigned long to go up to 16 Mhz:
    http://forum.43oh.com/topic/1913-frequency-counter-using-launchpad-nokia-5110-lcd/
  5. Like
    oPossum got a reaction from tripwire in MSP430G Launchpad Frequency Counter   
    The itoa() function probably takes a signed integer. On the MSP430 that has a range of -32768 to +32767. You need something that takes an unsigned integer that has a range of 0 to 65535 to get beyond the 32767 limit. The next step would be using unsigned long to go all the way to 4,294,967,295.
     
    A general purpose unsigned / unsigned long to ascii conversion can be found here:
    http://forum.43oh.com/topic/1289-tiny-printf-c-version/
     
    Here is a frequency counter that uses unsigned long to go up to 16 Mhz:
    http://forum.43oh.com/topic/1913-frequency-counter-using-launchpad-nokia-5110-lcd/
  6. Like
    oPossum reacted to greeeg in RGB 4x4 button thing   
    Started looking into a enclosure for this project. ( Typically this would be done before designing the PCB.... )
    But I'd like to make use of some of the rapid prototyping tools I've invested in.
     
    Something I should have done before sending the board off to the fab.. Verified that the holes lined up. (they didn't on board version 1)
    By exporting a copper layer as a dxf or svg, you can then import your PCB into your favorite CAD tool. Luckily Adafruit provide the 3d CAD file for the rubber elastomer buttons.


     
    Good to see that everything lines up. Next step is a simple 2 part case.
  7. Like
    oPossum got a reaction from agaelema in Faster printing of single precision floating point   
    Printf supports the %e, %f and %g format specifiers for printing floating point numbers. Due to the automatic type promotion of varadic functions in C, the printf code must always print double precision floating point. This makes printing single precision floating point numbers slower than optimal. The precision rounding done by printf also imposes a speed penalty. The code presented here will print single precision floating point numbers much faster than printf - up to 75 times faster. It allows the number of significant digits to be 3 to 8 (a maximum of 7 is recommended). The printed number will be normalized to the range to 1.0 to less than 1000.0 and the appropriate SI prefix appended. This is similar to what the printf %e format does (1.0 to less than 10.0), but a bit easier to interpret. The full range of single precision float can not be represented by SI prefixes, so very small and very large numbers will have '?' as the prefix. This code is intended for display only. It should not be used to store values in a file for later conversion back to float due to limitations in rounding and range.
     
    A brief explanation of the code.
    Do a bitwise conversion of the float to a 32 bit unsigned integer. This is done with by getting the address of the float, casting to an unsigned integer pointer, and then dereferencing the pointer. Simply casting to an unsigned integer would not produce a bitwise copy.

    uint32_t s = *(uint32_t *)&f; The msb contains the sign flag. Append a '-' to the string if the floating point number is negative.
    if (s & (1UL << 31)) *a++ = '-'; Extract the 8 bit exponent.
    int e = (s >> 23) & 0xFF; Move the significand to the upper 24 bits. Set the msb to 0.
    s <<= 8; s &= ~(1UL << 31); An exponent of 255 is used for special values of NaN (not a number) and infinity. Handle these special cases and return.
    if (e == 255) { if (s) { strcpy(a, "NaN"); } else { strcpy(a, "Inf"); } return; An exponent of 0 is used to represent a value of 0 and for denormals. A value of 0 is handled by setting the exponent to 127 - the same exponent used to represent 1.0, and leaving the significand as 0. Denormal numbers do not have the implicit msb of 1, so they are normalized by shifting until the leading 1 is in the msb position.
    } else if (e == 0) { if (s) { e = 1; while (!(s & (1UL << 31))) s <<= 1, --e; } else { e = 127; } If the exponent is some other value, then set the msb of the significand.
    } else { s |= (1UL << 31); } Setup a pointer to the SI prefix. This will be adjusted as the value is normalized.
    char const * sp = "???????yzafpnum kMGTPEZY????" + 15; If the value is less than 1.0 it must be multiplied by 1000 until it is 1.0 or greater. Multiplication by 1000 is done by an implicit multiply by 1024 and then subtracting a multiply by 16 and a multiply by 8.
    if (e < 127) { do { s = s - (s >> 6) - (s >> 7); if (!(s & (1UL << 31))) s <<= 1, --e; e += 10; --sp; } while (e < 127); If the value is 1000.0 or more it must be divided by 1000 until it is less than 1000.0. An unrolled floating point divide is used for maximum speed.
    } else if (e > 135) { while (e > (126 + 10) || (e == (126 + 10) && s >= (1000UL << (32 - 10)))) { uint32_t n = s; s = 0; uint32_t d = 1000UL << (32 - 10); if (n >= d) n -= d, s |= (1UL << 31); d >>= 1; if (n >= d) n -= d, s |= (1UL << 30); d >>= 1; if (n >= d) n -= d, s |= (1UL << 29); d >>= 1; if (n >= d) n -= d, s |= (1UL << 28); d >>= 1; if (n >= d) n -= d, s |= (1UL << 27); d >>= 1; if (n >= d) n -= d, s |= (1UL << 26); d >>= 1; if (n >= d) n -= d, s |= (1UL << 25); d >>= 1; if (n >= d) n -= d, s |= (1UL << 24); d >>= 1; if (n >= d) n -= d, s |= (1UL << 23); d >>= 1; if (n >= d) n -= d, s |= (1UL << 22); d >>= 1; if (n >= d) n -= d, s |= (1UL << 21); d >>= 1; if (n >= d) n -= d, s |= (1UL << 20); d >>= 1; if (n >= d) n -= d, s |= (1UL << 19); d >>= 1; if (n >= d) n -= d, s |= (1UL << 18); d >>= 1; if (n >= d) n -= d, s |= (1UL << 17); d >>= 1; if (n >= d) n -= d, s |= (1UL << 16); d >>= 1; if (n >= d) n -= d, s |= (1UL << 15); d >>= 1; if (n >= d) n -= d, s |= (1UL << 14); d >>= 1; if (n >= d) n -= d, s |= (1UL << 13); d >>= 1; if (n >= d) n -= d, s |= (1UL << 12); d >>= 1; if (n >= d) n -= d, s |= (1UL << 11); d >>= 1; if (n >= d) n -= d, s |= (1UL << 10); d >>= 1; if (n >= d) n -= d, s |= (1UL << 9); d >>= 1; if (n >= d) n -= d, s |= (1UL << 8); d >>= 1; if (n >= d) s += (1UL << 8); if (!(s & (1UL << 31))) s <<= 1, --e; e -= 9; ++sp; } The divide code is quite time consuming, so it would be advantageous to quickly reduce very large numbers. A divide by 1,000,000,000,000 is used to improve performance for these large numbers.The preceding multiply code could do the same for very small numbers, but there is no speed advantage due to the multiply by 1000 using 2 shift/subtract operations and multiply by lager values requiring more than 2 per 1000.

    while (e > (150 + 16) || (e == (150 + 16) && s > (999999995904ULL >> 16))) { uint64_t n = s; n <<= 32; s = 0; uint64_t d = 1000000000000ULL << (64 - 40); if (n >= d) n -= d, s |= (1UL << 31); d >>= 1; if (n >= d) n -= d, s |= (1UL << 30); d >>= 1; if (n >= d) n -= d, s |= (1UL << 29); d >>= 1; if (n >= d) n -= d, s |= (1UL << 28); d >>= 1; if (n >= d) n -= d, s |= (1UL << 27); d >>= 1; if (n >= d) n -= d, s |= (1UL << 26); d >>= 1; if (n >= d) n -= d, s |= (1UL << 25); d >>= 1; if (n >= d) n -= d, s |= (1UL << 24); d >>= 1; if (n >= d) n -= d, s |= (1UL << 23); d >>= 1; if (n >= d) n -= d, s |= (1UL << 22); d >>= 1; if (n >= d) n -= d, s |= (1UL << 21); d >>= 1; if (n >= d) n -= d, s |= (1UL << 20); d >>= 1; if (n >= d) n -= d, s |= (1UL << 19); d >>= 1; if (n >= d) n -= d, s |= (1UL << 18); d >>= 1; if (n >= d) n -= d, s |= (1UL << 17); d >>= 1; if (n >= d) n -= d, s |= (1UL << 16); d >>= 1; if (n >= d) n -= d, s |= (1UL << 15); d >>= 1; if (n >= d) n -= d, s |= (1UL << 14); d >>= 1; if (n >= d) n -= d, s |= (1UL << 13); d >>= 1; if (n >= d) n -= d, s |= (1UL << 12); d >>= 1; if (n >= d) n -= d, s |= (1UL << 11); d >>= 1; if (n >= d) n -= d, s |= (1UL << 10); d >>= 1; if (n >= d) n -= d, s |= (1UL << 9); d >>= 1; if (n >= d) n -= d, s |= (1UL << 8); //d >>= 1; //if (n >= d) s += (1UL << 8); if (n) s += (1UL << 8); if (!(s & (1UL << 31))) s <<= 1, --e; e -= 39; sp += 4; } Rounding is the most difficult part of printing floating point numbers. Precalculated float constants are applied based on the value of the float and the specified number of significant digits. This simple method is fast and allows for good results for up to 7 significant digits.
    typedef struct { uint32_t s; int e; } TFR; TFR const r[] = { 0x800000UL << 7, 126 + 1, // 0.5 0xCCCCCDUL << 7, 122 + 1, // 0.05 0xA3D70AUL << 7, 119 + 1, // 0.005 0x83126FUL << 7, 116 + 1, // 0.0005 0xD1B717UL << 7, 112 + 1, // 0.00005 0xA7C5ACUL << 7, 109 + 1, // 0.000005 0x8637BDUL << 7, 106 + 1, // 0.0000005 0xD6BF95UL << 7, 102 + 1 // 0.00000005 }; if (d < 3) d = 3; else if (d > 8) d = 8; if (s) { TFR const *pr = &r[d - 3]; if (e < (126 + 4) || (e == (126 + 4) && s < (10UL << (32 - 4)))) { // < 10 pr += 2; } else if (e < (126 + 7) || (e == (126 + 7) && s < (100UL << (32 - 7)))) { // < 100 ++pr; } s += (pr->s >> (e - pr->e)); if (e == (126 + 10) && s >= (1000UL << (32 - 10))) s = (1UL << 31), e = 127, ++sp; else if (!(s & (1UL << 31))) s >>= 1, s |= (1UL << 31), ++e; } The integer part is printed using iterative subtraction of base 10 constants. This is typically faster than the common divide/modulus method.
    unsigned i = s >> 16; i >>= (136 - e); unsigned id = 1; char c; if (i >= (100 << 6)) { ++id; c = '0'; while (i >= (100 << 6)) i -= (100 << 6), ++c; *a++ = c; } if (id == 2 || i >= (10 << 6)) { ++id; c = '0'; while (i >= (10 << 6)) i -= (10 << 6), ++c; *a++ = c; } c = '0'; while (i >= (1 << 6)) i -= (1 << 6), ++c; *a++ = c; The fractional part is printed by iterative multiplication by 10.
    *a++ = '.'; if (e < 130) s >>= (130 - e); else s <<= (e - 130); d -= id; while (d) { s &= ((1UL << 28) - 1); s = (s << 3) + (s << 1); *a++ = '0' + (s >> 28); --d; } The SI prefix is appended and the string is terminated.
    *a++ = *sp; *a = 0; The resulting performance increase was more than I expected. 

    TI 4.4.4 GCC 4.9.1 --------------------------------------- %e 16402 6.47 s non-functional %f 16380 5.78 s non-functional %g 16402 4.69 s non-functional ftoas(7) 8480 0.12 s 11892 0.27 s ftoas(3) 8480 0.09 s 11892 0.17 s Results of the test code using ftoas(7), %e, %g, and %f
    0.000000 0.000000e+00 0 0.000000 1.401298? 0.000000e+00 0 0.000000 1.401298? 0.000000e+00 0 0.000000 9.809089? 0.000000e+00 0 0.000000 99.49219? 0.000000e+00 0 0.000000 1.000527? 0.000000e+00 0 0.000000 9.999666? 0.000000e+00 0 0.000000 99.99946? 0.000000e+00 0 0.000000 1.000000? 0.000000e+00 0 0.000000 12.00000? 1.200000e-38 1.2e-38 0.000000 100.0000? 1.000000e-37 1e-37 0.000000 1.000000? 1.000000e-36 1e-36 0.000000 10.00000? 1.000000e-35 1e-35 0.000000 100.0000? 1.000000e-34 1e-34 0.000000 1.000000? 1.000000e-33 1e-33 0.000000 10.00000? 1.000000e-32 1e-32 0.000000 100.0000? 1.000000e-31 1e-31 0.000000 1.000000? 1.000000e-30 1e-30 0.000000 10.00000? 1.000000e-29 1e-29 0.000000 100.0000? 1.000000e-28 1e-28 0.000000 1.000000? 1.000000e-27 1e-27 0.000000 10.00000? 1.000000e-26 1e-26 0.000000 100.0000? 1.000000e-25 1e-25 0.000000 1.000000y 1.000000e-24 1e-24 0.000000 10.00000y 1.000000e-23 1e-23 0.000000 100.0000y 1.000000e-22 1e-22 0.000000 1.000000z 1.000000e-21 1e-21 0.000000 10.00000z 1.000000e-20 1e-20 0.000000 100.0000z 1.000000e-19 1e-19 0.000000 1.000000a 1.000000e-18 1e-18 0.000000 10.00000a 1.000000e-17 1e-17 0.000000 100.0000a 1.000000e-16 1e-16 0.000000 1.000000f 1.000000e-15 1e-15 0.000000 10.00000f 1.000000e-14 1e-14 0.000000 100.0000f 1.000000e-13 1e-13 0.000000 1.000000p 1.000000e-12 1e-12 0.000000 10.00000p 1.000000e-11 1e-11 0.000000 100.0000p 1.000000e-10 1e-10 0.000000 1.000000n 1.000000e-09 1e-09 0.000000 10.00000n 1.000000e-08 1e-08 0.000000 100.0000n 1.000000e-07 1e-07 0.000000 1.000000u 1.000000e-06 1e-06 0.000001 10.00000u 1.000000e-05 1e-05 0.000010 100.0000u 1.000000e-04 0.0001 0.000100 1.000000m 1.000000e-03 0.001 0.001000 10.00000m 1.000000e-02 0.01 0.010000 100.0000m 1.000000e-01 0.1 0.100000 1.000000 1.000000e+00 1 1.000000 1.234568 1.234568e+00 1.23457 1.234568 10.00000 1.000000e+01 10 10.000000 100.0000 1.000000e+02 100 100.000000 1.000000k 1.000000e+03 1000 1000.000000 10.00000k 1.000000e+04 10000 10000.000000 100.0000k 1.000000e+05 100000 100000.000000 1.000000M 1.000000e+06 1e+06 1000000.000000 10.00000M 1.000000e+07 1e+07 10000000.000000 100.0000M 1.000000e+08 1e+08 100000000.000000 1.000000G 1.000000e+09 1e+09 1000000000.000000 10.00000G 1.000000e+10 1e+10 10000000000.000000 100.0000G 1.000000e+11 1e+11 99999997952.000010 1.000000T 1.000000e+12 1e+12 999999995903.999925 10.00000T 1.000000e+13 1e+13 9999999827968.000174 100.0000T 1.000000e+14 1e+14 100000000376832.008362 1.000000P 1.000000e+15 1e+15 999999986991104.125977 10.00000P 1.000000e+16 1e+16 10000000272564222.812653 100.0000P 1.000000e+17 1e+17 99999998430674934.387207 1.000000E 1.000000e+18 1e+18 999999984306749343.872070 10.00000E 1.000000e+19 1e+19 9999999980506448745.727539 100.0000E 1.000000e+20 1e+20 100000002004087710380.554199 1.000000Z 1.000000e+21 1e+21 1000000020040877103805.541992 10.00000Z 1.000000e+22 1e+22 9999999778196308612823.486328 100.0000Z 1.000000e+23 1e+23 99999997781963086128234.863281 1.000000Y 1.000000e+24 1e+24 1000000013848427772521972.656250 10.00000Y 1.000000e+25 1e+25 9999999562023527622222900.390625 100.0000Y 1.000000e+26 1e+26 100000002537764322757720947.265625 1.000000? 1.000000e+27 1e+27 999999988484154701232910156.250000 10.00000? 9.999999e+27 1e+28 9999999442119691371917724609.375000 100.0000? 1.000000e+29 1e+29 100000001504746651649475097656.250000 1.000000? 1.000000e+30 1e+30 1000000015047466516494750976562.500000 10.00000? 1.000000e+31 1e+31 9999999848243210315704345703125.000000 100.0000? 1.000000e+32 1e+32 100000003318135333061218261718750.000000 1.000000? 1.000000e+33 1e+33 999999994495727896690368652343750.000000 10.00000? 1.000000e+34 1e+34 9999999790214771032333374023437500.000000 100.0000? 1.000000e+35 1e+35 100000004091847860813140869140625000.000000 1.000000? 1.000000e+36 1e+36 999999961690316438674926757812500000.000000 10.00000? 1.000000e+37 1e+37 9999999933815813064575195312500000000.000000 100.0000? 1.000000e+38 1e+38 99999996802856898307800292968750000000.000000 340.0001? 3.400000e+38 3.4e+38 339999995214436411857604980468750000000.000000 NaN nan nan nan Inf +inf +inf +inf Complete code with test case.
    #include <msp430.h> #include <stdio.h> #include <stdint.h> #include <string.h> #include <math.h> static void print(char const *s) { while(*s) { while(!(UCA1IFG & UCTXIFG)); UCA1TXBUF = *s++; } } typedef struct { uint32_t s; int e; } TFR; TFR const r[] = { 0x800000UL << 7, 126 + 1, // 0.5 0xCCCCCDUL << 7, 122 + 1, // 0.05 0xA3D70AUL << 7, 119 + 1, // 0.005 0x83126FUL << 7, 116 + 1, // 0.0005 0xD1B717UL << 7, 112 + 1, // 0.00005 0xA7C5ACUL << 7, 109 + 1, // 0.000005 0x8637BDUL << 7, 106 + 1, // 0.0000005 0xD6BF95UL << 7, 102 + 1 // 0.00000005 }; void ftoas(char *a, float const f, unsigned d) { uint32_t s = *(uint32_t *)&f; if (s & (1UL << 31)) *a++ = '-'; int e = (s >> 23) & 0xFF; s <<= 8; s &= ~(1UL << 31); if (e == 255) { if (s) { strcpy(a, "NaN"); } else { strcpy(a, "Inf"); } return; } else if (e == 0) { if (s) { e = 1; while (!(s & (1UL << 31))) s <<= 1, --e; } else { e = 127; } } else { s |= (1UL << 31); } char const * sp = "???????yzafpnum kMGTPEZY????" + 15; if (e < 127) { do { s = s - (s >> 6) - (s >> 7); if (!(s & (1UL << 31))) s <<= 1, --e; e += 10; --sp; } while (e < 127); } else if (e > 135) { while (e > (150 + 16) || (e == (150 + 16) && s > (999999995904ULL >> 16))) { uint64_t n = s; n <<= 32; s = 0; uint64_t d = 1000000000000ULL << (64 - 40); if (n >= d) n -= d, s |= (1UL << 31); d >>= 1; if (n >= d) n -= d, s |= (1UL << 30); d >>= 1; if (n >= d) n -= d, s |= (1UL << 29); d >>= 1; if (n >= d) n -= d, s |= (1UL << 28); d >>= 1; if (n >= d) n -= d, s |= (1UL << 27); d >>= 1; if (n >= d) n -= d, s |= (1UL << 26); d >>= 1; if (n >= d) n -= d, s |= (1UL << 25); d >>= 1; if (n >= d) n -= d, s |= (1UL << 24); d >>= 1; if (n >= d) n -= d, s |= (1UL << 23); d >>= 1; if (n >= d) n -= d, s |= (1UL << 22); d >>= 1; if (n >= d) n -= d, s |= (1UL << 21); d >>= 1; if (n >= d) n -= d, s |= (1UL << 20); d >>= 1; if (n >= d) n -= d, s |= (1UL << 19); d >>= 1; if (n >= d) n -= d, s |= (1UL << 18); d >>= 1; if (n >= d) n -= d, s |= (1UL << 17); d >>= 1; if (n >= d) n -= d, s |= (1UL << 16); d >>= 1; if (n >= d) n -= d, s |= (1UL << 15); d >>= 1; if (n >= d) n -= d, s |= (1UL << 14); d >>= 1; if (n >= d) n -= d, s |= (1UL << 13); d >>= 1; if (n >= d) n -= d, s |= (1UL << 12); d >>= 1; if (n >= d) n -= d, s |= (1UL << 11); d >>= 1; if (n >= d) n -= d, s |= (1UL << 10); d >>= 1; if (n >= d) n -= d, s |= (1UL << 9); d >>= 1; if (n >= d) n -= d, s |= (1UL << 8); //d >>= 1; //if (n >= d) s += (1UL << 8); if (n) s += (1UL << 8); if (!(s & (1UL << 31))) s <<= 1, --e; e -= 39; sp += 4; } while (e > (126 + 10) || (e == (126 + 10) && s >= (1000UL << (32 - 10)))) { uint32_t n = s; s = 0; uint32_t d = 1000UL << (32 - 10); if (n >= d) n -= d, s |= (1UL << 31); d >>= 1; if (n >= d) n -= d, s |= (1UL << 30); d >>= 1; if (n >= d) n -= d, s |= (1UL << 29); d >>= 1; if (n >= d) n -= d, s |= (1UL << 28); d >>= 1; if (n >= d) n -= d, s |= (1UL << 27); d >>= 1; if (n >= d) n -= d, s |= (1UL << 26); d >>= 1; if (n >= d) n -= d, s |= (1UL << 25); d >>= 1; if (n >= d) n -= d, s |= (1UL << 24); d >>= 1; if (n >= d) n -= d, s |= (1UL << 23); d >>= 1; if (n >= d) n -= d, s |= (1UL << 22); d >>= 1; if (n >= d) n -= d, s |= (1UL << 21); d >>= 1; if (n >= d) n -= d, s |= (1UL << 20); d >>= 1; if (n >= d) n -= d, s |= (1UL << 19); d >>= 1; if (n >= d) n -= d, s |= (1UL << 18); d >>= 1; if (n >= d) n -= d, s |= (1UL << 17); d >>= 1; if (n >= d) n -= d, s |= (1UL << 16); d >>= 1; if (n >= d) n -= d, s |= (1UL << 15); d >>= 1; if (n >= d) n -= d, s |= (1UL << 14); d >>= 1; if (n >= d) n -= d, s |= (1UL << 13); d >>= 1; if (n >= d) n -= d, s |= (1UL << 12); d >>= 1; if (n >= d) n -= d, s |= (1UL << 11); d >>= 1; if (n >= d) n -= d, s |= (1UL << 10); d >>= 1; if (n >= d) n -= d, s |= (1UL << 9); d >>= 1; if (n >= d) n -= d, s |= (1UL << 8); d >>= 1; if (n >= d) s += (1UL << 8); if (!(s & (1UL << 31))) s <<= 1, --e; e -= 9; ++sp; } } if (d < 3) d = 3; else if (d > 8) d = 8; if (s) { TFR const *pr = &r[d - 3]; if (e < (126 + 4) || (e == (126 + 4) && s < (10UL << (32 - 4)))) { // < 10 pr += 2; } else if (e < (126 + 7) || (e == (126 + 7) && s < (100UL << (32 - 7)))) { // < 100 ++pr; } s += (pr->s >> (e - pr->e)); if (e == (126 + 10) && s >= (1000UL << (32 - 10))) s = (1UL << 31), e = 127, ++sp; else if (!(s & (1UL << 31))) s >>= 1, s |= (1UL << 31), ++e; } unsigned i = s >> 16; i >>= (136 - e); unsigned id = 1; char c; if (i >= (100 << 6)) { ++id; c = '0'; while (i >= (100 << 6)) i -= (100 << 6), ++c; *a++ = c; } if (id == 2 || i >= (10 << 6)) { ++id; c = '0'; while (i >= (10 << 6)) i -= (10 << 6), ++c; *a++ = c; } c = '0'; while (i >= (1 << 6)) i -= (1 << 6), ++c; *a++ = c; *a++ = '.'; if (e < 130) s >>= (130 - e); else s <<= (e - 130); d -= id; while (d) { s &= ((1UL << 28) - 1); s = (s << 3) + (s << 1); *a++ = '0' + (s >> 28); --d; } *a++ = *sp; *a = 0; } #define smclk_freq (32768UL * 31UL) // SMCLK frequency in hertz #define bps (9600UL) // Async serial bit rate int main(void) { WDTCTL = WDTPW | WDTHOLD; // Stop watchdog timer // P4SEL = BIT4 | BIT5; // Enable UART pins P4DIR = BIT4 | BIT5; // // // Initialize UART UCA1CTL1 = UCSWRST; // Hold USCI in reset to allow configuration UCA1CTL0 = 0; // No parity, LSB first, 8 bits, one stop bit, UART (async) const unsigned long brd = (smclk_freq + (bps >> 1)) / bps; // Bit rate divisor UCA1BR1 = (brd >> 12) & 0xFF; // High byte of whole divisor UCA1BR0 = (brd >> 4) & 0xFF; // Low byte of whole divisor UCA1MCTL = ((brd << 4) & 0xF0) | UCOS16; // Fractional divisor, oversampling mode UCA1CTL1 = UCSSEL_2; // Use SMCLK for bit rate generator, release reset char s[32], t[96]; float const tv[] = { 0.0f, 7.1e-46f, 1.0e-45f, 1.0e-44f, 1.0e-43f, 1.0e-42f, 1.0e-41f, 1.0e-40f, 1.0e-39f, 1.2e-38f, 1.0e-37f, 1.0e-36f, 1.0e-35f, 1.0e-34f, 1.0e-33f, 1.0e-32f, 1.0e-31f, 1.0e-30f, 1.0e-29f, 1.0e-28f, 1.0e-27f, 1.0e-26f, 1.0e-25f, 1.0e-24f, 1.0e-23f, 1.0e-22f, 1.0e-21f, 1.0e-20f, 1.0e-19f, 1.0e-18f, 1.0e-17f, 1.0e-16f, 1.0e-15f, 1.0e-14f, 1.0e-13f, 1.0e-12f, 1.0e-11f, 1.0e-10f, 1.0e-9f, 1.0e-8f, 1.0e-7f, 0.000001f, 0.00001f, 0.0001f, 0.001f, 0.01f, 0.1f, 1.0f, 1.23456789f, 10.0f, 100.0f, 1000.0f, 10000.0f, 100000.0f, 1000000.0f, 10000000.0f, 100000000.0f, 1000000000.0f, 1.0e10f, 1.0e11f, 1.0e12f, 1.0e13f, 1.0e14f, 1.0e15f, 1.0e16f, 1.0e17f, 1.0e18f, 1.0e19f, 1.0e20f, 1.0e21f, 1.0e22f, 1.0e23f, 1.0e24f, 1.0e25f, 1.0e26f, 1.0e27f, 1.0e28f, 1.0e29f, 1.0e30f, 1.0e31f, 1.0e32f, 1.0e33f, 1.0e34f, 1.0e35f, 1.0e36f, 1.0e37f, 1.0e38f, 3.4e38f, NAN, INFINITY }; TA0EX0 = 7; TA0CTL = TASSEL_2 | ID_3 | MC_2; TA0CTL |= TACLR; unsigned i; for (i = 0; i < sizeof(tv) / sizeof(tv[0]); ++i) { float const f = tv[i]; ftoas(s, f, 7); //print(s); print("\r\n"); //sprintf(t, "%e", f); //sprintf(t, "%f", f); //sprintf(t, "%g", f); //sprintf(t, "%e %f %g %s\r\n", f, f, f, s); sprintf(t, "%s %e %g %f\r\n", s, f, f, f); print(t); } volatile float et = ((float)TA0R + ((TA0CTL & TAIFG) ? 65536.0 : 0.0)) * 63.0f / 1000000.0f; // Elapsed time in microseconds ftoas(t, et, 5); //sprintf(t, "%f", et); print(t); print("s\r\n"); for(;; return 0; }
  8. Like
    oPossum got a reaction from veryalive in Faster printing of single precision floating point   
    Printf supports the %e, %f and %g format specifiers for printing floating point numbers. Due to the automatic type promotion of varadic functions in C, the printf code must always print double precision floating point. This makes printing single precision floating point numbers slower than optimal. The precision rounding done by printf also imposes a speed penalty. The code presented here will print single precision floating point numbers much faster than printf - up to 75 times faster. It allows the number of significant digits to be 3 to 8 (a maximum of 7 is recommended). The printed number will be normalized to the range to 1.0 to less than 1000.0 and the appropriate SI prefix appended. This is similar to what the printf %e format does (1.0 to less than 10.0), but a bit easier to interpret. The full range of single precision float can not be represented by SI prefixes, so very small and very large numbers will have '?' as the prefix. This code is intended for display only. It should not be used to store values in a file for later conversion back to float due to limitations in rounding and range.
     
    A brief explanation of the code.
    Do a bitwise conversion of the float to a 32 bit unsigned integer. This is done with by getting the address of the float, casting to an unsigned integer pointer, and then dereferencing the pointer. Simply casting to an unsigned integer would not produce a bitwise copy.

    uint32_t s = *(uint32_t *)&f; The msb contains the sign flag. Append a '-' to the string if the floating point number is negative.
    if (s & (1UL << 31)) *a++ = '-'; Extract the 8 bit exponent.
    int e = (s >> 23) & 0xFF; Move the significand to the upper 24 bits. Set the msb to 0.
    s <<= 8; s &= ~(1UL << 31); An exponent of 255 is used for special values of NaN (not a number) and infinity. Handle these special cases and return.
    if (e == 255) { if (s) { strcpy(a, "NaN"); } else { strcpy(a, "Inf"); } return; An exponent of 0 is used to represent a value of 0 and for denormals. A value of 0 is handled by setting the exponent to 127 - the same exponent used to represent 1.0, and leaving the significand as 0. Denormal numbers do not have the implicit msb of 1, so they are normalized by shifting until the leading 1 is in the msb position.
    } else if (e == 0) { if (s) { e = 1; while (!(s & (1UL << 31))) s <<= 1, --e; } else { e = 127; } If the exponent is some other value, then set the msb of the significand.
    } else { s |= (1UL << 31); } Setup a pointer to the SI prefix. This will be adjusted as the value is normalized.
    char const * sp = "???????yzafpnum kMGTPEZY????" + 15; If the value is less than 1.0 it must be multiplied by 1000 until it is 1.0 or greater. Multiplication by 1000 is done by an implicit multiply by 1024 and then subtracting a multiply by 16 and a multiply by 8.
    if (e < 127) { do { s = s - (s >> 6) - (s >> 7); if (!(s & (1UL << 31))) s <<= 1, --e; e += 10; --sp; } while (e < 127); If the value is 1000.0 or more it must be divided by 1000 until it is less than 1000.0. An unrolled floating point divide is used for maximum speed.
    } else if (e > 135) { while (e > (126 + 10) || (e == (126 + 10) && s >= (1000UL << (32 - 10)))) { uint32_t n = s; s = 0; uint32_t d = 1000UL << (32 - 10); if (n >= d) n -= d, s |= (1UL << 31); d >>= 1; if (n >= d) n -= d, s |= (1UL << 30); d >>= 1; if (n >= d) n -= d, s |= (1UL << 29); d >>= 1; if (n >= d) n -= d, s |= (1UL << 28); d >>= 1; if (n >= d) n -= d, s |= (1UL << 27); d >>= 1; if (n >= d) n -= d, s |= (1UL << 26); d >>= 1; if (n >= d) n -= d, s |= (1UL << 25); d >>= 1; if (n >= d) n -= d, s |= (1UL << 24); d >>= 1; if (n >= d) n -= d, s |= (1UL << 23); d >>= 1; if (n >= d) n -= d, s |= (1UL << 22); d >>= 1; if (n >= d) n -= d, s |= (1UL << 21); d >>= 1; if (n >= d) n -= d, s |= (1UL << 20); d >>= 1; if (n >= d) n -= d, s |= (1UL << 19); d >>= 1; if (n >= d) n -= d, s |= (1UL << 18); d >>= 1; if (n >= d) n -= d, s |= (1UL << 17); d >>= 1; if (n >= d) n -= d, s |= (1UL << 16); d >>= 1; if (n >= d) n -= d, s |= (1UL << 15); d >>= 1; if (n >= d) n -= d, s |= (1UL << 14); d >>= 1; if (n >= d) n -= d, s |= (1UL << 13); d >>= 1; if (n >= d) n -= d, s |= (1UL << 12); d >>= 1; if (n >= d) n -= d, s |= (1UL << 11); d >>= 1; if (n >= d) n -= d, s |= (1UL << 10); d >>= 1; if (n >= d) n -= d, s |= (1UL << 9); d >>= 1; if (n >= d) n -= d, s |= (1UL << 8); d >>= 1; if (n >= d) s += (1UL << 8); if (!(s & (1UL << 31))) s <<= 1, --e; e -= 9; ++sp; } The divide code is quite time consuming, so it would be advantageous to quickly reduce very large numbers. A divide by 1,000,000,000,000 is used to improve performance for these large numbers.The preceding multiply code could do the same for very small numbers, but there is no speed advantage due to the multiply by 1000 using 2 shift/subtract operations and multiply by lager values requiring more than 2 per 1000.

    while (e > (150 + 16) || (e == (150 + 16) && s > (999999995904ULL >> 16))) { uint64_t n = s; n <<= 32; s = 0; uint64_t d = 1000000000000ULL << (64 - 40); if (n >= d) n -= d, s |= (1UL << 31); d >>= 1; if (n >= d) n -= d, s |= (1UL << 30); d >>= 1; if (n >= d) n -= d, s |= (1UL << 29); d >>= 1; if (n >= d) n -= d, s |= (1UL << 28); d >>= 1; if (n >= d) n -= d, s |= (1UL << 27); d >>= 1; if (n >= d) n -= d, s |= (1UL << 26); d >>= 1; if (n >= d) n -= d, s |= (1UL << 25); d >>= 1; if (n >= d) n -= d, s |= (1UL << 24); d >>= 1; if (n >= d) n -= d, s |= (1UL << 23); d >>= 1; if (n >= d) n -= d, s |= (1UL << 22); d >>= 1; if (n >= d) n -= d, s |= (1UL << 21); d >>= 1; if (n >= d) n -= d, s |= (1UL << 20); d >>= 1; if (n >= d) n -= d, s |= (1UL << 19); d >>= 1; if (n >= d) n -= d, s |= (1UL << 18); d >>= 1; if (n >= d) n -= d, s |= (1UL << 17); d >>= 1; if (n >= d) n -= d, s |= (1UL << 16); d >>= 1; if (n >= d) n -= d, s |= (1UL << 15); d >>= 1; if (n >= d) n -= d, s |= (1UL << 14); d >>= 1; if (n >= d) n -= d, s |= (1UL << 13); d >>= 1; if (n >= d) n -= d, s |= (1UL << 12); d >>= 1; if (n >= d) n -= d, s |= (1UL << 11); d >>= 1; if (n >= d) n -= d, s |= (1UL << 10); d >>= 1; if (n >= d) n -= d, s |= (1UL << 9); d >>= 1; if (n >= d) n -= d, s |= (1UL << 8); //d >>= 1; //if (n >= d) s += (1UL << 8); if (n) s += (1UL << 8); if (!(s & (1UL << 31))) s <<= 1, --e; e -= 39; sp += 4; } Rounding is the most difficult part of printing floating point numbers. Precalculated float constants are applied based on the value of the float and the specified number of significant digits. This simple method is fast and allows for good results for up to 7 significant digits.
    typedef struct { uint32_t s; int e; } TFR; TFR const r[] = { 0x800000UL << 7, 126 + 1, // 0.5 0xCCCCCDUL << 7, 122 + 1, // 0.05 0xA3D70AUL << 7, 119 + 1, // 0.005 0x83126FUL << 7, 116 + 1, // 0.0005 0xD1B717UL << 7, 112 + 1, // 0.00005 0xA7C5ACUL << 7, 109 + 1, // 0.000005 0x8637BDUL << 7, 106 + 1, // 0.0000005 0xD6BF95UL << 7, 102 + 1 // 0.00000005 }; if (d < 3) d = 3; else if (d > 8) d = 8; if (s) { TFR const *pr = &r[d - 3]; if (e < (126 + 4) || (e == (126 + 4) && s < (10UL << (32 - 4)))) { // < 10 pr += 2; } else if (e < (126 + 7) || (e == (126 + 7) && s < (100UL << (32 - 7)))) { // < 100 ++pr; } s += (pr->s >> (e - pr->e)); if (e == (126 + 10) && s >= (1000UL << (32 - 10))) s = (1UL << 31), e = 127, ++sp; else if (!(s & (1UL << 31))) s >>= 1, s |= (1UL << 31), ++e; } The integer part is printed using iterative subtraction of base 10 constants. This is typically faster than the common divide/modulus method.
    unsigned i = s >> 16; i >>= (136 - e); unsigned id = 1; char c; if (i >= (100 << 6)) { ++id; c = '0'; while (i >= (100 << 6)) i -= (100 << 6), ++c; *a++ = c; } if (id == 2 || i >= (10 << 6)) { ++id; c = '0'; while (i >= (10 << 6)) i -= (10 << 6), ++c; *a++ = c; } c = '0'; while (i >= (1 << 6)) i -= (1 << 6), ++c; *a++ = c; The fractional part is printed by iterative multiplication by 10.
    *a++ = '.'; if (e < 130) s >>= (130 - e); else s <<= (e - 130); d -= id; while (d) { s &= ((1UL << 28) - 1); s = (s << 3) + (s << 1); *a++ = '0' + (s >> 28); --d; } The SI prefix is appended and the string is terminated.
    *a++ = *sp; *a = 0; The resulting performance increase was more than I expected. 

    TI 4.4.4 GCC 4.9.1 --------------------------------------- %e 16402 6.47 s non-functional %f 16380 5.78 s non-functional %g 16402 4.69 s non-functional ftoas(7) 8480 0.12 s 11892 0.27 s ftoas(3) 8480 0.09 s 11892 0.17 s Results of the test code using ftoas(7), %e, %g, and %f
    0.000000 0.000000e+00 0 0.000000 1.401298? 0.000000e+00 0 0.000000 1.401298? 0.000000e+00 0 0.000000 9.809089? 0.000000e+00 0 0.000000 99.49219? 0.000000e+00 0 0.000000 1.000527? 0.000000e+00 0 0.000000 9.999666? 0.000000e+00 0 0.000000 99.99946? 0.000000e+00 0 0.000000 1.000000? 0.000000e+00 0 0.000000 12.00000? 1.200000e-38 1.2e-38 0.000000 100.0000? 1.000000e-37 1e-37 0.000000 1.000000? 1.000000e-36 1e-36 0.000000 10.00000? 1.000000e-35 1e-35 0.000000 100.0000? 1.000000e-34 1e-34 0.000000 1.000000? 1.000000e-33 1e-33 0.000000 10.00000? 1.000000e-32 1e-32 0.000000 100.0000? 1.000000e-31 1e-31 0.000000 1.000000? 1.000000e-30 1e-30 0.000000 10.00000? 1.000000e-29 1e-29 0.000000 100.0000? 1.000000e-28 1e-28 0.000000 1.000000? 1.000000e-27 1e-27 0.000000 10.00000? 1.000000e-26 1e-26 0.000000 100.0000? 1.000000e-25 1e-25 0.000000 1.000000y 1.000000e-24 1e-24 0.000000 10.00000y 1.000000e-23 1e-23 0.000000 100.0000y 1.000000e-22 1e-22 0.000000 1.000000z 1.000000e-21 1e-21 0.000000 10.00000z 1.000000e-20 1e-20 0.000000 100.0000z 1.000000e-19 1e-19 0.000000 1.000000a 1.000000e-18 1e-18 0.000000 10.00000a 1.000000e-17 1e-17 0.000000 100.0000a 1.000000e-16 1e-16 0.000000 1.000000f 1.000000e-15 1e-15 0.000000 10.00000f 1.000000e-14 1e-14 0.000000 100.0000f 1.000000e-13 1e-13 0.000000 1.000000p 1.000000e-12 1e-12 0.000000 10.00000p 1.000000e-11 1e-11 0.000000 100.0000p 1.000000e-10 1e-10 0.000000 1.000000n 1.000000e-09 1e-09 0.000000 10.00000n 1.000000e-08 1e-08 0.000000 100.0000n 1.000000e-07 1e-07 0.000000 1.000000u 1.000000e-06 1e-06 0.000001 10.00000u 1.000000e-05 1e-05 0.000010 100.0000u 1.000000e-04 0.0001 0.000100 1.000000m 1.000000e-03 0.001 0.001000 10.00000m 1.000000e-02 0.01 0.010000 100.0000m 1.000000e-01 0.1 0.100000 1.000000 1.000000e+00 1 1.000000 1.234568 1.234568e+00 1.23457 1.234568 10.00000 1.000000e+01 10 10.000000 100.0000 1.000000e+02 100 100.000000 1.000000k 1.000000e+03 1000 1000.000000 10.00000k 1.000000e+04 10000 10000.000000 100.0000k 1.000000e+05 100000 100000.000000 1.000000M 1.000000e+06 1e+06 1000000.000000 10.00000M 1.000000e+07 1e+07 10000000.000000 100.0000M 1.000000e+08 1e+08 100000000.000000 1.000000G 1.000000e+09 1e+09 1000000000.000000 10.00000G 1.000000e+10 1e+10 10000000000.000000 100.0000G 1.000000e+11 1e+11 99999997952.000010 1.000000T 1.000000e+12 1e+12 999999995903.999925 10.00000T 1.000000e+13 1e+13 9999999827968.000174 100.0000T 1.000000e+14 1e+14 100000000376832.008362 1.000000P 1.000000e+15 1e+15 999999986991104.125977 10.00000P 1.000000e+16 1e+16 10000000272564222.812653 100.0000P 1.000000e+17 1e+17 99999998430674934.387207 1.000000E 1.000000e+18 1e+18 999999984306749343.872070 10.00000E 1.000000e+19 1e+19 9999999980506448745.727539 100.0000E 1.000000e+20 1e+20 100000002004087710380.554199 1.000000Z 1.000000e+21 1e+21 1000000020040877103805.541992 10.00000Z 1.000000e+22 1e+22 9999999778196308612823.486328 100.0000Z 1.000000e+23 1e+23 99999997781963086128234.863281 1.000000Y 1.000000e+24 1e+24 1000000013848427772521972.656250 10.00000Y 1.000000e+25 1e+25 9999999562023527622222900.390625 100.0000Y 1.000000e+26 1e+26 100000002537764322757720947.265625 1.000000? 1.000000e+27 1e+27 999999988484154701232910156.250000 10.00000? 9.999999e+27 1e+28 9999999442119691371917724609.375000 100.0000? 1.000000e+29 1e+29 100000001504746651649475097656.250000 1.000000? 1.000000e+30 1e+30 1000000015047466516494750976562.500000 10.00000? 1.000000e+31 1e+31 9999999848243210315704345703125.000000 100.0000? 1.000000e+32 1e+32 100000003318135333061218261718750.000000 1.000000? 1.000000e+33 1e+33 999999994495727896690368652343750.000000 10.00000? 1.000000e+34 1e+34 9999999790214771032333374023437500.000000 100.0000? 1.000000e+35 1e+35 100000004091847860813140869140625000.000000 1.000000? 1.000000e+36 1e+36 999999961690316438674926757812500000.000000 10.00000? 1.000000e+37 1e+37 9999999933815813064575195312500000000.000000 100.0000? 1.000000e+38 1e+38 99999996802856898307800292968750000000.000000 340.0001? 3.400000e+38 3.4e+38 339999995214436411857604980468750000000.000000 NaN nan nan nan Inf +inf +inf +inf Complete code with test case.
    #include <msp430.h> #include <stdio.h> #include <stdint.h> #include <string.h> #include <math.h> static void print(char const *s) { while(*s) { while(!(UCA1IFG & UCTXIFG)); UCA1TXBUF = *s++; } } typedef struct { uint32_t s; int e; } TFR; TFR const r[] = { 0x800000UL << 7, 126 + 1, // 0.5 0xCCCCCDUL << 7, 122 + 1, // 0.05 0xA3D70AUL << 7, 119 + 1, // 0.005 0x83126FUL << 7, 116 + 1, // 0.0005 0xD1B717UL << 7, 112 + 1, // 0.00005 0xA7C5ACUL << 7, 109 + 1, // 0.000005 0x8637BDUL << 7, 106 + 1, // 0.0000005 0xD6BF95UL << 7, 102 + 1 // 0.00000005 }; void ftoas(char *a, float const f, unsigned d) { uint32_t s = *(uint32_t *)&f; if (s & (1UL << 31)) *a++ = '-'; int e = (s >> 23) & 0xFF; s <<= 8; s &= ~(1UL << 31); if (e == 255) { if (s) { strcpy(a, "NaN"); } else { strcpy(a, "Inf"); } return; } else if (e == 0) { if (s) { e = 1; while (!(s & (1UL << 31))) s <<= 1, --e; } else { e = 127; } } else { s |= (1UL << 31); } char const * sp = "???????yzafpnum kMGTPEZY????" + 15; if (e < 127) { do { s = s - (s >> 6) - (s >> 7); if (!(s & (1UL << 31))) s <<= 1, --e; e += 10; --sp; } while (e < 127); } else if (e > 135) { while (e > (150 + 16) || (e == (150 + 16) && s > (999999995904ULL >> 16))) { uint64_t n = s; n <<= 32; s = 0; uint64_t d = 1000000000000ULL << (64 - 40); if (n >= d) n -= d, s |= (1UL << 31); d >>= 1; if (n >= d) n -= d, s |= (1UL << 30); d >>= 1; if (n >= d) n -= d, s |= (1UL << 29); d >>= 1; if (n >= d) n -= d, s |= (1UL << 28); d >>= 1; if (n >= d) n -= d, s |= (1UL << 27); d >>= 1; if (n >= d) n -= d, s |= (1UL << 26); d >>= 1; if (n >= d) n -= d, s |= (1UL << 25); d >>= 1; if (n >= d) n -= d, s |= (1UL << 24); d >>= 1; if (n >= d) n -= d, s |= (1UL << 23); d >>= 1; if (n >= d) n -= d, s |= (1UL << 22); d >>= 1; if (n >= d) n -= d, s |= (1UL << 21); d >>= 1; if (n >= d) n -= d, s |= (1UL << 20); d >>= 1; if (n >= d) n -= d, s |= (1UL << 19); d >>= 1; if (n >= d) n -= d, s |= (1UL << 18); d >>= 1; if (n >= d) n -= d, s |= (1UL << 17); d >>= 1; if (n >= d) n -= d, s |= (1UL << 16); d >>= 1; if (n >= d) n -= d, s |= (1UL << 15); d >>= 1; if (n >= d) n -= d, s |= (1UL << 14); d >>= 1; if (n >= d) n -= d, s |= (1UL << 13); d >>= 1; if (n >= d) n -= d, s |= (1UL << 12); d >>= 1; if (n >= d) n -= d, s |= (1UL << 11); d >>= 1; if (n >= d) n -= d, s |= (1UL << 10); d >>= 1; if (n >= d) n -= d, s |= (1UL << 9); d >>= 1; if (n >= d) n -= d, s |= (1UL << 8); //d >>= 1; //if (n >= d) s += (1UL << 8); if (n) s += (1UL << 8); if (!(s & (1UL << 31))) s <<= 1, --e; e -= 39; sp += 4; } while (e > (126 + 10) || (e == (126 + 10) && s >= (1000UL << (32 - 10)))) { uint32_t n = s; s = 0; uint32_t d = 1000UL << (32 - 10); if (n >= d) n -= d, s |= (1UL << 31); d >>= 1; if (n >= d) n -= d, s |= (1UL << 30); d >>= 1; if (n >= d) n -= d, s |= (1UL << 29); d >>= 1; if (n >= d) n -= d, s |= (1UL << 28); d >>= 1; if (n >= d) n -= d, s |= (1UL << 27); d >>= 1; if (n >= d) n -= d, s |= (1UL << 26); d >>= 1; if (n >= d) n -= d, s |= (1UL << 25); d >>= 1; if (n >= d) n -= d, s |= (1UL << 24); d >>= 1; if (n >= d) n -= d, s |= (1UL << 23); d >>= 1; if (n >= d) n -= d, s |= (1UL << 22); d >>= 1; if (n >= d) n -= d, s |= (1UL << 21); d >>= 1; if (n >= d) n -= d, s |= (1UL << 20); d >>= 1; if (n >= d) n -= d, s |= (1UL << 19); d >>= 1; if (n >= d) n -= d, s |= (1UL << 18); d >>= 1; if (n >= d) n -= d, s |= (1UL << 17); d >>= 1; if (n >= d) n -= d, s |= (1UL << 16); d >>= 1; if (n >= d) n -= d, s |= (1UL << 15); d >>= 1; if (n >= d) n -= d, s |= (1UL << 14); d >>= 1; if (n >= d) n -= d, s |= (1UL << 13); d >>= 1; if (n >= d) n -= d, s |= (1UL << 12); d >>= 1; if (n >= d) n -= d, s |= (1UL << 11); d >>= 1; if (n >= d) n -= d, s |= (1UL << 10); d >>= 1; if (n >= d) n -= d, s |= (1UL << 9); d >>= 1; if (n >= d) n -= d, s |= (1UL << 8); d >>= 1; if (n >= d) s += (1UL << 8); if (!(s & (1UL << 31))) s <<= 1, --e; e -= 9; ++sp; } } if (d < 3) d = 3; else if (d > 8) d = 8; if (s) { TFR const *pr = &r[d - 3]; if (e < (126 + 4) || (e == (126 + 4) && s < (10UL << (32 - 4)))) { // < 10 pr += 2; } else if (e < (126 + 7) || (e == (126 + 7) && s < (100UL << (32 - 7)))) { // < 100 ++pr; } s += (pr->s >> (e - pr->e)); if (e == (126 + 10) && s >= (1000UL << (32 - 10))) s = (1UL << 31), e = 127, ++sp; else if (!(s & (1UL << 31))) s >>= 1, s |= (1UL << 31), ++e; } unsigned i = s >> 16; i >>= (136 - e); unsigned id = 1; char c; if (i >= (100 << 6)) { ++id; c = '0'; while (i >= (100 << 6)) i -= (100 << 6), ++c; *a++ = c; } if (id == 2 || i >= (10 << 6)) { ++id; c = '0'; while (i >= (10 << 6)) i -= (10 << 6), ++c; *a++ = c; } c = '0'; while (i >= (1 << 6)) i -= (1 << 6), ++c; *a++ = c; *a++ = '.'; if (e < 130) s >>= (130 - e); else s <<= (e - 130); d -= id; while (d) { s &= ((1UL << 28) - 1); s = (s << 3) + (s << 1); *a++ = '0' + (s >> 28); --d; } *a++ = *sp; *a = 0; } #define smclk_freq (32768UL * 31UL) // SMCLK frequency in hertz #define bps (9600UL) // Async serial bit rate int main(void) { WDTCTL = WDTPW | WDTHOLD; // Stop watchdog timer // P4SEL = BIT4 | BIT5; // Enable UART pins P4DIR = BIT4 | BIT5; // // // Initialize UART UCA1CTL1 = UCSWRST; // Hold USCI in reset to allow configuration UCA1CTL0 = 0; // No parity, LSB first, 8 bits, one stop bit, UART (async) const unsigned long brd = (smclk_freq + (bps >> 1)) / bps; // Bit rate divisor UCA1BR1 = (brd >> 12) & 0xFF; // High byte of whole divisor UCA1BR0 = (brd >> 4) & 0xFF; // Low byte of whole divisor UCA1MCTL = ((brd << 4) & 0xF0) | UCOS16; // Fractional divisor, oversampling mode UCA1CTL1 = UCSSEL_2; // Use SMCLK for bit rate generator, release reset char s[32], t[96]; float const tv[] = { 0.0f, 7.1e-46f, 1.0e-45f, 1.0e-44f, 1.0e-43f, 1.0e-42f, 1.0e-41f, 1.0e-40f, 1.0e-39f, 1.2e-38f, 1.0e-37f, 1.0e-36f, 1.0e-35f, 1.0e-34f, 1.0e-33f, 1.0e-32f, 1.0e-31f, 1.0e-30f, 1.0e-29f, 1.0e-28f, 1.0e-27f, 1.0e-26f, 1.0e-25f, 1.0e-24f, 1.0e-23f, 1.0e-22f, 1.0e-21f, 1.0e-20f, 1.0e-19f, 1.0e-18f, 1.0e-17f, 1.0e-16f, 1.0e-15f, 1.0e-14f, 1.0e-13f, 1.0e-12f, 1.0e-11f, 1.0e-10f, 1.0e-9f, 1.0e-8f, 1.0e-7f, 0.000001f, 0.00001f, 0.0001f, 0.001f, 0.01f, 0.1f, 1.0f, 1.23456789f, 10.0f, 100.0f, 1000.0f, 10000.0f, 100000.0f, 1000000.0f, 10000000.0f, 100000000.0f, 1000000000.0f, 1.0e10f, 1.0e11f, 1.0e12f, 1.0e13f, 1.0e14f, 1.0e15f, 1.0e16f, 1.0e17f, 1.0e18f, 1.0e19f, 1.0e20f, 1.0e21f, 1.0e22f, 1.0e23f, 1.0e24f, 1.0e25f, 1.0e26f, 1.0e27f, 1.0e28f, 1.0e29f, 1.0e30f, 1.0e31f, 1.0e32f, 1.0e33f, 1.0e34f, 1.0e35f, 1.0e36f, 1.0e37f, 1.0e38f, 3.4e38f, NAN, INFINITY }; TA0EX0 = 7; TA0CTL = TASSEL_2 | ID_3 | MC_2; TA0CTL |= TACLR; unsigned i; for (i = 0; i < sizeof(tv) / sizeof(tv[0]); ++i) { float const f = tv[i]; ftoas(s, f, 7); //print(s); print("\r\n"); //sprintf(t, "%e", f); //sprintf(t, "%f", f); //sprintf(t, "%g", f); //sprintf(t, "%e %f %g %s\r\n", f, f, f, s); sprintf(t, "%s %e %g %f\r\n", s, f, f, f); print(t); } volatile float et = ((float)TA0R + ((TA0CTL & TAIFG) ? 65536.0 : 0.0)) * 63.0f / 1000000.0f; // Elapsed time in microseconds ftoas(t, et, 5); //sprintf(t, "%f", et); print(t); print("s\r\n"); for(;; return 0; }
  9. Like
    oPossum got a reaction from artium in An unusual way to print integers   
    A common method of printing integers uses a divide (/) and modulus (%) operation to calculate each digit. This is portable and supports any number base. The string is created in reverse order and the digit generation can easily be terminated when all significant digits have been printed, so additional logic for leading zero suppression is not needed.
     
    A much faster method uses BCD math to generate a BCD value that can then be easily convered to a string. This requires assembly or intrinsic C functions for the most efficient BCD math, so it isn't portable.
     
    The method shown here creates a fixed point binary fraction using a single division operation and then extracts the digits using bit shifts and an add. The division is unrolled and crafted to generate a result with the binary point between words to allow for efficient digit extraction. After division, the upper word contains the first digit and the lower word contains a binary fraction with all other digits. The digits are generated from the fraction by multiplying by 10. This is done with 2 shifts and an add. The result will be in the upper word. Preparation for the next digit is just a matter of clearing the upper word.
    This certainly seems like an awkward method, but it requires far less math than doing division for each digit.
     

    // Print unsigned 16 bit integer with leading zero suppression static void spu16(char * s, uint16_t n) { uint16_t d = 10000 << 2; uint32_t r = 0; if (n >= d) n -= d, r |= (1 << 18); d >>= 1; if (n >= d) n -= d, r |= (1 << 17); n <<= 1; if (n >= d) n -= d, r |= (1 << 16); n <<= 1; if (n >= d) n -= d, r |= (1 << 15); n <<= 1; if (n >= d) n -= d, r |= (1 << 14); n <<= 1; if (n >= d) n -= d, r |= (1 << 13); n <<= 1; if (n >= d) n -= d, r |= (1 << 12); n <<= 1; if (n >= d) n -= d, r |= (1 << 11); n <<= 1; if (n >= d) n -= d, r |= (1 << 10); n <<= 1; if (n >= d) n -= d, r |= (1 << 9); n <<= 1; if (n >= d) n -= d, r |= (1 << 8); n <<= 1; if (n >= d) n -= d, r |= (1 << 7); n <<= 1; if (n >= d) n -= d, r |= (1 << 6); n <<= 1; if (n >= d) n -= d, r |= (1 << 5); n <<= 1; if (n >= d) n -= d, r |= (1 << 4); n <<= 1; if (n >= d) n -= d, r |= (1 << 3); n <<= 1; if (n >= d) n -= d, r |= (1 << 2); r += (1 << 2); unsigned c, z; c = z = (r >> 16); unsigned i = 4; do { if (z || c) *s++ = z = ('0' + c); r &= 0xFFFF; r <<= 1; r += (r << 2); c = (r >> 16); } while (--i); *s++ = '0' + c; *s = 0; } When a 32 bit hardware multiplier is available, the code can be modified to multiply by the reciprocal rather than divide. The multiplication by 10 is now explicit to also take advantage of the hardware multiplier.

    static void spu16(char * s, uint16_t n) { uint32_t r = ((53687UL * n) >> 13) + (1 << 2); unsigned c, z; c = z = (r >> 16); unsigned i = 4; do { if (z || c) *s++ = z = ('0' + c); r = (r & 0xFFFF) * 10; c = (r >> 16); } while (--i); *s++ = '0' + c; *s = 0; }
  10. Like
    oPossum got a reaction from sq7bti in Faster printing of single precision floating point   
    Printf supports the %e, %f and %g format specifiers for printing floating point numbers. Due to the automatic type promotion of varadic functions in C, the printf code must always print double precision floating point. This makes printing single precision floating point numbers slower than optimal. The precision rounding done by printf also imposes a speed penalty. The code presented here will print single precision floating point numbers much faster than printf - up to 75 times faster. It allows the number of significant digits to be 3 to 8 (a maximum of 7 is recommended). The printed number will be normalized to the range to 1.0 to less than 1000.0 and the appropriate SI prefix appended. This is similar to what the printf %e format does (1.0 to less than 10.0), but a bit easier to interpret. The full range of single precision float can not be represented by SI prefixes, so very small and very large numbers will have '?' as the prefix. This code is intended for display only. It should not be used to store values in a file for later conversion back to float due to limitations in rounding and range.
     
    A brief explanation of the code.
    Do a bitwise conversion of the float to a 32 bit unsigned integer. This is done with by getting the address of the float, casting to an unsigned integer pointer, and then dereferencing the pointer. Simply casting to an unsigned integer would not produce a bitwise copy.

    uint32_t s = *(uint32_t *)&f; The msb contains the sign flag. Append a '-' to the string if the floating point number is negative.
    if (s & (1UL << 31)) *a++ = '-'; Extract the 8 bit exponent.
    int e = (s >> 23) & 0xFF; Move the significand to the upper 24 bits. Set the msb to 0.
    s <<= 8; s &= ~(1UL << 31); An exponent of 255 is used for special values of NaN (not a number) and infinity. Handle these special cases and return.
    if (e == 255) { if (s) { strcpy(a, "NaN"); } else { strcpy(a, "Inf"); } return; An exponent of 0 is used to represent a value of 0 and for denormals. A value of 0 is handled by setting the exponent to 127 - the same exponent used to represent 1.0, and leaving the significand as 0. Denormal numbers do not have the implicit msb of 1, so they are normalized by shifting until the leading 1 is in the msb position.
    } else if (e == 0) { if (s) { e = 1; while (!(s & (1UL << 31))) s <<= 1, --e; } else { e = 127; } If the exponent is some other value, then set the msb of the significand.
    } else { s |= (1UL << 31); } Setup a pointer to the SI prefix. This will be adjusted as the value is normalized.
    char const * sp = "???????yzafpnum kMGTPEZY????" + 15; If the value is less than 1.0 it must be multiplied by 1000 until it is 1.0 or greater. Multiplication by 1000 is done by an implicit multiply by 1024 and then subtracting a multiply by 16 and a multiply by 8.
    if (e < 127) { do { s = s - (s >> 6) - (s >> 7); if (!(s & (1UL << 31))) s <<= 1, --e; e += 10; --sp; } while (e < 127); If the value is 1000.0 or more it must be divided by 1000 until it is less than 1000.0. An unrolled floating point divide is used for maximum speed.
    } else if (e > 135) { while (e > (126 + 10) || (e == (126 + 10) && s >= (1000UL << (32 - 10)))) { uint32_t n = s; s = 0; uint32_t d = 1000UL << (32 - 10); if (n >= d) n -= d, s |= (1UL << 31); d >>= 1; if (n >= d) n -= d, s |= (1UL << 30); d >>= 1; if (n >= d) n -= d, s |= (1UL << 29); d >>= 1; if (n >= d) n -= d, s |= (1UL << 28); d >>= 1; if (n >= d) n -= d, s |= (1UL << 27); d >>= 1; if (n >= d) n -= d, s |= (1UL << 26); d >>= 1; if (n >= d) n -= d, s |= (1UL << 25); d >>= 1; if (n >= d) n -= d, s |= (1UL << 24); d >>= 1; if (n >= d) n -= d, s |= (1UL << 23); d >>= 1; if (n >= d) n -= d, s |= (1UL << 22); d >>= 1; if (n >= d) n -= d, s |= (1UL << 21); d >>= 1; if (n >= d) n -= d, s |= (1UL << 20); d >>= 1; if (n >= d) n -= d, s |= (1UL << 19); d >>= 1; if (n >= d) n -= d, s |= (1UL << 18); d >>= 1; if (n >= d) n -= d, s |= (1UL << 17); d >>= 1; if (n >= d) n -= d, s |= (1UL << 16); d >>= 1; if (n >= d) n -= d, s |= (1UL << 15); d >>= 1; if (n >= d) n -= d, s |= (1UL << 14); d >>= 1; if (n >= d) n -= d, s |= (1UL << 13); d >>= 1; if (n >= d) n -= d, s |= (1UL << 12); d >>= 1; if (n >= d) n -= d, s |= (1UL << 11); d >>= 1; if (n >= d) n -= d, s |= (1UL << 10); d >>= 1; if (n >= d) n -= d, s |= (1UL << 9); d >>= 1; if (n >= d) n -= d, s |= (1UL << 8); d >>= 1; if (n >= d) s += (1UL << 8); if (!(s & (1UL << 31))) s <<= 1, --e; e -= 9; ++sp; } The divide code is quite time consuming, so it would be advantageous to quickly reduce very large numbers. A divide by 1,000,000,000,000 is used to improve performance for these large numbers.The preceding multiply code could do the same for very small numbers, but there is no speed advantage due to the multiply by 1000 using 2 shift/subtract operations and multiply by lager values requiring more than 2 per 1000.

    while (e > (150 + 16) || (e == (150 + 16) && s > (999999995904ULL >> 16))) { uint64_t n = s; n <<= 32; s = 0; uint64_t d = 1000000000000ULL << (64 - 40); if (n >= d) n -= d, s |= (1UL << 31); d >>= 1; if (n >= d) n -= d, s |= (1UL << 30); d >>= 1; if (n >= d) n -= d, s |= (1UL << 29); d >>= 1; if (n >= d) n -= d, s |= (1UL << 28); d >>= 1; if (n >= d) n -= d, s |= (1UL << 27); d >>= 1; if (n >= d) n -= d, s |= (1UL << 26); d >>= 1; if (n >= d) n -= d, s |= (1UL << 25); d >>= 1; if (n >= d) n -= d, s |= (1UL << 24); d >>= 1; if (n >= d) n -= d, s |= (1UL << 23); d >>= 1; if (n >= d) n -= d, s |= (1UL << 22); d >>= 1; if (n >= d) n -= d, s |= (1UL << 21); d >>= 1; if (n >= d) n -= d, s |= (1UL << 20); d >>= 1; if (n >= d) n -= d, s |= (1UL << 19); d >>= 1; if (n >= d) n -= d, s |= (1UL << 18); d >>= 1; if (n >= d) n -= d, s |= (1UL << 17); d >>= 1; if (n >= d) n -= d, s |= (1UL << 16); d >>= 1; if (n >= d) n -= d, s |= (1UL << 15); d >>= 1; if (n >= d) n -= d, s |= (1UL << 14); d >>= 1; if (n >= d) n -= d, s |= (1UL << 13); d >>= 1; if (n >= d) n -= d, s |= (1UL << 12); d >>= 1; if (n >= d) n -= d, s |= (1UL << 11); d >>= 1; if (n >= d) n -= d, s |= (1UL << 10); d >>= 1; if (n >= d) n -= d, s |= (1UL << 9); d >>= 1; if (n >= d) n -= d, s |= (1UL << 8); //d >>= 1; //if (n >= d) s += (1UL << 8); if (n) s += (1UL << 8); if (!(s & (1UL << 31))) s <<= 1, --e; e -= 39; sp += 4; } Rounding is the most difficult part of printing floating point numbers. Precalculated float constants are applied based on the value of the float and the specified number of significant digits. This simple method is fast and allows for good results for up to 7 significant digits.
    typedef struct { uint32_t s; int e; } TFR; TFR const r[] = { 0x800000UL << 7, 126 + 1, // 0.5 0xCCCCCDUL << 7, 122 + 1, // 0.05 0xA3D70AUL << 7, 119 + 1, // 0.005 0x83126FUL << 7, 116 + 1, // 0.0005 0xD1B717UL << 7, 112 + 1, // 0.00005 0xA7C5ACUL << 7, 109 + 1, // 0.000005 0x8637BDUL << 7, 106 + 1, // 0.0000005 0xD6BF95UL << 7, 102 + 1 // 0.00000005 }; if (d < 3) d = 3; else if (d > 8) d = 8; if (s) { TFR const *pr = &r[d - 3]; if (e < (126 + 4) || (e == (126 + 4) && s < (10UL << (32 - 4)))) { // < 10 pr += 2; } else if (e < (126 + 7) || (e == (126 + 7) && s < (100UL << (32 - 7)))) { // < 100 ++pr; } s += (pr->s >> (e - pr->e)); if (e == (126 + 10) && s >= (1000UL << (32 - 10))) s = (1UL << 31), e = 127, ++sp; else if (!(s & (1UL << 31))) s >>= 1, s |= (1UL << 31), ++e; } The integer part is printed using iterative subtraction of base 10 constants. This is typically faster than the common divide/modulus method.
    unsigned i = s >> 16; i >>= (136 - e); unsigned id = 1; char c; if (i >= (100 << 6)) { ++id; c = '0'; while (i >= (100 << 6)) i -= (100 << 6), ++c; *a++ = c; } if (id == 2 || i >= (10 << 6)) { ++id; c = '0'; while (i >= (10 << 6)) i -= (10 << 6), ++c; *a++ = c; } c = '0'; while (i >= (1 << 6)) i -= (1 << 6), ++c; *a++ = c; The fractional part is printed by iterative multiplication by 10.
    *a++ = '.'; if (e < 130) s >>= (130 - e); else s <<= (e - 130); d -= id; while (d) { s &= ((1UL << 28) - 1); s = (s << 3) + (s << 1); *a++ = '0' + (s >> 28); --d; } The SI prefix is appended and the string is terminated.
    *a++ = *sp; *a = 0; The resulting performance increase was more than I expected. 

    TI 4.4.4 GCC 4.9.1 --------------------------------------- %e 16402 6.47 s non-functional %f 16380 5.78 s non-functional %g 16402 4.69 s non-functional ftoas(7) 8480 0.12 s 11892 0.27 s ftoas(3) 8480 0.09 s 11892 0.17 s Results of the test code using ftoas(7), %e, %g, and %f
    0.000000 0.000000e+00 0 0.000000 1.401298? 0.000000e+00 0 0.000000 1.401298? 0.000000e+00 0 0.000000 9.809089? 0.000000e+00 0 0.000000 99.49219? 0.000000e+00 0 0.000000 1.000527? 0.000000e+00 0 0.000000 9.999666? 0.000000e+00 0 0.000000 99.99946? 0.000000e+00 0 0.000000 1.000000? 0.000000e+00 0 0.000000 12.00000? 1.200000e-38 1.2e-38 0.000000 100.0000? 1.000000e-37 1e-37 0.000000 1.000000? 1.000000e-36 1e-36 0.000000 10.00000? 1.000000e-35 1e-35 0.000000 100.0000? 1.000000e-34 1e-34 0.000000 1.000000? 1.000000e-33 1e-33 0.000000 10.00000? 1.000000e-32 1e-32 0.000000 100.0000? 1.000000e-31 1e-31 0.000000 1.000000? 1.000000e-30 1e-30 0.000000 10.00000? 1.000000e-29 1e-29 0.000000 100.0000? 1.000000e-28 1e-28 0.000000 1.000000? 1.000000e-27 1e-27 0.000000 10.00000? 1.000000e-26 1e-26 0.000000 100.0000? 1.000000e-25 1e-25 0.000000 1.000000y 1.000000e-24 1e-24 0.000000 10.00000y 1.000000e-23 1e-23 0.000000 100.0000y 1.000000e-22 1e-22 0.000000 1.000000z 1.000000e-21 1e-21 0.000000 10.00000z 1.000000e-20 1e-20 0.000000 100.0000z 1.000000e-19 1e-19 0.000000 1.000000a 1.000000e-18 1e-18 0.000000 10.00000a 1.000000e-17 1e-17 0.000000 100.0000a 1.000000e-16 1e-16 0.000000 1.000000f 1.000000e-15 1e-15 0.000000 10.00000f 1.000000e-14 1e-14 0.000000 100.0000f 1.000000e-13 1e-13 0.000000 1.000000p 1.000000e-12 1e-12 0.000000 10.00000p 1.000000e-11 1e-11 0.000000 100.0000p 1.000000e-10 1e-10 0.000000 1.000000n 1.000000e-09 1e-09 0.000000 10.00000n 1.000000e-08 1e-08 0.000000 100.0000n 1.000000e-07 1e-07 0.000000 1.000000u 1.000000e-06 1e-06 0.000001 10.00000u 1.000000e-05 1e-05 0.000010 100.0000u 1.000000e-04 0.0001 0.000100 1.000000m 1.000000e-03 0.001 0.001000 10.00000m 1.000000e-02 0.01 0.010000 100.0000m 1.000000e-01 0.1 0.100000 1.000000 1.000000e+00 1 1.000000 1.234568 1.234568e+00 1.23457 1.234568 10.00000 1.000000e+01 10 10.000000 100.0000 1.000000e+02 100 100.000000 1.000000k 1.000000e+03 1000 1000.000000 10.00000k 1.000000e+04 10000 10000.000000 100.0000k 1.000000e+05 100000 100000.000000 1.000000M 1.000000e+06 1e+06 1000000.000000 10.00000M 1.000000e+07 1e+07 10000000.000000 100.0000M 1.000000e+08 1e+08 100000000.000000 1.000000G 1.000000e+09 1e+09 1000000000.000000 10.00000G 1.000000e+10 1e+10 10000000000.000000 100.0000G 1.000000e+11 1e+11 99999997952.000010 1.000000T 1.000000e+12 1e+12 999999995903.999925 10.00000T 1.000000e+13 1e+13 9999999827968.000174 100.0000T 1.000000e+14 1e+14 100000000376832.008362 1.000000P 1.000000e+15 1e+15 999999986991104.125977 10.00000P 1.000000e+16 1e+16 10000000272564222.812653 100.0000P 1.000000e+17 1e+17 99999998430674934.387207 1.000000E 1.000000e+18 1e+18 999999984306749343.872070 10.00000E 1.000000e+19 1e+19 9999999980506448745.727539 100.0000E 1.000000e+20 1e+20 100000002004087710380.554199 1.000000Z 1.000000e+21 1e+21 1000000020040877103805.541992 10.00000Z 1.000000e+22 1e+22 9999999778196308612823.486328 100.0000Z 1.000000e+23 1e+23 99999997781963086128234.863281 1.000000Y 1.000000e+24 1e+24 1000000013848427772521972.656250 10.00000Y 1.000000e+25 1e+25 9999999562023527622222900.390625 100.0000Y 1.000000e+26 1e+26 100000002537764322757720947.265625 1.000000? 1.000000e+27 1e+27 999999988484154701232910156.250000 10.00000? 9.999999e+27 1e+28 9999999442119691371917724609.375000 100.0000? 1.000000e+29 1e+29 100000001504746651649475097656.250000 1.000000? 1.000000e+30 1e+30 1000000015047466516494750976562.500000 10.00000? 1.000000e+31 1e+31 9999999848243210315704345703125.000000 100.0000? 1.000000e+32 1e+32 100000003318135333061218261718750.000000 1.000000? 1.000000e+33 1e+33 999999994495727896690368652343750.000000 10.00000? 1.000000e+34 1e+34 9999999790214771032333374023437500.000000 100.0000? 1.000000e+35 1e+35 100000004091847860813140869140625000.000000 1.000000? 1.000000e+36 1e+36 999999961690316438674926757812500000.000000 10.00000? 1.000000e+37 1e+37 9999999933815813064575195312500000000.000000 100.0000? 1.000000e+38 1e+38 99999996802856898307800292968750000000.000000 340.0001? 3.400000e+38 3.4e+38 339999995214436411857604980468750000000.000000 NaN nan nan nan Inf +inf +inf +inf Complete code with test case.
    #include <msp430.h> #include <stdio.h> #include <stdint.h> #include <string.h> #include <math.h> static void print(char const *s) { while(*s) { while(!(UCA1IFG & UCTXIFG)); UCA1TXBUF = *s++; } } typedef struct { uint32_t s; int e; } TFR; TFR const r[] = { 0x800000UL << 7, 126 + 1, // 0.5 0xCCCCCDUL << 7, 122 + 1, // 0.05 0xA3D70AUL << 7, 119 + 1, // 0.005 0x83126FUL << 7, 116 + 1, // 0.0005 0xD1B717UL << 7, 112 + 1, // 0.00005 0xA7C5ACUL << 7, 109 + 1, // 0.000005 0x8637BDUL << 7, 106 + 1, // 0.0000005 0xD6BF95UL << 7, 102 + 1 // 0.00000005 }; void ftoas(char *a, float const f, unsigned d) { uint32_t s = *(uint32_t *)&f; if (s & (1UL << 31)) *a++ = '-'; int e = (s >> 23) & 0xFF; s <<= 8; s &= ~(1UL << 31); if (e == 255) { if (s) { strcpy(a, "NaN"); } else { strcpy(a, "Inf"); } return; } else if (e == 0) { if (s) { e = 1; while (!(s & (1UL << 31))) s <<= 1, --e; } else { e = 127; } } else { s |= (1UL << 31); } char const * sp = "???????yzafpnum kMGTPEZY????" + 15; if (e < 127) { do { s = s - (s >> 6) - (s >> 7); if (!(s & (1UL << 31))) s <<= 1, --e; e += 10; --sp; } while (e < 127); } else if (e > 135) { while (e > (150 + 16) || (e == (150 + 16) && s > (999999995904ULL >> 16))) { uint64_t n = s; n <<= 32; s = 0; uint64_t d = 1000000000000ULL << (64 - 40); if (n >= d) n -= d, s |= (1UL << 31); d >>= 1; if (n >= d) n -= d, s |= (1UL << 30); d >>= 1; if (n >= d) n -= d, s |= (1UL << 29); d >>= 1; if (n >= d) n -= d, s |= (1UL << 28); d >>= 1; if (n >= d) n -= d, s |= (1UL << 27); d >>= 1; if (n >= d) n -= d, s |= (1UL << 26); d >>= 1; if (n >= d) n -= d, s |= (1UL << 25); d >>= 1; if (n >= d) n -= d, s |= (1UL << 24); d >>= 1; if (n >= d) n -= d, s |= (1UL << 23); d >>= 1; if (n >= d) n -= d, s |= (1UL << 22); d >>= 1; if (n >= d) n -= d, s |= (1UL << 21); d >>= 1; if (n >= d) n -= d, s |= (1UL << 20); d >>= 1; if (n >= d) n -= d, s |= (1UL << 19); d >>= 1; if (n >= d) n -= d, s |= (1UL << 18); d >>= 1; if (n >= d) n -= d, s |= (1UL << 17); d >>= 1; if (n >= d) n -= d, s |= (1UL << 16); d >>= 1; if (n >= d) n -= d, s |= (1UL << 15); d >>= 1; if (n >= d) n -= d, s |= (1UL << 14); d >>= 1; if (n >= d) n -= d, s |= (1UL << 13); d >>= 1; if (n >= d) n -= d, s |= (1UL << 12); d >>= 1; if (n >= d) n -= d, s |= (1UL << 11); d >>= 1; if (n >= d) n -= d, s |= (1UL << 10); d >>= 1; if (n >= d) n -= d, s |= (1UL << 9); d >>= 1; if (n >= d) n -= d, s |= (1UL << 8); //d >>= 1; //if (n >= d) s += (1UL << 8); if (n) s += (1UL << 8); if (!(s & (1UL << 31))) s <<= 1, --e; e -= 39; sp += 4; } while (e > (126 + 10) || (e == (126 + 10) && s >= (1000UL << (32 - 10)))) { uint32_t n = s; s = 0; uint32_t d = 1000UL << (32 - 10); if (n >= d) n -= d, s |= (1UL << 31); d >>= 1; if (n >= d) n -= d, s |= (1UL << 30); d >>= 1; if (n >= d) n -= d, s |= (1UL << 29); d >>= 1; if (n >= d) n -= d, s |= (1UL << 28); d >>= 1; if (n >= d) n -= d, s |= (1UL << 27); d >>= 1; if (n >= d) n -= d, s |= (1UL << 26); d >>= 1; if (n >= d) n -= d, s |= (1UL << 25); d >>= 1; if (n >= d) n -= d, s |= (1UL << 24); d >>= 1; if (n >= d) n -= d, s |= (1UL << 23); d >>= 1; if (n >= d) n -= d, s |= (1UL << 22); d >>= 1; if (n >= d) n -= d, s |= (1UL << 21); d >>= 1; if (n >= d) n -= d, s |= (1UL << 20); d >>= 1; if (n >= d) n -= d, s |= (1UL << 19); d >>= 1; if (n >= d) n -= d, s |= (1UL << 18); d >>= 1; if (n >= d) n -= d, s |= (1UL << 17); d >>= 1; if (n >= d) n -= d, s |= (1UL << 16); d >>= 1; if (n >= d) n -= d, s |= (1UL << 15); d >>= 1; if (n >= d) n -= d, s |= (1UL << 14); d >>= 1; if (n >= d) n -= d, s |= (1UL << 13); d >>= 1; if (n >= d) n -= d, s |= (1UL << 12); d >>= 1; if (n >= d) n -= d, s |= (1UL << 11); d >>= 1; if (n >= d) n -= d, s |= (1UL << 10); d >>= 1; if (n >= d) n -= d, s |= (1UL << 9); d >>= 1; if (n >= d) n -= d, s |= (1UL << 8); d >>= 1; if (n >= d) s += (1UL << 8); if (!(s & (1UL << 31))) s <<= 1, --e; e -= 9; ++sp; } } if (d < 3) d = 3; else if (d > 8) d = 8; if (s) { TFR const *pr = &r[d - 3]; if (e < (126 + 4) || (e == (126 + 4) && s < (10UL << (32 - 4)))) { // < 10 pr += 2; } else if (e < (126 + 7) || (e == (126 + 7) && s < (100UL << (32 - 7)))) { // < 100 ++pr; } s += (pr->s >> (e - pr->e)); if (e == (126 + 10) && s >= (1000UL << (32 - 10))) s = (1UL << 31), e = 127, ++sp; else if (!(s & (1UL << 31))) s >>= 1, s |= (1UL << 31), ++e; } unsigned i = s >> 16; i >>= (136 - e); unsigned id = 1; char c; if (i >= (100 << 6)) { ++id; c = '0'; while (i >= (100 << 6)) i -= (100 << 6), ++c; *a++ = c; } if (id == 2 || i >= (10 << 6)) { ++id; c = '0'; while (i >= (10 << 6)) i -= (10 << 6), ++c; *a++ = c; } c = '0'; while (i >= (1 << 6)) i -= (1 << 6), ++c; *a++ = c; *a++ = '.'; if (e < 130) s >>= (130 - e); else s <<= (e - 130); d -= id; while (d) { s &= ((1UL << 28) - 1); s = (s << 3) + (s << 1); *a++ = '0' + (s >> 28); --d; } *a++ = *sp; *a = 0; } #define smclk_freq (32768UL * 31UL) // SMCLK frequency in hertz #define bps (9600UL) // Async serial bit rate int main(void) { WDTCTL = WDTPW | WDTHOLD; // Stop watchdog timer // P4SEL = BIT4 | BIT5; // Enable UART pins P4DIR = BIT4 | BIT5; // // // Initialize UART UCA1CTL1 = UCSWRST; // Hold USCI in reset to allow configuration UCA1CTL0 = 0; // No parity, LSB first, 8 bits, one stop bit, UART (async) const unsigned long brd = (smclk_freq + (bps >> 1)) / bps; // Bit rate divisor UCA1BR1 = (brd >> 12) & 0xFF; // High byte of whole divisor UCA1BR0 = (brd >> 4) & 0xFF; // Low byte of whole divisor UCA1MCTL = ((brd << 4) & 0xF0) | UCOS16; // Fractional divisor, oversampling mode UCA1CTL1 = UCSSEL_2; // Use SMCLK for bit rate generator, release reset char s[32], t[96]; float const tv[] = { 0.0f, 7.1e-46f, 1.0e-45f, 1.0e-44f, 1.0e-43f, 1.0e-42f, 1.0e-41f, 1.0e-40f, 1.0e-39f, 1.2e-38f, 1.0e-37f, 1.0e-36f, 1.0e-35f, 1.0e-34f, 1.0e-33f, 1.0e-32f, 1.0e-31f, 1.0e-30f, 1.0e-29f, 1.0e-28f, 1.0e-27f, 1.0e-26f, 1.0e-25f, 1.0e-24f, 1.0e-23f, 1.0e-22f, 1.0e-21f, 1.0e-20f, 1.0e-19f, 1.0e-18f, 1.0e-17f, 1.0e-16f, 1.0e-15f, 1.0e-14f, 1.0e-13f, 1.0e-12f, 1.0e-11f, 1.0e-10f, 1.0e-9f, 1.0e-8f, 1.0e-7f, 0.000001f, 0.00001f, 0.0001f, 0.001f, 0.01f, 0.1f, 1.0f, 1.23456789f, 10.0f, 100.0f, 1000.0f, 10000.0f, 100000.0f, 1000000.0f, 10000000.0f, 100000000.0f, 1000000000.0f, 1.0e10f, 1.0e11f, 1.0e12f, 1.0e13f, 1.0e14f, 1.0e15f, 1.0e16f, 1.0e17f, 1.0e18f, 1.0e19f, 1.0e20f, 1.0e21f, 1.0e22f, 1.0e23f, 1.0e24f, 1.0e25f, 1.0e26f, 1.0e27f, 1.0e28f, 1.0e29f, 1.0e30f, 1.0e31f, 1.0e32f, 1.0e33f, 1.0e34f, 1.0e35f, 1.0e36f, 1.0e37f, 1.0e38f, 3.4e38f, NAN, INFINITY }; TA0EX0 = 7; TA0CTL = TASSEL_2 | ID_3 | MC_2; TA0CTL |= TACLR; unsigned i; for (i = 0; i < sizeof(tv) / sizeof(tv[0]); ++i) { float const f = tv[i]; ftoas(s, f, 7); //print(s); print("\r\n"); //sprintf(t, "%e", f); //sprintf(t, "%f", f); //sprintf(t, "%g", f); //sprintf(t, "%e %f %g %s\r\n", f, f, f, s); sprintf(t, "%s %e %g %f\r\n", s, f, f, f); print(t); } volatile float et = ((float)TA0R + ((TA0CTL & TAIFG) ? 65536.0 : 0.0)) * 63.0f / 1000000.0f; // Elapsed time in microseconds ftoas(t, et, 5); //sprintf(t, "%f", et); print(t); print("s\r\n"); for(;; return 0; }
  11. Like
    oPossum got a reaction from abecedarian in MSP432 and high speed interrupts   
    Yes, only 1 interrupt. The PPS will trigger the interrupt and the timer capture register will hold the *exact* time count when the PPS input changed. Using timer capture eliminates the jitter cause by interrupt latency.
  12. Like
    oPossum got a reaction from tripwire in Faster printing of single precision floating point   
    Printf supports the %e, %f and %g format specifiers for printing floating point numbers. Due to the automatic type promotion of varadic functions in C, the printf code must always print double precision floating point. This makes printing single precision floating point numbers slower than optimal. The precision rounding done by printf also imposes a speed penalty. The code presented here will print single precision floating point numbers much faster than printf - up to 75 times faster. It allows the number of significant digits to be 3 to 8 (a maximum of 7 is recommended). The printed number will be normalized to the range to 1.0 to less than 1000.0 and the appropriate SI prefix appended. This is similar to what the printf %e format does (1.0 to less than 10.0), but a bit easier to interpret. The full range of single precision float can not be represented by SI prefixes, so very small and very large numbers will have '?' as the prefix. This code is intended for display only. It should not be used to store values in a file for later conversion back to float due to limitations in rounding and range.
     
    A brief explanation of the code.
    Do a bitwise conversion of the float to a 32 bit unsigned integer. This is done with by getting the address of the float, casting to an unsigned integer pointer, and then dereferencing the pointer. Simply casting to an unsigned integer would not produce a bitwise copy.

    uint32_t s = *(uint32_t *)&f; The msb contains the sign flag. Append a '-' to the string if the floating point number is negative.
    if (s & (1UL << 31)) *a++ = '-'; Extract the 8 bit exponent.
    int e = (s >> 23) & 0xFF; Move the significand to the upper 24 bits. Set the msb to 0.
    s <<= 8; s &= ~(1UL << 31); An exponent of 255 is used for special values of NaN (not a number) and infinity. Handle these special cases and return.
    if (e == 255) { if (s) { strcpy(a, "NaN"); } else { strcpy(a, "Inf"); } return; An exponent of 0 is used to represent a value of 0 and for denormals. A value of 0 is handled by setting the exponent to 127 - the same exponent used to represent 1.0, and leaving the significand as 0. Denormal numbers do not have the implicit msb of 1, so they are normalized by shifting until the leading 1 is in the msb position.
    } else if (e == 0) { if (s) { e = 1; while (!(s & (1UL << 31))) s <<= 1, --e; } else { e = 127; } If the exponent is some other value, then set the msb of the significand.
    } else { s |= (1UL << 31); } Setup a pointer to the SI prefix. This will be adjusted as the value is normalized.
    char const * sp = "???????yzafpnum kMGTPEZY????" + 15; If the value is less than 1.0 it must be multiplied by 1000 until it is 1.0 or greater. Multiplication by 1000 is done by an implicit multiply by 1024 and then subtracting a multiply by 16 and a multiply by 8.
    if (e < 127) { do { s = s - (s >> 6) - (s >> 7); if (!(s & (1UL << 31))) s <<= 1, --e; e += 10; --sp; } while (e < 127); If the value is 1000.0 or more it must be divided by 1000 until it is less than 1000.0. An unrolled floating point divide is used for maximum speed.
    } else if (e > 135) { while (e > (126 + 10) || (e == (126 + 10) && s >= (1000UL << (32 - 10)))) { uint32_t n = s; s = 0; uint32_t d = 1000UL << (32 - 10); if (n >= d) n -= d, s |= (1UL << 31); d >>= 1; if (n >= d) n -= d, s |= (1UL << 30); d >>= 1; if (n >= d) n -= d, s |= (1UL << 29); d >>= 1; if (n >= d) n -= d, s |= (1UL << 28); d >>= 1; if (n >= d) n -= d, s |= (1UL << 27); d >>= 1; if (n >= d) n -= d, s |= (1UL << 26); d >>= 1; if (n >= d) n -= d, s |= (1UL << 25); d >>= 1; if (n >= d) n -= d, s |= (1UL << 24); d >>= 1; if (n >= d) n -= d, s |= (1UL << 23); d >>= 1; if (n >= d) n -= d, s |= (1UL << 22); d >>= 1; if (n >= d) n -= d, s |= (1UL << 21); d >>= 1; if (n >= d) n -= d, s |= (1UL << 20); d >>= 1; if (n >= d) n -= d, s |= (1UL << 19); d >>= 1; if (n >= d) n -= d, s |= (1UL << 18); d >>= 1; if (n >= d) n -= d, s |= (1UL << 17); d >>= 1; if (n >= d) n -= d, s |= (1UL << 16); d >>= 1; if (n >= d) n -= d, s |= (1UL << 15); d >>= 1; if (n >= d) n -= d, s |= (1UL << 14); d >>= 1; if (n >= d) n -= d, s |= (1UL << 13); d >>= 1; if (n >= d) n -= d, s |= (1UL << 12); d >>= 1; if (n >= d) n -= d, s |= (1UL << 11); d >>= 1; if (n >= d) n -= d, s |= (1UL << 10); d >>= 1; if (n >= d) n -= d, s |= (1UL << 9); d >>= 1; if (n >= d) n -= d, s |= (1UL << 8); d >>= 1; if (n >= d) s += (1UL << 8); if (!(s & (1UL << 31))) s <<= 1, --e; e -= 9; ++sp; } The divide code is quite time consuming, so it would be advantageous to quickly reduce very large numbers. A divide by 1,000,000,000,000 is used to improve performance for these large numbers.The preceding multiply code could do the same for very small numbers, but there is no speed advantage due to the multiply by 1000 using 2 shift/subtract operations and multiply by lager values requiring more than 2 per 1000.

    while (e > (150 + 16) || (e == (150 + 16) && s > (999999995904ULL >> 16))) { uint64_t n = s; n <<= 32; s = 0; uint64_t d = 1000000000000ULL << (64 - 40); if (n >= d) n -= d, s |= (1UL << 31); d >>= 1; if (n >= d) n -= d, s |= (1UL << 30); d >>= 1; if (n >= d) n -= d, s |= (1UL << 29); d >>= 1; if (n >= d) n -= d, s |= (1UL << 28); d >>= 1; if (n >= d) n -= d, s |= (1UL << 27); d >>= 1; if (n >= d) n -= d, s |= (1UL << 26); d >>= 1; if (n >= d) n -= d, s |= (1UL << 25); d >>= 1; if (n >= d) n -= d, s |= (1UL << 24); d >>= 1; if (n >= d) n -= d, s |= (1UL << 23); d >>= 1; if (n >= d) n -= d, s |= (1UL << 22); d >>= 1; if (n >= d) n -= d, s |= (1UL << 21); d >>= 1; if (n >= d) n -= d, s |= (1UL << 20); d >>= 1; if (n >= d) n -= d, s |= (1UL << 19); d >>= 1; if (n >= d) n -= d, s |= (1UL << 18); d >>= 1; if (n >= d) n -= d, s |= (1UL << 17); d >>= 1; if (n >= d) n -= d, s |= (1UL << 16); d >>= 1; if (n >= d) n -= d, s |= (1UL << 15); d >>= 1; if (n >= d) n -= d, s |= (1UL << 14); d >>= 1; if (n >= d) n -= d, s |= (1UL << 13); d >>= 1; if (n >= d) n -= d, s |= (1UL << 12); d >>= 1; if (n >= d) n -= d, s |= (1UL << 11); d >>= 1; if (n >= d) n -= d, s |= (1UL << 10); d >>= 1; if (n >= d) n -= d, s |= (1UL << 9); d >>= 1; if (n >= d) n -= d, s |= (1UL << 8); //d >>= 1; //if (n >= d) s += (1UL << 8); if (n) s += (1UL << 8); if (!(s & (1UL << 31))) s <<= 1, --e; e -= 39; sp += 4; } Rounding is the most difficult part of printing floating point numbers. Precalculated float constants are applied based on the value of the float and the specified number of significant digits. This simple method is fast and allows for good results for up to 7 significant digits.
    typedef struct { uint32_t s; int e; } TFR; TFR const r[] = { 0x800000UL << 7, 126 + 1, // 0.5 0xCCCCCDUL << 7, 122 + 1, // 0.05 0xA3D70AUL << 7, 119 + 1, // 0.005 0x83126FUL << 7, 116 + 1, // 0.0005 0xD1B717UL << 7, 112 + 1, // 0.00005 0xA7C5ACUL << 7, 109 + 1, // 0.000005 0x8637BDUL << 7, 106 + 1, // 0.0000005 0xD6BF95UL << 7, 102 + 1 // 0.00000005 }; if (d < 3) d = 3; else if (d > 8) d = 8; if (s) { TFR const *pr = &r[d - 3]; if (e < (126 + 4) || (e == (126 + 4) && s < (10UL << (32 - 4)))) { // < 10 pr += 2; } else if (e < (126 + 7) || (e == (126 + 7) && s < (100UL << (32 - 7)))) { // < 100 ++pr; } s += (pr->s >> (e - pr->e)); if (e == (126 + 10) && s >= (1000UL << (32 - 10))) s = (1UL << 31), e = 127, ++sp; else if (!(s & (1UL << 31))) s >>= 1, s |= (1UL << 31), ++e; } The integer part is printed using iterative subtraction of base 10 constants. This is typically faster than the common divide/modulus method.
    unsigned i = s >> 16; i >>= (136 - e); unsigned id = 1; char c; if (i >= (100 << 6)) { ++id; c = '0'; while (i >= (100 << 6)) i -= (100 << 6), ++c; *a++ = c; } if (id == 2 || i >= (10 << 6)) { ++id; c = '0'; while (i >= (10 << 6)) i -= (10 << 6), ++c; *a++ = c; } c = '0'; while (i >= (1 << 6)) i -= (1 << 6), ++c; *a++ = c; The fractional part is printed by iterative multiplication by 10.
    *a++ = '.'; if (e < 130) s >>= (130 - e); else s <<= (e - 130); d -= id; while (d) { s &= ((1UL << 28) - 1); s = (s << 3) + (s << 1); *a++ = '0' + (s >> 28); --d; } The SI prefix is appended and the string is terminated.
    *a++ = *sp; *a = 0; The resulting performance increase was more than I expected. 

    TI 4.4.4 GCC 4.9.1 --------------------------------------- %e 16402 6.47 s non-functional %f 16380 5.78 s non-functional %g 16402 4.69 s non-functional ftoas(7) 8480 0.12 s 11892 0.27 s ftoas(3) 8480 0.09 s 11892 0.17 s Results of the test code using ftoas(7), %e, %g, and %f
    0.000000 0.000000e+00 0 0.000000 1.401298? 0.000000e+00 0 0.000000 1.401298? 0.000000e+00 0 0.000000 9.809089? 0.000000e+00 0 0.000000 99.49219? 0.000000e+00 0 0.000000 1.000527? 0.000000e+00 0 0.000000 9.999666? 0.000000e+00 0 0.000000 99.99946? 0.000000e+00 0 0.000000 1.000000? 0.000000e+00 0 0.000000 12.00000? 1.200000e-38 1.2e-38 0.000000 100.0000? 1.000000e-37 1e-37 0.000000 1.000000? 1.000000e-36 1e-36 0.000000 10.00000? 1.000000e-35 1e-35 0.000000 100.0000? 1.000000e-34 1e-34 0.000000 1.000000? 1.000000e-33 1e-33 0.000000 10.00000? 1.000000e-32 1e-32 0.000000 100.0000? 1.000000e-31 1e-31 0.000000 1.000000? 1.000000e-30 1e-30 0.000000 10.00000? 1.000000e-29 1e-29 0.000000 100.0000? 1.000000e-28 1e-28 0.000000 1.000000? 1.000000e-27 1e-27 0.000000 10.00000? 1.000000e-26 1e-26 0.000000 100.0000? 1.000000e-25 1e-25 0.000000 1.000000y 1.000000e-24 1e-24 0.000000 10.00000y 1.000000e-23 1e-23 0.000000 100.0000y 1.000000e-22 1e-22 0.000000 1.000000z 1.000000e-21 1e-21 0.000000 10.00000z 1.000000e-20 1e-20 0.000000 100.0000z 1.000000e-19 1e-19 0.000000 1.000000a 1.000000e-18 1e-18 0.000000 10.00000a 1.000000e-17 1e-17 0.000000 100.0000a 1.000000e-16 1e-16 0.000000 1.000000f 1.000000e-15 1e-15 0.000000 10.00000f 1.000000e-14 1e-14 0.000000 100.0000f 1.000000e-13 1e-13 0.000000 1.000000p 1.000000e-12 1e-12 0.000000 10.00000p 1.000000e-11 1e-11 0.000000 100.0000p 1.000000e-10 1e-10 0.000000 1.000000n 1.000000e-09 1e-09 0.000000 10.00000n 1.000000e-08 1e-08 0.000000 100.0000n 1.000000e-07 1e-07 0.000000 1.000000u 1.000000e-06 1e-06 0.000001 10.00000u 1.000000e-05 1e-05 0.000010 100.0000u 1.000000e-04 0.0001 0.000100 1.000000m 1.000000e-03 0.001 0.001000 10.00000m 1.000000e-02 0.01 0.010000 100.0000m 1.000000e-01 0.1 0.100000 1.000000 1.000000e+00 1 1.000000 1.234568 1.234568e+00 1.23457 1.234568 10.00000 1.000000e+01 10 10.000000 100.0000 1.000000e+02 100 100.000000 1.000000k 1.000000e+03 1000 1000.000000 10.00000k 1.000000e+04 10000 10000.000000 100.0000k 1.000000e+05 100000 100000.000000 1.000000M 1.000000e+06 1e+06 1000000.000000 10.00000M 1.000000e+07 1e+07 10000000.000000 100.0000M 1.000000e+08 1e+08 100000000.000000 1.000000G 1.000000e+09 1e+09 1000000000.000000 10.00000G 1.000000e+10 1e+10 10000000000.000000 100.0000G 1.000000e+11 1e+11 99999997952.000010 1.000000T 1.000000e+12 1e+12 999999995903.999925 10.00000T 1.000000e+13 1e+13 9999999827968.000174 100.0000T 1.000000e+14 1e+14 100000000376832.008362 1.000000P 1.000000e+15 1e+15 999999986991104.125977 10.00000P 1.000000e+16 1e+16 10000000272564222.812653 100.0000P 1.000000e+17 1e+17 99999998430674934.387207 1.000000E 1.000000e+18 1e+18 999999984306749343.872070 10.00000E 1.000000e+19 1e+19 9999999980506448745.727539 100.0000E 1.000000e+20 1e+20 100000002004087710380.554199 1.000000Z 1.000000e+21 1e+21 1000000020040877103805.541992 10.00000Z 1.000000e+22 1e+22 9999999778196308612823.486328 100.0000Z 1.000000e+23 1e+23 99999997781963086128234.863281 1.000000Y 1.000000e+24 1e+24 1000000013848427772521972.656250 10.00000Y 1.000000e+25 1e+25 9999999562023527622222900.390625 100.0000Y 1.000000e+26 1e+26 100000002537764322757720947.265625 1.000000? 1.000000e+27 1e+27 999999988484154701232910156.250000 10.00000? 9.999999e+27 1e+28 9999999442119691371917724609.375000 100.0000? 1.000000e+29 1e+29 100000001504746651649475097656.250000 1.000000? 1.000000e+30 1e+30 1000000015047466516494750976562.500000 10.00000? 1.000000e+31 1e+31 9999999848243210315704345703125.000000 100.0000? 1.000000e+32 1e+32 100000003318135333061218261718750.000000 1.000000? 1.000000e+33 1e+33 999999994495727896690368652343750.000000 10.00000? 1.000000e+34 1e+34 9999999790214771032333374023437500.000000 100.0000? 1.000000e+35 1e+35 100000004091847860813140869140625000.000000 1.000000? 1.000000e+36 1e+36 999999961690316438674926757812500000.000000 10.00000? 1.000000e+37 1e+37 9999999933815813064575195312500000000.000000 100.0000? 1.000000e+38 1e+38 99999996802856898307800292968750000000.000000 340.0001? 3.400000e+38 3.4e+38 339999995214436411857604980468750000000.000000 NaN nan nan nan Inf +inf +inf +inf Complete code with test case.
    #include <msp430.h> #include <stdio.h> #include <stdint.h> #include <string.h> #include <math.h> static void print(char const *s) { while(*s) { while(!(UCA1IFG & UCTXIFG)); UCA1TXBUF = *s++; } } typedef struct { uint32_t s; int e; } TFR; TFR const r[] = { 0x800000UL << 7, 126 + 1, // 0.5 0xCCCCCDUL << 7, 122 + 1, // 0.05 0xA3D70AUL << 7, 119 + 1, // 0.005 0x83126FUL << 7, 116 + 1, // 0.0005 0xD1B717UL << 7, 112 + 1, // 0.00005 0xA7C5ACUL << 7, 109 + 1, // 0.000005 0x8637BDUL << 7, 106 + 1, // 0.0000005 0xD6BF95UL << 7, 102 + 1 // 0.00000005 }; void ftoas(char *a, float const f, unsigned d) { uint32_t s = *(uint32_t *)&f; if (s & (1UL << 31)) *a++ = '-'; int e = (s >> 23) & 0xFF; s <<= 8; s &= ~(1UL << 31); if (e == 255) { if (s) { strcpy(a, "NaN"); } else { strcpy(a, "Inf"); } return; } else if (e == 0) { if (s) { e = 1; while (!(s & (1UL << 31))) s <<= 1, --e; } else { e = 127; } } else { s |= (1UL << 31); } char const * sp = "???????yzafpnum kMGTPEZY????" + 15; if (e < 127) { do { s = s - (s >> 6) - (s >> 7); if (!(s & (1UL << 31))) s <<= 1, --e; e += 10; --sp; } while (e < 127); } else if (e > 135) { while (e > (150 + 16) || (e == (150 + 16) && s > (999999995904ULL >> 16))) { uint64_t n = s; n <<= 32; s = 0; uint64_t d = 1000000000000ULL << (64 - 40); if (n >= d) n -= d, s |= (1UL << 31); d >>= 1; if (n >= d) n -= d, s |= (1UL << 30); d >>= 1; if (n >= d) n -= d, s |= (1UL << 29); d >>= 1; if (n >= d) n -= d, s |= (1UL << 28); d >>= 1; if (n >= d) n -= d, s |= (1UL << 27); d >>= 1; if (n >= d) n -= d, s |= (1UL << 26); d >>= 1; if (n >= d) n -= d, s |= (1UL << 25); d >>= 1; if (n >= d) n -= d, s |= (1UL << 24); d >>= 1; if (n >= d) n -= d, s |= (1UL << 23); d >>= 1; if (n >= d) n -= d, s |= (1UL << 22); d >>= 1; if (n >= d) n -= d, s |= (1UL << 21); d >>= 1; if (n >= d) n -= d, s |= (1UL << 20); d >>= 1; if (n >= d) n -= d, s |= (1UL << 19); d >>= 1; if (n >= d) n -= d, s |= (1UL << 18); d >>= 1; if (n >= d) n -= d, s |= (1UL << 17); d >>= 1; if (n >= d) n -= d, s |= (1UL << 16); d >>= 1; if (n >= d) n -= d, s |= (1UL << 15); d >>= 1; if (n >= d) n -= d, s |= (1UL << 14); d >>= 1; if (n >= d) n -= d, s |= (1UL << 13); d >>= 1; if (n >= d) n -= d, s |= (1UL << 12); d >>= 1; if (n >= d) n -= d, s |= (1UL << 11); d >>= 1; if (n >= d) n -= d, s |= (1UL << 10); d >>= 1; if (n >= d) n -= d, s |= (1UL << 9); d >>= 1; if (n >= d) n -= d, s |= (1UL << 8); //d >>= 1; //if (n >= d) s += (1UL << 8); if (n) s += (1UL << 8); if (!(s & (1UL << 31))) s <<= 1, --e; e -= 39; sp += 4; } while (e > (126 + 10) || (e == (126 + 10) && s >= (1000UL << (32 - 10)))) { uint32_t n = s; s = 0; uint32_t d = 1000UL << (32 - 10); if (n >= d) n -= d, s |= (1UL << 31); d >>= 1; if (n >= d) n -= d, s |= (1UL << 30); d >>= 1; if (n >= d) n -= d, s |= (1UL << 29); d >>= 1; if (n >= d) n -= d, s |= (1UL << 28); d >>= 1; if (n >= d) n -= d, s |= (1UL << 27); d >>= 1; if (n >= d) n -= d, s |= (1UL << 26); d >>= 1; if (n >= d) n -= d, s |= (1UL << 25); d >>= 1; if (n >= d) n -= d, s |= (1UL << 24); d >>= 1; if (n >= d) n -= d, s |= (1UL << 23); d >>= 1; if (n >= d) n -= d, s |= (1UL << 22); d >>= 1; if (n >= d) n -= d, s |= (1UL << 21); d >>= 1; if (n >= d) n -= d, s |= (1UL << 20); d >>= 1; if (n >= d) n -= d, s |= (1UL << 19); d >>= 1; if (n >= d) n -= d, s |= (1UL << 18); d >>= 1; if (n >= d) n -= d, s |= (1UL << 17); d >>= 1; if (n >= d) n -= d, s |= (1UL << 16); d >>= 1; if (n >= d) n -= d, s |= (1UL << 15); d >>= 1; if (n >= d) n -= d, s |= (1UL << 14); d >>= 1; if (n >= d) n -= d, s |= (1UL << 13); d >>= 1; if (n >= d) n -= d, s |= (1UL << 12); d >>= 1; if (n >= d) n -= d, s |= (1UL << 11); d >>= 1; if (n >= d) n -= d, s |= (1UL << 10); d >>= 1; if (n >= d) n -= d, s |= (1UL << 9); d >>= 1; if (n >= d) n -= d, s |= (1UL << 8); d >>= 1; if (n >= d) s += (1UL << 8); if (!(s & (1UL << 31))) s <<= 1, --e; e -= 9; ++sp; } } if (d < 3) d = 3; else if (d > 8) d = 8; if (s) { TFR const *pr = &r[d - 3]; if (e < (126 + 4) || (e == (126 + 4) && s < (10UL << (32 - 4)))) { // < 10 pr += 2; } else if (e < (126 + 7) || (e == (126 + 7) && s < (100UL << (32 - 7)))) { // < 100 ++pr; } s += (pr->s >> (e - pr->e)); if (e == (126 + 10) && s >= (1000UL << (32 - 10))) s = (1UL << 31), e = 127, ++sp; else if (!(s & (1UL << 31))) s >>= 1, s |= (1UL << 31), ++e; } unsigned i = s >> 16; i >>= (136 - e); unsigned id = 1; char c; if (i >= (100 << 6)) { ++id; c = '0'; while (i >= (100 << 6)) i -= (100 << 6), ++c; *a++ = c; } if (id == 2 || i >= (10 << 6)) { ++id; c = '0'; while (i >= (10 << 6)) i -= (10 << 6), ++c; *a++ = c; } c = '0'; while (i >= (1 << 6)) i -= (1 << 6), ++c; *a++ = c; *a++ = '.'; if (e < 130) s >>= (130 - e); else s <<= (e - 130); d -= id; while (d) { s &= ((1UL << 28) - 1); s = (s << 3) + (s << 1); *a++ = '0' + (s >> 28); --d; } *a++ = *sp; *a = 0; } #define smclk_freq (32768UL * 31UL) // SMCLK frequency in hertz #define bps (9600UL) // Async serial bit rate int main(void) { WDTCTL = WDTPW | WDTHOLD; // Stop watchdog timer // P4SEL = BIT4 | BIT5; // Enable UART pins P4DIR = BIT4 | BIT5; // // // Initialize UART UCA1CTL1 = UCSWRST; // Hold USCI in reset to allow configuration UCA1CTL0 = 0; // No parity, LSB first, 8 bits, one stop bit, UART (async) const unsigned long brd = (smclk_freq + (bps >> 1)) / bps; // Bit rate divisor UCA1BR1 = (brd >> 12) & 0xFF; // High byte of whole divisor UCA1BR0 = (brd >> 4) & 0xFF; // Low byte of whole divisor UCA1MCTL = ((brd << 4) & 0xF0) | UCOS16; // Fractional divisor, oversampling mode UCA1CTL1 = UCSSEL_2; // Use SMCLK for bit rate generator, release reset char s[32], t[96]; float const tv[] = { 0.0f, 7.1e-46f, 1.0e-45f, 1.0e-44f, 1.0e-43f, 1.0e-42f, 1.0e-41f, 1.0e-40f, 1.0e-39f, 1.2e-38f, 1.0e-37f, 1.0e-36f, 1.0e-35f, 1.0e-34f, 1.0e-33f, 1.0e-32f, 1.0e-31f, 1.0e-30f, 1.0e-29f, 1.0e-28f, 1.0e-27f, 1.0e-26f, 1.0e-25f, 1.0e-24f, 1.0e-23f, 1.0e-22f, 1.0e-21f, 1.0e-20f, 1.0e-19f, 1.0e-18f, 1.0e-17f, 1.0e-16f, 1.0e-15f, 1.0e-14f, 1.0e-13f, 1.0e-12f, 1.0e-11f, 1.0e-10f, 1.0e-9f, 1.0e-8f, 1.0e-7f, 0.000001f, 0.00001f, 0.0001f, 0.001f, 0.01f, 0.1f, 1.0f, 1.23456789f, 10.0f, 100.0f, 1000.0f, 10000.0f, 100000.0f, 1000000.0f, 10000000.0f, 100000000.0f, 1000000000.0f, 1.0e10f, 1.0e11f, 1.0e12f, 1.0e13f, 1.0e14f, 1.0e15f, 1.0e16f, 1.0e17f, 1.0e18f, 1.0e19f, 1.0e20f, 1.0e21f, 1.0e22f, 1.0e23f, 1.0e24f, 1.0e25f, 1.0e26f, 1.0e27f, 1.0e28f, 1.0e29f, 1.0e30f, 1.0e31f, 1.0e32f, 1.0e33f, 1.0e34f, 1.0e35f, 1.0e36f, 1.0e37f, 1.0e38f, 3.4e38f, NAN, INFINITY }; TA0EX0 = 7; TA0CTL = TASSEL_2 | ID_3 | MC_2; TA0CTL |= TACLR; unsigned i; for (i = 0; i < sizeof(tv) / sizeof(tv[0]); ++i) { float const f = tv[i]; ftoas(s, f, 7); //print(s); print("\r\n"); //sprintf(t, "%e", f); //sprintf(t, "%f", f); //sprintf(t, "%g", f); //sprintf(t, "%e %f %g %s\r\n", f, f, f, s); sprintf(t, "%s %e %g %f\r\n", s, f, f, f); print(t); } volatile float et = ((float)TA0R + ((TA0CTL & TAIFG) ? 65536.0 : 0.0)) * 63.0f / 1000000.0f; // Elapsed time in microseconds ftoas(t, et, 5); //sprintf(t, "%f", et); print(t); print("s\r\n"); for(;; return 0; }
  13. Like
    oPossum got a reaction from zeke in Console with One-Wire Temperature on MSP430F5529 LP   
    This is obviously wrong:
     

    UCSCTL2 |= 499; // Set DCO Multiplier for 16MHz // (N + 1) * FLLRef = Fdco // (499 + 1) * 32768 = 16MHz change to... 

    UCSCTL2 |= 499; // Set DCO Multiplier for 16MHz // (N + 1) * FLLRef = Fdco // (499 + 1) * 32768 = 16.384 MHz and also change... 

    const unsigned long smclk_freq = 16384000UL; // SMCLK frequency in hertz and delete (or fix and make use of)... 

    /******************************************************************************* * ******************************************************************************/ #define F_CPU 16000000UL #define BAUD 19200 #define BAUD_PRESCALE (((F_CPU / (BAUD * 16UL))) - 1)
  14. Like
    oPossum reacted to veryalive in Console with One-Wire Temperature on MSP430F5529 LP   
    Hi guys.
    I used Zeke's sw a long time ago, so maybe my recollection is correct / useful:
    .... Check your bit times with a scope ....?
     
    I recall changing baud rate down to 9600 and also got bit errors.
    The logic analyser showed faster bit times than 104 uSec (9600), it was around 95 or 97 ?
    When I adjusted dividers in the async uart setup routine, it worked.
    Maybe this helps?
    Cheers....
  15. Like
    oPossum got a reaction from agaelema in Printing large numbers in small spaces (64 bits in 5 chars)   
    This code will print a 64 bit unsigned integer in 5 characters using 3 significant digits and and an SI prefix. Several revisions are shown beginning with the obvious implementation and then optimizing it to improve speed and decrease code size. Compilers used are TI 4.4.4 and GCC 4.9.1 Test hardware is the F5529 Launchpad running at the default clock of 1.016 MHz.
     
    The general strategy of the code is:
    - Count the number of digits in the decimal representation
    - Divide the value to reduce it to 3 digits.
    - Print the 3 digits with proper formatting and SI prefix.
    - Handle the special case of values of 99 or less that must be printed with 2 or 1 digits and no decimal.
     
    A series of values from zero to the maximum possible 64 bit value are used to test the performance and correct operation of the code...

    static const uint64_t td[] = { 0ULL, 1ULL, 12ULL, 123ULL, 1234ULL, 12345ULL, 123456ULL, 1234567ULL, 12345678ULL, 123456789ULL, 1234567890ULL, 12345678901ULL, 123456789012ULL, 1234567890123ULL, 12345678901234ULL, 123456789012345ULL, 1234567890123456ULL, 12345678901234567ULL, 123456789012345678ULL, 1234567890123456789ULL, 12345678901234567890ULL, 18446744073709551615ULL }; The first version uses C standard library functions for a very simple implementation. Decimal digits are counted using log10(). Dividing the digit count by three using the div() function provides values for formatting, division, and the SI prefix. The divisor needed to reduce the value to three digits is calculated with pow(). The special case of values of 99 or less is handled by using a fixed digit count for values less than 1000 rather than the actual base ten digit count. Conversion from uint64_t to double will have a loss of precision for large values due to float having a 52 bit significand. This is not a problem for this code because only three significant digits are printed.Code size for the TI compiler is 20,636 bytes and the execution time for the test case is 1.22 seconds. The GCC compiler does not produce working code.

    static void sprint_f3d(char * s, double f) { static char const * const fmt[3] = { "%1.2f%c", "%2.1f%c", "%4.0f%c" }; div_t const d = div((f < 1000.0) ? 2 : (int)log10(f), 3); sprintf(s, fmt[d.rem], f / pow(1000.0, d.quot), " kMGTPEZY"[d.quot]); } The C standard library does not have integer versions of log10() and pow(), so another strategy will be used. The value is divided by 10 until it is less than 1000. The number of divisions is counted to determine the number of base ten digits. Stopping the division when the value is less than 1000 handles the special case for values of 99 or less by limiting the digit count to 3 or more. The resulting 3 digit value will be split as necessary for formatting using the div() library function.Code size for the TI compiler is 5,346 bytes and the execution time for the test case is 3.24 seconds.
    Code size for the GCC compiler is 36,040 bytes and the execution time for the test case is 981 milliseconds.
    The poor performance of the TI compiler is due the an inefficient intrinsic 64 bit unsigned divide.

    static void sprint_u64_a(char *s, uint64_t n) { unsigned d = 2; while(n >= 1000) n /= 10, ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } Using this 64 bit unsigned divide provides much better performance on the TI compiler, but much worse on the GCC compiler.Code size for the TI compiler is 5,666 bytes and the execution time for the test case is 396 milliseconds.
    Code size for the GCC compiler is 36,116 bytes and the execution time for the test case is 4.06 seconds.
    Future revisions will use intrinsic divide for GCC and this divide function for TI.

    static uint64_t divu64(uint64_t n, uint64_t d) { if((n < d) || (!d)) return 0; uint64_t register b = 1; if(((uint16_t)(n >> 48)) & 0x8000) { while(!(((uint16_t)(d >> 48)) & 0x8000)) d <<= 1, b <<= 1; if(n < d) d >>= 1, b >>= 1; } else { d <<= 1; while(n >= d) d <<= 1, b <<= 1; d >>= 1; } uint64_t q = b; n -= d; while(!(b & 1)) { d >>= 1; b >>= 1; if(n >= d) n -= d, q |= b; } return q; } The number of 64 bit divides can be reduced by using constants of 10 to the power of 2 to the power of N rather than iterative division by 10.Code size for the TI compiler is 5,886 bytes and the execution time for the test case is 160 milliseconds.
    Code size for the GCC compiler is 36,256 bytes and the execution time for the test case is 223 milliseconds.

    unsigned d = 2; if(n >= 1000000000000000000ULL) n = divu64(n, 10000000000000000ULL), d += 16; if(n >= 10000000000ULL) n = divu64(n, 100000000ULL), d += 8; if(n >= 1000000ULL) n = divu64(n, 10000ULL), d += 4; if(n >= 10000ULL) n = divu64(n, 100ULL), d += 2; if(n >= 1000ULL) n = divu64(n, 10ULL), ++d; The number of 64 bit divides can be reduced to one by using a table to determine the number of base ten digits. The table is searched for the highest value that is less than or equal to the value to be printed. The value is then divided by the table entry that is 1/100th of the value representing the base ten digit count to reduce the value to 3 digits.Code size for the TI compiler is 5,886 bytes and the execution time for the test case is 143 milliseconds.
    Code size for the GCC compiler is 36,096 bytes and the execution time for the test case is 143 milliseconds.

    static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; unsigned d = 2; if(n >= 1000) { d = 19; uint64_t const *p = pt; while (n < *p) ++p, --d; n = divu64(n, p[2]); } The sprintf() function takes significant space, so it is replaced with decimal to ASCII conversion using the div() library function.Code size for the TI compiler is 1,660 bytes and the execution time for the test case is 25.8 milliseconds.
    Code size for the GCC compiler is 3,384 bytes and the execution time for the test case is 82.0 milliseconds.

    div_t qr; qr.rem = (int)n; div_t const dp = div(d, 3); if(dp.rem == 2) *s++ = ' '; if(qr.rem < 100) { *s++ = ' '; qr.quot = 0; } else { qr = div(qr.rem, 100); *s++ = '0' + qr.quot; } if(dp.rem == 0) *s++ = '.'; if((!qr.quot) && (qr.rem < 10)) { *s++ = ' '; } else { qr = div(qr.rem, 10); *s++ = '0' + qr.quot; } if(dp.rem == 1) *s++ = '.'; *s++ = '0' + qr.rem; *s++ = " kMGTPE"[dp.quot]; *s = 0; The final optimization removes all division. The base ten tables values are used with iterative subtraction to create the ASCII string.Code size for the TI compiler is 1,368 bytes and the execution time for the test case is 10.8 milliseconds.
    Code size for the GCC compiler is 2,452 bytes and the execution time for the test case is 19.9 milliseconds.

    uint64_t const *p = pt + 17; if(n >= 1000) { p = pt; while (n < *p) ++p; } // n += (p[2] >> 1); // optional rounding int dp = p - pt; while (dp > 2) dp -= 3; if (dp == 2) *s++ = ' '; char c; if(n < 100) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *s = c; ++p; if(dp == 1) *++s = '.'; if((*s == ' ') && (n < 10)) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *++s = c; ++p; if(dp == 0) *++s = '.'; c = '0'; while (n >= *p) n -= *p, ++c; *++s = c; *++s = " EEPPPTTTGGGMMMkkk "[p - pt]; *++s = 0; Performance summary
    function TI GCC ---------------------------------------------- sprint_u64 1368 10.8 ms 2452 19.9 ms sprint_u64_e 1660 25.8 ms 3384 82.0 ms sprint_u64_d 5886 143 ms 36096 143 ms sprint_u64_c 5886 160 ms 36256 223 ms sprint_u64_b 5666 396 ms 36116 4.06 s sprint_u64_a 5346 3.24 s 36040 981 ms sprint_f3d 20636 1.22 s non-functional Network bandwidth display (lower right) using 3 digits.
     
    Complete code

    #include <msp430.h> #include <stdint.h> #include <stdlib.h> #include <string.h> #include <math.h> #include <stdio.h> #define PU64 sprint_u64 //#define PU64 sprint_u64_e //#define PU64 sprint_u64_d //#define PU64 sprint_u64_c //#define PU64 sprint_u64_b //#define USE_DIV_FUNC //#define PU64 sprint_u64_a //#define PU64 sprint_f3d //#define PU64 sprint_u64_null static void sprint_u64(char *s, uint64_t n) { static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; uint64_t const *p = pt + 17; if(n >= 1000) { p = pt; while (n < *p) ++p; } // n += (p[2] >> 1); // optional rounding int dp = p - pt; while (dp > 2) dp -= 3; if (dp == 2) *s++ = ' '; char c; if(n < 100) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *s = c; ++p; if(dp == 1) *++s = '.'; if((*s == ' ') && (n < 10)) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *++s = c; ++p; if(dp == 0) *++s = '.'; c = '0'; while (n >= *p) n -= *p, ++c; *++s = c; *++s = " EEPPPTTTGGGMMMkkk "[p - pt]; *++s = 0; } static uint64_t divu64(uint64_t n, uint64_t d) { #if defined( __GNUC__) & !defined(USE_DIV_FUNC) return n / d; #else if((n < d) || (!d)) return 0; uint64_t register b = 1; if(((uint16_t)(n >> 48)) & 0x8000) { while(!(((uint16_t)(d >> 48)) & 0x8000)) d <<= 1, b <<= 1; if(n < d) d >>= 1, b >>= 1; } else { d <<= 1; while(n >= d) d <<= 1, b <<= 1; d >>= 1; } uint64_t q = b; n -= d; while(!(b & 1)) { d >>= 1; b >>= 1; if(n >= d) n -= d, q |= b; } return q; #endif } static void sprint_u64_e(char *s, uint64_t n) { static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; unsigned d = 2; if(n >= 1000) { d = 19; uint64_t const *p = pt; while (n < *p) ++p, --d; n = divu64(n, p[2]); } div_t qr; qr.rem = (int)n; div_t const dp = div(d, 3); if(dp.rem == 2) *s++ = ' '; if(qr.rem < 100) { *s++ = ' '; qr.quot = 0; } else { qr = div(qr.rem, 100); *s++ = '0' + qr.quot; } if(dp.rem == 0) *s++ = '.'; if((!qr.quot) && (qr.rem < 10)) { *s++ = ' '; } else { qr = div(qr.rem, 10); *s++ = '0' + qr.quot; } if(dp.rem == 1) *s++ = '.'; *s++ = '0' + qr.rem; *s++ = " kMGTPE"[dp.quot]; *s = 0; } static void sprint_u64_d(char *s, uint64_t n) { static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; unsigned d = 2; if(n >= 1000) { d = 19; uint64_t const *p = pt; while (n < *p) ++p, --d; n = divu64(n, p[2]); } div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_u64_c(char *s, uint64_t n) { unsigned d = 2; if(n >= 1000000000000000000ULL) n = divu64(n, 10000000000000000ULL), d += 16; if(n >= 10000000000ULL) n = divu64(n, 100000000ULL), d += 8; if(n >= 1000000ULL) n = divu64(n, 10000ULL), d += 4; if(n >= 10000ULL) n = divu64(n, 100ULL), d += 2; if(n >= 1000ULL) n = divu64(n, 10ULL), ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_u64_b(char *s, uint64_t n) { unsigned d = 2; while(n >= 1000) n = divu64(n, 10), ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_u64_a(char *s, uint64_t n) { unsigned d = 2; while(n >= 1000) n /= 10, ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_f3d(char * s, double f) { static char const * const fmt[3] = { "%1.2f%c", "%2.1f%c", "%4.0f%c" }; div_t const d = div((f < 1000.0) ? 2 : (int)log10(f), 3); sprintf(s, fmt[d.rem], f / pow(1000.0, d.quot), " kMGTPEZY"[d.quot]); } static void sprint_u64_null(char *s, uint64_t n) { *s = 0; } static void print(char const *s) { while(*s) { while(!(UCA1IFG & UCTXIFG)); UCA1TXBUF = *s++; } } #define smclk_freq (32768UL * 31UL) // SMCLK frequency in hertz #define bps (9600UL) // Async serial bit rate int main(void) { WDTCTL = WDTPW | WDTHOLD; // Stop watchdog timer // P4SEL = BIT4 | BIT5; // Enable UART pins P4DIR = BIT4 | BIT5; // // // Initialize UART UCA1CTL1 = UCSWRST; // Hold USCI in reset to allow configuration UCA1CTL0 = 0; // No parity, LSB first, 8 bits, one stop bit, UART (async) const unsigned long brd = (smclk_freq + (bps >> 1)) / bps; // Bit rate divisor UCA1BR1 = (brd >> 12) & 0xFF; // High byte of whole divisor UCA1BR0 = (brd >> 4) & 0xFF; // Low byte of whole divisor UCA1MCTL = ((brd << 4) & 0xF0) | UCOS16; // Fractional divisor, oversampling mode UCA1CTL1 = UCSSEL_2; // Use SMCLK for bit rate generator, release reset static const uint64_t td[] = { 0ULL, 1ULL, 12ULL, 123ULL, 1234ULL, 12345ULL, 123456ULL, 1234567ULL, 12345678ULL, 123456789ULL, 1234567890ULL, 12345678901ULL, 123456789012ULL, 1234567890123ULL, 12345678901234ULL, 123456789012345ULL, 1234567890123456ULL, 12345678901234567ULL, 123456789012345678ULL, 1234567890123456789ULL, 12345678901234567890ULL, 18446744073709551615ULL }; char s[32]; int n; uint64_t const *p; // Print test array to application UART print("\r\n"); n = sizeof(td) / sizeof(td[0]); p = td; do { PU64(s, *p++); print(s); print("\r\n"); } while(--n); // Time the test array - make string only - do not print TA0EX0 = 7; TA0CTL = TASSEL_2 | ID_3 | MC_2; TA0CTL |= TACLR; n = sizeof(td) / sizeof(td[0]); p = td; do { PU64(s, *p++); } while(--n); uint64_t et = TA0R * 63ULL; // Elapsed time in microseconds PU64(s, et); print(s); print(" us\r\n"); for(;; return 0; }
  16. Like
    oPossum got a reaction from Rickta59 in Printing large numbers in small spaces (64 bits in 5 chars)   
    This code will print a 64 bit unsigned integer in 5 characters using 3 significant digits and and an SI prefix. Several revisions are shown beginning with the obvious implementation and then optimizing it to improve speed and decrease code size. Compilers used are TI 4.4.4 and GCC 4.9.1 Test hardware is the F5529 Launchpad running at the default clock of 1.016 MHz.
     
    The general strategy of the code is:
    - Count the number of digits in the decimal representation
    - Divide the value to reduce it to 3 digits.
    - Print the 3 digits with proper formatting and SI prefix.
    - Handle the special case of values of 99 or less that must be printed with 2 or 1 digits and no decimal.
     
    A series of values from zero to the maximum possible 64 bit value are used to test the performance and correct operation of the code...

    static const uint64_t td[] = { 0ULL, 1ULL, 12ULL, 123ULL, 1234ULL, 12345ULL, 123456ULL, 1234567ULL, 12345678ULL, 123456789ULL, 1234567890ULL, 12345678901ULL, 123456789012ULL, 1234567890123ULL, 12345678901234ULL, 123456789012345ULL, 1234567890123456ULL, 12345678901234567ULL, 123456789012345678ULL, 1234567890123456789ULL, 12345678901234567890ULL, 18446744073709551615ULL }; The first version uses C standard library functions for a very simple implementation. Decimal digits are counted using log10(). Dividing the digit count by three using the div() function provides values for formatting, division, and the SI prefix. The divisor needed to reduce the value to three digits is calculated with pow(). The special case of values of 99 or less is handled by using a fixed digit count for values less than 1000 rather than the actual base ten digit count. Conversion from uint64_t to double will have a loss of precision for large values due to float having a 52 bit significand. This is not a problem for this code because only three significant digits are printed.Code size for the TI compiler is 20,636 bytes and the execution time for the test case is 1.22 seconds. The GCC compiler does not produce working code.

    static void sprint_f3d(char * s, double f) { static char const * const fmt[3] = { "%1.2f%c", "%2.1f%c", "%4.0f%c" }; div_t const d = div((f < 1000.0) ? 2 : (int)log10(f), 3); sprintf(s, fmt[d.rem], f / pow(1000.0, d.quot), " kMGTPEZY"[d.quot]); } The C standard library does not have integer versions of log10() and pow(), so another strategy will be used. The value is divided by 10 until it is less than 1000. The number of divisions is counted to determine the number of base ten digits. Stopping the division when the value is less than 1000 handles the special case for values of 99 or less by limiting the digit count to 3 or more. The resulting 3 digit value will be split as necessary for formatting using the div() library function.Code size for the TI compiler is 5,346 bytes and the execution time for the test case is 3.24 seconds.
    Code size for the GCC compiler is 36,040 bytes and the execution time for the test case is 981 milliseconds.
    The poor performance of the TI compiler is due the an inefficient intrinsic 64 bit unsigned divide.

    static void sprint_u64_a(char *s, uint64_t n) { unsigned d = 2; while(n >= 1000) n /= 10, ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } Using this 64 bit unsigned divide provides much better performance on the TI compiler, but much worse on the GCC compiler.Code size for the TI compiler is 5,666 bytes and the execution time for the test case is 396 milliseconds.
    Code size for the GCC compiler is 36,116 bytes and the execution time for the test case is 4.06 seconds.
    Future revisions will use intrinsic divide for GCC and this divide function for TI.

    static uint64_t divu64(uint64_t n, uint64_t d) { if((n < d) || (!d)) return 0; uint64_t register b = 1; if(((uint16_t)(n >> 48)) & 0x8000) { while(!(((uint16_t)(d >> 48)) & 0x8000)) d <<= 1, b <<= 1; if(n < d) d >>= 1, b >>= 1; } else { d <<= 1; while(n >= d) d <<= 1, b <<= 1; d >>= 1; } uint64_t q = b; n -= d; while(!(b & 1)) { d >>= 1; b >>= 1; if(n >= d) n -= d, q |= b; } return q; } The number of 64 bit divides can be reduced by using constants of 10 to the power of 2 to the power of N rather than iterative division by 10.Code size for the TI compiler is 5,886 bytes and the execution time for the test case is 160 milliseconds.
    Code size for the GCC compiler is 36,256 bytes and the execution time for the test case is 223 milliseconds.

    unsigned d = 2; if(n >= 1000000000000000000ULL) n = divu64(n, 10000000000000000ULL), d += 16; if(n >= 10000000000ULL) n = divu64(n, 100000000ULL), d += 8; if(n >= 1000000ULL) n = divu64(n, 10000ULL), d += 4; if(n >= 10000ULL) n = divu64(n, 100ULL), d += 2; if(n >= 1000ULL) n = divu64(n, 10ULL), ++d; The number of 64 bit divides can be reduced to one by using a table to determine the number of base ten digits. The table is searched for the highest value that is less than or equal to the value to be printed. The value is then divided by the table entry that is 1/100th of the value representing the base ten digit count to reduce the value to 3 digits.Code size for the TI compiler is 5,886 bytes and the execution time for the test case is 143 milliseconds.
    Code size for the GCC compiler is 36,096 bytes and the execution time for the test case is 143 milliseconds.

    static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; unsigned d = 2; if(n >= 1000) { d = 19; uint64_t const *p = pt; while (n < *p) ++p, --d; n = divu64(n, p[2]); } The sprintf() function takes significant space, so it is replaced with decimal to ASCII conversion using the div() library function.Code size for the TI compiler is 1,660 bytes and the execution time for the test case is 25.8 milliseconds.
    Code size for the GCC compiler is 3,384 bytes and the execution time for the test case is 82.0 milliseconds.

    div_t qr; qr.rem = (int)n; div_t const dp = div(d, 3); if(dp.rem == 2) *s++ = ' '; if(qr.rem < 100) { *s++ = ' '; qr.quot = 0; } else { qr = div(qr.rem, 100); *s++ = '0' + qr.quot; } if(dp.rem == 0) *s++ = '.'; if((!qr.quot) && (qr.rem < 10)) { *s++ = ' '; } else { qr = div(qr.rem, 10); *s++ = '0' + qr.quot; } if(dp.rem == 1) *s++ = '.'; *s++ = '0' + qr.rem; *s++ = " kMGTPE"[dp.quot]; *s = 0; The final optimization removes all division. The base ten tables values are used with iterative subtraction to create the ASCII string.Code size for the TI compiler is 1,368 bytes and the execution time for the test case is 10.8 milliseconds.
    Code size for the GCC compiler is 2,452 bytes and the execution time for the test case is 19.9 milliseconds.

    uint64_t const *p = pt + 17; if(n >= 1000) { p = pt; while (n < *p) ++p; } // n += (p[2] >> 1); // optional rounding int dp = p - pt; while (dp > 2) dp -= 3; if (dp == 2) *s++ = ' '; char c; if(n < 100) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *s = c; ++p; if(dp == 1) *++s = '.'; if((*s == ' ') && (n < 10)) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *++s = c; ++p; if(dp == 0) *++s = '.'; c = '0'; while (n >= *p) n -= *p, ++c; *++s = c; *++s = " EEPPPTTTGGGMMMkkk "[p - pt]; *++s = 0; Performance summary
    function TI GCC ---------------------------------------------- sprint_u64 1368 10.8 ms 2452 19.9 ms sprint_u64_e 1660 25.8 ms 3384 82.0 ms sprint_u64_d 5886 143 ms 36096 143 ms sprint_u64_c 5886 160 ms 36256 223 ms sprint_u64_b 5666 396 ms 36116 4.06 s sprint_u64_a 5346 3.24 s 36040 981 ms sprint_f3d 20636 1.22 s non-functional Network bandwidth display (lower right) using 3 digits.
     
    Complete code

    #include <msp430.h> #include <stdint.h> #include <stdlib.h> #include <string.h> #include <math.h> #include <stdio.h> #define PU64 sprint_u64 //#define PU64 sprint_u64_e //#define PU64 sprint_u64_d //#define PU64 sprint_u64_c //#define PU64 sprint_u64_b //#define USE_DIV_FUNC //#define PU64 sprint_u64_a //#define PU64 sprint_f3d //#define PU64 sprint_u64_null static void sprint_u64(char *s, uint64_t n) { static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; uint64_t const *p = pt + 17; if(n >= 1000) { p = pt; while (n < *p) ++p; } // n += (p[2] >> 1); // optional rounding int dp = p - pt; while (dp > 2) dp -= 3; if (dp == 2) *s++ = ' '; char c; if(n < 100) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *s = c; ++p; if(dp == 1) *++s = '.'; if((*s == ' ') && (n < 10)) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *++s = c; ++p; if(dp == 0) *++s = '.'; c = '0'; while (n >= *p) n -= *p, ++c; *++s = c; *++s = " EEPPPTTTGGGMMMkkk "[p - pt]; *++s = 0; } static uint64_t divu64(uint64_t n, uint64_t d) { #if defined( __GNUC__) & !defined(USE_DIV_FUNC) return n / d; #else if((n < d) || (!d)) return 0; uint64_t register b = 1; if(((uint16_t)(n >> 48)) & 0x8000) { while(!(((uint16_t)(d >> 48)) & 0x8000)) d <<= 1, b <<= 1; if(n < d) d >>= 1, b >>= 1; } else { d <<= 1; while(n >= d) d <<= 1, b <<= 1; d >>= 1; } uint64_t q = b; n -= d; while(!(b & 1)) { d >>= 1; b >>= 1; if(n >= d) n -= d, q |= b; } return q; #endif } static void sprint_u64_e(char *s, uint64_t n) { static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; unsigned d = 2; if(n >= 1000) { d = 19; uint64_t const *p = pt; while (n < *p) ++p, --d; n = divu64(n, p[2]); } div_t qr; qr.rem = (int)n; div_t const dp = div(d, 3); if(dp.rem == 2) *s++ = ' '; if(qr.rem < 100) { *s++ = ' '; qr.quot = 0; } else { qr = div(qr.rem, 100); *s++ = '0' + qr.quot; } if(dp.rem == 0) *s++ = '.'; if((!qr.quot) && (qr.rem < 10)) { *s++ = ' '; } else { qr = div(qr.rem, 10); *s++ = '0' + qr.quot; } if(dp.rem == 1) *s++ = '.'; *s++ = '0' + qr.rem; *s++ = " kMGTPE"[dp.quot]; *s = 0; } static void sprint_u64_d(char *s, uint64_t n) { static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; unsigned d = 2; if(n >= 1000) { d = 19; uint64_t const *p = pt; while (n < *p) ++p, --d; n = divu64(n, p[2]); } div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_u64_c(char *s, uint64_t n) { unsigned d = 2; if(n >= 1000000000000000000ULL) n = divu64(n, 10000000000000000ULL), d += 16; if(n >= 10000000000ULL) n = divu64(n, 100000000ULL), d += 8; if(n >= 1000000ULL) n = divu64(n, 10000ULL), d += 4; if(n >= 10000ULL) n = divu64(n, 100ULL), d += 2; if(n >= 1000ULL) n = divu64(n, 10ULL), ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_u64_b(char *s, uint64_t n) { unsigned d = 2; while(n >= 1000) n = divu64(n, 10), ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_u64_a(char *s, uint64_t n) { unsigned d = 2; while(n >= 1000) n /= 10, ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_f3d(char * s, double f) { static char const * const fmt[3] = { "%1.2f%c", "%2.1f%c", "%4.0f%c" }; div_t const d = div((f < 1000.0) ? 2 : (int)log10(f), 3); sprintf(s, fmt[d.rem], f / pow(1000.0, d.quot), " kMGTPEZY"[d.quot]); } static void sprint_u64_null(char *s, uint64_t n) { *s = 0; } static void print(char const *s) { while(*s) { while(!(UCA1IFG & UCTXIFG)); UCA1TXBUF = *s++; } } #define smclk_freq (32768UL * 31UL) // SMCLK frequency in hertz #define bps (9600UL) // Async serial bit rate int main(void) { WDTCTL = WDTPW | WDTHOLD; // Stop watchdog timer // P4SEL = BIT4 | BIT5; // Enable UART pins P4DIR = BIT4 | BIT5; // // // Initialize UART UCA1CTL1 = UCSWRST; // Hold USCI in reset to allow configuration UCA1CTL0 = 0; // No parity, LSB first, 8 bits, one stop bit, UART (async) const unsigned long brd = (smclk_freq + (bps >> 1)) / bps; // Bit rate divisor UCA1BR1 = (brd >> 12) & 0xFF; // High byte of whole divisor UCA1BR0 = (brd >> 4) & 0xFF; // Low byte of whole divisor UCA1MCTL = ((brd << 4) & 0xF0) | UCOS16; // Fractional divisor, oversampling mode UCA1CTL1 = UCSSEL_2; // Use SMCLK for bit rate generator, release reset static const uint64_t td[] = { 0ULL, 1ULL, 12ULL, 123ULL, 1234ULL, 12345ULL, 123456ULL, 1234567ULL, 12345678ULL, 123456789ULL, 1234567890ULL, 12345678901ULL, 123456789012ULL, 1234567890123ULL, 12345678901234ULL, 123456789012345ULL, 1234567890123456ULL, 12345678901234567ULL, 123456789012345678ULL, 1234567890123456789ULL, 12345678901234567890ULL, 18446744073709551615ULL }; char s[32]; int n; uint64_t const *p; // Print test array to application UART print("\r\n"); n = sizeof(td) / sizeof(td[0]); p = td; do { PU64(s, *p++); print(s); print("\r\n"); } while(--n); // Time the test array - make string only - do not print TA0EX0 = 7; TA0CTL = TASSEL_2 | ID_3 | MC_2; TA0CTL |= TACLR; n = sizeof(td) / sizeof(td[0]); p = td; do { PU64(s, *p++); } while(--n); uint64_t et = TA0R * 63ULL; // Elapsed time in microseconds PU64(s, et); print(s); print(" us\r\n"); for(;; return 0; }
  17. Like
    oPossum got a reaction from bluehash in Printing large numbers in small spaces (64 bits in 5 chars)   
    This code will print a 64 bit unsigned integer in 5 characters using 3 significant digits and and an SI prefix. Several revisions are shown beginning with the obvious implementation and then optimizing it to improve speed and decrease code size. Compilers used are TI 4.4.4 and GCC 4.9.1 Test hardware is the F5529 Launchpad running at the default clock of 1.016 MHz.
     
    The general strategy of the code is:
    - Count the number of digits in the decimal representation
    - Divide the value to reduce it to 3 digits.
    - Print the 3 digits with proper formatting and SI prefix.
    - Handle the special case of values of 99 or less that must be printed with 2 or 1 digits and no decimal.
     
    A series of values from zero to the maximum possible 64 bit value are used to test the performance and correct operation of the code...

    static const uint64_t td[] = { 0ULL, 1ULL, 12ULL, 123ULL, 1234ULL, 12345ULL, 123456ULL, 1234567ULL, 12345678ULL, 123456789ULL, 1234567890ULL, 12345678901ULL, 123456789012ULL, 1234567890123ULL, 12345678901234ULL, 123456789012345ULL, 1234567890123456ULL, 12345678901234567ULL, 123456789012345678ULL, 1234567890123456789ULL, 12345678901234567890ULL, 18446744073709551615ULL }; The first version uses C standard library functions for a very simple implementation. Decimal digits are counted using log10(). Dividing the digit count by three using the div() function provides values for formatting, division, and the SI prefix. The divisor needed to reduce the value to three digits is calculated with pow(). The special case of values of 99 or less is handled by using a fixed digit count for values less than 1000 rather than the actual base ten digit count. Conversion from uint64_t to double will have a loss of precision for large values due to float having a 52 bit significand. This is not a problem for this code because only three significant digits are printed.Code size for the TI compiler is 20,636 bytes and the execution time for the test case is 1.22 seconds. The GCC compiler does not produce working code.

    static void sprint_f3d(char * s, double f) { static char const * const fmt[3] = { "%1.2f%c", "%2.1f%c", "%4.0f%c" }; div_t const d = div((f < 1000.0) ? 2 : (int)log10(f), 3); sprintf(s, fmt[d.rem], f / pow(1000.0, d.quot), " kMGTPEZY"[d.quot]); } The C standard library does not have integer versions of log10() and pow(), so another strategy will be used. The value is divided by 10 until it is less than 1000. The number of divisions is counted to determine the number of base ten digits. Stopping the division when the value is less than 1000 handles the special case for values of 99 or less by limiting the digit count to 3 or more. The resulting 3 digit value will be split as necessary for formatting using the div() library function.Code size for the TI compiler is 5,346 bytes and the execution time for the test case is 3.24 seconds.
    Code size for the GCC compiler is 36,040 bytes and the execution time for the test case is 981 milliseconds.
    The poor performance of the TI compiler is due the an inefficient intrinsic 64 bit unsigned divide.

    static void sprint_u64_a(char *s, uint64_t n) { unsigned d = 2; while(n >= 1000) n /= 10, ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } Using this 64 bit unsigned divide provides much better performance on the TI compiler, but much worse on the GCC compiler.Code size for the TI compiler is 5,666 bytes and the execution time for the test case is 396 milliseconds.
    Code size for the GCC compiler is 36,116 bytes and the execution time for the test case is 4.06 seconds.
    Future revisions will use intrinsic divide for GCC and this divide function for TI.

    static uint64_t divu64(uint64_t n, uint64_t d) { if((n < d) || (!d)) return 0; uint64_t register b = 1; if(((uint16_t)(n >> 48)) & 0x8000) { while(!(((uint16_t)(d >> 48)) & 0x8000)) d <<= 1, b <<= 1; if(n < d) d >>= 1, b >>= 1; } else { d <<= 1; while(n >= d) d <<= 1, b <<= 1; d >>= 1; } uint64_t q = b; n -= d; while(!(b & 1)) { d >>= 1; b >>= 1; if(n >= d) n -= d, q |= b; } return q; } The number of 64 bit divides can be reduced by using constants of 10 to the power of 2 to the power of N rather than iterative division by 10.Code size for the TI compiler is 5,886 bytes and the execution time for the test case is 160 milliseconds.
    Code size for the GCC compiler is 36,256 bytes and the execution time for the test case is 223 milliseconds.

    unsigned d = 2; if(n >= 1000000000000000000ULL) n = divu64(n, 10000000000000000ULL), d += 16; if(n >= 10000000000ULL) n = divu64(n, 100000000ULL), d += 8; if(n >= 1000000ULL) n = divu64(n, 10000ULL), d += 4; if(n >= 10000ULL) n = divu64(n, 100ULL), d += 2; if(n >= 1000ULL) n = divu64(n, 10ULL), ++d; The number of 64 bit divides can be reduced to one by using a table to determine the number of base ten digits. The table is searched for the highest value that is less than or equal to the value to be printed. The value is then divided by the table entry that is 1/100th of the value representing the base ten digit count to reduce the value to 3 digits.Code size for the TI compiler is 5,886 bytes and the execution time for the test case is 143 milliseconds.
    Code size for the GCC compiler is 36,096 bytes and the execution time for the test case is 143 milliseconds.

    static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; unsigned d = 2; if(n >= 1000) { d = 19; uint64_t const *p = pt; while (n < *p) ++p, --d; n = divu64(n, p[2]); } The sprintf() function takes significant space, so it is replaced with decimal to ASCII conversion using the div() library function.Code size for the TI compiler is 1,660 bytes and the execution time for the test case is 25.8 milliseconds.
    Code size for the GCC compiler is 3,384 bytes and the execution time for the test case is 82.0 milliseconds.

    div_t qr; qr.rem = (int)n; div_t const dp = div(d, 3); if(dp.rem == 2) *s++ = ' '; if(qr.rem < 100) { *s++ = ' '; qr.quot = 0; } else { qr = div(qr.rem, 100); *s++ = '0' + qr.quot; } if(dp.rem == 0) *s++ = '.'; if((!qr.quot) && (qr.rem < 10)) { *s++ = ' '; } else { qr = div(qr.rem, 10); *s++ = '0' + qr.quot; } if(dp.rem == 1) *s++ = '.'; *s++ = '0' + qr.rem; *s++ = " kMGTPE"[dp.quot]; *s = 0; The final optimization removes all division. The base ten tables values are used with iterative subtraction to create the ASCII string.Code size for the TI compiler is 1,368 bytes and the execution time for the test case is 10.8 milliseconds.
    Code size for the GCC compiler is 2,452 bytes and the execution time for the test case is 19.9 milliseconds.

    uint64_t const *p = pt + 17; if(n >= 1000) { p = pt; while (n < *p) ++p; } // n += (p[2] >> 1); // optional rounding int dp = p - pt; while (dp > 2) dp -= 3; if (dp == 2) *s++ = ' '; char c; if(n < 100) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *s = c; ++p; if(dp == 1) *++s = '.'; if((*s == ' ') && (n < 10)) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *++s = c; ++p; if(dp == 0) *++s = '.'; c = '0'; while (n >= *p) n -= *p, ++c; *++s = c; *++s = " EEPPPTTTGGGMMMkkk "[p - pt]; *++s = 0; Performance summary
    function TI GCC ---------------------------------------------- sprint_u64 1368 10.8 ms 2452 19.9 ms sprint_u64_e 1660 25.8 ms 3384 82.0 ms sprint_u64_d 5886 143 ms 36096 143 ms sprint_u64_c 5886 160 ms 36256 223 ms sprint_u64_b 5666 396 ms 36116 4.06 s sprint_u64_a 5346 3.24 s 36040 981 ms sprint_f3d 20636 1.22 s non-functional Network bandwidth display (lower right) using 3 digits.
     
    Complete code

    #include <msp430.h> #include <stdint.h> #include <stdlib.h> #include <string.h> #include <math.h> #include <stdio.h> #define PU64 sprint_u64 //#define PU64 sprint_u64_e //#define PU64 sprint_u64_d //#define PU64 sprint_u64_c //#define PU64 sprint_u64_b //#define USE_DIV_FUNC //#define PU64 sprint_u64_a //#define PU64 sprint_f3d //#define PU64 sprint_u64_null static void sprint_u64(char *s, uint64_t n) { static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; uint64_t const *p = pt + 17; if(n >= 1000) { p = pt; while (n < *p) ++p; } // n += (p[2] >> 1); // optional rounding int dp = p - pt; while (dp > 2) dp -= 3; if (dp == 2) *s++ = ' '; char c; if(n < 100) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *s = c; ++p; if(dp == 1) *++s = '.'; if((*s == ' ') && (n < 10)) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *++s = c; ++p; if(dp == 0) *++s = '.'; c = '0'; while (n >= *p) n -= *p, ++c; *++s = c; *++s = " EEPPPTTTGGGMMMkkk "[p - pt]; *++s = 0; } static uint64_t divu64(uint64_t n, uint64_t d) { #if defined( __GNUC__) & !defined(USE_DIV_FUNC) return n / d; #else if((n < d) || (!d)) return 0; uint64_t register b = 1; if(((uint16_t)(n >> 48)) & 0x8000) { while(!(((uint16_t)(d >> 48)) & 0x8000)) d <<= 1, b <<= 1; if(n < d) d >>= 1, b >>= 1; } else { d <<= 1; while(n >= d) d <<= 1, b <<= 1; d >>= 1; } uint64_t q = b; n -= d; while(!(b & 1)) { d >>= 1; b >>= 1; if(n >= d) n -= d, q |= b; } return q; #endif } static void sprint_u64_e(char *s, uint64_t n) { static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; unsigned d = 2; if(n >= 1000) { d = 19; uint64_t const *p = pt; while (n < *p) ++p, --d; n = divu64(n, p[2]); } div_t qr; qr.rem = (int)n; div_t const dp = div(d, 3); if(dp.rem == 2) *s++ = ' '; if(qr.rem < 100) { *s++ = ' '; qr.quot = 0; } else { qr = div(qr.rem, 100); *s++ = '0' + qr.quot; } if(dp.rem == 0) *s++ = '.'; if((!qr.quot) && (qr.rem < 10)) { *s++ = ' '; } else { qr = div(qr.rem, 10); *s++ = '0' + qr.quot; } if(dp.rem == 1) *s++ = '.'; *s++ = '0' + qr.rem; *s++ = " kMGTPE"[dp.quot]; *s = 0; } static void sprint_u64_d(char *s, uint64_t n) { static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; unsigned d = 2; if(n >= 1000) { d = 19; uint64_t const *p = pt; while (n < *p) ++p, --d; n = divu64(n, p[2]); } div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_u64_c(char *s, uint64_t n) { unsigned d = 2; if(n >= 1000000000000000000ULL) n = divu64(n, 10000000000000000ULL), d += 16; if(n >= 10000000000ULL) n = divu64(n, 100000000ULL), d += 8; if(n >= 1000000ULL) n = divu64(n, 10000ULL), d += 4; if(n >= 10000ULL) n = divu64(n, 100ULL), d += 2; if(n >= 1000ULL) n = divu64(n, 10ULL), ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_u64_b(char *s, uint64_t n) { unsigned d = 2; while(n >= 1000) n = divu64(n, 10), ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_u64_a(char *s, uint64_t n) { unsigned d = 2; while(n >= 1000) n /= 10, ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_f3d(char * s, double f) { static char const * const fmt[3] = { "%1.2f%c", "%2.1f%c", "%4.0f%c" }; div_t const d = div((f < 1000.0) ? 2 : (int)log10(f), 3); sprintf(s, fmt[d.rem], f / pow(1000.0, d.quot), " kMGTPEZY"[d.quot]); } static void sprint_u64_null(char *s, uint64_t n) { *s = 0; } static void print(char const *s) { while(*s) { while(!(UCA1IFG & UCTXIFG)); UCA1TXBUF = *s++; } } #define smclk_freq (32768UL * 31UL) // SMCLK frequency in hertz #define bps (9600UL) // Async serial bit rate int main(void) { WDTCTL = WDTPW | WDTHOLD; // Stop watchdog timer // P4SEL = BIT4 | BIT5; // Enable UART pins P4DIR = BIT4 | BIT5; // // // Initialize UART UCA1CTL1 = UCSWRST; // Hold USCI in reset to allow configuration UCA1CTL0 = 0; // No parity, LSB first, 8 bits, one stop bit, UART (async) const unsigned long brd = (smclk_freq + (bps >> 1)) / bps; // Bit rate divisor UCA1BR1 = (brd >> 12) & 0xFF; // High byte of whole divisor UCA1BR0 = (brd >> 4) & 0xFF; // Low byte of whole divisor UCA1MCTL = ((brd << 4) & 0xF0) | UCOS16; // Fractional divisor, oversampling mode UCA1CTL1 = UCSSEL_2; // Use SMCLK for bit rate generator, release reset static const uint64_t td[] = { 0ULL, 1ULL, 12ULL, 123ULL, 1234ULL, 12345ULL, 123456ULL, 1234567ULL, 12345678ULL, 123456789ULL, 1234567890ULL, 12345678901ULL, 123456789012ULL, 1234567890123ULL, 12345678901234ULL, 123456789012345ULL, 1234567890123456ULL, 12345678901234567ULL, 123456789012345678ULL, 1234567890123456789ULL, 12345678901234567890ULL, 18446744073709551615ULL }; char s[32]; int n; uint64_t const *p; // Print test array to application UART print("\r\n"); n = sizeof(td) / sizeof(td[0]); p = td; do { PU64(s, *p++); print(s); print("\r\n"); } while(--n); // Time the test array - make string only - do not print TA0EX0 = 7; TA0CTL = TASSEL_2 | ID_3 | MC_2; TA0CTL |= TACLR; n = sizeof(td) / sizeof(td[0]); p = td; do { PU64(s, *p++); } while(--n); uint64_t et = TA0R * 63ULL; // Elapsed time in microseconds PU64(s, et); print(s); print(" us\r\n"); for(;; return 0; }
  18. Like
    oPossum reacted to chicken in RGB 4x4 button thing   
    I really like working with the MSP430F550x. The ones without ADC (5500, 01, 02 and 03) are 10-20% cheaper than their equivalents with ADC. E.g the 5501 instead of 5508, though only available as QFN48.
     
    As for BSL: Yes, you can enter BSL mode programmatically with this bit of code: (using MSPware driverlib and usblib)
    void start_usb_boot_loader(void) {     typedef void (* __data16 functionpointer)(void);     const functionpointer bsloader = (functionpointer)(0x1000);     USB_disconnect();     USB_disable();     SYSBSLC &= ~(SYSBSLPE);     __disable_interrupt();     bsloader();          // won't return } Works well for my purposes. But it does not work reliably with some USB hubs which seem to power-cycle the device, i.e. reset the MCU. Maybe the USB_disconnect/disable lines are unnecessary, but I didn't bother to debug that yet. Just break out the pins to connect an ezFET in case you're stuck
     
    Also of note is, that new chips automatically go into BSL on power-up. That's very convenient to program them over USB.
     
    For programming I use the BSL_USB_GUI windows client. I find it more convenient than the Python and Java clients that come with usblib (also fewer dependencies, just an EXE and a DLL to distribute). The VisualStudio project looks simple enough to modify if one wants to reduce the number of clicks and/or include some automatic test procedures after flashing the firmware. You can find the client and source code in this package: http://www.ti.com/tool/msp430usbdevpack
     
    I never got the open source msp430-usbbsl command line tool working. Looks like a half-finished project that was abandoned once it worked for a specific MCU and hex file format.
     
    Hope this gets you started. Let me know if you have any questions or want to look at my complete USB source code.
  19. Like
    oPossum reacted to greeeg in RGB 4x4 button thing   
    It's been awhile since I've done a proper project, mainly been doing alot of tinkering.
     
    But I have a new project that I'd like to share. I wanted to make something like sparkfuns 2x2 Simon game. but using adafruits 4x4 silicone buttons.
    Ofcourse adafruit sell a breakout board with an i2c keypad scanner/single colour led driver.
     
    I thought how hard is it to replicate that design, but add SMD RGB LEDs under each button!! Now this is still a work in progress.
    Here is the schematic, there isn't too much to the design. Just a MSP430F55xx 64QFN and a TLC5955 48 channel 16bit LED driver.
    Because I'm using such a large MSP, I trashed a keypad multiplexing idea and went straight for direct. the same with the LEDs.

     
    The board isn't very fancy either. This was my first board with kicad, moved from altium. (I needed to make the change sometime, will not be able to afford a non-student license)

     
    I haven't doe any software for this project. and this is the first USB project I've ever made. So progress might be slow. but I'm interested if you guys have ideas for this platform.
     
    I'm not sure if the stock USB bootloader can be called from software? I know that the ezFET does this, but they use custom BSL code, which I might need to look into.
    I have a fair bit of leftovers and might consider placing some boards on tindie, the only downside is that the two ICs are expensive and practically double the BOM.
     
    ----Edit----
    TLC5955, not TLC5595
  20. Like
    oPossum got a reaction from roadrunner84 in Printing large numbers in small spaces (64 bits in 5 chars)   
    This code will print a 64 bit unsigned integer in 5 characters using 3 significant digits and and an SI prefix. Several revisions are shown beginning with the obvious implementation and then optimizing it to improve speed and decrease code size. Compilers used are TI 4.4.4 and GCC 4.9.1 Test hardware is the F5529 Launchpad running at the default clock of 1.016 MHz.
     
    The general strategy of the code is:
    - Count the number of digits in the decimal representation
    - Divide the value to reduce it to 3 digits.
    - Print the 3 digits with proper formatting and SI prefix.
    - Handle the special case of values of 99 or less that must be printed with 2 or 1 digits and no decimal.
     
    A series of values from zero to the maximum possible 64 bit value are used to test the performance and correct operation of the code...

    static const uint64_t td[] = { 0ULL, 1ULL, 12ULL, 123ULL, 1234ULL, 12345ULL, 123456ULL, 1234567ULL, 12345678ULL, 123456789ULL, 1234567890ULL, 12345678901ULL, 123456789012ULL, 1234567890123ULL, 12345678901234ULL, 123456789012345ULL, 1234567890123456ULL, 12345678901234567ULL, 123456789012345678ULL, 1234567890123456789ULL, 12345678901234567890ULL, 18446744073709551615ULL }; The first version uses C standard library functions for a very simple implementation. Decimal digits are counted using log10(). Dividing the digit count by three using the div() function provides values for formatting, division, and the SI prefix. The divisor needed to reduce the value to three digits is calculated with pow(). The special case of values of 99 or less is handled by using a fixed digit count for values less than 1000 rather than the actual base ten digit count. Conversion from uint64_t to double will have a loss of precision for large values due to float having a 52 bit significand. This is not a problem for this code because only three significant digits are printed.Code size for the TI compiler is 20,636 bytes and the execution time for the test case is 1.22 seconds. The GCC compiler does not produce working code.

    static void sprint_f3d(char * s, double f) { static char const * const fmt[3] = { "%1.2f%c", "%2.1f%c", "%4.0f%c" }; div_t const d = div((f < 1000.0) ? 2 : (int)log10(f), 3); sprintf(s, fmt[d.rem], f / pow(1000.0, d.quot), " kMGTPEZY"[d.quot]); } The C standard library does not have integer versions of log10() and pow(), so another strategy will be used. The value is divided by 10 until it is less than 1000. The number of divisions is counted to determine the number of base ten digits. Stopping the division when the value is less than 1000 handles the special case for values of 99 or less by limiting the digit count to 3 or more. The resulting 3 digit value will be split as necessary for formatting using the div() library function.Code size for the TI compiler is 5,346 bytes and the execution time for the test case is 3.24 seconds.
    Code size for the GCC compiler is 36,040 bytes and the execution time for the test case is 981 milliseconds.
    The poor performance of the TI compiler is due the an inefficient intrinsic 64 bit unsigned divide.

    static void sprint_u64_a(char *s, uint64_t n) { unsigned d = 2; while(n >= 1000) n /= 10, ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } Using this 64 bit unsigned divide provides much better performance on the TI compiler, but much worse on the GCC compiler.Code size for the TI compiler is 5,666 bytes and the execution time for the test case is 396 milliseconds.
    Code size for the GCC compiler is 36,116 bytes and the execution time for the test case is 4.06 seconds.
    Future revisions will use intrinsic divide for GCC and this divide function for TI.

    static uint64_t divu64(uint64_t n, uint64_t d) { if((n < d) || (!d)) return 0; uint64_t register b = 1; if(((uint16_t)(n >> 48)) & 0x8000) { while(!(((uint16_t)(d >> 48)) & 0x8000)) d <<= 1, b <<= 1; if(n < d) d >>= 1, b >>= 1; } else { d <<= 1; while(n >= d) d <<= 1, b <<= 1; d >>= 1; } uint64_t q = b; n -= d; while(!(b & 1)) { d >>= 1; b >>= 1; if(n >= d) n -= d, q |= b; } return q; } The number of 64 bit divides can be reduced by using constants of 10 to the power of 2 to the power of N rather than iterative division by 10.Code size for the TI compiler is 5,886 bytes and the execution time for the test case is 160 milliseconds.
    Code size for the GCC compiler is 36,256 bytes and the execution time for the test case is 223 milliseconds.

    unsigned d = 2; if(n >= 1000000000000000000ULL) n = divu64(n, 10000000000000000ULL), d += 16; if(n >= 10000000000ULL) n = divu64(n, 100000000ULL), d += 8; if(n >= 1000000ULL) n = divu64(n, 10000ULL), d += 4; if(n >= 10000ULL) n = divu64(n, 100ULL), d += 2; if(n >= 1000ULL) n = divu64(n, 10ULL), ++d; The number of 64 bit divides can be reduced to one by using a table to determine the number of base ten digits. The table is searched for the highest value that is less than or equal to the value to be printed. The value is then divided by the table entry that is 1/100th of the value representing the base ten digit count to reduce the value to 3 digits.Code size for the TI compiler is 5,886 bytes and the execution time for the test case is 143 milliseconds.
    Code size for the GCC compiler is 36,096 bytes and the execution time for the test case is 143 milliseconds.

    static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; unsigned d = 2; if(n >= 1000) { d = 19; uint64_t const *p = pt; while (n < *p) ++p, --d; n = divu64(n, p[2]); } The sprintf() function takes significant space, so it is replaced with decimal to ASCII conversion using the div() library function.Code size for the TI compiler is 1,660 bytes and the execution time for the test case is 25.8 milliseconds.
    Code size for the GCC compiler is 3,384 bytes and the execution time for the test case is 82.0 milliseconds.

    div_t qr; qr.rem = (int)n; div_t const dp = div(d, 3); if(dp.rem == 2) *s++ = ' '; if(qr.rem < 100) { *s++ = ' '; qr.quot = 0; } else { qr = div(qr.rem, 100); *s++ = '0' + qr.quot; } if(dp.rem == 0) *s++ = '.'; if((!qr.quot) && (qr.rem < 10)) { *s++ = ' '; } else { qr = div(qr.rem, 10); *s++ = '0' + qr.quot; } if(dp.rem == 1) *s++ = '.'; *s++ = '0' + qr.rem; *s++ = " kMGTPE"[dp.quot]; *s = 0; The final optimization removes all division. The base ten tables values are used with iterative subtraction to create the ASCII string.Code size for the TI compiler is 1,368 bytes and the execution time for the test case is 10.8 milliseconds.
    Code size for the GCC compiler is 2,452 bytes and the execution time for the test case is 19.9 milliseconds.

    uint64_t const *p = pt + 17; if(n >= 1000) { p = pt; while (n < *p) ++p; } // n += (p[2] >> 1); // optional rounding int dp = p - pt; while (dp > 2) dp -= 3; if (dp == 2) *s++ = ' '; char c; if(n < 100) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *s = c; ++p; if(dp == 1) *++s = '.'; if((*s == ' ') && (n < 10)) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *++s = c; ++p; if(dp == 0) *++s = '.'; c = '0'; while (n >= *p) n -= *p, ++c; *++s = c; *++s = " EEPPPTTTGGGMMMkkk "[p - pt]; *++s = 0; Performance summary
    function TI GCC ---------------------------------------------- sprint_u64 1368 10.8 ms 2452 19.9 ms sprint_u64_e 1660 25.8 ms 3384 82.0 ms sprint_u64_d 5886 143 ms 36096 143 ms sprint_u64_c 5886 160 ms 36256 223 ms sprint_u64_b 5666 396 ms 36116 4.06 s sprint_u64_a 5346 3.24 s 36040 981 ms sprint_f3d 20636 1.22 s non-functional Network bandwidth display (lower right) using 3 digits.
     
    Complete code

    #include <msp430.h> #include <stdint.h> #include <stdlib.h> #include <string.h> #include <math.h> #include <stdio.h> #define PU64 sprint_u64 //#define PU64 sprint_u64_e //#define PU64 sprint_u64_d //#define PU64 sprint_u64_c //#define PU64 sprint_u64_b //#define USE_DIV_FUNC //#define PU64 sprint_u64_a //#define PU64 sprint_f3d //#define PU64 sprint_u64_null static void sprint_u64(char *s, uint64_t n) { static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; uint64_t const *p = pt + 17; if(n >= 1000) { p = pt; while (n < *p) ++p; } // n += (p[2] >> 1); // optional rounding int dp = p - pt; while (dp > 2) dp -= 3; if (dp == 2) *s++ = ' '; char c; if(n < 100) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *s = c; ++p; if(dp == 1) *++s = '.'; if((*s == ' ') && (n < 10)) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *++s = c; ++p; if(dp == 0) *++s = '.'; c = '0'; while (n >= *p) n -= *p, ++c; *++s = c; *++s = " EEPPPTTTGGGMMMkkk "[p - pt]; *++s = 0; } static uint64_t divu64(uint64_t n, uint64_t d) { #if defined( __GNUC__) & !defined(USE_DIV_FUNC) return n / d; #else if((n < d) || (!d)) return 0; uint64_t register b = 1; if(((uint16_t)(n >> 48)) & 0x8000) { while(!(((uint16_t)(d >> 48)) & 0x8000)) d <<= 1, b <<= 1; if(n < d) d >>= 1, b >>= 1; } else { d <<= 1; while(n >= d) d <<= 1, b <<= 1; d >>= 1; } uint64_t q = b; n -= d; while(!(b & 1)) { d >>= 1; b >>= 1; if(n >= d) n -= d, q |= b; } return q; #endif } static void sprint_u64_e(char *s, uint64_t n) { static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; unsigned d = 2; if(n >= 1000) { d = 19; uint64_t const *p = pt; while (n < *p) ++p, --d; n = divu64(n, p[2]); } div_t qr; qr.rem = (int)n; div_t const dp = div(d, 3); if(dp.rem == 2) *s++ = ' '; if(qr.rem < 100) { *s++ = ' '; qr.quot = 0; } else { qr = div(qr.rem, 100); *s++ = '0' + qr.quot; } if(dp.rem == 0) *s++ = '.'; if((!qr.quot) && (qr.rem < 10)) { *s++ = ' '; } else { qr = div(qr.rem, 10); *s++ = '0' + qr.quot; } if(dp.rem == 1) *s++ = '.'; *s++ = '0' + qr.rem; *s++ = " kMGTPE"[dp.quot]; *s = 0; } static void sprint_u64_d(char *s, uint64_t n) { static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; unsigned d = 2; if(n >= 1000) { d = 19; uint64_t const *p = pt; while (n < *p) ++p, --d; n = divu64(n, p[2]); } div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_u64_c(char *s, uint64_t n) { unsigned d = 2; if(n >= 1000000000000000000ULL) n = divu64(n, 10000000000000000ULL), d += 16; if(n >= 10000000000ULL) n = divu64(n, 100000000ULL), d += 8; if(n >= 1000000ULL) n = divu64(n, 10000ULL), d += 4; if(n >= 10000ULL) n = divu64(n, 100ULL), d += 2; if(n >= 1000ULL) n = divu64(n, 10ULL), ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_u64_b(char *s, uint64_t n) { unsigned d = 2; while(n >= 1000) n = divu64(n, 10), ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_u64_a(char *s, uint64_t n) { unsigned d = 2; while(n >= 1000) n /= 10, ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_f3d(char * s, double f) { static char const * const fmt[3] = { "%1.2f%c", "%2.1f%c", "%4.0f%c" }; div_t const d = div((f < 1000.0) ? 2 : (int)log10(f), 3); sprintf(s, fmt[d.rem], f / pow(1000.0, d.quot), " kMGTPEZY"[d.quot]); } static void sprint_u64_null(char *s, uint64_t n) { *s = 0; } static void print(char const *s) { while(*s) { while(!(UCA1IFG & UCTXIFG)); UCA1TXBUF = *s++; } } #define smclk_freq (32768UL * 31UL) // SMCLK frequency in hertz #define bps (9600UL) // Async serial bit rate int main(void) { WDTCTL = WDTPW | WDTHOLD; // Stop watchdog timer // P4SEL = BIT4 | BIT5; // Enable UART pins P4DIR = BIT4 | BIT5; // // // Initialize UART UCA1CTL1 = UCSWRST; // Hold USCI in reset to allow configuration UCA1CTL0 = 0; // No parity, LSB first, 8 bits, one stop bit, UART (async) const unsigned long brd = (smclk_freq + (bps >> 1)) / bps; // Bit rate divisor UCA1BR1 = (brd >> 12) & 0xFF; // High byte of whole divisor UCA1BR0 = (brd >> 4) & 0xFF; // Low byte of whole divisor UCA1MCTL = ((brd << 4) & 0xF0) | UCOS16; // Fractional divisor, oversampling mode UCA1CTL1 = UCSSEL_2; // Use SMCLK for bit rate generator, release reset static const uint64_t td[] = { 0ULL, 1ULL, 12ULL, 123ULL, 1234ULL, 12345ULL, 123456ULL, 1234567ULL, 12345678ULL, 123456789ULL, 1234567890ULL, 12345678901ULL, 123456789012ULL, 1234567890123ULL, 12345678901234ULL, 123456789012345ULL, 1234567890123456ULL, 12345678901234567ULL, 123456789012345678ULL, 1234567890123456789ULL, 12345678901234567890ULL, 18446744073709551615ULL }; char s[32]; int n; uint64_t const *p; // Print test array to application UART print("\r\n"); n = sizeof(td) / sizeof(td[0]); p = td; do { PU64(s, *p++); print(s); print("\r\n"); } while(--n); // Time the test array - make string only - do not print TA0EX0 = 7; TA0CTL = TASSEL_2 | ID_3 | MC_2; TA0CTL |= TACLR; n = sizeof(td) / sizeof(td[0]); p = td; do { PU64(s, *p++); } while(--n); uint64_t et = TA0R * 63ULL; // Elapsed time in microseconds PU64(s, et); print(s); print(" us\r\n"); for(;; return 0; }
  21. Like
    oPossum got a reaction from spirilis in Printing large numbers in small spaces (64 bits in 5 chars)   
    This code will print a 64 bit unsigned integer in 5 characters using 3 significant digits and and an SI prefix. Several revisions are shown beginning with the obvious implementation and then optimizing it to improve speed and decrease code size. Compilers used are TI 4.4.4 and GCC 4.9.1 Test hardware is the F5529 Launchpad running at the default clock of 1.016 MHz.
     
    The general strategy of the code is:
    - Count the number of digits in the decimal representation
    - Divide the value to reduce it to 3 digits.
    - Print the 3 digits with proper formatting and SI prefix.
    - Handle the special case of values of 99 or less that must be printed with 2 or 1 digits and no decimal.
     
    A series of values from zero to the maximum possible 64 bit value are used to test the performance and correct operation of the code...

    static const uint64_t td[] = { 0ULL, 1ULL, 12ULL, 123ULL, 1234ULL, 12345ULL, 123456ULL, 1234567ULL, 12345678ULL, 123456789ULL, 1234567890ULL, 12345678901ULL, 123456789012ULL, 1234567890123ULL, 12345678901234ULL, 123456789012345ULL, 1234567890123456ULL, 12345678901234567ULL, 123456789012345678ULL, 1234567890123456789ULL, 12345678901234567890ULL, 18446744073709551615ULL }; The first version uses C standard library functions for a very simple implementation. Decimal digits are counted using log10(). Dividing the digit count by three using the div() function provides values for formatting, division, and the SI prefix. The divisor needed to reduce the value to three digits is calculated with pow(). The special case of values of 99 or less is handled by using a fixed digit count for values less than 1000 rather than the actual base ten digit count. Conversion from uint64_t to double will have a loss of precision for large values due to float having a 52 bit significand. This is not a problem for this code because only three significant digits are printed.Code size for the TI compiler is 20,636 bytes and the execution time for the test case is 1.22 seconds. The GCC compiler does not produce working code.

    static void sprint_f3d(char * s, double f) { static char const * const fmt[3] = { "%1.2f%c", "%2.1f%c", "%4.0f%c" }; div_t const d = div((f < 1000.0) ? 2 : (int)log10(f), 3); sprintf(s, fmt[d.rem], f / pow(1000.0, d.quot), " kMGTPEZY"[d.quot]); } The C standard library does not have integer versions of log10() and pow(), so another strategy will be used. The value is divided by 10 until it is less than 1000. The number of divisions is counted to determine the number of base ten digits. Stopping the division when the value is less than 1000 handles the special case for values of 99 or less by limiting the digit count to 3 or more. The resulting 3 digit value will be split as necessary for formatting using the div() library function.Code size for the TI compiler is 5,346 bytes and the execution time for the test case is 3.24 seconds.
    Code size for the GCC compiler is 36,040 bytes and the execution time for the test case is 981 milliseconds.
    The poor performance of the TI compiler is due the an inefficient intrinsic 64 bit unsigned divide.

    static void sprint_u64_a(char *s, uint64_t n) { unsigned d = 2; while(n >= 1000) n /= 10, ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } Using this 64 bit unsigned divide provides much better performance on the TI compiler, but much worse on the GCC compiler.Code size for the TI compiler is 5,666 bytes and the execution time for the test case is 396 milliseconds.
    Code size for the GCC compiler is 36,116 bytes and the execution time for the test case is 4.06 seconds.
    Future revisions will use intrinsic divide for GCC and this divide function for TI.

    static uint64_t divu64(uint64_t n, uint64_t d) { if((n < d) || (!d)) return 0; uint64_t register b = 1; if(((uint16_t)(n >> 48)) & 0x8000) { while(!(((uint16_t)(d >> 48)) & 0x8000)) d <<= 1, b <<= 1; if(n < d) d >>= 1, b >>= 1; } else { d <<= 1; while(n >= d) d <<= 1, b <<= 1; d >>= 1; } uint64_t q = b; n -= d; while(!(b & 1)) { d >>= 1; b >>= 1; if(n >= d) n -= d, q |= b; } return q; } The number of 64 bit divides can be reduced by using constants of 10 to the power of 2 to the power of N rather than iterative division by 10.Code size for the TI compiler is 5,886 bytes and the execution time for the test case is 160 milliseconds.
    Code size for the GCC compiler is 36,256 bytes and the execution time for the test case is 223 milliseconds.

    unsigned d = 2; if(n >= 1000000000000000000ULL) n = divu64(n, 10000000000000000ULL), d += 16; if(n >= 10000000000ULL) n = divu64(n, 100000000ULL), d += 8; if(n >= 1000000ULL) n = divu64(n, 10000ULL), d += 4; if(n >= 10000ULL) n = divu64(n, 100ULL), d += 2; if(n >= 1000ULL) n = divu64(n, 10ULL), ++d; The number of 64 bit divides can be reduced to one by using a table to determine the number of base ten digits. The table is searched for the highest value that is less than or equal to the value to be printed. The value is then divided by the table entry that is 1/100th of the value representing the base ten digit count to reduce the value to 3 digits.Code size for the TI compiler is 5,886 bytes and the execution time for the test case is 143 milliseconds.
    Code size for the GCC compiler is 36,096 bytes and the execution time for the test case is 143 milliseconds.

    static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; unsigned d = 2; if(n >= 1000) { d = 19; uint64_t const *p = pt; while (n < *p) ++p, --d; n = divu64(n, p[2]); } The sprintf() function takes significant space, so it is replaced with decimal to ASCII conversion using the div() library function.Code size for the TI compiler is 1,660 bytes and the execution time for the test case is 25.8 milliseconds.
    Code size for the GCC compiler is 3,384 bytes and the execution time for the test case is 82.0 milliseconds.

    div_t qr; qr.rem = (int)n; div_t const dp = div(d, 3); if(dp.rem == 2) *s++ = ' '; if(qr.rem < 100) { *s++ = ' '; qr.quot = 0; } else { qr = div(qr.rem, 100); *s++ = '0' + qr.quot; } if(dp.rem == 0) *s++ = '.'; if((!qr.quot) && (qr.rem < 10)) { *s++ = ' '; } else { qr = div(qr.rem, 10); *s++ = '0' + qr.quot; } if(dp.rem == 1) *s++ = '.'; *s++ = '0' + qr.rem; *s++ = " kMGTPE"[dp.quot]; *s = 0; The final optimization removes all division. The base ten tables values are used with iterative subtraction to create the ASCII string.Code size for the TI compiler is 1,368 bytes and the execution time for the test case is 10.8 milliseconds.
    Code size for the GCC compiler is 2,452 bytes and the execution time for the test case is 19.9 milliseconds.

    uint64_t const *p = pt + 17; if(n >= 1000) { p = pt; while (n < *p) ++p; } // n += (p[2] >> 1); // optional rounding int dp = p - pt; while (dp > 2) dp -= 3; if (dp == 2) *s++ = ' '; char c; if(n < 100) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *s = c; ++p; if(dp == 1) *++s = '.'; if((*s == ' ') && (n < 10)) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *++s = c; ++p; if(dp == 0) *++s = '.'; c = '0'; while (n >= *p) n -= *p, ++c; *++s = c; *++s = " EEPPPTTTGGGMMMkkk "[p - pt]; *++s = 0; Performance summary
    function TI GCC ---------------------------------------------- sprint_u64 1368 10.8 ms 2452 19.9 ms sprint_u64_e 1660 25.8 ms 3384 82.0 ms sprint_u64_d 5886 143 ms 36096 143 ms sprint_u64_c 5886 160 ms 36256 223 ms sprint_u64_b 5666 396 ms 36116 4.06 s sprint_u64_a 5346 3.24 s 36040 981 ms sprint_f3d 20636 1.22 s non-functional Network bandwidth display (lower right) using 3 digits.
     
    Complete code

    #include <msp430.h> #include <stdint.h> #include <stdlib.h> #include <string.h> #include <math.h> #include <stdio.h> #define PU64 sprint_u64 //#define PU64 sprint_u64_e //#define PU64 sprint_u64_d //#define PU64 sprint_u64_c //#define PU64 sprint_u64_b //#define USE_DIV_FUNC //#define PU64 sprint_u64_a //#define PU64 sprint_f3d //#define PU64 sprint_u64_null static void sprint_u64(char *s, uint64_t n) { static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; uint64_t const *p = pt + 17; if(n >= 1000) { p = pt; while (n < *p) ++p; } // n += (p[2] >> 1); // optional rounding int dp = p - pt; while (dp > 2) dp -= 3; if (dp == 2) *s++ = ' '; char c; if(n < 100) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *s = c; ++p; if(dp == 1) *++s = '.'; if((*s == ' ') && (n < 10)) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *++s = c; ++p; if(dp == 0) *++s = '.'; c = '0'; while (n >= *p) n -= *p, ++c; *++s = c; *++s = " EEPPPTTTGGGMMMkkk "[p - pt]; *++s = 0; } static uint64_t divu64(uint64_t n, uint64_t d) { #if defined( __GNUC__) & !defined(USE_DIV_FUNC) return n / d; #else if((n < d) || (!d)) return 0; uint64_t register b = 1; if(((uint16_t)(n >> 48)) & 0x8000) { while(!(((uint16_t)(d >> 48)) & 0x8000)) d <<= 1, b <<= 1; if(n < d) d >>= 1, b >>= 1; } else { d <<= 1; while(n >= d) d <<= 1, b <<= 1; d >>= 1; } uint64_t q = b; n -= d; while(!(b & 1)) { d >>= 1; b >>= 1; if(n >= d) n -= d, q |= b; } return q; #endif } static void sprint_u64_e(char *s, uint64_t n) { static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; unsigned d = 2; if(n >= 1000) { d = 19; uint64_t const *p = pt; while (n < *p) ++p, --d; n = divu64(n, p[2]); } div_t qr; qr.rem = (int)n; div_t const dp = div(d, 3); if(dp.rem == 2) *s++ = ' '; if(qr.rem < 100) { *s++ = ' '; qr.quot = 0; } else { qr = div(qr.rem, 100); *s++ = '0' + qr.quot; } if(dp.rem == 0) *s++ = '.'; if((!qr.quot) && (qr.rem < 10)) { *s++ = ' '; } else { qr = div(qr.rem, 10); *s++ = '0' + qr.quot; } if(dp.rem == 1) *s++ = '.'; *s++ = '0' + qr.rem; *s++ = " kMGTPE"[dp.quot]; *s = 0; } static void sprint_u64_d(char *s, uint64_t n) { static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; unsigned d = 2; if(n >= 1000) { d = 19; uint64_t const *p = pt; while (n < *p) ++p, --d; n = divu64(n, p[2]); } div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_u64_c(char *s, uint64_t n) { unsigned d = 2; if(n >= 1000000000000000000ULL) n = divu64(n, 10000000000000000ULL), d += 16; if(n >= 10000000000ULL) n = divu64(n, 100000000ULL), d += 8; if(n >= 1000000ULL) n = divu64(n, 10000ULL), d += 4; if(n >= 10000ULL) n = divu64(n, 100ULL), d += 2; if(n >= 1000ULL) n = divu64(n, 10ULL), ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_u64_b(char *s, uint64_t n) { unsigned d = 2; while(n >= 1000) n = divu64(n, 10), ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_u64_a(char *s, uint64_t n) { unsigned d = 2; while(n >= 1000) n /= 10, ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_f3d(char * s, double f) { static char const * const fmt[3] = { "%1.2f%c", "%2.1f%c", "%4.0f%c" }; div_t const d = div((f < 1000.0) ? 2 : (int)log10(f), 3); sprintf(s, fmt[d.rem], f / pow(1000.0, d.quot), " kMGTPEZY"[d.quot]); } static void sprint_u64_null(char *s, uint64_t n) { *s = 0; } static void print(char const *s) { while(*s) { while(!(UCA1IFG & UCTXIFG)); UCA1TXBUF = *s++; } } #define smclk_freq (32768UL * 31UL) // SMCLK frequency in hertz #define bps (9600UL) // Async serial bit rate int main(void) { WDTCTL = WDTPW | WDTHOLD; // Stop watchdog timer // P4SEL = BIT4 | BIT5; // Enable UART pins P4DIR = BIT4 | BIT5; // // // Initialize UART UCA1CTL1 = UCSWRST; // Hold USCI in reset to allow configuration UCA1CTL0 = 0; // No parity, LSB first, 8 bits, one stop bit, UART (async) const unsigned long brd = (smclk_freq + (bps >> 1)) / bps; // Bit rate divisor UCA1BR1 = (brd >> 12) & 0xFF; // High byte of whole divisor UCA1BR0 = (brd >> 4) & 0xFF; // Low byte of whole divisor UCA1MCTL = ((brd << 4) & 0xF0) | UCOS16; // Fractional divisor, oversampling mode UCA1CTL1 = UCSSEL_2; // Use SMCLK for bit rate generator, release reset static const uint64_t td[] = { 0ULL, 1ULL, 12ULL, 123ULL, 1234ULL, 12345ULL, 123456ULL, 1234567ULL, 12345678ULL, 123456789ULL, 1234567890ULL, 12345678901ULL, 123456789012ULL, 1234567890123ULL, 12345678901234ULL, 123456789012345ULL, 1234567890123456ULL, 12345678901234567ULL, 123456789012345678ULL, 1234567890123456789ULL, 12345678901234567890ULL, 18446744073709551615ULL }; char s[32]; int n; uint64_t const *p; // Print test array to application UART print("\r\n"); n = sizeof(td) / sizeof(td[0]); p = td; do { PU64(s, *p++); print(s); print("\r\n"); } while(--n); // Time the test array - make string only - do not print TA0EX0 = 7; TA0CTL = TASSEL_2 | ID_3 | MC_2; TA0CTL |= TACLR; n = sizeof(td) / sizeof(td[0]); p = td; do { PU64(s, *p++); } while(--n); uint64_t et = TA0R * 63ULL; // Elapsed time in microseconds PU64(s, et); print(s); print(" us\r\n"); for(;; return 0; }
  22. Like
    oPossum got a reaction from pine in Printing large numbers in small spaces (64 bits in 5 chars)   
    This code will print a 64 bit unsigned integer in 5 characters using 3 significant digits and and an SI prefix. Several revisions are shown beginning with the obvious implementation and then optimizing it to improve speed and decrease code size. Compilers used are TI 4.4.4 and GCC 4.9.1 Test hardware is the F5529 Launchpad running at the default clock of 1.016 MHz.
     
    The general strategy of the code is:
    - Count the number of digits in the decimal representation
    - Divide the value to reduce it to 3 digits.
    - Print the 3 digits with proper formatting and SI prefix.
    - Handle the special case of values of 99 or less that must be printed with 2 or 1 digits and no decimal.
     
    A series of values from zero to the maximum possible 64 bit value are used to test the performance and correct operation of the code...

    static const uint64_t td[] = { 0ULL, 1ULL, 12ULL, 123ULL, 1234ULL, 12345ULL, 123456ULL, 1234567ULL, 12345678ULL, 123456789ULL, 1234567890ULL, 12345678901ULL, 123456789012ULL, 1234567890123ULL, 12345678901234ULL, 123456789012345ULL, 1234567890123456ULL, 12345678901234567ULL, 123456789012345678ULL, 1234567890123456789ULL, 12345678901234567890ULL, 18446744073709551615ULL }; The first version uses C standard library functions for a very simple implementation. Decimal digits are counted using log10(). Dividing the digit count by three using the div() function provides values for formatting, division, and the SI prefix. The divisor needed to reduce the value to three digits is calculated with pow(). The special case of values of 99 or less is handled by using a fixed digit count for values less than 1000 rather than the actual base ten digit count. Conversion from uint64_t to double will have a loss of precision for large values due to float having a 52 bit significand. This is not a problem for this code because only three significant digits are printed.Code size for the TI compiler is 20,636 bytes and the execution time for the test case is 1.22 seconds. The GCC compiler does not produce working code.

    static void sprint_f3d(char * s, double f) { static char const * const fmt[3] = { "%1.2f%c", "%2.1f%c", "%4.0f%c" }; div_t const d = div((f < 1000.0) ? 2 : (int)log10(f), 3); sprintf(s, fmt[d.rem], f / pow(1000.0, d.quot), " kMGTPEZY"[d.quot]); } The C standard library does not have integer versions of log10() and pow(), so another strategy will be used. The value is divided by 10 until it is less than 1000. The number of divisions is counted to determine the number of base ten digits. Stopping the division when the value is less than 1000 handles the special case for values of 99 or less by limiting the digit count to 3 or more. The resulting 3 digit value will be split as necessary for formatting using the div() library function.Code size for the TI compiler is 5,346 bytes and the execution time for the test case is 3.24 seconds.
    Code size for the GCC compiler is 36,040 bytes and the execution time for the test case is 981 milliseconds.
    The poor performance of the TI compiler is due the an inefficient intrinsic 64 bit unsigned divide.

    static void sprint_u64_a(char *s, uint64_t n) { unsigned d = 2; while(n >= 1000) n /= 10, ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } Using this 64 bit unsigned divide provides much better performance on the TI compiler, but much worse on the GCC compiler.Code size for the TI compiler is 5,666 bytes and the execution time for the test case is 396 milliseconds.
    Code size for the GCC compiler is 36,116 bytes and the execution time for the test case is 4.06 seconds.
    Future revisions will use intrinsic divide for GCC and this divide function for TI.

    static uint64_t divu64(uint64_t n, uint64_t d) { if((n < d) || (!d)) return 0; uint64_t register b = 1; if(((uint16_t)(n >> 48)) & 0x8000) { while(!(((uint16_t)(d >> 48)) & 0x8000)) d <<= 1, b <<= 1; if(n < d) d >>= 1, b >>= 1; } else { d <<= 1; while(n >= d) d <<= 1, b <<= 1; d >>= 1; } uint64_t q = b; n -= d; while(!(b & 1)) { d >>= 1; b >>= 1; if(n >= d) n -= d, q |= b; } return q; } The number of 64 bit divides can be reduced by using constants of 10 to the power of 2 to the power of N rather than iterative division by 10.Code size for the TI compiler is 5,886 bytes and the execution time for the test case is 160 milliseconds.
    Code size for the GCC compiler is 36,256 bytes and the execution time for the test case is 223 milliseconds.

    unsigned d = 2; if(n >= 1000000000000000000ULL) n = divu64(n, 10000000000000000ULL), d += 16; if(n >= 10000000000ULL) n = divu64(n, 100000000ULL), d += 8; if(n >= 1000000ULL) n = divu64(n, 10000ULL), d += 4; if(n >= 10000ULL) n = divu64(n, 100ULL), d += 2; if(n >= 1000ULL) n = divu64(n, 10ULL), ++d; The number of 64 bit divides can be reduced to one by using a table to determine the number of base ten digits. The table is searched for the highest value that is less than or equal to the value to be printed. The value is then divided by the table entry that is 1/100th of the value representing the base ten digit count to reduce the value to 3 digits.Code size for the TI compiler is 5,886 bytes and the execution time for the test case is 143 milliseconds.
    Code size for the GCC compiler is 36,096 bytes and the execution time for the test case is 143 milliseconds.

    static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; unsigned d = 2; if(n >= 1000) { d = 19; uint64_t const *p = pt; while (n < *p) ++p, --d; n = divu64(n, p[2]); } The sprintf() function takes significant space, so it is replaced with decimal to ASCII conversion using the div() library function.Code size for the TI compiler is 1,660 bytes and the execution time for the test case is 25.8 milliseconds.
    Code size for the GCC compiler is 3,384 bytes and the execution time for the test case is 82.0 milliseconds.

    div_t qr; qr.rem = (int)n; div_t const dp = div(d, 3); if(dp.rem == 2) *s++ = ' '; if(qr.rem < 100) { *s++ = ' '; qr.quot = 0; } else { qr = div(qr.rem, 100); *s++ = '0' + qr.quot; } if(dp.rem == 0) *s++ = '.'; if((!qr.quot) && (qr.rem < 10)) { *s++ = ' '; } else { qr = div(qr.rem, 10); *s++ = '0' + qr.quot; } if(dp.rem == 1) *s++ = '.'; *s++ = '0' + qr.rem; *s++ = " kMGTPE"[dp.quot]; *s = 0; The final optimization removes all division. The base ten tables values are used with iterative subtraction to create the ASCII string.Code size for the TI compiler is 1,368 bytes and the execution time for the test case is 10.8 milliseconds.
    Code size for the GCC compiler is 2,452 bytes and the execution time for the test case is 19.9 milliseconds.

    uint64_t const *p = pt + 17; if(n >= 1000) { p = pt; while (n < *p) ++p; } // n += (p[2] >> 1); // optional rounding int dp = p - pt; while (dp > 2) dp -= 3; if (dp == 2) *s++ = ' '; char c; if(n < 100) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *s = c; ++p; if(dp == 1) *++s = '.'; if((*s == ' ') && (n < 10)) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *++s = c; ++p; if(dp == 0) *++s = '.'; c = '0'; while (n >= *p) n -= *p, ++c; *++s = c; *++s = " EEPPPTTTGGGMMMkkk "[p - pt]; *++s = 0; Performance summary
    function TI GCC ---------------------------------------------- sprint_u64 1368 10.8 ms 2452 19.9 ms sprint_u64_e 1660 25.8 ms 3384 82.0 ms sprint_u64_d 5886 143 ms 36096 143 ms sprint_u64_c 5886 160 ms 36256 223 ms sprint_u64_b 5666 396 ms 36116 4.06 s sprint_u64_a 5346 3.24 s 36040 981 ms sprint_f3d 20636 1.22 s non-functional Network bandwidth display (lower right) using 3 digits.
     
    Complete code

    #include <msp430.h> #include <stdint.h> #include <stdlib.h> #include <string.h> #include <math.h> #include <stdio.h> #define PU64 sprint_u64 //#define PU64 sprint_u64_e //#define PU64 sprint_u64_d //#define PU64 sprint_u64_c //#define PU64 sprint_u64_b //#define USE_DIV_FUNC //#define PU64 sprint_u64_a //#define PU64 sprint_f3d //#define PU64 sprint_u64_null static void sprint_u64(char *s, uint64_t n) { static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; uint64_t const *p = pt + 17; if(n >= 1000) { p = pt; while (n < *p) ++p; } // n += (p[2] >> 1); // optional rounding int dp = p - pt; while (dp > 2) dp -= 3; if (dp == 2) *s++ = ' '; char c; if(n < 100) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *s = c; ++p; if(dp == 1) *++s = '.'; if((*s == ' ') && (n < 10)) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *++s = c; ++p; if(dp == 0) *++s = '.'; c = '0'; while (n >= *p) n -= *p, ++c; *++s = c; *++s = " EEPPPTTTGGGMMMkkk "[p - pt]; *++s = 0; } static uint64_t divu64(uint64_t n, uint64_t d) { #if defined( __GNUC__) & !defined(USE_DIV_FUNC) return n / d; #else if((n < d) || (!d)) return 0; uint64_t register b = 1; if(((uint16_t)(n >> 48)) & 0x8000) { while(!(((uint16_t)(d >> 48)) & 0x8000)) d <<= 1, b <<= 1; if(n < d) d >>= 1, b >>= 1; } else { d <<= 1; while(n >= d) d <<= 1, b <<= 1; d >>= 1; } uint64_t q = b; n -= d; while(!(b & 1)) { d >>= 1; b >>= 1; if(n >= d) n -= d, q |= b; } return q; #endif } static void sprint_u64_e(char *s, uint64_t n) { static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; unsigned d = 2; if(n >= 1000) { d = 19; uint64_t const *p = pt; while (n < *p) ++p, --d; n = divu64(n, p[2]); } div_t qr; qr.rem = (int)n; div_t const dp = div(d, 3); if(dp.rem == 2) *s++ = ' '; if(qr.rem < 100) { *s++ = ' '; qr.quot = 0; } else { qr = div(qr.rem, 100); *s++ = '0' + qr.quot; } if(dp.rem == 0) *s++ = '.'; if((!qr.quot) && (qr.rem < 10)) { *s++ = ' '; } else { qr = div(qr.rem, 10); *s++ = '0' + qr.quot; } if(dp.rem == 1) *s++ = '.'; *s++ = '0' + qr.rem; *s++ = " kMGTPE"[dp.quot]; *s = 0; } static void sprint_u64_d(char *s, uint64_t n) { static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; unsigned d = 2; if(n >= 1000) { d = 19; uint64_t const *p = pt; while (n < *p) ++p, --d; n = divu64(n, p[2]); } div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_u64_c(char *s, uint64_t n) { unsigned d = 2; if(n >= 1000000000000000000ULL) n = divu64(n, 10000000000000000ULL), d += 16; if(n >= 10000000000ULL) n = divu64(n, 100000000ULL), d += 8; if(n >= 1000000ULL) n = divu64(n, 10000ULL), d += 4; if(n >= 10000ULL) n = divu64(n, 100ULL), d += 2; if(n >= 1000ULL) n = divu64(n, 10ULL), ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_u64_b(char *s, uint64_t n) { unsigned d = 2; while(n >= 1000) n = divu64(n, 10), ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_u64_a(char *s, uint64_t n) { unsigned d = 2; while(n >= 1000) n /= 10, ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_f3d(char * s, double f) { static char const * const fmt[3] = { "%1.2f%c", "%2.1f%c", "%4.0f%c" }; div_t const d = div((f < 1000.0) ? 2 : (int)log10(f), 3); sprintf(s, fmt[d.rem], f / pow(1000.0, d.quot), " kMGTPEZY"[d.quot]); } static void sprint_u64_null(char *s, uint64_t n) { *s = 0; } static void print(char const *s) { while(*s) { while(!(UCA1IFG & UCTXIFG)); UCA1TXBUF = *s++; } } #define smclk_freq (32768UL * 31UL) // SMCLK frequency in hertz #define bps (9600UL) // Async serial bit rate int main(void) { WDTCTL = WDTPW | WDTHOLD; // Stop watchdog timer // P4SEL = BIT4 | BIT5; // Enable UART pins P4DIR = BIT4 | BIT5; // // // Initialize UART UCA1CTL1 = UCSWRST; // Hold USCI in reset to allow configuration UCA1CTL0 = 0; // No parity, LSB first, 8 bits, one stop bit, UART (async) const unsigned long brd = (smclk_freq + (bps >> 1)) / bps; // Bit rate divisor UCA1BR1 = (brd >> 12) & 0xFF; // High byte of whole divisor UCA1BR0 = (brd >> 4) & 0xFF; // Low byte of whole divisor UCA1MCTL = ((brd << 4) & 0xF0) | UCOS16; // Fractional divisor, oversampling mode UCA1CTL1 = UCSSEL_2; // Use SMCLK for bit rate generator, release reset static const uint64_t td[] = { 0ULL, 1ULL, 12ULL, 123ULL, 1234ULL, 12345ULL, 123456ULL, 1234567ULL, 12345678ULL, 123456789ULL, 1234567890ULL, 12345678901ULL, 123456789012ULL, 1234567890123ULL, 12345678901234ULL, 123456789012345ULL, 1234567890123456ULL, 12345678901234567ULL, 123456789012345678ULL, 1234567890123456789ULL, 12345678901234567890ULL, 18446744073709551615ULL }; char s[32]; int n; uint64_t const *p; // Print test array to application UART print("\r\n"); n = sizeof(td) / sizeof(td[0]); p = td; do { PU64(s, *p++); print(s); print("\r\n"); } while(--n); // Time the test array - make string only - do not print TA0EX0 = 7; TA0CTL = TASSEL_2 | ID_3 | MC_2; TA0CTL |= TACLR; n = sizeof(td) / sizeof(td[0]); p = td; do { PU64(s, *p++); } while(--n); uint64_t et = TA0R * 63ULL; // Elapsed time in microseconds PU64(s, et); print(s); print(" us\r\n"); for(;; return 0; }
  23. Like
    oPossum got a reaction from dubnet in Printing large numbers in small spaces (64 bits in 5 chars)   
    This code will print a 64 bit unsigned integer in 5 characters using 3 significant digits and and an SI prefix. Several revisions are shown beginning with the obvious implementation and then optimizing it to improve speed and decrease code size. Compilers used are TI 4.4.4 and GCC 4.9.1 Test hardware is the F5529 Launchpad running at the default clock of 1.016 MHz.
     
    The general strategy of the code is:
    - Count the number of digits in the decimal representation
    - Divide the value to reduce it to 3 digits.
    - Print the 3 digits with proper formatting and SI prefix.
    - Handle the special case of values of 99 or less that must be printed with 2 or 1 digits and no decimal.
     
    A series of values from zero to the maximum possible 64 bit value are used to test the performance and correct operation of the code...

    static const uint64_t td[] = { 0ULL, 1ULL, 12ULL, 123ULL, 1234ULL, 12345ULL, 123456ULL, 1234567ULL, 12345678ULL, 123456789ULL, 1234567890ULL, 12345678901ULL, 123456789012ULL, 1234567890123ULL, 12345678901234ULL, 123456789012345ULL, 1234567890123456ULL, 12345678901234567ULL, 123456789012345678ULL, 1234567890123456789ULL, 12345678901234567890ULL, 18446744073709551615ULL }; The first version uses C standard library functions for a very simple implementation. Decimal digits are counted using log10(). Dividing the digit count by three using the div() function provides values for formatting, division, and the SI prefix. The divisor needed to reduce the value to three digits is calculated with pow(). The special case of values of 99 or less is handled by using a fixed digit count for values less than 1000 rather than the actual base ten digit count. Conversion from uint64_t to double will have a loss of precision for large values due to float having a 52 bit significand. This is not a problem for this code because only three significant digits are printed.Code size for the TI compiler is 20,636 bytes and the execution time for the test case is 1.22 seconds. The GCC compiler does not produce working code.

    static void sprint_f3d(char * s, double f) { static char const * const fmt[3] = { "%1.2f%c", "%2.1f%c", "%4.0f%c" }; div_t const d = div((f < 1000.0) ? 2 : (int)log10(f), 3); sprintf(s, fmt[d.rem], f / pow(1000.0, d.quot), " kMGTPEZY"[d.quot]); } The C standard library does not have integer versions of log10() and pow(), so another strategy will be used. The value is divided by 10 until it is less than 1000. The number of divisions is counted to determine the number of base ten digits. Stopping the division when the value is less than 1000 handles the special case for values of 99 or less by limiting the digit count to 3 or more. The resulting 3 digit value will be split as necessary for formatting using the div() library function.Code size for the TI compiler is 5,346 bytes and the execution time for the test case is 3.24 seconds.
    Code size for the GCC compiler is 36,040 bytes and the execution time for the test case is 981 milliseconds.
    The poor performance of the TI compiler is due the an inefficient intrinsic 64 bit unsigned divide.

    static void sprint_u64_a(char *s, uint64_t n) { unsigned d = 2; while(n >= 1000) n /= 10, ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } Using this 64 bit unsigned divide provides much better performance on the TI compiler, but much worse on the GCC compiler.Code size for the TI compiler is 5,666 bytes and the execution time for the test case is 396 milliseconds.
    Code size for the GCC compiler is 36,116 bytes and the execution time for the test case is 4.06 seconds.
    Future revisions will use intrinsic divide for GCC and this divide function for TI.

    static uint64_t divu64(uint64_t n, uint64_t d) { if((n < d) || (!d)) return 0; uint64_t register b = 1; if(((uint16_t)(n >> 48)) & 0x8000) { while(!(((uint16_t)(d >> 48)) & 0x8000)) d <<= 1, b <<= 1; if(n < d) d >>= 1, b >>= 1; } else { d <<= 1; while(n >= d) d <<= 1, b <<= 1; d >>= 1; } uint64_t q = b; n -= d; while(!(b & 1)) { d >>= 1; b >>= 1; if(n >= d) n -= d, q |= b; } return q; } The number of 64 bit divides can be reduced by using constants of 10 to the power of 2 to the power of N rather than iterative division by 10.Code size for the TI compiler is 5,886 bytes and the execution time for the test case is 160 milliseconds.
    Code size for the GCC compiler is 36,256 bytes and the execution time for the test case is 223 milliseconds.

    unsigned d = 2; if(n >= 1000000000000000000ULL) n = divu64(n, 10000000000000000ULL), d += 16; if(n >= 10000000000ULL) n = divu64(n, 100000000ULL), d += 8; if(n >= 1000000ULL) n = divu64(n, 10000ULL), d += 4; if(n >= 10000ULL) n = divu64(n, 100ULL), d += 2; if(n >= 1000ULL) n = divu64(n, 10ULL), ++d; The number of 64 bit divides can be reduced to one by using a table to determine the number of base ten digits. The table is searched for the highest value that is less than or equal to the value to be printed. The value is then divided by the table entry that is 1/100th of the value representing the base ten digit count to reduce the value to 3 digits.Code size for the TI compiler is 5,886 bytes and the execution time for the test case is 143 milliseconds.
    Code size for the GCC compiler is 36,096 bytes and the execution time for the test case is 143 milliseconds.

    static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; unsigned d = 2; if(n >= 1000) { d = 19; uint64_t const *p = pt; while (n < *p) ++p, --d; n = divu64(n, p[2]); } The sprintf() function takes significant space, so it is replaced with decimal to ASCII conversion using the div() library function.Code size for the TI compiler is 1,660 bytes and the execution time for the test case is 25.8 milliseconds.
    Code size for the GCC compiler is 3,384 bytes and the execution time for the test case is 82.0 milliseconds.

    div_t qr; qr.rem = (int)n; div_t const dp = div(d, 3); if(dp.rem == 2) *s++ = ' '; if(qr.rem < 100) { *s++ = ' '; qr.quot = 0; } else { qr = div(qr.rem, 100); *s++ = '0' + qr.quot; } if(dp.rem == 0) *s++ = '.'; if((!qr.quot) && (qr.rem < 10)) { *s++ = ' '; } else { qr = div(qr.rem, 10); *s++ = '0' + qr.quot; } if(dp.rem == 1) *s++ = '.'; *s++ = '0' + qr.rem; *s++ = " kMGTPE"[dp.quot]; *s = 0; The final optimization removes all division. The base ten tables values are used with iterative subtraction to create the ASCII string.Code size for the TI compiler is 1,368 bytes and the execution time for the test case is 10.8 milliseconds.
    Code size for the GCC compiler is 2,452 bytes and the execution time for the test case is 19.9 milliseconds.

    uint64_t const *p = pt + 17; if(n >= 1000) { p = pt; while (n < *p) ++p; } // n += (p[2] >> 1); // optional rounding int dp = p - pt; while (dp > 2) dp -= 3; if (dp == 2) *s++ = ' '; char c; if(n < 100) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *s = c; ++p; if(dp == 1) *++s = '.'; if((*s == ' ') && (n < 10)) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *++s = c; ++p; if(dp == 0) *++s = '.'; c = '0'; while (n >= *p) n -= *p, ++c; *++s = c; *++s = " EEPPPTTTGGGMMMkkk "[p - pt]; *++s = 0; Performance summary
    function TI GCC ---------------------------------------------- sprint_u64 1368 10.8 ms 2452 19.9 ms sprint_u64_e 1660 25.8 ms 3384 82.0 ms sprint_u64_d 5886 143 ms 36096 143 ms sprint_u64_c 5886 160 ms 36256 223 ms sprint_u64_b 5666 396 ms 36116 4.06 s sprint_u64_a 5346 3.24 s 36040 981 ms sprint_f3d 20636 1.22 s non-functional Network bandwidth display (lower right) using 3 digits.
     
    Complete code

    #include <msp430.h> #include <stdint.h> #include <stdlib.h> #include <string.h> #include <math.h> #include <stdio.h> #define PU64 sprint_u64 //#define PU64 sprint_u64_e //#define PU64 sprint_u64_d //#define PU64 sprint_u64_c //#define PU64 sprint_u64_b //#define USE_DIV_FUNC //#define PU64 sprint_u64_a //#define PU64 sprint_f3d //#define PU64 sprint_u64_null static void sprint_u64(char *s, uint64_t n) { static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; uint64_t const *p = pt + 17; if(n >= 1000) { p = pt; while (n < *p) ++p; } // n += (p[2] >> 1); // optional rounding int dp = p - pt; while (dp > 2) dp -= 3; if (dp == 2) *s++ = ' '; char c; if(n < 100) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *s = c; ++p; if(dp == 1) *++s = '.'; if((*s == ' ') && (n < 10)) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *++s = c; ++p; if(dp == 0) *++s = '.'; c = '0'; while (n >= *p) n -= *p, ++c; *++s = c; *++s = " EEPPPTTTGGGMMMkkk "[p - pt]; *++s = 0; } static uint64_t divu64(uint64_t n, uint64_t d) { #if defined( __GNUC__) & !defined(USE_DIV_FUNC) return n / d; #else if((n < d) || (!d)) return 0; uint64_t register b = 1; if(((uint16_t)(n >> 48)) & 0x8000) { while(!(((uint16_t)(d >> 48)) & 0x8000)) d <<= 1, b <<= 1; if(n < d) d >>= 1, b >>= 1; } else { d <<= 1; while(n >= d) d <<= 1, b <<= 1; d >>= 1; } uint64_t q = b; n -= d; while(!(b & 1)) { d >>= 1; b >>= 1; if(n >= d) n -= d, q |= b; } return q; #endif } static void sprint_u64_e(char *s, uint64_t n) { static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; unsigned d = 2; if(n >= 1000) { d = 19; uint64_t const *p = pt; while (n < *p) ++p, --d; n = divu64(n, p[2]); } div_t qr; qr.rem = (int)n; div_t const dp = div(d, 3); if(dp.rem == 2) *s++ = ' '; if(qr.rem < 100) { *s++ = ' '; qr.quot = 0; } else { qr = div(qr.rem, 100); *s++ = '0' + qr.quot; } if(dp.rem == 0) *s++ = '.'; if((!qr.quot) && (qr.rem < 10)) { *s++ = ' '; } else { qr = div(qr.rem, 10); *s++ = '0' + qr.quot; } if(dp.rem == 1) *s++ = '.'; *s++ = '0' + qr.rem; *s++ = " kMGTPE"[dp.quot]; *s = 0; } static void sprint_u64_d(char *s, uint64_t n) { static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; unsigned d = 2; if(n >= 1000) { d = 19; uint64_t const *p = pt; while (n < *p) ++p, --d; n = divu64(n, p[2]); } div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_u64_c(char *s, uint64_t n) { unsigned d = 2; if(n >= 1000000000000000000ULL) n = divu64(n, 10000000000000000ULL), d += 16; if(n >= 10000000000ULL) n = divu64(n, 100000000ULL), d += 8; if(n >= 1000000ULL) n = divu64(n, 10000ULL), d += 4; if(n >= 10000ULL) n = divu64(n, 100ULL), d += 2; if(n >= 1000ULL) n = divu64(n, 10ULL), ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_u64_b(char *s, uint64_t n) { unsigned d = 2; while(n >= 1000) n = divu64(n, 10), ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_u64_a(char *s, uint64_t n) { unsigned d = 2; while(n >= 1000) n /= 10, ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_f3d(char * s, double f) { static char const * const fmt[3] = { "%1.2f%c", "%2.1f%c", "%4.0f%c" }; div_t const d = div((f < 1000.0) ? 2 : (int)log10(f), 3); sprintf(s, fmt[d.rem], f / pow(1000.0, d.quot), " kMGTPEZY"[d.quot]); } static void sprint_u64_null(char *s, uint64_t n) { *s = 0; } static void print(char const *s) { while(*s) { while(!(UCA1IFG & UCTXIFG)); UCA1TXBUF = *s++; } } #define smclk_freq (32768UL * 31UL) // SMCLK frequency in hertz #define bps (9600UL) // Async serial bit rate int main(void) { WDTCTL = WDTPW | WDTHOLD; // Stop watchdog timer // P4SEL = BIT4 | BIT5; // Enable UART pins P4DIR = BIT4 | BIT5; // // // Initialize UART UCA1CTL1 = UCSWRST; // Hold USCI in reset to allow configuration UCA1CTL0 = 0; // No parity, LSB first, 8 bits, one stop bit, UART (async) const unsigned long brd = (smclk_freq + (bps >> 1)) / bps; // Bit rate divisor UCA1BR1 = (brd >> 12) & 0xFF; // High byte of whole divisor UCA1BR0 = (brd >> 4) & 0xFF; // Low byte of whole divisor UCA1MCTL = ((brd << 4) & 0xF0) | UCOS16; // Fractional divisor, oversampling mode UCA1CTL1 = UCSSEL_2; // Use SMCLK for bit rate generator, release reset static const uint64_t td[] = { 0ULL, 1ULL, 12ULL, 123ULL, 1234ULL, 12345ULL, 123456ULL, 1234567ULL, 12345678ULL, 123456789ULL, 1234567890ULL, 12345678901ULL, 123456789012ULL, 1234567890123ULL, 12345678901234ULL, 123456789012345ULL, 1234567890123456ULL, 12345678901234567ULL, 123456789012345678ULL, 1234567890123456789ULL, 12345678901234567890ULL, 18446744073709551615ULL }; char s[32]; int n; uint64_t const *p; // Print test array to application UART print("\r\n"); n = sizeof(td) / sizeof(td[0]); p = td; do { PU64(s, *p++); print(s); print("\r\n"); } while(--n); // Time the test array - make string only - do not print TA0EX0 = 7; TA0CTL = TASSEL_2 | ID_3 | MC_2; TA0CTL |= TACLR; n = sizeof(td) / sizeof(td[0]); p = td; do { PU64(s, *p++); } while(--n); uint64_t et = TA0R * 63ULL; // Elapsed time in microseconds PU64(s, et); print(s); print(" us\r\n"); for(;; return 0; }
  24. Like
    oPossum got a reaction from timotet in Printing large numbers in small spaces (64 bits in 5 chars)   
    This code will print a 64 bit unsigned integer in 5 characters using 3 significant digits and and an SI prefix. Several revisions are shown beginning with the obvious implementation and then optimizing it to improve speed and decrease code size. Compilers used are TI 4.4.4 and GCC 4.9.1 Test hardware is the F5529 Launchpad running at the default clock of 1.016 MHz.
     
    The general strategy of the code is:
    - Count the number of digits in the decimal representation
    - Divide the value to reduce it to 3 digits.
    - Print the 3 digits with proper formatting and SI prefix.
    - Handle the special case of values of 99 or less that must be printed with 2 or 1 digits and no decimal.
     
    A series of values from zero to the maximum possible 64 bit value are used to test the performance and correct operation of the code...

    static const uint64_t td[] = { 0ULL, 1ULL, 12ULL, 123ULL, 1234ULL, 12345ULL, 123456ULL, 1234567ULL, 12345678ULL, 123456789ULL, 1234567890ULL, 12345678901ULL, 123456789012ULL, 1234567890123ULL, 12345678901234ULL, 123456789012345ULL, 1234567890123456ULL, 12345678901234567ULL, 123456789012345678ULL, 1234567890123456789ULL, 12345678901234567890ULL, 18446744073709551615ULL }; The first version uses C standard library functions for a very simple implementation. Decimal digits are counted using log10(). Dividing the digit count by three using the div() function provides values for formatting, division, and the SI prefix. The divisor needed to reduce the value to three digits is calculated with pow(). The special case of values of 99 or less is handled by using a fixed digit count for values less than 1000 rather than the actual base ten digit count. Conversion from uint64_t to double will have a loss of precision for large values due to float having a 52 bit significand. This is not a problem for this code because only three significant digits are printed.Code size for the TI compiler is 20,636 bytes and the execution time for the test case is 1.22 seconds. The GCC compiler does not produce working code.

    static void sprint_f3d(char * s, double f) { static char const * const fmt[3] = { "%1.2f%c", "%2.1f%c", "%4.0f%c" }; div_t const d = div((f < 1000.0) ? 2 : (int)log10(f), 3); sprintf(s, fmt[d.rem], f / pow(1000.0, d.quot), " kMGTPEZY"[d.quot]); } The C standard library does not have integer versions of log10() and pow(), so another strategy will be used. The value is divided by 10 until it is less than 1000. The number of divisions is counted to determine the number of base ten digits. Stopping the division when the value is less than 1000 handles the special case for values of 99 or less by limiting the digit count to 3 or more. The resulting 3 digit value will be split as necessary for formatting using the div() library function.Code size for the TI compiler is 5,346 bytes and the execution time for the test case is 3.24 seconds.
    Code size for the GCC compiler is 36,040 bytes and the execution time for the test case is 981 milliseconds.
    The poor performance of the TI compiler is due the an inefficient intrinsic 64 bit unsigned divide.

    static void sprint_u64_a(char *s, uint64_t n) { unsigned d = 2; while(n >= 1000) n /= 10, ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } Using this 64 bit unsigned divide provides much better performance on the TI compiler, but much worse on the GCC compiler.Code size for the TI compiler is 5,666 bytes and the execution time for the test case is 396 milliseconds.
    Code size for the GCC compiler is 36,116 bytes and the execution time for the test case is 4.06 seconds.
    Future revisions will use intrinsic divide for GCC and this divide function for TI.

    static uint64_t divu64(uint64_t n, uint64_t d) { if((n < d) || (!d)) return 0; uint64_t register b = 1; if(((uint16_t)(n >> 48)) & 0x8000) { while(!(((uint16_t)(d >> 48)) & 0x8000)) d <<= 1, b <<= 1; if(n < d) d >>= 1, b >>= 1; } else { d <<= 1; while(n >= d) d <<= 1, b <<= 1; d >>= 1; } uint64_t q = b; n -= d; while(!(b & 1)) { d >>= 1; b >>= 1; if(n >= d) n -= d, q |= b; } return q; } The number of 64 bit divides can be reduced by using constants of 10 to the power of 2 to the power of N rather than iterative division by 10.Code size for the TI compiler is 5,886 bytes and the execution time for the test case is 160 milliseconds.
    Code size for the GCC compiler is 36,256 bytes and the execution time for the test case is 223 milliseconds.

    unsigned d = 2; if(n >= 1000000000000000000ULL) n = divu64(n, 10000000000000000ULL), d += 16; if(n >= 10000000000ULL) n = divu64(n, 100000000ULL), d += 8; if(n >= 1000000ULL) n = divu64(n, 10000ULL), d += 4; if(n >= 10000ULL) n = divu64(n, 100ULL), d += 2; if(n >= 1000ULL) n = divu64(n, 10ULL), ++d; The number of 64 bit divides can be reduced to one by using a table to determine the number of base ten digits. The table is searched for the highest value that is less than or equal to the value to be printed. The value is then divided by the table entry that is 1/100th of the value representing the base ten digit count to reduce the value to 3 digits.Code size for the TI compiler is 5,886 bytes and the execution time for the test case is 143 milliseconds.
    Code size for the GCC compiler is 36,096 bytes and the execution time for the test case is 143 milliseconds.

    static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; unsigned d = 2; if(n >= 1000) { d = 19; uint64_t const *p = pt; while (n < *p) ++p, --d; n = divu64(n, p[2]); } The sprintf() function takes significant space, so it is replaced with decimal to ASCII conversion using the div() library function.Code size for the TI compiler is 1,660 bytes and the execution time for the test case is 25.8 milliseconds.
    Code size for the GCC compiler is 3,384 bytes and the execution time for the test case is 82.0 milliseconds.

    div_t qr; qr.rem = (int)n; div_t const dp = div(d, 3); if(dp.rem == 2) *s++ = ' '; if(qr.rem < 100) { *s++ = ' '; qr.quot = 0; } else { qr = div(qr.rem, 100); *s++ = '0' + qr.quot; } if(dp.rem == 0) *s++ = '.'; if((!qr.quot) && (qr.rem < 10)) { *s++ = ' '; } else { qr = div(qr.rem, 10); *s++ = '0' + qr.quot; } if(dp.rem == 1) *s++ = '.'; *s++ = '0' + qr.rem; *s++ = " kMGTPE"[dp.quot]; *s = 0; The final optimization removes all division. The base ten tables values are used with iterative subtraction to create the ASCII string.Code size for the TI compiler is 1,368 bytes and the execution time for the test case is 10.8 milliseconds.
    Code size for the GCC compiler is 2,452 bytes and the execution time for the test case is 19.9 milliseconds.

    uint64_t const *p = pt + 17; if(n >= 1000) { p = pt; while (n < *p) ++p; } // n += (p[2] >> 1); // optional rounding int dp = p - pt; while (dp > 2) dp -= 3; if (dp == 2) *s++ = ' '; char c; if(n < 100) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *s = c; ++p; if(dp == 1) *++s = '.'; if((*s == ' ') && (n < 10)) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *++s = c; ++p; if(dp == 0) *++s = '.'; c = '0'; while (n >= *p) n -= *p, ++c; *++s = c; *++s = " EEPPPTTTGGGMMMkkk "[p - pt]; *++s = 0; Performance summary
    function TI GCC ---------------------------------------------- sprint_u64 1368 10.8 ms 2452 19.9 ms sprint_u64_e 1660 25.8 ms 3384 82.0 ms sprint_u64_d 5886 143 ms 36096 143 ms sprint_u64_c 5886 160 ms 36256 223 ms sprint_u64_b 5666 396 ms 36116 4.06 s sprint_u64_a 5346 3.24 s 36040 981 ms sprint_f3d 20636 1.22 s non-functional Network bandwidth display (lower right) using 3 digits.
     
    Complete code

    #include <msp430.h> #include <stdint.h> #include <stdlib.h> #include <string.h> #include <math.h> #include <stdio.h> #define PU64 sprint_u64 //#define PU64 sprint_u64_e //#define PU64 sprint_u64_d //#define PU64 sprint_u64_c //#define PU64 sprint_u64_b //#define USE_DIV_FUNC //#define PU64 sprint_u64_a //#define PU64 sprint_f3d //#define PU64 sprint_u64_null static void sprint_u64(char *s, uint64_t n) { static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; uint64_t const *p = pt + 17; if(n >= 1000) { p = pt; while (n < *p) ++p; } // n += (p[2] >> 1); // optional rounding int dp = p - pt; while (dp > 2) dp -= 3; if (dp == 2) *s++ = ' '; char c; if(n < 100) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *s = c; ++p; if(dp == 1) *++s = '.'; if((*s == ' ') && (n < 10)) { c = ' '; } else { c = '0'; while(n >= *p) n -= *p, ++c; } *++s = c; ++p; if(dp == 0) *++s = '.'; c = '0'; while (n >= *p) n -= *p, ++c; *++s = c; *++s = " EEPPPTTTGGGMMMkkk "[p - pt]; *++s = 0; } static uint64_t divu64(uint64_t n, uint64_t d) { #if defined( __GNUC__) & !defined(USE_DIV_FUNC) return n / d; #else if((n < d) || (!d)) return 0; uint64_t register b = 1; if(((uint16_t)(n >> 48)) & 0x8000) { while(!(((uint16_t)(d >> 48)) & 0x8000)) d <<= 1, b <<= 1; if(n < d) d >>= 1, b >>= 1; } else { d <<= 1; while(n >= d) d <<= 1, b <<= 1; d >>= 1; } uint64_t q = b; n -= d; while(!(b & 1)) { d >>= 1; b >>= 1; if(n >= d) n -= d, q |= b; } return q; #endif } static void sprint_u64_e(char *s, uint64_t n) { static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; unsigned d = 2; if(n >= 1000) { d = 19; uint64_t const *p = pt; while (n < *p) ++p, --d; n = divu64(n, p[2]); } div_t qr; qr.rem = (int)n; div_t const dp = div(d, 3); if(dp.rem == 2) *s++ = ' '; if(qr.rem < 100) { *s++ = ' '; qr.quot = 0; } else { qr = div(qr.rem, 100); *s++ = '0' + qr.quot; } if(dp.rem == 0) *s++ = '.'; if((!qr.quot) && (qr.rem < 10)) { *s++ = ' '; } else { qr = div(qr.rem, 10); *s++ = '0' + qr.quot; } if(dp.rem == 1) *s++ = '.'; *s++ = '0' + qr.rem; *s++ = " kMGTPE"[dp.quot]; *s = 0; } static void sprint_u64_d(char *s, uint64_t n) { static uint64_t const pt[] = { // Powers of 10 // 18446744073709551615 // 2^64 - 1 10000000000000000000ULL, // 10^19 NN.N E 1000000000000000000ULL, // 10^18 N.NN E 100000000000000000ULL, // 10^17 NNN P 10000000000000000ULL, // 10^16 NN.N P 1000000000000000ULL, // 10^15 N.NN P 100000000000000ULL, // 10^14 NNN T 10000000000000ULL, // 10^13 NN.N T 1000000000000ULL, // 10^12 N.NN T 100000000000ULL, // 10^11 NNN G 10000000000ULL, // 10^10 NN.N G // 4294967295 // 2^32 - 1 1000000000ULL, // 10^9 N.NN G 100000000ULL, // 10^8 NNN M 10000000ULL, // 10^7 NN.N M 1000000ULL, // 10^6 N.NN M 100000ULL, // 10^5 NNN k 10000ULL, // 10^4 NN.N k 1000ULL, // 10^3 N.NN k 100ULL, // 10^2 NNN 10ULL, // 10^1 NN 1ULL, // 10^0 N }; unsigned d = 2; if(n >= 1000) { d = 19; uint64_t const *p = pt; while (n < *p) ++p, --d; n = divu64(n, p[2]); } div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_u64_c(char *s, uint64_t n) { unsigned d = 2; if(n >= 1000000000000000000ULL) n = divu64(n, 10000000000000000ULL), d += 16; if(n >= 10000000000ULL) n = divu64(n, 100000000ULL), d += 8; if(n >= 1000000ULL) n = divu64(n, 10000ULL), d += 4; if(n >= 10000ULL) n = divu64(n, 100ULL), d += 2; if(n >= 1000ULL) n = divu64(n, 10ULL), ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_u64_b(char *s, uint64_t n) { unsigned d = 2; while(n >= 1000) n = divu64(n, 10), ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_u64_a(char *s, uint64_t n) { unsigned d = 2; while(n >= 1000) n /= 10, ++d; div_t qr = div(d, 3); char const u = " kMGTPE"[qr.quot]; switch(qr.rem) { case 0: qr = div((int)n, 100); sprintf(s, "%i.%02i%c", qr.quot, qr.rem, u); break; case 1: qr = div((int)n, 10); sprintf(s, "%2i.%i%c", qr.quot, qr.rem, u); break; case 2: sprintf(s, "%4i%c", (int)n, u); break; } } static void sprint_f3d(char * s, double f) { static char const * const fmt[3] = { "%1.2f%c", "%2.1f%c", "%4.0f%c" }; div_t const d = div((f < 1000.0) ? 2 : (int)log10(f), 3); sprintf(s, fmt[d.rem], f / pow(1000.0, d.quot), " kMGTPEZY"[d.quot]); } static void sprint_u64_null(char *s, uint64_t n) { *s = 0; } static void print(char const *s) { while(*s) { while(!(UCA1IFG & UCTXIFG)); UCA1TXBUF = *s++; } } #define smclk_freq (32768UL * 31UL) // SMCLK frequency in hertz #define bps (9600UL) // Async serial bit rate int main(void) { WDTCTL = WDTPW | WDTHOLD; // Stop watchdog timer // P4SEL = BIT4 | BIT5; // Enable UART pins P4DIR = BIT4 | BIT5; // // // Initialize UART UCA1CTL1 = UCSWRST; // Hold USCI in reset to allow configuration UCA1CTL0 = 0; // No parity, LSB first, 8 bits, one stop bit, UART (async) const unsigned long brd = (smclk_freq + (bps >> 1)) / bps; // Bit rate divisor UCA1BR1 = (brd >> 12) & 0xFF; // High byte of whole divisor UCA1BR0 = (brd >> 4) & 0xFF; // Low byte of whole divisor UCA1MCTL = ((brd << 4) & 0xF0) | UCOS16; // Fractional divisor, oversampling mode UCA1CTL1 = UCSSEL_2; // Use SMCLK for bit rate generator, release reset static const uint64_t td[] = { 0ULL, 1ULL, 12ULL, 123ULL, 1234ULL, 12345ULL, 123456ULL, 1234567ULL, 12345678ULL, 123456789ULL, 1234567890ULL, 12345678901ULL, 123456789012ULL, 1234567890123ULL, 12345678901234ULL, 123456789012345ULL, 1234567890123456ULL, 12345678901234567ULL, 123456789012345678ULL, 1234567890123456789ULL, 12345678901234567890ULL, 18446744073709551615ULL }; char s[32]; int n; uint64_t const *p; // Print test array to application UART print("\r\n"); n = sizeof(td) / sizeof(td[0]); p = td; do { PU64(s, *p++); print(s); print("\r\n"); } while(--n); // Time the test array - make string only - do not print TA0EX0 = 7; TA0CTL = TASSEL_2 | ID_3 | MC_2; TA0CTL |= TACLR; n = sizeof(td) / sizeof(td[0]); p = td; do { PU64(s, *p++); } while(--n); uint64_t et = TA0R * 63ULL; // Elapsed time in microseconds PU64(s, et); print(s); print(" us\r\n"); for(;; return 0; }
  25. Like
    oPossum got a reaction from JonnyBoats in How to solder HTSSOP heatsink without reflowing the board?   
    It is possible to do thermal pads without a reflow oven or hot air.
     

×
×
  • Create New...