oPossum 1,083 Posted February 24, 2013 Share Posted February 24, 2013 This code will multiply an unsigned 8.24 fixed point number by an unsigned 32 bit value (any binary point). The returned result is adjusted for the binary point (right shift 24 bits). It could be done like this: uint32_t mul824(uint32_t a, uint32_t b) { // Multiply 8.24 fixed point in a by b // Return is same type as b return ((uint64_t(a) * b) >> 24) & 0xFFFFFFFF; } Unfortunately C uses a product accumulator with the same number of bits as the multiplicands, so this requires casts and is not as efficient as it could be. There are several optimizations done in this assembly code: - A 64 bit accumulator is used for the 32 bit multiplicands - cast to 64 bits eliminated - The multiply is done in two stages, first using the LSW of a, and then the MSW. This allows the use of a 48 bit shift register rather than 64 bit. - The multiply code is a bit more efficient than what CCS uses - a TST Rxx instruction has been eliminated by restructuring the loop. A CLRC instruction could also be eliminated thanks to the 48 bit shift register (never will shift out a 1) - There is an optional optimization that will skip the second stage of multiplication when a is close to 1.0 - The final right shift of 24 bits is done with byte swap rather than shift ; ; Copyright © 2013 Kevin Timmerman ; ; This program is free software: you can redistribute it and/or modify ; it under the terms of the GNU General Public License as published by ; the Free Software Foundation, either version 3 of the License, or ; (at your option) any later version. ; ; This program is distributed in the hope that it will be useful, ; but WITHOUT ANY WARRANTY; without even the implied warranty of ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ; GNU General Public License for more details. ; ; You should have received a copy of the GNU General Public License ; along with this program. If not, see <http://www.gnu.org/licenses/>. 
;
        .cdecls C, LIST, "msp430.h"
;
opt1 .equ 1                         ; Optimize for values near 1.0 (faster / larger code)
        .text
;
        .def mul824                 ; uint32_t mul824(uint32_t a, uint32_t b)
                                    ;   a is 8.24 fixed point
                                    ;   b is any type
                                    ;   return is same type as b
;
; Shift-and-add multiply performed in two stages on one 64-bit accumulator
; held in R9:R8:R7:R6:
;   stage 1 - low word of a (R12) times b, summed into R8:R7:R6
;   stage 2 - high word of a (R13) times b, summed into R9:R8:R7
;             (the same accumulator, 16 bits higher)
; Bits 24..55 of the accumulator are then moved into R13:R12 with byte
; swaps, i.e. the product shifted right by 24 bits and truncated to 32 bits.
; When opt1 is non-zero, a high word of exactly 0x0100 or 0x00FF skips
; stage 2; b is instead added to the already shifted result (after first
; subtracting b << 16 from the accumulator in the 0x00FF case).
; R6-R11 are saved on entry and restored before returning.
;
mul824:
        push R6                     ; Accumulator LSW
        push R7                     ; Accumulator
        push R8                     ; Accumulator
        push R9                     ; Accumulator MSW / B shift LSW
        push R10                    ; B shift
        push R11                    ; B shift MSW
                                    ;
                                    ; R12 - A LSW
                                    ; R13 - A MSW
                                    ; R14 - B LSW
                                    ; R15 - B MSW
                                    ;
        clr R6                      ; Clear accumulator
        clr R7
        clr R8
                                    ; - LSW of a * b
        mov R14, R9                 ; Copy b into the 48-bit shift register R11:R10:R9
        mov R15, R10
        clr R11
        clrc                        ; First rrc below must shift a 0 into the top of R12
        jmp lsw_begin
lsw_add:                            ; Add b shift register to accumulator
        add R9, R6
        addc R10, R7
        addc R11, R8
lsw_shift:
        rla R9                      ; Shift b left one bit
        rlc R10
        rlc R11
lsw_begin:
        rrc R12                     ; Shift a, test lsb
        jc lsw_add                  ; lsb is 1, add b...
        jne lsw_shift               ; lsb is 0, but more 1 bits remain...
        clr R9                      ; Clear MSW of accumulator (R9 is no longer the b shift LSW)
                                    ;
        .if opt1
        cmp #0x0100, R13            ; Multiply by 1.00xxxx ?
        jeq shift_acc               ; Yes, skip MSW multiply and just add b to result
        cmp #0x00FF, R13            ; Multiply by 0.FFxxxx ?
        jne do_msw                  ; No, do the MSW multiply
                                    ; - Use Booth's algorithm
        sub R14, R7                 ; Subtract b << 16 from accumulator (b * 00.010000)
        subc R15, R8                ; Add b to result later
        subc #0, R9                 ; 01.000000 - 00.010000 = 00.FF0000
                                    ; This effectively adds b * 00.FF0000 to the accumulator
        jmp shift_acc               ; Skip MSW multiply
do_msw:
        clrc                        ; The cmp instructions above changed carry; the first
                                    ; rrc of stage 2 must shift in a 0
        .endif
                                    ; - MSW of a * b
        clr R11                     ; Clear MSW of b shift register (now R11:R15:R14)
        jmp msw_begin
msw_add:                            ; Add b shift register to accumulator
        add R14, R7
        addc R15, R8
        addc R11, R9
msw_shift:                          ; Shift b
        rla R14
        rlc R15
        rlc R11
msw_begin:
        rrc R13                     ; Shift a, test lsb
        jc msw_add                  ; lsb is 1, add b...
        jne msw_shift               ; lsb is 0, but more 1 bits remain...
        .if opt1
        clr R14                     ; Clear b so the add after shift_acc adds zero
        clr R15                     ; on this (full multiply) path
        .endif
                                    ;
shift_acc:
        mov.b R8, R12               ; Shift accumulator right by 24 bits
        swpb R12                    ; into R12/R13
        swpb R7
        mov.b R7, R7                ; Keep only what was the high byte of R7
        or R7, R12                  ; LSW in R12
                                    ;
        mov.b R9, R9                ; Keep only the low byte of R9
        swpb R9
        swpb R8
        mov.b R8, R13               ; What was the high byte of R8
        or R9, R13                  ; MSW in R13
                                    ;
        .if opt1
        add R14, R12                ; Add b to result if MSW multiply was skipped
        addc R15, R13               ; (b is cleared by MSW multiply)
        .endif
                                    ;
        pop R11
        pop R10
        pop R9
        pop R8
        pop R7
        pop R6
                                    ;
        ret

Rickta59 and bluehash 2 Quote Link to post Share on other sites
roadrunner84 466 Posted February 24, 2013 Share Posted February 24, 2013 Won't your compiler optimise a shift by any multiple of 8 to a byte swap or word swap? Did you try using higher optimisation levels? Quote Link to post Share on other sites
oPossum 1,083 Posted February 25, 2013 Author Share Posted February 25, 2013 CCSv4 (what I use) doesn't support 64 bit integers. Using CCSv5 with -O 4 -mf 5... Well, at least it inlined the mul824() function. I will take on any MSP430 compiler. Guaranteed faster code or your money back. LOL #include <msp430.h> #include <stdint.h> uint32_t mul824(uint32_t a, uint32_t b) { // Multiply 8.24 fixed point in a by b // Return is same type as b return (((uint64_t)a * b) >> 24) & 0xFFFFFFFF; } void main(void) { volatile uint32_t a = 0x01000000; volatile uint32_t b = 0x12345678; volatile uint32_t x = mul824(a, b); volatile uint32_t y = mul824(b, a); } 13 { main: c114: 120A PUSH R10 c116: 1209 PUSH R9 c118: 1208 PUSH R8 c11a: 8031 0010 SUB.W #0x0010,SP 14 volatile uint32_t a = 0x01000000; c11e: 4381 0000 CLR.W 0x0000(SP) c122: 40B1 0100 0002 MOV.W #0x0100,0x0002(SP) 15 volatile uint32_t b = 0x12345678; c128: 40B1 5678 0004 MOV.W #0x5678,0x0004(SP) c12e: 40B1 1234 0006 MOV.W #0x1234,0x0006(SP) 17 volatile uint32_t x = mul824(a, b); c134: 412C MOV.W @SP,R12 c136: 411D 0002 MOV.W 0x0002(SP),R13 c13a: 411E 0004 MOV.W 0x0004(SP),R14 c13e: 411F 0006 MOV.W 0x0006(SP),R15 c142: 12B0 C2B0 CALL #__mspabi_mpyull c146: 4C08 MOV.W R12,R8 c148: 4D09 MOV.W R13,R9 c14a: 4E0A MOV.W R14,R10 c14c: 4F0B MOV.W R15,R11 c14e: 403C 0018 MOV.W #0x0018,R12 c152: 12B0 C1F2 CALL #__mspabi_srlll c156: 4C81 0008 MOV.W R12,0x0008(SP) c15a: 4D81 000A MOV.W R13,0x000a(SP) 18 volatile uint32_t y = mul824(b, a); c15e: 411C 0004 MOV.W 0x0004(SP),R12 c162: 411D 0006 MOV.W 0x0006(SP),R13 c166: 412E MOV.W @SP,R14 c168: 411F 0002 MOV.W 0x0002(SP),R15 c16c: 12B0 C2B0 CALL #__mspabi_mpyull c170: 4C08 MOV.W R12,R8 c172: 4D09 MOV.W R13,R9 c174: 4E0A MOV.W R14,R10 c176: 4F0B MOV.W R15,R11 c178: 403C 0018 MOV.W #0x0018,R12 c17c: 12B0 C1F2 CALL #__mspabi_srlll c180: 4C81 000C MOV.W R12,0x000c(SP) c184: 4D81 000E MOV.W R13,0x000e(SP) 19 } c188: 5031 0010 ADD.W #0x0010,SP c18c: 4138 POP.W R8 c18e: 4139 POP.W R9 c190: 413A POP.W R10 
c192: 4130 RET __mspabi_mpyll: c000: 120A PUSH R10 c002: 1209 PUSH R9 c004: 1208 PUSH R8 c006: 1207 PUSH R7 c008: 1206 PUSH R6 c00a: 1205 PUSH R5 c00c: 1204 PUSH R4 c00e: 8031 000C SUB.W #0x000c,SP c012: 4F06 MOV.W R15,R6 c014: 4C81 0008 MOV.W R12,0x0008(SP) c018: 4D81 0002 MOV.W R13,0x0002(SP) c01c: 4E07 MOV.W R14,R7 c01e: 4881 0006 MOV.W R8,0x0006(SP) c022: 4981 0000 MOV.W R9,0x0000(SP) c026: 4A81 0004 MOV.W R10,0x0004(SP) c02a: 4B0E MOV.W R11,R14 c02c: 430D CLR.W R13 c02e: 430F CLR.W R15 c030: 12B0 C268 CALL #__mspabi_mpyl c034: 4C0A MOV.W R12,R10 c036: 430D CLR.W R13 c038: 480E MOV.W R8,R14 c03a: 430F CLR.W R15 c03c: 460C MOV.W R6,R12 c03e: 12B0 C268 CALL #__mspabi_mpyl c042: 4C09 MOV.W R12,R9 c044: 4D04 MOV.W R13,R4 c046: 4305 CLR.W R5 c048: 4306 CLR.W R6 c04a: 470C MOV.W R7,R12 c04c: 412D MOV.W @SP,R13 c04e: 12B0 C28E CALL #__mspabi_mpyul c052: 5C09 ADD.W R12,R9 c054: 6D04 ADDC.W R13,R4 c056: 6305 ADC.W R5 c058: 6306 ADC.W R6 c05a: 411C 0002 MOV.W 0x0002(SP),R12 c05e: 411D 0004 MOV.W 0x0004(SP),R13 c062: 12B0 C28E CALL #__mspabi_mpyul c066: 5C09 ADD.W R12,R9 c068: 6D04 ADDC.W R13,R4 c06a: 6305 ADC.W R5 c06c: 6306 ADC.W R6 c06e: 5A09 ADD.W R10,R9 c070: 411C 0008 MOV.W 0x0008(SP),R12 c074: 480D MOV.W R8,R13 c076: 12B0 C28E CALL #__mspabi_mpyul c07a: 4C0A MOV.W R12,R10 c07c: 530A ADD.W #0,R10 c07e: 4D04 MOV.W R13,R4 c080: 4305 CLR.W R5 c082: 4306 CLR.W R6 c084: 6304 ADC.W R4 c086: 6305 ADC.W R5 c088: 6906 ADDC.W R9,R6 c08a: 411C 0002 MOV.W 0x0002(SP),R12 c08e: 412D MOV.W @SP,R13 c090: 12B0 C28E CALL #__mspabi_mpyul c094: 4C08 MOV.W R12,R8 c096: 4D09 MOV.W R13,R9 c098: 411D 0006 MOV.W 0x0006(SP),R13 c09c: 470C MOV.W R7,R12 c09e: 12B0 C28E CALL #__mspabi_mpyul c0a2: 4D07 MOV.W R13,R7 c0a4: 580C ADD.W R8,R12 c0a6: 4C81 000A MOV.W R12,0x000a(SP) c0aa: 6907 ADDC.W R9,R7 c0ac: 4308 CLR.W R8 c0ae: 6308 ADC.W R8 c0b0: 4309 CLR.W R9 c0b2: 6309 ADC.W R9 c0b4: 411C 0008 MOV.W 0x0008(SP),R12 c0b8: 411D 0004 MOV.W 0x0004(SP),R13 c0bc: 12B0 C28E CALL #__mspabi_mpyul c0c0: 
5C81 000A ADD.W R12,0x000a(SP) c0c4: 411F 000A MOV.W 0x000a(SP),R15 c0c8: 6D07 ADDC.W R13,R7 c0ca: 6308 ADC.W R8 c0cc: 6309 ADC.W R9 c0ce: 530A ADD.W #0,R10 c0d0: 6304 ADC.W R4 c0d2: 6F05 ADDC.W R15,R5 c0d4: 6706 ADDC.W R7,R6 c0d6: 411C 0008 MOV.W 0x0008(SP),R12 c0da: 412D MOV.W @SP,R13 c0dc: 12B0 C28E CALL #__mspabi_mpyul c0e0: 4C07 MOV.W R12,R7 c0e2: 4D09 MOV.W R13,R9 c0e4: 411D 0006 MOV.W 0x0006(SP),R13 c0e8: 411C 0002 MOV.W 0x0002(SP),R12 c0ec: 12B0 C28E CALL #__mspabi_mpyul c0f0: 570C ADD.W R7,R12 c0f2: 690D ADDC.W R9,R13 c0f4: 430F CLR.W R15 c0f6: 630F ADC.W R15 c0f8: 430E CLR.W R14 c0fa: 630E ADC.W R14 c0fc: 530A ADD.W #0,R10 c0fe: 6C04 ADDC.W R12,R4 c100: 6D05 ADDC.W R13,R5 c102: 6F06 ADDC.W R15,R6 c104: 4A0C MOV.W R10,R12 c106: 440D MOV.W R4,R13 c108: 450E MOV.W R5,R14 c10a: 460F MOV.W R6,R15 c10c: 5031 000C ADD.W #0x000c,SP c110: 4030 C2E8 BR #__mspabi_func_epilog __mspabi_mpyl: c268: 120A PUSH R10 c26a: 430A CLR.W R10 c26c: 430B CLR.W R11 mpyl_add_loop: c26e: C312 CLRC c270: 100D RRC R13 c272: 100C RRC R12 c274: 2802 JLO (shift_test_mpyl) c276: 5E0A ADD.W R14,R10 c278: 6F0B ADDC.W R15,R11 shift_test_mpyl: c27a: 5E0E RLA.W R14 c27c: 6F0F RLC.W R15 c27e: 930D TST.W R13 c280: 23F6 JNE (mpyl_add_loop) c282: 930C TST.W R12 c284: 23F4 JNE (mpyl_add_loop) c286: 4A0C MOV.W R10,R12 c288: 4B0D MOV.W R11,R13 c28a: 413A POP.W R10 c28c: 4130 RET __mspabi_mpyul: c28e: 4C0B MOV.W R12,R11 c290: 4D0E MOV.W R13,R14 c292: 430F CLR.W R15 c294: 430C CLR.W R12 c296: 430D CLR.W R13 c298: C312 CLRC c29a: 100B RRC R11 c29c: 3C01 JMP (mpyul_add_loop1) mpyul_add_loop: c29e: 110B RRA R11 mpyul_add_loop1: c2a0: 2802 JLO (shift_test_mpyul) c2a2: 5E0C ADD.W R14,R12 c2a4: 6F0D ADDC.W R15,R13 shift_test_mpyul: c2a6: 5E0E RLA.W R14 c2a8: 6F0F RLC.W R15 c2aa: 930B TST.W R11 c2ac: 23F8 JNE (mpyul_add_loop) c2ae: 4130 RET __mspabi_mpyull: c2b0: 120A PUSH R10 c2b2: 1209 PUSH R9 c2b4: 1208 PUSH R8 c2b6: 4C08 MOV.W R12,R8 c2b8: 4D09 MOV.W R13,R9 c2ba: 430A CLR.W R10 c2bc: 430B CLR.W R11 c2be: 
4E0C MOV.W R14,R12 c2c0: 4F0D MOV.W R15,R13 c2c2: 430E CLR.W R14 c2c4: 430F CLR.W R15 c2c6: 12B0 C000 CALL #__mspabi_mpyll c2ca: 4030 C2F0 BR #__mspabi_func_epilog_3 __mspabi_srlll: c1f2: 120A PUSH R10 c1f4: 1209 PUSH R9 c1f6: 1208 PUSH R8 c1f8: 1207 PUSH R7 c1fa: 4C07 MOV.W R12,R7 c1fc: 4B0F MOV.W R11,R15 c1fe: 9037 0011 CMP.W #0x0011,R7 c202: 380E JL ($C$L2) c204: 470E MOV.W R7,R14 c206: 831E DEC.W R14 c208: 4E0C MOV.W R14,R12 c20a: 12B0 C25E CALL #__mspabi_srai_4 c20e: F03E FFF0 AND.W #0xfff0,R14 c212: 8E07 SUB.W R14,R7 $C$L1: c214: 4908 MOV.W R9,R8 c216: 4A09 MOV.W R10,R9 c218: 4F0A MOV.W R15,R10 c21a: 430F CLR.W R15 c21c: 831C DEC.W R12 c21e: 23FA JNE ($C$L1) $C$L2: c220: 9317 CMP.W #1,R7 c222: 3807 JL ($C$L4) $C$L3: c224: C312 CLRC c226: 100F RRC R15 c228: 100A RRC R10 c22a: 1009 RRC R9 c22c: 1008 RRC R8 c22e: 8317 DEC.W R7 c230: 23F9 JNE ($C$L3) $C$L4: c232: 480C MOV.W R8,R12 c234: 490D MOV.W R9,R13 c236: 4A0E MOV.W R10,R14 c238: 4030 C2EE BR #__mspabi_func_epilog_4 __mspabi_srai: c23c: F03D 000F AND.W #0x000f,R13 c240: E03D 000F XOR.W #0x000f,R13 c244: 5D0D RLA.W R13 c246: 5D00 ADD.W R13,PC __mspabi_srai_15: c248: 110C RRA R12 __mspabi_srai_14: c24a: 110C RRA R12 __mspabi_srai_13: c24c: 110C RRA R12 __mspabi_srai_12: c24e: 110C RRA R12 __mspabi_srai_11: c250: 110C RRA R12 __mspabi_srai_10: c252: 110C RRA R12 __mspabi_srai_9: c254: 110C RRA R12 __mspabi_srai_8: c256: 110C RRA R12 __mspabi_srai_7: c258: 110C RRA R12 __mspabi_srai_6: c25a: 110C RRA R12 __mspabi_srai_5: c25c: 110C RRA R12 __mspabi_srai_4: c25e: 110C RRA R12 __mspabi_srai_3: c260: 110C RRA R12 __mspabi_srai_2: c262: 110C RRA R12 __mspabi_srai_1: c264: 110C RRA R12 c266: 4130 RET _c_int00_noexit: c2ce: 4031 0400 MOV.W #0x0400,SP c2d2: 12B0 C2F8 CALL #_system_pre_init c2d6: 930C TST.W R12 c2d8: 2402 JEQ ($C$L2) c2da: 12B0 C194 CALL #_auto_init $C$L2: c2de: 430C CLR.W R12 c2e0: 12B0 C114 CALL #main c2e4: 12B0 C2FC CALL #abort __mspabi_func_epilog: c2e8: 4134 POP.W R4 __mspabi_func_epilog_6: c2ea: 
4135 POP.W R5 __mspabi_func_epilog_5: c2ec: 4136 POP.W R6 __mspabi_func_epilog_4: c2ee: 4137 POP.W R7 __mspabi_func_epilog_3: c2f0: 4138 POP.W R8 __mspabi_func_epilog_2: c2f2: 4139 POP.W R9 __mspabi_func_epilog_1: c2f4: 413A POP.W R10 c2f6: 4130 RET _system_pre_init: c2f8: 431C MOV.W #1,R12 c2fa: 4130 RET C$$EXIT, abort: c2fc: 4303 NOP $C$L1: c2fe: 3FFF JMP ($C$L1) spirilis and username 2 Quote Link to post Share on other sites
Recommended Posts
Join the conversation
You can post now and register later. If you have an account, sign in now to post with your account.