Jump to content
43oh

8.24 fixed point multiplication


Recommended Posts

This code will multiply an unsigned 8.24 fixed point number by an unsigned 32 bit value (any binary point). The returned result is adjusted for the binary point (right shift by 24 bits).

 

It could be done like this:

 

uint32_t mul824(uint32_t a, uint32_t b)
{
    // Multiply 8.24 fixed point in a by b
    // Return is same type as b
    // a is widened to 64 bits before the multiply so the full 64 bit
    // product is kept; the shift then drops the 24 fraction bits of a
    return (((uint64_t)a * b) >> 24) & 0xFFFFFFFF;
}

 

Unfortunately C uses a product accumulator with the same number of bits as the multiplicands, so this requires casts and is not as efficient as it could be.

 

There are several optimizations done in this assembly code:

- A 64 bit accumulator is used for the 32 bit multiplicands - cast to 64 bits eliminated

- The multiply is done in two stages, first using the LSW of a, and then the MSW. This allows the use of a 48 bit shift register rather than 64 bit.

- The multiply code is a bit more efficient than what CCS uses - a TST Rxx instruction has been eliminated by restructuring the loop. A CLRC instruction could also be eliminated thanks to the 48 bit shift register (it will never shift out a 1)

- There is an optional optimization that will skip the second stage of multiplication when a is close to 1.0

- The final right shift of 24 bits is done with byte swap rather than shift

 

 

;
;    Copyright © 2013  Kevin Timmerman
;
;   This program is free software: you can redistribute it and/or modify
;   it under the terms of the GNU General Public License as published by
;   the Free Software Foundation, either version 3 of the License, or
;   (at your option) any later version.
;
;   This program is distributed in the hope that it will be useful,
;   but WITHOUT ANY WARRANTY; without even the implied warranty of
;   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;   GNU General Public License for more details.
;
;   You should have received a copy of the GNU General Public License
;   along with this program.  If not, see <http://www.gnu.org/licenses/>.
;
        .cdecls C, LIST, "msp430.h"             ; Make the C declarations from msp430.h available
                                                ;
opt1    .equ    1                               ; 1 = assemble the fast paths for a = 1.00xxxx / 0.FFxxxx (faster / larger code), 0 = omit them
        .text                                   ;
                                                ;
        .def    mul824                          ; uint32_t mul824(uint32_t a, uint32_t b)
                                                ; a is 8.24 fixed point
                                                ; b is any type
                                                ; return is same type as b
                                                ;
                                                ;
;-----------------------------------------------------------------------
; uint32_t mul824(uint32_t a, uint32_t b)
;   Shift-and-add multiply of a (unsigned 8.24 fixed point) by b.
;   Returns bits 24..55 of the 64 bit product, i.e. (a * b) >> 24.
; In:    R13:R12 = a, R15:R14 = b
; Out:   R13:R12 = result
; Saved: R6-R11 are pushed on entry and popped before return
; Note:  R14/R15 do not survive the call on every path
;        (stage 2 shifts them and, with opt1, clears them)
;-----------------------------------------------------------------------
mul824:                                         ;
    push    R6                                  ; Accumulator bits 0-15
    push    R7                                  ; Accumulator bits 16-31
    push    R8                                  ; Accumulator bits 32-47
    push    R9                                  ; Stage 1: b shift bits 0-15 / stage 2: accumulator bits 48-63
    push    R10                                 ; Stage 1: b shift bits 16-31
    push    R11                                 ; b shift bits 32-47 (both stages)
                                                ;
                                                ; R12 - A LSW
                                                ; R13 - A MSW
                                                ; R14 - B LSW
                                                ; R15 - B MSW
                                                ;
    clr     R6                                  ; Clear the low 48 bits of the accumulator
    clr     R7                                  ;
    clr     R8                                  ;
                                                ; - Stage 1: accumulator = (LSW of a) * b
    mov     R14, R9                             ; Copy b into the 48 bit shift register R11:R10:R9
    mov     R15, R10                            ;  (R15:R14 keep the unshifted b for stage 2)
    clr     R11                                 ;
    clrc                                        ; The first rrc must shift a 0 into the top of R12
    jmp     lsw_begin                           ; Enter the loop at the bit test
lsw_add:                                        ; Add b shift register to accumulator
    add     R9, R6                              ;
    addc    R10, R7                             ;
    addc    R11, R8                             ;
lsw_shift:                                      ; Shift b left by one bit
    rla     R9                                  ;
    rlc     R10                                 ;
    rlc     R11                                 ; The 48 bit register never shifts out a 1, so carry is 0 for the rrc below
lsw_begin:                                      ;
    rrc     R12                                 ; Shift a right: lsb goes to carry, Z is set when no 1 bits remain
    jc      lsw_add                             ; lsb is 1, add b...
    jne     lsw_shift                           ; lsb is 0, but more 1 bits remain...
    clr     R9                                  ; R9 now becomes accumulator bits 48-63
                                                ;
    .if opt1                                    ; - Optional fast paths for a close to 1.0
    cmp     #0x0100, R13                        ; Multiply by 1.00xxxx ?
    jeq     shift_acc                           ; Yes, skip MSW multiply and just add b to result
    cmp     #0x00FF, R13                        ; Multiply by 0.FFxxxx ?
    jne     do_msw                              ; No, do the MSW multiply
                                                ; - Booth style recoding: 0xFF = 0x100 - 0x01
    sub     R14, R7                             ; Subtract b << 16 from accumulator (b * 00.010000)
    subc    R15, R8                             ; b itself (b * 01.000000) is added to the result after the shift
    subc    #0, R9                              ; 01.000000 - 00.010000 = 00.FF0000
                                                ; This effectively adds b * 00.FF0000 to the accumulator
    jmp     shift_acc                           ; Skip MSW multiply
do_msw:                                         ;
    clrc                                        ; The cmp above changed carry; the first rrc must shift in a 0
    .endif                                      ; - Stage 2: accumulator += ((MSW of a) * b) << 16
    clr     R11                                 ; Clear MSW of b shift register (now R11:R15:R14)
    jmp     msw_begin                           ; Enter the loop at the bit test
msw_add:                                        ; Add b shift register to accumulator, one word higher than in stage 1
    add     R14, R7                             ;
    addc    R15, R8                             ;
    addc    R11, R9                             ;
msw_shift:                                      ; Shift b left by one bit
    rla     R14                                 ;
    rlc     R15                                 ;
    rlc     R11                                 ;
msw_begin:                                      ;
    rrc     R13                                 ; Shift a right: lsb goes to carry, Z is set when no 1 bits remain
    jc      msw_add                             ; lsb is 1, add b...
    jne     msw_shift                           ; lsb is 0, but more 1 bits remain...
    .if opt1                                    ;
    clr     R14                                 ; Zero b so the unconditional add after shift_acc
    clr     R15                                 ;  adds nothing on this path
    .endif                                      ;
                                                ;
shift_acc:                                      ; - Result = accumulator bits 24-55, built with byte moves and swaps
    mov.b   R8, R12                             ; R12 = low byte of R8 (accumulator bits 32-39)
    swpb    R12                                 ;  moved up to the high byte
    swpb    R7                                  ;
    mov.b   R7, R7                              ; R7 = old high byte of R7 (accumulator bits 24-31)
    or      R7, R12                             ; LSW in R12
                                                ;
    mov.b   R9, R9                              ; Keep only the low byte of R9 (accumulator bits 48-55)
    swpb    R9                                  ;  moved up to the high byte
    swpb    R8                                  ;
    mov.b   R8, R13                             ; R13 = old high byte of R8 (accumulator bits 40-47)
    or      R9, R13                             ; MSW in R13
                                                ;
    .if opt1                                    ;
    add     R14, R12                            ; Add b to result if MSW multiply was skipped
    addc    R15, R13                            ; (b is cleared by MSW multiply)
    .endif                                      ;
                                                ;
    pop     R11                                 ; Restore saved registers in reverse push order
    pop     R10                                 ;
    pop     R9                                  ;
    pop     R8                                  ;
    pop     R7                                  ;
    pop     R6                                  ;
                                                ;
    ret                                         ; Result is in R13:R12
Link to post
Share on other sites

CCSv4 (what I use) doesn't support 64 bit integers.

 

Using CCSv5 with -O 4 -mf 5...

 

Well, at least it inlined the mul824() function.

 

I will take on any MSP430 compiler. Guaranteed faster code or your money back.  LOL

 

#include <msp430.h>
#include <stdint.h>
uint32_t mul824(uint32_t a, uint32_t b)
{
    // Multiply 8.24 fixed point in a by b
    // Return is same type as b
    // a is widened to 64 bits before the multiply so the full 64 bit
    // product is kept; the shift then drops the 24 fraction bits of a
    return (((uint64_t)a * b) >> 24) & 0xFFFFFFFF;
}
void main(void)
{
    // Test driver: calls mul824() with the operands in both orders.
    // The locals are volatile, presumably so the compiler cannot fold
    // the calls away at compile time.
    volatile uint32_t a = 0x01000000;   // 1.0 in 8.24 fixed point
    volatile uint32_t b = 0x12345678;

    volatile uint32_t x = mul824(a, b);
    volatile uint32_t y = mul824(b, a);
}

 

 

13    {
      main:
c114:   120A                PUSH    R10
c116:   1209                PUSH    R9
c118:   1208                PUSH    R8
c11a:   8031 0010           SUB.W   #0x0010,SP
14      volatile uint32_t a = 0x01000000;
c11e:   4381 0000           CLR.W   0x0000(SP)
c122:   40B1 0100 0002      MOV.W   #0x0100,0x0002(SP)
15      volatile uint32_t b = 0x12345678;
c128:   40B1 5678 0004      MOV.W   #0x5678,0x0004(SP)
c12e:   40B1 1234 0006      MOV.W   #0x1234,0x0006(SP)
17      volatile uint32_t x = mul824(a, b);
c134:   412C                MOV.W   @SP,R12
c136:   411D 0002           MOV.W   0x0002(SP),R13
c13a:   411E 0004           MOV.W   0x0004(SP),R14
c13e:   411F 0006           MOV.W   0x0006(SP),R15
c142:   12B0 C2B0           CALL    #__mspabi_mpyull
c146:   4C08                MOV.W   R12,R8
c148:   4D09                MOV.W   R13,R9
c14a:   4E0A                MOV.W   R14,R10
c14c:   4F0B                MOV.W   R15,R11
c14e:   403C 0018           MOV.W   #0x0018,R12
c152:   12B0 C1F2           CALL    #__mspabi_srlll
c156:   4C81 0008           MOV.W   R12,0x0008(SP)
c15a:   4D81 000A           MOV.W   R13,0x000a(SP)
18      volatile uint32_t y = mul824(b, a);
c15e:   411C 0004           MOV.W   0x0004(SP),R12
c162:   411D 0006           MOV.W   0x0006(SP),R13
c166:   412E                MOV.W   @SP,R14
c168:   411F 0002           MOV.W   0x0002(SP),R15
c16c:   12B0 C2B0           CALL    #__mspabi_mpyull
c170:   4C08                MOV.W   R12,R8
c172:   4D09                MOV.W   R13,R9
c174:   4E0A                MOV.W   R14,R10
c176:   4F0B                MOV.W   R15,R11
c178:   403C 0018           MOV.W   #0x0018,R12
c17c:   12B0 C1F2           CALL    #__mspabi_srlll
c180:   4C81 000C           MOV.W   R12,0x000c(SP)
c184:   4D81 000E           MOV.W   R13,0x000e(SP)
19    }
c188:   5031 0010           ADD.W   #0x0010,SP
c18c:   4138                POP.W   R8
c18e:   4139                POP.W   R9
c190:   413A                POP.W   R10
c192:   4130                RET    
__mspabi_mpyll:
c000:   120A                PUSH    R10
c002:   1209                PUSH    R9
c004:   1208                PUSH    R8
c006:   1207                PUSH    R7
c008:   1206                PUSH    R6
c00a:   1205                PUSH    R5
c00c:   1204                PUSH    R4
c00e:   8031 000C           SUB.W   #0x000c,SP
c012:   4F06                MOV.W   R15,R6
c014:   4C81 0008           MOV.W   R12,0x0008(SP)
c018:   4D81 0002           MOV.W   R13,0x0002(SP)
c01c:   4E07                MOV.W   R14,R7
c01e:   4881 0006           MOV.W   R8,0x0006(SP)
c022:   4981 0000           MOV.W   R9,0x0000(SP)
c026:   4A81 0004           MOV.W   R10,0x0004(SP)
c02a:   4B0E                MOV.W   R11,R14
c02c:   430D                CLR.W   R13
c02e:   430F                CLR.W   R15
c030:   12B0 C268           CALL    #__mspabi_mpyl
c034:   4C0A                MOV.W   R12,R10
c036:   430D                CLR.W   R13
c038:   480E                MOV.W   R8,R14
c03a:   430F                CLR.W   R15
c03c:   460C                MOV.W   R6,R12
c03e:   12B0 C268           CALL    #__mspabi_mpyl
c042:   4C09                MOV.W   R12,R9
c044:   4D04                MOV.W   R13,R4
c046:   4305                CLR.W   R5
c048:   4306                CLR.W   R6
c04a:   470C                MOV.W   R7,R12
c04c:   412D                MOV.W   @SP,R13
c04e:   12B0 C28E           CALL    #__mspabi_mpyul
c052:   5C09                ADD.W   R12,R9
c054:   6D04                ADDC.W  R13,R4
c056:   6305                ADC.W   R5
c058:   6306                ADC.W   R6
c05a:   411C 0002           MOV.W   0x0002(SP),R12
c05e:   411D 0004           MOV.W   0x0004(SP),R13
c062:   12B0 C28E           CALL    #__mspabi_mpyul
c066:   5C09                ADD.W   R12,R9
c068:   6D04                ADDC.W  R13,R4
c06a:   6305                ADC.W   R5
c06c:   6306                ADC.W   R6
c06e:   5A09                ADD.W   R10,R9
c070:   411C 0008           MOV.W   0x0008(SP),R12
c074:   480D                MOV.W   R8,R13
c076:   12B0 C28E           CALL    #__mspabi_mpyul
c07a:   4C0A                MOV.W   R12,R10
c07c:   530A                ADD.W   #0,R10
c07e:   4D04                MOV.W   R13,R4
c080:   4305                CLR.W   R5
c082:   4306                CLR.W   R6
c084:   6304                ADC.W   R4
c086:   6305                ADC.W   R5
c088:   6906                ADDC.W  R9,R6
c08a:   411C 0002           MOV.W   0x0002(SP),R12
c08e:   412D                MOV.W   @SP,R13
c090:   12B0 C28E           CALL    #__mspabi_mpyul
c094:   4C08                MOV.W   R12,R8
c096:   4D09                MOV.W   R13,R9
c098:   411D 0006           MOV.W   0x0006(SP),R13
c09c:   470C                MOV.W   R7,R12
c09e:   12B0 C28E           CALL    #__mspabi_mpyul
c0a2:   4D07                MOV.W   R13,R7
c0a4:   580C                ADD.W   R8,R12
c0a6:   4C81 000A           MOV.W   R12,0x000a(SP)
c0aa:   6907                ADDC.W  R9,R7
c0ac:   4308                CLR.W   R8
c0ae:   6308                ADC.W   R8
c0b0:   4309                CLR.W   R9
c0b2:   6309                ADC.W   R9
c0b4:   411C 0008           MOV.W   0x0008(SP),R12
c0b8:   411D 0004           MOV.W   0x0004(SP),R13
c0bc:   12B0 C28E           CALL    #__mspabi_mpyul
c0c0:   5C81 000A           ADD.W   R12,0x000a(SP)
c0c4:   411F 000A           MOV.W   0x000a(SP),R15
c0c8:   6D07                ADDC.W  R13,R7
c0ca:   6308                ADC.W   R8
c0cc:   6309                ADC.W   R9
c0ce:   530A                ADD.W   #0,R10
c0d0:   6304                ADC.W   R4
c0d2:   6F05                ADDC.W  R15,R5
c0d4:   6706                ADDC.W  R7,R6
c0d6:   411C 0008           MOV.W   0x0008(SP),R12
c0da:   412D                MOV.W   @SP,R13
c0dc:   12B0 C28E           CALL    #__mspabi_mpyul
c0e0:   4C07                MOV.W   R12,R7
c0e2:   4D09                MOV.W   R13,R9
c0e4:   411D 0006           MOV.W   0x0006(SP),R13
c0e8:   411C 0002           MOV.W   0x0002(SP),R12
c0ec:   12B0 C28E           CALL    #__mspabi_mpyul
c0f0:   570C                ADD.W   R7,R12
c0f2:   690D                ADDC.W  R9,R13
c0f4:   430F                CLR.W   R15
c0f6:   630F                ADC.W   R15
c0f8:   430E                CLR.W   R14
c0fa:   630E                ADC.W   R14
c0fc:   530A                ADD.W   #0,R10
c0fe:   6C04                ADDC.W  R12,R4
c100:   6D05                ADDC.W  R13,R5
c102:   6F06                ADDC.W  R15,R6
c104:   4A0C                MOV.W   R10,R12
c106:   440D                MOV.W   R4,R13
c108:   450E                MOV.W   R5,R14
c10a:   460F                MOV.W   R6,R15
c10c:   5031 000C           ADD.W   #0x000c,SP
c110:   4030 C2E8           BR      #__mspabi_func_epilog

      __mspabi_mpyl:
c268:   120A                PUSH    R10
c26a:   430A                CLR.W   R10
c26c:   430B                CLR.W   R11
      mpyl_add_loop:
c26e:   C312                CLRC   
c270:   100D                RRC     R13
c272:   100C                RRC     R12
c274:   2802                JLO     (shift_test_mpyl)
c276:   5E0A                ADD.W   R14,R10
c278:   6F0B                ADDC.W  R15,R11
      shift_test_mpyl:
c27a:   5E0E                RLA.W   R14
c27c:   6F0F                RLC.W   R15
c27e:   930D                TST.W   R13
c280:   23F6                JNE     (mpyl_add_loop)
c282:   930C                TST.W   R12
c284:   23F4                JNE     (mpyl_add_loop)
c286:   4A0C                MOV.W   R10,R12
c288:   4B0D                MOV.W   R11,R13
c28a:   413A                POP.W   R10
c28c:   4130                RET    
      __mspabi_mpyul:
c28e:   4C0B                MOV.W   R12,R11
c290:   4D0E                MOV.W   R13,R14
c292:   430F                CLR.W   R15
c294:   430C                CLR.W   R12
c296:   430D                CLR.W   R13
c298:   C312                CLRC   
c29a:   100B                RRC     R11
c29c:   3C01                JMP     (mpyul_add_loop1)
      mpyul_add_loop:
c29e:   110B                RRA     R11
      mpyul_add_loop1:
c2a0:   2802                JLO     (shift_test_mpyul)
c2a2:   5E0C                ADD.W   R14,R12
c2a4:   6F0D                ADDC.W  R15,R13
      shift_test_mpyul:
c2a6:   5E0E                RLA.W   R14
c2a8:   6F0F                RLC.W   R15
c2aa:   930B                TST.W   R11
c2ac:   23F8                JNE     (mpyul_add_loop)
c2ae:   4130                RET    
      __mspabi_mpyull:
c2b0:   120A                PUSH    R10
c2b2:   1209                PUSH    R9
c2b4:   1208                PUSH    R8
c2b6:   4C08                MOV.W   R12,R8
c2b8:   4D09                MOV.W   R13,R9
c2ba:   430A                CLR.W   R10
c2bc:   430B                CLR.W   R11
c2be:   4E0C                MOV.W   R14,R12
c2c0:   4F0D                MOV.W   R15,R13
c2c2:   430E                CLR.W   R14
c2c4:   430F                CLR.W   R15
c2c6:   12B0 C000           CALL    #__mspabi_mpyll
c2ca:   4030 C2F0           BR      #__mspabi_func_epilog_3
__mspabi_srlll:
c1f2:   120A                PUSH    R10
c1f4:   1209                PUSH    R9
c1f6:   1208                PUSH    R8
c1f8:   1207                PUSH    R7
c1fa:   4C07                MOV.W   R12,R7
c1fc:   4B0F                MOV.W   R11,R15
c1fe:   9037 0011           CMP.W   #0x0011,R7
c202:   380E                JL      ($C$L2)
c204:   470E                MOV.W   R7,R14
c206:   831E                DEC.W   R14
c208:   4E0C                MOV.W   R14,R12
c20a:   12B0 C25E           CALL    #__mspabi_srai_4
c20e:   F03E FFF0           AND.W   #0xfff0,R14
c212:   8E07                SUB.W   R14,R7
      $C$L1:
c214:   4908                MOV.W   R9,R8
c216:   4A09                MOV.W   R10,R9
c218:   4F0A                MOV.W   R15,R10
c21a:   430F                CLR.W   R15
c21c:   831C                DEC.W   R12
c21e:   23FA                JNE     ($C$L1)
      $C$L2:
c220:   9317                CMP.W   #1,R7
c222:   3807                JL      ($C$L4)
      $C$L3:
c224:   C312                CLRC   
c226:   100F                RRC     R15
c228:   100A                RRC     R10
c22a:   1009                RRC     R9
c22c:   1008                RRC     R8
c22e:   8317                DEC.W   R7
c230:   23F9                JNE     ($C$L3)
      $C$L4:
c232:   480C                MOV.W   R8,R12
c234:   490D                MOV.W   R9,R13
c236:   4A0E                MOV.W   R10,R14
c238:   4030 C2EE           BR      #__mspabi_func_epilog_4
      __mspabi_srai:
c23c:   F03D 000F           AND.W   #0x000f,R13
c240:   E03D 000F           XOR.W   #0x000f,R13
c244:   5D0D                RLA.W   R13
c246:   5D00                ADD.W   R13,PC
      __mspabi_srai_15:
c248:   110C                RRA     R12
      __mspabi_srai_14:
c24a:   110C                RRA     R12
      __mspabi_srai_13:
c24c:   110C                RRA     R12
      __mspabi_srai_12:
c24e:   110C                RRA     R12
      __mspabi_srai_11:
c250:   110C                RRA     R12
      __mspabi_srai_10:
c252:   110C                RRA     R12
      __mspabi_srai_9:
c254:   110C                RRA     R12
      __mspabi_srai_8:
c256:   110C                RRA     R12
      __mspabi_srai_7:
c258:   110C                RRA     R12
      __mspabi_srai_6:
c25a:   110C                RRA     R12
      __mspabi_srai_5:
c25c:   110C                RRA     R12
      __mspabi_srai_4:
c25e:   110C                RRA     R12
      __mspabi_srai_3:
c260:   110C                RRA     R12
      __mspabi_srai_2:
c262:   110C                RRA     R12
      __mspabi_srai_1:
c264:   110C                RRA     R12
c266:   4130                RET    

      _c_int00_noexit:
c2ce:   4031 0400           MOV.W   #0x0400,SP
c2d2:   12B0 C2F8           CALL    #_system_pre_init
c2d6:   930C                TST.W   R12
c2d8:   2402                JEQ     ($C$L2)
c2da:   12B0 C194           CALL    #_auto_init
      $C$L2:
c2de:   430C                CLR.W   R12
c2e0:   12B0 C114           CALL    #main
c2e4:   12B0 C2FC           CALL    #abort
      __mspabi_func_epilog:
c2e8:   4134                POP.W   R4
      __mspabi_func_epilog_6:
c2ea:   4135                POP.W   R5
      __mspabi_func_epilog_5:
c2ec:   4136                POP.W   R6
      __mspabi_func_epilog_4:
c2ee:   4137                POP.W   R7
      __mspabi_func_epilog_3:
c2f0:   4138                POP.W   R8
      __mspabi_func_epilog_2:
c2f2:   4139                POP.W   R9
      __mspabi_func_epilog_1:
c2f4:   413A                POP.W   R10
c2f6:   4130                RET    
      _system_pre_init:
c2f8:   431C                MOV.W   #1,R12
c2fa:   4130                RET    
      C$$EXIT, abort:
c2fc:   4303                NOP    
      $C$L1:
c2fe:   3FFF                JMP     ($C$L1)
Link to post
Share on other sites

Join the conversation

You can post now and register later. If you have an account, sign in now to post with your account.

Guest
Reply to this topic...

×   Pasted as rich text.   Paste as plain text instead

  Only 75 emoji are allowed.

×   Your link has been automatically embedded.   Display as a link instead

×   Your previous content has been restored.   Clear editor

×   You cannot paste images directly. Upload or insert images from URL.

×
×
  • Create New...