Quad-precision multiplication

by Richard Russell, December 2014

The program listed below illustrates how to multiply two (unsigned) 64-bit integers together to produce a 128-bit result. It is based on a blog post by Raymond Chen:

        INSTALL @lib$+"ASMLIB2"
 
        DIM crossterms 15+15, result 15+15, gap% 2047, P% 120
        crossterms = (crossterms + 15) AND -16
        result = (result + 15) AND -16 : REM align
 
        ON ERROR LOCAL [OPT FN_asmext : ]
        [
        .mul
        movq xmm0, [^x%%]            ; xmm0 = { 0, 0, A, B } = { *, *, A, B }
        movq xmm1, [^y%%]            ; xmm1 = { 0, 0, C, D } = { *, *, C, D }
        punpckldq xmm0, xmm0         ; xmm0 = { A, A, B, B } = { *, A, *, B }
        punpckldq xmm1, xmm1         ; xmm1 = { C, C, D, D } = { *, C, *, D }
        pshufd xmm2, xmm1, %10001101 ; xmm2 = { D, D, C, C } = { *, D, *, C }
 
        pmuludq xmm1, xmm0           ; xmm1 = { AC, BD } // provisional result
        pmuludq xmm2, xmm0           ; xmm2 = { AD, BC } // cross-terms
 
        movdqa [result], xmm1
        movdqa [crossterms], xmm2
 
        mov    eax, crossterms[0]
        mov    edx, crossterms[4]    ; edx|eax = BC
        add    result[4], eax
        adc    result[8], edx
        adc    dword result[12], 0   ; add the first cross-term
 
        mov    eax, crossterms[8]
        mov    edx, crossterms[12]   ; edx|eax = AD
        add    result[4], eax
        adc    result[8], edx
        adc    dword result[12], 0   ; add the second cross-term
 
        ret
        ]
        RESTORE ERROR
 
        *hex64
        x%% = &1234567812345678
        y%% = &8765432187654321
        *hex32
        CALL mul
        PRINT ~result!12, ~result!8, ~result!4, ~result!0

Note that if you want to incorporate this code in a compiled program it will be necessary to put it in a separate file which you INSTALL or CALL at run-time, and that file should have an extension other than .BBC.