12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331
/** Optimised asm arbitrary precision arithmetic ('bignum') 
 * routines for X86 processors.
 *
 * All functions operate on arrays of uints, stored LSB first.
 * If there is a destination array, it will be the first parameter.
 * Currently, all of these functions are subject to change, and are
 * intended for internal use only. 
 * The symbol [#] indicates an array of machine words which is to be
 * interpreted as a multi-byte number.
 *
 * Copyright: Copyright (C) 2008 Don Clugston.  All rights reserved.
 * License:   BSD style: $(LICENSE)
 * Authors:   Don Clugston
 */
/**
 * In simple terms, there are 3 modern x86 microarchitectures:
 * (a) the P6 family (Pentium Pro, PII, PIII, PM, Core), produced by Intel;
 * (b) the K6, Athlon, and AMD64 families, produced by AMD; and
 * (c) the Pentium 4, produced by Marketing.
 *
 * This code has been optimised for the Intel P6 family.
 * Generally the code remains near-optimal for Intel Core2, after translating
 * EAX-> RAX, etc, since all these CPUs use essentially the same pipeline, and
 * are typically limited by memory access.
 * The code uses techniques described in Agner Fog's superb Pentium manuals
 * available at www.agner.org.
 * Not optimised for AMD, which can do two memory loads per cycle (Intel
 * CPUs can only do one). Despite this, performance is superior on AMD.
 * Performance is dreadful on P4. 
 *
 *  Timing results (cycles per int)
 *              --Intel Pentium--  --AMD--
 *              PM     P4   Core2   K7 
 *  +,-         2.25  15.6   2.25   1.5
 *  <<,>>       2.0    6.6   2.0    5.0
 *    (<< MMX)  1.7    5.3   1.5    1.2
 *  *           5.0   15.0   4.0    4.3
 *  mulAdd      5.7   19.0   4.9    4.0
 *  div        30.0   32.0  32.0   22.4
 *  mulAcc(32)  6.5   20.0   5.4    4.9
 *
 * mulAcc(32) is multiplyAccumulate() for a 32*32 multiply. Thus it includes
 * function call overhead.
 * The timing for Div is quite unpredictable, but it's probably too slow
 * to be useful. On 64-bit processors, these times should
 * halve if run in 64-bit mode, except for the MMX functions.
 */

module tango.math.internal.BignumX86;

/*  
  Naked asm is used throughout, because:
  (a) it frees up the EBP register
  (b) compiler bugs prevent the use of .ptr when a frame pointer is used.
*/

version(GNU) {
    // GDC is a filthy liar. It can't actually do inline asm.
} else version(D_InlineAsm_X86) {
    version = Really_D_InlineAsm_X86;
} else version(LLVM_InlineAsm_X86) { 
        version = Really_D_InlineAsm_X86; 
}

version(Really_D_InlineAsm_X86) {

private:

/* Duplicate string s, with n times, substituting index for '@'.
 *
 * Each instance of '@' in s is replaced by 0,1,...n-1. This is a helper
 * function for some of the asm routines.
 */
char [] indexedLoopUnroll(int n, char [] s)
{
    char [] u;
    for (int i = 0; i<n; ++i) {
        char [] nstr= (i>9 ? ""~ cast(char)('0'+i/10) : "") ~ cast(char)('0' + i%10);
        
        int last = 0;
        for (int j = 0; j<s.length; ++j) {
            if (s[j]=='@') {
                u ~= s[last..j] ~ nstr;
                last = j+1;
            }
        }
        if (last<s.length) u = u ~ s[last..$];
        
    }
    return u;    
}
unittest
{
assert(indexedLoopUnroll(3, "@*23;")=="0*23;1*23;2*23;");
}

public:
    
alias uint BigDigit; // A Bignum is an array of BigDigits. Usually the machine word size.
    
// Limits for when to switch between multiplication algorithms.
enum : int { KARATSUBALIMIT = 18 }; // Minimum value for which Karatsuba is worthwhile.
enum : int { KARATSUBASQUARELIMIT=26 }; // Minimum value for which square Karatsuba is worthwhile
    
/** Multi-byte addition or subtraction
 *    dest[#] = src1[#] + src2[#] + carry (0 or 1).
 * or dest[#] = src1[#] - src2[#] - carry (0 or 1).
 * Returns carry or borrow (0 or 1).
 * Set op == '+' for addition, '-' for subtraction.
 */
uint multibyteAddSub(char op)(uint[] dest, uint [] src1, uint [] src2, uint carry)
{
    // Timing:
    // Pentium M: 2.25/int
    // P6 family, Core2 have a partial flags stall when reading the carry flag in
    // an ADC, SBB operation after an operation such as INC or DEC which
    // modifies some, but not all, flags. We avoid this by storing carry into
    // a resister (AL), and restoring it after the branch.
    
    enum { LASTPARAM = 4*4 } // 3* pushes + return address.
    asm {
        naked;
        push EDI;
        push EBX;
        push ESI;
        mov ECX, [ESP + LASTPARAM + 4*4]; // dest.length;
        mov EDX, [ESP + LASTPARAM + 3*4]; // src1.ptr
        mov ESI, [ESP + LASTPARAM + 1*4]; // src2.ptr
        mov EDI, [ESP + LASTPARAM + 5*4]; // dest.ptr
             // Carry is in EAX
        // Count UP to zero (from -len) to minimize loop overhead.
        lea EDX, [EDX + 4*ECX]; // EDX = end of src1.
        lea ESI, [ESI + 4*ECX]; // EBP = end of src2.        
        lea EDI, [EDI + 4*ECX]; // EDI = end of dest.

        neg ECX;
        add ECX, 8;
        jb L2;  // if length < 8 , bypass the unrolled loop.
L_unrolled:
        shr AL, 1; // get carry from EAX
    }
        mixin(" asm {"
        ~ indexedLoopUnroll( 8, 
        "mov EAX, [@*4-8*4+EDX+ECX*4];"
        ~ ( op == '+' ? "adc" : "sbb" ) ~ " EAX, [@*4-8*4+ESI+ECX*4];"
        "mov [@*4-8*4+EDI+ECX*4], EAX;")
        ~ "}");
asm {        
        setc AL; // save carry
        add ECX, 8;
        ja L_unrolled;
L2:     // Do the residual 1..7 ints.
   
        sub ECX, 8; 
        jz done;
L_residual: 
        shr AL, 1; // get carry from EAX
    }
        mixin(" asm {"
        ~ indexedLoopUnroll( 1, 
        "mov EAX, [@*4+EDX+ECX*4];"
        ~ ( op == '+' ? "adc" : "sbb" ) ~ " EAX, [@*4+ESI+ECX*4];"
        "mov [@*4+EDI+ECX*4], EAX;") ~ "}");
asm {        
        setc AL; // save carry
        add ECX, 1;
        jnz L_residual;
done:
        and EAX, 1; // make it O or 1.
        pop ESI;
        pop EBX;
        pop EDI;
        ret 6*4;
    } 
}

unittest
{
    uint [] a = new uint[40];
    uint [] b = new uint[40];
    uint [] c = new uint[40];
    for (int i=0; i<a.length; ++i)
    {
        if (i&1) a[i]=0x8000_0000 + i;
        else a[i]=i;
        b[i]= 0x8000_0003;
    }
    c[19]=0x3333_3333;
    uint carry = multibyteAddSub!('+')(c[0..18], a[0..18], b[0..18], 0);
    assert(carry==1);
    assert(c[0]==0x8000_0003);
    assert(c[1]==4);
    assert(c[19]==0x3333_3333); // check for overrun
    for (int i=0; i<a.length; ++i)
    {
        a[i]=b[i]=c[i]=0;
    }
    a[8]=0x048D159E;
    b[8]=0x048D159E;
    a[10]=0x1D950C84;
    b[10]=0x1D950C84;
    a[5] =0x44444444;
    carry = multibyteAddSub!('-')(a[0..12], a[0..12], b[0..12], 0);
    assert(a[11]==0);
    for (int i=0; i<10; ++i) if (i!=5) assert(a[i]==0); 
    
    for (int q=3; q<36;++q) {
        for (int i=0; i<a.length; ++i)
        {
            a[i]=b[i]=c[i]=0;
        }    
        a[q-2]=0x040000;
        b[q-2]=0x040000;
       carry = multibyteAddSub!('-')(a[0..q], a[0..q], b[0..q], 0);
       assert(a[q-2]==0);
    }
}

/** dest[#] += carry, or dest[#] -= carry.
 *  op must be '+' or '-'
 *  Returns final carry or borrow (0 or 1)
 */
uint multibyteIncrementAssign(char op)(uint[] dest, uint carry)
{
    enum { LASTPARAM = 1*4 } // 0* pushes + return address.
    asm {
        naked;
        mov ECX, [ESP + LASTPARAM + 0*4]; // dest.length;
        mov EDX, [ESP + LASTPARAM + 1*4]; // dest.ptr
        // EAX  = carry
L1: ;
    }
    static if (op=='+')
        asm { add [EDX], EAX; }
    else 
        asm { sub [EDX], EAX; }    
    asm {
        mov EAX, 1;
        jnc L2;
        add EDX, 4;        
        dec ECX;
        jnz L1;
        mov EAX, 2;
L2:     dec EAX;
        ret 2*4;
    }
}
    
/** dest[#] = src[#] << numbits
 *  numbits must be in the range 1..31
 *  Returns the overflow
 */
uint multibyteShlNoMMX(uint [] dest, uint [] src, uint numbits)
{
    // Timing: Optimal for P6 family.
    // 2.0 cycles/int on PPro..PM (limited by execution port p0)
    // 5.0 cycles/int on Athlon, which has 7 cycles for SHLD!!
    enum { LASTPARAM = 4*4 } // 3* pushes + return address.
    asm {
        naked;
        push ESI;
        push EDI;
        push EBX;
        mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
        mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
        mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
        mov ECX, EAX; // numbits;

        mov EAX, [-4+ESI + 4*EBX];
        mov EDX, 0;
        shld EDX, EAX, CL;
        push EDX; // Save return value
        cmp EBX, 1;
        jz L_last;
        mov EDX, [-4+ESI + 4*EBX];
        test EBX, 1;
        jz L_odd;
        sub EBX, 1;        
L_even:
        mov EDX, [-4+ ESI + 4*EBX];
        shld EAX, EDX, CL;
        mov [EDI+4*EBX], EAX;
L_odd:
        mov EAX, [-8+ESI + 4*EBX];
        shld EDX, EAX, CL;
        mov [-4+EDI + 4*EBX], EDX;        
        sub EBX, 2;
        jg L_even;
L_last:
        shl EAX, CL;
        mov [EDI], EAX;
        pop EAX; // pop return value
        pop EBX;
        pop EDI;
        pop ESI;
        ret 4*4;
     }
}

/** dest[#] = src[#] >> numbits
 *  numbits must be in the range 1..31
 * This version uses MMX.
 */
uint multibyteShl(uint [] dest, uint [] src, uint numbits)
{
    // Timing:
    // K7 1.2/int. PM 1.7/int P4 5.3/int
    enum { LASTPARAM = 4*4 } // 3* pushes + return address.

    asm {
        naked;
        push ESI;
        push EDI;
        push EBX;
        mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
        mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
        mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;

        movd MM3, EAX; // numbits = bits to shift left
        xor EAX, 63;
        align   16;
        inc EAX;        
        movd MM4, EAX ; // 64-numbits = bits to shift right
        
        // Get the return value into EAX
        and EAX, 31; // EAX = 32-numbits
        movd MM2, EAX; // 32-numbits
        movd MM1, [ESI+4*EBX-4];                
        psrlq MM1, MM2;
        movd EAX, MM1;  // EAX = return value
        test EBX, 1;
        jz L_even;
L_odd:
        cmp EBX, 1;
        jz L_length1;
    
         // deal with odd lengths
        movq MM1, [ESI+4*EBX-8];
        psrlq MM1, MM2;        
        movd    [EDI +4*EBX-4], MM1;
        sub EBX, 1;
L_even: // It's either singly or doubly even
        movq    MM2, [ESI + 4*EBX - 8];
        psllq   MM2, MM3;
        sub EBX, 2;
        jle L_last;
        movq MM1, MM2;
        add EBX, 2;
        test EBX, 2;
        jz L_onceeven;
        sub EBX, 2;
        
        // MAIN LOOP -- 128 bytes per iteration
 L_twiceeven:      // here MM2 is the carry
        movq    MM0, [ESI + 4*EBX-8];
        psrlq   MM0, MM4;
        movq    MM1, [ESI + 4*EBX-8];
        psllq   MM1, MM3;
        por     MM2, MM0;
        movq    [EDI +4*EBX], MM2;
L_onceeven:        // here MM1 is the carry
        movq    MM0, [ESI + 4*EBX-16];
        psrlq   MM0, MM4;
        movq    MM2, [ESI + 4*EBX-16];
        por     MM1, MM0;
        movq    [EDI +4*EBX-8], MM1;        
        psllq   MM2, MM3;
        sub EBX, 4;
        jg L_twiceeven;
L_last:        
        movq    [EDI +4*EBX], MM2;
L_alldone:        
        emms;  // NOTE: costs 6 cycles on Intel CPUs
        pop EBX;
        pop EDI;
        pop ESI;
        ret 4*4;
        
L_length1:
        // length 1 is a special case
        movd MM1, [ESI];
        psllq MM1, MM3;
        movd [EDI], MM1;
        jmp L_alldone;        
   }
}

void multibyteShr(uint [] dest, uint [] src, uint numbits)
{
    enum { LASTPARAM = 4*4 } // 3* pushes + return address.
    asm {
        naked;
        push ESI;
        push EDI;
        push EBX;
        mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
        mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
align 16;
        mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
        lea EDI, [EDI + 4*EBX]; // EDI = end of dest
        lea ESI, [ESI + 4*EBX]; // ESI = end of src
        neg EBX;                // count UP to zero.
        
        movd MM3, EAX; // numbits = bits to shift right
        xor EAX, 63;
        inc EAX;        
        movd MM4, EAX ; // 64-numbits = bits to shift left
        
        test EBX, 1;
        jz L_even;
L_odd:
         // deal with odd lengths
        and EAX, 31; // EAX = 32-numbits
        movd MM2, EAX; // 32-numbits
        cmp EBX, -1;
        jz L_length1;
    
        movq MM0, [ESI+4*EBX];
        psrlq MM0, MM3;        
        movd    [EDI +4*EBX], MM0;
        add EBX, 1;        
L_even: 
        movq    MM2, [ESI + 4*EBX];
        psrlq   MM2, MM3;
        
        movq MM1, MM2;
        add EBX, 4;
        cmp EBX, -2+4;        
        jz L_last;
        // It's either singly or doubly even
        sub EBX, 2;
        test EBX, 2;
        jnz L_onceeven;
        add EBX, 2;
        
        // MAIN LOOP -- 128 bytes per iteration
 L_twiceeven:      // here MM2 is the carry
        movq    MM0, [ESI + 4*EBX-8];
        psllq   MM0, MM4;
        movq    MM1, [ESI + 4*EBX-8];
        psrlq   MM1, MM3;
        por     MM2, MM0;
        movq    [EDI +4*EBX-16], MM2;
L_onceeven:        // here MM1 is the carry
        movq    MM0, [ESI + 4*EBX];
        psllq   MM0, MM4;
        movq    MM2, [ESI + 4*EBX];
        por     MM1, MM0;
        movq    [EDI +4*EBX-8], MM1;        
        psrlq   MM2, MM3;
        add EBX, 4;
        jl L_twiceeven;
L_last:     
        movq    [EDI +4*EBX-16], MM2;
L_alldone:        
        emms;  // NOTE: costs 6 cycles on Intel CPUs
        pop EBX;
        pop EDI;
        pop ESI;
        ret 4*4;
        
L_length1:
        // length 1 is a special case
        movd MM1, [ESI+4*EBX];
        psrlq MM1, MM3;        
        movd    [EDI +4*EBX], MM1;
        jmp L_alldone;        

   }
}

/** dest[#] = src[#] >> numbits
 *  numbits must be in the range 1..31
 */
void multibyteShrNoMMX(uint [] dest, uint [] src, uint numbits)
{
    // Timing: Optimal for P6 family.
    // 2.0 cycles/int on PPro..PM (limited by execution port p0)
    // Terrible performance on AMD64, which has 7 cycles for SHRD!!
    enum { LASTPARAM = 4*4 } // 3* pushes + return address.
    asm {
        naked;
        push ESI;
        push EDI;
        push EBX;
        mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
        mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
        mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
        mov ECX, EAX; // numbits;

        lea EDI, [EDI + 4*EBX]; // EDI = end of dest
        lea ESI, [ESI + 4*EBX]; // ESI = end of src
        neg EBX;                // count UP to zero.
        mov EAX, [ESI + 4*EBX];
        cmp EBX, -1;
        jz L_last;
        mov EDX, [ESI + 4*EBX];
        test EBX, 1;
        jz L_odd;
        add EBX, 1;        
L_even:
        mov EDX, [ ESI + 4*EBX];
        shrd EAX, EDX, CL;
        mov [-4 + EDI+4*EBX], EAX;
L_odd:
        mov EAX, [4 + ESI + 4*EBX];
        shrd EDX, EAX, CL;
        mov [EDI + 4*EBX], EDX;        
        add EBX, 2;
        jl L_even;
L_last:
        shr EAX, CL;
        mov [-4 + EDI], EAX;
        
        pop EBX;
        pop EDI;
        pop ESI;
        ret 4*4;
     }
}

unittest
{

    uint [] aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
    multibyteShr(aa[0..$-1], aa, 4);
    assert(aa[0] == 0x6122_2222 && aa[1]==0xA455_5555 
        && aa[2]==0xD899_9999 && aa[3]==0x0BCC_CCCC);

    aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
    multibyteShr(aa[2..$-1], aa[2..$-1], 4);
    assert(aa[0] == 0x1222_2223 && aa[1]==0x4555_5556
        && aa[2]==0xD899_9999 && aa[3]==0x0BCC_CCCC);

    aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
    multibyteShr(aa[0..$-2], aa, 4);
    assert(aa[1]==0xA455_5555 && aa[2]==0x0899_9999);
    assert(aa[0]==0x6122_2222);
    assert(aa[3]==0xBCCC_CCCD);


    aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
    uint r = multibyteShl(aa[2..4], aa[2..4], 4);
    assert(aa[0] == 0xF0FF_FFFF && aa[1]==0x1222_2223
        && aa[2]==0x5555_5560 && aa[3]==0x9999_99A4 && aa[4]==0xBCCC_CCCD);
    assert(r==8);
        
    aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
    r = multibyteShl(aa[1..4], aa[1..4], 4);    
    assert(aa[0] == 0xF0FF_FFFF
        && aa[2]==0x5555_5561);    
        assert(aa[3]==0x9999_99A4 && aa[4]==0xBCCC_CCCD);
    assert(r==8);
        assert(aa[1]==0x2222_2230);
        
    aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
    r = multibyteShl(aa[0..4], aa[1..5], 31);    
}

/** dest[#] = src[#] * multiplier + carry.
 * Returns carry.
 */
uint multibyteMul(uint[] dest, uint[] src, uint multiplier, uint carry)
{
    // Timing: definitely not optimal.
    // Pentium M: 5.0 cycles/operation, has 3 resource stalls/iteration
    // Fastest implementation found was 4.6 cycles/op, but not worth the complexity.

    enum { LASTPARAM = 4*4 } // 4* pushes + return address.
    // We'll use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
    // when initializing variables to zero.
    version(D_PIC)
    {
        enum { zero = 0 }
    }
    else
    {
        static int zero = 0;
    }
    asm {
        naked;      
        push ESI;
        push EDI;
        push EBX;
        
        mov EDI, [ESP + LASTPARAM + 4*4]; // dest.ptr
        mov EBX, [ESP + LASTPARAM + 4*3]; // dest.length
        mov ESI, [ESP + LASTPARAM + 4*2];  // src.ptr
        align 16;
        lea EDI, [EDI + 4*EBX]; // EDI = end of dest
        lea ESI, [ESI + 4*EBX]; // ESI = end of src
        mov ECX, EAX; // [carry]; -- last param is in EAX.
        neg EBX;                // count UP to zero.
        test EBX, 1;
        jnz L_odd;
        add EBX, 1;
 L1:
        mov EAX, [-4 + ESI + 4*EBX];
        mul int ptr [ESP+LASTPARAM]; //[multiplier];
        add EAX, ECX;
        mov ECX, zero;
        mov [-4+EDI + 4*EBX], EAX;
        adc ECX, EDX;
L_odd:        
        mov EAX, [ESI + 4*EBX];  // p2
        mul int ptr [ESP+LASTPARAM]; //[multiplier]; // p0*3, 
        add EAX, ECX;
        mov ECX, zero;
        adc ECX, EDX;
        mov [EDI + 4*EBX], EAX;
        add EBX, 2;
        jl L1;
        
        mov EAX, ECX; // get final carry

        pop EBX;
        pop EDI;
        pop ESI;
        ret 5*4;
     }
}

unittest
{
    uint [] aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
    multibyteMul(aa[1..4], aa[1..4], 16, 0);
    assert(aa[0] == 0xF0FF_FFFF && aa[1] == 0x2222_2230 && aa[2]==0x5555_5561 && aa[3]==0x9999_99A4 && aa[4]==0x0BCCC_CCCD);
}

// The inner multiply-and-add loop, together with the Even entry point.
// Multiples by M_ADDRESS which should be "ESP+LASTPARAM" or "ESP". OP must be "add" or "sub"
// This is the most time-critical code in the BigInt library.
// It is used by both MulAdd, multiplyAccumulate, and triangleAccumulate
char [] asmMulAdd_innerloop(char [] OP, char [] M_ADDRESS) {
    // The bottlenecks in this code are extremely complicated. The MUL, ADD, and ADC
    // need 4 cycles on each of the ALUs units p0 and p1. So we use memory load 
    // (unit p2) for initializing registers to zero.
    // There are also dependencies between the instructions, and we run up against the
    // ROB-read limit (can only read 2 registers per cycle).
    // We also need the number of uops in the loop to be a multiple of 3.
    // The only available execution unit for this is p3 (memory write). Unfortunately we can't do that
	// if Position-Independent Code is required.
	
	// Register usage
    // ESI = end of src
    // EDI = end of dest
    // EBX = index. Counts up to zero (in steps of 2).
    // EDX:EAX = scratch, used in multiply.
    // ECX = carry1.
    // EBP = carry2.
	// ESP = points to the multiplier.
	
	// The first member of 'dest' which will be modified is [EDI+4*EBX].
	// EAX must already contain the first member of 'src', [ESI+4*EBX].

version(D_PIC) { bool using_PIC = true; } else { bool using_PIC=false; }
return "asm {
        // Entry point for even length
        add EBX, 1;
        mov EBP, ECX; // carry
        
        mul int ptr [" ~ M_ADDRESS ~ "]; // M
        mov ECX, 0;
 
        add EBP, EAX;
        mov EAX, [ESI+4*EBX];
        adc ECX, EDX;

        mul int ptr [" ~ M_ADDRESS ~ "]; // M
        " ~ OP ~ " [-4+EDI+4*EBX], EBP;
        mov EBP, zero;
    
        adc ECX, EAX;
        mov EAX, [4+ESI+4*EBX];
    
        adc EBP, EDX;    
        add EBX, 2;
        jnl L_done;
L1:
        mul int ptr [" ~ M_ADDRESS ~ "];
        " ~ OP ~ " [-8+EDI+4*EBX], ECX; 
        adc EBP, EAX;
        mov ECX, zero;
        mov EAX, [ESI+4*EBX];        
        adc ECX, EDX;
		" ~
    (using_PIC ? "" : "   mov storagenop, EDX; ") // make #uops in loop a multiple of 3, can't do this in PIC mode.
	~ "
        mul int ptr [" ~ M_ADDRESS ~ "];
        " ~ OP ~ " [-4+EDI+4*EBX], EBP;
        mov EBP, zero;
    
        adc ECX, EAX;
        mov EAX, [4+ESI+4*EBX];
    
        adc EBP, EDX;    
        add EBX, 2;
        jl L1;
L_done: " ~ OP ~ " [-8+EDI+4*EBX], ECX;
        adc EBP, 0;
		}";
		// final carry is now in EBP
}

char [] asmMulAdd_enter_odd(char [] OP, char [] M_ADDRESS) {
return "asm {
        mul int ptr [" ~M_ADDRESS ~"];
        mov EBP, zero;   
        add ECX, EAX;
        mov EAX, [4+ESI+4*EBX];
    
        adc EBP, EDX;    
        add EBX, 2;
        jl L1;
        jmp L_done;
		}";
}
		


/**
 * dest[#] += src[#] * multiplier OP carry(0..FFFF_FFFF).
 * where op == '+' or '-'
 * Returns carry out of MSB (0..FFFF_FFFF).
 */
uint multibyteMulAdd(char op)(uint [] dest, uint[] src, uint multiplier, uint carry)
{
    // Timing: This is the most time-critical bignum function.
    // Pentium M: 5.4 cycles/operation, still has 2 resource stalls + 1load block/iteration
    
    // The main loop is pipelined and unrolled by 2, 
    //   so entry to the loop is also complicated.
    
    // Register usage
    // EDX:EAX = multiply
    // EBX = counter
    // ECX = carry1
    // EBP = carry2
    // EDI = dest
    // ESI = src
    
    const char [] OP = (op=='+')? "add" : "sub";
    version(D_PIC) {
        enum { zero = 0 }
    } else {
        // use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
        // when initializing registers to zero.
        static int zero = 0;
        // use p3/p4 units 
        static int storagenop; // write-only
    }
    
    enum { LASTPARAM = 5*4 } // 4* pushes + return address.
    asm {
        naked;
        
        push ESI;
        push EDI;
        push EBX;
        push EBP;
        mov EDI, [ESP + LASTPARAM + 4*4]; // dest.ptr
        mov EBX, [ESP + LASTPARAM + 4*3]; // dest.length
        align 16;
        nop;
        mov ESI, [ESP + LASTPARAM + 4*2];  // src.ptr
        lea EDI, [EDI + 4*EBX]; // EDI = end of dest
        lea ESI, [ESI + 4*EBX]; // ESI = end of src
        mov EBP, 0;
        mov ECX, EAX; // ECX = input carry.
        neg EBX;                // count UP to zero.
        mov EAX, [ESI+4*EBX];
        test EBX, 1;
        jnz L_enter_odd;
}
		// Main loop, with entry point for even length
mixin(asmMulAdd_innerloop(OP, "ESP+LASTPARAM")); 
asm {
        mov EAX, EBP; // get final carry
        pop EBP;
        pop EBX;
        pop EDI;
        pop ESI;
        ret 5*4;
}        
L_enter_odd:
    mixin(asmMulAdd_enter_odd(OP, "ESP+LASTPARAM"));
}

unittest {
    
    uint [] aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
    uint [] bb = [0x1234_1234, 0xF0F0_F0F0, 0x00C0_C0C0, 0xF0F0_F0F0, 0xC0C0_C0C0];
    multibyteMulAdd!('+')(bb[1..$-1], aa[1..$-2], 16, 5);
    assert(bb[0] == 0x1234_1234 && bb[4] == 0xC0C0_C0C0);
    assert(bb[1] == 0x2222_2230 + 0xF0F0_F0F0+5 && bb[2] == 0x5555_5561+0x00C0_C0C0+1
        && bb[3] == 0x9999_99A4+0xF0F0_F0F0 );
}

/** 
   Sets result[#] = result[0..left.length] + left[#] * right[#]
   
   It is defined in this way to allow cache-efficient multiplication.
   This function is equivalent to:
    ----
    for (int i = 0; i< right.length; ++i) {
        dest[left.length + i] = multibyteMulAdd(dest[i..left.length+i],
                left, right[i], 0);
    }
    ----
 */
void multibyteMultiplyAccumulate(uint [] dest, uint[] left, uint [] right)
{
    // Register usage
    // EDX:EAX = used in multiply
    // EBX = index
    // ECX = carry1
    // EBP = carry2
    // EDI = end of dest for this pass through the loop. Index for outer loop.
    // ESI = end of left. never changes
    // [ESP] = M = right[i] = multiplier for this pass through the loop.
    // right.length is changed into dest.ptr+dest.length
    version(D_PIC) {
        enum { zero = 0 }
    } else {
        // use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
        // when initializing registers to zero.
        static int zero = 0;
        // use p3/p4 units 
        static int storagenop; // write-only
    }
    
    enum { LASTPARAM = 6*4 } // 4* pushes + local + return address.
    asm {
        naked;
          
        push ESI;
        push EDI;
        align 16;
        push EBX;
        push EBP;
        push EAX;    // local variable M
        mov EDI, [ESP + LASTPARAM + 4*5]; // dest.ptr
        mov EBX, [ESP + LASTPARAM + 4*2]; // left.length
        mov ESI, [ESP + LASTPARAM + 4*3];  // left.ptr
        lea EDI, [EDI + 4*EBX]; // EDI = end of dest for first pass
        
        mov EAX, [ESP + LASTPARAM + 4*0]; // right.length
        lea EAX, [EDI + 4*EAX];
        mov [ESP + LASTPARAM + 4*0], EAX; // last value for EDI       

        lea ESI, [ESI + 4*EBX]; // ESI = end of left
        mov EAX, [ESP + LASTPARAM + 4*1]; // right.ptr
        mov EAX, [EAX];
        mov [ESP], EAX; // M
outer_loop:        
        mov EBP, 0;
        mov ECX, 0; // ECX = input carry.
        neg EBX;                // count UP to zero.
        mov EAX, [ESI+4*EBX];
        test EBX, 1;
        jnz L_enter_odd;
		}
        // -- Inner loop, with even entry point
        mixin(asmMulAdd_innerloop("add", "ESP"));
asm {
        mov [-4+EDI+4*EBX], EBP;
        add EDI, 4;
        cmp EDI, [ESP + LASTPARAM + 4*0]; // is EDI = &dest[$]?
        jz outer_done;
        mov EAX, [ESP + LASTPARAM + 4*1]; // right.ptr
        mov EAX, [EAX+4];                 // get new M
        mov [ESP], EAX;                   // save new M
        add int ptr [ESP + LASTPARAM + 4*1], 4; // right.ptr
        mov EBX, [ESP + LASTPARAM + 4*2]; // left.length
        jmp outer_loop;
outer_done:        
        pop EAX;
        pop EBP;
        pop EBX;
        pop EDI;
        pop ESI;
        ret 6*4;
}
L_enter_odd:
    mixin(asmMulAdd_enter_odd("add", "ESP"));
}

/**  dest[#] /= divisor.
 * overflow is the initial remainder, and must be in the range 0..divisor-1.
 * divisor must not be a power of 2 (use right shift for that case;
 * A division by zero will occur if divisor is a power of 2).
 * Returns the final remainder
 *
 * Based on public domain code by Eric Bainville. 
 * (http://www.bealto.com/) Used with permission.
 */
uint multibyteDivAssign(uint [] dest, uint divisor, uint overflow)
{
    // Timing: limited by a horrible dependency chain.
    // Pentium M: 18 cycles/op, 8 resource stalls/op.
    // EAX, EDX = scratch, used by MUL
    // EDI = dest
    // CL = shift
    // ESI = quotient
    // EBX = remainderhi
    // EBP = remainderlo
    // [ESP-4] = mask
    // [ESP] = kinv (2^64 /divisor)
    enum { LASTPARAM = 5*4 } // 4* pushes + return address.
    enum { LOCALS = 2*4} // MASK, KINV
    asm {
        naked;
        
        push ESI;
        push EDI;
        push EBX;
        push EBP;
        
        mov EDI, [ESP + LASTPARAM + 4*2]; // dest.ptr
        mov EBX, [ESP + LASTPARAM + 4*1]; // dest.length

        // Loop from msb to lsb
        lea     EDI, [EDI + 4*EBX];        
        mov EBP, EAX; // rem is the input remainder, in 0..divisor-1
        // Build the pseudo-inverse of divisor k: 2^64/k
        // First determine the shift in ecx to get the max number of bits in kinv
        xor     ECX, ECX;
        mov     EAX, [ESP + LASTPARAM]; //divisor;
        mov     EDX, 1;
kinv1:
        inc     ECX;
        ror     EDX, 1;
        shl     EAX, 1;
        jnc     kinv1;
        dec     ECX;
        // Here, ecx is a left shift moving the msb of k to bit 32
        
        mov     EAX, 1;
        shl     EAX, CL;
        dec     EAX;
        ror     EAX, CL ; //ecx bits at msb
        push    EAX; // MASK        
        
        // Then divide 2^(32+cx) by divisor (edx already ok)
        xor     EAX, EAX;
        div     int ptr [ESP + LASTPARAM +  LOCALS-4*1]; //divisor;
        push    EAX; // kinv        
        align   16;
L2:
        // Get 32 bits of quotient approx, multiplying
        // most significant word of (rem*2^32+input)
        mov     EAX, [ESP+4]; //MASK;
        and     EAX, [EDI - 4];
        or      EAX, EBP;
        rol     EAX, CL;
        mov     EBX, EBP;
        mov     EBP, [EDI - 4];
        mul     int ptr [ESP]; //KINV;
                
        shl     EAX, 1;
        rcl     EDX, 1;
        
        // Multiply by k and subtract to get remainder
        // Subtraction must be done on two words
        mov     EAX, EDX;
        mov     ESI, EDX; // quot = high word
        mul     int ptr [ESP + LASTPARAM+LOCALS]; //divisor;
        sub     EBP, EAX;
        sbb     EBX, EDX;   
        jz      Lb;  // high word is 0, goto adjust on single word

        // Adjust quotient and remainder on two words
Ld:     inc     ESI;
        sub     EBP, [ESP + LASTPARAM+LOCALS]; //divisor;
        sbb     EBX, 0;
        jnz     Ld;
        
        // Adjust quotient and remainder on single word
Lb:     cmp     EBP, [ESP + LASTPARAM+LOCALS]; //divisor;
        jc      Lc; // rem in 0..divisor-1, OK        
        sub     EBP, [ESP + LASTPARAM+LOCALS]; //divisor;
        inc     ESI;
        jmp     Lb;
        
        // Store result
Lc:
        mov     [EDI - 4], ESI;
        lea     EDI, [EDI - 4];
        dec     int ptr [ESP + LASTPARAM + 4*1+LOCALS]; // len
        jnz    L2;
        
        pop EAX; // discard kinv
        pop EAX; // discard mask
        
        mov     EAX, EBP; // return final remainder
        pop     EBP;
        pop     EBX;
        pop     EDI;
        pop     ESI;        
        ret     3*4;
    }
}

unittest {
    uint [] aa = new uint[101];
    for (int i=0; i<aa.length; ++i) aa[i] = 0x8765_4321 * (i+3);
    uint overflow = multibyteMul(aa, aa, 0x8EFD_FCFB, 0x33FF_7461);
    uint r = multibyteDivAssign(aa, 0x8EFD_FCFB, overflow);
    for (int i=0; i<aa.length-1; ++i) assert(aa[i] == 0x8765_4321 * (i+3));
    assert(r==0x33FF_7461);
}

// Set dest[2*i..2*i+1]+=src[i]*src[i]
void multibyteAddDiagonalSquares(uint [] dest, uint [] src)
{
    /* Unlike mulAdd, the carry is only 1 bit, 
	   since FFFF*FFFF+FFFF_FFFF = 1_0000_0000.
	   Note also that on the last iteration, no carry can occur.
	   As for multibyteAdd, we save & restore carry flag through the loop.
	   
	   The timing is entirely dictated by the dependency chain. We could
	   improve it by moving the mov EAX after the adc [EDI], EAX. Probably not worthwhile.
    */
    enum { LASTPARAM = 4*5 } // 4* pushes + return address.
 asm {
    naked;
    push ESI;
    push EDI;
    push EBX;
	push ECX;
    mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
    mov EBX, [ESP + LASTPARAM + 4*0]; //src.length;
    mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
    lea EDI, [EDI + 8*EBX];      // EDI = end of dest
    lea ESI, [ESI + 4*EBX];      // ESI = end of src
    neg EBX;                     // count UP to zero.
    xor ECX, ECX;             // initial carry = 0.
L1:
    mov EAX, [ESI + 4*EBX];
    mul EAX, EAX;
    shr CL, 1;                 // get carry
    adc [EDI + 8*EBX], EAX;
    adc [EDI + 8*EBX + 4], EDX;
    setc CL;                   // save carry
    inc EBX;
    jnz L1;
	
	pop ECX;
	pop EBX;
	pop EDI;
	pop ESI;
	ret 4*4;
 }
}

unittest {
    uint [] aa = new uint[13];
	uint [] bb = new uint[6];
    for (int i=0; i<aa.length; ++i) aa[i] = 0x8000_0000;
    for (int i=0; i<bb.length; ++i) bb[i] = i;
	aa[$-1]= 7;
    multibyteAddDiagonalSquares(aa[0..$-1], bb);
	assert(aa[$-1]==7);
	for (int i=0; i<bb.length; ++i) { assert(aa[2*i]==0x8000_0000+i*i); assert(aa[2*i+1]==0x8000_0000); }
}

void multibyteTriangleAccumulateD(uint[] dest, uint[] x)
{
    for (int i = 0; i < x.length-3; ++i) {
        dest[i+x.length] = multibyteMulAdd!('+')(
             dest[i+i+1 .. i+x.length], x[i+1..$], x[i], 0);
    }
    ulong c = cast(ulong)(x[$-3]) * x[$-2] + dest[$-5];
    dest[$-5] = cast(uint)c;
    c >>= 32;
    c += cast(ulong)(x[$-3]) * x[$-1] + dest[$-4];
    dest[$-4] = cast(uint)c;
    c >>= 32;
length2:	
    c += cast(ulong)(x[$-2]) * x[$-1];
	dest[$-3] = cast(uint)c;
	c >>= 32;
	dest[$-2] = cast(uint)c;
}

//dest += src[0]*src[1...$] + src[1]*src[2..$] + ... + src[$-3]*src[$-2..$]+ src[$-2]*src[$-1]
// assert(dest.length = src.length*2);
// assert(src.length >= 3);
void multibyteTriangleAccumulateAsm(uint[] dest, uint[] src)
{
    // Register usage
    // EDX:EAX = used in multiply
    // EBX = index
    // ECX = carry1
    // EBP = carry2
    // EDI = end of dest for this pass through the loop. Index for outer loop.
    // ESI = end of src. never changes
    // [ESP] = M = src[i] = multiplier for this pass through the loop.
    // dest.length is changed into dest.ptr+dest.length
    version(D_PIC) {
        enum { zero = 0 }
    } else {
        // use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
        // when initializing registers to zero.
        static int zero = 0;
        // use p3/p4 units 
        static int storagenop; // write-only
    }

    enum { LASTPARAM = 6*4 } // 4* pushes + local + return address.
    asm {
        naked;

        push ESI;
        push EDI;
        align 16;
        push EBX;
        push EBP;
        push EAX;    // local variable M= src[i]
        mov EDI, [ESP + LASTPARAM + 4*3]; // dest.ptr
        mov EBX, [ESP + LASTPARAM + 4*0]; // src.length
        mov ESI, [ESP + LASTPARAM + 4*1];  // src.ptr

        lea ESI, [ESI + 4*EBX]; // ESI = end of left
        add int ptr [ESP + LASTPARAM + 4*1], 4; // src.ptr, used for getting M

        // local variable [ESP + LASTPARAM + 4*2] = last value for EDI
        lea EDI, [EDI + 4*EBX]; // EDI = end of dest for first pass

        lea EAX, [EDI + 4*EBX-3*4]; // up to src.length - 3
        mov [ESP + LASTPARAM + 4*2], EAX; // last value for EDI  = &dest[src.length*2 -3]     		

        cmp EBX, 3;
        jz length_is_3;
        
        // We start at src[1], not src[0].		    
        dec EBX;
        mov [ESP + LASTPARAM + 4*0], EBX;

outer_loop:        
        mov EBX, [ESP + LASTPARAM + 4*0]; // src.length
        mov EBP, 0;
        mov ECX, 0; // ECX = input carry.
        dec [ESP + LASTPARAM + 4*0]; // Next time, the length will be shorter by 1.
        neg EBX;                // count UP to zero.

        mov EAX, [ESI + 4*EBX - 4*1]; // get new M
        mov [ESP], EAX;                   // save new M

        mov EAX, [ESI+4*EBX];
        test EBX, 1;
        jnz L_enter_odd;
    }
    // -- Inner loop, with even entry point
    mixin(asmMulAdd_innerloop("add", "ESP"));
    asm {
        mov [-4+EDI+4*EBX], EBP;
        add EDI, 4;
        cmp EDI, [ESP + LASTPARAM + 4*2]; // is EDI = &dest[$-3]?
        jnz outer_loop;        
length_is_3:
        mov EAX, [ESI - 4*3];
        mul EAX, [ESI - 4*2];
        mov ECX, 0;
        add [EDI-2*4], EAX;  // ECX:dest[$-5] += x[$-3] * x[$-2]        
        adc ECX, EDX;

        mov EAX, [ESI - 4*3];
        mul EAX, [ESI - 4*1]; // x[$-3] * x[$-1]
        add EAX, ECX;
        mov ECX, 0;
        adc EDX, 0;
        // now EDX: EAX = c + x[$-3] * x[$-1]
        add [EDI-1*4], EAX; // ECX:dest[$-4] += (EDX:EAX)
        adc ECX, EDX;  //  ECX holds dest[$-3], it acts as carry for the last row
// do length==2 
        mov EAX, [ESI - 4*2];
        mul EAX, [ESI - 4*1];
        add ECX, EAX;
        adc EDX, 0;        
        mov [EDI - 0*4], ECX; // dest[$-2:$-3] = c + x[$-2] * x[$-1];
        mov [EDI + 1*4], EDX;

        pop EAX;
        pop EBP;
        pop EBX;
        pop EDI;
        pop ESI;
        ret 4*4;
    }
    L_enter_odd:
        mixin(asmMulAdd_enter_odd("add", "ESP"));
}

unittest {
uint [] aa = new uint[200];
   uint [] a  = aa[0..100];
   uint [] b  = new uint [100];
   aa[] = 761;
   a[] = 0;
   b[] = 0;
   a[3] = 6;
   b[0]=1;
   b[1] = 17;
   b[50..100]=78;
   multibyteTriangleAccumulateAsm(a, b[0..50]);
   uint [] c = new uint[100];
   c[] = 0;
   c[1] = 17;
   c[3] = 6;
   assert(a[]==c[]);
   assert(a[0]==0);
   aa[] = 0xFFFF_FFFF;   
   a[] = 0;
   b[] = 0;
   b[0]= 0xbf6a1f01;
   b[1]=  0x6e38ed64;
   b[2]=  0xdaa797ed;
   b[3] = 0;
   
   multibyteTriangleAccumulateAsm(a[0..8], b[0..4]);
   assert(a[1]==0x3a600964);
   assert(a[2]==0x339974f6);
   assert(a[3]==0x46736fce);
   assert(a[4]==0x5e24a2b4);
   
   b[3] = 0xe93ff9f4;
   b[4] = 0x184f03;
   a[]=0;
   multibyteTriangleAccumulateAsm(a[0..14], b[0..7]);
   assert(a[3]==0x79fff5c2);
   assert(a[4]==0xcf384241);
   assert(a[5]== 0x4a17fc8);
   assert(a[6]==0x4d549025);
}


void multibyteSquare(BigDigit[] result, BigDigit [] x)
{
    if (x.length < 4) {
        // Special cases, not worth doing triangular.
        result[x.length] = multibyteMul(result[0..x.length], x, x[0], 0);   
        multibyteMultiplyAccumulate(result[1..$], x, x[1..$]);
        return;
    }
    //  Do half a square multiply.
    //  dest += src[0]*src[1...$] + src[1]*src[2..$] + ... + src[$-3]*src[$-2..$]+ src[$-2]*src[$-1]
    result[x.length] = multibyteMul!('+')(result[1 .. x.length], x[1..$], x[0], 0);
    multibyteTriangleAccumulateAsm(result[2..$], x[1..$]);
    // Multiply by 2 
    result[$-1] = multibyteShlNoMMX(result[1..$-1], result[1..$-1], 1);
    // And add the diagonal elements
    result[0] = 0;
    multibyteAddDiagonalSquares(result, x);
}

version(TangoPerformanceTest) {
import tango.stdc.stdio;
int clock() { asm { push EBX; xor EAX, EAX; cpuid; pop EBX; rdtsc; } }

uint [2200] X1;
uint [2200] Y1;
uint [4000] Z1;

void testPerformance()
{
    // The performance results at the top of this file were obtained using
    // a Windows device driver to access the CPU performance counters.
    // The code below is less accurate but more widely usable.
    // The value for division is quite inconsistent.
    for (int i=0; i<X1.length; ++i) { X1[i]=i; Y1[i]=i; Z1[i]=i; }
    int t, t0;    
    multibyteShl(Z1[0..2000], X1[0..2000], 7);
    t0 = clock();
    multibyteShl(Z1[0..1000], X1[0..1000], 7);
    t = clock();
    multibyteShl(Z1[0..2000], X1[0..2000], 7);
    auto shltime = (clock() - t) - (t - t0);
    t0 = clock();
    multibyteShr(Z1[2..1002], X1[4..1004], 13);
    t = clock();
    multibyteShr(Z1[2..2002], X1[4..2004], 13);
    auto shrtime = (clock() - t) - (t - t0);
    t0 = clock();
    multibyteAddSub!('+')(Z1[0..1000], X1[0..1000], Y1[0..1000], 0);
    t = clock();
    multibyteAddSub!('+')(Z1[0..2000], X1[0..2000], Y1[0..2000], 0);
    auto addtime = (clock() - t) - (t-t0);
    t0 = clock();
    multibyteMul(Z1[0..1000], X1[0..1000], 7, 0);
    t = clock();
    multibyteMul(Z1[0..2000], X1[0..2000], 7, 0);
    auto multime = (clock() - t) - (t - t0);
    multibyteMulAdd!('+')(Z1[0..2000], X1[0..2000], 217, 0);
    t0 = clock();
    multibyteMulAdd!('+')(Z1[0..1000], X1[0..1000], 217, 0);
    t = clock();
    multibyteMulAdd!('+')(Z1[0..2000], X1[0..2000], 217, 0);
    auto muladdtime = (clock() - t) - (t - t0);        
    multibyteMultiplyAccumulate(Z1[0..64], X1[0..32], Y1[0..32]);
    t = clock();
    multibyteMultiplyAccumulate(Z1[0..64], X1[0..32], Y1[0..32]);
    auto accumtime = clock() - t;
    t0 = clock();
    multibyteDivAssign(Z1[0..2000], 217, 0);
    t = clock();
    multibyteDivAssign(Z1[0..1000], 37, 0);
    auto divtime = (t - t0) - (clock() - t);
	t= clock();
    multibyteSquare(Z1[0..64], X1[0..32]);
    auto squaretime = clock() - t;
    
    printf("-- BigInt asm performance (cycles/int) --\n");    
    printf("Add:        %.2f\n", addtime/1000.0);
    printf("Shl:        %.2f\n", shltime/1000.0);
    printf("Shr:        %.2f\n", shrtime/1000.0);
    printf("Mul:        %.2f\n", multime/1000.0);
    printf("MulAdd:     %.2f\n", muladdtime/1000.0);
    printf("Div:        %.2f\n", divtime/1000.0);
    printf("MulAccum32: %.2f*n*n (total %d)\n", accumtime/(32.0*32.0), accumtime);
    printf("Square32: %.2f*n*n (total %d)\n\n", squaretime/(32.0*32.0), squaretime);
}

static this()
{
    testPerformance();
}
}

} // version(D_InlineAsm_X86)