Op Mon, 23 Apr 2012 20:36:48 +0200 schreef David T. Ashley

<(E-Mail Removed)>:

> On Fri, 20 Apr 2012 00:57:28 +0200, "Boudewijn Dijkstra"

> <(E-Mail Removed)> wrote:

>

>> Op Wed, 18 Apr 2012 16:22:10 +0200 schreef David T. Ashley

>> <(E-Mail Removed)>:

>>> I needed to multiply U128 = U96 * U32 on an ARM the other day, and did

>>> a hack job in C. It is logically correct, but it probably takes 3-10

>>> times as long as assembly-language.

>>

>> Unlikely. With Cortex-M3 and the IAR compiler I came to the following

>> results:

>> * C-only: 63 cycles

>> * C-with-asm: 50 cycles

>> * asm-only: 39 cycles

>>

>> In how many cycles does your version run?

>

> Can you post your C code that led to 63-cycle execution?
Note that I did not perform exhaustive testing on the below code.

#include <inttypes.h>

/* 96-bit unsigned integer, little-endian word order (w0 = least significant). */
typedef struct uint96_s {
    uint32_t w0;
    uint32_t w1;
    uint32_t w2;
} uint96_t;

/* 128-bit unsigned integer, little-endian word order (w0 = least significant). */
typedef struct uint128_s {
    uint32_t w0;
    uint32_t w1;
    uint32_t w2;
    uint32_t w3;
} uint128_t;

/*
 * umul_96x32c: portable C implementation of *r = *a * b, where a is an
 * unsigned 96-bit value, b an unsigned 32-bit value, and r the full
 * unsigned 128-bit product.
 *
 * Schoolbook word-by-word multiply: each 32x32 -> 64 partial product is
 * folded into a 64-bit accumulator whose high half carries into the next
 * word.  The accumulator cannot overflow: carry (< 2^32) plus a partial
 * product (<= (2^32-1)^2) stays below 2^64.
 */
void umul_96x32c(uint96_t *a, uint32_t b, uint128_t *r)
{
    const uint32_t word[3] = { a->w0, a->w1, a->w2 };
    uint32_t out[4];
    uint64_t acc = 0;
    int i;

    for (i = 0; i < 3; i++) {
        acc += (uint64_t)word[i] * (uint64_t)b;  /* partial product + carry-in */
        out[i] = (uint32_t)acc;                  /* low 32 bits -> result word */
        acc >>= 32;                              /* carry into next word       */
    }
    out[3] = (uint32_t)acc;                      /* final carry is the top word */

    r->w0 = out[0];
    r->w1 = out[1];
    r->w2 = out[2];
    r->w3 = out[3];
}

> And your C with ASM?
The obvious thing to do is to add manual ADC instructions.

#pragma inline=never

/*
 * umul_96x32i: *r = *a * b (96-bit x 32-bit -> 128-bit, unsigned), C with
 * hand-inserted ARM carry instructions via inline asm (IAR-style).
 *
 * NOTE(review): correctness depends on the compiler (a) emitting an
 * flag-setting ADDS for "r1 = r1a + r1b" and (b) not scheduling any
 * flag-clobbering instruction between that addition and the first ADCS.
 * Neither is guaranteed by the C language -- verify against the generated
 * code for each compiler/optimization level.
 *
 * NOTE(review): the second asm statement reads the carry flag produced by
 * the first, but declares no "cc" dependency -- presumably relies on the
 * compiler not touching flags in between; confirm for the toolchain used.
 */
void umul_96x32i(uint96_t *a, uint32_t b, uint128_t *r)
{
uint64_t p0, p1, p2;               /* 64-bit partial products               */
uint32_t r0, r1a, r1b, r2a, r2b, r3; /* 32-bit halves of the partials       */
uint32_t r1, r2;                   /* 32-bit here: carries handled in asm   */
p0 = (uint64_t)a->w0 * (uint64_t)b;
r0 = p0;                           /* low word of product                   */
r1a = p0 >> 32;                    /* carry into word 1                     */
r->w0 = r0;
p1 = (uint64_t)a->w1 * (uint64_t)b;
r1b = p1;
r2a = p1 >> 32;                    /* carry into word 2                     */
r1 = r1a + r1b;                    /* must compile to ADDS (sets C flag)    */
r->w1 = r1;
p2 = (uint64_t)a->w2 * (uint64_t)b;
r2b = p2;
r3 = p2 >> 32;                     /* provisional top word                  */
/* r2 = r2a + r2b + C (carry from the w1 addition above), setting C again */
__asm("adcs %[Rd],%[Rn],%[Rm]"
: [Rd]"=r"(r2)
: [Rn]"r" (r2a), [Rm]"r" (r2b)
: "cc");
r->w2 = r2;
/* r3 += C (carry out of the w2 addition) */
__asm("adc %[Rd],%[Rn],#0"
: [Rd]"=r"(r3)
: [Rn]"r" (r3));
r->w3 = r3;
}

> And your ASM-only?
The only further possible optimization was register allocation.

;; umul_96x32 -- 128-bit = 96-bit * 32-bit unsigned multiply (ARM/Thumb-2).
;; In:  R0 = pointer to 96-bit source (three little-endian 32-bit words)
;;      R1 = 32-bit multiplier
;;      R2 = pointer to 128-bit destination (four 32-bit words)
;; Carries between result words ride on the C flag: LDR/STR/UMULL do not
;; alter flags, so the ADDS -> ADCS -> ADC chain stays intact.
umul_96x32:
PUSH {R4}            ; R4 is callee-saved; needed as extra scratch
LDR R3,[R0, #+0]     ; R3 = a->w0
UMULL R3,R12,R1,R3   ; R12:R3 = a->w0 * b
STR R3,[R2, #+0]     ; r->w0 = low word
LDR R3,[R0, #+4]     ; R3 = a->w1
UMULL R3,R4,R1,R3    ; R4:R3 = a->w1 * b
ADDS R12,R3,R12      ; w1 = low(p1) + high(p0), sets carry
STR R12,[R2, #+4]    ; r->w1
LDR R0,[R0, #+8]     ; R0 = a->w2 (source pointer no longer needed)
UMULL R0,R1,R1,R0    ; R1:R0 = a->w2 * b
ADCS R3,R4,R0        ; w2 = high(p1) + low(p2) + carry, sets carry
STR R3,[R2, #+8]     ; r->w2
ADC R0,R1,#0         ; w3 = high(p2) + final carry
STR R0,[R2, #+12]    ; r->w3
POP {R4}
BX LR ;; return

--

Gemaakt met Opera's revolutionaire e-mailprogramma:

http://www.opera.com/mail/
(Remove the obvious prefix to reply privately.)