Commit 6bc97fb7 authored by Michael Hamburg's avatar Michael Hamburg

need an include/arch_*/arch_intrinsics.h for other arches

parent 233f8453
......@@ -146,10 +146,14 @@ COMPONENTS_OF_$(1) = $$(BUILD_OBJ)/$(1)_impl.o $$(BUILD_OBJ)/$(1)_arithmetic.o
LIBCOMPONENTS += $$(COMPONENTS_OF_$(1))
$$(BUILD_ASM)/$(1)_arithmetic.s: src/$(1)/f_arithmetic.c $$(HEADERS)
$$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -I $(BUILD_H)/$(1) -I $(BUILD_H)/$(1)/$(2) -S -c -o $$@ $$<
$$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -I $(BUILD_H)/$(1) \
-I $(BUILD_H)/$(1)/$(2) -I src/include/$(2) \
-S -c -o $$@ $$<
$$(BUILD_ASM)/$(1)_impl.s: src/$(1)/$(2)/f_impl.c $$(HEADERS)
$$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -I $(BUILD_H)/$(1) -I $(BUILD_H)/$(1)/$(2) -S -c -o $$@ $$<
$$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -I $(BUILD_H)/$(1) \
-I $(BUILD_H)/$(1)/$(2) -I src/include/$(2) \
-S -c -o $$@ $$<
endef
################################################################
......@@ -166,18 +170,18 @@ $$(BUILD_C)/decaf_tables_$(1).c: $$(BUILD_IBIN)/decaf_gen_tables_$(1)
$$(BUILD_ASM)/decaf_tables_$(1).s: $$(BUILD_C)/decaf_tables_$(1).c $$(HEADERS)
$$(CC) $$(CFLAGS) -S -c -o $$@ $$< \
-I src/curve_$(1)/ -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) \
-I src/curve_$(1)/ -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) -I src/include/$$(ARCH_FOR_$(2)) \
-I $(BUILD_H)/curve_$(1) -I $(BUILD_H)/$(2) -I $(BUILD_H)/$(2)/$$(ARCH_FOR_$(2))
$$(BUILD_ASM)/decaf_gen_tables_$(1).s: src/decaf_gen_tables.c $$(HEADERS)
$$(CC) $$(CFLAGS) \
-I src/curve_$(1) -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) \
-I src/curve_$(1) -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) -I src/include/$$(ARCH_FOR_$(2)) \
-I $(BUILD_H)/curve_$(1) -I $(BUILD_H)/$(2) -I $(BUILD_H)/$(2)/$$(ARCH_FOR_$(2)) \
-S -c -o $$@ $$<
$$(BUILD_ASM)/decaf_$(1).s: src/decaf.c $$(HEADERS)
$$(CC) $$(CFLAGS) \
-I src/curve_$(1)/ -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) \
-I src/curve_$(1)/ -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) -I src/include/$$(ARCH_FOR_$(2)) \
-I $(BUILD_H)/curve_$(1) -I $(BUILD_H)/$(2) -I $(BUILD_H)/$(2)/$$(ARCH_FOR_$(2)) \
-S -c -o $$@ $$<
......
......@@ -9,6 +9,7 @@
#define _XOPEN_SOURCE 600
#include "arch_config.h"
#include "arch_intrinsics.h"
#include <decaf/common.h>
......@@ -32,7 +33,6 @@
#endif
#if (WORD_BITS == 64)
typedef uint32_t hword_t;
typedef uint64_t word_t, mask_t;
typedef __uint128_t dword_t;
typedef int32_t hsword_t;
......@@ -50,7 +50,6 @@
#define letohWORD letoh64
#define SC_LIMB(x) (x##ull)
#elif (WORD_BITS == 32)
typedef uint16_t hword_t;
typedef uint32_t word_t, mask_t;
typedef uint64_t dword_t;
typedef int16_t hsword_t;
......
......@@ -16,12 +16,7 @@ static __inline__ uint64_t is_zero(uint64_t a) {
return (((__uint128_t)a)-1)>>64;
}
void
gf_25519_mul (
gf_25519_t __restrict__ cs,
const gf_25519_t as,
const gf_25519_t bs
) {
void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1);
uint64_t bh[4];
......@@ -51,12 +46,7 @@ gf_25519_mul (
c[1] += accum;
}
void
gf_25519_mulw (
gf_25519_t __restrict__ cs,
const gf_25519_t as,
uint64_t b
) {
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
int i;
......@@ -78,18 +68,11 @@ gf_25519_mulw (
c[1] += accum;
}
void
gf_25519_sqr (
gf_25519_t __restrict__ cs,
const gf_25519_t as
) {
gf_25519_mul(cs,as,as); // PERF
void gf_sqr (gf_s *__restrict__ cs, const gf as) {
gf_mul(cs,as,as); // PERF
}
void
gf_25519_strong_reduce (
gf_25519_t a
) {
void gf_strong_reduce (gf a) {
uint64_t mask = (1ull<<51)-1;
/* first, clear high */
......@@ -127,15 +110,11 @@ gf_25519_strong_reduce (
assert(is_zero(carry + scarry));
}
void
gf_25519_serialize (
uint8_t serial[32],
const struct gf_25519_t x
) {
void gf_serialize (uint8_t serial[32], const struct gf x) {
int i,j;
gf_25519_t red;
gf_25519_copy(&red, x);
gf_25519_t trong_reduce(&red);
gf red;
gf_copy(&red, x);
gf_strong_reduce(&red);
uint64_t *r = red.limb;
uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12};
for (i=0; i<4; i++) {
......@@ -146,11 +125,7 @@ gf_25519_serialize (
}
}
mask_t
gf_25519_deserialize (
gf_25519_t x,
const uint8_t serial[32]
) {
mask_t gf_deserialize (gf x, const uint8_t serial[32]) {
int i,j;
uint64_t ser64[4], mask = ((1ull<<51)-1);
for (i=0; i<4; i++) {
......
......@@ -3,18 +3,8 @@
*/
#include "f_field.h"
#include "x86-64-arith.h"
static inline uint64_t shr(__uint128_t x, int n) {
return x>>n;
}
void
gf_25519_mul (
gf_25519_s *__restrict__ cs,
const gf_25519_t as,
const gf_25519_t bs
) {
void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1);
uint64_t *c = cs->limb;
......@@ -48,12 +38,12 @@ gf_25519_mul (
mac_rm(&accum2, ai, &b[3]);
uint64_t c0 = accum0 & mask;
accum1 += shr(accum0, 51);
accum1 += shrld(accum0, 51);
uint64_t c1 = accum1 & mask;
accum2 += shr(accum1, 51);
accum2 += shrld(accum1, 51);
c[2] = accum2 & mask;
accum0 = shr(accum2, 51);
accum0 = shrld(accum2, 51);
mac_rm(&accum0, ai, &b[4]);
......@@ -77,7 +67,7 @@ gf_25519_mul (
mac_rm(&accum1, ai, &b[0]);
c[3] = accum0 & mask;
accum1 += shr(accum0, 51);
accum1 += shrld(accum0, 51);
c[4] = accum1 & mask;
/* 2^102 * 16 * 5 * 19 * (1+ep) >> 64
......@@ -85,17 +75,13 @@ gf_25519_mul (
* PERF: good enough to fit into uint64_t?
*/
uint64_t a1 = shr(accum1,51);
uint64_t a1 = shrld(accum1,51);
accum1 = (__uint128_t)a1 * 19 + c0;
c[0] = accum1 & mask;
c[1] = c1 + shr(accum1,51);
c[1] = c1 + shrld(accum1,51);
}
void
gf_25519_sqr (
gf_25519_s *__restrict__ cs,
const gf_25519_t as
) {
void gf_sqr (gf_s *__restrict__ cs, const gf as) {
const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
uint64_t *c = cs->limb;
......@@ -122,9 +108,9 @@ gf_25519_sqr (
mac_rm(&accum2, ai, &a[4]);
uint64_t c0 = accum0 & mask;
accum1 += shr(accum0, 51);
accum1 += shrld(accum0, 51);
uint64_t c1 = accum1 & mask;
accum2 += shr(accum1, 51);
accum2 += shrld(accum1, 51);
c[2] = accum2 & mask;
accum0 = accum2 >> 51;
......@@ -141,7 +127,7 @@ gf_25519_sqr (
mac_rr(&accum1, a[2], a[2]);
c[3] = accum0 & mask;
accum1 += shr(accum0, 51);
accum1 += shrld(accum0, 51);
c[4] = accum1 & mask;
/* 2^102 * 16 * 5 * 19 * (1+ep) >> 64
......@@ -149,51 +135,43 @@ gf_25519_sqr (
* PERF: good enough to fit into uint64_t?
*/
uint64_t a1 = shr(accum1,51);
uint64_t a1 = shrld(accum1,51);
accum1 = (__uint128_t)a1 * 19 + c0;
c[0] = accum1 & mask;
c[1] = c1 + shr(accum1,51);
c[1] = c1 + shrld(accum1,51);
}
void
gf_25519_mulw (
gf_25519_s *__restrict__ cs,
const gf_25519_t as,
uint64_t b
) {
void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
uint64_t *c = cs->limb;
__uint128_t accum = widemul_rm(b, &a[0]);
uint64_t c0 = accum & mask;
accum = shr(accum,51);
accum = shrld(accum,51);
mac_rm(&accum, b, &a[1]);
uint64_t c1 = accum & mask;
accum = shr(accum,51);
accum = shrld(accum,51);
mac_rm(&accum, b, &a[2]);
c[2] = accum & mask;
accum = shr(accum,51);
accum = shrld(accum,51);
mac_rm(&accum, b, &a[3]);
c[3] = accum & mask;
accum = shr(accum,51);
accum = shrld(accum,51);
mac_rm(&accum, b, &a[4]);
c[4] = accum & mask;
accum = shr(accum,51);
accum = shrld(accum,51);
accum = accum * 19 + c0;
c[0] = accum & mask;
c[1] = c1 + shr(accum,51);
c[1] = c1 + shrld(accum,51);
}
void
gf_25519_strong_reduce (
gf_25519_t a
) {
void gf_strong_reduce (gf a) {
uint64_t mask = (1ull<<51)-1;
/* first, clear high */
......@@ -231,15 +209,11 @@ gf_25519_strong_reduce (
assert(is_zero(carry + scarry));
}
void
gf_25519_serialize (
uint8_t serial[32],
const gf_25519_t x
) {
void gf_serialize (uint8_t serial[32], const gf x) {
int i,j;
gf_25519_t red;
gf_25519_copy(red, x);
gf_25519_strong_reduce(red);
gf red;
gf_copy(red, x);
gf_strong_reduce(red);
uint64_t *r = red->limb;
uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12};
for (i=0; i<4; i++) {
......@@ -250,11 +224,7 @@ gf_25519_serialize (
}
}
mask_t
gf_25519_deserialize (
gf_25519_t x,
const uint8_t serial[32]
) {
mask_t gf_deserialize (gf x, const uint8_t serial[32]) {
int i,j;
uint64_t ser64[4], mask = ((1ull<<51)-1);
for (i=0; i<4; i++) {
......
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/
#ifndef __X86_64_ARITH_H__
#define __X86_64_ARITH_H__
#include <stdint.h>
/* TODO: non x86-64 versions of these.
* FUTURE: autogenerate
*/
/* Full 64x64 -> 128-bit unsigned multiply of *a and *b (both memory operands).
 * Returns d:c where d is the high 64 bits of the product and c the low.
 * Two paths: legacy mulq, or BMI2 mulx when available; mulx does not modify
 * the flags, which is why the BMI2 path clobbers only rdx and not "cc".
 */
static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) {
#ifndef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rax;"
"mulq %[b];" /* rdx:rax = rax * (*b) */
: [c]"=a"(c), [d]"=d"(d)
: [b]"m"(*b), [a]"m"(*a)
: "cc");
return (((__uint128_t)(d))<<64) | c;
#else
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx;"
"mulx %[b], %[c], %[d];" /* d:c = rdx * (*b), flags untouched */
: [c]"=r"(c), [d]"=r"(d)
: [b]"m"(*b), [a]"m"(*a)
: "rdx");
return (((__uint128_t)(d))<<64) | c;
#endif
}
/* Full 64x64 -> 128-bit unsigned multiply, "rm" variant: a arrives in a
 * register (constraint "r", or preloaded into rdx via "d" on the BMI2 path),
 * b is read from memory.  Returns the 128-bit product d:c.
 */
static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) {
#ifndef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rax;"
"mulq %[b];" /* rdx:rax = a * (*b) */
: [c]"=a"(c), [d]"=d"(d)
: [b]"m"(*b), [a]"r"(a)
: "cc");
return (((__uint128_t)(d))<<64) | c;
#else
uint64_t c,d;
__asm__ volatile
("mulx %[b], %[c], %[d];" /* implicit source is rdx, where "d"(a) puts a */
: [c]"=r"(c), [d]"=r"(d)
: [b]"m"(*b), [a]"d"(a));
return (((__uint128_t)(d))<<64) | c;
#endif
}
/* Full 64x64 -> 128-bit unsigned multiply, "rr" variant: both operands in
 * registers.  On the non-BMI2 path a is pinned to rax (input "a"(a), legal
 * because rax is also the [c] output, consumed before it is written).
 */
static __inline__ __uint128_t widemul_rr(uint64_t a, uint64_t b) {
#ifndef __BMI2__
uint64_t c,d;
__asm__ volatile
("mulq %[b];" /* rdx:rax = a * b */
: [c]"=a"(c), [d]"=d"(d)
: [b]"r"(b), "a"(a)
: "cc");
return (((__uint128_t)(d))<<64) | c;
#else
uint64_t c,d;
__asm__ volatile
("mulx %[b], %[c], %[d];" /* implicit source is rdx, where "d"(a) puts a */
: [c]"=r"(c), [d]"=r"(d)
: [b]"r"(b), [a]"d"(a));
return (((__uint128_t)(d))<<64) | c;
#endif
}
/* 128-bit product of (2 * *a) and *b.  The doubling is done in 64 bits
 * (addq rax,rax / leaq (,rdx,2)), so any carry out of 2*a is dropped —
 * assumes *a < 2^63; callers in the field arithmetic keep limbs well below
 * that (NOTE(review): confirm against limb bounds at call sites).
 */
static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) {
#ifndef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rax; "
"addq %%rax, %%rax; " /* rax = 2*a (mod 2^64) */
"mulq %[b];"
: [c]"=a"(c), [d]"=d"(d)
: [b]"m"(*b), [a]"m"(*a)
: "cc");
return (((__uint128_t)(d))<<64) | c;
#else
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx;"
"leaq (,%%rdx,2), %%rdx;" /* rdx = 2*a without touching flags */
"mulx %[b], %[c], %[d];"
: [c]"=r"(c), [d]"=r"(d)
: [b]"m"(*b), [a]"m"(*a)
: "rdx");
return (((__uint128_t)(d))<<64) | c;
#endif
}
/* Multiply-accumulate: *acc += (*a) * (*b), with *acc treated as a 128-bit
 * value split into lo/hi halves.  The 128-bit add is done with addq/adcq so
 * the carry propagates from the low to the high half.
 */
static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx; "
"mulx %[b], %[c], %[d]; " /* d:c = (*a) * (*b) */
"addq %[c], %[lo]; "
"adcq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"mulq %[b]; " /* rdx:rax = (*a) * (*b) */
"addq %%rax, %[lo]; "
"adcq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo;
}
/* Double multiply-accumulate: computes the product (*a) * (*b) once and adds
 * it into BOTH 128-bit accumulators: *acc += p and *acc2 += p.
 */
static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) {
uint64_t lo = *acc, hi = *acc>>64;
uint64_t lo2 = *acc2, hi2 = *acc2>>64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx; "
"mulx %[b], %[c], %[d]; " /* d:c = (*a) * (*b) */
"addq %[c], %[lo]; "
"adcq %[d], %[hi]; "
"addq %[c], %[lo2]; "
"adcq %[d], %[hi2]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"mulq %[b]; " /* rdx:rax = (*a) * (*b) */
"addq %%rax, %[lo]; "
"adcq %%rdx, %[hi]; "
"addq %%rax, %[lo2]; "
"adcq %%rdx, %[hi2]; "
: [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
: [b]"m"(*b), [a]"m"(*a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo;
*acc2 = (((__uint128_t)(hi2))<<64) | lo2;
}
/* Multiply-accumulate, "rm" variant: *acc += a * (*b), with a in a register
 * (preloaded into rdx via "d"(a) on the BMI2 path) and b read from memory.
 */
static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) {
uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("mulx %[b], %[c], %[d]; " /* d:c = a * (*b); implicit source rdx = a */
"addq %[c], %[lo]; "
"adcq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"d"(a)
: "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"mulq %[b]; " /* rdx:rax = a * (*b) */
"addq %%rax, %[lo]; "
"adcq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"r"(a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo;
}
/* Multiply-accumulate, "rr" variant: *acc += a * b with both multiplicands in
 * registers; *acc is a 128-bit accumulator split into lo/hi halves.
 */
static __inline__ void mac_rr(__uint128_t *acc, uint64_t a, const uint64_t b) {
    uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    uint64_t c,d;
    __asm__ volatile
        ("mulx %[b], %[c], %[d]; " /* d:c = a * b; implicit source rdx = a */
         "addq %[c], %[lo]; "
         "adcq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"r"(b), [a]"d"(a)
         : "cc");
#else
    /* FIX: the previous version passed a as an "a" (rax) input while also
     * listing "rax" in the clobber list; GCC forbids an operand register
     * overlapping a clobber.  Tie a to rax as an in/out operand instead —
     * the local copy of a is dead after the mulq, so clobbering it is fine.
     */
    __asm__ volatile
        ("mulq %[b]; " /* rdx:rax = a * b (a is in rax via "+a") */
         "addq %%rax, %[lo]; "
         "adcq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi), [a]"+a"(a)
         : [b]"r"(b)
         : "rdx", "cc");
#endif
    *acc = (((__uint128_t)(hi))<<64) | lo;
}
/* Multiply-accumulate with doubling: *acc += (2 * *a) * (*b).  The doubling
 * is a 64-bit add (addq rax,rax / addq rdx,rdx), so the carry out of 2*a is
 * dropped — assumes *a < 2^63 (NOTE(review): confirm against limb bounds).
 */
static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx; "
"addq %%rdx, %%rdx; " /* rdx = 2*a (mod 2^64) */
"mulx %[b], %[c], %[d]; "
"addq %[c], %[lo]; "
"adcq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"addq %%rax, %%rax; " /* rax = 2*a (mod 2^64) */
"mulq %[b]; "
"addq %%rax, %[lo]; "
"adcq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo;
}
/* Multiply-subtract: *acc -= (*a) * (*b), with the 128-bit subtraction done
 * via subq/sbbq so the borrow propagates from the low to the high half.
 */
static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx; "
"mulx %[b], %[c], %[d]; " /* d:c = (*a) * (*b) */
"subq %[c], %[lo]; "
"sbbq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"mulq %[b]; " /* rdx:rax = (*a) * (*b) */
"subq %%rax, %[lo]; "
"sbbq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo;
}
/* Multiply-subtract with doubling: *acc -= (2 * *a) * (*b).  As with mac2,
 * the doubling is a 64-bit add, so it assumes *a < 2^63 (NOTE(review):
 * confirm against limb bounds at call sites).
 */
static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx; "
"addq %%rdx, %%rdx; " /* rdx = 2*a (mod 2^64) */
"mulx %[b], %[c], %[d]; "
"subq %[c], %[lo]; "
"sbbq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"addq %%rax, %%rax; " /* rax = 2*a (mod 2^64) */
"mulq %[b]; "
"subq %%rax, %[lo]; "
"sbbq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo;
}
/* Multiply-reverse-subtract: *acc = (*a) * (*b) - *acc.  Note the operand
 * order — the product minus the old accumulator, the reverse of msb.
 */
static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
    uint64_t c, d, lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    /* lo/hi are pure inputs here; the earlyclobber "=&r" on c/d keeps the
     * allocator from reusing their registers before subq/sbbq read them. */
    __asm__ volatile
        ("movq %[a], %%rdx; "
         "mulx %[b], %[c], %[d]; " /* d:c = (*a) * (*b) */
         "subq %[lo], %[c]; "
         "sbbq %[hi], %[d]; "
         : [c]"=&r"(c), [d]"=&r"(d)
         : [b]"m"(*b), [a]"m"(*a), [lo]"r"(lo), [hi]"r"(hi)
         : "rdx", "cc");
#else
    /* FIX: this helper previously had no non-BMI2 fallback (unlike mac, msb,
     * etc.), so it failed to assemble on targets without mulx. */
    __asm__ volatile
        ("movq %[a], %%rax; "
         "mulq %[b]; " /* rdx:rax = (*a) * (*b) */
         "subq %[lo], %%rax; "
         "sbbq %[hi], %%rdx; "
         : [c]"=&a"(c), [d]"=&d"(d)
         : [b]"m"(*b), [a]"m"(*a), [lo]"r"(lo), [hi]"r"(hi)
         : "cc");
#endif
    *acc = (((__uint128_t)(d))<<64) | c;
}
/* Full 64x64 -> 128-bit unsigned multiply, letting the compiler choose the
 * instruction (portable counterpart of the asm widemul variants above).
 */
static __inline__ __uint128_t widemulu(uint64_t a, uint64_t b) {
__uint128_t wide_a = a;
return wide_a * (__uint128_t)b;
}
/* Full signed 64x64 -> 128-bit multiply; no overflow is possible because the
 * widening happens before the multiplication.
 */
static __inline__ __int128_t widemuls(int64_t a, int64_t b) {
__int128_t product = a;
product *= b;
return product;
}
/* Launder x through an empty asm so the optimizer must treat the value as
 * unknown — blocks constant folding and value tracking without emitting any
 * instructions.
 */
static __inline__ uint64_t opacify(uint64_t x) {
uint64_t opaque = x;
__asm__ volatile("" : "+r"(opaque));
return opaque;
}
/* Branchless zero test: returns an all-ones mask when x == 0, else 0.
 * neg sets CF iff x was nonzero; sbb x,x then smears CF into every bit
 * (x becomes -CF); the final ~ inverts that into the is-zero mask.
 */
static __inline__ mask_t is_zero(uint64_t x) {
__asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x));
return ~x;
}
#endif /* __X86_64_ARITH_H__ */
......@@ -4,28 +4,20 @@
#include "f_field.h"
static inline mask_t __attribute__((always_inline))
is_zero (
word_t x
) {
static inline mask_t is_zero (word_t x) {
dword_t xx = x;
xx--;
return xx >> WORD_BITS;
}
static uint64_t widemul_32 (
static uint64_t widemul (
const uint32_t a,
const uint32_t b
) {
return ((uint64_t)a)* b;
}
void
gf_448_mul (
gf_448_s *__restrict__ cs,
const gf_448_t as,
const gf_448_t bs
) {
void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
const uint32_t *a = as->limb, *b = bs->limb;
uint32_t *c = cs->limb;
......@@ -44,9 +36,9 @@ gf_448_mul (
accum2 = 0;
for (i=0; i<=j; i++) {
accum2 += widemul_32(a[j-i],b[i]);
accum1 += widemul_32(aa[j-i],bb[i]);
accum0 += widemul_32(a[8+j-i], b[8+i]);
accum2 += widemul(a[j-i],b[i]);
accum1 += widemul(aa[j-i],bb[i]);
accum0 += widemul(a[8+j-i], b[8+i]);
}
accum1 -= accum2;
......@@ -54,9 +46,9 @@ gf_448_mul (
accum2 = 0;
for (; i<8; i++) {
accum0 -= widemul_32(a[8+j-i], b[i]);
accum2 += widemul_32(aa[8+j-i], bb[i]);
accum1 += widemul_32(a[16+j-i], b[8+i]);
accum0 -= widemul(a[8+j-i], b[i]);
accum2 += widemul(aa[8+j-i], bb[i]);
accum1 += widemul(a[16+j-i], b[8+i]);
}
accum1 += accum2;
......@@ -81,12 +73,7 @@ gf_448_mul (
c[1] += ((uint32_t)(accum1));
}
void
gf_448_mulw (
gf_448_s *__restrict__ cs,