Commit 42a561d0 authored by Michael Hamburg's avatar Michael Hamburg

some accel in for curve25519

parent 2705bd26
......@@ -19,7 +19,7 @@ ASM ?= $(CC)
DECAF ?= decaf_fast
ifneq (,$(findstring x86_64,$(MACHINE)))
ARCH ?= arch_ref64
ARCH ?= arch_x86_64
else
# no i386 port yet
ARCH ?= arch_ref32
......
......@@ -361,7 +361,7 @@ decaf_bool_t API_NS(scalar_invert) (
}
return ~API_NS(scalar_eq)(out,API_NS(scalar_zero));
#else
decaf_255_scalar_t b, ma;
scalar_t b, ma;
int i;
sc_montmul(b,API_NS(scalar_one),sc_r2);
sc_montmul(ma,a,sc_r2);
......@@ -378,10 +378,10 @@ decaf_bool_t API_NS(scalar_invert) (
}
}
sc_montmul(out,b,decaf_255_scalar_one);
sc_montmul(out,b,API_NS(scalar_one));
API_NS(scalar_destroy)(b);
API_NS(scalar_destroy)(ma);
return ~API_NS(scalar_eq)(out,decaf_255_scalar_zero);
return ~API_NS(scalar_eq)(out,API_NS(scalar_zero));
#endif
}
......
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/
#include "p25519.h"
#include "x86-64-arith.h"
/* cs = as * bs mod p, p = 2^255 - 19, on 5 x 51-bit limbs (radix 2^51).
 * Schoolbook product with interleaved reduction: since 2^255 == 19 (mod p),
 * partial products a[i]*b[j] with i+j >= 5 are folded back in through the
 * precomputed table bh[i] = 19*b[i+1].  The output is only weakly reduced:
 * c[1] may carry one extra bit above 51.
 */
void
p255_mul (
p255_t *__restrict__ cs,
const p255_t *as,
const p255_t *bs
) {
const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1);
uint64_t *c = cs->limb;
uint64_t bh[4];
int i;
/* bh[i] = 19 * b[i+1]: pre-scaled high limbs of b for the wrap-around terms. */
for (i=0; i<4; i++) bh[i] = b[i+1] * 19;
__uint128_t accum0, accum1, accum2;
/* Columns 0..2 of the product (wrap-around terms come via bh[]). */
uint64_t ai = a[0];
accum0 = widemul_rm(ai, &b[0]);
accum1 = widemul_rm(ai, &b[1]);
accum2 = widemul_rm(ai, &b[2]);
ai = a[1];
mac_rm(&accum0, ai, &bh[3]); /* 19 * a1*b4 */
mac_rm(&accum1, ai, &b[0]);
mac_rm(&accum2, ai, &b[1]);
ai = a[2];
mac_rm(&accum0, ai, &bh[2]); /* 19 * a2*b3 */
mac_rm(&accum1, ai, &bh[3]);
mac_rm(&accum2, ai, &b[0]);
ai = a[3];
mac_rm(&accum0, ai, &bh[1]);
mac_rm(&accum1, ai, &bh[2]);
mac_rm(&accum2, ai, &bh[3]);
ai = a[4];
mac_rm(&accum0, ai, &bh[0]);
mac_rm(&accum1, ai, &bh[1]);
mac_rm(&accum2, ai, &bh[2]);
/* Carry-propagate columns 0..2; c[0] and c[1] are held back in c0/c1
 * until the final fold of the top carry.
 */
uint64_t c0 = accum0 & mask;
accum1 += accum0 >> 51;
uint64_t c1 = accum1 & mask;
accum2 += accum1 >> 51;
c[2] = accum2 & mask;
/* Columns 3..4, seeded with the carry out of column 2. */
accum0 = accum2 >> 51;
ai = a[0];
mac_rm(&accum0, ai, &b[3]);
accum1 = widemul_rm(ai, &b[4]);
ai = a[1];
mac_rm(&accum0, ai, &b[2]);
mac_rm(&accum1, ai, &b[3]);
ai = a[2];
mac_rm(&accum0, ai, &b[1]);
mac_rm(&accum1, ai, &b[2]);
ai = a[3];
mac_rm(&accum0, ai, &b[0]);
mac_rm(&accum1, ai, &b[1]);
ai = a[4];
mac_rm(&accum0, ai, &bh[3]); /* 19 * a4*b4 */
mac_rm(&accum1, ai, &b[0]);
c[3] = accum0 & mask;
accum1 += accum0 >> 51;
c[4] = accum1 & mask;
/* 2^102 * 16 * 5 * 19 * (1+ep) >> 64
 * = 2^(-13 + <13)
 * PERF: good enough to fit into uint64_t?
 */
/* Fold the carry off the top back into limb 0 (times 19), then limb 1. */
uint64_t a1 = accum1>>51;
accum1 = (__uint128_t)a1 * 19 + c0;
c[0] = accum1 & mask;
c[1] = c1 + (accum1>>51);
}
/* cs = as * b for a single 64-bit scalar b, mod p = 2^255 - 19.
 * Output is weakly reduced: c[1] may carry one extra bit above 51.
 */
void
p255_mulw (
p255_t *__restrict__ cs,
const p255_t *as,
uint64_t b
) {
const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
int i;
uint64_t *c = cs->limb;
__uint128_t accum = 0;
/* Multiply limb by limb, keeping 51 bits per limb and carrying the rest. */
for (i=0; i<5; i++) {
mac_rm(&accum, b, &a[i]);
c[i] = accum & mask;
accum >>= 51;
}
/* PERF: parallelize? eh well this is reference */
/* The carry off the top wraps around as *19, since 2^255 == 19 (mod p). */
accum *= 19;
accum += c[0];
c[0] = accum & mask;
accum >>= 51;
assert(accum < mask); /* the remaining carry is small; one add into limb 1 suffices */
c[1] += accum;
}
/* cs = as^2 mod p = 2^255 - 19.  Same column structure as p255_mul, but
 * symmetric cross terms a_i*a_j (i != j) are computed once and doubled.
 * Output is weakly reduced: c[1] may carry one extra bit above 51.
 */
void
p255_sqr (
p255_t *__restrict__ cs,
const p255_t *as
) {
const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
uint64_t *c = cs->limb;
uint64_t ah[4];
int i;
/* ah[i] = 19 * a[i+1]: pre-scaled high limbs for the wrap-around terms. */
for (i=0; i<4; i++) ah[i] = a[i+1] * 19;
__uint128_t accum0, accum1, accum2;
/* Columns 0..2. */
uint64_t ai = a[0];
accum0 = widemul_rr(ai, ai);
ai *= 2;
accum1 = widemul_rm(ai, &a[1]);
accum2 = widemul_rm(ai, &a[2]);
ai = a[1];
mac_rr(&accum2, ai, ai);
ai *= 2;
mac_rm(&accum0, ai, &ah[3]); /* 19 * 2*a1*a4 */
ai = a[2] * 2;
mac_rm(&accum0, ai, &ah[2]); /* 19 * 2*a2*a3 */
mac_rm(&accum1, ai, &ah[3]);
ai = a[3];
mac_rm(&accum1, ai, &ah[2]); /* 19 * a3^2 */
ai *= 2;
mac_rm(&accum2, ai, &ah[3]);
/* Carry-propagate columns 0..2; c[0]/c[1] held back for the final fold. */
uint64_t c0 = accum0 & mask;
accum1 += accum0 >> 51;
uint64_t c1 = accum1 & mask;
accum2 += accum1 >> 51;
c[2] = accum2 & mask;
/* Columns 3..4, seeded with the carry out of column 2. */
accum0 = accum2 >> 51;
ai = a[0]*2;
mac_rm(&accum0, ai, &a[3]);
accum1 = widemul_rm(ai, &a[4]);
ai = a[1]*2;
mac_rm(&accum0, ai, &a[2]);
mac_rm(&accum1, ai, &a[3]);
mac_rm(&accum0, a[4], &ah[3]); /* 19 * a4^2 */
mac_rr(&accum1, a[2], a[2]);
c[3] = accum0 & mask;
accum1 += accum0 >> 51;
c[4] = accum1 & mask;
/* 2^102 * 16 * 5 * 19 * (1+ep) >> 64
 * = 2^(-13 + <13)
 * PERF: good enough to fit into uint64_t?
 */
/* Fold the carry off the top back into limb 0 (times 19), then limb 1. */
uint64_t a1 = accum1>>51;
accum1 = (__uint128_t)a1 * 19 + c0;
c[0] = accum1 & mask;
c[1] = c1 + (accum1>>51);
}
/* Fully reduce a into [0, p), p = 2^255 - 19, without data-dependent branches.
 * Accepts weakly-reduced input (stray carry bits above 51 in limb 4).
 */
void
p255_strong_reduce (
p255_t *a
) {
uint64_t mask = (1ull<<51)-1;
/* first, clear high */
a->limb[0] += (a->limb[4]>>51)*19;
a->limb[4] &= mask;
/* now the total is less than 2p */
/* compute total_value - p. No need to reduce mod p. */
/* p in 51-bit limbs is (mask-18, mask, mask, mask, mask). */
__int128_t scarry = 0;
int i;
for (i=0; i<5; i++) {
scarry = scarry + a->limb[i] - ((i==0)?mask-18:mask);
a->limb[i] = scarry & mask;
scarry >>= 51;
}
/* uncommon case: it was >= p, so now scarry = 0 and this = x
 * common case: it was < p, so now scarry = -1 and this = x - p + 2^255
 * so let's add back in p. will carry back off the top for 2^255.
 */
assert(is_zero(scarry) | is_zero(scarry+1));
uint64_t scarry_mask = scarry & mask;
__uint128_t carry = 0;
/* add it back */
/* scarry_mask is 0 or mask; (mask & ~18) == mask-18, i.e. limb 0 of p,
 * so this adds p exactly when the subtraction above underflowed. */
for (i=0; i<5; i++) {
carry = carry + a->limb[i] + ((i==0)?(scarry_mask&~18):scarry_mask);
a->limb[i] = carry & mask;
carry >>= 51;
}
assert(is_zero(carry + scarry));
}
/* Encode x as its canonical 32-byte little-endian representation. */
void
p255_serialize (
    uint8_t serial[32],
    const struct p255_t *x
) {
    /* Canonicalize first so every field element has a unique encoding. */
    p255_t red;
    p255_copy(&red, x);
    p255_strong_reduce(&red);
    const uint64_t *r = red.limb;

    /* Pack five 51-bit limbs into four 64-bit words. */
    uint64_t words[4] = {
        r[0]     | r[1]<<51,
        r[1]>>13 | r[2]<<38,
        r[2]>>26 | r[3]<<25,
        r[3]>>39 | r[4]<<12
    };

    /* Emit each word little-endian, byte by byte. */
    int w, k;
    for (w=0; w<4; w++) {
        uint64_t v = words[w];
        for (k=0; k<8; k++) {
            serial[8*w+k] = (uint8_t)v;
            v >>= 8;
        }
    }
}
/* Unpack a 32-byte little-endian encoding into 5 x 51-bit limbs.
 * Returns an all-ones mask on success, and zero if the input is not a
 * canonical encoding, i.e. its value is >= p = 2^255 - 19.
 */
mask_t
p255_deserialize (
p255_t *x,
const uint8_t serial[32]
) {
int i,j;
uint64_t ser64[4], mask = ((1ull<<51)-1);
/* Assemble four little-endian 64-bit words from the bytes. */
for (i=0; i<4; i++) {
uint64_t out = 0;
for (j=0; j<8; j++) {
out |= ((uint64_t)serial[8*i+j])<<(8*j);
}
ser64[i] = out;
}
/* Test for >= 2^255-19 */
/* ge stays all-ones only if word 0 >= 2^64-19 (adding 19 carries out)... */
uint64_t ge = -(((__uint128_t)ser64[0]+19)>>64);
ge &= ser64[1]; /* ...words 1 and 2 are all-ones... */
ge &= ser64[2];
ge &= (ser64[3]<<1) + 1; /* ...and the low 63 bits of word 3 are all-ones. */
/* Independently reject anything with bit 255 set (value >= 2^255). */
ge |= -(((__uint128_t)ser64[3]+0x8000000000000000)>>64);
/* Split the 4 x 64-bit words into 5 x 51-bit limbs. */
x->limb[0] = ser64[0] & mask;
x->limb[1] = (ser64[0]>>51 | ser64[1]<<13) & mask;
x->limb[2] = (ser64[1]>>38 | ser64[2]<<26) & mask;
x->limb[3] = (ser64[2]>>25 | ser64[3]<<39) & mask;
x->limb[4] = ser64[3]>>12;
return ~is_zero(~ge);
}
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/
#ifndef __P255_H__
#define __P255_H__ 1
#include <stdint.h>
#include <assert.h>
#include <string.h>
#include "word.h"
/* Field element mod p = 2^255 - 19: five limbs in radix 2^51 (LBITS bits each).
 * Limbs may temporarily exceed 51 bits between reductions.
 */
typedef struct p255_t {
uint64_t limb[5];
} p255_t;
#define LBITS 51
/* Build a field element directly from five pre-split 51-bit limbs. */
#define FIELD_LITERAL(a,b,c,d,e) {{ a,b,c,d,e }}
/*
#define FIELD_LITERAL(a,b,c,d) {{ \
(a##ull) & LMASK, \
((a##ull)>>51 | (b##ull)<<13) & LMASK, \
((b##ull)>>38 | (c##ull)<<26) & LMASK, \
((c##ull)>>25 | (d##ull)<<39) & LMASK, \
(d##ull)>>12 \
}}
*/
#ifdef __cplusplus
extern "C" {
#endif
/* out = a + b, limbwise; no carry propagation. */
static __inline__ void
p255_add_RAW (
p255_t *out,
const p255_t *a,
const p255_t *b
) __attribute__((unused));
/* out = a - b, limbwise, biased by 2p so limbs stay nonnegative. */
static __inline__ void
p255_sub_RAW (
p255_t *out,
const p255_t *a,
const p255_t *b
) __attribute__((unused));
/* Structure copy: out = a. */
static __inline__ void
p255_copy (
p255_t *out,
const p255_t *a
) __attribute__((unused));
/* One carry pass; brings limbs back near 51 bits. */
static __inline__ void
p255_weak_reduce (
p255_t *inout
) __attribute__((unused));
/* Canonical reduction into [0, p). */
void
p255_strong_reduce (
p255_t *inout
);
/* Add amount*2p to change the representative without changing the value. */
static __inline__ void
p255_bias (
p255_t *inout,
int amount
) __attribute__((unused));
/* out = a * b mod p; out must not alias a or b. */
void
p255_mul (
p255_t *__restrict__ out,
const p255_t *a,
const p255_t *b
);
/* out = a * b mod p for a 64-bit scalar b; out must not alias a. */
void
p255_mulw (
p255_t *__restrict__ out,
const p255_t *a,
uint64_t b
);
/* out = a^2 mod p; out must not alias a. */
void
p255_sqr (
p255_t *__restrict__ out,
const p255_t *a
);
/* Canonical 32-byte little-endian encoding of x. */
void
p255_serialize (
uint8_t serial[32],
const struct p255_t *x
);
/* Decode 32 bytes; returns a nonzero mask iff the encoding was canonical. */
mask_t
p255_deserialize (
p255_t *x,
const uint8_t serial[32]
);
/* -------------- Inline functions begin here -------------- */
/* Limbwise sum with no carry propagation; callers must keep headroom. */
void
p255_add_RAW (
    p255_t *out,
    const p255_t *a,
    const p255_t *b
) {
    const uint64_t *x = a->limb, *y = b->limb;
    uint64_t *z = out->limb;
    z[0] = x[0] + y[0];
    z[1] = x[1] + y[1];
    z[2] = x[2] + y[2];
    z[3] = x[3] + y[3];
    z[4] = x[4] + y[4];
}
/* Limbwise a - b, biased by 2p so the raw difference cannot go negative:
 * 2p = (2^52-38, 2^52-2, 2^52-2, 2^52-2, 2^52-2) in 51-bit limbs.
 */
void
p255_sub_RAW (
    p255_t *out,
    const p255_t *a,
    const p255_t *b
) {
    const uint64_t two_p_hi = ((1ull<<51)-1)*2;  /* 2^52 - 2  */
    const uint64_t two_p_lo = two_p_hi - 36;     /* 2^52 - 38 */
    unsigned int k;
    for (k=0; k<5; k++) {
        uint64_t bias = (k==0) ? two_p_lo : two_p_hi;
        out->limb[k] = a->limb[k] - b->limb[k] + bias;
    }
}
/* Plain structure copy: out = a. */
void
p255_copy (
    p255_t *out,
    const p255_t *a
) {
    *out = *a;
}
/* Add amt*2p to a, limbwise, so a following subtraction cannot underflow.
 * Changes the representative but not the value mod p.
 */
void
p255_bias (
p255_t *a,
int amt
) {
/* limb 0 of 2p is 2^52 - 38 */
a->limb[0] += ((uint64_t)(amt)<<52) - 38*amt;
int i;
/* remaining limbs of 2p are 2^52 - 2 */
for (i=1; i<5; i++) {
a->limb[i] += ((uint64_t)(amt)<<52)-2*amt;
}
}
/* One carry pass: bring every limb back to ~51 bits, folding the carry off
 * the top back into limb 0 as *19 (2^255 == 19 mod p).
 */
void
p255_weak_reduce (
p255_t *a
) {
uint64_t mask = (1ull<<51) - 1;
/* Save the top carry before the loop below rewrites limb 4. */
uint64_t tmp = a->limb[4] >> 51;
int i;
/* Walk downward so each limb[i-1] is still unmodified when read. */
for (i=4; i>0; i--) {
a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>51);
}
a->limb[0] = (a->limb[0] & mask) + tmp*19;
}
#ifdef __cplusplus
}; /* extern "C" */
#endif
#endif /* __P255_H__ */
../../p448/arch_x86_64/x86-64-arith.h
\ No newline at end of file
......@@ -53,6 +53,25 @@ static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) {
#endif
}
/* Full 64x64 -> 128-bit product of two register operands: returns a*b. */
static __inline__ __uint128_t widemul_rr(uint64_t a, uint64_t b) {
#ifndef __BMI2__
uint64_t c,d;
/* mulq: rdx:rax = rax * src; rax and rdx are bound as operands, not clobbers. */
__asm__ volatile
("mulq %[b];"
: [c]"=a"(c), [d]"=d"(d)
: [b]"r"(b), "a"(a)
: "cc");
return (((__uint128_t)(d))<<64) | c;
#else
uint64_t c,d;
/* mulx (BMI2): d:c = rdx * src; mulx leaves the flags untouched, hence no "cc". */
__asm__ volatile
("mulx %[b], %[c], %[d];"
: [c]"=r"(c), [d]"=r"(d)
: [b]"r"(b), [a]"d"(a));
return (((__uint128_t)(d))<<64) | c;
#endif
}
static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) {
#ifndef __BMI2__
uint64_t c,d;
......@@ -163,6 +182,31 @@ static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) {
*acc = (((__uint128_t)(hi))<<64) | lo;
}
/* Multiply-accumulate: *acc += (__uint128_t)a * b, with the 128-bit
 * accumulator carried in two 64-bit halves (lo, hi).
 */
static __inline__ void mac_rr(__uint128_t *acc, uint64_t a, const uint64_t b) {
    uint64_t lo = *acc, hi = *acc>>64;
#ifdef __BMI2__
    uint64_t c,d;
    /* mulx (BMI2): d:c = rdx * b; flags only touched by the add/adc pair. */
    __asm__ volatile
        ("mulx %[b], %[c], %[d]; "
         "addq %[c], %[lo]; "
         "adcq %[d], %[hi]; "
         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
         : [b]"r"(b), [a]"d"(a)
         : "cc");
#else
    /* mulq reads %rax and writes %rdx:%rax.  Bind rax/rdx as operands rather
     * than listing them as clobbers: a register backing an input operand must
     * never also appear in the clobber list (GCC extended-asm constraint).
     * "+a"(a) marks rax as read-and-written; "=&d"(d) reserves rdx early so
     * the compiler cannot place b there.
     */
    uint64_t d;
    __asm__ volatile
        ("mulq %[b]; "
         "addq %%rax, %[lo]; "
         "adcq %%rdx, %[hi]; "
         : [lo]"+r"(lo), [hi]"+r"(hi), "+a"(a), "=&d"(d)
         : [b]"r"(b)
         : "cc");
#endif
    *acc = (((__uint128_t)(hi))<<64) | lo;
}
static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
uint64_t lo = *acc, hi = *acc>>64;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment