Commit 65466601 authored by Mike Hamburg

E-521-related changes. Not quite ready yet...

This is largely a save-your-work checkin.

Created p521/arch_ref64 code to make sure E-521 basically works.
Fixed some of the testing code around E-521.  It doesn't quite pass
everything yet.

Created p521/arch_x86_64 code with optimized multiply.  In this
checkin, the multiply is fast and works, but all the other code in
that directory is the completely unoptimized ref64 build which
reduces after every add and sub.  So the whole thing isn't fast yet.
parent 0dc1b0de
......@@ -34,6 +34,27 @@
#define GOLDI_DIVERSIFY_BYTES 8
#if FIELD_BYTES <= SHA512_OUTPUT_BYTES
#define FIELD_HASH_BYTES SHA512_OUTPUT_BYTES
#define field_hash_final sha512_final
#else
#define FIELD_HASH_BYTES (SHA512_OUTPUT_BYTES * ((FIELD_BYTES-1)/SHA512_OUTPUT_BYTES + 1))
/*
 * Finalize a SHA-512 context into FIELD_HASH_BYTES of output, for fields that
 * are wider than a single SHA-512 digest.
 *
 * The first SHA512_OUTPUT_BYTES of `out` come straight from sha512_final().
 * Every further block is produced by feeding the previous output block back
 * into the context and finalizing again.
 *
 * NOTE(review): this relies on sha512_final() leaving `ctx` in a state that
 * accepts further sha512_update() calls -- confirm against the SHA-512 code.
 * The original author remarked that SHAKE would have been a better fit here.
 */
static inline void field_hash_final (
    struct sha512_ctx_t *ctx,
    unsigned char out[FIELD_HASH_BYTES]
) {
    int block;

    /* Block 0: plain digest of whatever the caller has absorbed so far. */
    sha512_final(ctx, &out[0]);

    /* Blocks 1..: chain the previous block into the hash, then finalize. */
    for (block=1; block <= (FIELD_BYTES-1) / SHA512_OUTPUT_BYTES; block++) {
        sha512_update(ctx, &out[(block-1)*SHA512_OUTPUT_BYTES], SHA512_OUTPUT_BYTES);
        sha512_final(ctx, &out[block*SHA512_OUTPUT_BYTES]);
    }
}
#endif
/* These are just unique identifiers */
static const char *G_INITING = "initializing";
static const char *G_INITED = "initialized";
......@@ -135,7 +156,7 @@ goldilocks_derive_private_key (
memcpy(&privkey->opaque[2*GOLDI_FIELD_BYTES], proto, GOLDI_SYMKEY_BYTES);
unsigned char skb[SHA512_OUTPUT_BYTES];
unsigned char skb[FIELD_HASH_BYTES];
word_t sk[GOLDI_FIELD_WORDS];
assert(sizeof(skb) >= sizeof(sk));
......@@ -146,9 +167,9 @@ goldilocks_derive_private_key (
sha512_init(&ctx);
sha512_update(&ctx, (const unsigned char *)"derivepk", GOLDI_DIVERSIFY_BYTES);
sha512_update(&ctx, proto, GOLDI_SYMKEY_BYTES);
sha512_final(&ctx, (unsigned char *)skb);
field_hash_final(&ctx, (unsigned char *)skb);
barrett_deserialize_and_reduce(sk, skb, SHA512_OUTPUT_BYTES, &curve_prime_order);
barrett_deserialize_and_reduce(sk, skb, sizeof(skb), &curve_prime_order);
barrett_serialize(privkey->opaque, sk, GOLDI_FIELD_BYTES);
scalarmul_fixed_base(&exta, sk, GOLDI_SCALAR_BITS, &goldilocks_global.fixed_base);
......@@ -316,13 +337,13 @@ goldilocks_derive_challenge(
uint64_t message_len
) {
/* challenge = H(pk, [nonceG], message). */
unsigned char sha_out[SHA512_OUTPUT_BYTES];
unsigned char sha_out[FIELD_HASH_BYTES];
struct sha512_ctx_t ctx;
sha512_init(&ctx);
sha512_update(&ctx, pubkey, GOLDI_FIELD_BYTES);
sha512_update(&ctx, gnonce, GOLDI_FIELD_BYTES);
sha512_update(&ctx, message, message_len);
sha512_final(&ctx, sha_out);
field_hash_final(&ctx, sha_out);
barrett_deserialize_and_reduce(challenge, sha_out, sizeof(sha_out), &curve_prime_order);
}
......@@ -346,7 +367,7 @@ goldilocks_sign (
}
/* Derive a nonce. TODO: use HMAC. FUTURE: factor. */
unsigned char sha_out[SHA512_OUTPUT_BYTES];
unsigned char sha_out[FIELD_HASH_BYTES];
word_t tk[GOLDI_FIELD_WORDS];
struct sha512_ctx_t ctx;
sha512_init(&ctx);
......@@ -354,8 +375,8 @@ goldilocks_sign (
sha512_update(&ctx, &privkey->opaque[2*GOLDI_FIELD_BYTES], GOLDI_SYMKEY_BYTES);
sha512_update(&ctx, message, message_len);
sha512_update(&ctx, &privkey->opaque[2*GOLDI_FIELD_BYTES], GOLDI_SYMKEY_BYTES);
sha512_final(&ctx, sha_out);
barrett_deserialize_and_reduce(tk, sha_out, SHA512_OUTPUT_BYTES, &curve_prime_order);
field_hash_final(&ctx, sha_out);
barrett_deserialize_and_reduce(tk, sha_out, sizeof(sha_out), &curve_prime_order);
/* 4[nonce]G */
uint8_t signature_tmp[GOLDI_FIELD_BYTES];
......
......@@ -127,7 +127,9 @@ constant_time_cond_swap (
/**
* @brief Constant-time equivalent of memcpy(out, table + elem_bytes*idx, elem_bytes);
*
* The table must be at least as aligned as elem_bytes. The output must be vector aligned.
* The table must be at least as aligned as elem_bytes. The output must be word aligned,
* and if the input size is vector aligned it must also be vector aligned.
*
* The table and output must not alias.
*/
static __inline__ void
......@@ -151,8 +153,9 @@ constant_time_lookup (
big_register_t br_mask = br_is_zero(big_i);
for (k=0; k<=elem_bytes-sizeof(big_register_t); k+=sizeof(big_register_t)) {
if (elem_bytes % sizeof(big_register_t)) {
/* input unaligned, output aligned */
*(big_register_t *)(out+k) |= br_mask & ((const unaligned_br_t*)(&table[k+j*elem_bytes]))->unaligned;
/* unaligned */
((unaligned_br_t *)(out+k))->unaligned
|= br_mask & ((const unaligned_br_t*)(&table[k+j*elem_bytes]))->unaligned;
} else {
/* aligned */
*(big_register_t *)(out+k) |= br_mask & *(const big_register_t*)(&table[k+j*elem_bytes]);
......
......@@ -102,6 +102,7 @@ typedef word_t vecmask_t __attribute__((vector_size(32)));
#endif
#if __AVX2__
#define VECTOR_ALIGNED __attribute__((aligned(32)))
typedef uint32x8_t big_register_t;
typedef uint64x4_t uint64xn_t;
typedef uint32x8_t uint32xn_t;
......@@ -113,6 +114,7 @@ typedef word_t vecmask_t __attribute__((vector_size(32)));
return ret;
}
#elif __SSE2__
#define VECTOR_ALIGNED __attribute__((aligned(16)))
typedef uint32x4_t big_register_t;
typedef uint64x2_t uint64xn_t;
typedef uint32x4_t uint32xn_t;
......@@ -124,6 +126,7 @@ typedef word_t vecmask_t __attribute__((vector_size(32)));
return ret;
}
#elif __ARM_NEON__
#define VECTOR_ALIGNED __attribute__((aligned(16)))
typedef uint32x4_t big_register_t;
typedef uint64x2_t uint64xn_t;
typedef uint32x4_t uint32xn_t;
......@@ -132,6 +135,7 @@ typedef word_t vecmask_t __attribute__((vector_size(32)));
return vdupq_n_u32(x);
}
#elif _WIN64 || __amd64__ || __X86_64__ || __aarch64__
#define VECTOR_ALIGNED __attribute__((aligned(8)))
typedef uint64_t big_register_t, uint64xn_t;
typedef uint32_t uint32xn_t;
......@@ -140,6 +144,7 @@ typedef word_t vecmask_t __attribute__((vector_size(32)));
return (big_register_t)x;
}
#else
#define VECTOR_ALIGNED __attribute__((aligned(4)))
typedef uint64_t uint64xn_t;
typedef uint32_t uint32xn_t;
typedef uint32_t big_register_t;
......
This diff is collapsed.
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/
#ifndef __P521_H__
#define __P521_H__ 1
#include <stdint.h>
#include <assert.h>
#include <string.h>
#include "word.h"
/*
 * Field element stored as nine 64-bit limbs.
 *
 * The inline helpers in this header carry between limbs 0..7 at bit 58 and
 * out of limb 8 at bit 57 (see p521_weak_reduce), i.e. 8*58 + 57 = 521 bits
 * of payload; the remaining bits of each limb are headroom for lazy carries.
 */
typedef struct p521_t {
uint64_t limb[9];
} p521_t;
#ifdef __cplusplus
extern "C" {
#endif
/*
 * Forward declarations.  The static __inline__ functions are defined further
 * down in this header; __attribute__((unused)) keeps the compiler from
 * warning in translation units that do not call all of them.
 */

/* Set out to the small integer x: limb 0 = x, limbs 1..8 = 0. */
static __inline__ void
p521_set_ui (
p521_t *out,
uint64_t x
) __attribute__((unused));
/* out = a + b, limb by limb, followed by p521_weak_reduce(out). */
static __inline__ void
p521_add (
p521_t *out,
const p521_t *a,
const p521_t *b
) __attribute__((unused));
/* out = a - b plus a fixed per-limb bias, followed by p521_weak_reduce(out). */
static __inline__ void
p521_sub (
p521_t *out,
const p521_t *a,
const p521_t *b
) __attribute__((unused));
/* out = (fixed per-limb bias) - a, followed by p521_weak_reduce(out). */
static __inline__ void
p521_neg (
p521_t *out,
const p521_t *a
) __attribute__((unused));
/* Add x into limb 0 of a and move bits 58 and up of the result into limb 1. */
static __inline__ void
p521_addw (
p521_t *a,
uint64_t x
) __attribute__((unused));
/* Subtract x from limb 0 of a, then p521_really_bias(a, 1) and p521_weak_reduce(a). */
static __inline__ void
p521_subw (
p521_t *a,
uint64_t x
) __attribute__((unused));
/* Copy *a into *out with memcpy. */
static __inline__ void
p521_copy (
p521_t *out,
const p521_t *a
) __attribute__((unused));
/*
 * One carry pass: each limb keeps its low 58 bits (57 for limb 8) and receives
 * the overflow of the limb below; the overflow of limb 8 is added to limb 0.
 */
static __inline__ void
p521_weak_reduce (
p521_t *inout
) __attribute__((unused));
/*
 * Defined outside this header.
 * NOTE(review): presumably reduces to a canonical representative -- confirm.
 */
void
p521_strong_reduce (
p521_t *inout
);
/*
 * Defined outside this header.
 * NOTE(review): presumably returns an all-ones mask when `in` is zero -- confirm.
 */
mask_t
p521_is_zero (
const p521_t *in
);
/* No-op in this implementation: both arguments are ignored. */
static __inline__ void
p521_bias (
p521_t *inout,
int amount
) __attribute__((unused));
/* Add amount * 2*(2^58-1) to limbs 0..7 and amount * 2*(2^57-1) to limb 8. */
static __inline__ void
p521_really_bias (
p521_t *inout,
int amount
) __attribute__((unused));
/*
 * Defined outside this header.  `out` is __restrict__-qualified, so it must
 * not alias a or b.  NOTE(review): presumably out = a * b -- confirm.
 */
void
p521_mul (
p521_t *__restrict__ out,
const p521_t *a,
const p521_t *b
);
/*
 * Defined outside this header.  `out` is __restrict__-qualified, so it must
 * not alias a.  NOTE(review): presumably out = a * b for a word b -- confirm.
 */
void
p521_mulw (
p521_t *__restrict__ out,
const p521_t *a,
uint64_t b
);
/*
 * Defined outside this header.  `out` is __restrict__-qualified, so it must
 * not alias a.  NOTE(review): presumably out = a * a -- confirm.
 */
void
p521_sqr (
p521_t *__restrict__ out,
const p521_t *a
);
/*
 * Defined outside this header.
 * NOTE(review): presumably writes 66 bytes, matching p521_deserialize -- confirm.
 */
void
p521_serialize (
uint8_t *serial,
const struct p521_t *x
);
/*
 * Defined outside this header.  Reads a 66-byte encoding into x.
 * NOTE(review): the mask_t result presumably signals success -- confirm.
 */
mask_t
p521_deserialize (
p521_t *x,
const uint8_t serial[66]
);
/* -------------- Inline functions begin here -------------- */
/*
 * Load a small integer into a field element: limb 0 takes x and the
 * remaining eight limbs are cleared.
 */
void
p521_set_ui (
    p521_t *out,
    uint64_t x
) {
    int idx = 9;

    /* Clear limbs 8 down to 1. */
    while (--idx > 0) {
        out->limb[idx] = 0;
    }
    out->limb[0] = x;
}
/*
 * out = a + b.  The limbs are summed independently (no carry between them)
 * and a single p521_weak_reduce() pass then pushes the carries along.
 * out may be the same object as a or b.
 */
void
p521_add (
    p521_t *out,
    const p521_t *a,
    const p521_t *b
) {
    unsigned int k = 0;

    while (k < 9) {
        out->limb[k] = a->limb[k] + b->limb[k];
        k++;
    }
    p521_weak_reduce(out);
}
/*
 * out = a - b.  A fixed bias is added to every limb before the carry pass:
 * 4*(2^58-1) for limbs 0..7 and 4*(2^57-1) for limb 8.
 *
 * NOTE(review): the bias limbs look like 4*(2^521-1), i.e. a multiple of the
 * modulus chosen so the limb-wise difference does not go negative for weakly
 * reduced inputs -- confirm the input range the callers guarantee.
 */
void
p521_sub (
    p521_t *out,
    const p521_t *a,
    const p521_t *b
) {
    const uint64_t bias_low = ((1ull<<58)-1)*4;
    const uint64_t bias_top = ((1ull<<57)-1)*4;
    unsigned int k;

    for (k=0; k<8; k++) {
        out->limb[k] = a->limb[k] - b->limb[k] + bias_low;
    }
    /* The top limb is one bit narrower and gets the narrower bias. */
    out->limb[8] = a->limb[8] - b->limb[8] + bias_top;

    p521_weak_reduce(out);
}
/*
 * out = -a, computed limb-wise as (bias - a) followed by a carry pass.
 * The bias is 4*(2^58-1) for limbs 0..7 and 4*(2^57-1) for limb 8.
 *
 * NOTE(review): the bias limbs look like 4*(2^521-1), a multiple of the
 * modulus -- confirm.
 */
void
p521_neg (
    struct p521_t *out,
    const p521_t *a
) {
    const uint64_t bias_low = ((1ull<<58)-1)*4;
    const uint64_t bias_top = ((1ull<<57)-1)*4;
    unsigned int k;

    for (k=0; k<8; k++) {
        out->limb[k] = bias_low - a->limb[k];
    }
    out->limb[8] = bias_top - a->limb[8];

    p521_weak_reduce(out);
}
/*
 * a += x for a single word x.  Only limb 0 is touched directly; whatever
 * spills above bit 57 of the sum is added to limb 1, and limb 0 is masked
 * back down to 58 bits.  No further carry propagation is done.
 */
void
p521_addw (
    p521_t *a,
    uint64_t x
) {
    const uint64_t low58 = (1ull<<58)-1;
    uint64_t sum = a->limb[0] + x;

    a->limb[1] += sum >> 58;
    a->limb[0] = sum & low58;
}
/*
 * a -= x for a single word x.  Limb 0 is decremented (it may wrap modulo
 * 2^64 at this point), then one unit of bias is added to every limb via
 * p521_really_bias() and the carries are pushed along by p521_weak_reduce().
 */
void
p521_subw (
    p521_t *a,
    uint64_t x
) {
    a->limb[0] = a->limb[0] - x;

    p521_really_bias(a, 1);
    p521_weak_reduce(a);
}
/* Byte-for-byte copy of the field element *a into *out. */
void
p521_copy (
    p521_t *out,
    const p521_t *a
) {
    memcpy(out, a, sizeof *a);
}
/*
 * Add amt units of bias to a, without reducing: each of limbs 0..7 grows by
 * amt * 2*(2^58-1) and limb 8 by amt * 2*(2^57-1).
 *
 * NOTE(review): one unit looks like 2*(2^521-1) spread over the limbs, i.e.
 * a multiple of the modulus -- confirm.
 */
void
p521_really_bias (
    p521_t *a,
    int amt
) {
    const uint64_t bias_low = ((1ull<<58)-1)*2*amt;
    const uint64_t bias_top = ((1ull<<57)-1)*2*amt;
    int k;

    for (k=0; k<8; k++) {
        a->limb[k] += bias_low;
    }
    a->limb[8] += bias_top;
}
/*
 * Deliberately empty in this implementation: neither argument is used.
 * The casts to void only silence unused-parameter warnings.
 */
void
p521_bias (
    p521_t *a,
    int amt
) {
    (void) amt;
    (void) a;
}
/*
 * Single carry pass over the nine limbs.
 *
 * Working from the top limb down, every limb keeps its low 58 bits (57 for
 * limb 8) and receives whatever sat above bit 57 of the limb below it.  The
 * bits above bit 56 of the original limb 8 are added to limb 0.  Because the
 * pass runs top-down, each limb's carry is taken from its neighbour's value
 * before that neighbour is masked.  The result is not necessarily canonical.
 */
void
p521_weak_reduce (
    p521_t *a
) {
    const uint64_t mask58 = (1ull<<58) - 1;
    const uint64_t mask57 = mask58 >> 1;
    const uint64_t top_carry = a->limb[8] >> 57;
    int k;

    /* Top limb is one bit narrower than the others. */
    a->limb[8] = (a->limb[8] & mask57) + (a->limb[7] >> 58);

    for (k=7; k>=1; k--) {
        a->limb[k] = (a->limb[k] & mask58) + (a->limb[k-1] >> 58);
    }

    /* Wrap the overflow of the top limb around into the bottom limb. */
    a->limb[0] = (a->limb[0] & mask58) + top_carry;
}
#ifdef __cplusplus
}; /* extern "C" */
#endif
#endif /* __P521_H__ */
This diff is collapsed.
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/
#ifndef __P521_H__
#define __P521_H__ 1
#include <stdint.h>
#include <assert.h>
#include <string.h>
#include "word.h"
/*
 * Map a logical limb index 0..8 to its storage slot in p521_t.limb[].
 * The mapping is a 3x3 transpose: 0,1,2,3,4,5,6,7,8 -> 0,3,6,1,4,7,2,5,8.
 * Slots 0 and 8 are fixed points, so limb[0] / limb[8] need no wrapping.
 */
#define LIMBPERM(x) (((x)%3)*3 + (x)/3)
/*
 * Feature flag announcing the transposed limb order.
 * NOTE(review): presumably tested by constant tables elsewhere so that they
 * list their limbs in storage order -- confirm.
 */
#define USE_P521_3x3_TRANSPOSE
/*
 * Field element stored as nine 64-bit limbs, in LIMBPERM storage order.
 * The inline helpers carry at bit 58 between logical limbs 0..7 and at
 * bit 57 out of logical limb 8 (see p521_weak_reduce).
 */
typedef struct p521_t {
uint64_t limb[9];
} p521_t;
#ifdef __cplusplus
extern "C" {
#endif
/*
 * Forward declarations.  The static __inline__ functions are defined further
 * down in this header; __attribute__((unused)) keeps the compiler from
 * warning in translation units that do not call all of them.
 */

/* Set out to the small integer x: limb[0] = x, every other slot = 0. */
static __inline__ void
p521_set_ui (
p521_t *out,
uint64_t x
) __attribute__((unused));
/* out = a + b, slot by slot, followed by p521_weak_reduce(out). */
static __inline__ void
p521_add (
p521_t *out,
const p521_t *a,
const p521_t *b
) __attribute__((unused));
/* out = a - b plus a fixed per-limb bias, followed by p521_weak_reduce(out). */
static __inline__ void
p521_sub (
p521_t *out,
const p521_t *a,
const p521_t *b
) __attribute__((unused));
/* out = (fixed per-limb bias) - a, followed by p521_weak_reduce(out). */
static __inline__ void
p521_neg (
p521_t *out,
const p521_t *a
) __attribute__((unused));
/* Add x into limb[0] of a and move bits 58 and up into limb[LIMBPERM(1)]. */
static __inline__ void
p521_addw (
p521_t *a,
uint64_t x
) __attribute__((unused));
/* Subtract x from limb[0] of a, then p521_really_bias(a, 1) and p521_weak_reduce(a). */
static __inline__ void
p521_subw (
p521_t *a,
uint64_t x
) __attribute__((unused));
/* Copy *a into *out with memcpy. */
static __inline__ void
p521_copy (
p521_t *out,
const p521_t *a
) __attribute__((unused));
/*
 * One carry pass in logical limb order (via LIMBPERM): each limb keeps its
 * low 58 bits (57 for logical limb 8) and receives the overflow of the limb
 * below; the overflow of limb 8 is added to limb 0.
 */
static __inline__ void
p521_weak_reduce (
p521_t *inout
) __attribute__((unused));
/*
 * Defined outside this header.
 * NOTE(review): presumably reduces to a canonical representative -- confirm.
 */
void
p521_strong_reduce (
p521_t *inout
);
/*
 * Defined outside this header.
 * NOTE(review): presumably returns an all-ones mask when `in` is zero -- confirm.
 */
mask_t
p521_is_zero (
const p521_t *in
);
/* No-op in this implementation: both arguments are ignored. */
static __inline__ void
p521_bias (
p521_t *inout,
int amount
) __attribute__((unused));
/* Add amount * 2*(2^58-1) to slots 0..7 and amount * 2*(2^57-1) to slot 8. */
static __inline__ void
p521_really_bias (
p521_t *inout,
int amount
) __attribute__((unused));
/*
 * Defined outside this header.  `out` is __restrict__-qualified, so it must
 * not alias a or b.  NOTE(review): presumably out = a * b -- confirm.
 */
void
p521_mul (
p521_t *__restrict__ out,
const p521_t *a,
const p521_t *b
);
/*
 * Defined outside this header.  `out` is __restrict__-qualified, so it must
 * not alias a.  NOTE(review): presumably out = a * b for a word b -- confirm.
 */
void
p521_mulw (
p521_t *__restrict__ out,
const p521_t *a,
uint64_t b
);
/*
 * Defined outside this header.  `out` is __restrict__-qualified, so it must
 * not alias a.  NOTE(review): presumably out = a * a -- confirm.
 */
void
p521_sqr (
p521_t *__restrict__ out,
const p521_t *a
);
/*
 * Defined outside this header.
 * NOTE(review): presumably writes 66 bytes, matching p521_deserialize -- confirm.
 */
void
p521_serialize (
uint8_t *serial,
const struct p521_t *x
);
/*
 * Defined outside this header.  Reads a 66-byte encoding into x.
 * NOTE(review): the mask_t result presumably signals success -- confirm.
 */
mask_t
p521_deserialize (
p521_t *x,
const uint8_t serial[66]
);
/* -------------- Inline functions begin here -------------- */
/*
 * Load a small integer into a field element: limb[0] takes x and the
 * remaining eight slots are cleared.  (Slot 0 is logical limb 0 under
 * LIMBPERM, so no index permutation is needed here.)
 */
void
p521_set_ui (
    p521_t *out,
    uint64_t x
) {
    int idx = 9;

    /* Clear slots 8 down to 1. */
    while (--idx > 0) {
        out->limb[idx] = 0;
    }
    out->limb[0] = x;
}
/*
 * out = a + b.  The slots are summed independently (no carry between them)
 * and a single p521_weak_reduce() pass then pushes the carries along.
 * out may be the same object as a or b.
 */
void
p521_add (
    p521_t *out,
    const p521_t *a,
    const p521_t *b
) {
    unsigned int k = 0;

    while (k < 9) {
        out->limb[k] = a->limb[k] + b->limb[k];
        k++;
    }
    p521_weak_reduce(out);
}
/*
 * out = a - b.  A fixed bias is added to every slot before the carry pass:
 * 4*(2^58-1) for slots 0..7 and 4*(2^57-1) for slot 8.  Slot 8 is logical
 * limb 8 under LIMBPERM, so the narrower bias lands on the top limb.
 *
 * NOTE(review): the bias limbs look like 4*(2^521-1), i.e. a multiple of the
 * modulus chosen so the limb-wise difference does not go negative for weakly
 * reduced inputs -- confirm the input range the callers guarantee.
 */
void
p521_sub (
    p521_t *out,
    const p521_t *a,
    const p521_t *b
) {
    const uint64_t bias_low = ((1ull<<58)-1)*4;
    const uint64_t bias_top = ((1ull<<57)-1)*4;
    unsigned int k;

    for (k=0; k<8; k++) {
        out->limb[k] = a->limb[k] - b->limb[k] + bias_low;
    }
    out->limb[8] = a->limb[8] - b->limb[8] + bias_top;

    p521_weak_reduce(out);
}
/*
 * out = -a, computed slot-wise as (bias - a) followed by a carry pass.
 * The bias is 4*(2^58-1) for slots 0..7 and 4*(2^57-1) for slot 8 (the top
 * logical limb under LIMBPERM).
 *
 * NOTE(review): the bias limbs look like 4*(2^521-1), a multiple of the
 * modulus -- confirm.
 */
void
p521_neg (
    struct p521_t *out,
    const p521_t *a
) {
    const uint64_t bias_low = ((1ull<<58)-1)*4;
    const uint64_t bias_top = ((1ull<<57)-1)*4;
    unsigned int k;

    for (k=0; k<8; k++) {
        out->limb[k] = bias_low - a->limb[k];
    }
    out->limb[8] = bias_top - a->limb[8];

    p521_weak_reduce(out);
}
/*
 * a += x for a single word x.  Only limb[0] is touched directly; whatever
 * spills above bit 57 of the sum is added to logical limb 1, which lives in
 * slot LIMBPERM(1), and limb[0] is masked back down to 58 bits.  No further
 * carry propagation is done.
 */
void
p521_addw (
    p521_t *a,
    uint64_t x
) {
    const uint64_t low58 = (1ull<<58)-1;
    uint64_t sum = a->limb[0] + x;

    a->limb[LIMBPERM(1)] += sum >> 58;
    a->limb[0] = sum & low58;
}
/*
 * a -= x for a single word x.  limb[0] is decremented (it may wrap modulo
 * 2^64 at this point), then one unit of bias is added to every slot via
 * p521_really_bias() and the carries are pushed along by p521_weak_reduce().
 */
void
p521_subw (
    p521_t *a,
    uint64_t x
) {
    a->limb[0] = a->limb[0] - x;

    p521_really_bias(a, 1);
    p521_weak_reduce(a);
}
/* Byte-for-byte copy of the field element *a into *out. */
void
p521_copy (
    p521_t *out,
    const p521_t *a
) {
    memcpy(out, a, sizeof *a);
}
/*
 * Add amt units of bias to a, without reducing: each of slots 0..7 grows by
 * amt * 2*(2^58-1) and slot 8 (the top logical limb) by amt * 2*(2^57-1).
 *
 * NOTE(review): one unit looks like 2*(2^521-1) spread over the limbs, i.e.
 * a multiple of the modulus -- confirm.
 */
void
p521_really_bias (
    p521_t *a,
    int amt
) {
    const uint64_t bias_low = ((1ull<<58)-1)*2*amt;
    const uint64_t bias_top = ((1ull<<57)-1)*2*amt;
    int k;

    for (k=0; k<8; k++) {
        a->limb[k] += bias_low;
    }
    a->limb[8] += bias_top;
}
/*
 * Deliberately empty in this implementation: neither argument is used.
 * The casts to void only silence unused-parameter warnings.
 */
void
p521_bias (
    p521_t *a,
    int amt
) {
    (void) amt;
    (void) a;
}
/*
 * Single carry pass over the nine limbs, walked in logical order through
 * LIMBPERM.
 *
 * Working from the top logical limb down, every limb keeps its low 58 bits
 * (57 for logical limb 8) and receives whatever sat above bit 57 of the
 * logical limb below it.  The bits above bit 56 of the original top limb are
 * added to limb 0.  Because the pass runs top-down, each limb's carry is
 * taken from its neighbour's value before that neighbour is masked.
 * LIMBPERM(0) == 0 and LIMBPERM(8) == 8, which is why the first and last
 * accesses use plain indices.  The result is not necessarily canonical.
 */
void
p521_weak_reduce (
    p521_t *a
) {
    const uint64_t mask58 = (1ull<<58) - 1;
    const uint64_t mask57 = mask58 >> 1;
    const uint64_t top_carry = a->limb[8] >> 57;
    int k;

    /* Top logical limb is one bit narrower than the others. */
    a->limb[LIMBPERM(8)] = (a->limb[LIMBPERM(8)] & mask57) + (a->limb[LIMBPERM(7)] >> 58);

    for (k=7; k>=1; k--) {
        a->limb[LIMBPERM(k)] = (a->limb[LIMBPERM(k)] & mask58) + (a->limb[LIMBPERM(k-1)] >> 58);
    }

    /* Wrap the overflow of the top limb around into the bottom limb. */
    a->limb[0] = (a->limb[0] & mask58) + top_carry;
}
#ifdef __cplusplus
}; /* extern "C" */
#endif
#endif /* __P521_H__ */
......@@ -18,13 +18,13 @@ const uint8_t FIELD_MODULUS[FIELD_BYTES] = {
const word_t SCALARMUL_FIXED_WINDOW_ADJUSTMENT[2*SCALAR_WORDS] = {
U64LE(0xbf15dbca0ae7f294),
U60LE(0x04273ba96570e0ba),
U60LE(0xc94750a1813ac0fb),
U60LE(0xea4939b8b9037a08),
U60LE(0x0000000000000002),
U60LE(0x0000000000000000),
U60LE(0x0000000000000000),
U60LE(0x0000000000000000),
U64LE(0x04273ba96570e0ba),
U64LE(0xc94750a1813ac0fb),
U64LE(0xea4939b8b9037a08),
U64LE(0x0000000000000002),
U64LE(0x0000000000000000),
U64LE(0x0000000000000000),
U64LE(0x0000000000000000),
0x80,
U64LE(0x7e2bb79415cfe529),
......@@ -40,6 +40,17 @@ const word_t SCALARMUL_FIXED_WINDOW_ADJUSTMENT[2*SCALAR_WORDS] = {
const struct affine_t goldilocks_base_point = {
{{
#ifdef USE_P521_3x3_TRANSPOSE
U58LE(0x02a940a2f19ba6c),
U58LE(0x3331c90d2c6ba52),
U58LE(0x2878a3bfd9f42fc),
U58LE(0x03ec4cd920e2a8c),
U58LE(0x0c6203913f6ecc5),
U58LE(0x06277e432c8a5ac),
U58LE(0x1d568fc99c6059d),
U58LE(0x1b2063b22fcf270),
U58LE(0x0752cb45c48648b)
#else
U58LE(0x02a940a2f19ba6c),
U58LE(0x03ec4cd920e2a8c),
U58LE(0x1d568fc99c6059d),
......@@ -49,6 +60,7 @@ const struct affine_t goldilocks_base_point = {
U58LE(0x2878a3bfd9f42fc),
U58LE(0x06277e432c8a5ac),
U58LE(0x0752cb45c48648b)
#endif
}},
{{ 12 }}
};
......@@ -69,6 +81,17 @@ const struct barrett_prime_t curve_prime_order = {
const struct field_t
sqrt_d_minus_1 = {{
#ifdef USE_P521_3x3_TRANSPOSE
U58LE(0x1e2be72c1c81990),
U58LE(0x207dfc238a33e46),
U58LE(0x2264cfb418c4c30),
U58LE(0x1135002ad596c69),
U58LE(0x0e30107cd79d1f6),
U58LE(0x0524b9e715937f5),
U58LE(0x2ab3a257a22666d),
U58LE(0x2d80cc2936a1824),
U58LE(0x0a9ea3ac10d6aed)
#else
U58LE(0x1e2be72c1c81990),
U58LE(0x1135002ad596c69),
U58LE(0x2ab3a257a22666d),
......@@ -78,4 +101,5 @@ sqrt_d_minus_1 = {{
U58LE(0x2264cfb418c4c30),
U58LE(0x0524b9e715937f5),
U58LE(0x0a9ea3ac10d6aed)
#endif
}};
......@@ -163,7 +163,9 @@ scalarmul (
copy_tw_extensible(&tabulator, working);
double_tw_extensible(&tabulator);
struct tw_pniels_t pn, multiples[NTABLE];
struct tw_pniels_t
pn VECTOR_ALIGNED,
multiples[NTABLE] VECTOR_ALIGNED;
convert_tw_extensible_to_tw_pniels(&pn, &tabulator);
convert_tw_extensible_to_tw_pniels(&multiples[0], working);
......@@ -225,7 +227,9 @@ scalarmul_vlook (
copy_tw_extensible(&tabulator, working);
double_tw_extensible(&tabulator);
struct tw_pniels_t pn, multiples[NTABLE];
struct tw_pniels_t
pn VECTOR_ALIGNED,
multiples[NTABLE] VECTOR_ALIGNED;
convert_tw_extensible_to_tw_pniels(&pn, &tabulator);
convert_tw_extensible_to_tw_pniels(&multiples[0], working);
......
......@@ -535,11 +535,11 @@ int main(int argc, char **argv) {
printf("%02x", hsk.opaque[i]);
}
printf("\nss1 = ");
for (i=0; i<FIELD_BYTES; i++) {
for (i=0; i<64; i++) {