Commit 03ecad05 authored by Michael Hamburg's avatar Michael Hamburg

it compiles, but it certainly doesn't work yet

parent 40b1f8b8
......@@ -19,13 +19,13 @@ ASM ?= $(CC)
DECAF ?= decaf_fast
ifneq (,$(findstring x86_64,$(MACHINE)))
ARCH ?= arch_x86_64
ARCH ?= arch_ref64
else
# no i386 port yet
ARCH ?= arch_arm_32
ARCH ?= arch_ref32
endif
FIELD ?= p255
FIELD ?= p25519
WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \
-Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN)
......@@ -35,7 +35,7 @@ INCFLAGS = -Isrc/include -Iinclude -Isrc/$(FIELD) -Isrc/$(FIELD)/$(ARCH)
LANGFLAGS = -std=c99 -fno-strict-aliasing
LANGXXFLAGS = -fno-strict-aliasing
GENFLAGS = -ffunction-sections -fdata-sections -fvisibility=hidden -fomit-frame-pointer -fPIC
OFLAGS ?= -O3
OFLAGS ?= -O2
TODAY = $(shell date "+%Y-%m-%d")
......
......@@ -61,7 +61,7 @@ typedef uint64_t decaf_dword_t;
/** Galois field element internal structure */
typedef struct gf_s {
decaf_word_t limb[DECAF_255_LIMBS];
} __attribute__((aligned(32))) gf_s, gf[1];
} gf_s, gf[1];
/** @endcond */
/** Number of bytes in a serialized point. */
......
......@@ -18,7 +18,7 @@
#include "shake.h"
/** Number of bytes for a symmetric key (expanded to full key) */
#define DECAF_448_SYMMETRIC_KEY_BYTES 32
#define DECAF_255_SYMMETRIC_KEY_BYTES 32
/** @cond internal */
#define API_VIS __attribute__((visibility("default"))) __attribute__((noinline)) // TODO: synergize with decaf.h
......@@ -31,29 +31,29 @@
/** @endcond */
/** A symmetric key, the compressed point of a private key. */
typedef unsigned char decaf_448_symmetric_key_t[DECAF_448_SYMMETRIC_KEY_BYTES];
typedef unsigned char decaf_255_symmetric_key_t[DECAF_255_SYMMETRIC_KEY_BYTES];
/** An encoded public key. */
typedef unsigned char decaf_448_public_key_t[DECAF_448_SER_BYTES];
typedef unsigned char decaf_255_public_key_t[DECAF_255_SER_BYTES];
/** A signature. */
typedef unsigned char decaf_448_signature_t[DECAF_448_SER_BYTES + DECAF_448_SCALAR_BYTES];
typedef unsigned char decaf_255_signature_t[DECAF_255_SER_BYTES + DECAF_255_SCALAR_BYTES];
typedef struct {
/** @cond internal */
/** The symmetric key from which everything is expanded */
decaf_448_symmetric_key_t sym;
decaf_255_symmetric_key_t sym;
/** The scalar x */
decaf_448_scalar_t secret_scalar;
decaf_255_scalar_t secret_scalar;
/** x*Base */
decaf_448_public_key_t pub;
decaf_255_public_key_t pub;
/** @endcond */
} /** Private key structure for pointers. */
decaf_448_private_key_s,
decaf_255_private_key_s,
/** A private key (gmp array[1] style). */
decaf_448_private_key_t[1];
decaf_255_private_key_t[1];
#ifdef __cplusplus
extern "C" {
......@@ -64,16 +64,16 @@ extern "C" {
* @param [out] priv The derived private key.
* @param [in] proto The compressed or proto-key, which must be 32 random bytes.
*/
void decaf_448_derive_private_key (
decaf_448_private_key_t priv,
const decaf_448_symmetric_key_t proto
void decaf_255_derive_private_key (
decaf_255_private_key_t priv,
const decaf_255_symmetric_key_t proto
) NONNULL2 API_VIS;
/**
* @brief Destroy a private key.
*/
void decaf_448_destroy_private_key (
decaf_448_private_key_t priv
void decaf_255_destroy_private_key (
decaf_255_private_key_t priv
) NONNULL1 API_VIS;
/**
......@@ -81,9 +81,9 @@ void decaf_448_destroy_private_key (
* @param [out] pub The extracted public key.
* @param [in] priv The private key.
*/
void decaf_448_private_to_public (
decaf_448_public_key_t pub,
const decaf_448_private_key_t priv
void decaf_255_private_to_public (
decaf_255_public_key_t pub,
const decaf_255_private_key_t priv
) NONNULL2 API_VIS;
/**
......@@ -104,11 +104,11 @@ void decaf_448_private_to_public (
* and will almost definitely change in the future.
*/
decaf_bool_t
decaf_448_shared_secret (
decaf_255_shared_secret (
uint8_t *shared,
size_t shared_bytes,
const decaf_448_private_key_t my_privkey,
const decaf_448_public_key_t your_pubkey
const decaf_255_private_key_t my_privkey,
const decaf_255_public_key_t your_pubkey
) NONNULL134 WARN_UNUSED API_VIS;
/**
......@@ -119,9 +119,9 @@ decaf_448_shared_secret (
* @param [in] shake A SHAKE256 context with the message.
*/
void
decaf_448_sign_shake (
decaf_448_signature_t sig,
const decaf_448_private_key_t priv,
decaf_255_sign_shake (
decaf_255_signature_t sig,
const decaf_255_private_key_t priv,
const keccak_sponge_t shake
) NONNULL3 API_VIS;
......@@ -134,9 +134,9 @@ decaf_448_sign_shake (
* @param [in] message_len The message's length.
*/
void
decaf_448_sign (
decaf_448_signature_t sig,
const decaf_448_private_key_t priv,
decaf_255_sign (
decaf_255_signature_t sig,
const decaf_255_private_key_t priv,
const unsigned char *message,
size_t message_len
) NONNULL3 API_VIS;
......@@ -149,9 +149,9 @@ decaf_448_sign (
* @param [in] shake A SHAKE256 context with the message.
*/
decaf_bool_t
decaf_448_verify_shake (
const decaf_448_signature_t sig,
const decaf_448_public_key_t pub,
decaf_255_verify_shake (
const decaf_255_signature_t sig,
const decaf_255_public_key_t pub,
const keccak_sponge_t shake
) NONNULL3 API_VIS WARN_UNUSED;
......@@ -164,9 +164,9 @@ decaf_448_verify_shake (
* @param [in] message_len The message's length.
*/
decaf_bool_t
decaf_448_verify (
const decaf_448_signature_t sig,
const decaf_448_public_key_t pub,
decaf_255_verify (
const decaf_255_signature_t sig,
const decaf_255_public_key_t pub,
const unsigned char *message,
size_t message_len
) NONNULL3 API_VIS WARN_UNUSED;
......
......@@ -192,18 +192,18 @@ private:
};
/**@cond internal*/
inline Ed448::Scalar::Scalar(SpongeRng &rng) NOEXCEPT {
inline Ed255::Scalar::Scalar(SpongeRng &rng) NOEXCEPT {
*this = rng.read(SER_BYTES);
}
inline Ed448::Point::Point(SpongeRng &rng, bool uniform) NOEXCEPT {
inline Ed255::Point::Point(SpongeRng &rng, bool uniform) NOEXCEPT {
SecureBuffer buffer((uniform ? 2 : 1) * HASH_BYTES);
rng.read(buffer);
set_to_hash(buffer);
}
inline SecureBuffer Ed448::Point::steg_encode(SpongeRng &rng) const NOEXCEPT {
inline SecureBuffer Ed255::Point::steg_encode(SpongeRng &rng) const NOEXCEPT {
SecureBuffer out(STEG_BYTES);
bool done;
do {
......
......@@ -45,14 +45,22 @@ typedef int64_t decaf_sdword_t;
#define siv static inline void __attribute__((always_inline))
static const gf ZERO = {{{0}}}, ONE = {{{1}}}, TWO = {{{2}}};
static const int EDWARDS_D = 121665;
static const int EDWARDS_D = -89747;
// Gonna test with PinkBikeShed until the math works...
// Curve25519: 121665;
static const scalar_t sc_p = {{{
// Gonna test with PinkBikeShed until the math works...
SC_LIMB(0xb6b98fd8849faf35),
SC_LIMB(0x16241e6093b2ce59),
SC_LIMB(0),
SC_LIMB(0x2000000000000000)
/* Curve25519:
SC_LIMB(0x5812631a5cf5d3ed),
SC_LIMB(0x14def9dea2f79cd6),
SC_LIMB(0),
SC_LIMB(0),
SC_LIMB(0x1000000000000000)
*/
}}};
const scalar_t API_NS(scalar_one) = {{{1}}}, API_NS(scalar_zero) = {{{0}}};
......@@ -61,7 +69,7 @@ extern const decaf_word_t MONTGOMERY_FACTOR;
/* sqrt(9) = 3 from the curve spec. Not exported, but used by pregen tool. */
const unsigned char base_point_ser_for_pregen[SER_BYTES] = {
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5 /*PinkBikeShed. Curve25519: 3*/, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
};
extern const point_t API_NS(point_base);
......@@ -82,16 +90,16 @@ const size_t API_NS2(alignof,precomputed_s) = 32;
#ifdef __clang__
#if 100*__clang_major__ + __clang_minor__ > 305
#define VECTORIZE _Pragma("clang loop unroll(disable) vectorize(enable) vectorize_width(8)")
#define UNROLL _Pragma("clang loop unroll(full)") // FIXME: vectorize?
#endif
#endif
#ifndef VECTORIZE
#define VECTORIZE
#ifndef UNROLL
#define UNROLL
#endif
#define FOR_LIMB(i,op) { unsigned int i=0; for (i=0; i<NLIMBS; i++) { op; }}
#define FOR_LIMB_V(i,op) { unsigned int i=0; VECTORIZE for (i=0; i<NLIMBS; i++) { op; }}
#define FOR_LIMB_U(i,op) { unsigned int i=0; UNROLL for (i=0; i<NLIMBS; i++) { op; }}
/**
 * Copy x = y.
 *
 * gf is declared as a one-element array of gf_s, so both parameters are
 * pointers to a single struct here; assigning element 0 copies the whole
 * field element (every limb) in one struct assignment.
 */
siv gf_cpy(gf x, const gf y) { x[0] = y[0]; }
......@@ -138,7 +146,7 @@ siv gf_bias ( gf c, int amt) {
/** Subtract mod p. Bias by 2 and don't reduce */
siv gf_sub_nr ( gf_s *__restrict__ c, const gf a, const gf b ) {
// FOR_LIMB_V(i, c->limb[i] = a->limb[i] - b->limb[i] + 2*P->limb[i] );
// FOR_LIMB_U(i, c->limb[i] = a->limb[i] - b->limb[i] + 2*P->limb[i] );
ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO
field_sub_nr((field_t *)c, (const field_t *)a, (const field_t *)b);
gf_bias(c, 2);
......@@ -155,7 +163,7 @@ siv gf_sub_nr_x ( gf c, const gf a, const gf b, int amt ) {
/** Add mod p. Don't reduce. */
siv gf_add_nr ( gf c, const gf a, const gf b ) {
// FOR_LIMB_V(i, c->limb[i] = a->limb[i] + b->limb[i]);
// FOR_LIMB_U(i, c->limb[i] = a->limb[i] + b->limb[i]);
ANALYZE_THIS_ROUTINE_CAREFULLY; //TODO
field_add_nr((field_t *)c, (const field_t *)a, (const field_t *)b);
}
......@@ -183,7 +191,7 @@ sv cond_neg(gf x, decaf_bool_t neg) {
/** Constant time, if (swap) (x,y) = (y,x); */
siv cond_swap(gf x, gf_s *__restrict__ y, decaf_bool_t swap) {
FOR_LIMB_V(i, {
FOR_LIMB_U(i, {
decaf_word_t s = (x->limb[i] ^ y->limb[i]) & swap;
x->limb[i] ^= s;
y->limb[i] ^= s;
......@@ -371,9 +379,27 @@ decaf_bool_t API_NS(scalar_invert) (
}
return ~API_NS(scalar_eq)(out,API_NS(scalar_zero));
#else
(void)out;
(void)a;
return 0;
decaf_255_scalar_t b, ma;
int i;
sc_montmul(b,API_NS(scalar_one),sc_r2);
sc_montmul(ma,a,sc_r2);
for (i=SCALAR_BITS-1; i>=0; i--) {
sc_montsqr(b,b);
decaf_word_t w = sc_p->limb[i/WBITS];
if (i<WBITS) {
assert(w >= 2);
w-=2;
}
if (1 & w>>(i%WBITS)) {
sc_montmul(b,b,ma);
}
}
sc_montmul(out,b,decaf_255_scalar_one);
API_NS(scalar_destroy)(b);
API_NS(scalar_destroy)(ma);
return ~API_NS(scalar_eq)(out,decaf_255_scalar_zero);
#endif
}
......
/**
* @file decaf_config.h
* @author Mike Hamburg
*
* @copyright
* Copyright (c) 2015 Cryptography Research, Inc. \n
* Released under the MIT License. See LICENSE.txt for license information.
*
* @brief Configuration for decaf_fast.c
*/
#ifndef __DECAF_255_CONFIG_H__
#define __DECAF_255_CONFIG_H__ 1
/**
* Use the Montgomery ladder for direct scalarmul.
*
* The Montgomery ladder is faster than Edwards scalarmul, but providing
* the features Decaf supports (cofactor elimination, twist rejection)
* makes it complicated and adds code. Removing the ladder saves a few
* kilobytes at the cost of perhaps 5-10% overhead in direct scalarmul
* time.
*/
#define DECAF_USE_MONTGOMERY_LADDER 1
/** The number of comb tables for fixed base scalarmul. */
#define DECAF_COMBS_N 3
/** The number of teeth per comb for fixed base scalarmul. */
#define DECAF_COMBS_T 5
/** The comb spacing for fixed base scalarmul. */
#define DECAF_COMBS_S 17
/** Performance tuning: the width of the fixed window for scalar mul. */
#define DECAF_WINDOW_BITS 4
/**
* The number of bits used for the precomputed table in variable-time
* double scalarmul.
*/
#define DECAF_WNAF_FIXED_TABLE_BITS 5
/**
* Performance tuning: bits used for the variable table in variable-time
* double scalarmul.
*/
#define DECAF_WNAF_VAR_TABLE_BITS 3
#endif /* __DECAF_255_CONFIG_H__ */
......@@ -22,164 +22,33 @@ p255_mul (
const p255_t *as,
const p255_t *bs
) {
const uint64_t *a = as->limb, *b = bs->limb;
const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1);
uint64_t bh[4];
int i,j;
for (i=0; i<4; i++) bh[i] = b[i+1] * 19;
uint64_t *c = cs->limb;
__uint128_t accum0 = 0, accum1 = 0, accum2;
uint64_t mask = (1ull<<51) - 1;
uint64_t aa[4], bb[4], bbb[4];
unsigned int i;
for (i=0; i<4; i++) {
aa[i] = a[i] + a[i+4];
bb[i] = b[i] + b[i+4];
bbb[i] = bb[i] + b[i+4];
}
int I_HATE_UNROLLED_LOOPS = 0;
if (I_HATE_UNROLLED_LOOPS) {
/* The compiler probably won't unroll this,
* so it's like 80% slower.
*/
for (i=0; i<4; i++) {
accum2 = 0;
unsigned int j;
for (j=0; j<=i; j++) {
accum2 += widemul(a[j], b[i-j]);
accum1 += widemul(aa[j], bb[i-j]);
accum0 += widemul(a[j+4], b[i-j+4]);
}
for (; j<4; j++) {
accum2 += widemul(a[j], b[i-j+8]);
accum1 += widemul(aa[j], bbb[i-j+4]);
accum0 += widemul(a[j+4], bb[i-j+4]);
}
accum1 -= accum2;
accum0 += accum2;
c[i] = ((uint64_t)(accum0)) & mask;
c[i+4] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
__uint128_t accum = 0;
for (i=0; i<5; i++) {
for (j=0; j<=i; j++) {
accum += widemul(b[i-j], a[j]);
}
} else {
accum2 = widemul(a[0], b[0]);
accum1 += widemul(aa[0], bb[0]);
accum0 += widemul(a[4], b[4]);
accum2 += widemul(a[1], b[7]);
accum1 += widemul(aa[1], bbb[3]);
accum0 += widemul(a[5], bb[3]);
accum2 += widemul(a[2], b[6]);
accum1 += widemul(aa[2], bbb[2]);
accum0 += widemul(a[6], bb[2]);
accum2 += widemul(a[3], b[5]);
accum1 += widemul(aa[3], bbb[1]);
accum0 += widemul(a[7], bb[1]);
accum1 -= accum2;
accum0 += accum2;
c[0] = ((uint64_t)(accum0)) & mask;
c[4] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum2 = widemul(a[0], b[1]);
accum1 += widemul(aa[0], bb[1]);
accum0 += widemul(a[4], b[5]);
accum2 += widemul(a[1], b[0]);
accum1 += widemul(aa[1], bb[0]);
accum0 += widemul(a[5], b[4]);
accum2 += widemul(a[2], b[7]);
accum1 += widemul(aa[2], bbb[3]);
accum0 += widemul(a[6], bb[3]);
accum2 += widemul(a[3], b[6]);
accum1 += widemul(aa[3], bbb[2]);
accum0 += widemul(a[7], bb[2]);
accum1 -= accum2;
accum0 += accum2;
c[1] = ((uint64_t)(accum0)) & mask;
c[5] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum2 = widemul(a[0], b[2]);
accum1 += widemul(aa[0], bb[2]);
accum0 += widemul(a[4], b[6]);
accum2 += widemul(a[1], b[1]);
accum1 += widemul(aa[1], bb[1]);
accum0 += widemul(a[5], b[5]);
accum2 += widemul(a[2], b[0]);
accum1 += widemul(aa[2], bb[0]);
accum0 += widemul(a[6], b[4]);
accum2 += widemul(a[3], b[7]);
accum1 += widemul(aa[3], bbb[3]);
accum0 += widemul(a[7], bb[3]);
accum1 -= accum2;
accum0 += accum2;
c[2] = ((uint64_t)(accum0)) & mask;
c[6] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum2 = widemul(a[0], b[3]);
accum1 += widemul(aa[0], bb[3]);
accum0 += widemul(a[4], b[7]);
accum2 += widemul(a[1], b[2]);
accum1 += widemul(aa[1], bb[2]);
accum0 += widemul(a[5], b[6]);
accum2 += widemul(a[2], b[1]);
accum1 += widemul(aa[2], bb[1]);
accum0 += widemul(a[6], b[5]);
accum2 += widemul(a[3], b[0]);
accum1 += widemul(aa[3], bb[0]);
accum0 += widemul(a[7], b[4]);
accum1 -= accum2;
accum0 += accum2;
c[3] = ((uint64_t)(accum0)) & mask;
c[7] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
} /* !I_HATE_UNROLLED_LOOPS */
accum0 += accum1;
accum0 += c[4];
accum1 += c[0];
c[4] = ((uint64_t)(accum0)) & mask;
c[0] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
c[5] += ((uint64_t)(accum0));
c[1] += ((uint64_t)(accum1));
for (; j<5; j++) {
accum += widemul(bh[i-j+4], a[j]);
}
c[i] = accum & mask;
accum >>= 51;
}
/* PERF: parallelize? eh well this is reference */
accum *= 19;
accum += c[0];
c[0] = accum & mask;
accum >>= 51;
assert(accum < mask);
c[1] += accum;
}
void
......@@ -188,27 +57,25 @@ p255_mulw (
const p255_t *as,
uint64_t b
) {
const uint64_t *a = as->limb;
const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
int i;
uint64_t *c = cs->limb;
__uint128_t accum0 = 0, accum4 = 0;
uint64_t mask = (1ull<<56) - 1;
int i;
for (i=0; i<4; i++) {
accum0 += widemul(b, a[i]);
accum4 += widemul(b, a[i+4]);
c[i] = accum0 & mask; accum0 >>= 56;
c[i+4] = accum4 & mask; accum4 >>= 56;
__uint128_t accum = 0;
for (i=0; i<5; i++) {
accum += widemul(b, a[i]);
c[i] = accum & mask;
accum >>= 51;
}
/* PERF: parallelize? eh well this is reference */
accum *= 19;
accum += c[0];
c[0] = accum & mask;
accum >>= 51;
accum0 += accum4 + c[4];
c[4] = accum0 & mask;
c[5] += accum0 >> 56;
accum4 += c[0];
c[0] = accum4 & mask;
c[1] += accum4 >> 56;
assert(accum < mask);
c[1] += accum;
}
void
......@@ -223,23 +90,21 @@ void
p255_strong_reduce (
p255_t *a
) {
uint64_t mask = (1ull<<56)-1;
uint64_t mask = (1ull<<51)-1;
/* first, clear high */
a->limb[4] += a->limb[7]>>56;
a->limb[0] += a->limb[7]>>56;
a->limb[7] &= mask;
a->limb[0] += (a->limb[4]>>51)*19;
a->limb[4] &= mask;
/* now the total is less than 2^255 - 2^(255-56) + 2^(255-56+8) < 2p */
/* now the total is less than 2p */
/* compute total_value - p. No need to reduce mod p. */
__int128_t scarry = 0;
int i;
for (i=0; i<8; i++) {
scarry = scarry + a->limb[i] - ((i==4)?mask-1:mask);
for (i=0; i<5; i++) {
scarry = scarry + a->limb[i] - ((i==0)?mask-18:mask);
a->limb[i] = scarry & mask;
scarry >>= 56;
scarry >>= 51;
}
/* uncommon case: it was >= p, so now scarry = 0 and this = x
......@@ -253,10 +118,10 @@ p255_strong_reduce (
__uint128_t carry = 0;
/* add it back */
for (i=0; i<8; i++) {
carry = carry + a->limb[i] + ((i==4)?(scarry_mask&~1):scarry_mask);
for (i=0; i<5; i++) {
carry = carry + a->limb[i] + ((i==0)?(scarry_mask&~18):scarry_mask);
a->limb[i] = carry & mask;
carry >>= 56;
carry >>= 51;
}
assert(is_zero(carry + scarry));
......@@ -271,12 +136,13 @@ p255_serialize (
p255_t red;
p255_copy(&red, x);
p255_strong_reduce(&red);
for (i=0; i<8; i++) {
for (j=0; j<7; j++) {
serial[7*i+j] = red.limb[i];
red.limb[i] >>= 8;
uint64_t *r = red.limb;
uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12};
for (i=0; i<4; i++) {
for (j=0; j<8; j++) {
serial[8*i+j] = ser64[i];
ser64[i] >>= 8;
}
assert(red.limb[i] == 0);
}
}
......@@ -286,33 +152,27 @@ p255_deserialize (
const uint8_t serial[32]
) {
int i,j;
for (i=0; i<8; i++) {
uint64_t ser64[4], mask = ((1ull<<51)-1);
for (i=0; i<4; i++) {
uint64_t out = 0;
for (j=0; j<7; j++) {
out |= ((uint64_t)serial[7*i+j])<<(8*j);
for (j=0; j<8; j++) {
out |= ((uint64_t)serial[8*i+j])<<(8*j);
}
x->limb[i] = out;
ser64[i] = out;
}
/* Check for reduction.
*
* The idea is to create a variable ge which is all ones (rather, 56 ones)
* if and only if the low $i$ words of $x$ are >= those of p.
*
* Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111)
*/
uint64_t ge = -1, mask = (1ull<<56)-1;
for (i=0; i<4; i++) {
ge &= x->limb[i];
}
/* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */
ge = (ge & (x->limb[4] + 1)) | is_zero(x->limb[4] ^ mask);
/* Test for >= 2^255-19 */
uint64_t ge = -(((__uint128_t)ser64[0]+19)>>64);
ge &= ser64[1];
ge &= ser64[2];
ge &= (ser64[3]<<1) + 1;
ge |= -(((__uint128_t)ser64[3]+0x8000000000000000)>>64);
/* Propagate the rest */
for (i=5; i<8; i++) {
ge &= x->limb[i];
}
x->limb[0] = ser64[0] & mask;
x->limb[1] = (ser64[0]>>51 | ser64[1]<<13) & mask;
x->limb[2] = (ser64[1]>>38 | ser64[2]<<26) & mask;
x->limb[3] = (ser64[2]>>25 | ser64[3]<<39) & mask;
x->limb[4] = ser64[3]>>12;
return ~is_zero(ge ^ mask);
return ~is_zero(~ge);
}
......@@ -15,7 +15,17 @@ typedef struct p255_t {
} p255_t;
#define LBITS 51
#define FIELD_LITERAL(a,b,c,d,e) {{a,b,c,d,e}}
#define FIELD_LITERAL(a,b,c,d,e) {{ a,b,c,d,e }}
/*
#define FIELD_LITERAL(a,b,c,d) {{ \
(a##ull) & LMASK, \
((a##ull)>>51 | (b##ull)<<13) & LMASK, \
((b##ull)>>38 | (c##ull)<<26) & LMASK, \
((c##ull)>>25 | (d##ull)<<39) & LMASK, \
(d##ull)>>12 \
}}