Commit 1eab9a3a authored by Michael Hamburg

New release.

Rework the directory structure into something saner, with src/ test/ include/ and build/

Beginning some tests.  Also, now support scan-build.

Now support 32-bit including vectorless ARM.  NEON is not yet supported, because I don't
have a test machine.

Many internal changes, improvements, and bug fixes.
parent ed6fdbf3
......@@ -508,7 +508,7 @@ HIDE_SCOPE_NAMES = NO
# the files that are included by a file in the documentation of that file.
# The default value is: YES.
SHOW_INCLUDE_FILES = YES
SHOW_INCLUDE_FILES = NO
# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
# grouped member an include statement to the documentation, telling the reader
......@@ -777,7 +777,7 @@ FILE_PATTERNS =
# be searched for input files as well.
# The default value is: NO.
RECURSIVE = NO
RECURSIVE = YES
# The EXCLUDE tag can be used to specify files and/or directories that should be
# excluded from the INPUT source files. This way you can easily exclude a
......
March 29, 2014:
Added a test directory with various tests. Currently testing SHA512 Monte
Carlo, compatibility of the different scalarmul functions, and some
identities on EC point ops. Began moving these tests out of benchmarker.
Added scan-build support.
Improved some internal interfaces. Made a structure for Barrett primes
instead of passing parameters individually. Moved some field operations
to places that make more sense, eg Barrett serialize and deserialize. The
deserialize operation now checks that its argument is in [0,q).
Added more documentation.
Changed the names of a bunch of functions. Still not entirely consistent,
but getting more so.
Some minor speed improvements. For example, multiply is now a couple cycles
faster.
Added a hackish attempt at thread-safety and initialization sanity checking
in the Goldilocks top-level routines.
Fixed some vector alignment bugs. Compiling with -O0 should now work.
Slightly simplified recode_wnaf.
Add a config.h file for future configuration. EXPERIMENT flags moved here.
I've decided against major changes to SHA512 for the moment. They add speed
but also significantly bloat the code, which is going to hurt L1 cache
performance. Perhaps we should link to OpenSSL if a faster SHA512 is desired.
Reorganize the source tree into src, test; factor arch stuff into src/arch_*.
Make most of the code 32-bit clean. There's now a 32-bit generic and 32-bit
vectorless ARM version. No NEON version yet because I don't have a test
machine (could use my phone in a pinch I guess?). The 32-bit version still
isn't heavily optimized, but on ARM it's using a nicely reworked signed/phi-adic
multiplier. The squaring is also based on this, but could really stand some
improvement.
When passed an even exponent (or extra doubles), the Montgomery ladder should
now accept points if and only if they lie on the curve. This needs
additional testing, but it passes the zero bit exponent test.
On 32-bit, use 8x4x14 instead of 5x5x18 table organization. Probably there's
a better heuristic.
March 5, 2014:
First revision.
......
......@@ -2,61 +2,101 @@
# Released under the MIT License. See LICENSE.txt for license information.
CC = clang
CFLAGS = -O3 -std=c99 -pedantic -Wall -Wextra -Werror \
-mssse3 -maes -mavx2 -DMUST_HAVE_AVX2 -mbmi2 \
-ffunction-sections -fdata-sections -fomit-frame-pointer -fPIC \
-DEXPERIMENT_ECDH_OBLITERATE_CT=1 -DEXPERIMENT_ECDH_STIR_IN_PUBKEYS=1
LD = clang
.PHONY: clean all runbench todo doc
ARCH = arch_x86_64
WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \
-Wgcc-compat -Wmissing-declarations
INCFLAGS = -Isrc/include -Iinclude -Isrc/$(ARCH)
LANGFLAGS = -std=c99
GENFLAGS = -ffunction-sections -fdata-sections -fomit-frame-pointer -fPIC
OFLAGS = -O3
#XFLAGS = -DN_TESTS_BASE=1000
ARCHFLAGS = -mssse3 -maes -mavx2 -DMUST_HAVE_AVX2 -mbmi2
#ARCHFLAGS = -m32 -mcpu=cortex-a9 -mfpu=vfpv3-d16
CFLAGS = $(LANGFLAGS) $(WARNFLAGS) $(INCFLAGS) $(OFLAGS) $(ARCHFLAGS) $(GENFLAGS) $(XFLAGS)
LDFLAGS = $(ARCHFLAGS)
ASFLAGS = $(ARCHFLAGS)
.PHONY: clean all test bench todo doc lib
.PRECIOUS: build/%.s
HEADERS= Makefile $(shell find . -name "*.h") build/timestamp
LIBCOMPONENTS= build/goldilocks.o build/barrett_field.o build/crandom.o \
build/p448.o build/ec_point.o build/scalarmul.o build/sha512.o
all: bench
TESTCOMPONENTS=build/test.o build/test_scalarmul.o build/test_sha512.o \
build/test_pointops.o
BENCHCOMPONENTS=build/bench.o
all: lib build/test build/bench
scan: clean
scan-build --use-analyzer=`which clang` \
-enable-checker deadcode -enable-checker llvm \
-enable-checker osx -enable-checker security -enable-checker unix \
make build/bench build/test build/goldilocks.so
build/bench: $(LIBCOMPONENTS) $(BENCHCOMPONENTS)
$(LD) $(LDFLAGS) -o $@ $^
build/test: $(LIBCOMPONENTS) $(TESTCOMPONENTS)
$(LD) $(LDFLAGS) -o $@ $^
lib: build/goldilocks.so
build/goldilocks.so: $(LIBCOMPONENTS)
rm -f $@
libtool -macosx_version_min 10.6 -dynamic -dead_strip -lc -x -o $@ \
-exported_symbols_list src/exported.sym \
$(LIBCOMPONENTS)
bench: *.h *.c
$(CC) $(CFLAGS) -o $@ *.c
build/timestamp:
mkdir -p build
touch $@
build/%.o: build/%.s
$(CC) -c -o $@ $<
$(CC) $(ASFLAGS) -c -o $@ $<
build/%.s: %.c $(HEADERS)
build/%.s: src/%.c $(HEADERS)
$(CC) $(CFLAGS) -S -c -o $@ $<
build/goldilocks.so: $(LIBCOMPONENTS)
rm -f $@
libtool -macosx_version_min 10.6 -dynamic -dead_strip -lc -x -o $@ \
-exported_symbols_list exported.sym \
$(LIBCOMPONENTS)
build/%.s: test/%.c $(HEADERS)
$(CC) $(CFLAGS) -S -c -o $@ $<
build/%.s: src/$(ARCH)/%.c $(HEADERS)
$(CC) $(CFLAGS) -S -c -o $@ $<
doc/timestamp:
mkdir -p doc
touch $@
doc: Doxyfile doc/timestamp *.c *.h
doc: Doxyfile doc/timestamp src/*.c src/include/*.h src/$(ARCH)/*.c src/$(ARCH)/*.h
doxygen
todo::
@egrep --color=auto -w -i 'hack|todo|fixme|bug|xxx|perf|future|remove' *.h *.c
@(find * -name '*.h'; find * -name '*.c') | xargs egrep --color=auto -w \
'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE'
@echo '============================='
@(for i in FIXME BUG XXX TODO HACK PERF FUTURE REMOVE; do \
egrep -w -i $$i *.h *.c > /dev/null || continue; \
(find * -name '*.h'; find * -name '*.c') | xargs egrep -w $$i > /dev/null || continue; \
/bin/echo -n $$i' ' | head -c 10; \
egrep -w -i $$i *.h *.c | wc -l; \
(find * -name '*.h'; find * -name '*.c') | xargs egrep -w $$i| wc -l; \
done)
@echo '============================='
@echo -n 'Total '
@egrep -w -i 'hack|todo|fixme|bug|xxx|perf|future|remove' *.h *.c | wc -l
@(find * -name '*.h'; find * -name '*.c') | xargs egrep -w \
'HACK|TODO|FIXME|BUG|XXX|PERF|FUTURE|REMOVE' | wc -l
bench: build/bench
./$<
runbench: bench
test: build/test
./$<
clean:
rm -fr build bench *.o *.s
rm -fr build doc
......@@ -23,7 +23,7 @@ Important work items for Ed448-Goldilocks:
* Word_t, mask_t, bigregister_t, etc.
* Generate asm intrinsics with a script?
* Bugfix: make sure that init() and randomization are thread-safe.
* [DONE] Bugfix: make sure that init() and randomization are thread-safe.
* Security: check on deserialization that points are < p.
* Check also that they're nonzero or otherwise non-pathological?
......@@ -80,30 +80,29 @@ Important work items for Ed448-Goldilocks:
* Portability: make the inner layers of the code 32-bit clean.
* Write new versions of the field code.
* 28-bit limbs give less headroom for carries.
* NEON and vectorless ARM.
* Now have a vectorless ARM version; need NEON.
* Improve speed of 32-bit field code.
* Run through the SAGE tool to generate new bias & bound.
* Portability: make the outer layers of the code 32-bit clean.
* There are endian bugs in the signing algorithm.
* NEON and vectorless constant-time comparison.
* [DONE] Portability: make the outer layers of the code 32-bit clean.
* Performance: write and incorporate some extra routines
* Deserialize_and_isogeny
* Unconditional negate (or just plain subtract)
* Performance: fixed parameters?
* Performance/flexibility: decide which parameters should be hard-coded.
* Perhaps useful for comb precomputation.
* Performance: Improve SHA512.
* Improve portability.
* [DONE?] Improve portability.
* Improve speed.
* Except not, because this adds too much code size.
* Link OpenSSL if a fast SHA is desired.
* Protocol:
* Decide what things to stir into hashes for various functions.
* Performance: improve the Barrett field code.
* Support other primes?
* Capture prime shape into a struct instead of passing 3 params.
* Make 32-bit clean. (SAGE?)
* [DONE] Make 32-bit clean.
* Automation:
* Improve the SAGE tool to cover more cases
......@@ -111,6 +110,10 @@ Important work items for Ed448-Goldilocks:
* Constant-time selection
* Intrinsics code
* Field code?
* SAGE tool is impossibly slow on 32-bit
* Currently stuck on Elligator after 19 hours.
* [FIXED] at least for now.
* Vector-mul-chains
* Negation "bubble pushing" optimization
......
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/
/**
* @file goldilocks.h
* @author Mike Hamburg
* @brief Goldilocks high-level functions.
*/
#ifndef __GOLDILOCKS_H__
#define __GOLDILOCKS_H__ 1
#include <stdint.h>
/**
 * @brief Serialized form of a Goldilocks public key.
 *
 * The key is carried as a single 56-byte blob named @c opaque; callers are
 * expected to hand it to the goldilocks_* routines rather than interpret
 * the bytes themselves.
 *
 * @warning This isn't even my final form!
 */
struct goldilocks_public_key_t {
uint8_t opaque[56]; /**< Serialized data. */
};
/**
 * @brief Serialized form of a Goldilocks private key.
 *
 * Contains 56 bytes of actual private key, 56 bytes of
 * public key, and 32 bytes of symmetric key for randomization
 * (56 + 56 + 32 = 144 bytes in total, matching the size of @c opaque).
 *
 * @warning This isn't even my final form!
 */
struct goldilocks_private_key_t {
uint8_t opaque[144]; /**< Serialized data. */
};
#ifdef __cplusplus
extern "C" {
#endif
/** @brief No error. */
static const int GOLDI_EOK = 0;
/** @brief Error: your key or other state is corrupt. */
static const int GOLDI_ECORRUPT = 44801;
/** @brief Error: other party's key is corrupt. */
static const int GOLDI_EINVAL = 44802;
/** @brief Error: not enough entropy. */
static const int GOLDI_ENODICE = 44804;
/** @brief Error: you need to initialize the library first. */
static const int GOLDI_EUNINIT = 44805;
/**
 * @brief Error: called init() but we are already initialized.
 *
 * Has its own value, distinct from GOLDI_EUNINIT, so that a caller
 * comparing a return code against these constants can tell "not yet
 * initialized" apart from "already initialized".
 */
static const int GOLDI_EALREADYINIT = 44806;
/**
 * @brief Initialize Goldilocks' precomputed tables and
 * random number generator. This function must be called before
 * any of the other Goldilocks routines (except
 * goldilocks_shared_secret in the current version) and should be
 * called only once per process.
 *
 * There is currently no way to tear down this state. It is possible
 * that a future version of this library will not require this function.
 *
 * The function takes no arguments; it is declared with an explicit
 * @c (void) parameter list so that C compilers see a real prototype and
 * reject calls that pass arguments.
 *
 * @retval GOLDI_EOK Success.
 * @retval GOLDI_EALREADYINIT Already initialized.
 * @retval GOLDI_ECORRUPT Memory is corrupted, or another thread is already init'ing.
 * @retval Nonzero An error occurred.
 */
int
goldilocks_init (void)
    __attribute__((warn_unused_result));
/**
 * @brief Generate a new random keypair.
 *
 * Both pointers are declared @c nonnull and the result is declared
 * @c warn_unused_result, so the compiler will flag NULL arguments it can
 * see and any call whose status code is ignored.
 *
 * @param [out] privkey The generated private key.
 * @param [out] pubkey The generated public key.
 *
 * @warning This isn't even my final form!
 *
 * @retval GOLDI_EOK Success.
 * @retval GOLDI_ENODICE Insufficient entropy.
 * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
 */
int
goldilocks_keygen (
struct goldilocks_private_key_t *privkey,
struct goldilocks_public_key_t *pubkey
) __attribute__((warn_unused_result,nonnull(1,2)));
/**
 * @brief Extract the public key from a private key.
 *
 * This is essentially a memcpy from the public part of the privkey.
 *
 * @param [out] pubkey The extracted public key.
 * @param [in] privkey The private key.
 *
 * @retval GOLDI_EOK Success.
 * @retval GOLDI_ECORRUPT The private key is corrupt.
 */
int
goldilocks_private_to_public (
struct goldilocks_public_key_t *pubkey,
const struct goldilocks_private_key_t *privkey
) __attribute__((nonnull(1,2)));
/**
 * @brief Generate a Diffie-Hellman shared secret in constant time.
 *
 * This function uses some compile-time flags whose merit remains to
 * be decided.
 *
 * If the flag EXPERIMENT_ECDH_OBLITERATE_CT is set, prepend 40 bytes
 * of zeros to the secret before hashing. In the case that the other
 * party's key is detectably corrupt, instead the symmetric part
 * of the secret key is used to produce a pseudorandom value.
 *
 * If EXPERIMENT_ECDH_STIR_IN_PUBKEYS is set, the sum and product of
 * the two parties' public keys is prepended to the hash.
 *
 * In the current version, this function can safely be run even without
 * goldilocks_init(). But this property is not guaranteed for future
 * versions, so call it anyway.
 *
 * @warning This isn't even my final form!
 *
 * @param [out] shared The shared secret established with the other party
 * (a 64-byte buffer).
 * @param [in] my_privkey My private key.
 * @param [in] your_pubkey The other party's public key.
 *
 * @retval GOLDI_EOK Success.
 * @retval GOLDI_ECORRUPT My key is corrupt.
 * @retval GOLDI_EINVAL The other party's key is corrupt.
 * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
 * NOTE(review): given the paragraph above, this code is presumably not
 * returned by the current version -- confirm against the implementation.
 */
int
goldilocks_shared_secret (
uint8_t shared[64],
const struct goldilocks_private_key_t *my_privkey,
const struct goldilocks_public_key_t *your_pubkey
) __attribute__((warn_unused_result,nonnull(1,2,3)));
/**
 * @brief Sign a message.
 *
 * The signature is deterministic, using the symmetric secret found in the
 * secret key to form a nonce.
 *
 * The technique used in signing is a modified Schnorr system, like EdDSA.
 *
 * @warning This isn't even my final form!
 *
 * @param [out] signature_out Space for the output signature
 * (56*2 = 112 bytes).
 * @param [in] message The message to be signed. Declared @c nonnull, so a
 * valid pointer is expected even when @p message_len is 0.
 * @param [in] message_len The length of the message to be signed.
 * @param [in] privkey My private key.
 *
 * @retval GOLDI_EOK Success.
 * @retval GOLDI_ECORRUPT My key is corrupt.
 * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
 *
 * NOTE(review): unlike goldilocks_keygen(), goldilocks_shared_secret() and
 * goldilocks_verify(), this declaration is not marked warn_unused_result;
 * callers should still check the returned status.
 */
int
goldilocks_sign (
uint8_t signature_out[56*2],
const uint8_t *message,
uint64_t message_len,
const struct goldilocks_private_key_t *privkey
) __attribute__((nonnull(1,2,4)));
/**
 * @brief Verify a signature.
 *
 * This function is fairly strict. It will correctly detect when
 * the signature has the wrong cofactor component, or when the sig
 * values aren't less than p or q.
 *
 * Currently this function does not detect when the public key is weird,
 * eg 0, has cofactor, etc. As a result, a party with a bogus public
 * key could create signatures that succeed on some systems and fail on
 * others.
 *
 * @warning This isn't even my final form!
 *
 * @param [in] signature The signature (56*2 = 112 bytes).
 * @param [in] message The message to be verified. Declared @c nonnull, so a
 * valid pointer is expected even when @p message_len is 0.
 * @param [in] message_len The length of the message to be verified.
 * @param [in] pubkey The signer's public key.
 *
 * @retval GOLDI_EOK Success.
 * @retval GOLDI_EINVAL The public key or signature is corrupt.
 * @retval GOLDI_EUNINIT You must call goldilocks_init() first.
 */
int
goldilocks_verify (
const uint8_t signature[56*2],
const uint8_t *message,
uint64_t message_len,
const struct goldilocks_public_key_t *pubkey
) __attribute__((warn_unused_result,nonnull(1,2,4)));
#ifdef __cplusplus
}; /* extern "C" */
#endif
#endif /* __GOLDILOCKS_H__ */
This diff is collapsed.
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/
#include "word.h"
#include "p448.h"
//#include "x86-64-arith.h"
/**
 * Branch-free zero test.
 *
 * Widens x to a double word and subtracts one; only x == 0 borrows, which
 * fills the bits above WORD_BITS with ones.  Those upper bits are returned.
 * Assuming dword_t is unsigned and twice as wide as word_t, and mask_t is
 * word-sized (TODO confirm in word.h), the result is all-ones when x == 0
 * and 0 otherwise.
 */
static inline mask_t __attribute__((always_inline))
is_zero (
    word_t x
) {
    const dword_t borrowed = (dword_t)x - 1;
    return borrowed >> WORD_BITS;
}
/**
 * Full-width unsigned multiply: returns the exact 64-bit product of two
 * 32-bit operands.  One operand is widened first so the multiplication is
 * carried out in 64 bits and cannot wrap.
 */
static uint64_t widemul_32 (
    const uint32_t a,
    const uint32_t b
) {
    const uint64_t wide_a = a;
    return wide_a * b;
}
/**
 * Field multiplication: cs = as * bs, on elements stored as 16 limbs of
 * 28 bits each.
 *
 * Each operand is split into a low half (limbs 0..7) and a high half
 * (limbs 8..15).  For every output column j the code accumulates three
 * kinds of partial products -- lo*lo, (lo+hi)*(lo+hi) and hi*hi -- and
 * combines them so that column j of the low half and column j of the high
 * half (c[j] and c[j+8]) come out together.  Partial products that would
 * land at or beyond limb 8 of a half are folded back into the current
 * columns; this matches the modulus p = 2^448 - 2^224 - 1 that
 * p448_strong_reduce() subtracts.
 *
 * The accumulators are unsigned 64-bit and are allowed to go transiently
 * "negative" (wrap) between the subtract and the following adds.
 *
 * @param [out] cs Product.  Declared __restrict__, so it must not alias
 *                 either input.  Limbs 1 and 9 receive a final unmasked
 *                 carry, so the result is not guaranteed to have every limb
 *                 below 2^28.
 * @param [in]  as First factor.
 * @param [in]  bs Second factor.
 */
void
p448_mul (
p448_t *__restrict__ cs,
const p448_t *as,
const p448_t *bs
) {
/* The inputs are used as given; no weak reduction is applied to copies first. */
const uint32_t *a = as->limb, *b = bs->limb;
uint32_t *c = cs->limb;
/* accum0 feeds the low-half column c[j], accum1 the high-half column
 * c[j+8]; accum2 is per-column scratch. */
uint64_t accum0 = 0, accum1 = 0, accum2 = 0;
uint32_t mask = (1<<28) - 1;
/* aa = a_lo + a_hi and bb = b_lo + b_hi, limb by limb. */
uint32_t aa[8], bb[8];
/* For some reason clang doesn't vectorize this without prompting?
 * (An explicitly vectorized form was tried here; the scalar loop below is
 * the one that is compiled.) */
int i,j;
for (i=0; i<8; i++) {
aa[i] = a[i] + a[i+8];
bb[i] = b[i] + b[i+8];
}
for (j=0; j<8; j++) {
/* Terms with i <= j land directly in column j of each half product. */
accum2 = 0;
for (i=0; i<=j; i++) {
accum2 += widemul_32(a[j-i],b[i]);
accum1 += widemul_32(aa[j-i],bb[i]);
accum0 += widemul_32(a[8+j-i], b[8+i]);
}
/* high column: (lo+hi)*(lo+hi) - lo*lo;  low column: hi*hi + lo*lo */
accum1 -= accum2;
accum0 += accum2;
/* Terms with i > j belong to column j+8 of each half product and are
 * folded back into the current pair of columns. */
accum2 = 0;
for (; i<8; i++) {
accum0 -= widemul_32(a[8+j-i], b[i]);
accum2 += widemul_32(aa[8+j-i], bb[i]);
accum1 += widemul_32(a[16+j-i], b[8+i]);
}
accum1 += accum2;
accum0 += accum2;
/* Emit 28 bits of each column and keep the rest as carry into column j+1. */
c[j] = ((uint32_t)(accum0)) & mask;
c[j+8] = ((uint32_t)(accum1)) & mask;
accum0 >>= 28;
accum1 >>= 28;
}
/* Final carries: accum0 is the carry out of limb 7 (goes into limb 8);
 * accum1 is the carry out of limb 15, which is added into both limb 0 and
 * limb 8. */
accum0 += accum1;
accum0 += c[8];
accum1 += c[0];
c[8] = ((uint32_t)(accum0)) & mask;
c[0] = ((uint32_t)(accum1)) & mask;
accum0 >>= 28;
accum1 >>= 28;
/* One more carry step, left unmasked: limbs 9 and 1 may exceed 28 bits. */
c[9] += ((uint32_t)(accum0));
c[1] += ((uint32_t)(accum1));
}
/**
 * Multiply a field element by a small scalar: cs = as * b.
 *
 * The scalar is split into a low 28-bit digit and the remaining high bits
 * (truncated to 32 bits), so that b is treated as blo + bhi * 2^28.  The low
 * half (limbs 0..7) and high half (limbs 8..15) of the product are built in
 * parallel with two 64-bit accumulators, emitting 28 bits per limb.
 *
 * @param [out] cs Product.  Declared __restrict__, so it must not alias as.
 *                 Limbs 1 and 9 receive a final unmasked carry.
 * @param [in]  as Field element to scale.
 * @param [in]  b  Scalar multiplier.
 */
void
p448_mulw (
    p448_t *__restrict__ cs,
    const p448_t *as,
    uint64_t b
) {
    const uint32_t limb_mask = (1ull<<28)-1;
    const uint32_t blo = b & ((1<<28)-1);
    const uint32_t bhi = b>>28;
    const uint32_t *a = as->limb;
    uint32_t *c = cs->limb;
    uint64_t lo_acc, hi_acc;
    int k;

    /* Column 0 of each half.  bhi * a[15] sits one limb past the top and is
     * folded into both halves. */
    lo_acc = widemul_32(blo, a[0]) + widemul_32(bhi, a[15]);
    hi_acc = widemul_32(blo, a[8]) + widemul_32(bhi, a[15] + a[7]);

    c[0] = lo_acc & limb_mask;
    lo_acc >>= 28;
    c[8] = hi_acc & limb_mask;
    hi_acc >>= 28;

    /* Remaining columns: limb k gets blo * a[k] plus bhi * a[k-1]. */
    for (k=1; k<8; k++) {
        lo_acc += widemul_32(blo, a[k]) + widemul_32(bhi, a[k-1]);
        hi_acc += widemul_32(blo, a[k+8]) + widemul_32(bhi, a[k+7]);

        c[k] = lo_acc & limb_mask;
        lo_acc >>= 28;
        c[k+8] = hi_acc & limb_mask;
        hi_acc >>= 28;
    }

    /* The carry out of limb 7 goes into limb 8; the carry out of limb 15 is
     * added into both limb 8 and limb 0. */
    lo_acc += hi_acc + c[8];
    c[8] = lo_acc & limb_mask;
    c[9] += lo_acc >> 28;

    hi_acc += c[0];
    c[0] = hi_acc & limb_mask;
    c[1] += hi_acc >> 28;
}
/**
 * Field squaring: cs = as * as.
 *
 * Implemented by passing the same operand twice to p448_mul(); there is no
 * dedicated squaring routine here.
 *
 * @param [out] cs Square of as.  Declared __restrict__, so it must not
 *                 alias as.
 * @param [in]  as Element to square.
 */
void
p448_sqr (
p448_t *__restrict__ cs,
const p448_t *as
) {
p448_mul(cs,as,as); // PERF: generic multiply used in place of a specialised square
}
/**
 * Fully reduce a field element in place, leaving every limb below 2^28.
 *
 * Steps:
 *   1. Fold the bits above 28 in the top limb back into limbs 0 and 8.
 *   2. Subtract p limb by limb with a signed running carry.
 *   3. If the subtraction went negative, add p back, using a mask derived
 *      from the final carry instead of a branch.
 *
 * p is represented here by limbs that are all (2^28 - 1) except limb 8,
 * which is (2^28 - 2), i.e. p = 2^448 - 2^224 - 1.
 *
 * @param [in,out] a Element to reduce.
 */
void
p448_strong_reduce (
p448_t *a
) {
word_t mask = (1ull<<28)-1;
/* first, clear high: the overflow of limb 15 is added into limbs 8 and 0 */
a->limb[8] += a->limb[15]>>28;
a->limb[0] += a->limb[15]>>28;
a->limb[15] &= mask;
/* now the total is less than 2p.
 * NOTE(review): the original bound here read
 *   2^448 - 2^(448-56) + 2^(448-56+8) < 2p
 * whose exponents look carried over from a 56-bit-limb variant; with 28-bit
 * limbs held in 32-bit words the analogous bound would be roughly
 *   2^448 - 2^(448-28) + 2^(448-28+4) -- confirm.
 */
/* compute total_value - p. No need to reduce mod p. */
dsword_t scarry = 0;
int i;
for (i=0; i<16; i++) {
/* limb 8 of p is mask-1, every other limb of p is mask */
scarry = scarry + a->limb[i] - ((i==8)?mask-1:mask);
a->limb[i] = scarry & mask;
/* relies on >> of a negative signed value propagating the sign */
scarry >>= 28;
}
/* uncommon case: it was >= p, so now scarry = 0 and this = x - p
 * common case: it was < p, so now scarry = -1 and this = x - p + 2^448
 * so let's add back in p. will carry back off the top for 2^448.
 */
assert(is_zero(scarry) | is_zero(scarry+1));
/* scarry_mask is `mask` when the subtraction borrowed, 0 otherwise */
word_t scarry_mask = scarry & mask;
dword_t carry = 0;
/* add it back: adds p when scarry_mask is set, adds 0 otherwise */
for (i=0; i<16; i++) {
carry = carry + a->limb[i] + ((i==8)?(scarry_mask&~1):scarry_mask);
a->limb[i] = carry & mask;
carry >>= 28;
}
/* the carry out of the add-back must cancel the borrow from the subtraction */
assert(is_zero(carry + scarry));
}
mask_t
p448_is_zero (
const struct p448_t *a
) {
struct p448_t b;
p448_copy(&b,a);
p448_strong_reduce(&b);
uint32_t any = 0;
int i;
for (i=0; i<16; i++) {
any |= b.limb[i];
}
return is_zero(any);
}
/**
 * Serialize a field element to bytes, least-significant byte first.
 *
 * The element is copied and strongly reduced first.  Limbs are then taken in
 * pairs: two 28-bit limbs form one 56-bit value, which is written out as 7
 * bytes.  Eight pairs give 56 bytes in total.
 *
 * @param [out] serial Output buffer; 8 * 7 = 56 bytes are written.
 * @param [in]  x      Element to serialize (not modified).
 */
void
p448_serialize (
    uint8_t *serial,
    const struct p448_t *x
) {
    p448_t canon;
    uint8_t *dst = serial;
    int pair, k;

    p448_copy(&canon, x);
    p448_strong_reduce(&canon);

    for (pair=0; pair<8; pair++) {
        /* low limb in bits 0..27, high limb in bits 28..55 */
        uint64_t chunk = (((uint64_t)canon.limb[2*pair+1])<<28) + canon.limb[2*pair];

        for (k=0; k<7; k++) {
            *dst++ = chunk;
            chunk >>= 8;
        }

        /* after a strong reduce nothing should remain above 56 bits */
        assert(chunk == 0);
    }
}
/**
 * Deserialize 56 little-endian bytes into a field element and report
 * whether the encoded value is properly reduced.
 *
 * Every group of 7 bytes is assembled into a 56-bit value and split into
 * two 28-bit limbs.  The limbs are then compared against p without
 * branching.
 *
 * @param [out] x      Receives the 16 limbs.  They are written whether or
 *                     not the range check passes.
 * @param [in]  serial 56 bytes of input.
 * @return ~is_zero(...) of the comparison word: set when the value is
 *         below p, clear when it is >= p.
 */
mask_t
p448_deserialize (
    p448_t *x,
    const uint8_t serial[56]
) {
    int i, j;

    for (i=0; i<8; i++) {
        /* Gather 7 bytes, most significant first, into one 56-bit value. */
        uint64_t chunk = 0;
        for (j=6; j>=0; j--) {
            chunk = (chunk << 8) | serial[7*i+j];
        }
        x->limb[2*i]   = chunk & ((1ull<<28)-1);
        x->limb[2*i+1] = chunk >> 28;
    }

    /* Check for reduction.
     *
     * ge ends up equal to `mask` (28 ones) if and only if x >= p.
     *
     * As 28-bit limbs, little-endian, p is eight limbs of all ones, then one
     * limb of all ones except its lowest bit (limb 8), then seven more limbs
     * of all ones.
     */
    const uint32_t mask = (1ull<<28)-1;
    uint32_t ge = -1;

    /* Low half: ge stays `mask` only if limbs 0..7 are all ones. */
    for (i=0; i<8; i++) {
        ge &= x->limb[i];
    }

    /* Limb 8: keep ge if the limb is mask-1 (equal to p's limb so far), or
     * force it to all ones if the limb is mask (already greater than p's). */
    ge = (ge & (x->limb[8] + 1)) | is_zero(x->limb[8] ^ mask);

    /* High half: every remaining limb must be all ones as well. */
    for (i=9; i<16; i++) {
        ge &= x->limb[i];
    }

    return ~is_zero(ge ^ mask);
}
void
simultaneous_invert_p448(
struct p448_t *__restrict__ out,
const struct p448_t *in,
unsigned int n
) {
if (n==0) {
return;
} else if (n==1) {
p448_inverse(out,in);