first commit

2013-01-10 21:14:17 +01:00 · 2013-01-10 21:14:17 +01:00 · 3aad2934a3
commit 3aad2934a3
50 changed files with 5780 additions and 0 deletions
--- a/a.exe
+++ b/a.exe
--- a/crypto_sign.h
+++ b/crypto_sign.h
@ -0,0 +1,12 @@
+#ifndef crypto_sign_edwards25519sha512batch_H
+#define crypto_sign_edwards25519sha512batch_H
+
+#define SECRETKEYBYTES 64
+#define PUBLICKEYBYTES 32
+#define SIGNATUREBYTES 64
+
+extern int crypto_sign(unsigned char *,unsigned long long *,const unsigned char *,unsigned long long,const unsigned char *);
+extern int crypto_sign_open(unsigned char *,unsigned long long *,const unsigned char *,unsigned long long,const unsigned char *);
+extern int crypto_sign_keypair(unsigned char *,unsigned char *,unsigned char *);
+
+#endif
--- a/fe.h
+++ b/fe.h
@ -0,0 +1,56 @@
+#ifndef FE_H
+#define FE_H
+
+#include "pstdint.h"
+
+typedef int32_t fe[10];
+
+/*
+fe means field element.
+Here the field is \Z/(2^255-19).
+An element t, entries t[0]...t[9], represents the integer
+t[0]+2^26 t[1]+2^51 t[2]+2^77 t[3]+2^102 t[4]+...+2^230 t[9].
+Bounds on each t[i] vary depending on context.
+*/
+
+#define fe_frombytes crypto_sign_ed25519_ref10_fe_frombytes
+#define fe_tobytes crypto_sign_ed25519_ref10_fe_tobytes
+#define fe_copy crypto_sign_ed25519_ref10_fe_copy
+#define fe_isnonzero crypto_sign_ed25519_ref10_fe_isnonzero
+#define fe_isnegative crypto_sign_ed25519_ref10_fe_isnegative
+#define fe_0 crypto_sign_ed25519_ref10_fe_0
+#define fe_1 crypto_sign_ed25519_ref10_fe_1
+#define fe_cswap crypto_sign_ed25519_ref10_fe_cswap
+#define fe_cmov crypto_sign_ed25519_ref10_fe_cmov
+#define fe_add crypto_sign_ed25519_ref10_fe_add
+#define fe_sub crypto_sign_ed25519_ref10_fe_sub
+#define fe_neg crypto_sign_ed25519_ref10_fe_neg
+#define fe_mul crypto_sign_ed25519_ref10_fe_mul
+#define fe_sq crypto_sign_ed25519_ref10_fe_sq
+#define fe_sq2 crypto_sign_ed25519_ref10_fe_sq2
+#define fe_mul121666 crypto_sign_ed25519_ref10_fe_mul121666
+#define fe_invert crypto_sign_ed25519_ref10_fe_invert
+#define fe_pow22523 crypto_sign_ed25519_ref10_fe_pow22523
+
+extern void fe_frombytes(fe,const unsigned char *);
+extern void fe_tobytes(unsigned char *,const fe);
+
+extern void fe_copy(fe,const fe);
+extern int fe_isnonzero(const fe);
+extern int fe_isnegative(const fe);
+extern void fe_0(fe);
+extern void fe_1(fe);
+extern void fe_cswap(fe,fe,unsigned int);
+extern void fe_cmov(fe,const fe,unsigned int);
+
+extern void fe_add(fe,const fe,const fe);
+extern void fe_sub(fe,const fe,const fe);
+extern void fe_neg(fe,const fe);
+extern void fe_mul(fe,const fe,const fe);
+extern void fe_sq(fe,const fe);
+extern void fe_sq2(fe,const fe);
+extern void fe_mul121666(fe,const fe);
+extern void fe_invert(fe,const fe);
+extern void fe_pow22523(fe,const fe);
+
+#endif
--- a/fe_0.c
+++ b/fe_0.c
@ -0,0 +1,19 @@
+#include "fe.h"
+
+/*
+h = 0
+*/
+
+void fe_0(fe h)
+{
+  h[0] = 0;
+  h[1] = 0;
+  h[2] = 0;
+  h[3] = 0;
+  h[4] = 0;
+  h[5] = 0;
+  h[6] = 0;
+  h[7] = 0;
+  h[8] = 0;
+  h[9] = 0;
+}
--- a/fe_1.c
+++ b/fe_1.c
@ -0,0 +1,19 @@
+#include "fe.h"
+
+/*
+h = 1
+*/
+
+void fe_1(fe h)
+{
+  h[0] = 1;
+  h[1] = 0;
+  h[2] = 0;
+  h[3] = 0;
+  h[4] = 0;
+  h[5] = 0;
+  h[6] = 0;
+  h[7] = 0;
+  h[8] = 0;
+  h[9] = 0;
+}
--- a/fe_add.c
+++ b/fe_add.c
@ -0,0 +1,57 @@
+#include "fe.h"
+
+/*
+h = f + g
+Can overlap h with f or g.
+
+Preconditions:
+   |f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
+   |g| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
+
+Postconditions:
+   |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
+*/
+
+void fe_add(fe h,const fe f,const fe g)
+{
+  int32_t f0 = f[0];
+  int32_t f1 = f[1];
+  int32_t f2 = f[2];
+  int32_t f3 = f[3];
+  int32_t f4 = f[4];
+  int32_t f5 = f[5];
+  int32_t f6 = f[6];
+  int32_t f7 = f[7];
+  int32_t f8 = f[8];
+  int32_t f9 = f[9];
+  int32_t g0 = g[0];
+  int32_t g1 = g[1];
+  int32_t g2 = g[2];
+  int32_t g3 = g[3];
+  int32_t g4 = g[4];
+  int32_t g5 = g[5];
+  int32_t g6 = g[6];
+  int32_t g7 = g[7];
+  int32_t g8 = g[8];
+  int32_t g9 = g[9];
+  int32_t h0 = f0 + g0;
+  int32_t h1 = f1 + g1;
+  int32_t h2 = f2 + g2;
+  int32_t h3 = f3 + g3;
+  int32_t h4 = f4 + g4;
+  int32_t h5 = f5 + g5;
+  int32_t h6 = f6 + g6;
+  int32_t h7 = f7 + g7;
+  int32_t h8 = f8 + g8;
+  int32_t h9 = f9 + g9;
+  h[0] = h0;
+  h[1] = h1;
+  h[2] = h2;
+  h[3] = h3;
+  h[4] = h4;
+  h[5] = h5;
+  h[6] = h6;
+  h[7] = h7;
+  h[8] = h8;
+  h[9] = h9;
+}
--- a/fe_cmov.c
+++ b/fe_cmov.c
@ -0,0 +1,63 @@
+#include "fe.h"
+
+/*
+Replace (f,g) with (g,g) if b == 1;
+replace (f,g) with (f,g) if b == 0.
+
+Preconditions: b in {0,1}.
+*/
+
+void fe_cmov(fe f,const fe g,unsigned int b)
+{
+  int32_t f0 = f[0];
+  int32_t f1 = f[1];
+  int32_t f2 = f[2];
+  int32_t f3 = f[3];
+  int32_t f4 = f[4];
+  int32_t f5 = f[5];
+  int32_t f6 = f[6];
+  int32_t f7 = f[7];
+  int32_t f8 = f[8];
+  int32_t f9 = f[9];
+  int32_t g0 = g[0];
+  int32_t g1 = g[1];
+  int32_t g2 = g[2];
+  int32_t g3 = g[3];
+  int32_t g4 = g[4];
+  int32_t g5 = g[5];
+  int32_t g6 = g[6];
+  int32_t g7 = g[7];
+  int32_t g8 = g[8];
+  int32_t g9 = g[9];
+  int32_t x0 = f0 ^ g0;
+  int32_t x1 = f1 ^ g1;
+  int32_t x2 = f2 ^ g2;
+  int32_t x3 = f3 ^ g3;
+  int32_t x4 = f4 ^ g4;
+  int32_t x5 = f5 ^ g5;
+  int32_t x6 = f6 ^ g6;
+  int32_t x7 = f7 ^ g7;
+  int32_t x8 = f8 ^ g8;
+  int32_t x9 = f9 ^ g9;
+  b = -b;
+  x0 &= b;
+  x1 &= b;
+  x2 &= b;
+  x3 &= b;
+  x4 &= b;
+  x5 &= b;
+  x6 &= b;
+  x7 &= b;
+  x8 &= b;
+  x9 &= b;
+  f[0] = f0 ^ x0;
+  f[1] = f1 ^ x1;
+  f[2] = f2 ^ x2;
+  f[3] = f3 ^ x3;
+  f[4] = f4 ^ x4;
+  f[5] = f5 ^ x5;
+  f[6] = f6 ^ x6;
+  f[7] = f7 ^ x7;
+  f[8] = f8 ^ x8;
+  f[9] = f9 ^ x9;
+}
--- a/fe_copy.c
+++ b/fe_copy.c
@ -0,0 +1,29 @@
+#include "fe.h"
+
+/*
+h = f
+*/
+
+void fe_copy(fe h,const fe f)
+{
+  int32_t f0 = f[0];
+  int32_t f1 = f[1];
+  int32_t f2 = f[2];
+  int32_t f3 = f[3];
+  int32_t f4 = f[4];
+  int32_t f5 = f[5];
+  int32_t f6 = f[6];
+  int32_t f7 = f[7];
+  int32_t f8 = f[8];
+  int32_t f9 = f[9];
+  h[0] = f0;
+  h[1] = f1;
+  h[2] = f2;
+  h[3] = f3;
+  h[4] = f4;
+  h[5] = f5;
+  h[6] = f6;
+  h[7] = f7;
+  h[8] = f8;
+  h[9] = f9;
+}
--- a/fe_frombytes.c
+++ b/fe_frombytes.c
@ -0,0 +1,72 @@
+#include "fe.h"
+#include "pstdint.h"
+
+static uint64_t load_3(const unsigned char *in)
+{
+  uint64_t result;
+  result = (uint64_t) in[0];
+  result |= ((uint64_t) in[1]) << 8;
+  result |= ((uint64_t) in[2]) << 16;
+  return result;
+}
+
+static uint64_t load_4(const unsigned char *in)
+{
+  uint64_t result;
+  result = (uint64_t) in[0];
+  result |= ((uint64_t) in[1]) << 8;
+  result |= ((uint64_t) in[2]) << 16;
+  result |= ((uint64_t) in[3]) << 24;
+  return result;
+}
+
+/*
+Ignores top bit of h.
+*/
+
+void fe_frombytes(fe h,const unsigned char *s)
+{
+  int64_t h0 = load_4(s);
+  int64_t h1 = load_3(s + 4) << 6;
+  int64_t h2 = load_3(s + 7) << 5;
+  int64_t h3 = load_3(s + 10) << 3;
+  int64_t h4 = load_3(s + 13) << 2;
+  int64_t h5 = load_4(s + 16);
+  int64_t h6 = load_3(s + 20) << 7;
+  int64_t h7 = load_3(s + 23) << 5;
+  int64_t h8 = load_3(s + 26) << 4;
+  int64_t h9 = (load_3(s + 29) & 8388607) << 2;
+  int64_t carry0;
+  int64_t carry1;
+  int64_t carry2;
+  int64_t carry3;
+  int64_t carry4;
+  int64_t carry5;
+  int64_t carry6;
+  int64_t carry7;
+  int64_t carry8;
+  int64_t carry9;
+
+  carry9 = (h9 + (int64_t) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= carry9 << 25;
+  carry1 = (h1 + (int64_t) (1<<24)) >> 25; h2 += carry1; h1 -= carry1 << 25;
+  carry3 = (h3 + (int64_t) (1<<24)) >> 25; h4 += carry3; h3 -= carry3 << 25;
+  carry5 = (h5 + (int64_t) (1<<24)) >> 25; h6 += carry5; h5 -= carry5 << 25;
+  carry7 = (h7 + (int64_t) (1<<24)) >> 25; h8 += carry7; h7 -= carry7 << 25;
+
+  carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
+  carry2 = (h2 + (int64_t) (1<<25)) >> 26; h3 += carry2; h2 -= carry2 << 26;
+  carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
+  carry6 = (h6 + (int64_t) (1<<25)) >> 26; h7 += carry6; h6 -= carry6 << 26;
+  carry8 = (h8 + (int64_t) (1<<25)) >> 26; h9 += carry8; h8 -= carry8 << 26;
+
+  h[0] = h0;
+  h[1] = h1;
+  h[2] = h2;
+  h[3] = h3;
+  h[4] = h4;
+  h[5] = h5;
+  h[6] = h6;
+  h[7] = h7;
+  h[8] = h8;
+  h[9] = h9;
+}
--- a/fe_invert.c
+++ b/fe_invert.c
@ -0,0 +1,174 @@
+#include "fe.h"
+
+void fe_invert(fe out,const fe z)
+{
+  fe t0;
+  fe t1;
+  fe t2;
+  fe t3;
+  int i;
+
+
+/* qhasm: fe z1 */
+
+/* qhasm: fe z2 */
+
+/* qhasm: fe z8 */
+
+/* qhasm: fe z9 */
+
+/* qhasm: fe z11 */
+
+/* qhasm: fe z22 */
+
+/* qhasm: fe z_5_0 */
+
+/* qhasm: fe z_10_5 */
+
+/* qhasm: fe z_10_0 */
+
+/* qhasm: fe z_20_10 */
+
+/* qhasm: fe z_20_0 */
+
+/* qhasm: fe z_40_20 */
+
+/* qhasm: fe z_40_0 */
+
+/* qhasm: fe z_50_10 */
+
+/* qhasm: fe z_50_0 */
+
+/* qhasm: fe z_100_50 */
+
+/* qhasm: fe z_100_0 */
+
+/* qhasm: fe z_200_100 */
+
+/* qhasm: fe z_200_0 */
+
+/* qhasm: fe z_250_50 */
+
+/* qhasm: fe z_250_0 */
+
+/* qhasm: fe z_255_5 */
+
+/* qhasm: fe z_255_21 */
+
+/* qhasm: enter pow225521 */
+
+/* qhasm: z2 = z1^2^1 */
+/* asm 1: fe_sq(>z2=fe#1,<z1=fe#11); for (i = 1;i < 1;++i) fe_sq(>z2=fe#1,>z2=fe#1); */
+/* asm 2: fe_sq(>z2=t0,<z1=z); for (i = 1;i < 1;++i) fe_sq(>z2=t0,>z2=t0); */
+fe_sq(t0,z); for (i = 1;i < 1;++i) fe_sq(t0,t0);
+
+/* qhasm: z8 = z2^2^2 */
+/* asm 1: fe_sq(>z8=fe#2,<z2=fe#1); for (i = 1;i < 2;++i) fe_sq(>z8=fe#2,>z8=fe#2); */
+/* asm 2: fe_sq(>z8=t1,<z2=t0); for (i = 1;i < 2;++i) fe_sq(>z8=t1,>z8=t1); */
+fe_sq(t1,t0); for (i = 1;i < 2;++i) fe_sq(t1,t1);
+
+/* qhasm: z9 = z1*z8 */
+/* asm 1: fe_mul(>z9=fe#2,<z1=fe#11,<z8=fe#2); */
+/* asm 2: fe_mul(>z9=t1,<z1=z,<z8=t1); */
+fe_mul(t1,z,t1);
+
+/* qhasm: z11 = z2*z9 */
+/* asm 1: fe_mul(>z11=fe#1,<z2=fe#1,<z9=fe#2); */
+/* asm 2: fe_mul(>z11=t0,<z2=t0,<z9=t1); */
+fe_mul(t0,t0,t1);
+
+/* qhasm: z22 = z11^2^1 */
+/* asm 1: fe_sq(>z22=fe#3,<z11=fe#1); for (i = 1;i < 1;++i) fe_sq(>z22=fe#3,>z22=fe#3); */
+/* asm 2: fe_sq(>z22=t2,<z11=t0); for (i = 1;i < 1;++i) fe_sq(>z22=t2,>z22=t2); */
+fe_sq(t2,t0); for (i = 1;i < 1;++i) fe_sq(t2,t2);
+
+/* qhasm: z_5_0 = z9*z22 */
+/* asm 1: fe_mul(>z_5_0=fe#2,<z9=fe#2,<z22=fe#3); */
+/* asm 2: fe_mul(>z_5_0=t1,<z9=t1,<z22=t2); */
+fe_mul(t1,t1,t2);
+
+/* qhasm: z_10_5 = z_5_0^2^5 */
+/* asm 1: fe_sq(>z_10_5=fe#3,<z_5_0=fe#2); for (i = 1;i < 5;++i) fe_sq(>z_10_5=fe#3,>z_10_5=fe#3); */
+/* asm 2: fe_sq(>z_10_5=t2,<z_5_0=t1); for (i = 1;i < 5;++i) fe_sq(>z_10_5=t2,>z_10_5=t2); */
+fe_sq(t2,t1); for (i = 1;i < 5;++i) fe_sq(t2,t2);
+
+/* qhasm: z_10_0 = z_10_5*z_5_0 */
+/* asm 1: fe_mul(>z_10_0=fe#2,<z_10_5=fe#3,<z_5_0=fe#2); */
+/* asm 2: fe_mul(>z_10_0=t1,<z_10_5=t2,<z_5_0=t1); */
+fe_mul(t1,t2,t1);
+
+/* qhasm: z_20_10 = z_10_0^2^10 */
+/* asm 1: fe_sq(>z_20_10=fe#3,<z_10_0=fe#2); for (i = 1;i < 10;++i) fe_sq(>z_20_10=fe#3,>z_20_10=fe#3); */
+/* asm 2: fe_sq(>z_20_10=t2,<z_10_0=t1); for (i = 1;i < 10;++i) fe_sq(>z_20_10=t2,>z_20_10=t2); */
+fe_sq(t2,t1); for (i = 1;i < 10;++i) fe_sq(t2,t2);
+
+/* qhasm: z_20_0 = z_20_10*z_10_0 */
+/* asm 1: fe_mul(>z_20_0=fe#3,<z_20_10=fe#3,<z_10_0=fe#2); */
+/* asm 2: fe_mul(>z_20_0=t2,<z_20_10=t2,<z_10_0=t1); */
+fe_mul(t2,t2,t1);
+
+/* qhasm: z_40_20 = z_20_0^2^20 */
+/* asm 1: fe_sq(>z_40_20=fe#4,<z_20_0=fe#3); for (i = 1;i < 20;++i) fe_sq(>z_40_20=fe#4,>z_40_20=fe#4); */
+/* asm 2: fe_sq(>z_40_20=t3,<z_20_0=t2); for (i = 1;i < 20;++i) fe_sq(>z_40_20=t3,>z_40_20=t3); */
+fe_sq(t3,t2); for (i = 1;i < 20;++i) fe_sq(t3,t3);
+
+/* qhasm: z_40_0 = z_40_20*z_20_0 */
+/* asm 1: fe_mul(>z_40_0=fe#3,<z_40_20=fe#4,<z_20_0=fe#3); */
+/* asm 2: fe_mul(>z_40_0=t2,<z_40_20=t3,<z_20_0=t2); */
+fe_mul(t2,t3,t2);
+
+/* qhasm: z_50_10 = z_40_0^2^10 */
+/* asm 1: fe_sq(>z_50_10=fe#3,<z_40_0=fe#3); for (i = 1;i < 10;++i) fe_sq(>z_50_10=fe#3,>z_50_10=fe#3); */
+/* asm 2: fe_sq(>z_50_10=t2,<z_40_0=t2); for (i = 1;i < 10;++i) fe_sq(>z_50_10=t2,>z_50_10=t2); */
+fe_sq(t2,t2); for (i = 1;i < 10;++i) fe_sq(t2,t2);
+
+/* qhasm: z_50_0 = z_50_10*z_10_0 */
+/* asm 1: fe_mul(>z_50_0=fe#2,<z_50_10=fe#3,<z_10_0=fe#2); */
+/* asm 2: fe_mul(>z_50_0=t1,<z_50_10=t2,<z_10_0=t1); */
+fe_mul(t1,t2,t1);
+
+/* qhasm: z_100_50 = z_50_0^2^50 */
+/* asm 1: fe_sq(>z_100_50=fe#3,<z_50_0=fe#2); for (i = 1;i < 50;++i) fe_sq(>z_100_50=fe#3,>z_100_50=fe#3); */
+/* asm 2: fe_sq(>z_100_50=t2,<z_50_0=t1); for (i = 1;i < 50;++i) fe_sq(>z_100_50=t2,>z_100_50=t2); */
+fe_sq(t2,t1); for (i = 1;i < 50;++i) fe_sq(t2,t2);
+
+/* qhasm: z_100_0 = z_100_50*z_50_0 */
+/* asm 1: fe_mul(>z_100_0=fe#3,<z_100_50=fe#3,<z_50_0=fe#2); */
+/* asm 2: fe_mul(>z_100_0=t2,<z_100_50=t2,<z_50_0=t1); */
+fe_mul(t2,t2,t1);
+
+/* qhasm: z_200_100 = z_100_0^2^100 */
+/* asm 1: fe_sq(>z_200_100=fe#4,<z_100_0=fe#3); for (i = 1;i < 100;++i) fe_sq(>z_200_100=fe#4,>z_200_100=fe#4); */
+/* asm 2: fe_sq(>z_200_100=t3,<z_100_0=t2); for (i = 1;i < 100;++i) fe_sq(>z_200_100=t3,>z_200_100=t3); */
+fe_sq(t3,t2); for (i = 1;i < 100;++i) fe_sq(t3,t3);
+
+/* qhasm: z_200_0 = z_200_100*z_100_0 */
+/* asm 1: fe_mul(>z_200_0=fe#3,<z_200_100=fe#4,<z_100_0=fe#3); */
+/* asm 2: fe_mul(>z_200_0=t2,<z_200_100=t3,<z_100_0=t2); */
+fe_mul(t2,t3,t2);
+
+/* qhasm: z_250_50 = z_200_0^2^50 */
+/* asm 1: fe_sq(>z_250_50=fe#3,<z_200_0=fe#3); for (i = 1;i < 50;++i) fe_sq(>z_250_50=fe#3,>z_250_50=fe#3); */
+/* asm 2: fe_sq(>z_250_50=t2,<z_200_0=t2); for (i = 1;i < 50;++i) fe_sq(>z_250_50=t2,>z_250_50=t2); */
+fe_sq(t2,t2); for (i = 1;i < 50;++i) fe_sq(t2,t2);
+
+/* qhasm: z_250_0 = z_250_50*z_50_0 */
+/* asm 1: fe_mul(>z_250_0=fe#2,<z_250_50=fe#3,<z_50_0=fe#2); */
+/* asm 2: fe_mul(>z_250_0=t1,<z_250_50=t2,<z_50_0=t1); */
+fe_mul(t1,t2,t1);
+
+/* qhasm: z_255_5 = z_250_0^2^5 */
+/* asm 1: fe_sq(>z_255_5=fe#2,<z_250_0=fe#2); for (i = 1;i < 5;++i) fe_sq(>z_255_5=fe#2,>z_255_5=fe#2); */
+/* asm 2: fe_sq(>z_255_5=t1,<z_250_0=t1); for (i = 1;i < 5;++i) fe_sq(>z_255_5=t1,>z_255_5=t1); */
+fe_sq(t1,t1); for (i = 1;i < 5;++i) fe_sq(t1,t1);
+
+/* qhasm: z_255_21 = z_255_5*z11 */
+/* asm 1: fe_mul(>z_255_21=fe#12,<z_255_5=fe#2,<z11=fe#1); */
+/* asm 2: fe_mul(>z_255_21=out,<z_255_5=t1,<z11=t0); */
+fe_mul(out,t1,t0);
+
+/* qhasm: return */
+
+
+  return;
+}
--- a/fe_isnegative.c
+++ b/fe_isnegative.c
@ -0,0 +1,16 @@
+#include "fe.h"
+
+/*
+return 1 if f is in {1,3,5,...,q-2}
+return 0 if f is in {0,2,4,...,q-1}
+
+Preconditions:
+   |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
+*/
+
+int fe_isnegative(const fe f)
+{
+  unsigned char s[32];
+  fe_tobytes(s,f);
+  return s[0] & 1;
+}
--- a/fe_isnonzero.c
+++ b/fe_isnonzero.c
@ -0,0 +1,20 @@
+#include "fe.h"
+#include "verify.h"
+
+
+/*
+return 1 if f == 0
+return 0 if f != 0
+
+Preconditions:
+   |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
+*/
+
+static const unsigned char zero[32];
+
+int fe_isnonzero(const fe f)
+{
+  unsigned char s[32];
+  fe_tobytes(s,f);
+  return crypto_verify_32(s,zero);
+}
--- a/fe_mul.c
+++ b/fe_mul.c
@ -0,0 +1,253 @@
+#include "fe.h"
+#include "pstdint.h"
+
+/*
+h = f * g
+Can overlap h with f or g.
+
+Preconditions:
+   |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
+   |g| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
+
+Postconditions:
+   |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
+*/
+
+/*
+Notes on implementation strategy:
+
+Using schoolbook multiplication.
+Karatsuba would save a little in some cost models.
+
+Most multiplications by 2 and 19 are 32-bit precomputations;
+cheaper than 64-bit postcomputations.
+
+There is one remaining multiplication by 19 in the carry chain;
+one *19 precomputation can be merged into this,
+but the resulting data flow is considerably less clean.
+
+There are 12 carries below.
+10 of them are 2-way parallelizable and vectorizable.
+Can get away with 11 carries, but then data flow is much deeper.
+
+With tighter constraints on inputs can squeeze carries into int32.
+*/
+
+void fe_mul(fe h,const fe f,const fe g)
+{
+  int32_t f0 = f[0];
+  int32_t f1 = f[1];
+  int32_t f2 = f[2];
+  int32_t f3 = f[3];
+  int32_t f4 = f[4];
+  int32_t f5 = f[5];
+  int32_t f6 = f[6];
+  int32_t f7 = f[7];
+  int32_t f8 = f[8];
+  int32_t f9 = f[9];
+  int32_t g0 = g[0];
+  int32_t g1 = g[1];
+  int32_t g2 = g[2];
+  int32_t g3 = g[3];
+  int32_t g4 = g[4];
+  int32_t g5 = g[5];
+  int32_t g6 = g[6];
+  int32_t g7 = g[7];
+  int32_t g8 = g[8];
+  int32_t g9 = g[9];
+  int32_t g1_19 = 19 * g1; /* 1.959375*2^29 */
+  int32_t g2_19 = 19 * g2; /* 1.959375*2^30; still ok */
+  int32_t g3_19 = 19 * g3;
+  int32_t g4_19 = 19 * g4;
+  int32_t g5_19 = 19 * g5;
+  int32_t g6_19 = 19 * g6;
+  int32_t g7_19 = 19 * g7;
+  int32_t g8_19 = 19 * g8;
+  int32_t g9_19 = 19 * g9;
+  int32_t f1_2 = 2 * f1;
+  int32_t f3_2 = 2 * f3;
+  int32_t f5_2 = 2 * f5;
+  int32_t f7_2 = 2 * f7;
+  int32_t f9_2 = 2 * f9;
+  int64_t f0g0    = f0   * (int64_t) g0;
+  int64_t f0g1    = f0   * (int64_t) g1;
+  int64_t f0g2    = f0   * (int64_t) g2;
+  int64_t f0g3    = f0   * (int64_t) g3;
+  int64_t f0g4    = f0   * (int64_t) g4;
+  int64_t f0g5    = f0   * (int64_t) g5;
+  int64_t f0g6    = f0   * (int64_t) g6;
+  int64_t f0g7    = f0   * (int64_t) g7;
+  int64_t f0g8    = f0   * (int64_t) g8;
+  int64_t f0g9    = f0   * (int64_t) g9;
+  int64_t f1g0    = f1   * (int64_t) g0;
+  int64_t f1g1_2  = f1_2 * (int64_t) g1;
+  int64_t f1g2    = f1   * (int64_t) g2;
+  int64_t f1g3_2  = f1_2 * (int64_t) g3;
+  int64_t f1g4    = f1   * (int64_t) g4;
+  int64_t f1g5_2  = f1_2 * (int64_t) g5;
+  int64_t f1g6    = f1   * (int64_t) g6;
+  int64_t f1g7_2  = f1_2 * (int64_t) g7;
+  int64_t f1g8    = f1   * (int64_t) g8;
+  int64_t f1g9_38 = f1_2 * (int64_t) g9_19;
+  int64_t f2g0    = f2   * (int64_t) g0;
+  int64_t f2g1    = f2   * (int64_t) g1;
+  int64_t f2g2    = f2   * (int64_t) g2;
+  int64_t f2g3    = f2   * (int64_t) g3;
+  int64_t f2g4    = f2   * (int64_t) g4;
+  int64_t f2g5    = f2   * (int64_t) g5;
+  int64_t f2g6    = f2   * (int64_t) g6;
+  int64_t f2g7    = f2   * (int64_t) g7;
+  int64_t f2g8_19 = f2   * (int64_t) g8_19;
+  int64_t f2g9_19 = f2   * (int64_t) g9_19;
+  int64_t f3g0    = f3   * (int64_t) g0;
+  int64_t f3g1_2  = f3_2 * (int64_t) g1;
+  int64_t f3g2    = f3   * (int64_t) g2;
+  int64_t f3g3_2  = f3_2 * (int64_t) g3;
+  int64_t f3g4    = f3   * (int64_t) g4;
+  int64_t f3g5_2  = f3_2 * (int64_t) g5;
+  int64_t f3g6    = f3   * (int64_t) g6;
+  int64_t f3g7_38 = f3_2 * (int64_t) g7_19;
+  int64_t f3g8_19 = f3   * (int64_t) g8_19;
+  int64_t f3g9_38 = f3_2 * (int64_t) g9_19;
+  int64_t f4g0    = f4   * (int64_t) g0;
+  int64_t f4g1    = f4   * (int64_t) g1;
+  int64_t f4g2    = f4   * (int64_t) g2;
+  int64_t f4g3    = f4   * (int64_t) g3;
+  int64_t f4g4    = f4   * (int64_t) g4;
+  int64_t f4g5    = f4   * (int64_t) g5;
+  int64_t f4g6_19 = f4   * (int64_t) g6_19;
+  int64_t f4g7_19 = f4   * (int64_t) g7_19;
+  int64_t f4g8_19 = f4   * (int64_t) g8_19;
+  int64_t f4g9_19 = f4   * (int64_t) g9_19;
+  int64_t f5g0    = f5   * (int64_t) g0;
+  int64_t f5g1_2  = f5_2 * (int64_t) g1;
+  int64_t f5g2    = f5   * (int64_t) g2;
+  int64_t f5g3_2  = f5_2 * (int64_t) g3;
+  int64_t f5g4    = f5   * (int64_t) g4;
+  int64_t f5g5_38 = f5_2 * (int64_t) g5_19;
+  int64_t f5g6_19 = f5   * (int64_t) g6_19;
+  int64_t f5g7_38 = f5_2 * (int64_t) g7_19;
+  int64_t f5g8_19 = f5   * (int64_t) g8_19;
+  int64_t f5g9_38 = f5_2 * (int64_t) g9_19;
+  int64_t f6g0    = f6   * (int64_t) g0;
+  int64_t f6g1    = f6   * (int64_t) g1;
+  int64_t f6g2    = f6   * (int64_t) g2;
+  int64_t f6g3    = f6   * (int64_t) g3;
+  int64_t f6g4_19 = f6   * (int64_t) g4_19;
+  int64_t f6g5_19 = f6   * (int64_t) g5_19;
+  int64_t f6g6_19 = f6   * (int64_t) g6_19;
+  int64_t f6g7_19 = f6   * (int64_t) g7_19;
+  int64_t f6g8_19 = f6   * (int64_t) g8_19;
+  int64_t f6g9_19 = f6   * (int64_t) g9_19;
+  int64_t f7g0    = f7   * (int64_t) g0;
+  int64_t f7g1_2  = f7_2 * (int64_t) g1;
+  int64_t f7g2    = f7   * (int64_t) g2;
+  int64_t f7g3_38 = f7_2 * (int64_t) g3_19;
+  int64_t f7g4_19 = f7   * (int64_t) g4_19;
+  int64_t f7g5_38 = f7_2 * (int64_t) g5_19;
+  int64_t f7g6_19 = f7   * (int64_t) g6_19;
+  int64_t f7g7_38 = f7_2 * (int64_t) g7_19;
+  int64_t f7g8_19 = f7   * (int64_t) g8_19;
+  int64_t f7g9_38 = f7_2 * (int64_t) g9_19;
+  int64_t f8g0    = f8   * (int64_t) g0;
+  int64_t f8g1    = f8   * (int64_t) g1;
+  int64_t f8g2_19 = f8   * (int64_t) g2_19;
+  int64_t f8g3_19 = f8   * (int64_t) g3_19;
+  int64_t f8g4_19 = f8   * (int64_t) g4_19;
+  int64_t f8g5_19 = f8   * (int64_t) g5_19;
+  int64_t f8g6_19 = f8   * (int64_t) g6_19;
+  int64_t f8g7_19 = f8   * (int64_t) g7_19;
+  int64_t f8g8_19 = f8   * (int64_t) g8_19;
+  int64_t f8g9_19 = f8   * (int64_t) g9_19;
+  int64_t f9g0    = f9   * (int64_t) g0;
+  int64_t f9g1_38 = f9_2 * (int64_t) g1_19;
+  int64_t f9g2_19 = f9   * (int64_t) g2_19;
+  int64_t f9g3_38 = f9_2 * (int64_t) g3_19;
+  int64_t f9g4_19 = f9   * (int64_t) g4_19;
+  int64_t f9g5_38 = f9_2 * (int64_t) g5_19;
+  int64_t f9g6_19 = f9   * (int64_t) g6_19;
+  int64_t f9g7_38 = f9_2 * (int64_t) g7_19;
+  int64_t f9g8_19 = f9   * (int64_t) g8_19;
+  int64_t f9g9_38 = f9_2 * (int64_t) g9_19;
+  int64_t h0 = f0g0+f1g9_38+f2g8_19+f3g7_38+f4g6_19+f5g5_38+f6g4_19+f7g3_38+f8g2_19+f9g1_38;
+  int64_t h1 = f0g1+f1g0   +f2g9_19+f3g8_19+f4g7_19+f5g6_19+f6g5_19+f7g4_19+f8g3_19+f9g2_19;
+  int64_t h2 = f0g2+f1g1_2 +f2g0   +f3g9_38+f4g8_19+f5g7_38+f6g6_19+f7g5_38+f8g4_19+f9g3_38;
+  int64_t h3 = f0g3+f1g2   +f2g1   +f3g0   +f4g9_19+f5g8_19+f6g7_19+f7g6_19+f8g5_19+f9g4_19;
+  int64_t h4 = f0g4+f1g3_2 +f2g2   +f3g1_2 +f4g0   +f5g9_38+f6g8_19+f7g7_38+f8g6_19+f9g5_38;
+  int64_t h5 = f0g5+f1g4   +f2g3   +f3g2   +f4g1   +f5g0   +f6g9_19+f7g8_19+f8g7_19+f9g6_19;
+  int64_t h6 = f0g6+f1g5_2 +f2g4   +f3g3_2 +f4g2   +f5g1_2 +f6g0   +f7g9_38+f8g8_19+f9g7_38;
+  int64_t h7 = f0g7+f1g6   +f2g5   +f3g4   +f4g3   +f5g2   +f6g1   +f7g0   +f8g9_19+f9g8_19;
+  int64_t h8 = f0g8+f1g7_2 +f2g6   +f3g5_2 +f4g4   +f5g3_2 +f6g2   +f7g1_2 +f8g0   +f9g9_38;
+  int64_t h9 = f0g9+f1g8   +f2g7   +f3g6   +f4g5   +f5g4   +f6g3   +f7g2   +f8g1   +f9g0   ;
+  int64_t carry0;
+  int64_t carry1;
+  int64_t carry2;
+  int64_t carry3;
+  int64_t carry4;
+  int64_t carry5;
+  int64_t carry6;
+  int64_t carry7;
+  int64_t carry8;
+  int64_t carry9;
+
+  /*
+  |h0| <= (1.65*1.65*2^52*(1+19+19+19+19)+1.65*1.65*2^50*(38+38+38+38+38))
+    i.e. |h0| <= 1.4*2^60; narrower ranges for h2, h4, h6, h8
+  |h1| <= (1.65*1.65*2^51*(1+1+19+19+19+19+19+19+19+19))
+    i.e. |h1| <= 1.7*2^59; narrower ranges for h3, h5, h7, h9
+  */
+
+  carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
+  carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
+  /* |h0| <= 2^25 */
+  /* |h4| <= 2^25 */
+  /* |h1| <= 1.71*2^59 */
+  /* |h5| <= 1.71*2^59 */
+
+  carry1 = (h1 + (int64_t) (1<<24)) >> 25; h2 += carry1; h1 -= carry1 << 25;
+  carry5 = (h5 + (int64_t) (1<<24)) >> 25; h6 += carry5; h5 -= carry5 << 25;
+  /* |h1| <= 2^24; from now on fits into int32 */
+  /* |h5| <= 2^24; from now on fits into int32 */
+  /* |h2| <= 1.41*2^60 */
+  /* |h6| <= 1.41*2^60 */
+
+  carry2 = (h2 + (int64_t) (1<<25)) >> 26; h3 += carry2; h2 -= carry2 << 26;
+  carry6 = (h6 + (int64_t) (1<<25)) >> 26; h7 += carry6; h6 -= carry6 << 26;
+  /* |h2| <= 2^25; from now on fits into int32 unchanged */
+  /* |h6| <= 2^25; from now on fits into int32 unchanged */
+  /* |h3| <= 1.71*2^59 */
+  /* |h7| <= 1.71*2^59 */
+
+  carry3 = (h3 + (int64_t) (1<<24)) >> 25; h4 += carry3; h3 -= carry3 << 25;
+  carry7 = (h7 + (int64_t) (1<<24)) >> 25; h8 += carry7; h7 -= carry7 << 25;
+  /* |h3| <= 2^24; from now on fits into int32 unchanged */
+  /* |h7| <= 2^24; from now on fits into int32 unchanged */
+  /* |h4| <= 1.72*2^34 */
+  /* |h8| <= 1.41*2^60 */
+
+  carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
+  carry8 = (h8 + (int64_t) (1<<25)) >> 26; h9 += carry8; h8 -= carry8 << 26;
+  /* |h4| <= 2^25; from now on fits into int32 unchanged */
+  /* |h8| <= 2^25; from now on fits into int32 unchanged */
+  /* |h5| <= 1.01*2^24 */
+  /* |h9| <= 1.71*2^59 */
+
+  carry9 = (h9 + (int64_t) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= carry9 << 25;
+  /* |h9| <= 2^24; from now on fits into int32 unchanged */
+  /* |h0| <= 1.1*2^39 */
+
+  carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
+  /* |h0| <= 2^25; from now on fits into int32 unchanged */
+  /* |h1| <= 1.01*2^24 */
+
+  h[0] = h0;
+  h[1] = h1;
+  h[2] = h2;
+  h[3] = h3;
+  h[4] = h4;
+  h[5] = h5;
+  h[6] = h6;
+  h[7] = h7;
+  h[8] = h8;
+  h[9] = h9;
+}
--- a/fe_neg.c
+++ b/fe_neg.c
@ -0,0 +1,45 @@
+#include "fe.h"
+
+/*
+h = -f
+
+Preconditions:
+   |f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
+
+Postconditions:
+   |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
+*/
+
+void fe_neg(fe h,const fe f)
+{
+  int32_t f0 = f[0];
+  int32_t f1 = f[1];
+  int32_t f2 = f[2];
+  int32_t f3 = f[3];
+  int32_t f4 = f[4];
+  int32_t f5 = f[5];
+  int32_t f6 = f[6];
+  int32_t f7 = f[7];
+  int32_t f8 = f[8];
+  int32_t f9 = f[9];
+  int32_t h0 = -f0;
+  int32_t h1 = -f1;
+  int32_t h2 = -f2;
+  int32_t h3 = -f3;
+  int32_t h4 = -f4;
+  int32_t h5 = -f5;
+  int32_t h6 = -f6;
+  int32_t h7 = -f7;
+  int32_t h8 = -f8;
+  int32_t h9 = -f9;
+  h[0] = h0;
+  h[1] = h1;
+  h[2] = h2;
+  h[3] = h3;
+  h[4] = h4;
+  h[5] = h5;
+  h[6] = h6;
+  h[7] = h7;
+  h[8] = h8;
+  h[9] = h9;
+}
--- a/fe_pow22523.c
+++ b/fe_pow22523.c
@ -0,0 +1,173 @@
+#include "fe.h"
+
+void fe_pow22523(fe out,const fe z)
+{
+  fe t0;
+  fe t1;
+  fe t2;
+  int i;
+
+
+/* qhasm: fe z1 */
+
+/* qhasm: fe z2 */
+
+/* qhasm: fe z8 */
+
+/* qhasm: fe z9 */
+
+/* qhasm: fe z11 */
+
+/* qhasm: fe z22 */
+
+/* qhasm: fe z_5_0 */
+
+/* qhasm: fe z_10_5 */
+
+/* qhasm: fe z_10_0 */
+
+/* qhasm: fe z_20_10 */
+
+/* qhasm: fe z_20_0 */
+
+/* qhasm: fe z_40_20 */
+
+/* qhasm: fe z_40_0 */
+
+/* qhasm: fe z_50_10 */
+
+/* qhasm: fe z_50_0 */
+
+/* qhasm: fe z_100_50 */
+
+/* qhasm: fe z_100_0 */
+
+/* qhasm: fe z_200_100 */
+
+/* qhasm: fe z_200_0 */
+
+/* qhasm: fe z_250_50 */
+
+/* qhasm: fe z_250_0 */
+
+/* qhasm: fe z_252_2 */
+
+/* qhasm: fe z_252_3 */
+
+/* qhasm: enter pow22523 */
+
+/* qhasm: z2 = z1^2^1 */
+/* asm 1: fe_sq(>z2=fe#1,<z1=fe#11); for (i = 1;i < 1;++i) fe_sq(>z2=fe#1,>z2=fe#1); */
+/* asm 2: fe_sq(>z2=t0,<z1=z); for (i = 1;i < 1;++i) fe_sq(>z2=t0,>z2=t0); */
+fe_sq(t0,z); for (i = 1;i < 1;++i) fe_sq(t0,t0);
+
+/* qhasm: z8 = z2^2^2 */
+/* asm 1: fe_sq(>z8=fe#2,<z2=fe#1); for (i = 1;i < 2;++i) fe_sq(>z8=fe#2,>z8=fe#2); */
+/* asm 2: fe_sq(>z8=t1,<z2=t0); for (i = 1;i < 2;++i) fe_sq(>z8=t1,>z8=t1); */
+fe_sq(t1,t0); for (i = 1;i < 2;++i) fe_sq(t1,t1);
+
+/* qhasm: z9 = z1*z8 */
+/* asm 1: fe_mul(>z9=fe#2,<z1=fe#11,<z8=fe#2); */
+/* asm 2: fe_mul(>z9=t1,<z1=z,<z8=t1); */
+fe_mul(t1,z,t1);
+
+/* qhasm: z11 = z2*z9 */
+/* asm 1: fe_mul(>z11=fe#1,<z2=fe#1,<z9=fe#2); */
+/* asm 2: fe_mul(>z11=t0,<z2=t0,<z9=t1); */
+fe_mul(t0,t0,t1);
+
+/* qhasm: z22 = z11^2^1 */
+/* asm 1: fe_sq(>z22=fe#1,<z11=fe#1); for (i = 1;i < 1;++i) fe_sq(>z22=fe#1,>z22=fe#1); */
+/* asm 2: fe_sq(>z22=t0,<z11=t0); for (i = 1;i < 1;++i) fe_sq(>z22=t0,>z22=t0); */
+fe_sq(t0,t0); for (i = 1;i < 1;++i) fe_sq(t0,t0);
+
+/* qhasm: z_5_0 = z9*z22 */
+/* asm 1: fe_mul(>z_5_0=fe#1,<z9=fe#2,<z22=fe#1); */
+/* asm 2: fe_mul(>z_5_0=t0,<z9=t1,<z22=t0); */
+fe_mul(t0,t1,t0);
+
+/* qhasm: z_10_5 = z_5_0^2^5 */
+/* asm 1: fe_sq(>z_10_5=fe#2,<z_5_0=fe#1); for (i = 1;i < 5;++i) fe_sq(>z_10_5=fe#2,>z_10_5=fe#2); */
+/* asm 2: fe_sq(>z_10_5=t1,<z_5_0=t0); for (i = 1;i < 5;++i) fe_sq(>z_10_5=t1,>z_10_5=t1); */
+fe_sq(t1,t0); for (i = 1;i < 5;++i) fe_sq(t1,t1);
+
+/* qhasm: z_10_0 = z_10_5*z_5_0 */
+/* asm 1: fe_mul(>z_10_0=fe#1,<z_10_5=fe#2,<z_5_0=fe#1); */
+/* asm 2: fe_mul(>z_10_0=t0,<z_10_5=t1,<z_5_0=t0); */
+fe_mul(t0,t1,t0);
+
+/* qhasm: z_20_10 = z_10_0^2^10 */
+/* asm 1: fe_sq(>z_20_10=fe#2,<z_10_0=fe#1); for (i = 1;i < 10;++i) fe_sq(>z_20_10=fe#2,>z_20_10=fe#2); */
+/* asm 2: fe_sq(>z_20_10=t1,<z_10_0=t0); for (i = 1;i < 10;++i) fe_sq(>z_20_10=t1,>z_20_10=t1); */
+fe_sq(t1,t0); for (i = 1;i < 10;++i) fe_sq(t1,t1);
+
+/* qhasm: z_20_0 = z_20_10*z_10_0 */
+/* asm 1: fe_mul(>z_20_0=fe#2,<z_20_10=fe#2,<z_10_0=fe#1); */
+/* asm 2: fe_mul(>z_20_0=t1,<z_20_10=t1,<z_10_0=t0); */
+fe_mul(t1,t1,t0);
+
+/* qhasm: z_40_20 = z_20_0^2^20 */
+/* asm 1: fe_sq(>z_40_20=fe#3,<z_20_0=fe#2); for (i = 1;i < 20;++i) fe_sq(>z_40_20=fe#3,>z_40_20=fe#3); */
+/* asm 2: fe_sq(>z_40_20=t2,<z_20_0=t1); for (i = 1;i < 20;++i) fe_sq(>z_40_20=t2,>z_40_20=t2); */
+fe_sq(t2,t1); for (i = 1;i < 20;++i) fe_sq(t2,t2);
+
+/* qhasm: z_40_0 = z_40_20*z_20_0 */
+/* asm 1: fe_mul(>z_40_0=fe#2,<z_40_20=fe#3,<z_20_0=fe#2); */
+/* asm 2: fe_mul(>z_40_0=t1,<z_40_20=t2,<z_20_0=t1); */
+fe_mul(t1,t2,t1);
+
+/* qhasm: z_50_10 = z_40_0^2^10 */
+/* asm 1: fe_sq(>z_50_10=fe#2,<z_40_0=fe#2); for (i = 1;i < 10;++i) fe_sq(>z_50_10=fe#2,>z_50_10=fe#2); */
+/* asm 2: fe_sq(>z_50_10=t1,<z_40_0=t1); for (i = 1;i < 10;++i) fe_sq(>z_50_10=t1,>z_50_10=t1); */
+fe_sq(t1,t1); for (i = 1;i < 10;++i) fe_sq(t1,t1);
+
+/* qhasm: z_50_0 = z_50_10*z_10_0 */
+/* asm 1: fe_mul(>z_50_0=fe#1,<z_50_10=fe#2,<z_10_0=fe#1); */
+/* asm 2: fe_mul(>z_50_0=t0,<z_50_10=t1,<z_10_0=t0); */
+fe_mul(t0,t1,t0);
+
+/* qhasm: z_100_50 = z_50_0^2^50 */
+/* asm 1: fe_sq(>z_100_50=fe#2,<z_50_0=fe#1); for (i = 1;i < 50;++i) fe_sq(>z_100_50=fe#2,>z_100_50=fe#2); */
+/* asm 2: fe_sq(>z_100_50=t1,<z_50_0=t0); for (i = 1;i < 50;++i) fe_sq(>z_100_50=t1,>z_100_50=t1); */
+fe_sq(t1,t0); for (i = 1;i < 50;++i) fe_sq(t1,t1);
+
+/* qhasm: z_100_0 = z_100_50*z_50_0 */
+/* asm 1: fe_mul(>z_100_0=fe#2,<z_100_50=fe#2,<z_50_0=fe#1); */
+/* asm 2: fe_mul(>z_100_0=t1,<z_100_50=t1,<z_50_0=t0); */
+fe_mul(t1,t1,t0);
+
+/* qhasm: z_200_100 = z_100_0^2^100 */
+/* asm 1: fe_sq(>z_200_100=fe#3,<z_100_0=fe#2); for (i = 1;i < 100;++i) fe_sq(>z_200_100=fe#3,>z_200_100=fe#3); */
+/* asm 2: fe_sq(>z_200_100=t2,<z_100_0=t1); for (i = 1;i < 100;++i) fe_sq(>z_200_100=t2,>z_200_100=t2); */
+fe_sq(t2,t1); for (i = 1;i < 100;++i) fe_sq(t2,t2);
+
+/* qhasm: z_200_0 = z_200_100*z_100_0 */
+/* asm 1: fe_mul(>z_200_0=fe#2,<z_200_100=fe#3,<z_100_0=fe#2); */
+/* asm 2: fe_mul(>z_200_0=t1,<z_200_100=t2,<z_100_0=t1); */
+fe_mul(t1,t2,t1);
+
+/* qhasm: z_250_50 = z_200_0^2^50 */
+/* asm 1: fe_sq(>z_250_50=fe#2,<z_200_0=fe#2); for (i = 1;i < 50;++i) fe_sq(>z_250_50=fe#2,>z_250_50=fe#2); */
+/* asm 2: fe_sq(>z_250_50=t1,<z_200_0=t1); for (i = 1;i < 50;++i) fe_sq(>z_250_50=t1,>z_250_50=t1); */
+fe_sq(t1,t1); for (i = 1;i < 50;++i) fe_sq(t1,t1);
+
+/* qhasm: z_250_0 = z_250_50*z_50_0 */
+/* asm 1: fe_mul(>z_250_0=fe#1,<z_250_50=fe#2,<z_50_0=fe#1); */
+/* asm 2: fe_mul(>z_250_0=t0,<z_250_50=t1,<z_50_0=t0); */
+fe_mul(t0,t1,t0);
+
+/* qhasm: z_252_2 = z_250_0^2^2 */
+/* asm 1: fe_sq(>z_252_2=fe#1,<z_250_0=fe#1); for (i = 1;i < 2;++i) fe_sq(>z_252_2=fe#1,>z_252_2=fe#1); */
+/* asm 2: fe_sq(>z_252_2=t0,<z_250_0=t0); for (i = 1;i < 2;++i) fe_sq(>z_252_2=t0,>z_252_2=t0); */
+fe_sq(t0,t0); for (i = 1;i < 2;++i) fe_sq(t0,t0);
+
+/* qhasm: z_252_3 = z_252_2*z1 */
+/* asm 1: fe_mul(>z_252_3=fe#12,<z_252_2=fe#1,<z1=fe#11); */
+/* asm 2: fe_mul(>z_252_3=out,<z_252_2=t0,<z1=z); */
+fe_mul(out,t0,z);
+
+/* qhasm: return */
+
+
+  return;
+}
--- a/fe_sq.c
+++ b/fe_sq.c
@ -0,0 +1,149 @@
+#include "fe.h"
+#include "pstdint.h"
+
+/*
+h = f * f
+Can overlap h with f.
+
+Preconditions:
+   |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
+
+Postconditions:
+   |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
+*/
+
+/*
+See fe_mul.c for discussion of implementation strategy.
+*/
+
+void fe_sq(fe h,const fe f)
+{
+  int32_t f0 = f[0];
+  int32_t f1 = f[1];
+  int32_t f2 = f[2];
+  int32_t f3 = f[3];
+  int32_t f4 = f[4];
+  int32_t f5 = f[5];
+  int32_t f6 = f[6];
+  int32_t f7 = f[7];
+  int32_t f8 = f[8];
+  int32_t f9 = f[9];
+  int32_t f0_2 = 2 * f0;
+  int32_t f1_2 = 2 * f1;
+  int32_t f2_2 = 2 * f2;
+  int32_t f3_2 = 2 * f3;
+  int32_t f4_2 = 2 * f4;
+  int32_t f5_2 = 2 * f5;
+  int32_t f6_2 = 2 * f6;
+  int32_t f7_2 = 2 * f7;
+  int32_t f5_38 = 38 * f5; /* 1.959375*2^30 */
+  int32_t f6_19 = 19 * f6; /* 1.959375*2^30 */
+  int32_t f7_38 = 38 * f7; /* 1.959375*2^30 */
+  int32_t f8_19 = 19 * f8; /* 1.959375*2^30 */
+  int32_t f9_38 = 38 * f9; /* 1.959375*2^30 */
+  int64_t f0f0    = f0   * (int64_t) f0;
+  int64_t f0f1_2  = f0_2 * (int64_t) f1;
+  int64_t f0f2_2  = f0_2 * (int64_t) f2;
+  int64_t f0f3_2  = f0_2 * (int64_t) f3;
+  int64_t f0f4_2  = f0_2 * (int64_t) f4;
+  int64_t f0f5_2  = f0_2 * (int64_t) f5;
+  int64_t f0f6_2  = f0_2 * (int64_t) f6;
+  int64_t f0f7_2  = f0_2 * (int64_t) f7;
+  int64_t f0f8_2  = f0_2 * (int64_t) f8;
+  int64_t f0f9_2  = f0_2 * (int64_t) f9;
+  int64_t f1f1_2  = f1_2 * (int64_t) f1;
+  int64_t f1f2_2  = f1_2 * (int64_t) f2;
+  int64_t f1f3_4  = f1_2 * (int64_t) f3_2;
+  int64_t f1f4_2  = f1_2 * (int64_t) f4;
+  int64_t f1f5_4  = f1_2 * (int64_t) f5_2;
+  int64_t f1f6_2  = f1_2 * (int64_t) f6;
+  int64_t f1f7_4  = f1_2 * (int64_t) f7_2;
+  int64_t f1f8_2  = f1_2 * (int64_t) f8;
+  int64_t f1f9_76 = f1_2 * (int64_t) f9_38;
+  int64_t f2f2    = f2   * (int64_t) f2;
+  int64_t f2f3_2  = f2_2 * (int64_t) f3;
+  int64_t f2f4_2  = f2_2 * (int64_t) f4;
+  int64_t f2f5_2  = f2_2 * (int64_t) f5;
+  int64_t f2f6_2  = f2_2 * (int64_t) f6;
+  int64_t f2f7_2  = f2_2 * (int64_t) f7;
+  int64_t f2f8_38 = f2_2 * (int64_t) f8_19;
+  int64_t f2f9_38 = f2   * (int64_t) f9_38;
+  int64_t f3f3_2  = f3_2 * (int64_t) f3;
+  int64_t f3f4_2  = f3_2 * (int64_t) f4;
+  int64_t f3f5_4  = f3_2 * (int64_t) f5_2;
+  int64_t f3f6_2  = f3_2 * (int64_t) f6;
+  int64_t f3f7_76 = f3_2 * (int64_t) f7_38;
+  int64_t f3f8_38 = f3_2 * (int64_t) f8_19;
+  int64_t f3f9_76 = f3_2 * (int64_t) f9_38;
+  int64_t f4f4    = f4   * (int64_t) f4;
+  int64_t f4f5_2  = f4_2 * (int64_t) f5;
+  int64_t f4f6_38 = f4_2 * (int64_t) f6_19;
+  int64_t f4f7_38 = f4   * (int64_t) f7_38;
+  int64_t f4f8_38 = f4_2 * (int64_t) f8_19;
+  int64_t f4f9_38 = f4   * (int64_t) f9_38;
+  int64_t f5f5_38 = f5   * (int64_t) f5_38;
+  int64_t f5f6_38 = f5_2 * (int64_t) f6_19;
+  int64_t f5f7_76 = f5_2 * (int64_t) f7_38;
+  int64_t f5f8_38 = f5_2 * (int64_t) f8_19;
+  int64_t f5f9_76 = f5_2 * (int64_t) f9_38;
+  int64_t f6f6_19 = f6   * (int64_t) f6_19;
+  int64_t f6f7_38 = f6   * (int64_t) f7_38;
+  int64_t f6f8_38 = f6_2 * (int64_t) f8_19;
+  int64_t f6f9_38 = f6   * (int64_t) f9_38;
+  int64_t f7f7_38 = f7   * (int64_t) f7_38;
+  int64_t f7f8_38 = f7_2 * (int64_t) f8_19;
+  int64_t f7f9_76 = f7_2 * (int64_t) f9_38;
+  int64_t f8f8_19 = f8   * (int64_t) f8_19;
+  int64_t f8f9_38 = f8   * (int64_t) f9_38;
+  int64_t f9f9_38 = f9   * (int64_t) f9_38;
+  int64_t h0 = f0f0  +f1f9_76+f2f8_38+f3f7_76+f4f6_38+f5f5_38;
+  int64_t h1 = f0f1_2+f2f9_38+f3f8_38+f4f7_38+f5f6_38;
+  int64_t h2 = f0f2_2+f1f1_2 +f3f9_76+f4f8_38+f5f7_76+f6f6_19;
+  int64_t h3 = f0f3_2+f1f2_2 +f4f9_38+f5f8_38+f6f7_38;
+  int64_t h4 = f0f4_2+f1f3_4 +f2f2   +f5f9_76+f6f8_38+f7f7_38;
+  int64_t h5 = f0f5_2+f1f4_2 +f2f3_2 +f6f9_38+f7f8_38;
+  int64_t h6 = f0f6_2+f1f5_4 +f2f4_2 +f3f3_2 +f7f9_76+f8f8_19;
+  int64_t h7 = f0f7_2+f1f6_2 +f2f5_2 +f3f4_2 +f8f9_38;
+  int64_t h8 = f0f8_2+f1f7_4 +f2f6_2 +f3f5_4 +f4f4   +f9f9_38;
+  int64_t h9 = f0f9_2+f1f8_2 +f2f7_2 +f3f6_2 +f4f5_2;
+  int64_t carry0;
+  int64_t carry1;
+  int64_t carry2;
+  int64_t carry3;
+  int64_t carry4;
+  int64_t carry5;
+  int64_t carry6;
+  int64_t carry7;
+  int64_t carry8;
+  int64_t carry9;
+
+  carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
+  carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
+
+  carry1 = (h1 + (int64_t) (1<<24)) >> 25; h2 += carry1; h1 -= carry1 << 25;
+  carry5 = (h5 + (int64_t) (1<<24)) >> 25; h6 += carry5; h5 -= carry5 << 25;
+
+  carry2 = (h2 + (int64_t) (1<<25)) >> 26; h3 += carry2; h2 -= carry2 << 26;
+  carry6 = (h6 + (int64_t) (1<<25)) >> 26; h7 += carry6; h6 -= carry6 << 26;
+
+  carry3 = (h3 + (int64_t) (1<<24)) >> 25; h4 += carry3; h3 -= carry3 << 25;
+  carry7 = (h7 + (int64_t) (1<<24)) >> 25; h8 += carry7; h7 -= carry7 << 25;
+
+  carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
+  carry8 = (h8 + (int64_t) (1<<25)) >> 26; h9 += carry8; h8 -= carry8 << 26;
+
+  carry9 = (h9 + (int64_t) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= carry9 << 25;
+
+  carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
+
+  h[0] = h0;
+  h[1] = h1;
+  h[2] = h2;
+  h[3] = h3;
+  h[4] = h4;
+  h[5] = h5;
+  h[6] = h6;
+  h[7] = h7;
+  h[8] = h8;
+  h[9] = h9;
+}
--- a/fe_sq2.c
+++ b/fe_sq2.c
@ -0,0 +1,160 @@
+#include "fe.h"
+#include "pstdint.h"
+
+/*
+h = 2 * f * f
+Can overlap h with f.
+
+Preconditions:
+   |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
+
+Postconditions:
+   |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
+*/
+
+/*
+See fe_mul.c for discussion of implementation strategy.
+*/
+
+void fe_sq2(fe h,const fe f)
+{
+  int32_t f0 = f[0];
+  int32_t f1 = f[1];
+  int32_t f2 = f[2];
+  int32_t f3 = f[3];
+  int32_t f4 = f[4];
+  int32_t f5 = f[5];
+  int32_t f6 = f[6];
+  int32_t f7 = f[7];
+  int32_t f8 = f[8];
+  int32_t f9 = f[9];
+  int32_t f0_2 = 2 * f0;
+  int32_t f1_2 = 2 * f1;
+  int32_t f2_2 = 2 * f2;
+  int32_t f3_2 = 2 * f3;
+  int32_t f4_2 = 2 * f4;
+  int32_t f5_2 = 2 * f5;
+  int32_t f6_2 = 2 * f6;
+  int32_t f7_2 = 2 * f7;
+  int32_t f5_38 = 38 * f5; /* 1.959375*2^30 */
+  int32_t f6_19 = 19 * f6; /* 1.959375*2^30 */
+  int32_t f7_38 = 38 * f7; /* 1.959375*2^30 */
+  int32_t f8_19 = 19 * f8; /* 1.959375*2^30 */
+  int32_t f9_38 = 38 * f9; /* 1.959375*2^30 */
+  int64_t f0f0    = f0   * (int64_t) f0;
+  int64_t f0f1_2  = f0_2 * (int64_t) f1;
+  int64_t f0f2_2  = f0_2 * (int64_t) f2;
+  int64_t f0f3_2  = f0_2 * (int64_t) f3;
+  int64_t f0f4_2  = f0_2 * (int64_t) f4;
+  int64_t f0f5_2  = f0_2 * (int64_t) f5;
+  int64_t f0f6_2  = f0_2 * (int64_t) f6;
+  int64_t f0f7_2  = f0_2 * (int64_t) f7;
+  int64_t f0f8_2  = f0_2 * (int64_t) f8;
+  int64_t f0f9_2  = f0_2 * (int64_t) f9;
+  int64_t f1f1_2  = f1_2 * (int64_t) f1;
+  int64_t f1f2_2  = f1_2 * (int64_t) f2;
+  int64_t f1f3_4  = f1_2 * (int64_t) f3_2;
+  int64_t f1f4_2  = f1_2 * (int64_t) f4;
+  int64_t f1f5_4  = f1_2 * (int64_t) f5_2;
+  int64_t f1f6_2  = f1_2 * (int64_t) f6;
+  int64_t f1f7_4  = f1_2 * (int64_t) f7_2;
+  int64_t f1f8_2  = f1_2 * (int64_t) f8;
+  int64_t f1f9_76 = f1_2 * (int64_t) f9_38;
+  int64_t f2f2    = f2   * (int64_t) f2;
+  int64_t f2f3_2  = f2_2 * (int64_t) f3;
+  int64_t f2f4_2  = f2_2 * (int64_t) f4;
+  int64_t f2f5_2  = f2_2 * (int64_t) f5;
+  int64_t f2f6_2  = f2_2 * (int64_t) f6;
+  int64_t f2f7_2  = f2_2 * (int64_t) f7;
+  int64_t f2f8_38 = f2_2 * (int64_t) f8_19;
+  int64_t f2f9_38 = f2   * (int64_t) f9_38;
+  int64_t f3f3_2  = f3_2 * (int64_t) f3;
+  int64_t f3f4_2  = f3_2 * (int64_t) f4;
+  int64_t f3f5_4  = f3_2 * (int64_t) f5_2;
+  int64_t f3f6_2  = f3_2 * (int64_t) f6;
+  int64_t f3f7_76 = f3_2 * (int64_t) f7_38;
+  int64_t f3f8_38 = f3_2 * (int64_t) f8_19;
+  int64_t f3f9_76 = f3_2 * (int64_t) f9_38;
+  int64_t f4f4    = f4   * (int64_t) f4;
+  int64_t f4f5_2  = f4_2 * (int64_t) f5;
+  int64_t f4f6_38 = f4_2 * (int64_t) f6_19;
+  int64_t f4f7_38 = f4   * (int64_t) f7_38;
+  int64_t f4f8_38 = f4_2 * (int64_t) f8_19;
+  int64_t f4f9_38 = f4   * (int64_t) f9_38;
+  int64_t f5f5_38 = f5   * (int64_t) f5_38;
+  int64_t f5f6_38 = f5_2 * (int64_t) f6_19;
+  int64_t f5f7_76 = f5_2 * (int64_t) f7_38;
+  int64_t f5f8_38 = f5_2 * (int64_t) f8_19;
+  int64_t f5f9_76 = f5_2 * (int64_t) f9_38;
+  int64_t f6f6_19 = f6   * (int64_t) f6_19;
+  int64_t f6f7_38 = f6   * (int64_t) f7_38;
+  int64_t f6f8_38 = f6_2 * (int64_t) f8_19;
+  int64_t f6f9_38 = f6   * (int64_t) f9_38;
+  int64_t f7f7_38 = f7   * (int64_t) f7_38;
+  int64_t f7f8_38 = f7_2 * (int64_t) f8_19;
+  int64_t f7f9_76 = f7_2 * (int64_t) f9_38;
+  int64_t f8f8_19 = f8   * (int64_t) f8_19;
+  int64_t f8f9_38 = f8   * (int64_t) f9_38;
+  int64_t f9f9_38 = f9   * (int64_t) f9_38;
+  int64_t h0 = f0f0  +f1f9_76+f2f8_38+f3f7_76+f4f6_38+f5f5_38;
+  int64_t h1 = f0f1_2+f2f9_38+f3f8_38+f4f7_38+f5f6_38;
+  int64_t h2 = f0f2_2+f1f1_2 +f3f9_76+f4f8_38+f5f7_76+f6f6_19;
+  int64_t h3 = f0f3_2+f1f2_2 +f4f9_38+f5f8_38+f6f7_38;
+  int64_t h4 = f0f4_2+f1f3_4 +f2f2   +f5f9_76+f6f8_38+f7f7_38;
+  int64_t h5 = f0f5_2+f1f4_2 +f2f3_2 +f6f9_38+f7f8_38;
+  int64_t h6 = f0f6_2+f1f5_4 +f2f4_2 +f3f3_2 +f7f9_76+f8f8_19;
+  int64_t h7 = f0f7_2+f1f6_2 +f2f5_2 +f3f4_2 +f8f9_38;
+  int64_t h8 = f0f8_2+f1f7_4 +f2f6_2 +f3f5_4 +f4f4   +f9f9_38;
+  int64_t h9 = f0f9_2+f1f8_2 +f2f7_2 +f3f6_2 +f4f5_2;
+  int64_t carry0;
+  int64_t carry1;
+  int64_t carry2;
+  int64_t carry3;
+  int64_t carry4;
+  int64_t carry5;
+  int64_t carry6;
+  int64_t carry7;
+  int64_t carry8;
+  int64_t carry9;
+
+  h0 += h0;
+  h1 += h1;
+  h2 += h2;
+  h3 += h3;
+  h4 += h4;
+  h5 += h5;
+  h6 += h6;
+  h7 += h7;
+  h8 += h8;
+  h9 += h9;
+
+  carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
+  carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
+
+  carry1 = (h1 + (int64_t) (1<<24)) >> 25; h2 += carry1; h1 -= carry1 << 25;
+  carry5 = (h5 + (int64_t) (1<<24)) >> 25; h6 += carry5; h5 -= carry5 << 25;
+
+  carry2 = (h2 + (int64_t) (1<<25)) >> 26; h3 += carry2; h2 -= carry2 << 26;
+  carry6 = (h6 + (int64_t) (1<<25)) >> 26; h7 += carry6; h6 -= carry6 << 26;
+
+  carry3 = (h3 + (int64_t) (1<<24)) >> 25; h4 += carry3; h3 -= carry3 << 25;
+  carry7 = (h7 + (int64_t) (1<<24)) >> 25; h8 += carry7; h7 -= carry7 << 25;
+
+  carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
+  carry8 = (h8 + (int64_t) (1<<25)) >> 26; h9 += carry8; h8 -= carry8 << 26;
+
+  carry9 = (h9 + (int64_t) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= carry9 << 25;
+
+  carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
+
+  h[0] = h0;
+  h[1] = h1;
+  h[2] = h2;
+  h[3] = h3;
+  h[4] = h4;
+  h[5] = h5;
+  h[6] = h6;
+  h[7] = h7;
+  h[8] = h8;
+  h[9] = h9;
+}
--- a/fe_sub.c
+++ b/fe_sub.c
@ -0,0 +1,57 @@
+#include "fe.h"
+
+/*
+h = f - g
+Can overlap h with f or g.
+
+Preconditions:
+   |f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
+   |g| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
+
+Postconditions:
+   |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
+*/
+
+void fe_sub(fe h,const fe f,const fe g)
+{
+  int32_t f0 = f[0];
+  int32_t f1 = f[1];
+  int32_t f2 = f[2];
+  int32_t f3 = f[3];
+  int32_t f4 = f[4];
+  int32_t f5 = f[5];
+  int32_t f6 = f[6];
+  int32_t f7 = f[7];
+  int32_t f8 = f[8];
+  int32_t f9 = f[9];
+  int32_t g0 = g[0];
+  int32_t g1 = g[1];
+  int32_t g2 = g[2];
+  int32_t g3 = g[3];
+  int32_t g4 = g[4];
+  int32_t g5 = g[5];
+  int32_t g6 = g[6];
+  int32_t g7 = g[7];
+  int32_t g8 = g[8];
+  int32_t g9 = g[9];
+  int32_t h0 = f0 - g0;
+  int32_t h1 = f1 - g1;
+  int32_t h2 = f2 - g2;
+  int32_t h3 = f3 - g3;
+  int32_t h4 = f4 - g4;
+  int32_t h5 = f5 - g5;
+  int32_t h6 = f6 - g6;
+  int32_t h7 = f7 - g7;
+  int32_t h8 = f8 - g8;
+  int32_t h9 = f9 - g9;
+  h[0] = h0;
+  h[1] = h1;
+  h[2] = h2;
+  h[3] = h3;
+  h[4] = h4;
+  h[5] = h5;
+  h[6] = h6;
+  h[7] = h7;
+  h[8] = h8;
+  h[9] = h9;
+}
--- a/fe_tobytes.c
+++ b/fe_tobytes.c
@ -0,0 +1,119 @@
+#include "fe.h"
+
+/*
+Preconditions:
+  |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
+
+Write p=2^255-19; q=floor(h/p).
+Basic claim: q = floor(2^(-255)(h + 19 2^(-25)h9 + 2^(-1))).
+
+Proof:
+  Have |h|<=p so |q|<=1 so |19^2 2^(-255) q|<1/4.
+  Also have |h-2^230 h9|<2^231 so |19 2^(-255)(h-2^230 h9)|<1/4.
+
+  Write y=2^(-1)-19^2 2^(-255)q-19 2^(-255)(h-2^230 h9).
+  Then 0<y<1.
+
+  Write r=h-pq.
+  Have 0<=r<=p-1=2^255-20.
+  Thus 0<=r+19(2^-255)r<r+19(2^-255)2^255<=2^255-1.
+
+  Write x=r+19(2^-255)r+y.
+  Then 0<x<2^255 so floor(2^(-255)x) = 0 so floor(q+2^(-255)x) = q.
+
+  Have q+2^(-255)x = 2^(-255)(h + 19 2^(-25) h9 + 2^(-1))
+  so floor(2^(-255)(h + 19 2^(-25) h9 + 2^(-1))) = q.
+*/
+
+void fe_tobytes(unsigned char *s,const fe h)
+{
+  int32_t h0 = h[0];
+  int32_t h1 = h[1];
+  int32_t h2 = h[2];
+  int32_t h3 = h[3];
+  int32_t h4 = h[4];
+  int32_t h5 = h[5];
+  int32_t h6 = h[6];
+  int32_t h7 = h[7];
+  int32_t h8 = h[8];
+  int32_t h9 = h[9];
+  int32_t q;
+  int32_t carry0;
+  int32_t carry1;
+  int32_t carry2;
+  int32_t carry3;
+  int32_t carry4;
+  int32_t carry5;
+  int32_t carry6;
+  int32_t carry7;
+  int32_t carry8;
+  int32_t carry9;
+
+  q = (19 * h9 + (((int32_t) 1) << 24)) >> 25;
+  q = (h0 + q) >> 26;
+  q = (h1 + q) >> 25;
+  q = (h2 + q) >> 26;
+  q = (h3 + q) >> 25;
+  q = (h4 + q) >> 26;
+  q = (h5 + q) >> 25;
+  q = (h6 + q) >> 26;
+  q = (h7 + q) >> 25;
+  q = (h8 + q) >> 26;
+  q = (h9 + q) >> 25;
+
+  /* Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20. */
+  h0 += 19 * q;
+  /* Goal: Output h-2^255 q, which is between 0 and 2^255-20. */
+
+  carry0 = h0 >> 26; h1 += carry0; h0 -= carry0 << 26;
+  carry1 = h1 >> 25; h2 += carry1; h1 -= carry1 << 25;
+  carry2 = h2 >> 26; h3 += carry2; h2 -= carry2 << 26;
+  carry3 = h3 >> 25; h4 += carry3; h3 -= carry3 << 25;
+  carry4 = h4 >> 26; h5 += carry4; h4 -= carry4 << 26;
+  carry5 = h5 >> 25; h6 += carry5; h5 -= carry5 << 25;
+  carry6 = h6 >> 26; h7 += carry6; h6 -= carry6 << 26;
+  carry7 = h7 >> 25; h8 += carry7; h7 -= carry7 << 25;
+  carry8 = h8 >> 26; h9 += carry8; h8 -= carry8 << 26;
+  carry9 = h9 >> 25;               h9 -= carry9 << 25;
+                  /* h10 = carry9 */
+
+  /*
+  Goal: Output h0+...+2^255 h10-2^255 q, which is between 0 and 2^255-20.
+  Have h0+...+2^230 h9 between 0 and 2^255-1;
+  evidently 2^255 h10-2^255 q = 0.
+  Goal: Output h0+...+2^230 h9.
+  */
+
+  s[0] = h0 >> 0;
+  s[1] = h0 >> 8;
+  s[2] = h0 >> 16;
+  s[3] = (h0 >> 24) | (h1 << 2);
+  s[4] = h1 >> 6;
+  s[5] = h1 >> 14;
+  s[6] = (h1 >> 22) | (h2 << 3);
+  s[7] = h2 >> 5;
+  s[8] = h2 >> 13;
+  s[9] = (h2 >> 21) | (h3 << 5);
+  s[10] = h3 >> 3;
+  s[11] = h3 >> 11;
+  s[12] = (h3 >> 19) | (h4 << 6);
+  s[13] = h4 >> 2;
+  s[14] = h4 >> 10;
+  s[15] = h4 >> 18;
+  s[16] = h5 >> 0;
+  s[17] = h5 >> 8;
+  s[18] = h5 >> 16;
+  s[19] = (h5 >> 24) | (h6 << 1);
+  s[20] = h6 >> 7;
+  s[21] = h6 >> 15;
+  s[22] = (h6 >> 23) | (h7 << 3);
+  s[23] = h7 >> 5;
+  s[24] = h7 >> 13;
+  s[25] = (h7 >> 21) | (h8 << 4);
+  s[26] = h8 >> 4;
+  s[27] = h8 >> 12;
+  s[28] = (h8 >> 20) | (h9 << 6);
+  s[29] = h9 >> 2;
+  s[30] = h9 >> 10;
+  s[31] = h9 >> 18;
+}
--- a/ge.h
+++ b/ge.h
@ -0,0 +1,95 @@
+#ifndef GE_H
+#define GE_H
+
+/*
+ge means group element.
+
+Here the group is the set of pairs (x,y) of field elements (see fe.h)
+satisfying -x^2 + y^2 = 1 + d x^2y^2
+where d = -121665/121666.
+
+Representations:
+  ge_p2 (projective): (X:Y:Z) satisfying x=X/Z, y=Y/Z
+  ge_p3 (extended): (X:Y:Z:T) satisfying x=X/Z, y=Y/Z, XY=ZT
+  ge_p1p1 (completed): ((X:Z),(Y:T)) satisfying x=X/Z, y=Y/T
+  ge_precomp (Duif): (y+x,y-x,2dxy)
+*/
+
+#include "fe.h"
+
+typedef struct {
+  fe X;
+  fe Y;
+  fe Z;
+} ge_p2;
+
+typedef struct {
+  fe X;
+  fe Y;
+  fe Z;
+  fe T;
+} ge_p3;
+
+typedef struct {
+  fe X;
+  fe Y;
+  fe Z;
+  fe T;
+} ge_p1p1;
+
+typedef struct {
+  fe yplusx;
+  fe yminusx;
+  fe xy2d;
+} ge_precomp;
+
+typedef struct {
+  fe YplusX;
+  fe YminusX;
+  fe Z;
+  fe T2d;
+} ge_cached;
+
+#define ge_frombytes_negate_vartime crypto_sign_ed25519_ref10_ge_frombytes_negate_vartime
+#define ge_tobytes crypto_sign_ed25519_ref10_ge_tobytes
+#define ge_p3_tobytes crypto_sign_ed25519_ref10_ge_p3_tobytes
+
+#define ge_p2_0 crypto_sign_ed25519_ref10_ge_p2_0
+#define ge_p3_0 crypto_sign_ed25519_ref10_ge_p3_0
+#define ge_precomp_0 crypto_sign_ed25519_ref10_ge_precomp_0
+#define ge_p3_to_p2 crypto_sign_ed25519_ref10_ge_p3_to_p2
+#define ge_p3_to_cached crypto_sign_ed25519_ref10_ge_p3_to_cached
+#define ge_p1p1_to_p2 crypto_sign_ed25519_ref10_ge_p1p1_to_p2
+#define ge_p1p1_to_p3 crypto_sign_ed25519_ref10_ge_p1p1_to_p3
+#define ge_p2_dbl crypto_sign_ed25519_ref10_ge_p2_dbl
+#define ge_p3_dbl crypto_sign_ed25519_ref10_ge_p3_dbl
+
+#define ge_madd crypto_sign_ed25519_ref10_ge_madd
+#define ge_msub crypto_sign_ed25519_ref10_ge_msub
+#define ge_add crypto_sign_ed25519_ref10_ge_add
+#define ge_sub crypto_sign_ed25519_ref10_ge_sub
+#define ge_scalarmult_base crypto_sign_ed25519_ref10_ge_scalarmult_base
+#define ge_double_scalarmult_vartime crypto_sign_ed25519_ref10_ge_double_scalarmult_vartime
+
+extern void ge_tobytes(unsigned char *,const ge_p2 *);
+extern void ge_p3_tobytes(unsigned char *,const ge_p3 *);
+extern int ge_frombytes_negate_vartime(ge_p3 *,const unsigned char *);
+
+extern void ge_p2_0(ge_p2 *);
+extern void ge_p3_0(ge_p3 *);
+extern void ge_precomp_0(ge_precomp *);
+extern void ge_p3_to_p2(ge_p2 *,const ge_p3 *);
+extern void ge_p3_to_cached(ge_cached *,const ge_p3 *);
+extern void ge_p1p1_to_p2(ge_p2 *,const ge_p1p1 *);
+extern void ge_p1p1_to_p3(ge_p3 *,const ge_p1p1 *);
+extern void ge_p2_dbl(ge_p1p1 *,const ge_p2 *);
+extern void ge_p3_dbl(ge_p1p1 *,const ge_p3 *);
+
+extern void ge_madd(ge_p1p1 *,const ge_p3 *,const ge_precomp *);
+extern void ge_msub(ge_p1p1 *,const ge_p3 *,const ge_precomp *);
+extern void ge_add(ge_p1p1 *,const ge_p3 *,const ge_cached *);
+extern void ge_sub(ge_p1p1 *,const ge_p3 *,const ge_cached *);
+extern void ge_scalarmult_base(ge_p3 *,const unsigned char *);
+extern void ge_double_scalarmult_vartime(ge_p2 *,const unsigned char *,const ge_p3 *,const unsigned char *);
+
+#endif
--- a/ge_add.c
+++ b/ge_add.c
@ -0,0 +1,108 @@
+#include "ge.h"
+
+/*
+r = p + q
+*/
+
+void ge_add(ge_p1p1 *r,const ge_p3 *p,const ge_cached *q)
+{
+  fe t0;
+
+/* qhasm: enter ge_add */
+
+/* qhasm: fe X1 */
+
+/* qhasm: fe Y1 */
+
+/* qhasm: fe Z1 */
+
+/* qhasm: fe Z2 */
+
+/* qhasm: fe T1 */
+
+/* qhasm: fe ZZ */
+
+/* qhasm: fe YpX2 */
+
+/* qhasm: fe YmX2 */
+
+/* qhasm: fe T2d2 */
+
+/* qhasm: fe X3 */
+
+/* qhasm: fe Y3 */
+
+/* qhasm: fe Z3 */
+
+/* qhasm: fe T3 */
+
+/* qhasm: fe YpX1 */
+
+/* qhasm: fe YmX1 */
+
+/* qhasm: fe A */
+
+/* qhasm: fe B */
+
+/* qhasm: fe C */
+
+/* qhasm: fe D */
+
+/* qhasm: YpX1 = Y1+X1 */
+/* asm 1: fe_add(>YpX1=fe#1,<Y1=fe#12,<X1=fe#11); */
+/* asm 2: fe_add(>YpX1=r->X,<Y1=p->Y,<X1=p->X); */
+fe_add(r->X,p->Y,p->X);
+
+/* qhasm: YmX1 = Y1-X1 */
+/* asm 1: fe_sub(>YmX1=fe#2,<Y1=fe#12,<X1=fe#11); */
+/* asm 2: fe_sub(>YmX1=r->Y,<Y1=p->Y,<X1=p->X); */
+fe_sub(r->Y,p->Y,p->X);
+
+/* qhasm: A = YpX1*YpX2 */
+/* asm 1: fe_mul(>A=fe#3,<YpX1=fe#1,<YpX2=fe#15); */
+/* asm 2: fe_mul(>A=r->Z,<YpX1=r->X,<YpX2=q->YplusX); */
+fe_mul(r->Z,r->X,q->YplusX);
+
+/* qhasm: B = YmX1*YmX2 */
+/* asm 1: fe_mul(>B=fe#2,<YmX1=fe#2,<YmX2=fe#16); */
+/* asm 2: fe_mul(>B=r->Y,<YmX1=r->Y,<YmX2=q->YminusX); */
+fe_mul(r->Y,r->Y,q->YminusX);
+
+/* qhasm: C = T2d2*T1 */
+/* asm 1: fe_mul(>C=fe#4,<T2d2=fe#18,<T1=fe#14); */
+/* asm 2: fe_mul(>C=r->T,<T2d2=q->T2d,<T1=p->T); */
+fe_mul(r->T,q->T2d,p->T);
+
+/* qhasm: ZZ = Z1*Z2 */
+/* asm 1: fe_mul(>ZZ=fe#1,<Z1=fe#13,<Z2=fe#17); */
+/* asm 2: fe_mul(>ZZ=r->X,<Z1=p->Z,<Z2=q->Z); */
+fe_mul(r->X,p->Z,q->Z);
+
+/* qhasm: D = 2*ZZ */
+/* asm 1: fe_add(>D=fe#5,<ZZ=fe#1,<ZZ=fe#1); */
+/* asm 2: fe_add(>D=t0,<ZZ=r->X,<ZZ=r->X); */
+fe_add(t0,r->X,r->X);
+
+/* qhasm: X3 = A-B */
+/* asm 1: fe_sub(>X3=fe#1,<A=fe#3,<B=fe#2); */
+/* asm 2: fe_sub(>X3=r->X,<A=r->Z,<B=r->Y); */
+fe_sub(r->X,r->Z,r->Y);
+
+/* qhasm: Y3 = A+B */
+/* asm 1: fe_add(>Y3=fe#2,<A=fe#3,<B=fe#2); */
+/* asm 2: fe_add(>Y3=r->Y,<A=r->Z,<B=r->Y); */
+fe_add(r->Y,r->Z,r->Y);
+
+/* qhasm: Z3 = D+C */
+/* asm 1: fe_add(>Z3=fe#3,<D=fe#5,<C=fe#4); */
+/* asm 2: fe_add(>Z3=r->Z,<D=t0,<C=r->T); */
+fe_add(r->Z,t0,r->T);
+
+/* qhasm: T3 = D-C */
+/* asm 1: fe_sub(>T3=fe#4,<D=fe#5,<C=fe#4); */
+/* asm 2: fe_sub(>T3=r->T,<D=t0,<C=r->T); */
+fe_sub(r->T,t0,r->T);
+
+/* qhasm: return */
+
+}
--- a/ge_double_scalarmult.c
+++ b/ge_double_scalarmult.c
@ -0,0 +1,135 @@
+#include "ge.h"
+
+static void slide(signed char *r,const unsigned char *a)
+{
+  int i;
+  int b;
+  int k;
+
+  for (i = 0;i < 256;++i)
+    r[i] = 1 & (a[i >> 3] >> (i & 7));
+
+  for (i = 0;i < 256;++i)
+    if (r[i]) {
+      for (b = 1;b <= 6 && i + b < 256;++b) {
+        if (r[i + b]) {
+          if (r[i] + (r[i + b] << b) <= 15) {
+            r[i] += r[i + b] << b; r[i + b] = 0;
+          } else if (r[i] - (r[i + b] << b) >= -15) {
+            r[i] -= r[i + b] << b;
+            for (k = i + b;k < 256;++k) {
+              if (!r[k]) {
+                r[k] = 1;
+                break;
+              }
+              r[k] = 0;
+            }
+          } else
+            break;
+        }
+      }
+    }
+
+}
+
+static ge_precomp Bi[8] = {
+ {
+  { 25967493,-14356035,29566456,3660896,-12694345,4014787,27544626,-11754271,-6079156,2047605 },
+  { -12545711,934262,-2722910,3049990,-727428,9406986,12720692,5043384,19500929,-15469378 },
+  { -8738181,4489570,9688441,-14785194,10184609,-12363380,29287919,11864899,-24514362,-4438546 },
+ },
+ {
+  { 15636291,-9688557,24204773,-7912398,616977,-16685262,27787600,-14772189,28944400,-1550024 },
+  { 16568933,4717097,-11556148,-1102322,15682896,-11807043,16354577,-11775962,7689662,11199574 },
+  { 30464156,-5976125,-11779434,-15670865,23220365,15915852,7512774,10017326,-17749093,-9920357 },
+ },
+ {
+  { 10861363,11473154,27284546,1981175,-30064349,12577861,32867885,14515107,-15438304,10819380 },
+  { 4708026,6336745,20377586,9066809,-11272109,6594696,-25653668,12483688,-12668491,5581306 },
+  { 19563160,16186464,-29386857,4097519,10237984,-4348115,28542350,13850243,-23678021,-15815942 },
+ },
+ {
+  { 5153746,9909285,1723747,-2777874,30523605,5516873,19480852,5230134,-23952439,-15175766 },
+  { -30269007,-3463509,7665486,10083793,28475525,1649722,20654025,16520125,30598449,7715701 },
+  { 28881845,14381568,9657904,3680757,-20181635,7843316,-31400660,1370708,29794553,-1409300 },
+ },
+ {
+  { -22518993,-6692182,14201702,-8745502,-23510406,8844726,18474211,-1361450,-13062696,13821877 },
+  { -6455177,-7839871,3374702,-4740862,-27098617,-10571707,31655028,-7212327,18853322,-14220951 },
+  { 4566830,-12963868,-28974889,-12240689,-7602672,-2830569,-8514358,-10431137,2207753,-3209784 },
+ },
+ {
+  { -25154831,-4185821,29681144,7868801,-6854661,-9423865,-12437364,-663000,-31111463,-16132436 },
+  { 25576264,-2703214,7349804,-11814844,16472782,9300885,3844789,15725684,171356,6466918 },
+  { 23103977,13316479,9739013,-16149481,817875,-15038942,8965339,-14088058,-30714912,16193877 },
+ },
+ {
+  { -33521811,3180713,-2394130,14003687,-16903474,-16270840,17238398,4729455,-18074513,9256800 },
+  { -25182317,-4174131,32336398,5036987,-21236817,11360617,22616405,9761698,-19827198,630305 },
+  { -13720693,2639453,-24237460,-7406481,9494427,-5774029,-6554551,-15960994,-2449256,-14291300 },
+ },
+ {
+  { -3151181,-5046075,9282714,6866145,-31907062,-863023,-18940575,15033784,25105118,-7894876 },
+  { -24326370,15950226,-31801215,-14592823,-11662737,-5090925,1573892,-2625887,2198790,-15804619 },
+  { -3099351,10324967,-2241613,7453183,-5446979,-2735503,-13812022,-16236442,-32461234,-12290683 },
+ },
+} ;
+
+/*
+r = a * A + b * B
+where a = a[0]+256*a[1]+...+256^31 a[31].
+and b = b[0]+256*b[1]+...+256^31 b[31].
+B is the Ed25519 base point (x,4/5) with x positive.
+*/
+
+void ge_double_scalarmult_vartime(ge_p2 *r,const unsigned char *a,const ge_p3 *A,const unsigned char *b)
+{
+  signed char aslide[256];
+  signed char bslide[256];
+  ge_cached Ai[8]; /* A,3A,5A,7A,9A,11A,13A,15A */
+  ge_p1p1 t;
+  ge_p3 u;
+  ge_p3 A2;
+  int i;
+
+  slide(aslide,a);
+  slide(bslide,b);
+
+  ge_p3_to_cached(&Ai[0],A);
+  ge_p3_dbl(&t,A); ge_p1p1_to_p3(&A2,&t);
+  ge_add(&t,&A2,&Ai[0]); ge_p1p1_to_p3(&u,&t); ge_p3_to_cached(&Ai[1],&u);
+  ge_add(&t,&A2,&Ai[1]); ge_p1p1_to_p3(&u,&t); ge_p3_to_cached(&Ai[2],&u);
+  ge_add(&t,&A2,&Ai[2]); ge_p1p1_to_p3(&u,&t); ge_p3_to_cached(&Ai[3],&u);
+  ge_add(&t,&A2,&Ai[3]); ge_p1p1_to_p3(&u,&t); ge_p3_to_cached(&Ai[4],&u);
+  ge_add(&t,&A2,&Ai[4]); ge_p1p1_to_p3(&u,&t); ge_p3_to_cached(&Ai[5],&u);
+  ge_add(&t,&A2,&Ai[5]); ge_p1p1_to_p3(&u,&t); ge_p3_to_cached(&Ai[6],&u);
+  ge_add(&t,&A2,&Ai[6]); ge_p1p1_to_p3(&u,&t); ge_p3_to_cached(&Ai[7],&u);
+
+  ge_p2_0(r);
+
+  for (i = 255;i >= 0;--i) {
+    if (aslide[i] || bslide[i]) break;
+  }
+
+  for (;i >= 0;--i) {
+    ge_p2_dbl(&t,r);
+
+    if (aslide[i] > 0) {
+      ge_p1p1_to_p3(&u,&t);
+      ge_add(&t,&u,&Ai[aslide[i]/2]);
+    } else if (aslide[i] < 0) {
+      ge_p1p1_to_p3(&u,&t);
+      ge_sub(&t,&u,&Ai[(-aslide[i])/2]);
+    }
+
+    if (bslide[i] > 0) {
+      ge_p1p1_to_p3(&u,&t);
+      ge_madd(&t,&u,&Bi[bslide[i]/2]);
+    } else if (bslide[i] < 0) {
+      ge_p1p1_to_p3(&u,&t);
+      ge_msub(&t,&u,&Bi[(-bslide[i])/2]);
+    }
+
+    ge_p1p1_to_p2(r,&t);
+  }
+}
--- a/ge_frombytes.c
+++ b/ge_frombytes.c
@ -0,0 +1,52 @@
+#include "ge.h"
+
+static const fe d = {
+-10913610,13857413,-15372611,6949391,114729,-8787816,-6275908,-3247719,-18696448,-12055116
+
+} ;
+
+static const fe sqrtm1 = {
+-32595792,-7943725,9377950,3500415,12389472,-272473,-25146209,-2005654,326686,11406482
+
+} ;
+
+int ge_frombytes_negate_vartime(ge_p3 *h,const unsigned char *s)
+{
+  fe u;
+  fe v;
+  fe v3;
+  fe vxx;
+  fe check;
+
+  fe_frombytes(h->Y,s);
+  fe_1(h->Z);
+  fe_sq(u,h->Y);
+  fe_mul(v,u,d);
+  fe_sub(u,u,h->Z);       /* u = y^2-1 */
+  fe_add(v,v,h->Z);       /* v = dy^2+1 */
+
+  fe_sq(v3,v);
+  fe_mul(v3,v3,v);        /* v3 = v^3 */
+  fe_sq(h->X,v3);
+  fe_mul(h->X,h->X,v);
+  fe_mul(h->X,h->X,u);    /* x = uv^7 */
+
+  fe_pow22523(h->X,h->X); /* x = (uv^7)^((q-5)/8) */
+  fe_mul(h->X,h->X,v3);
+  fe_mul(h->X,h->X,u);    /* x = uv^3(uv^7)^((q-5)/8) */
+
+  fe_sq(vxx,h->X);
+  fe_mul(vxx,vxx,v);
+  fe_sub(check,vxx,u);    /* vx^2-u */
+  if (fe_isnonzero(check)) {
+    fe_add(check,vxx,u);  /* vx^2+u */
+    if (fe_isnonzero(check)) return -1;
+    fe_mul(h->X,h->X,sqrtm1);
+  }
+
+  if (fe_isnegative(h->X) == (s[31] >> 7))
+    fe_neg(h->X,h->X);
+
+  fe_mul(h->T,h->X,h->Y);
+  return 0;
+}
--- a/ge_madd.c
+++ b/ge_madd.c
@ -0,0 +1,99 @@
+#include "ge.h"
+
+/*
+r = p + q
+*/
+
+void ge_madd(ge_p1p1 *r,const ge_p3 *p,const ge_precomp *q)
+{
+  fe t0;
+
+/* qhasm: enter ge_madd */
+
+/* qhasm: fe X1 */
+
+/* qhasm: fe Y1 */
+
+/* qhasm: fe Z1 */
+
+/* qhasm: fe T1 */
+
+/* qhasm: fe ypx2 */
+
+/* qhasm: fe ymx2 */
+
+/* qhasm: fe xy2d2 */
+
+/* qhasm: fe X3 */
+
+/* qhasm: fe Y3 */
+
+/* qhasm: fe Z3 */
+
+/* qhasm: fe T3 */
+
+/* qhasm: fe YpX1 */
+
+/* qhasm: fe YmX1 */
+
+/* qhasm: fe A */
+
+/* qhasm: fe B */
+
+/* qhasm: fe C */
+
+/* qhasm: fe D */
+
+/* qhasm: YpX1 = Y1+X1 */
+/* asm 1: fe_add(>YpX1=fe#1,<Y1=fe#12,<X1=fe#11); */
+/* asm 2: fe_add(>YpX1=r->X,<Y1=p->Y,<X1=p->X); */
+fe_add(r->X,p->Y,p->X);
+
+/* qhasm: YmX1 = Y1-X1 */
+/* asm 1: fe_sub(>YmX1=fe#2,<Y1=fe#12,<X1=fe#11); */
+/* asm 2: fe_sub(>YmX1=r->Y,<Y1=p->Y,<X1=p->X); */
+fe_sub(r->Y,p->Y,p->X);
+
+/* qhasm: A = YpX1*ypx2 */
+/* asm 1: fe_mul(>A=fe#3,<YpX1=fe#1,<ypx2=fe#15); */
+/* asm 2: fe_mul(>A=r->Z,<YpX1=r->X,<ypx2=q->yplusx); */
+fe_mul(r->Z,r->X,q->yplusx);
+
+/* qhasm: B = YmX1*ymx2 */
+/* asm 1: fe_mul(>B=fe#2,<YmX1=fe#2,<ymx2=fe#16); */
+/* asm 2: fe_mul(>B=r->Y,<YmX1=r->Y,<ymx2=q->yminusx); */
+fe_mul(r->Y,r->Y,q->yminusx);
+
+/* qhasm: C = xy2d2*T1 */
+/* asm 1: fe_mul(>C=fe#4,<xy2d2=fe#17,<T1=fe#14); */
+/* asm 2: fe_mul(>C=r->T,<xy2d2=q->xy2d,<T1=p->T); */
+fe_mul(r->T,q->xy2d,p->T);
+
+/* qhasm: D = 2*Z1 */
+/* asm 1: fe_add(>D=fe#5,<Z1=fe#13,<Z1=fe#13); */
+/* asm 2: fe_add(>D=t0,<Z1=p->Z,<Z1=p->Z); */
+fe_add(t0,p->Z,p->Z);
+
+/* qhasm: X3 = A-B */
+/* asm 1: fe_sub(>X3=fe#1,<A=fe#3,<B=fe#2); */
+/* asm 2: fe_sub(>X3=r->X,<A=r->Z,<B=r->Y); */
+fe_sub(r->X,r->Z,r->Y);
+
+/* qhasm: Y3 = A+B */
+/* asm 1: fe_add(>Y3=fe#2,<A=fe#3,<B=fe#2); */
+/* asm 2: fe_add(>Y3=r->Y,<A=r->Z,<B=r->Y); */
+fe_add(r->Y,r->Z,r->Y);
+
+/* qhasm: Z3 = D+C */
+/* asm 1: fe_add(>Z3=fe#3,<D=fe#5,<C=fe#4); */
+/* asm 2: fe_add(>Z3=r->Z,<D=t0,<C=r->T); */
+fe_add(r->Z,t0,r->T);
+
+/* qhasm: T3 = D-C */
+/* asm 1: fe_sub(>T3=fe#4,<D=fe#5,<C=fe#4); */
+/* asm 2: fe_sub(>T3=r->T,<D=t0,<C=r->T); */
+fe_sub(r->T,t0,r->T);
+
+/* qhasm: return */
+
+}
--- a/ge_msub.c
+++ b/ge_msub.c
@ -0,0 +1,99 @@
+#include "ge.h"
+
+/*
+r = p - q
+*/
+
+void ge_msub(ge_p1p1 *r,const ge_p3 *p,const ge_precomp *q)
+{
+  fe t0;
+
+/* qhasm: enter ge_msub */
+
+/* qhasm: fe X1 */
+
+/* qhasm: fe Y1 */
+
+/* qhasm: fe Z1 */
+
+/* qhasm: fe T1 */
+
+/* qhasm: fe ypx2 */
+
+/* qhasm: fe ymx2 */
+
+/* qhasm: fe xy2d2 */
+
+/* qhasm: fe X3 */
+
+/* qhasm: fe Y3 */
+
+/* qhasm: fe Z3 */
+
+/* qhasm: fe T3 */
+
+/* qhasm: fe YpX1 */
+
+/* qhasm: fe YmX1 */
+
+/* qhasm: fe A */
+
+/* qhasm: fe B */
+
+/* qhasm: fe C */
+
+/* qhasm: fe D */
+
+/* qhasm: YpX1 = Y1+X1 */
+/* asm 1: fe_add(>YpX1=fe#1,<Y1=fe#12,<X1=fe#11); */
+/* asm 2: fe_add(>YpX1=r->X,<Y1=p->Y,<X1=p->X); */
+fe_add(r->X,p->Y,p->X);
+
+/* qhasm: YmX1 = Y1-X1 */
+/* asm 1: fe_sub(>YmX1=fe#2,<Y1=fe#12,<X1=fe#11); */
+/* asm 2: fe_sub(>YmX1=r->Y,<Y1=p->Y,<X1=p->X); */
+fe_sub(r->Y,p->Y,p->X);
+
+/* qhasm: A = YpX1*ymx2 */
+/* asm 1: fe_mul(>A=fe#3,<YpX1=fe#1,<ymx2=fe#16); */
+/* asm 2: fe_mul(>A=r->Z,<YpX1=r->X,<ymx2=q->yminusx); */
+fe_mul(r->Z,r->X,q->yminusx);
+
+/* qhasm: B = YmX1*ypx2 */
+/* asm 1: fe_mul(>B=fe#2,<YmX1=fe#2,<ypx2=fe#15); */
+/* asm 2: fe_mul(>B=r->Y,<YmX1=r->Y,<ypx2=q->yplusx); */
+fe_mul(r->Y,r->Y,q->yplusx);
+
+/* qhasm: C = xy2d2*T1 */
+/* asm 1: fe_mul(>C=fe#4,<xy2d2=fe#17,<T1=fe#14); */
+/* asm 2: fe_mul(>C=r->T,<xy2d2=q->xy2d,<T1=p->T); */
+fe_mul(r->T,q->xy2d,p->T);
+
+/* qhasm: D = 2*Z1 */
+/* asm 1: fe_add(>D=fe#5,<Z1=fe#13,<Z1=fe#13); */
+/* asm 2: fe_add(>D=t0,<Z1=p->Z,<Z1=p->Z); */
+fe_add(t0,p->Z,p->Z);
+
+/* qhasm: X3 = A-B */
+/* asm 1: fe_sub(>X3=fe#1,<A=fe#3,<B=fe#2); */
+/* asm 2: fe_sub(>X3=r->X,<A=r->Z,<B=r->Y); */
+fe_sub(r->X,r->Z,r->Y);
+
+/* qhasm: Y3 = A+B */
+/* asm 1: fe_add(>Y3=fe#2,<A=fe#3,<B=fe#2); */
+/* asm 2: fe_add(>Y3=r->Y,<A=r->Z,<B=r->Y); */
+fe_add(r->Y,r->Z,r->Y);
+
+/* qhasm: Z3 = D-C */
+/* asm 1: fe_sub(>Z3=fe#3,<D=fe#5,<C=fe#4); */
+/* asm 2: fe_sub(>Z3=r->Z,<D=t0,<C=r->T); */
+fe_sub(r->Z,t0,r->T);
+
+/* qhasm: T3 = D+C */
+/* asm 1: fe_add(>T3=fe#4,<D=fe#5,<C=fe#4); */
+/* asm 2: fe_add(>T3=r->T,<D=t0,<C=r->T); */
+fe_add(r->T,t0,r->T);
+
+/* qhasm: return */
+
+}
--- a/ge_p1p1_to_p2.c
+++ b/ge_p1p1_to_p2.c
@ -0,0 +1,12 @@
+#include "ge.h"
+
+/*
+r = p
+*/
+
+extern void ge_p1p1_to_p2(ge_p2 *r,const ge_p1p1 *p)
+{
+  fe_mul(r->X,p->X,p->T);
+  fe_mul(r->Y,p->Y,p->Z);
+  fe_mul(r->Z,p->Z,p->T);
+}
--- a/ge_p1p1_to_p3.c
+++ b/ge_p1p1_to_p3.c
@ -0,0 +1,13 @@
+#include "ge.h"
+
+/*
+r = p
+*/
+
+extern void ge_p1p1_to_p3(ge_p3 *r,const ge_p1p1 *p)
+{
+  fe_mul(r->X,p->X,p->T);
+  fe_mul(r->Y,p->Y,p->Z);
+  fe_mul(r->Z,p->Z,p->T);
+  fe_mul(r->T,p->X,p->Y);
+}
--- a/ge_p2_0.c
+++ b/ge_p2_0.c
@ -0,0 +1,8 @@
+#include "ge.h"
+
+void ge_p2_0(ge_p2 *h)
+{
+  fe_0(h->X);
+  fe_1(h->Y);
+  fe_1(h->Z);
+}
--- a/ge_p2_dbl.c
+++ b/ge_p2_dbl.c
@ -0,0 +1,84 @@
+#include "ge.h"
+
+/*
+r = 2 * p
+*/
+
+void ge_p2_dbl(ge_p1p1 *r,const ge_p2 *p)
+{
+  fe t0;
+
+/* qhasm: enter ge_p2_dbl */
+
+/* qhasm: fe X1 */
+
+/* qhasm: fe Y1 */
+
+/* qhasm: fe Z1 */
+
+/* qhasm: fe A */
+
+/* qhasm: fe AA */
+
+/* qhasm: fe XX */
+
+/* qhasm: fe YY */
+
+/* qhasm: fe B */
+
+/* qhasm: fe X3 */
+
+/* qhasm: fe Y3 */
+
+/* qhasm: fe Z3 */
+
+/* qhasm: fe T3 */
+
+/* qhasm: XX=X1^2 */
+/* asm 1: fe_sq(>XX=fe#1,<X1=fe#11); */
+/* asm 2: fe_sq(>XX=r->X,<X1=p->X); */
+fe_sq(r->X,p->X);
+
+/* qhasm: YY=Y1^2 */
+/* asm 1: fe_sq(>YY=fe#3,<Y1=fe#12); */
+/* asm 2: fe_sq(>YY=r->Z,<Y1=p->Y); */
+fe_sq(r->Z,p->Y);
+
+/* qhasm: B=2*Z1^2 */
+/* asm 1: fe_sq2(>B=fe#4,<Z1=fe#13); */
+/* asm 2: fe_sq2(>B=r->T,<Z1=p->Z); */
+fe_sq2(r->T,p->Z);
+
+/* qhasm: A=X1+Y1 */
+/* asm 1: fe_add(>A=fe#2,<X1=fe#11,<Y1=fe#12); */
+/* asm 2: fe_add(>A=r->Y,<X1=p->X,<Y1=p->Y); */
+fe_add(r->Y,p->X,p->Y);
+
+/* qhasm: AA=A^2 */
+/* asm 1: fe_sq(>AA=fe#5,<A=fe#2); */
+/* asm 2: fe_sq(>AA=t0,<A=r->Y); */
+fe_sq(t0,r->Y);
+
+/* qhasm: Y3=YY+XX */
+/* asm 1: fe_add(>Y3=fe#2,<YY=fe#3,<XX=fe#1); */
+/* asm 2: fe_add(>Y3=r->Y,<YY=r->Z,<XX=r->X); */
+fe_add(r->Y,r->Z,r->X);
+
+/* qhasm: Z3=YY-XX */
+/* asm 1: fe_sub(>Z3=fe#3,<YY=fe#3,<XX=fe#1); */
+/* asm 2: fe_sub(>Z3=r->Z,<YY=r->Z,<XX=r->X); */
+fe_sub(r->Z,r->Z,r->X);
+
+/* qhasm: X3=AA-Y3 */
+/* asm 1: fe_sub(>X3=fe#1,<AA=fe#5,<Y3=fe#2); */
+/* asm 2: fe_sub(>X3=r->X,<AA=t0,<Y3=r->Y); */
+fe_sub(r->X,t0,r->Y);
+
+/* qhasm: T3=B-Z3 */
+/* asm 1: fe_sub(>T3=fe#4,<B=fe#4,<Z3=fe#3); */
+/* asm 2: fe_sub(>T3=r->T,<B=r->T,<Z3=r->Z); */
+fe_sub(r->T,r->T,r->Z);
+
+/* qhasm: return */
+
+}
--- a/ge_p3_0.c
+++ b/ge_p3_0.c
@ -0,0 +1,9 @@
+#include "ge.h"
+
+void ge_p3_0(ge_p3 *h)
+{
+  fe_0(h->X);
+  fe_1(h->Y);
+  fe_1(h->Z);
+  fe_0(h->T);
+}
--- a/ge_p3_dbl.c
+++ b/ge_p3_dbl.c
@ -0,0 +1,12 @@
+#include "ge.h"
+
+/*
+r = 2 * p
+*/
+
+void ge_p3_dbl(ge_p1p1 *r,const ge_p3 *p)
+{
+  ge_p2 q;
+  ge_p3_to_p2(&q,p);
+  ge_p2_dbl(r,&q);
+}
--- a/ge_p3_to_cached.c
+++ b/ge_p3_to_cached.c
@ -0,0 +1,17 @@
+#include "ge.h"
+
+/*
+r = p
+*/
+
+static const fe d2 = {
+-21827239,-5839606,-30745221,13898782,229458,15978800,-12551817,-6495438,29715968,9444199
+} ;
+
+extern void ge_p3_to_cached(ge_cached *r,const ge_p3 *p)
+{
+  fe_add(r->YplusX,p->Y,p->X);
+  fe_sub(r->YminusX,p->Y,p->X);
+  fe_copy(r->Z,p->Z);
+  fe_mul(r->T2d,p->T,d2);
+}
--- a/ge_p3_to_p2.c
+++ b/ge_p3_to_p2.c
@ -0,0 +1,12 @@
+#include "ge.h"
+
+/*
+r = p
+*/
+
+extern void ge_p3_to_p2(ge_p2 *r,const ge_p3 *p)
+{
+  fe_copy(r->X,p->X);
+  fe_copy(r->Y,p->Y);
+  fe_copy(r->Z,p->Z);
+}
--- a/ge_p3_tobytes.c
+++ b/ge_p3_tobytes.c
@ -0,0 +1,14 @@
+#include "ge.h"
+
+void ge_p3_tobytes(unsigned char *s,const ge_p3 *h)
+{
+  fe recip;
+  fe x;
+  fe y;
+
+  fe_invert(recip,h->Z);
+  fe_mul(x,h->X,recip);
+  fe_mul(y,h->Y,recip);
+  fe_tobytes(s,y);
+  s[31] ^= fe_isnegative(x) << 7;
+}
--- a/ge_precomp_0.c
+++ b/ge_precomp_0.c
@ -0,0 +1,8 @@
+#include "ge.h"
+
+void ge_precomp_0(ge_precomp *h)
+{
+  fe_1(h->yplusx);
+  fe_1(h->yminusx);
+  fe_0(h->xy2d);
+}
--- a/ge_scalarmult_base.c
+++ b/ge_scalarmult_base.c
--- a/ge_sub.c
+++ b/ge_sub.c
@ -0,0 +1,108 @@
+#include "ge.h"
+
+/*
+r = p - q
+*/
+
+void ge_sub(ge_p1p1 *r,const ge_p3 *p,const ge_cached *q)
+{
+  fe t0;
+
+/* qhasm: enter ge_sub */
+
+/* qhasm: fe X1 */
+
+/* qhasm: fe Y1 */
+
+/* qhasm: fe Z1 */
+
+/* qhasm: fe Z2 */
+
+/* qhasm: fe T1 */
+
+/* qhasm: fe ZZ */
+
+/* qhasm: fe YpX2 */
+
+/* qhasm: fe YmX2 */
+
+/* qhasm: fe T2d2 */
+
+/* qhasm: fe X3 */
+
+/* qhasm: fe Y3 */
+
+/* qhasm: fe Z3 */
+
+/* qhasm: fe T3 */
+
+/* qhasm: fe YpX1 */
+
+/* qhasm: fe YmX1 */
+
+/* qhasm: fe A */
+
+/* qhasm: fe B */
+
+/* qhasm: fe C */
+
+/* qhasm: fe D */
+
+/* qhasm: YpX1 = Y1+X1 */
+/* asm 1: fe_add(>YpX1=fe#1,<Y1=fe#12,<X1=fe#11); */
+/* asm 2: fe_add(>YpX1=r->X,<Y1=p->Y,<X1=p->X); */
+fe_add(r->X,p->Y,p->X);
+
+/* qhasm: YmX1 = Y1-X1 */
+/* asm 1: fe_sub(>YmX1=fe#2,<Y1=fe#12,<X1=fe#11); */
+/* asm 2: fe_sub(>YmX1=r->Y,<Y1=p->Y,<X1=p->X); */
+fe_sub(r->Y,p->Y,p->X);
+
+/* qhasm: A = YpX1*YmX2 */
+/* asm 1: fe_mul(>A=fe#3,<YpX1=fe#1,<YmX2=fe#16); */
+/* asm 2: fe_mul(>A=r->Z,<YpX1=r->X,<YmX2=q->YminusX); */
+fe_mul(r->Z,r->X,q->YminusX);
+
+/* qhasm: B = YmX1*YpX2 */
+/* asm 1: fe_mul(>B=fe#2,<YmX1=fe#2,<YpX2=fe#15); */
+/* asm 2: fe_mul(>B=r->Y,<YmX1=r->Y,<YpX2=q->YplusX); */
+fe_mul(r->Y,r->Y,q->YplusX);
+
+/* qhasm: C = T2d2*T1 */
+/* asm 1: fe_mul(>C=fe#4,<T2d2=fe#18,<T1=fe#14); */
+/* asm 2: fe_mul(>C=r->T,<T2d2=q->T2d,<T1=p->T); */
+fe_mul(r->T,q->T2d,p->T);
+
+/* qhasm: ZZ = Z1*Z2 */
+/* asm 1: fe_mul(>ZZ=fe#1,<Z1=fe#13,<Z2=fe#17); */
+/* asm 2: fe_mul(>ZZ=r->X,<Z1=p->Z,<Z2=q->Z); */
+fe_mul(r->X,p->Z,q->Z);
+
+/* qhasm: D = 2*ZZ */
+/* asm 1: fe_add(>D=fe#5,<ZZ=fe#1,<ZZ=fe#1); */
+/* asm 2: fe_add(>D=t0,<ZZ=r->X,<ZZ=r->X); */
+fe_add(t0,r->X,r->X);
+
+/* qhasm: X3 = A-B */
+/* asm 1: fe_sub(>X3=fe#1,<A=fe#3,<B=fe#2); */
+/* asm 2: fe_sub(>X3=r->X,<A=r->Z,<B=r->Y); */
+fe_sub(r->X,r->Z,r->Y);
+
+/* qhasm: Y3 = A+B */
+/* asm 1: fe_add(>Y3=fe#2,<A=fe#3,<B=fe#2); */
+/* asm 2: fe_add(>Y3=r->Y,<A=r->Z,<B=r->Y); */
+fe_add(r->Y,r->Z,r->Y);
+
+/* qhasm: Z3 = D-C */
+/* asm 1: fe_sub(>Z3=fe#3,<D=fe#5,<C=fe#4); */
+/* asm 2: fe_sub(>Z3=r->Z,<D=t0,<C=r->T); */
+fe_sub(r->Z,t0,r->T);
+
+/* qhasm: T3 = D+C */
+/* asm 1: fe_add(>T3=fe#4,<D=fe#5,<C=fe#4); */
+/* asm 2: fe_add(>T3=r->T,<D=t0,<C=r->T); */
+fe_add(r->T,t0,r->T);
+
+/* qhasm: return */
+
+}
--- a/ge_tobytes.c
+++ b/ge_tobytes.c
@ -0,0 +1,14 @@
+#include "ge.h"
+
+void ge_tobytes(unsigned char *s,const ge_p2 *h)
+{
+  fe recip;
+  fe x;
+  fe y;
+
+  fe_invert(recip,h->Z);
+  fe_mul(x,h->X,recip);
+  fe_mul(y,h->Y,recip);
+  fe_tobytes(s,y);
+  s[31] ^= fe_isnegative(x) << 7;
+}
--- a/keypair.c
+++ b/keypair.c
@ -0,0 +1,22 @@
+#include "crypto_sign.h"
+#include "sha512.h"
+#include "ge.h"
+
+int crypto_sign_keypair(unsigned char *pk, unsigned char *sk, unsigned char *seed)
+{
+  unsigned char h[64];
+  ge_p3 A;
+  int i;
+
+  sha512(h,seed,32);
+  h[0] &= 248;
+  h[31] &= 63;
+  h[31] |= 64;
+
+  ge_scalarmult_base(&A,h);
+  ge_p3_tobytes(pk,&A);
+
+  for (i = 0;i < 32;++i) sk[i] = seed[i];
+  for (i = 0;i < 32;++i) sk[32 + i] = pk[i];
+  return 0;
+}
--- a/open.c
+++ b/open.c
@ -0,0 +1,39 @@
+#include "crypto_sign.h"
+#include "sha512.h"
+#include "ge.h"
+#include "sc.h"
+
+int crypto_sign_open(
+  unsigned char *m,unsigned long long *mlen,
+  const unsigned char *sm,unsigned long long smlen,
+  const unsigned char *pk
+)
+{
+  unsigned char h[64];
+  unsigned char checkr[32];
+  ge_p3 A;
+  ge_p2 R;
+  unsigned long long i;
+
+  *mlen = -1;
+  if (smlen < 64) return -1;
+  if (sm[63] & 224) return -1;
+  if (ge_frombytes_negate_vartime(&A,pk) != 0) return -1;
+
+  for (i = 0;i < smlen;++i) m[i] = sm[i];
+  for (i = 0;i < 32;++i) m[32 + i] = pk[i];
+  sha512(h,m,smlen);
+  sc_reduce(h);
+
+  ge_double_scalarmult_vartime(&R,h,&A,sm + 32);
+  ge_tobytes(checkr,&R);
+  if (crypto_verify_32(checkr,sm) != 0) {
+    for (i = 0;i < smlen;++i) m[i] = 0;
+    return -1;
+  }
+
+  for (i = 0;i < smlen - 64;++i) m[i] = sm[64 + i];
+  for (i = smlen - 64;i < smlen;++i) m[i] = 0;
+  *mlen = smlen - 64;
+  return 0;
+}
--- a/pstdint.h
+++ b/pstdint.h
@ -0,0 +1,800 @@
+/*  A portable stdint.h
+ ****************************************************************************
+ *  BSD License:
+ ****************************************************************************
+ *
+ *  Copyright (c) 2005-2011 Paul Hsieh
+ *  All rights reserved.
+ *  
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *  
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. The name of the author may not be used to endorse or promote products
+ *     derived from this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ *  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ *  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ *  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ *  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ****************************************************************************
+ *
+ *  Version 0.1.12
+ *
+ *  The ANSI C standard committee, for the C99 standard, specified the
+ *  inclusion of a new standard include file called stdint.h.  This is
+ *  a very useful and long desired include file which contains several
+ *  very precise definitions for integer scalar types that is
+ *  critically important for making portable several classes of
+ *  applications including cryptography, hashing, variable length
+ *  integer libraries and so on.  But for most developers its likely
+ *  useful just for programming sanity.
+ *
+ *  The problem is that most compiler vendors have decided not to
+ *  implement the C99 standard, and the next C++ language standard
+ *  (which has a lot more mindshare these days) will be a long time in
+ *  coming and its unknown whether or not it will include stdint.h or
+ *  how much adoption it will have.  Either way, it will be a long time
+ *  before all compilers come with a stdint.h and it also does nothing
+ *  for the extremely large number of compilers available today which
+ *  do not include this file, or anything comparable to it.
+ *
+ *  So that's what this file is all about.  Its an attempt to build a
+ *  single universal include file that works on as many platforms as
+ *  possible to deliver what stdint.h is supposed to.  A few things
+ *  that should be noted about this file:
+ *
+ *    1) It is not guaranteed to be portable and/or present an identical
+ *       interface on all platforms.  The extreme variability of the
+ *       ANSI C standard makes this an impossibility right from the
+ *       very get go. Its really only meant to be useful for the vast
+ *       majority of platforms that possess the capability of
+ *       implementing usefully and precisely defined, standard sized
+ *       integer scalars.  Systems which are not intrinsically 2s
+ *       complement may produce invalid constants.
+ *
+ *    2) There is an unavoidable use of non-reserved symbols.
+ *
+ *    3) Other standard include files are invoked.
+ *
+ *    4) This file may come in conflict with future platforms that do
+ *       include stdint.h.  The hope is that one or the other can be
+ *       used with no real difference.
+ *
+ *    5) In the current verison, if your platform can't represent
+ *       int32_t, int16_t and int8_t, it just dumps out with a compiler
+ *       error.
+ *
+ *    6) 64 bit integers may or may not be defined.  Test for their
+ *       presence with the test: #ifdef INT64_MAX or #ifdef UINT64_MAX.
+ *       Note that this is different from the C99 specification which
+ *       requires the existence of 64 bit support in the compiler.  If
+ *       this is not defined for your platform, yet it is capable of
+ *       dealing with 64 bits then it is because this file has not yet
+ *       been extended to cover all of your system's capabilities.
+ *
+ *    7) (u)intptr_t may or may not be defined.  Test for its presence
+ *       with the test: #ifdef PTRDIFF_MAX.  If this is not defined
+ *       for your platform, then it is because this file has not yet
+ *       been extended to cover all of your system's capabilities, not
+ *       because its optional.
+ *
+ *    8) The following might not been defined even if your platform is
+ *       capable of defining it:
+ *
+ *       WCHAR_MIN
+ *       WCHAR_MAX
+ *       (u)int64_t
+ *       PTRDIFF_MIN
+ *       PTRDIFF_MAX
+ *       (u)intptr_t
+ *
+ *    9) The following have not been defined:
+ *
+ *       WINT_MIN
+ *       WINT_MAX
+ *
+ *   10) The criteria for defining (u)int_least(*)_t isn't clear,
+ *       except for systems which don't have a type that precisely
+ *       defined 8, 16, or 32 bit types (which this include file does
+ *       not support anyways). Default definitions have been given.
+ *
+ *   11) The criteria for defining (u)int_fast(*)_t isn't something I
+ *       would trust to any particular compiler vendor or the ANSI C
+ *       committee.  It is well known that "compatible systems" are
+ *       commonly created that have very different performance
+ *       characteristics from the systems they are compatible with,
+ *       especially those whose vendors make both the compiler and the
+ *       system.  Default definitions have been given, but its strongly
+ *       recommended that users never use these definitions for any
+ *       reason (they do *NOT* deliver any serious guarantee of
+ *       improved performance -- not in this file, nor any vendor's
+ *       stdint.h).
+ *
+ *   12) The following macros:
+ *
+ *       PRINTF_INTMAX_MODIFIER
+ *       PRINTF_INT64_MODIFIER
+ *       PRINTF_INT32_MODIFIER
+ *       PRINTF_INT16_MODIFIER
+ *       PRINTF_LEAST64_MODIFIER
+ *       PRINTF_LEAST32_MODIFIER
+ *       PRINTF_LEAST16_MODIFIER
+ *       PRINTF_INTPTR_MODIFIER
+ *
+ *       are strings which have been defined as the modifiers required
+ *       for the "d", "u" and "x" printf formats to correctly output
+ *       (u)intmax_t, (u)int64_t, (u)int32_t, (u)int16_t, (u)least64_t,
+ *       (u)least32_t, (u)least16_t and (u)intptr_t types respectively.
+ *       PRINTF_INTPTR_MODIFIER is not defined for some systems which
+ *       provide their own stdint.h.  PRINTF_INT64_MODIFIER is not
+ *       defined if INT64_MAX is not defined.  These are an extension
+ *       beyond what C99 specifies must be in stdint.h.
+ *
+ *       In addition, the following macros are defined:
+ *
+ *       PRINTF_INTMAX_HEX_WIDTH
+ *       PRINTF_INT64_HEX_WIDTH
+ *       PRINTF_INT32_HEX_WIDTH
+ *       PRINTF_INT16_HEX_WIDTH
+ *       PRINTF_INT8_HEX_WIDTH
+ *       PRINTF_INTMAX_DEC_WIDTH
+ *       PRINTF_INT64_DEC_WIDTH
+ *       PRINTF_INT32_DEC_WIDTH
+ *       PRINTF_INT16_DEC_WIDTH
+ *       PRINTF_INT8_DEC_WIDTH
+ *
+ *       Which specifies the maximum number of characters required to
+ *       print the number of that type in either hexadecimal or decimal.
+ *       These are an extension beyond what C99 specifies must be in
+ *       stdint.h.
+ *
+ *  Compilers tested (all with 0 warnings at their highest respective
+ *  settings): Borland Turbo C 2.0, WATCOM C/C++ 11.0 (16 bits and 32
+ *  bits), Microsoft Visual C++ 6.0 (32 bit), Microsoft Visual Studio
+ *  .net (VC7), Intel C++ 4.0, GNU gcc v3.3.3
+ *
+ *  This file should be considered a work in progress.  Suggestions for
+ *  improvements, especially those which increase coverage are strongly
+ *  encouraged.
+ *
+ *  Acknowledgements
+ *
+ *  The following people have made significant contributions to the
+ *  development and testing of this file:
+ *
+ *  Chris Howie
+ *  John Steele Scott
+ *  Dave Thorup
+ *  John Dill
+ *
+ */
+
+#include <stddef.h>
+#include <limits.h>
+#include <signal.h>
+
+/*
+ *  For gcc with _STDINT_H, fill in the PRINTF_INT*_MODIFIER macros, and
+ *  do nothing else.  On the Mac OS X version of gcc this is _STDINT_H_.
+ */
+
+#if ((defined(__STDC__) && __STDC__ && __STDC_VERSION__ >= 199901L) || (defined (__WATCOMC__) && (defined (_STDINT_H_INCLUDED) || __WATCOMC__ >= 1250)) || (defined(__GNUC__) && (defined(_STDINT_H) || defined(_STDINT_H_) || defined (__UINT_FAST64_TYPE__)) )) && !defined (_PSTDINT_H_INCLUDED)
+#include <stdint.h>
+#define _PSTDINT_H_INCLUDED
+# ifndef PRINTF_INT64_MODIFIER
+#  define PRINTF_INT64_MODIFIER "ll"
+# endif
+# ifndef PRINTF_INT32_MODIFIER
+#  define PRINTF_INT32_MODIFIER "l"
+# endif
+# ifndef PRINTF_INT16_MODIFIER
+#  define PRINTF_INT16_MODIFIER "h"
+# endif
+# ifndef PRINTF_INTMAX_MODIFIER
+#  define PRINTF_INTMAX_MODIFIER PRINTF_INT64_MODIFIER
+# endif
+# ifndef PRINTF_INT64_HEX_WIDTH
+#  define PRINTF_INT64_HEX_WIDTH "16"
+# endif
+# ifndef PRINTF_INT32_HEX_WIDTH
+#  define PRINTF_INT32_HEX_WIDTH "8"
+# endif
+# ifndef PRINTF_INT16_HEX_WIDTH
+#  define PRINTF_INT16_HEX_WIDTH "4"
+# endif
+# ifndef PRINTF_INT8_HEX_WIDTH
+#  define PRINTF_INT8_HEX_WIDTH "2"
+# endif
+# ifndef PRINTF_INT64_DEC_WIDTH
+#  define PRINTF_INT64_DEC_WIDTH "20"
+# endif
+# ifndef PRINTF_INT32_DEC_WIDTH
+#  define PRINTF_INT32_DEC_WIDTH "10"
+# endif
+# ifndef PRINTF_INT16_DEC_WIDTH
+#  define PRINTF_INT16_DEC_WIDTH "5"
+# endif
+# ifndef PRINTF_INT8_DEC_WIDTH
+#  define PRINTF_INT8_DEC_WIDTH "3"
+# endif
+# ifndef PRINTF_INTMAX_HEX_WIDTH
+#  define PRINTF_INTMAX_HEX_WIDTH PRINTF_INT64_HEX_WIDTH
+# endif
+# ifndef PRINTF_INTMAX_DEC_WIDTH
+#  define PRINTF_INTMAX_DEC_WIDTH PRINTF_INT64_DEC_WIDTH
+# endif
+
+/*
+ *  Something really weird is going on with Open Watcom.  Just pull some of
+ *  these duplicated definitions from Open Watcom's stdint.h file for now.
+ */
+
+# if defined (__WATCOMC__) && __WATCOMC__ >= 1250
+#  if !defined (INT64_C)
+#   define INT64_C(x)   (x + (INT64_MAX - INT64_MAX))
+#  endif
+#  if !defined (UINT64_C)
+#   define UINT64_C(x)  (x + (UINT64_MAX - UINT64_MAX))
+#  endif
+#  if !defined (INT32_C)
+#   define INT32_C(x)   (x + (INT32_MAX - INT32_MAX))
+#  endif
+#  if !defined (UINT32_C)
+#   define UINT32_C(x)  (x + (UINT32_MAX - UINT32_MAX))
+#  endif
+#  if !defined (INT16_C)
+#   define INT16_C(x)   (x)
+#  endif
+#  if !defined (UINT16_C)
+#   define UINT16_C(x)  (x)
+#  endif
+#  if !defined (INT8_C)
+#   define INT8_C(x)   (x)
+#  endif
+#  if !defined (UINT8_C)
+#   define UINT8_C(x)  (x)
+#  endif
+#  if !defined (UINT64_MAX)
+#   define UINT64_MAX  18446744073709551615ULL
+#  endif
+#  if !defined (INT64_MAX)
+#   define INT64_MAX  9223372036854775807LL
+#  endif
+#  if !defined (UINT32_MAX)
+#   define UINT32_MAX  4294967295UL
+#  endif
+#  if !defined (INT32_MAX)
+#   define INT32_MAX  2147483647L
+#  endif
+#  if !defined (INTMAX_MAX)
+#   define INTMAX_MAX INT64_MAX
+#  endif
+#  if !defined (INTMAX_MIN)
+#   define INTMAX_MIN INT64_MIN
+#  endif
+# endif
+#endif
+
+#ifndef _PSTDINT_H_INCLUDED
+#define _PSTDINT_H_INCLUDED
+
+#ifndef SIZE_MAX
+# define SIZE_MAX (~(size_t)0)
+#endif
+
+/*
+ *  Deduce the type assignments from limits.h under the assumption that
+ *  integer sizes in bits are powers of 2, and follow the ANSI
+ *  definitions.
+ */
+
+#ifndef UINT8_MAX
+# define UINT8_MAX 0xff
+#endif
+#ifndef uint8_t
+# if (UCHAR_MAX == UINT8_MAX) || defined (S_SPLINT_S)
+    typedef unsigned char uint8_t;
+#   define UINT8_C(v) ((uint8_t) v)
+# else
+#   error "Platform not supported"
+# endif
+#endif
+
+#ifndef INT8_MAX
+# define INT8_MAX 0x7f
+#endif
+#ifndef INT8_MIN
+# define INT8_MIN INT8_C(0x80)
+#endif
+#ifndef int8_t
+# if (SCHAR_MAX == INT8_MAX) || defined (S_SPLINT_S)
+    typedef signed char int8_t;
+#   define INT8_C(v) ((int8_t) v)
+# else
+#   error "Platform not supported"
+# endif
+#endif
+
+#ifndef UINT16_MAX
+# define UINT16_MAX 0xffff
+#endif
+#ifndef uint16_t
+#if (UINT_MAX == UINT16_MAX) || defined (S_SPLINT_S)
+  typedef unsigned int uint16_t;
+# ifndef PRINTF_INT16_MODIFIER
+#  define PRINTF_INT16_MODIFIER ""
+# endif
+# define UINT16_C(v) ((uint16_t) (v))
+#elif (USHRT_MAX == UINT16_MAX)
+  typedef unsigned short uint16_t;
+# define UINT16_C(v) ((uint16_t) (v))
+# ifndef PRINTF_INT16_MODIFIER
+#  define PRINTF_INT16_MODIFIER "h"
+# endif
+#else
+#error "Platform not supported"
+#endif
+#endif
+
+#ifndef INT16_MAX
+# define INT16_MAX 0x7fff
+#endif
+#ifndef INT16_MIN
+# define INT16_MIN INT16_C(0x8000)
+#endif
+#ifndef int16_t
+#if (INT_MAX == INT16_MAX) || defined (S_SPLINT_S)
+  typedef signed int int16_t;
+# define INT16_C(v) ((int16_t) (v))
+# ifndef PRINTF_INT16_MODIFIER
+#  define PRINTF_INT16_MODIFIER ""
+# endif
+#elif (SHRT_MAX == INT16_MAX)
+  typedef signed short int16_t;
+# define INT16_C(v) ((int16_t) (v))
+# ifndef PRINTF_INT16_MODIFIER
+#  define PRINTF_INT16_MODIFIER "h"
+# endif
+#else
+#error "Platform not supported"
+#endif
+#endif
+
+#ifndef UINT32_MAX
+# define UINT32_MAX (0xffffffffUL)
+#endif
+#ifndef uint32_t
+#if (ULONG_MAX == UINT32_MAX) || defined (S_SPLINT_S)
+  typedef unsigned long uint32_t;
+# define UINT32_C(v) v ## UL
+# ifndef PRINTF_INT32_MODIFIER
+#  define PRINTF_INT32_MODIFIER "l"
+# endif
+#elif (UINT_MAX == UINT32_MAX)
+  typedef unsigned int uint32_t;
+# ifndef PRINTF_INT32_MODIFIER
+#  define PRINTF_INT32_MODIFIER ""
+# endif
+# define UINT32_C(v) v ## U
+#elif (USHRT_MAX == UINT32_MAX)
+  typedef unsigned short uint32_t;
+# define UINT32_C(v) ((unsigned short) (v))
+# ifndef PRINTF_INT32_MODIFIER
+#  define PRINTF_INT32_MODIFIER ""
+# endif
+#else
+#error "Platform not supported"
+#endif
+#endif
+
+#ifndef INT32_MAX
+# define INT32_MAX (0x7fffffffL)
+#endif
+#ifndef INT32_MIN
+# define INT32_MIN INT32_C(0x80000000)
+#endif
+#ifndef int32_t
+#if (LONG_MAX == INT32_MAX) || defined (S_SPLINT_S)
+  typedef signed long int32_t;
+# define INT32_C(v) v ## L
+# ifndef PRINTF_INT32_MODIFIER
+#  define PRINTF_INT32_MODIFIER "l"
+# endif
+#elif (INT_MAX == INT32_MAX)
+  typedef signed int int32_t;
+# define INT32_C(v) v
+# ifndef PRINTF_INT32_MODIFIER
+#  define PRINTF_INT32_MODIFIER ""
+# endif
+#elif (SHRT_MAX == INT32_MAX)
+  typedef signed short int32_t;
+# define INT32_C(v) ((short) (v))
+# ifndef PRINTF_INT32_MODIFIER
+#  define PRINTF_INT32_MODIFIER ""
+# endif
+#else
+#error "Platform not supported"
+#endif
+#endif
+
+/*
+ *  The macro stdint_int64_defined is temporarily used to record
+ *  whether or not 64 integer support is available.  It must be
+ *  defined for any 64 integer extensions for new platforms that are
+ *  added.
+ */
+
+#undef stdint_int64_defined
+#if (defined(__STDC__) && defined(__STDC_VERSION__)) || defined (S_SPLINT_S)
+# if (__STDC__ && __STDC_VERSION__ >= 199901L) || defined (S_SPLINT_S)
+#  define stdint_int64_defined
+   typedef long long int64_t;
+   typedef unsigned long long uint64_t;
+#  define UINT64_C(v) v ## ULL
+#  define  INT64_C(v) v ## LL
+#  ifndef PRINTF_INT64_MODIFIER
+#   define PRINTF_INT64_MODIFIER "ll"
+#  endif
+# endif
+#endif
+
+#if !defined (stdint_int64_defined)
+# if defined(__GNUC__)
+#  define stdint_int64_defined
+   __extension__ typedef long long int64_t;
+   __extension__ typedef unsigned long long uint64_t;
+#  define UINT64_C(v) v ## ULL
+#  define  INT64_C(v) v ## LL
+#  ifndef PRINTF_INT64_MODIFIER
+#   define PRINTF_INT64_MODIFIER "ll"
+#  endif
+# elif defined(__MWERKS__) || defined (__SUNPRO_C) || defined (__SUNPRO_CC) || defined (__APPLE_CC__) || defined (_LONG_LONG) || defined (_CRAYC) || defined (S_SPLINT_S)
+#  define stdint_int64_defined
+   typedef long long int64_t;
+   typedef unsigned long long uint64_t;
+#  define UINT64_C(v) v ## ULL
+#  define  INT64_C(v) v ## LL
+#  ifndef PRINTF_INT64_MODIFIER
+#   define PRINTF_INT64_MODIFIER "ll"
+#  endif
+# elif (defined(__WATCOMC__) && defined(__WATCOM_INT64__)) || (defined(_MSC_VER) && _INTEGRAL_MAX_BITS >= 64) || (defined (__BORLANDC__) && __BORLANDC__ > 0x460) || defined (__alpha) || defined (__DECC)
+#  define stdint_int64_defined
+   typedef __int64 int64_t;
+   typedef unsigned __int64 uint64_t;
+#  define UINT64_C(v) v ## UI64
+#  define  INT64_C(v) v ## I64
+#  ifndef PRINTF_INT64_MODIFIER
+#   define PRINTF_INT64_MODIFIER "I64"
+#  endif
+# endif
+#endif
+
+#if !defined (LONG_LONG_MAX) && defined (INT64_C)
+# define LONG_LONG_MAX INT64_C (9223372036854775807)
+#endif
+#ifndef ULONG_LONG_MAX
+# define ULONG_LONG_MAX UINT64_C (18446744073709551615)
+#endif
+
+#if !defined (INT64_MAX) && defined (INT64_C)
+# define INT64_MAX INT64_C (9223372036854775807)
+#endif
+#if !defined (INT64_MIN) && defined (INT64_C)
+# define INT64_MIN INT64_C (-9223372036854775808)
+#endif
+#if !defined (UINT64_MAX) && defined (INT64_C)
+# define UINT64_MAX UINT64_C (18446744073709551615)
+#endif
+
+/*
+ *  Width of hexadecimal for number field.
+ */
+
+#ifndef PRINTF_INT64_HEX_WIDTH
+# define PRINTF_INT64_HEX_WIDTH "16"
+#endif
+#ifndef PRINTF_INT32_HEX_WIDTH
+# define PRINTF_INT32_HEX_WIDTH "8"
+#endif
+#ifndef PRINTF_INT16_HEX_WIDTH
+# define PRINTF_INT16_HEX_WIDTH "4"
+#endif
+#ifndef PRINTF_INT8_HEX_WIDTH
+# define PRINTF_INT8_HEX_WIDTH "2"
+#endif
+
+#ifndef PRINTF_INT64_DEC_WIDTH
+# define PRINTF_INT64_DEC_WIDTH "20"
+#endif
+#ifndef PRINTF_INT32_DEC_WIDTH
+# define PRINTF_INT32_DEC_WIDTH "10"
+#endif
+#ifndef PRINTF_INT16_DEC_WIDTH
+# define PRINTF_INT16_DEC_WIDTH "5"
+#endif
+#ifndef PRINTF_INT8_DEC_WIDTH
+# define PRINTF_INT8_DEC_WIDTH "3"
+#endif
+
+/*
+ *  Ok, lets not worry about 128 bit integers for now.  Moore's law says
+ *  we don't need to worry about that until about 2040 at which point
+ *  we'll have bigger things to worry about.
+ */
+
+#ifdef stdint_int64_defined
+  typedef int64_t intmax_t;
+  typedef uint64_t uintmax_t;
+# define  INTMAX_MAX   INT64_MAX
+# define  INTMAX_MIN   INT64_MIN
+# define UINTMAX_MAX  UINT64_MAX
+# define UINTMAX_C(v) UINT64_C(v)
+# define  INTMAX_C(v)  INT64_C(v)
+# ifndef PRINTF_INTMAX_MODIFIER
+#   define PRINTF_INTMAX_MODIFIER PRINTF_INT64_MODIFIER
+# endif
+# ifndef PRINTF_INTMAX_HEX_WIDTH
+#  define PRINTF_INTMAX_HEX_WIDTH PRINTF_INT64_HEX_WIDTH
+# endif
+# ifndef PRINTF_INTMAX_DEC_WIDTH
+#  define PRINTF_INTMAX_DEC_WIDTH PRINTF_INT64_DEC_WIDTH
+# endif
+#else
+  typedef int32_t intmax_t;
+  typedef uint32_t uintmax_t;
+# define  INTMAX_MAX   INT32_MAX
+# define UINTMAX_MAX  UINT32_MAX
+# define UINTMAX_C(v) UINT32_C(v)
+# define  INTMAX_C(v)  INT32_C(v)
+# ifndef PRINTF_INTMAX_MODIFIER
+#   define PRINTF_INTMAX_MODIFIER PRINTF_INT32_MODIFIER
+# endif
+# ifndef PRINTF_INTMAX_HEX_WIDTH
+#  define PRINTF_INTMAX_HEX_WIDTH PRINTF_INT32_HEX_WIDTH
+# endif
+# ifndef PRINTF_INTMAX_DEC_WIDTH
+#  define PRINTF_INTMAX_DEC_WIDTH PRINTF_INT32_DEC_WIDTH
+# endif
+#endif
+
+/*
+ *  Because this file currently only supports platforms which have
+ *  precise powers of 2 as bit sizes for the default integers, the
+ *  least definitions are all trivial.  Its possible that a future
+ *  version of this file could have different definitions.
+ */
+
+#ifndef stdint_least_defined
+  typedef   int8_t   int_least8_t;
+  typedef  uint8_t  uint_least8_t;
+  typedef  int16_t  int_least16_t;
+  typedef uint16_t uint_least16_t;
+  typedef  int32_t  int_least32_t;
+  typedef uint32_t uint_least32_t;
+# define PRINTF_LEAST32_MODIFIER PRINTF_INT32_MODIFIER
+# define PRINTF_LEAST16_MODIFIER PRINTF_INT16_MODIFIER
+# define  UINT_LEAST8_MAX  UINT8_MAX
+# define   INT_LEAST8_MAX   INT8_MAX
+# define UINT_LEAST16_MAX UINT16_MAX
+# define  INT_LEAST16_MAX  INT16_MAX
+# define UINT_LEAST32_MAX UINT32_MAX
+# define  INT_LEAST32_MAX  INT32_MAX
+# define   INT_LEAST8_MIN   INT8_MIN
+# define  INT_LEAST16_MIN  INT16_MIN
+# define  INT_LEAST32_MIN  INT32_MIN
+# ifdef stdint_int64_defined
+    typedef  int64_t  int_least64_t;
+    typedef uint64_t uint_least64_t;
+#   define PRINTF_LEAST64_MODIFIER PRINTF_INT64_MODIFIER
+#   define UINT_LEAST64_MAX UINT64_MAX
+#   define  INT_LEAST64_MAX  INT64_MAX
+#   define  INT_LEAST64_MIN  INT64_MIN
+# endif
+#endif
+#undef stdint_least_defined
+
+/*
+ *  The ANSI C committee pretending to know or specify anything about
+ *  performance is the epitome of misguided arrogance.  The mandate of
+ *  this file is to *ONLY* ever support that absolute minimum
+ *  definition of the fast integer types, for compatibility purposes.
+ *  No extensions, and no attempt to suggest what may or may not be a
+ *  faster integer type will ever be made in this file.  Developers are
+ *  warned to stay away from these types when using this or any other
+ *  stdint.h.
+ */
+
+typedef   int_least8_t   int_fast8_t;
+typedef  uint_least8_t  uint_fast8_t;
+typedef  int_least16_t  int_fast16_t;
+typedef uint_least16_t uint_fast16_t;
+typedef  int_least32_t  int_fast32_t;
+typedef uint_least32_t uint_fast32_t;
+#define  UINT_FAST8_MAX  UINT_LEAST8_MAX
+#define   INT_FAST8_MAX   INT_LEAST8_MAX
+#define UINT_FAST16_MAX UINT_LEAST16_MAX
+#define  INT_FAST16_MAX  INT_LEAST16_MAX
+#define UINT_FAST32_MAX UINT_LEAST32_MAX
+#define  INT_FAST32_MAX  INT_LEAST32_MAX
+#define   INT_FAST8_MIN   INT_LEAST8_MIN
+#define  INT_FAST16_MIN  INT_LEAST16_MIN
+#define  INT_FAST32_MIN  INT_LEAST32_MIN
+#ifdef stdint_int64_defined
+  typedef  int_least64_t  int_fast64_t;
+  typedef uint_least64_t uint_fast64_t;
+# define UINT_FAST64_MAX UINT_LEAST64_MAX
+# define  INT_FAST64_MAX  INT_LEAST64_MAX
+# define  INT_FAST64_MIN  INT_LEAST64_MIN
+#endif
+
+#undef stdint_int64_defined
+
+/*
+ *  Whatever piecemeal, per compiler thing we can do about the wchar_t
+ *  type limits.
+ */
+
+#if defined(__WATCOMC__) || defined(_MSC_VER) || defined (__GNUC__)
+# include <wchar.h>
+# ifndef WCHAR_MIN
+#  define WCHAR_MIN 0
+# endif
+# ifndef WCHAR_MAX
+#  define WCHAR_MAX ((wchar_t)-1)
+# endif
+#endif
+
+/*
+ *  Whatever piecemeal, per compiler/platform thing we can do about the
+ *  (u)intptr_t types and limits.
+ */
+
+#if defined (_MSC_VER) && defined (_UINTPTR_T_DEFINED)
+# define STDINT_H_UINTPTR_T_DEFINED
+#endif
+
+#ifndef STDINT_H_UINTPTR_T_DEFINED
+# if defined (__alpha__) || defined (__ia64__) || defined (__x86_64__) || defined (_WIN64)
+#  define stdint_intptr_bits 64
+# elif defined (__WATCOMC__) || defined (__TURBOC__)
+#  if defined(__TINY__) || defined(__SMALL__) || defined(__MEDIUM__)
+#    define stdint_intptr_bits 16
+#  else
+#    define stdint_intptr_bits 32
+#  endif
+# elif defined (__i386__) || defined (_WIN32) || defined (WIN32)
+#  define stdint_intptr_bits 32
+# elif defined (__INTEL_COMPILER)
+/* TODO -- what did Intel do about x86-64? */
+# endif
+
+# ifdef stdint_intptr_bits
+#  define stdint_intptr_glue3_i(a,b,c)  a##b##c
+#  define stdint_intptr_glue3(a,b,c)    stdint_intptr_glue3_i(a,b,c)
+#  ifndef PRINTF_INTPTR_MODIFIER
+#    define PRINTF_INTPTR_MODIFIER      stdint_intptr_glue3(PRINTF_INT,stdint_intptr_bits,_MODIFIER)
+#  endif
+#  ifndef PTRDIFF_MAX
+#    define PTRDIFF_MAX                 stdint_intptr_glue3(INT,stdint_intptr_bits,_MAX)
+#  endif
+#  ifndef PTRDIFF_MIN
+#    define PTRDIFF_MIN                 stdint_intptr_glue3(INT,stdint_intptr_bits,_MIN)
+#  endif
+#  ifndef UINTPTR_MAX
+#    define UINTPTR_MAX                 stdint_intptr_glue3(UINT,stdint_intptr_bits,_MAX)
+#  endif
+#  ifndef INTPTR_MAX
+#    define INTPTR_MAX                  stdint_intptr_glue3(INT,stdint_intptr_bits,_MAX)
+#  endif
+#  ifndef INTPTR_MIN
+#    define INTPTR_MIN                  stdint_intptr_glue3(INT,stdint_intptr_bits,_MIN)
+#  endif
+#  ifndef INTPTR_C
+#    define INTPTR_C(x)                 stdint_intptr_glue3(INT,stdint_intptr_bits,_C)(x)
+#  endif
+#  ifndef UINTPTR_C
+#    define UINTPTR_C(x)                stdint_intptr_glue3(UINT,stdint_intptr_bits,_C)(x)
+#  endif
+  typedef stdint_intptr_glue3(uint,stdint_intptr_bits,_t) uintptr_t;
+  typedef stdint_intptr_glue3( int,stdint_intptr_bits,_t)  intptr_t;
+# else
+/* TODO -- This following is likely wrong for some platforms, and does
+   nothing for the definition of uintptr_t. */
+  typedef ptrdiff_t intptr_t;
+# endif
+# define STDINT_H_UINTPTR_T_DEFINED
+#endif
+
+/*
+ *  Assumes sig_atomic_t is signed and we have a 2s complement machine.
+ */
+
+#ifndef SIG_ATOMIC_MAX
+# define SIG_ATOMIC_MAX ((((sig_atomic_t) 1) << (sizeof (sig_atomic_t)*CHAR_BIT-1)) - 1)
+#endif
+
+#endif
+
+#if defined (__TEST_PSTDINT_FOR_CORRECTNESS)
+
+/* 
+ *  Please compile with the maximum warning settings to make sure macros are not
+ *  defined more than once.
+ */
+ 
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+ 
+#define glue3_aux(x,y,z) x ## y ## z
+#define glue3(x,y,z) glue3_aux(x,y,z)
+
+#define DECLU(bits) glue3(uint,bits,_t) glue3(u,bits,=) glue3(UINT,bits,_C) (0);
+#define DECLI(bits) glue3(int,bits,_t) glue3(i,bits,=) glue3(INT,bits,_C) (0);
+
+#define DECL(us,bits) glue3(DECL,us,) (bits)
+
+#define TESTUMAX(bits) glue3(u,bits,=) glue3(~,u,bits); if (glue3(UINT,bits,_MAX) glue3(!=,u,bits)) printf ("Something wrong with UINT%d_MAX\n", bits)
+ 
+int main () {
+	DECL(I,8)
+	DECL(U,8)
+	DECL(I,16)
+	DECL(U,16)
+	DECL(I,32)
+	DECL(U,32)
+#ifdef INT64_MAX
+	DECL(I,64)
+	DECL(U,64)
+#endif
+	intmax_t imax = INTMAX_C(0);
+	uintmax_t umax = UINTMAX_C(0);
+	char str0[256], str1[256];
+
+	sprintf (str0, "%d %x\n", 0, ~0);
+	
+	sprintf (str1, "%d %x\n",  i8, ~0);
+	if (0 != strcmp (str0, str1)) printf ("Something wrong with i8 : %s\n", str1);
+	sprintf (str1, "%u %x\n",  u8, ~0);
+	if (0 != strcmp (str0, str1)) printf ("Something wrong with u8 : %s\n", str1);
+	sprintf (str1, "%d %x\n",  i16, ~0);
+	if (0 != strcmp (str0, str1)) printf ("Something wrong with i16 : %s\n", str1);
+	sprintf (str1, "%u %x\n",  u16, ~0);
+	if (0 != strcmp (str0, str1)) printf ("Something wrong with u16 : %s\n", str1);	
+	sprintf (str1, "%" PRINTF_INT32_MODIFIER "d %x\n",  i32, ~0);
+	if (0 != strcmp (str0, str1)) printf ("Something wrong with i32 : %s\n", str1);
+	sprintf (str1, "%" PRINTF_INT32_MODIFIER "u %x\n",  u32, ~0);
+	if (0 != strcmp (str0, str1)) printf ("Something wrong with u32 : %s\n", str1);
+#ifdef INT64_MAX	
+	sprintf (str1, "%" PRINTF_INT64_MODIFIER "d %x\n",  i64, ~0);
+	if (0 != strcmp (str0, str1)) printf ("Something wrong with i64 : %s\n", str1);
+#endif
+	sprintf (str1, "%" PRINTF_INTMAX_MODIFIER "d %x\n",  imax, ~0);
+	if (0 != strcmp (str0, str1)) printf ("Something wrong with imax : %s\n", str1);
+	sprintf (str1, "%" PRINTF_INTMAX_MODIFIER "u %x\n",  umax, ~0);
+	if (0 != strcmp (str0, str1)) printf ("Something wrong with umax : %s\n", str1);	
+	
+	TESTUMAX(8);
+	TESTUMAX(16);
+	TESTUMAX(32);
+#ifdef INT64_MAX
+	TESTUMAX(64);
+#endif
+
+	return EXIT_SUCCESS;
+}
+
+#endif
--- a/sc.h
+++ b/sc.h
@ -0,0 +1,15 @@
+#ifndef SC_H
+#define SC_H
+
+/*
+The set of scalars is \Z/l
+where l = 2^252 + 27742317777372353535851937790883648493.
+*/
+
+#define sc_reduce crypto_sign_ed25519_ref10_sc_reduce
+#define sc_muladd crypto_sign_ed25519_ref10_sc_muladd
+
+extern void sc_reduce(unsigned char *);
+extern void sc_muladd(unsigned char *,const unsigned char *,const unsigned char *,const unsigned char *);
+
+#endif
--- a/sc_muladd.c
+++ b/sc_muladd.c
@ -0,0 +1,366 @@
+#include "sc.h"
+#include "pstdint.h"
+
+static uint64_t load_3(const unsigned char *in)
+{
+  uint64_t result;
+  result = (uint64_t) in[0];
+  result |= ((uint64_t) in[1]) << 8;
+  result |= ((uint64_t) in[2]) << 16;
+  return result;
+}
+
+static uint64_t load_4(const unsigned char *in)
+{
+  uint64_t result;
+  result = (uint64_t) in[0];
+  result |= ((uint64_t) in[1]) << 8;
+  result |= ((uint64_t) in[2]) << 16;
+  result |= ((uint64_t) in[3]) << 24;
+  return result;
+}
+
+/*
+Input:
+  a[0]+256*a[1]+...+256^31*a[31] = a
+  b[0]+256*b[1]+...+256^31*b[31] = b
+  c[0]+256*c[1]+...+256^31*c[31] = c
+
+Output:
+  s[0]+256*s[1]+...+256^31*s[31] = (ab+c) mod l
+  where l = 2^252 + 27742317777372353535851937790883648493.
+*/
+
+void sc_muladd(unsigned char *s,const unsigned char *a,const unsigned char *b,const unsigned char *c)
+{
+  int64_t a0 = 2097151 & load_3(a);
+  int64_t a1 = 2097151 & (load_4(a + 2) >> 5);
+  int64_t a2 = 2097151 & (load_3(a + 5) >> 2);
+  int64_t a3 = 2097151 & (load_4(a + 7) >> 7);
+  int64_t a4 = 2097151 & (load_4(a + 10) >> 4);
+  int64_t a5 = 2097151 & (load_3(a + 13) >> 1);
+  int64_t a6 = 2097151 & (load_4(a + 15) >> 6);
+  int64_t a7 = 2097151 & (load_3(a + 18) >> 3);
+  int64_t a8 = 2097151 & load_3(a + 21);
+  int64_t a9 = 2097151 & (load_4(a + 23) >> 5);
+  int64_t a10 = 2097151 & (load_3(a + 26) >> 2);
+  int64_t a11 = (load_4(a + 28) >> 7);
+  int64_t b0 = 2097151 & load_3(b);
+  int64_t b1 = 2097151 & (load_4(b + 2) >> 5);
+  int64_t b2 = 2097151 & (load_3(b + 5) >> 2);
+  int64_t b3 = 2097151 & (load_4(b + 7) >> 7);
+  int64_t b4 = 2097151 & (load_4(b + 10) >> 4);
+  int64_t b5 = 2097151 & (load_3(b + 13) >> 1);
+  int64_t b6 = 2097151 & (load_4(b + 15) >> 6);
+  int64_t b7 = 2097151 & (load_3(b + 18) >> 3);
+  int64_t b8 = 2097151 & load_3(b + 21);
+  int64_t b9 = 2097151 & (load_4(b + 23) >> 5);
+  int64_t b10 = 2097151 & (load_3(b + 26) >> 2);
+  int64_t b11 = (load_4(b + 28) >> 7);
+  int64_t c0 = 2097151 & load_3(c);
+  int64_t c1 = 2097151 & (load_4(c + 2) >> 5);
+  int64_t c2 = 2097151 & (load_3(c + 5) >> 2);
+  int64_t c3 = 2097151 & (load_4(c + 7) >> 7);
+  int64_t c4 = 2097151 & (load_4(c + 10) >> 4);
+  int64_t c5 = 2097151 & (load_3(c + 13) >> 1);
+  int64_t c6 = 2097151 & (load_4(c + 15) >> 6);
+  int64_t c7 = 2097151 & (load_3(c + 18) >> 3);
+  int64_t c8 = 2097151 & load_3(c + 21);
+  int64_t c9 = 2097151 & (load_4(c + 23) >> 5);
+  int64_t c10 = 2097151 & (load_3(c + 26) >> 2);
+  int64_t c11 = (load_4(c + 28) >> 7);
+  int64_t s0;
+  int64_t s1;
+  int64_t s2;
+  int64_t s3;
+  int64_t s4;
+  int64_t s5;
+  int64_t s6;
+  int64_t s7;
+  int64_t s8;
+  int64_t s9;
+  int64_t s10;
+  int64_t s11;
+  int64_t s12;
+  int64_t s13;
+  int64_t s14;
+  int64_t s15;
+  int64_t s16;
+  int64_t s17;
+  int64_t s18;
+  int64_t s19;
+  int64_t s20;
+  int64_t s21;
+  int64_t s22;
+  int64_t s23;
+  int64_t carry0;
+  int64_t carry1;
+  int64_t carry2;
+  int64_t carry3;
+  int64_t carry4;
+  int64_t carry5;
+  int64_t carry6;
+  int64_t carry7;
+  int64_t carry8;
+  int64_t carry9;
+  int64_t carry10;
+  int64_t carry11;
+  int64_t carry12;
+  int64_t carry13;
+  int64_t carry14;
+  int64_t carry15;
+  int64_t carry16;
+  int64_t carry17;
+  int64_t carry18;
+  int64_t carry19;
+  int64_t carry20;
+  int64_t carry21;
+  int64_t carry22;
+
+  s0 = c0 + a0*b0;
+  s1 = c1 + a0*b1 + a1*b0;
+  s2 = c2 + a0*b2 + a1*b1 + a2*b0;
+  s3 = c3 + a0*b3 + a1*b2 + a2*b1 + a3*b0;
+  s4 = c4 + a0*b4 + a1*b3 + a2*b2 + a3*b1 + a4*b0;
+  s5 = c5 + a0*b5 + a1*b4 + a2*b3 + a3*b2 + a4*b1 + a5*b0;
+  s6 = c6 + a0*b6 + a1*b5 + a2*b4 + a3*b3 + a4*b2 + a5*b1 + a6*b0;
+  s7 = c7 + a0*b7 + a1*b6 + a2*b5 + a3*b4 + a4*b3 + a5*b2 + a6*b1 + a7*b0;
+  s8 = c8 + a0*b8 + a1*b7 + a2*b6 + a3*b5 + a4*b4 + a5*b3 + a6*b2 + a7*b1 + a8*b0;
+  s9 = c9 + a0*b9 + a1*b8 + a2*b7 + a3*b6 + a4*b5 + a5*b4 + a6*b3 + a7*b2 + a8*b1 + a9*b0;
+  s10 = c10 + a0*b10 + a1*b9 + a2*b8 + a3*b7 + a4*b6 + a5*b5 + a6*b4 + a7*b3 + a8*b2 + a9*b1 + a10*b0;
+  s11 = c11 + a0*b11 + a1*b10 + a2*b9 + a3*b8 + a4*b7 + a5*b6 + a6*b5 + a7*b4 + a8*b3 + a9*b2 + a10*b1 + a11*b0;
+  s12 = a1*b11 + a2*b10 + a3*b9 + a4*b8 + a5*b7 + a6*b6 + a7*b5 + a8*b4 + a9*b3 + a10*b2 + a11*b1;
+  s13 = a2*b11 + a3*b10 + a4*b9 + a5*b8 + a6*b7 + a7*b6 + a8*b5 + a9*b4 + a10*b3 + a11*b2;
+  s14 = a3*b11 + a4*b10 + a5*b9 + a6*b8 + a7*b7 + a8*b6 + a9*b5 + a10*b4 + a11*b3;
+  s15 = a4*b11 + a5*b10 + a6*b9 + a7*b8 + a8*b7 + a9*b6 + a10*b5 + a11*b4;
+  s16 = a5*b11 + a6*b10 + a7*b9 + a8*b8 + a9*b7 + a10*b6 + a11*b5;
+  s17 = a6*b11 + a7*b10 + a8*b9 + a9*b8 + a10*b7 + a11*b6;
+  s18 = a7*b11 + a8*b10 + a9*b9 + a10*b8 + a11*b7;
+  s19 = a8*b11 + a9*b10 + a10*b9 + a11*b8;
+  s20 = a9*b11 + a10*b10 + a11*b9;
+  s21 = a10*b11 + a11*b10;
+  s22 = a11*b11;
+  s23 = 0;
+
+  carry0 = (s0 + (1<<20)) >> 21; s1 += carry0; s0 -= carry0 << 21;
+  carry2 = (s2 + (1<<20)) >> 21; s3 += carry2; s2 -= carry2 << 21;
+  carry4 = (s4 + (1<<20)) >> 21; s5 += carry4; s4 -= carry4 << 21;
+  carry6 = (s6 + (1<<20)) >> 21; s7 += carry6; s6 -= carry6 << 21;
+  carry8 = (s8 + (1<<20)) >> 21; s9 += carry8; s8 -= carry8 << 21;
+  carry10 = (s10 + (1<<20)) >> 21; s11 += carry10; s10 -= carry10 << 21;
+  carry12 = (s12 + (1<<20)) >> 21; s13 += carry12; s12 -= carry12 << 21;
+  carry14 = (s14 + (1<<20)) >> 21; s15 += carry14; s14 -= carry14 << 21;
+  carry16 = (s16 + (1<<20)) >> 21; s17 += carry16; s16 -= carry16 << 21;
+  carry18 = (s18 + (1<<20)) >> 21; s19 += carry18; s18 -= carry18 << 21;
+  carry20 = (s20 + (1<<20)) >> 21; s21 += carry20; s20 -= carry20 << 21;
+  carry22 = (s22 + (1<<20)) >> 21; s23 += carry22; s22 -= carry22 << 21;
+
+  carry1 = (s1 + (1<<20)) >> 21; s2 += carry1; s1 -= carry1 << 21;
+  carry3 = (s3 + (1<<20)) >> 21; s4 += carry3; s3 -= carry3 << 21;
+  carry5 = (s5 + (1<<20)) >> 21; s6 += carry5; s5 -= carry5 << 21;
+  carry7 = (s7 + (1<<20)) >> 21; s8 += carry7; s7 -= carry7 << 21;
+  carry9 = (s9 + (1<<20)) >> 21; s10 += carry9; s9 -= carry9 << 21;
+  carry11 = (s11 + (1<<20)) >> 21; s12 += carry11; s11 -= carry11 << 21;
+  carry13 = (s13 + (1<<20)) >> 21; s14 += carry13; s13 -= carry13 << 21;
+  carry15 = (s15 + (1<<20)) >> 21; s16 += carry15; s15 -= carry15 << 21;
+  carry17 = (s17 + (1<<20)) >> 21; s18 += carry17; s17 -= carry17 << 21;
+  carry19 = (s19 + (1<<20)) >> 21; s20 += carry19; s19 -= carry19 << 21;
+  carry21 = (s21 + (1<<20)) >> 21; s22 += carry21; s21 -= carry21 << 21;
+
+  s11 += s23 * 666643;
+  s12 += s23 * 470296;
+  s13 += s23 * 654183;
+  s14 -= s23 * 997805;
+  s15 += s23 * 136657;
+  s16 -= s23 * 683901;
+  s23 = 0;
+
+  s10 += s22 * 666643;
+  s11 += s22 * 470296;
+  s12 += s22 * 654183;
+  s13 -= s22 * 997805;
+  s14 += s22 * 136657;
+  s15 -= s22 * 683901;
+  s22 = 0;
+
+  s9 += s21 * 666643;
+  s10 += s21 * 470296;
+  s11 += s21 * 654183;
+  s12 -= s21 * 997805;
+  s13 += s21 * 136657;
+  s14 -= s21 * 683901;
+  s21 = 0;
+
+  s8 += s20 * 666643;
+  s9 += s20 * 470296;
+  s10 += s20 * 654183;
+  s11 -= s20 * 997805;
+  s12 += s20 * 136657;
+  s13 -= s20 * 683901;
+  s20 = 0;
+
+  s7 += s19 * 666643;
+  s8 += s19 * 470296;
+  s9 += s19 * 654183;
+  s10 -= s19 * 997805;
+  s11 += s19 * 136657;
+  s12 -= s19 * 683901;
+  s19 = 0;
+
+  s6 += s18 * 666643;
+  s7 += s18 * 470296;
+  s8 += s18 * 654183;
+  s9 -= s18 * 997805;
+  s10 += s18 * 136657;
+  s11 -= s18 * 683901;
+  s18 = 0;
+
+  carry6 = (s6 + (1<<20)) >> 21; s7 += carry6; s6 -= carry6 << 21;
+  carry8 = (s8 + (1<<20)) >> 21; s9 += carry8; s8 -= carry8 << 21;
+  carry10 = (s10 + (1<<20)) >> 21; s11 += carry10; s10 -= carry10 << 21;
+  carry12 = (s12 + (1<<20)) >> 21; s13 += carry12; s12 -= carry12 << 21;
+  carry14 = (s14 + (1<<20)) >> 21; s15 += carry14; s14 -= carry14 << 21;
+  carry16 = (s16 + (1<<20)) >> 21; s17 += carry16; s16 -= carry16 << 21;
+
+  carry7 = (s7 + (1<<20)) >> 21; s8 += carry7; s7 -= carry7 << 21;
+  carry9 = (s9 + (1<<20)) >> 21; s10 += carry9; s9 -= carry9 << 21;
+  carry11 = (s11 + (1<<20)) >> 21; s12 += carry11; s11 -= carry11 << 21;
+  carry13 = (s13 + (1<<20)) >> 21; s14 += carry13; s13 -= carry13 << 21;
+  carry15 = (s15 + (1<<20)) >> 21; s16 += carry15; s15 -= carry15 << 21;
+
+  s5 += s17 * 666643;
+  s6 += s17 * 470296;
+  s7 += s17 * 654183;
+  s8 -= s17 * 997805;
+  s9 += s17 * 136657;
+  s10 -= s17 * 683901;
+  s17 = 0;
+
+  s4 += s16 * 666643;
+  s5 += s16 * 470296;
+  s6 += s16 * 654183;
+  s7 -= s16 * 997805;
+  s8 += s16 * 136657;
+  s9 -= s16 * 683901;
+  s16 = 0;
+
+  s3 += s15 * 666643;
+  s4 += s15 * 470296;
+  s5 += s15 * 654183;
+  s6 -= s15 * 997805;
+  s7 += s15 * 136657;
+  s8 -= s15 * 683901;
+  s15 = 0;
+
+  s2 += s14 * 666643;
+  s3 += s14 * 470296;
+  s4 += s14 * 654183;
+  s5 -= s14 * 997805;
+  s6 += s14 * 136657;
+  s7 -= s14 * 683901;
+  s14 = 0;
+
+  s1 += s13 * 666643;
+  s2 += s13 * 470296;
+  s3 += s13 * 654183;
+  s4 -= s13 * 997805;
+  s5 += s13 * 136657;
+  s6 -= s13 * 683901;
+  s13 = 0;
+
+  s0 += s12 * 666643;
+  s1 += s12 * 470296;
+  s2 += s12 * 654183;
+  s3 -= s12 * 997805;
+  s4 += s12 * 136657;
+  s5 -= s12 * 683901;
+  s12 = 0;
+
+  carry0 = (s0 + (1<<20)) >> 21; s1 += carry0; s0 -= carry0 << 21;
+  carry2 = (s2 + (1<<20)) >> 21; s3 += carry2; s2 -= carry2 << 21;
+  carry4 = (s4 + (1<<20)) >> 21; s5 += carry4; s4 -= carry4 << 21;
+  carry6 = (s6 + (1<<20)) >> 21; s7 += carry6; s6 -= carry6 << 21;
+  carry8 = (s8 + (1<<20)) >> 21; s9 += carry8; s8 -= carry8 << 21;
+  carry10 = (s10 + (1<<20)) >> 21; s11 += carry10; s10 -= carry10 << 21;
+
+  carry1 = (s1 + (1<<20)) >> 21; s2 += carry1; s1 -= carry1 << 21;
+  carry3 = (s3 + (1<<20)) >> 21; s4 += carry3; s3 -= carry3 << 21;
+  carry5 = (s5 + (1<<20)) >> 21; s6 += carry5; s5 -= carry5 << 21;
+  carry7 = (s7 + (1<<20)) >> 21; s8 += carry7; s7 -= carry7 << 21;
+  carry9 = (s9 + (1<<20)) >> 21; s10 += carry9; s9 -= carry9 << 21;
+  carry11 = (s11 + (1<<20)) >> 21; s12 += carry11; s11 -= carry11 << 21;
+
+  s0 += s12 * 666643;
+  s1 += s12 * 470296;
+  s2 += s12 * 654183;
+  s3 -= s12 * 997805;
+  s4 += s12 * 136657;
+  s5 -= s12 * 683901;
+  s12 = 0;
+
+  carry0 = s0 >> 21; s1 += carry0; s0 -= carry0 << 21;
+  carry1 = s1 >> 21; s2 += carry1; s1 -= carry1 << 21;
+  carry2 = s2 >> 21; s3 += carry2; s2 -= carry2 << 21;
+  carry3 = s3 >> 21; s4 += carry3; s3 -= carry3 << 21;
+  carry4 = s4 >> 21; s5 += carry4; s4 -= carry4 << 21;
+  carry5 = s5 >> 21; s6 += carry5; s5 -= carry5 << 21;
+  carry6 = s6 >> 21; s7 += carry6; s6 -= carry6 << 21;
+  carry7 = s7 >> 21; s8 += carry7; s7 -= carry7 << 21;
+  carry8 = s8 >> 21; s9 += carry8; s8 -= carry8 << 21;
+  carry9 = s9 >> 21; s10 += carry9; s9 -= carry9 << 21;
+  carry10 = s10 >> 21; s11 += carry10; s10 -= carry10 << 21;
+  carry11 = s11 >> 21; s12 += carry11; s11 -= carry11 << 21;
+
+  s0 += s12 * 666643;
+  s1 += s12 * 470296;
+  s2 += s12 * 654183;
+  s3 -= s12 * 997805;
+  s4 += s12 * 136657;
+  s5 -= s12 * 683901;
+  s12 = 0;
+
+  carry0 = s0 >> 21; s1 += carry0; s0 -= carry0 << 21;
+  carry1 = s1 >> 21; s2 += carry1; s1 -= carry1 << 21;
+  carry2 = s2 >> 21; s3 += carry2; s2 -= carry2 << 21;
+  carry3 = s3 >> 21; s4 += carry3; s3 -= carry3 << 21;
+  carry4 = s4 >> 21; s5 += carry4; s4 -= carry4 << 21;
+  carry5 = s5 >> 21; s6 += carry5; s5 -= carry5 << 21;
+  carry6 = s6 >> 21; s7 += carry6; s6 -= carry6 << 21;
+  carry7 = s7 >> 21; s8 += carry7; s7 -= carry7 << 21;
+  carry8 = s8 >> 21; s9 += carry8; s8 -= carry8 << 21;
+  carry9 = s9 >> 21; s10 += carry9; s9 -= carry9 << 21;
+  carry10 = s10 >> 21; s11 += carry10; s10 -= carry10 << 21;
+
+  s[0] = s0 >> 0;
+  s[1] = s0 >> 8;
+  s[2] = (s0 >> 16) | (s1 << 5);
+  s[3] = s1 >> 3;
+  s[4] = s1 >> 11;
+  s[5] = (s1 >> 19) | (s2 << 2);
+  s[6] = s2 >> 6;
+  s[7] = (s2 >> 14) | (s3 << 7);
+  s[8] = s3 >> 1;
+  s[9] = s3 >> 9;
+  s[10] = (s3 >> 17) | (s4 << 4);
+  s[11] = s4 >> 4;
+  s[12] = s4 >> 12;
+  s[13] = (s4 >> 20) | (s5 << 1);
+  s[14] = s5 >> 7;
+  s[15] = (s5 >> 15) | (s6 << 6);
+  s[16] = s6 >> 2;
+  s[17] = s6 >> 10;
+  s[18] = (s6 >> 18) | (s7 << 3);
+  s[19] = s7 >> 5;
+  s[20] = s7 >> 13;
+  s[21] = s8 >> 0;
+  s[22] = s8 >> 8;
+  s[23] = (s8 >> 16) | (s9 << 5);
+  s[24] = s9 >> 3;
+  s[25] = s9 >> 11;
+  s[26] = (s9 >> 19) | (s10 << 2);
+  s[27] = s10 >> 6;
+  s[28] = (s10 >> 14) | (s11 << 7);
+  s[29] = s11 >> 1;
+  s[30] = s11 >> 9;
+  s[31] = s11 >> 17;
+}
--- a/sc_reduce.c
+++ b/sc_reduce.c
@ -0,0 +1,273 @@
+#include "sc.h"
+#include "pstdint.h"
+
+static uint64_t load_3(const unsigned char *in)
+{
+  uint64_t result;
+  result = (uint64_t) in[0];
+  result |= ((uint64_t) in[1]) << 8;
+  result |= ((uint64_t) in[2]) << 16;
+  return result;
+}
+
+static uint64_t load_4(const unsigned char *in)
+{
+  uint64_t result;
+  result = (uint64_t) in[0];
+  result |= ((uint64_t) in[1]) << 8;
+  result |= ((uint64_t) in[2]) << 16;
+  result |= ((uint64_t) in[3]) << 24;
+  return result;
+}
+
+/*
+Input:
+  s[0]+256*s[1]+...+256^63*s[63] = s
+
+Output:
+  s[0]+256*s[1]+...+256^31*s[31] = s mod l
+  where l = 2^252 + 27742317777372353535851937790883648493.
+  Overwrites s in place.
+*/
+
+void sc_reduce(unsigned char *s)
+{
+  int64_t s0 = 2097151 & load_3(s);
+  int64_t s1 = 2097151 & (load_4(s + 2) >> 5);
+  int64_t s2 = 2097151 & (load_3(s + 5) >> 2);
+  int64_t s3 = 2097151 & (load_4(s + 7) >> 7);
+  int64_t s4 = 2097151 & (load_4(s + 10) >> 4);
+  int64_t s5 = 2097151 & (load_3(s + 13) >> 1);
+  int64_t s6 = 2097151 & (load_4(s + 15) >> 6);
+  int64_t s7 = 2097151 & (load_3(s + 18) >> 3);
+  int64_t s8 = 2097151 & load_3(s + 21);
+  int64_t s9 = 2097151 & (load_4(s + 23) >> 5);
+  int64_t s10 = 2097151 & (load_3(s + 26) >> 2);
+  int64_t s11 = 2097151 & (load_4(s + 28) >> 7);
+  int64_t s12 = 2097151 & (load_4(s + 31) >> 4);
+  int64_t s13 = 2097151 & (load_3(s + 34) >> 1);
+  int64_t s14 = 2097151 & (load_4(s + 36) >> 6);
+  int64_t s15 = 2097151 & (load_3(s + 39) >> 3);
+  int64_t s16 = 2097151 & load_3(s + 42);
+  int64_t s17 = 2097151 & (load_4(s + 44) >> 5);
+  int64_t s18 = 2097151 & (load_3(s + 47) >> 2);
+  int64_t s19 = 2097151 & (load_4(s + 49) >> 7);
+  int64_t s20 = 2097151 & (load_4(s + 52) >> 4);
+  int64_t s21 = 2097151 & (load_3(s + 55) >> 1);
+  int64_t s22 = 2097151 & (load_4(s + 57) >> 6);
+  int64_t s23 = (load_4(s + 60) >> 3);
+  int64_t carry0;
+  int64_t carry1;
+  int64_t carry2;
+  int64_t carry3;
+  int64_t carry4;
+  int64_t carry5;
+  int64_t carry6;
+  int64_t carry7;
+  int64_t carry8;
+  int64_t carry9;
+  int64_t carry10;
+  int64_t carry11;
+  int64_t carry12;
+  int64_t carry13;
+  int64_t carry14;
+  int64_t carry15;
+  int64_t carry16;
+
+  s11 += s23 * 666643;
+  s12 += s23 * 470296;
+  s13 += s23 * 654183;
+  s14 -= s23 * 997805;
+  s15 += s23 * 136657;
+  s16 -= s23 * 683901;
+  s23 = 0;
+
+  s10 += s22 * 666643;
+  s11 += s22 * 470296;
+  s12 += s22 * 654183;
+  s13 -= s22 * 997805;
+  s14 += s22 * 136657;
+  s15 -= s22 * 683901;
+  s22 = 0;
+
+  s9 += s21 * 666643;
+  s10 += s21 * 470296;
+  s11 += s21 * 654183;
+  s12 -= s21 * 997805;
+  s13 += s21 * 136657;
+  s14 -= s21 * 683901;
+  s21 = 0;
+
+  s8 += s20 * 666643;
+  s9 += s20 * 470296;
+  s10 += s20 * 654183;
+  s11 -= s20 * 997805;
+  s12 += s20 * 136657;
+  s13 -= s20 * 683901;
+  s20 = 0;
+
+  s7 += s19 * 666643;
+  s8 += s19 * 470296;
+  s9 += s19 * 654183;
+  s10 -= s19 * 997805;
+  s11 += s19 * 136657;
+  s12 -= s19 * 683901;
+  s19 = 0;
+
+  s6 += s18 * 666643;
+  s7 += s18 * 470296;
+  s8 += s18 * 654183;
+  s9 -= s18 * 997805;
+  s10 += s18 * 136657;
+  s11 -= s18 * 683901;
+  s18 = 0;
+
+  carry6 = (s6 + (1<<20)) >> 21; s7 += carry6; s6 -= carry6 << 21;
+  carry8 = (s8 + (1<<20)) >> 21; s9 += carry8; s8 -= carry8 << 21;
+  carry10 = (s10 + (1<<20)) >> 21; s11 += carry10; s10 -= carry10 << 21;
+  carry12 = (s12 + (1<<20)) >> 21; s13 += carry12; s12 -= carry12 << 21;
+  carry14 = (s14 + (1<<20)) >> 21; s15 += carry14; s14 -= carry14 << 21;
+  carry16 = (s16 + (1<<20)) >> 21; s17 += carry16; s16 -= carry16 << 21;
+
+  carry7 = (s7 + (1<<20)) >> 21; s8 += carry7; s7 -= carry7 << 21;
+  carry9 = (s9 + (1<<20)) >> 21; s10 += carry9; s9 -= carry9 << 21;
+  carry11 = (s11 + (1<<20)) >> 21; s12 += carry11; s11 -= carry11 << 21;
+  carry13 = (s13 + (1<<20)) >> 21; s14 += carry13; s13 -= carry13 << 21;
+  carry15 = (s15 + (1<<20)) >> 21; s16 += carry15; s15 -= carry15 << 21;
+
+  s5 += s17 * 666643;
+  s6 += s17 * 470296;
+  s7 += s17 * 654183;
+  s8 -= s17 * 997805;
+  s9 += s17 * 136657;
+  s10 -= s17 * 683901;
+  s17 = 0;
+
+  s4 += s16 * 666643;
+  s5 += s16 * 470296;
+  s6 += s16 * 654183;
+  s7 -= s16 * 997805;
+  s8 += s16 * 136657;
+  s9 -= s16 * 683901;
+  s16 = 0;
+
+  s3 += s15 * 666643;
+  s4 += s15 * 470296;
+  s5 += s15 * 654183;
+  s6 -= s15 * 997805;
+  s7 += s15 * 136657;
+  s8 -= s15 * 683901;
+  s15 = 0;
+
+  s2 += s14 * 666643;
+  s3 += s14 * 470296;
+  s4 += s14 * 654183;
+  s5 -= s14 * 997805;
+  s6 += s14 * 136657;
+  s7 -= s14 * 683901;
+  s14 = 0;
+
+  s1 += s13 * 666643;
+  s2 += s13 * 470296;
+  s3 += s13 * 654183;
+  s4 -= s13 * 997805;
+  s5 += s13 * 136657;
+  s6 -= s13 * 683901;
+  s13 = 0;
+
+  s0 += s12 * 666643;
+  s1 += s12 * 470296;
+  s2 += s12 * 654183;
+  s3 -= s12 * 997805;
+  s4 += s12 * 136657;
+  s5 -= s12 * 683901;
+  s12 = 0;
+
+  carry0 = (s0 + (1<<20)) >> 21; s1 += carry0; s0 -= carry0 << 21;
+  carry2 = (s2 + (1<<20)) >> 21; s3 += carry2; s2 -= carry2 << 21;
+  carry4 = (s4 + (1<<20)) >> 21; s5 += carry4; s4 -= carry4 << 21;
+  carry6 = (s6 + (1<<20)) >> 21; s7 += carry6; s6 -= carry6 << 21;
+  carry8 = (s8 + (1<<20)) >> 21; s9 += carry8; s8 -= carry8 << 21;
+  carry10 = (s10 + (1<<20)) >> 21; s11 += carry10; s10 -= carry10 << 21;
+
+  carry1 = (s1 + (1<<20)) >> 21; s2 += carry1; s1 -= carry1 << 21;
+  carry3 = (s3 + (1<<20)) >> 21; s4 += carry3; s3 -= carry3 << 21;
+  carry5 = (s5 + (1<<20)) >> 21; s6 += carry5; s5 -= carry5 << 21;
+  carry7 = (s7 + (1<<20)) >> 21; s8 += carry7; s7 -= carry7 << 21;
+  carry9 = (s9 + (1<<20)) >> 21; s10 += carry9; s9 -= carry9 << 21;
+  carry11 = (s11 + (1<<20)) >> 21; s12 += carry11; s11 -= carry11 << 21;
+
+  s0 += s12 * 666643;
+  s1 += s12 * 470296;
+  s2 += s12 * 654183;
+  s3 -= s12 * 997805;
+  s4 += s12 * 136657;
+  s5 -= s12 * 683901;
+  s12 = 0;
+
+  carry0 = s0 >> 21; s1 += carry0; s0 -= carry0 << 21;
+  carry1 = s1 >> 21; s2 += carry1; s1 -= carry1 << 21;
+  carry2 = s2 >> 21; s3 += carry2; s2 -= carry2 << 21;
+  carry3 = s3 >> 21; s4 += carry3; s3 -= carry3 << 21;
+  carry4 = s4 >> 21; s5 += carry4; s4 -= carry4 << 21;
+  carry5 = s5 >> 21; s6 += carry5; s5 -= carry5 << 21;
+  carry6 = s6 >> 21; s7 += carry6; s6 -= carry6 << 21;
+  carry7 = s7 >> 21; s8 += carry7; s7 -= carry7 << 21;
+  carry8 = s8 >> 21; s9 += carry8; s8 -= carry8 << 21;
+  carry9 = s9 >> 21; s10 += carry9; s9 -= carry9 << 21;
+  carry10 = s10 >> 21; s11 += carry10; s10 -= carry10 << 21;
+  carry11 = s11 >> 21; s12 += carry11; s11 -= carry11 << 21;
+
+  s0 += s12 * 666643;
+  s1 += s12 * 470296;
+  s2 += s12 * 654183;
+  s3 -= s12 * 997805;
+  s4 += s12 * 136657;
+  s5 -= s12 * 683901;
+  s12 = 0;
+
+  carry0 = s0 >> 21; s1 += carry0; s0 -= carry0 << 21;
+  carry1 = s1 >> 21; s2 += carry1; s1 -= carry1 << 21;
+  carry2 = s2 >> 21; s3 += carry2; s2 -= carry2 << 21;
+  carry3 = s3 >> 21; s4 += carry3; s3 -= carry3 << 21;
+  carry4 = s4 >> 21; s5 += carry4; s4 -= carry4 << 21;
+  carry5 = s5 >> 21; s6 += carry5; s5 -= carry5 << 21;
+  carry6 = s6 >> 21; s7 += carry6; s6 -= carry6 << 21;
+  carry7 = s7 >> 21; s8 += carry7; s7 -= carry7 << 21;
+  carry8 = s8 >> 21; s9 += carry8; s8 -= carry8 << 21;
+  carry9 = s9 >> 21; s10 += carry9; s9 -= carry9 << 21;
+  carry10 = s10 >> 21; s11 += carry10; s10 -= carry10 << 21;
+
+  s[0] = s0 >> 0;
+  s[1] = s0 >> 8;
+  s[2] = (s0 >> 16) | (s1 << 5);
+  s[3] = s1 >> 3;
+  s[4] = s1 >> 11;
+  s[5] = (s1 >> 19) | (s2 << 2);
+  s[6] = s2 >> 6;
+  s[7] = (s2 >> 14) | (s3 << 7);
+  s[8] = s3 >> 1;
+  s[9] = s3 >> 9;
+  s[10] = (s3 >> 17) | (s4 << 4);
+  s[11] = s4 >> 4;
+  s[12] = s4 >> 12;
+  s[13] = (s4 >> 20) | (s5 << 1);
+  s[14] = s5 >> 7;
+  s[15] = (s5 >> 15) | (s6 << 6);
+  s[16] = s6 >> 2;
+  s[17] = s6 >> 10;
+  s[18] = (s6 >> 18) | (s7 << 3);
+  s[19] = s7 >> 5;
+  s[20] = s7 >> 13;
+  s[21] = s8 >> 0;
+  s[22] = s8 >> 8;
+  s[23] = (s8 >> 16) | (s9 << 5);
+  s[24] = s9 >> 3;
+  s[25] = s9 >> 11;
+  s[26] = (s9 >> 19) | (s10 << 2);
+  s[27] = s10 >> 6;
+  s[28] = (s10 >> 14) | (s11 << 7);
+  s[29] = s11 >> 1;
+  s[30] = s11 >> 9;
+  s[31] = s11 >> 17;
+}
--- a/sha512.c
+++ b/sha512.c
@ -0,0 +1,302 @@
+/*
+20080913
+D. J. Bernstein
+Public domain.
+*/
+
+#include "pstdint.h"
+
+static int64_t load_bigendian(const unsigned char *x)
+{
+  return
+      (int64_t) (x[7]) \
+  | (((int64_t) (x[6])) << 8) \
+  | (((int64_t) (x[5])) << 16) \
+  | (((int64_t) (x[4])) << 24) \
+  | (((int64_t) (x[3])) << 32) \
+  | (((int64_t) (x[2])) << 40) \
+  | (((int64_t) (x[1])) << 48) \
+  | (((int64_t) (x[0])) << 56)
+  ;
+}
+
+static void store_bigendian(unsigned char *x,int64_t u)
+{
+  x[7] = u; u >>= 8;
+  x[6] = u; u >>= 8;
+  x[5] = u; u >>= 8;
+  x[4] = u; u >>= 8;
+  x[3] = u; u >>= 8;
+  x[2] = u; u >>= 8;
+  x[1] = u; u >>= 8;
+  x[0] = u;
+}
+
+#define SHR(x,c) ((x) >> (c))
+#define ROTR(x,c) (((x) >> (c)) | ((x) << (64 - (c))))
+
+#define Ch(x,y,z) ((x & y) ^ (~x & z))
+#define Maj(x,y,z) ((x & y) ^ (x & z) ^ (y & z))
+#define Sigma0(x) (ROTR(x,28) ^ ROTR(x,34) ^ ROTR(x,39))
+#define Sigma1(x) (ROTR(x,14) ^ ROTR(x,18) ^ ROTR(x,41))
+#define sigma0(x) (ROTR(x, 1) ^ ROTR(x, 8) ^ SHR(x,7))
+#define sigma1(x) (ROTR(x,19) ^ ROTR(x,61) ^ SHR(x,6))
+
+#define M(w0,w14,w9,w1) w0 = sigma1(w14) + w9 + sigma0(w1) + w0;
+
+#define EXPAND \
+  M(w0 ,w14,w9 ,w1 ) \
+  M(w1 ,w15,w10,w2 ) \
+  M(w2 ,w0 ,w11,w3 ) \
+  M(w3 ,w1 ,w12,w4 ) \
+  M(w4 ,w2 ,w13,w5 ) \
+  M(w5 ,w3 ,w14,w6 ) \
+  M(w6 ,w4 ,w15,w7 ) \
+  M(w7 ,w5 ,w0 ,w8 ) \
+  M(w8 ,w6 ,w1 ,w9 ) \
+  M(w9 ,w7 ,w2 ,w10) \
+  M(w10,w8 ,w3 ,w11) \
+  M(w11,w9 ,w4 ,w12) \
+  M(w12,w10,w5 ,w13) \
+  M(w13,w11,w6 ,w14) \
+  M(w14,w12,w7 ,w15) \
+  M(w15,w13,w8 ,w0 )
+
+#define F(w,k) \
+  T1 = h + Sigma1(e) + Ch(e,f,g) + k + w; \
+  T2 = Sigma0(a) + Maj(a,b,c); \
+  h = g; \
+  g = f; \
+  f = e; \
+  e = d + T1; \
+  d = c; \
+  c = b; \
+  b = a; \
+  a = T1 + T2;
+
+int sha512_blocks(unsigned char *statebytes,const unsigned char *in,unsigned long long inlen)
+{
+  int64_t state[8];
+  int64_t a;
+  int64_t b;
+  int64_t c;
+  int64_t d;
+  int64_t e;
+  int64_t f;
+  int64_t g;
+  int64_t h;
+  int64_t T1;
+  int64_t T2;
+
+  a = load_bigendian(statebytes +  0); state[0] = a;
+  b = load_bigendian(statebytes +  8); state[1] = b;
+  c = load_bigendian(statebytes + 16); state[2] = c;
+  d = load_bigendian(statebytes + 24); state[3] = d;
+  e = load_bigendian(statebytes + 32); state[4] = e;
+  f = load_bigendian(statebytes + 40); state[5] = f;
+  g = load_bigendian(statebytes + 48); state[6] = g;
+  h = load_bigendian(statebytes + 56); state[7] = h;
+
+  while (inlen >= 128) {
+    int64_t w0  = load_bigendian(in +   0);
+    int64_t w1  = load_bigendian(in +   8);
+    int64_t w2  = load_bigendian(in +  16);
+    int64_t w3  = load_bigendian(in +  24);
+    int64_t w4  = load_bigendian(in +  32);
+    int64_t w5  = load_bigendian(in +  40);
+    int64_t w6  = load_bigendian(in +  48);
+    int64_t w7  = load_bigendian(in +  56);
+    int64_t w8  = load_bigendian(in +  64);
+    int64_t w9  = load_bigendian(in +  72);
+    int64_t w10 = load_bigendian(in +  80);
+    int64_t w11 = load_bigendian(in +  88);
+    int64_t w12 = load_bigendian(in +  96);
+    int64_t w13 = load_bigendian(in + 104);
+    int64_t w14 = load_bigendian(in + 112);
+    int64_t w15 = load_bigendian(in + 120);
+
+    F(w0 ,0x428a2f98d728ae22ULL)
+    F(w1 ,0x7137449123ef65cdULL)
+    F(w2 ,0xb5c0fbcfec4d3b2fULL)
+    F(w3 ,0xe9b5dba58189dbbcULL)
+    F(w4 ,0x3956c25bf348b538ULL)
+    F(w5 ,0x59f111f1b605d019ULL)
+    F(w6 ,0x923f82a4af194f9bULL)
+    F(w7 ,0xab1c5ed5da6d8118ULL)
+    F(w8 ,0xd807aa98a3030242ULL)
+    F(w9 ,0x12835b0145706fbeULL)
+    F(w10,0x243185be4ee4b28cULL)
+    F(w11,0x550c7dc3d5ffb4e2ULL)
+    F(w12,0x72be5d74f27b896fULL)
+    F(w13,0x80deb1fe3b1696b1ULL)
+    F(w14,0x9bdc06a725c71235ULL)
+    F(w15,0xc19bf174cf692694ULL)
+
+    EXPAND
+
+    F(w0 ,0xe49b69c19ef14ad2ULL)
+    F(w1 ,0xefbe4786384f25e3ULL)
+    F(w2 ,0x0fc19dc68b8cd5b5ULL)
+    F(w3 ,0x240ca1cc77ac9c65ULL)
+    F(w4 ,0x2de92c6f592b0275ULL)
+    F(w5 ,0x4a7484aa6ea6e483ULL)
+    F(w6 ,0x5cb0a9dcbd41fbd4ULL)
+    F(w7 ,0x76f988da831153b5ULL)
+    F(w8 ,0x983e5152ee66dfabULL)
+    F(w9 ,0xa831c66d2db43210ULL)
+    F(w10,0xb00327c898fb213fULL)
+    F(w11,0xbf597fc7beef0ee4ULL)
+    F(w12,0xc6e00bf33da88fc2ULL)
+    F(w13,0xd5a79147930aa725ULL)
+    F(w14,0x06ca6351e003826fULL)
+    F(w15,0x142929670a0e6e70ULL)
+
+    EXPAND
+
+    F(w0 ,0x27b70a8546d22ffcULL)
+    F(w1 ,0x2e1b21385c26c926ULL)
+    F(w2 ,0x4d2c6dfc5ac42aedULL)
+    F(w3 ,0x53380d139d95b3dfULL)
+    F(w4 ,0x650a73548baf63deULL)
+    F(w5 ,0x766a0abb3c77b2a8ULL)
+    F(w6 ,0x81c2c92e47edaee6ULL)
+    F(w7 ,0x92722c851482353bULL)
+    F(w8 ,0xa2bfe8a14cf10364ULL)
+    F(w9 ,0xa81a664bbc423001ULL)
+    F(w10,0xc24b8b70d0f89791ULL)
+    F(w11,0xc76c51a30654be30ULL)
+    F(w12,0xd192e819d6ef5218ULL)
+    F(w13,0xd69906245565a910ULL)
+    F(w14,0xf40e35855771202aULL)
+    F(w15,0x106aa07032bbd1b8ULL)
+
+    EXPAND
+
+    F(w0 ,0x19a4c116b8d2d0c8ULL)
+    F(w1 ,0x1e376c085141ab53ULL)
+    F(w2 ,0x2748774cdf8eeb99ULL)
+    F(w3 ,0x34b0bcb5e19b48a8ULL)
+    F(w4 ,0x391c0cb3c5c95a63ULL)
+    F(w5 ,0x4ed8aa4ae3418acbULL)
+    F(w6 ,0x5b9cca4f7763e373ULL)
+    F(w7 ,0x682e6ff3d6b2b8a3ULL)
+    F(w8 ,0x748f82ee5defb2fcULL)
+    F(w9 ,0x78a5636f43172f60ULL)
+    F(w10,0x84c87814a1f0ab72ULL)
+    F(w11,0x8cc702081a6439ecULL)
+    F(w12,0x90befffa23631e28ULL)
+    F(w13,0xa4506cebde82bde9ULL)
+    F(w14,0xbef9a3f7b2c67915ULL)
+    F(w15,0xc67178f2e372532bULL)
+
+    EXPAND
+
+    F(w0 ,0xca273eceea26619cULL)
+    F(w1 ,0xd186b8c721c0c207ULL)
+    F(w2 ,0xeada7dd6cde0eb1eULL)
+    F(w3 ,0xf57d4f7fee6ed178ULL)
+    F(w4 ,0x06f067aa72176fbaULL)
+    F(w5 ,0x0a637dc5a2c898a6ULL)
+    F(w6 ,0x113f9804bef90daeULL)
+    F(w7 ,0x1b710b35131c471bULL)
+    F(w8 ,0x28db77f523047d84ULL)
+    F(w9 ,0x32caab7b40c72493ULL)
+    F(w10,0x3c9ebe0a15c9bebcULL)
+    F(w11,0x431d67c49c100d4cULL)
+    F(w12,0x4cc5d4becb3e42b6ULL)
+    F(w13,0x597f299cfc657e2aULL)
+    F(w14,0x5fcb6fab3ad6faecULL)
+    F(w15,0x6c44198c4a475817ULL)
+
+    a += state[0];
+    b += state[1];
+    c += state[2];
+    d += state[3];
+    e += state[4];
+    f += state[5];
+    g += state[6];
+    h += state[7];
+  
+    state[0] = a;
+    state[1] = b;
+    state[2] = c;
+    state[3] = d;
+    state[4] = e;
+    state[5] = f;
+    state[6] = g;
+    state[7] = h;
+
+    in += 128;
+    inlen -= 128;
+  }
+
+  store_bigendian(statebytes +  0,state[0]);
+  store_bigendian(statebytes +  8,state[1]);
+  store_bigendian(statebytes + 16,state[2]);
+  store_bigendian(statebytes + 24,state[3]);
+  store_bigendian(statebytes + 32,state[4]);
+  store_bigendian(statebytes + 40,state[5]);
+  store_bigendian(statebytes + 48,state[6]);
+  store_bigendian(statebytes + 56,state[7]);
+
+  return inlen;
+}
+
+static const unsigned char iv[64] = {
+  0x6a,0x09,0xe6,0x67,0xf3,0xbc,0xc9,0x08,
+  0xbb,0x67,0xae,0x85,0x84,0xca,0xa7,0x3b,
+  0x3c,0x6e,0xf3,0x72,0xfe,0x94,0xf8,0x2b,
+  0xa5,0x4f,0xf5,0x3a,0x5f,0x1d,0x36,0xf1,
+  0x51,0x0e,0x52,0x7f,0xad,0xe6,0x82,0xd1,
+  0x9b,0x05,0x68,0x8c,0x2b,0x3e,0x6c,0x1f,
+  0x1f,0x83,0xd9,0xab,0xfb,0x41,0xbd,0x6b,
+  0x5b,0xe0,0xcd,0x19,0x13,0x7e,0x21,0x79
+} ;
+
+int sha512(unsigned char *out,const unsigned char *in,unsigned long long inlen)
+{
+  unsigned char h[64];
+  unsigned char padded[256];
+  int i;
+  unsigned long long bytes = inlen;
+
+  for (i = 0;i < 64;++i) h[i] = iv[i];
+
+  sha512_blocks(h,in,inlen);
+  in += inlen;
+  inlen &= 127;
+  in -= inlen;
+
+  for (i = 0;i < inlen;++i) padded[i] = in[i];
+  padded[inlen] = 0x80;
+
+  if (inlen < 112) {
+    for (i = inlen + 1;i < 119;++i) padded[i] = 0;
+    padded[119] = bytes >> 61;
+    padded[120] = bytes >> 53;
+    padded[121] = bytes >> 45;
+    padded[122] = bytes >> 37;
+    padded[123] = bytes >> 29;
+    padded[124] = bytes >> 21;
+    padded[125] = bytes >> 13;
+    padded[126] = bytes >> 5;
+    padded[127] = bytes << 3;
+    sha512_blocks(h,padded,128);
+  } else {
+    for (i = inlen + 1;i < 247;++i) padded[i] = 0;
+    padded[247] = bytes >> 61;
+    padded[248] = bytes >> 53;
+    padded[249] = bytes >> 45;
+    padded[250] = bytes >> 37;
+    padded[251] = bytes >> 29;
+    padded[252] = bytes >> 21;
+    padded[253] = bytes >> 13;
+    padded[254] = bytes >> 5;
+    padded[255] = bytes << 3;
+    sha512_blocks(h,padded,256);
+  }
+
+  for (i = 0;i < 64;++i) out[i] = h[i];
+
+  return 0;
+}
--- a/sha512.h
+++ b/sha512.h
@ -0,0 +1,6 @@
+#ifndef SHA512_H
+#define SHA512_H
+
+int sha512(unsigned char *out,const unsigned char *in,unsigned long long inlen);
+
+#endif
--- a/sign.c
+++ b/sign.c
@ -0,0 +1,38 @@
+#include "crypto_sign.h"
+#include "sha512.h"
+#include "ge.h"
+#include "sc.h"
+
+int crypto_sign(
+  unsigned char *sm,unsigned long long *smlen,
+  const unsigned char *m,unsigned long long mlen,
+  const unsigned char *sk
+)
+{
+  unsigned char az[64];
+  unsigned char r[64];
+  unsigned char hram[64];
+  ge_p3 R;
+  unsigned long long i;
+
+  sha512(az,sk,32);
+  az[0] &= 248;
+  az[31] &= 63;
+  az[31] |= 64;
+
+  *smlen = mlen + 64;
+  for (i = 0;i < mlen;++i) sm[64 + i] = m[i];
+  for (i = 0;i < 32;++i) sm[32 + i] = az[32 + i];
+  sha512(r,sm + 32,mlen + 32);
+  for (i = 0;i < 32;++i) sm[32 + i] = sk[32 + i];
+
+  sc_reduce(r);
+  ge_scalarmult_base(&R,r);
+  ge_p3_tobytes(sm,&R);
+
+  sha512(hram,sm,mlen + 64);
+  sc_reduce(hram);
+  sc_muladd(sm + 32,hram,az,r);
+
+  return 0;
+}
--- a/test.c
+++ b/test.c
@ -0,0 +1,39 @@
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "crypto_sign.h"
+
+char *msg = "Hello World";
+
+int main(int argc, char *argv[]) {
+    unsigned char sk[64], vk[32];
+    unsigned char *sigmsg, *newmsg;
+    unsigned long long sigmsglen, newmsglen;
+    int ret;
+    crypto_sign_keypair(vk, sk, "0123456890123456789012");
+    printf("got keypair\n");
+    sigmsg = malloc(strlen(msg)+1+64);
+    if (!sigmsg)
+        return 1;
+    crypto_sign(sigmsg, &sigmsglen, (unsigned char *)msg, strlen(msg)+1, sk);
+    printf("got signature\n");
+    if (sigmsglen != strlen(msg)+1+64)
+        return 2;
+    newmsg = malloc(sigmsglen);
+    if (!newmsg)
+        return 3;
+    ret = crypto_sign_open(newmsg, &newmsglen, sigmsg, sigmsglen, vk);
+    printf("verified signature\n");
+    if (ret == 0)
+        printf("good!\n");
+    else
+        printf("bad\n");
+    sigmsg[0] ^= 0x01;
+    ret = crypto_sign_open(newmsg, &newmsglen, sigmsg, sigmsglen, vk);
+    if (ret == 0) 
+        printf("bad: failed to detect simple corruption\n");
+    else
+        printf("good: detected simple corruption\n");
+    return 0;
+}
--- a/verify.c
+++ b/verify.c
@ -0,0 +1,38 @@
+int crypto_verify_32(const unsigned char *x,const unsigned char *y)
+{
+  unsigned int differentbits = 0;
+#define F(i) differentbits |= x[i] ^ y[i];
+  F(0)
+  F(1)
+  F(2)
+  F(3)
+  F(4)
+  F(5)
+  F(6)
+  F(7)
+  F(8)
+  F(9)
+  F(10)
+  F(11)
+  F(12)
+  F(13)
+  F(14)
+  F(15)
+  F(16)
+  F(17)
+  F(18)
+  F(19)
+  F(20)
+  F(21)
+  F(22)
+  F(23)
+  F(24)
+  F(25)
+  F(26)
+  F(27)
+  F(28)
+  F(29)
+  F(30)
+  F(31)
+  return (1 & ((differentbits - 1) >> 8)) - 1;
+}
--- a/verify.h
+++ b/verify.h
@ -0,0 +1 @@
+int crypto_verify_32(const unsigned char *x,const unsigned char *y);
				`@ -0,0 +1 @@`
				`int crypto_verify_32(const unsigned char x,const unsigned char y);`