commit 390fa39dc519551eb3c6e93fe3fa52a9ec9d9833 Author: Tom St Denis Date: Fri Feb 28 16:02:06 2003 +0000 added libtommath-0.01 diff --git a/b.bat b/b.bat new file mode 100644 index 0000000..1142d35 --- /dev/null +++ b/b.bat @@ -0,0 +1,3 @@ +nasm -f coff timer.asm +gcc -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER demo.c bn.c timer.o -o demo +gcc -I./mtest/ -DU_MPI -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER demo.c mtest/mpi.c timer.o -o mpidemo diff --git a/bn.c b/bn.c new file mode 100644 index 0000000..82262f5 --- /dev/null +++ b/bn.c @@ -0,0 +1,1983 @@ +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca + */ +#include "bn.h" + +/* chars used in radix conversions */ +static const char *s_rmap = + "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/"; + +#undef MIN +#define MIN(x,y) ((x)<(y)?(x):(y)) +#undef MAX +#define MAX(x,y) ((x)>(y)?(x):(y)) + +/* init a new bigint */ +int mp_init(mp_int *a) +{ + a->dp = calloc(sizeof(mp_digit), 16); + if (a->dp == NULL) { + return MP_MEM; + } + a->used = 0; + a->alloc = 16; + a->sign = MP_ZPOS; + return MP_OKAY; +} + +/* clear one (frees) */ +void mp_clear(mp_int *a) +{ + if (a->dp != NULL) { + memset(a->dp, 0, sizeof(mp_digit) * a->alloc); + free(a->dp); + a->dp = NULL; + } +} + +/* grow as required */ +static int mp_grow(mp_int *a, int size) +{ + int i; + + /* if the alloc size is smaller alloc more ram */ + if (a->alloc < size) { + size += 16 - (size & 15); /* ensure its to the next multiple of 16 words */ + a->dp = realloc(a->dp, sizeof(mp_digit) * size); + if (a->dp == NULL) { + return MP_MEM; + } + i = a->alloc; + a->alloc = size; + + /* zero top words */ + for (; i < size; i++) { + a->dp[i] = 0; + } + } + return MP_OKAY; +} + +/* shrink a bignum */ +int mp_shrink(mp_int *a) +{ + if (a->alloc != a->used) { + if ((a->dp = realloc(a->dp, sizeof(mp_digit) * a->used)) == NULL) { + return MP_MEM; + } + a->alloc = a->used; + } + return MP_OKAY; +} + +/* trim unused digits */ +static void mp_clamp(mp_int *a) +{ + while (a->used > 0 && a->dp[a->used-1] == 0) --(a->used); + if (a->used == 0) { + a->sign = MP_ZPOS; + } +} + +/* set to zero */ +void mp_zero(mp_int *a) +{ + a->sign = MP_ZPOS; + a->used = 0; + memset(a->dp, 0, sizeof(mp_digit) * a->alloc); +} + +/* set to a digit */ +void mp_set(mp_int *a, mp_digit b) +{ + mp_zero(a); + a->dp[0] = b & MP_MASK; + a->used = 1; +} + +/* set a 32-bit const */ +int mp_set_int(mp_int *a, unsigned long b) +{ + int res, x; + if ((res = mp_grow(a, 32/DIGIT_BIT + 1)) != MP_OKAY) { + return res; + } + /* set four bits at a time, simplest solution to the what if DIGIT_BIT==7 case */ + for (x = 0; x < 8; x++) { + mp_mul_2d(a, 4, a); + a->dp[0] |= (b>>28)&15; + b <<= 4; + } + mp_clamp(a); + return MP_OKAY; +} + +/* init a mp_init and grow it to a given size */ +int mp_init_size(mp_int *a, int size) +{ + int res; + + if ((res = mp_init(a)) != MP_OKAY) { + return res; + } + return mp_grow(a, size); +} + +/* copy, b = a */ +int mp_copy(mp_int *a, mp_int *b) +{ + int res, n; + + /* if dst == src do nothing */ + if (a->dp == b->dp) + return MP_OKAY; + + /* grow dest */ + if ((res = mp_grow(b, a->used)) != MP_OKAY) { + return res; + } + + mp_zero(b); + b->used = a->used; + b->sign = a->sign; + for (n = 0; n < a->used; n++) { + b->dp[n] = a->dp[n]; + } + return MP_OKAY; +} + +/* creates "a" then copies b into it */ +int mp_init_copy(mp_int *a, mp_int *b) +{ + int res; + + if ((res = mp_init(a)) != MP_OKAY) { + return res; + } + return mp_copy(b, a); +} + +/* compare maginitude of two ints (unsigned) */ +static int s_mp_cmp(mp_int *a, mp_int *b) +{ + int n; + + /* compare based on # of non-zero digits */ + if (a->used > b->used) { + return MP_GT; + } else if (a->used < b->used) { + return MP_LT; + } + + /* compare based on digits */ + for (n = a->used - 1; n >= 0; n--) { + if (a->dp[n] > b->dp[n]) { + return MP_GT; + } else if (a->dp[n] < b->dp[n]) { + return MP_LT; + } + } + return MP_EQ; +} + +/* compare two ints (signed)*/ +int mp_cmp(mp_int *a, mp_int *b) +{ + /* compare based on sign */ + if (a->sign == MP_NEG && b->sign == MP_ZPOS) { + return MP_LT; + } else if (a->sign == MP_ZPOS && b->sign == MP_NEG) { + return MP_GT; + } + return s_mp_cmp(a, b); +} + +/* compare a digit */ +int mp_cmp_d(mp_int *a, mp_digit b) +{ + if (a->sign == MP_NEG) { + return MP_LT; + } + + if (a->used > 1) { + return MP_GT; + } + + if (a->dp[0] > b) { + return MP_GT; + } else if (a->dp[0] < b) { + return MP_LT; + } else { + return MP_EQ; + } +} + +/* shift right a certain amount of digits */ +void mp_rshd(mp_int *a, int b) +{ + int x; + + /* if b <= 0 then ignore it */ + if (b <= 0) { + return; + } + + /* if b > used then simply zero it and return */ + if (a->used < b) { + mp_zero(a); + return; + } + + /* shift the digits down */ + for (x = 0; x < (a->used - b); x++) { + a->dp[x] = a->dp[x + b]; + } + + /* zero the top digits */ + for (; x < a->used; x++) { + a->dp[x] = 0; + } + mp_clamp(a); +} + +/* shift left a certain amount of digits */ +int mp_lshd(mp_int *a, int b) +{ + int x, res; + + /* if its less than zero return */ + if (b <= 0) + return MP_OKAY; + + /* grow to fit the new digits */ + if ((res = mp_grow(a, a->used + b)) != MP_OKAY) { + return res; + } + + /* increment the used by the shift amount than copy upwards */ + a->used += b; + for (x = a->used-1; x >= b; x--) { + a->dp[x] = a->dp[x - b]; + } + + /* zero the lower digits */ + for (x = 0; x < b; x++) { + a->dp[x] = 0; + } + mp_clamp(a); + return MP_OKAY; +} + +/* calc a value mod 2^b */ +int mp_mod_2d(mp_int *a, int b, mp_int *c) +{ + int x, res; + + /* if b is <= 0 then zero the int */ + if (b <= 0) { + mp_zero(c); + return MP_OKAY; + } + + /* if the modulus is larger than the value than return */ + if (b > (int)(a->used * DIGIT_BIT)) { + return mp_copy(a, c); + } + + /* copy */ + if ((res = mp_copy(a, c)) != MP_OKAY) { + return res; + } + + /* zero digits above */ + for (x = (b/DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) { + c->dp[x] = 0; + } + /* clear the digit that is not completely outside/inside the modulus */ + c->dp[b/DIGIT_BIT] &= (mp_digit)((((mp_digit)1)<<(b % DIGIT_BIT)) - ((mp_digit)1)); + mp_clamp(c); + return MP_OKAY; +} + +/* shift right by a certain bit count (store quotient in c, remainder in d) */ +int mp_div_2d(mp_int *a, int b, mp_int *c, mp_int *d) +{ + mp_digit D, r, rr; + int x, res; + + if (d != NULL) { + if ((res = mp_mod_2d(a, b, d)) != MP_OKAY) { + return res; + } + } + + /* copy */ + if ((res = mp_copy(a, c)) != MP_OKAY) { + return res; + } + + /* shift by as many digits in the bit count */ + mp_rshd(c, b/DIGIT_BIT); + + /* shift any bit count < DIGIT_BIT */ + D = (mp_digit)(b % DIGIT_BIT); + if (D != 0) { + r = 0; + for (x = c->used - 1; x >= 0; x--) { + rr = c->dp[x] & ((mp_digit)((1U<dp[x] = (c->dp[x] >> D) | (r << (DIGIT_BIT-D)); + r = rr; + } + } + mp_clamp(c); + return MP_OKAY; +} + +/* shift left by a certain bit count */ +int mp_mul_2d(mp_int *a, int b, mp_int *c) +{ + mp_digit d, r, rr; + int x, res; + + /* copy */ + if ((res = mp_copy(a, c)) != MP_OKAY) { + return res; + } + + if ((res = mp_grow(c, c->used + b/DIGIT_BIT + 1)) != MP_OKAY) { + return res; + } + + /* shift by as many digits in the bit count */ + if ((res = mp_lshd(c, b/DIGIT_BIT)) != MP_OKAY) { + return res; + } + c->used = c->alloc; + + /* shift any bit count < DIGIT_BIT */ + d = (mp_digit)(b % DIGIT_BIT); + if (d != 0) { + r = 0; + for (x = 0; x < a->used; x++) { + rr = (c->dp[x] >> (DIGIT_BIT - d)) & ((mp_digit)((1U<dp[x] = ((c->dp[x] << d) | r) & MP_MASK; + r = rr; + } + } + mp_clamp(c); + return MP_OKAY; +} + +/* b = a/2 */ +int mp_div_2(mp_int *a, mp_int *b) +{ + return mp_div_2d(a, 1, b, NULL); +} + +/* b = a*2 */ +int mp_mul_2(mp_int *a, mp_int *b) +{ + return mp_mul_2d(a, 1, b); +} + +/* low level addition */ +static int s_mp_add(mp_int *a, mp_int *b, mp_int *c) +{ + mp_int t, *x; + int res, min, max, i; + mp_digit u; + + /* find sizes */ + if (a->used > b->used) { + min = b->used; + max = a->used; + x = a; + } else if (a->used < b->used) { + min = a->used; + max = b->used; + x = b; + } else { + min = max = a->used; + x = NULL; + } + + /* init result */ + if ((res = mp_init_size(&t, max+1)) != MP_OKAY) { + return res; + } + t.used = max+1; + + /* add digits from lower part */ + u = 0; + for (i = 0; i < min; i++) { + t.dp[i] = a->dp[i] + b->dp[i] + u; + u = (t.dp[i] >> DIGIT_BIT) & 1; + t.dp[i] &= MP_MASK; + } + + /* now copy higher words if any */ + if (min != max) { + for (; i < max; i++) { + t.dp[i] = x->dp[i] + u; + u = (t.dp[i] >> DIGIT_BIT) & 1; + t.dp[i] &= MP_MASK; + } + } + + /* add carry */ + t.dp[i] = u; + + mp_clamp(&t); + if ((res = mp_copy(&t, c)) != MP_OKAY) { + mp_clear(&t); + return res; + } + mp_clear(&t); + return MP_OKAY; +} + +/* low level subtraction (assumes a > b) */ +static int s_mp_sub(mp_int *a, mp_int *b, mp_int *c) +{ + mp_int t; + int res, min, max, i; + mp_digit u; + + /* find sizes */ + min = b->used; + max = a->used; + + /* init result */ + if ((res = mp_init_size(&t, max+1)) != MP_OKAY) { + return res; + } + t.used = max+1; + + /* sub digits from lower part */ + u = 0; + for (i = 0; i < min; i++) { + t.dp[i] = a->dp[i] - (b->dp[i] + u); + u = (t.dp[i] >> DIGIT_BIT) & 1; + t.dp[i] &= MP_MASK; + } + + /* now copy higher words if any */ + if (min != max) { + for (; i < max; i++) { + t.dp[i] = a->dp[i] - u; + u = (t.dp[i] >> DIGIT_BIT) & 1; + t.dp[i] &= MP_MASK; + } + } + + mp_clamp(&t); + if ((res = mp_copy(&t, c)) != MP_OKAY) { + mp_clear(&t); + return res; + } + + mp_clear(&t); + return MP_OKAY; +} + +/* low level multiplication */ +#define s_mp_mul(a, b, c) s_mp_mul_digs(a, b, c, (a)->used + (b)->used + 1) + +/* fast multiplier */ +/* multiplies |a| * |b| and only computes upto digs digits of result */ +static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs) +{ + mp_int t; + int res, pa, pb, ix, iy; + mp_word W[512]; + mp_digit tmpx, *tmpt, *tmpy; + +// printf("\nHOLA\n"); + + if ((res = mp_init_size(&t, digs)) != MP_OKAY) { + return res; + } + t.used = digs; + + /* clear temp buf */ + memset(W, 0, digs*sizeof(mp_word)); + + pa = a->used; + for (ix = 0; ix < pa; ix++) { + pb = MIN(b->used, digs - ix); + tmpx = a->dp[ix]; + tmpt = &(t.dp[ix]); + tmpy = b->dp; + for (iy = 0; iy < pb; iy++) { + W[iy+ix] += ((mp_word)tmpx) * ((mp_word)*tmpy++); + } + } + + /* now convert the array W downto what we need */ + for (ix = 1; ix < digs; ix++) { + W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT)); + t.dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK); + } + t.dp[digs-1] = W[digs-1] & ((mp_word)MP_MASK); + + mp_clamp(&t); + if ((res = mp_copy(&t, c)) != MP_OKAY) { + mp_clear(&t); + return res; + } + mp_clear(&t); + + return MP_OKAY; +} + +/* multiplies |a| * |b| and only computes upto digs digits of result */ +static int s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs) +{ + mp_int t; + int res, pa, pb, ix, iy; + mp_digit u; + mp_word r; + mp_digit tmpx, *tmpt, *tmpy; + + /* can we use the fast multiplier? */ + if ((digs < 512) && digs < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT)))) { + return fast_s_mp_mul_digs(a,b,c,digs); + } + + if ((res = mp_init_size(&t, digs)) != MP_OKAY) { + return res; + } + t.used = digs; + + pa = a->used; + for (ix = 0; ix < pa; ix++) { + u = 0; + pb = MIN(b->used, digs - ix); + tmpx = a->dp[ix]; + tmpt = &(t.dp[ix]); + tmpy = b->dp; + for (iy = 0; iy < pb; iy++) { + r = ((mp_word)*tmpt) + ((mp_word)tmpx) * ((mp_word)*tmpy++) + ((mp_word)u); + *tmpt++ = (mp_digit)(r & ((mp_word)MP_MASK)); + u = (mp_digit)(r >> ((mp_word)DIGIT_BIT)); + } + if (ix+iyused + b->used + 1)) != MP_OKAY) { + return res; + } + t.used = a->used + b->used + 1; + + pa = a->used; + pb = b->used; + memset(W, 0, (pa + pb + 1) * sizeof(mp_word)); + for (ix = 0; ix < pa; ix++) { + tmpx = a->dp[ix]; + tmpt = &(t.dp[digs]); + tmpy = b->dp + (digs - ix); + for (iy = digs - ix; iy < pb; iy++) { + W[ix+iy] += ((mp_word)tmpx) * ((mp_word)*tmpy++); + } + } + + /* now convert the array W downto what we need */ + for (ix = 1; ix < (pa+pb+1); ix++) { + W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT)); + t.dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK); + } + t.dp[(pa+pb+1)-1] = W[(pa+pb+1)-1] & ((mp_word)MP_MASK); + + + mp_clamp(&t); + if ((res = mp_copy(&t, c)) != MP_OKAY) { + mp_clear(&t); + return res; + } + mp_clear(&t); + + return MP_OKAY; +} + +/* multiplies |a| * |b| and does not compute the lower digs digits + * [meant to get the higher part of the product] + */ +static int s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs) +{ + mp_int t; + int res, pa, pb, ix, iy; + mp_digit u; + mp_word r; + mp_digit tmpx, *tmpt, *tmpy; + + /* can we use the fast multiplier? */ + if ((digs < 512) && digs < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT)))) { + return fast_s_mp_mul_high_digs(a,b,c,digs); + } + + if ((res = mp_init_size(&t, a->used + b->used + 1)) != MP_OKAY) { + return res; + } + t.used = a->used + b->used + 1; + + pa = a->used; + pb = b->used; + for (ix = 0; ix < pa; ix++) { + u = 0; + tmpx = a->dp[ix]; + tmpt = &(t.dp[digs]); + tmpy = b->dp + (digs - ix); + for (iy = digs - ix; iy < pb; iy++) { + r = ((mp_word)*tmpt) + ((mp_word)tmpx) * ((mp_word)*tmpy++) + ((mp_word)u); + *tmpt++ = (mp_digit)(r & ((mp_word)MP_MASK)); + u = (mp_digit)(r >> ((mp_word)DIGIT_BIT)); + } + *tmpt = u; + } + mp_clamp(&t); + if ((res = mp_copy(&t, c)) != MP_OKAY) { + mp_clear(&t); + return res; + } + mp_clear(&t); + + return MP_OKAY; +} + +/* fast squaring */ +static int fast_s_mp_sqr(mp_int *a, mp_int *b) +{ + mp_int t; + int res, ix, iy, pa; + mp_word r, W[512]; + mp_digit tmpx, *tmpy; + + pa = a->used; + if ((res = mp_init_size(&t, pa + pa + 1)) != MP_OKAY) { + return res; + } + t.used = pa + pa + 1; + + /* zero temp buffer */ + memset(W, 0, (pa+pa+1)*sizeof(mp_word)); + + for (ix = 0; ix < pa; ix++) { + W[ix+ix] += ((mp_word)a->dp[ix]) * ((mp_word)a->dp[ix]); + tmpx = a->dp[ix]; + tmpy = &(a->dp[ix+1]); + for (iy = ix + 1; iy < pa; iy++) { + r = ((mp_word)tmpx) * ((mp_word)*tmpy++); + W[ix+iy] += r + r; + } + } + for (ix = 1; ix < (pa+pa+1); ix++) { + W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT)); + t.dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK); + } + t.dp[(pa+pa+1)-1] = W[(pa+pa+1)-1] & ((mp_word)MP_MASK); + + mp_clamp(&t); + if ((res = mp_copy(&t, b)) != MP_OKAY) { + mp_clear(&t); + return res; + } + mp_clear(&t); + return MP_OKAY; +} + +/* low level squaring, b = a*a */ +static int s_mp_sqr(mp_int *a, mp_int *b) +{ + mp_int t; + int res, ix, iy, pa; + mp_word r, u; + mp_digit tmpx, *tmpt; + + /* can we use the fast multiplier? */ + if (((a->used * 2 + 1) < 512) && a->used < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT) - 1))) { + return fast_s_mp_sqr(a,b); + } + + pa = a->used; + if ((res = mp_init_size(&t, pa + pa + 1)) != MP_OKAY) { + return res; + } + t.used = pa + pa + 1; + + for (ix = 0; ix < pa; ix++) { + r = ((mp_word)t.dp[ix+ix]) + ((mp_word)a->dp[ix]) * ((mp_word)a->dp[ix]); + t.dp[ix+ix] = (mp_digit)(r & ((mp_word)MP_MASK)); + u = (r >> ((mp_word)DIGIT_BIT)); + tmpx = a->dp[ix]; + tmpt = &(t.dp[ix+ix+1]); + for (iy = ix + 1; iy < pa; iy++) { + r = ((mp_word)tmpx) * ((mp_word)a->dp[iy]); + r = ((mp_word)*tmpt) + r + r + ((mp_word)u); + *tmpt++ = (mp_digit)(r & ((mp_word)MP_MASK)); + u = (r >> ((mp_word)DIGIT_BIT)); + } + r = ((mp_word)*tmpt) + u; + *tmpt = (mp_digit)(r & ((mp_word)MP_MASK)); + u = (r >> ((mp_word)DIGIT_BIT)); + /* propagate upwards */ + ++tmpt; + while (u != ((mp_word)0)) { + r = ((mp_word)*tmpt) + ((mp_word)1); + *tmpt++ = (mp_digit)(r & ((mp_word)MP_MASK)); + u = (r >> ((mp_word)DIGIT_BIT)); + } + } + + mp_clamp(&t); + if ((res = mp_copy(&t, b)) != MP_OKAY) { + mp_clear(&t); + return res; + } + mp_clear(&t); + return MP_OKAY; +} + +/* high level addition (handles signs) */ +int mp_add(mp_int *a, mp_int *b, mp_int *c) +{ + int sa, sb, res; + + sa = a->sign; + sb = b->sign; + + /* handle four cases */ + if (sa == MP_ZPOS && sb == MP_ZPOS) { + /* both positive */ + res = s_mp_add(a, b, c); + c->sign = MP_ZPOS; + } else if (sa == MP_ZPOS && sb == MP_NEG) { + /* a + -b == a - b, but if b>a then we do it as -(b-a) */ + if (s_mp_cmp(a, b) == MP_LT) { + res = s_mp_sub(b, a, c); + c->sign = MP_NEG; + } else { + res = s_mp_sub(a, b, c); + c->sign = MP_ZPOS; + } + } else if (sa == MP_NEG && sb == MP_ZPOS) { + /* -a + b == b - a, but if a>b then we do it as -(a-b) */ + if (s_mp_cmp(a, b) == MP_GT) { + res = s_mp_sub(a, b, c); + c->sign = MP_NEG; + } else { + res = s_mp_sub(b, a, c); + c->sign = MP_ZPOS; + } + } else { + /* -a + -b == -(a + b) */ + res = s_mp_add(a, b, c); + c->sign = MP_NEG; + } + return res; +} + +/* high level subtraction (handles signs) */ +int mp_sub(mp_int *a, mp_int *b, mp_int *c) +{ + int sa, sb, res; + + sa = a->sign; + sb = b->sign; + + /* handle four cases */ + if (sa == MP_ZPOS && sb == MP_ZPOS) { + /* both positive, a - b, but if b>a then we do -(b - a) */ + if (s_mp_cmp(a, b) == MP_LT) { + /* b>a */ + res = s_mp_sub(b, a, c); + c->sign = MP_NEG; + } else { + res = s_mp_sub(a, b, c); + c->sign = MP_ZPOS; + } + } else if (sa == MP_ZPOS && sb == MP_NEG) { + /* a - -b == a + b */ + res = s_mp_add(a, b, c); + c->sign = MP_ZPOS; + } else if (sa == MP_NEG && sb == MP_ZPOS) { + /* -a - b == -(a + b) */ + res = s_mp_add(a, b, c); + c->sign = MP_NEG; + } else { + /* -a - -b == b - a, but if a>b == -(a - b) */ + if (s_mp_cmp(a, b) == MP_GT) { + res = s_mp_sub(a, b, c); + c->sign = MP_NEG; + } else { + res = s_mp_sub(b, a, c); + c->sign = MP_ZPOS; + } + } + return res; +} + +/* c = |a| * |b| using Karatsuba */ +static int mp_karatsuba_mul(mp_int *a, mp_int *b, mp_int *c) +{ + mp_int x0, x1, y0, y1, t1, t2, x0y0, x1y1; + int B, err, neg, x; + + err = MP_MEM; + + /* min # of digits */ + B = MIN(a->used, b->used); + + /* now divide in two */ + B = B/2; + + /* init copy all the temps */ + if (mp_init_size(&x0, B) != MP_OKAY) goto ERR; + if (mp_init_size(&x1, a->used - B) != MP_OKAY) goto X0; + if (mp_init_size(&y0, B) != MP_OKAY) goto X1; + if (mp_init_size(&y1, b->used - B) != MP_OKAY) goto Y0; + + /* init temps */ + if (mp_init(&t1) != MP_OKAY) goto Y1; + if (mp_init(&t2) != MP_OKAY) goto T1; + if (mp_init(&x0y0) != MP_OKAY) goto T2; + if (mp_init(&x1y1) != MP_OKAY) goto X0Y0; + + /* now shift the digits */ + x0.sign = x1.sign = a->sign; + y0.sign = y1.sign = b->sign; + + x0.used = y0.used = B; + x1.used = a->used - B; + y1.used = b->used - B; + + for (x = 0; x < B; x++) { + x0.dp[x] = a->dp[x]; + y0.dp[x] = b->dp[x]; + } + for (x = B; x < a->used; x++) { + x1.dp[x-B] = a->dp[x]; + } + for (x = B; x < b->used; x++) { + y1.dp[x-B] = b->dp[x]; + } + + mp_clamp(&x0); + mp_clamp(&x1); + mp_clamp(&y0); + mp_clamp(&y1); + + /* now calc the products x0y0 and x1y1 */ + if (mp_mul(&x0, &y0, &x0y0) != MP_OKAY) goto X1Y1; /* x0y0 = x0*y0 */ + if (mp_mul(&x1, &y1, &x1y1) != MP_OKAY) goto X1Y1; /* x1y1 = x1*y1 */ + + /* now calc x1-x0 and y1-y0 */ + if (mp_sub(&x1, &x0, &t1) != MP_OKAY) goto X1Y1; /* t1 = x1 - x0 */ + if (mp_sub(&y1, &y0, &t2) != MP_OKAY) goto X1Y1; /* t2 = y1 - y0 */ + neg = (t1.sign == t2.sign) ? MP_ZPOS : MP_NEG; + if (mp_mul(&t1, &t2, &t1) != MP_OKAY) goto X1Y1; /* t1 = (x1 - x0) * (y1 - y0) */ + t1.sign = neg; + + /* add x0y0 */ + if (mp_add(&x0y0, &x1y1, &t2) != MP_OKAY) goto X1Y1; /* t2 = x0y0 + x1y1 */ + if (mp_sub(&t2, &t1, &t1) != MP_OKAY) goto X1Y1; /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */ + + /* shift by B */ + if (mp_lshd(&t1, B) != MP_OKAY) goto X1Y1; /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<used, b->used) > KARATSUBA_MUL_CUTOFF) { + res = mp_karatsuba_mul(a, b, c); + } else { + res = s_mp_mul(a, b, c); + } + c->sign = (a->sign == b->sign) ? MP_ZPOS : MP_NEG; + return res; +} + +/* Karatsuba squaring, computes b = a*a */ +static int mp_karatsuba_sqr(mp_int *a, mp_int *b) +{ + mp_int x0, x1, t1, t2, x0x0, x1x1; + int B, err; + + err = MP_MEM; + + /* min # of digits */ + B = a->used; + + /* now divide in two */ + B = B/2; + + /* init copy all the temps */ + if (mp_init_copy(&x0, a) != MP_OKAY) goto ERR; + if (mp_init_copy(&x1, a) != MP_OKAY) goto X0; + + /* init temps */ + if (mp_init(&t1) != MP_OKAY) goto X1; + if (mp_init(&t2) != MP_OKAY) goto T1; + if (mp_init(&x0x0) != MP_OKAY) goto T2; + if (mp_init(&x1x1) != MP_OKAY) goto X0X0; + + /* now shift the digits */ + mp_mod_2d(&x0, B*DIGIT_BIT, &x0); + mp_rshd(&x1, B); + + /* now calc the products x0*x0 and x1*x1 */ + if (s_mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1; /* x0x0 = x0*x0 */ + if (s_mp_sqr(&x1, &x1x1) != MP_OKAY) goto X1X1; /* x1x1 = x1*x1 */ + + /* now calc x1-x0 and y1-y0 */ + if (mp_sub(&x1, &x0, &t1) != MP_OKAY) goto X1X1; /* t1 = x1 - x0 */ + if (s_mp_sqr(&t1, &t1) != MP_OKAY) goto X1X1; /* t1 = (x1 - x0) * (y1 - y0) */ + + /* add x0y0 */ + if (mp_add(&x0x0, &x1x1, &t2) != MP_OKAY) goto X1X1; /* t2 = x0y0 + x1y1 */ + if (mp_sub(&t2, &t1, &t1) != MP_OKAY) goto X1X1; /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */ + + /* shift by B */ + if (mp_lshd(&t1, B) != MP_OKAY) goto X1X1; /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<used > KARATSUBA_SQR_CUTOFF) { + res = mp_karatsuba_sqr(a, b); + } else { + res = s_mp_sqr(a, b); + } + b->sign = MP_ZPOS; + return res; +} + + +/* integer signed division. c*b + d == a [e.g. a/b, c=quotient, d=remainder] */ +int mp_div(mp_int *a, mp_int *b, mp_int *c, mp_int *d) +{ + mp_int q, x, y, t1, t2; + int res, n, t, i, norm, neg; + + /* if a < b then q=0, r = a */ + if (s_mp_cmp(a, b) == MP_LT) { + if (d != NULL) { + res = mp_copy(a, d); + d->sign = a->sign; + } else { + res = MP_OKAY; + } + if (c != NULL) { + mp_zero(c); + } + return res; + } + + + if ((res = mp_init_size(&q, a->used + 2)) != MP_OKAY) { + return res; + } + q.used = a->used + 2; + + if ((res = mp_init(&t1)) != MP_OKAY) { + goto __Q; + } + + if ((res = mp_init(&t2)) != MP_OKAY) { + goto __T1; + } + + if ((res = mp_init_copy(&x, a)) != MP_OKAY) { + goto __T2; + } + + if ((res = mp_init_copy(&y, b)) != MP_OKAY) { + goto __X; + } + + /* fix the sign */ + neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG; + x.sign = y.sign = MP_ZPOS; + + /* normalize */ + norm = 0; + while ((y.dp[y.used-1] & (((mp_digit)1)<<(DIGIT_BIT-1))) == ((mp_digit)0)) { + ++norm; + if ((res = mp_mul_2d(&x, 1, &x)) != MP_OKAY) { + goto __Y; + } + if ((res = mp_mul_2d(&y, 1, &y)) != MP_OKAY) { + goto __Y; + } + } + + /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */ + n = x.used - 1; + t = y.used - 1; + + /* step 2. while (x >= y*b^n-t) do { q[n-t] += 1; x -= y*b^{n-t} } */ + if ((res = mp_lshd(&y, n - t)) != MP_OKAY) { /* y = y*b^{n-t} */ + goto __Y; + } + + while (mp_cmp(&x, &y) != MP_LT) { + ++(q.dp[n - t]); + if ((res = mp_sub(&x, &y, &x)) != MP_OKAY) { + goto __Y; + } + } + + /* reset y by shifting it back down */ + mp_rshd(&y, n - t); + + /* step 3. for i from n down to (t + 1) */ + for (i = n; i >= (t + 1); i--) { + /* step 3.1 if xi == yt then set q{i-t-1} to b-1, otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */ + if (x.dp[i] == y.dp[t]) { + q.dp[i - t - 1] = ((1UL< (mp_word)MP_MASK) tmp = MP_MASK; + q.dp[i - t - 1] = (mp_digit)(tmp & (mp_word)(MP_MASK)); + } + + /* step 3.2 while (q{i-t-1} * (yt * b + y{t-1})) > xi * b^2 + xi-1 * b + xi-2 do q{i-t-1} -= 1; */ + q.dp[i-t-1] = (q.dp[i-t-1] + 1) & MP_MASK; + do { + q.dp[i-t-1] = (q.dp[i-t-1] - 1) & MP_MASK; + + /* find left hand */ + t1.dp[0] = (t-1 < 0) ? 0 : y.dp[t-1]; + t1.dp[1] = y.dp[t]; + t1.used = 2; + if ((res = mp_mul_d(&t1, q.dp[i-t-1], &t1)) != MP_OKAY) { + goto __Y; + } + + /* find right hand */ + t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i-2]; + t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i-1]; + t2.dp[2] = x.dp[i]; + t2.used = 3; + } while (mp_cmp(&t1, &t2) == MP_GT); + + /* step 3.3 x = x - q{i-t-1} * y * b^{i-t-1} */ + if ((res = mp_mul_d(&y, q.dp[i-t-1], &t1)) != MP_OKAY) { + goto __Y; + } + + if ((res = mp_lshd(&t1, i - t - 1)) != MP_OKAY) { + goto __Y; + } + + if ((res = mp_sub(&x, &t1, &x)) != MP_OKAY) { + goto __Y; + } + + /* step 3.4 if x < 0 then { x = x + y*b^{i-t-1}; q{i-t-1} -= 1; } */ + if (x.sign == MP_NEG && x.used != 0) { + if ((res = mp_copy(&y, &t1)) != MP_OKAY) { + goto __Y; + } + if ((res = mp_lshd(&t1, i-t-1)) != MP_OKAY) { + goto __Y; + } + if ((res = mp_add(&x, &t1, &x)) != MP_OKAY) { + goto __Y; + } + + q.dp[i-t-1] = (q.dp[i-t-1] - 1UL) & MP_MASK; + } + } + + /* now q is the quotient and x is the remainder [which we have to normalize] */ + if (c != NULL) { + mp_clamp(&q); + mp_copy(&q, c); + c->sign = neg; + } + + if (d != NULL) { + x.sign = a->sign; + mp_div_2d(&x, norm, &x, NULL); + mp_clamp(&x); + mp_copy(&x, d); + } + + res = MP_OKAY; + +__Y: mp_clear(&y); +__X: mp_clear(&x); +__T2: mp_clear(&t2); +__T1: mp_clear(&t1); +__Q: mp_clear(&q); + return res; +} + +/* single digit addition */ +int mp_add_d(mp_int *a, mp_digit b, mp_int *c) +{ + mp_int t; + int res; + + if ((res = mp_init(&t)) != MP_OKAY) { + return res; + } + mp_set(&t, b); + res = mp_add(a, &t, c); + + mp_clear(&t); + return res; +} + +/* single digit subtraction */ +int mp_sub_d(mp_int *a, mp_digit b, mp_int *c) +{ + mp_int t; + int res; + + if ((res = mp_init(&t)) != MP_OKAY) { + return res; + } + mp_set(&t, b); + res = mp_sub(a, &t, c); + + mp_clear(&t); + return res; +} + +/* multiply by a digit */ +int mp_mul_d(mp_int *a, mp_digit b, mp_int *c) +{ + int res, pa, ix; + mp_word r; + mp_digit u; + mp_int t; + + pa = a->used; + if ((res = mp_init_size(&t, pa + 2)) != MP_OKAY) { + return res; + } + t.used = pa + 2; + + u = 0; + for (ix = 0; ix < pa; ix++) { + r = ((mp_word)u) + ((mp_word)a->dp[ix]) * ((mp_word)b); + t.dp[ix] = (mp_digit)(r & ((mp_word)MP_MASK)); + u = (mp_digit)(r >> ((mp_word)DIGIT_BIT)); + } + t.dp[ix] = u; + + mp_clamp(&t); + if ((res = mp_copy(&t, c)) != MP_OKAY) { + mp_clear(&t); + return res; + } + mp_clear(&t); + return MP_OKAY; +} + +/* single digit division */ +int mp_div_d(mp_int *a, mp_digit b, mp_int *c, mp_digit *d) +{ + mp_int t, t2; + int res; + + if ((res = mp_init(&t)) != MP_OKAY) { + return res; + } + + if ((res = mp_init(&t2)) != MP_OKAY) { + mp_clear(&t); + return res; + } + + mp_set(&t, b); + res = mp_div(a, &t, c, &t2); + + if (d != NULL) { + *d = t2.dp[0]; + } + + mp_clear(&t); + mp_clear(&t2); + return res; +} + +/* simple modular functions */ + +/* d = a + b (mod c) */ +int mp_addmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d) +{ + int res; + + if ((res = mp_add(a, b, d)) != MP_OKAY) { + return res; + } + return mp_mod(d, c, d); +} + +/* d = a - b (mod c) */ +int mp_submod(mp_int *a, mp_int *b, mp_int *c, mp_int *d) +{ + int res; + + if ((res = mp_sub(a, b, d)) != MP_OKAY) { + return res; + } + return mp_mod(d, c, d); +} + +/* d = a * b (mod c) */ +int mp_mulmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d) +{ + int res; + + if ((res = mp_mul(a, b, d)) != MP_OKAY) { + return res; + } + return mp_mod(d, c, d); +} + +/* c = a * a (mod b) */ +int mp_sqrmod(mp_int *a, mp_int *b, mp_int *c) +{ + int res; + + if ((res = mp_sqr(a, c)) != MP_OKAY) { + return res; + } + return mp_mod(c, b, c); +} + +/* Greatest Common Divisor using the binary method [Algorithm B, page 338, vol2 of TAOCP] + */ +int mp_gcd(mp_int *a, mp_int *b, mp_int *c) +{ + mp_int u, v, t; + int k, res, neg; + + /* either zero than gcd is the largest */ + if (mp_iszero(a) == 1 && mp_iszero(b) == 0) { + return mp_copy(b, c); + } + if (mp_iszero(a) == 0 && mp_iszero(b) == 1) { + return mp_copy(a, c); + } + if (mp_iszero(a) == 1 && mp_iszero(b) == 1) { + mp_set(c, 1); + return MP_OKAY; + } + + /* if both are negative they share (-1) as a common divisor */ + neg = (a->sign == b->sign) ? a->sign : MP_ZPOS; + + if ((res = mp_init_copy(&u, a)) != MP_OKAY) { + return res; + } + + if ((res = mp_init_copy(&v, b)) != MP_OKAY) { + goto __U; + } + + /* must be positive for the remainder of the algorithm */ + u.sign = v.sign = MP_ZPOS; + + if ((res = mp_init(&t)) != MP_OKAY) { + goto __V; + } + + /* B1. Find power of two */ + k = 0; + while ((u.dp[0] & 1) == 0 && (v.dp[0] & 1) == 0) { + ++k; + mp_div_2d(&u, 1, &u, NULL); + mp_div_2d(&v, 1, &v, NULL); + } + + /* B2. Initialize */ + if ((u.dp[0] & 1) == 1) { + if ((res = mp_copy(&v, &t)) != MP_OKAY) { + goto __T; + } + t.sign = MP_NEG; + } else { + if ((res = mp_copy(&u, &t)) != MP_OKAY) { + goto __T; + } + } + + do { + /* B3 (and B4). Halve t, if even */ + while (t.used != 0 && (t.dp[0] & 1) == 0) { + mp_div_2d(&t, 1, &t, NULL); + } + + /* B5. if t>0 then u=t otherwise v=-t */ + if (t.used != 0 && t.sign != MP_NEG) { + if ((res = mp_copy(&t, &u)) != MP_OKAY) { + goto __T; + } + } else { + if ((res = mp_copy(&t, &v)) != MP_OKAY) { + goto __T; + } + v.sign = (v.sign == MP_ZPOS) ? MP_NEG : MP_ZPOS; + } + + /* B6. t = u - v, if t != 0 loop otherwise terminate */ + if ((res = mp_sub(&u, &v, &t)) != MP_OKAY) { + goto __T; + } + } while (t.used != 0); + + if ((res = mp_mul_2d(&u, k, &u)) != MP_OKAY) { + goto __T; + } + + if ((res = mp_copy(&u, c)) != MP_OKAY) { + goto __T; + } + c->sign = neg; +__T: mp_clear(&t); +__V: mp_clear(&u); +__U: mp_clear(&v); + + return res; +} + +/* computes least common multipble as a*b/(a, b) */ +int mp_lcm(mp_int *a, mp_int *b, mp_int *c) +{ + int res; + mp_int t; + + if ((res = mp_init(&t)) != MP_OKAY) { + return res; + } + + if ((res = mp_mul(a, b, &t)) != MP_OKAY) { + mp_clear(&t); + return res; + } + + if ((res = mp_gcd(a, b, c)) != MP_OKAY) { + mp_clear(&t); + return res; + } + + res = mp_div(&t, c, c, NULL); + mp_clear(&t); + return res; +} + +/* computes the modular inverse via extended euclidean algorithm, that is c = 1/a mod b */ +int mp_invmod(mp_int *a, mp_int *b, mp_int *c) +{ + int res; + mp_int t1, t2, t3, u1, u2, u3, v1, v2, v3, q; + + if ((res = mp_init(&t1)) != MP_OKAY) { + return res; + } + + if ((res = mp_init(&t2)) != MP_OKAY) { + goto __T1; + } + + if ((res = mp_init(&t3)) != MP_OKAY) { + goto __T2; + } + + if ((res = mp_init(&u1)) != MP_OKAY) { + goto __T3; + } + + if ((res = mp_init(&u2)) != MP_OKAY) { + goto __U1; + } + + if ((res = mp_init(&u3)) != MP_OKAY) { + goto __U2; + } + + if ((res = mp_init(&v1)) != MP_OKAY) { + goto __U3; + } + + if ((res = mp_init(&v2)) != MP_OKAY) { + goto __V1; + } + + if ((res = mp_init(&v3)) != MP_OKAY) { + goto __V2; + } + + if ((res = mp_init(&q)) != MP_OKAY) { + goto __V3; + } + + /* (u1, u2, u3) = (1, 0, a) */ + mp_set(&u1, 1); + if ((res = mp_copy(a, &u3)) != MP_OKAY) { + goto __Q; + } + + /* (v1, v2, v3) = (0, 1, b) */ + mp_set(&u2, 1); + if ((res = mp_copy(b, &v3)) != MP_OKAY) { + goto __Q; + } + + while (mp_iszero(&v3) == 0) { + if ((res = mp_div(&u3, &v3, &q, NULL)) != MP_OKAY) { + goto __Q; + } + + /* (t1, t2, t3) = (u1, u2, u3) - q*(v1, v2, v3) */ + if ((res = mp_mul(&q, &v1, &t1)) != MP_OKAY) { goto __Q; } + if ((res = mp_sub(&u1, &t1, &t1)) != MP_OKAY) { goto __Q; } + if ((res = mp_mul(&q, &v2, &t2)) != MP_OKAY) { goto __Q; } + if ((res = mp_sub(&u2, &t2, &t2)) != MP_OKAY) { goto __Q; } + if ((res = mp_mul(&q, &v3, &t3)) != MP_OKAY) { goto __Q; } + if ((res = mp_sub(&u3, &t3, &t3)) != MP_OKAY) { goto __Q; } + + /* u = v */ + if ((res = mp_copy(&v1, &u1)) != MP_OKAY) { goto __Q; } + if ((res = mp_copy(&v2, &u2)) != MP_OKAY) { goto __Q; } + if ((res = mp_copy(&v3, &u3)) != MP_OKAY) { goto __Q; } + + /* v = t */ + if ((res = mp_copy(&t1, &v1)) != MP_OKAY) { goto __Q; } + if ((res = mp_copy(&t2, &v2)) != MP_OKAY) { goto __Q; } + if ((res = mp_copy(&t3, &v3)) != MP_OKAY) { goto __Q; } + } + + /* if u3 != 1, then there is no inverse */ + if (mp_cmp_d(&u3, 1) != MP_EQ) { + res = MP_VAL; + goto __Q; + } + + /* u1 is the inverse */ + res = mp_copy(&u1, c); +__Q : mp_clear(&q); +__V3: mp_clear(&v3); +__V2: mp_clear(&v1); +__V1: mp_clear(&v1); +__U3: mp_clear(&u3); +__U2: mp_clear(&u2); +__U1: mp_clear(&u1); +__T3: mp_clear(&t3); +__T2: mp_clear(&t2); +__T1: mp_clear(&t1); + return res; +} + +/* pre-calculate the value required for Barrett reduction + * For a given modulus "b" it calulates the value required in "a" + */ +int mp_reduce_setup(mp_int *a, mp_int *b) +{ + int res; + + mp_set(a, 1); + if ((res = mp_lshd(a, b->used * 2)) != MP_OKAY) { + return res; + } + return mp_div(a, b, a, NULL); +} + +/* reduces x mod m, assumes 0 < x < m^2, mu is precomputed via mp_reduce_setup */ +int mp_reduce(mp_int *x, mp_int *m, mp_int *mu) +{ + mp_int q; + int res, um = m->used; + + if((res = mp_init_copy(&q, x)) != MP_OKAY) + return res; + + mp_rshd(&q, um - 1); /* q1 = x / b^(k-1) */ + + /* according to HAC this is optimization is ok */ + if (((unsigned long)m->used) > (1UL<<(unsigned long)(DIGIT_BIT-1UL))) { + if ((res = mp_mul(&q, mu, &q)) != MP_OKAY) { + goto CLEANUP; + } + } else { + if ((res = s_mp_mul_high_digs(&q, mu, &q, um-1)) != MP_OKAY) { + goto CLEANUP; + } + } + + mp_rshd(&q, um + 1); /* q3 = q2 / b^(k+1) */ + + /* x = x mod b^(k+1), quick (no division) */ + mp_mod_2d(x, DIGIT_BIT * (um + 1), x); + + /* q = q * m mod b^(k+1), quick (no division) */ + s_mp_mul_digs(&q, m, &q, um + 1); + + /* x = x - q */ + if((res = mp_sub(x, &q, x)) != MP_OKAY) + goto CLEANUP; + + /* If x < 0, add b^(k+1) to it */ + if(mp_cmp_d(x, 0) == MP_LT) { + mp_set(&q, 1); + if((res = mp_lshd(&q, um + 1)) != MP_OKAY) + goto CLEANUP; + if((res = mp_add(x, &q, x)) != MP_OKAY) + goto CLEANUP; + } + + /* Back off if it's too big */ + while(mp_cmp(x, m) != MP_LT) { + if((res = s_mp_sub(x, m, x)) != MP_OKAY) + break; + } + + CLEANUP: + mp_clear(&q); + + return res; +} + +int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y) +{ + mp_int M[64], res, mu; + mp_digit buf; + int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, z, winsize, tab[64]; + + /* find window size */ + x = mp_count_bits(X); + if (x <= 18) { winsize = 2; } + else if (x <= 84) { winsize = 3; } + else if (x <= 300) { winsize = 4; } + else if (x <= 930) { winsize = 5; } + else { winsize = 6; } + + /* init G array */ + for (x = 0; x < (1<used - 1; + bitcpy = bitbuf = 0; + + bitcnt = 1; + for (;;) { + /* grab next digit as required */ + if (--bitcnt == 0) { + if (digidx == -1) { + break; + } + buf = X->dp[digidx--]; + bitcnt = DIGIT_BIT; + } + + /* grab the next msb from the exponent */ + y = (buf >> (DIGIT_BIT - 1)) & 1; + buf <<= 1; + + /* if the bit is zero and mode == 0 then we ignore it + * These represent the leading zero bits before the first 1 bit + * in the exponent. Technically this opt is not required but it + * does lower the # of trivial squaring/reductions used + */ + if (y == 0 && mode == 0) continue; + + /* if the bit is zero and mode == 1 then we square */ + if (y == 0 && mode == 1) { + if ((err = mp_sqr(&res, &res)) != MP_OKAY) { + goto __RES; + } + if ((err = mp_reduce(&res, P, &mu)) != MP_OKAY) { + goto __RES; + } + continue; + } + + /* else we add it to the window */ + bitbuf |= (y<<(winsize-++bitcpy)); + mode = 2; + + if (bitcpy == winsize) { + /* ok window is filled so square as required and multiply multiply */ + /* square first */ + for (x = 0; x < winsize; x++) { + if ((err = mp_sqr(&res, &res)) != MP_OKAY) { + goto __RES; + } + if ((err = mp_reduce(&res, P, &mu)) != MP_OKAY) { + goto __RES; + } + } + + /* then multiply */ + if ((err = mp_mul(&res, &M[bitbuf], &res)) != MP_OKAY) { + goto __MU; + } + if ((err = mp_reduce(&res, P, &mu)) != MP_OKAY) { + goto __MU; + } + + /* empty window and reset */ + bitcpy = bitbuf = 0; + mode = 1; + } + } + + /* if bits remain then square/multiply */ + if (mode == 2 && bitcpy > 0) { + bitbuf >>= (winsize - bitcpy); + /* square first */ + for (x = 0; x < bitcpy; x++) { + if ((err = mp_sqr(&res, &res)) != MP_OKAY) { + goto __RES; + } + if ((err = mp_reduce(&res, P, &mu)) != MP_OKAY) { + goto __RES; + } + } + + /* then multiply */ + if ((err = mp_mul(&res, &M[bitbuf], &res)) != MP_OKAY) { + goto __MU; + } + if ((err = mp_reduce(&res, P, &mu)) != MP_OKAY) { + goto __MU; + } + } + + err = mp_copy(&res, Y); +__RES: mp_clear(&res); +__MU : mp_clear(&mu); +__M : + for (x = 0; x < (1< radix conversion <-- */ +/* reverse an array, used for radix code */ +static void reverse(unsigned char *s, int len) +{ + int ix, iy; + unsigned char t; + + ix = 0; + iy = len - 1; + while (ix < iy) { + t = s[ix]; s[ix] = s[iy]; s[iy] = t; + ++ix; + --iy; + } +} + +/* returns the number of bits in an int */ +int mp_count_bits(mp_int *a) +{ + int r; + mp_digit q; + + if (a->used == 0) { + return 0; + } + + r = (a->used - 1) * DIGIT_BIT; + q = a->dp[a->used - 1]; + while (q) { + ++r; + q >>= 1UL; + } + return r; +} + +/* reads a unsigned char array, assumes the msb is stored first [big endian] */ +int mp_read_unsigned_bin(mp_int *a, unsigned char *b, int c) +{ + int res; + + mp_zero(a); + a->used = (c/DIGIT_BIT) + ((c % DIGIT_BIT) != 0 ? 1: 0); + if ((res = mp_grow(a, a->used)) != MP_OKAY) { + return res; + } + while (c-- > 0) { + if ((res = mp_mul_2d(a, 8, a)) != MP_OKAY) { + return res; + } + + if (DIGIT_BIT != 7) { + a->dp[0] |= *b++; + a->used += 1; + } else { + a->dp[0] = (*b & MP_MASK); + a->dp[1] |= ((*b++ >> 7U) & 1); + a->used += 2; + } + } + mp_clamp(a); + return MP_OKAY; +} + +/* read signed bin, big endian, first byte is 0==positive or 1==negative */ +int mp_read_signed_bin(mp_int *a, unsigned char *b, int c) +{ + int res; + + if ((res = mp_read_unsigned_bin(a, b + 1, c - 1)) != MP_OKAY) { + return res; + } + a->sign = ((b[0] == (unsigned char)0) ? MP_ZPOS : MP_NEG); + return MP_OKAY; +} + +/* store in unsigned [big endian] format */ +int mp_to_unsigned_bin(mp_int *a, unsigned char *b) +{ + int x, res; + mp_int t; + + if ((res = mp_init_copy(&t, a)) != MP_OKAY) { + return res; + } + + x = 0; + while (mp_iszero(&t) == 0) { + if (DIGIT_BIT != 7) { + b[x++] = (unsigned char)(t.dp[0] & 255); + } else { + b[x++] = (unsigned char)(t.dp[0] | ((t.dp[1] & 0x01) << 7)); + } + mp_div_2d(&t, 8, &t, NULL); + } + reverse(b, x); + return MP_OKAY; +} + +/* store in signed [big endian] format */ +int mp_to_signed_bin(mp_int *a, unsigned char *b) +{ + int res; + + if ((res = mp_to_unsigned_bin(a, b+1)) != MP_OKAY) { + return res; + } + b[0] = (unsigned char)((a->sign == MP_ZPOS) ? 0 : 1); + return MP_OKAY; +} + +/* get the size for an unsigned equivalent */ +int mp_unsigned_bin_size(mp_int *a) +{ + return (mp_count_bits(a)/8 + ((mp_count_bits(a)&7) != 0 ? 1 : 0)); +} + +/* get the size for an signed equivalent */ +int mp_signed_bin_size(mp_int *a) +{ + return 1 + (mp_count_bits(a)/8 + ((mp_count_bits(a)&7) != 0 ? 1 : 0)); +} + +/* read a string [ASCII] in a given radix */ +int mp_read_radix(mp_int *a, unsigned char *str, int radix) +{ + int y, res, neg; + + if (radix < 2 || radix > 64) { + return MP_VAL; + } + + if (*str == (unsigned char)'-') { + ++str; + neg = MP_NEG; + } else { + neg = MP_ZPOS; + } + + mp_zero(a); + while (*str) { + for (y = 0; y < 64; y++) { + if (*str == (unsigned char)s_rmap[y]) { + break; + } + } + + if (y < radix) { + if ((res = mp_mul_d(a, (mp_digit)radix, a)) != MP_OKAY) { + return res; + } + if ((res = mp_add_d(a, (mp_digit)y, a)) != MP_OKAY) { + return res; + } + } else { + break; + } + ++str; + } + a->sign = neg; + return MP_OKAY; +} + +/* stores a bignum as a ASCII string in a given radix (2..64) */ +int mp_toradix(mp_int *a, unsigned char *str, int radix) +{ + int res, digs; + mp_int t; + mp_digit d; + unsigned char *_s = str; + + if (radix < 2 || radix > 64) { + return MP_VAL; + } + + if ((res = mp_init_copy(&t, a)) != MP_OKAY) { + return res; + } + + if (t.sign == MP_NEG) { + ++_s; + *str++ = '-'; + t.sign = MP_ZPOS; + } + + digs = 0; + while (mp_iszero(&t) == 0) { + if ((res = mp_div_d(&t, (mp_digit)radix, &t, &d)) != MP_OKAY) { + return res; + } + *str++ = (unsigned char)s_rmap[d]; + ++digs; + } + reverse(_s, digs); + *str++ = (unsigned char)'\0'; + mp_clear(&t); + return MP_OKAY; +} + diff --git a/bn.h b/bn.h new file mode 100644 index 0000000..2c780e6 --- /dev/null +++ b/bn.h @@ -0,0 +1,225 @@ +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca + */ + +#ifndef BN_H_ +#define BN_H_ + +#include +#include +#include +#include +#include + +/* some default configurations. + * + * A "mp_digit" must be able to hold DIGIT_BIT + 1 bits + * A "mp_word" must be able to hold 2*DIGIT_BIT + 1 bits + * + * At the very least a mp_digit must be able to hold 7 bits + * [any size beyond that is ok provided it overflow the data type] + */ +#ifdef MP_8BIT + typedef unsigned char mp_digit; + typedef unsigned short mp_word; +#elif defined(MP_16BIT) + typedef unsigned short mp_digit; + typedef unsigned long mp_word; +#else + typedef unsigned long mp_digit; + typedef unsigned long long mp_word; + #define DIGIT_BIT 28U +#endif + +#ifndef DIGIT_BIT + #define DIGIT_BIT ((CHAR_BIT * sizeof(mp_digit) - 1)) /* bits per digit */ +#endif + +#define MP_MASK ((((mp_digit)1)<<((mp_digit)DIGIT_BIT))-((mp_digit)1)) + +/* equalities */ +#define MP_LT -1 /* less than */ +#define MP_EQ 0 /* equal to */ +#define MP_GT 1 /* greater than */ + +#define MP_ZPOS 0 /* positive integer */ +#define MP_NEG 1 /* negative */ + +#define MP_OKAY 0 /* ok result */ +#define MP_MEM 1 /* out of mem */ +#define MP_VAL 2 /* invalid input */ + +#define KARATSUBA_MUL_CUTOFF 80 /* Min. number of digits before Karatsuba multiplication is used. */ +#define KARATSUBA_SQR_CUTOFF 80 /* Min. number of digits before Karatsuba squaring is used. */ + +typedef struct { + int used, alloc, sign; + mp_digit *dp; +} mp_int; + +/* ---> init and deinit bignum functions <--- */ + +/* init a bignum */ +int mp_init(mp_int *a); + +/* free a bignum */ +void mp_clear(mp_int *a); + +/* shrink ram required for a bignum */ +int mp_shrink(mp_int *a); + +/* ---> Basic Manipulations <--- */ + +#define mp_iszero(a) (((a)->used == 0) ? 1 : 0) +#define mp_iseven(a) (((a)->used == 0 || (((a)->dp[0] & 1) == 0)) ? 1 : 0) +#define mp_isodd(a) (((a)->used > 0 || (((a)->dp[0] & 1) == 1)) ? 1 : 0) + +/* set to zero */ +void mp_zero(mp_int *a); + +/* set to a digit */ +void mp_set(mp_int *a, mp_digit b); + +/* set a 32-bit const */ +int mp_set_int(mp_int *a, unsigned long b); + +/* init to a given number of digits */ +int mp_init_size(mp_int *a, int size); + +/* copy, b = a */ +int mp_copy(mp_int *a, mp_int *b); + +/* inits and copies, a = b */ +int mp_init_copy(mp_int *a, mp_int *b); + +/* ---> digit manipulation <--- */ + +/* right shift by "b" digits */ +void mp_rshd(mp_int *a, int b); + +/* left shift by "b" digits */ +int mp_lshd(mp_int *a, int b); + +/* c = a / 2^b */ +int mp_div_2d(mp_int *a, int b, mp_int *c, mp_int *d); + +/* b = a/2 */ +int mp_div_2(mp_int *a, mp_int *b); + +/* c = a * 2^b */ +int mp_mul_2d(mp_int *a, int b, mp_int *c); + +/* b = a*2 */ +int mp_mul_2(mp_int *a, mp_int *b); + +/* c = a mod 2^d */ +int mp_mod_2d(mp_int *a, int b, mp_int *c); + +/* ---> Basic arithmetic <--- */ + +/* compare a to b */ +int mp_cmp(mp_int *a, mp_int *b); + +/* c = a + b */ +int mp_add(mp_int *a, mp_int *b, mp_int *c); + +/* c = a - b */ +int mp_sub(mp_int *a, mp_int *b, mp_int *c); + +/* c = a * b */ +int mp_mul(mp_int *a, mp_int *b, mp_int *c); + +/* b = a^2 */ +int mp_sqr(mp_int *a, mp_int *b); + +/* a/b => cb + d == a */ +int mp_div(mp_int *a, mp_int *b, mp_int *c, mp_int *d); + +/* c == a mod b */ +#define mp_mod(a, b, c) mp_div(a, b, NULL, c) + +/* ---> single digit functions <--- */ + +/* compare against a single digit */ +int mp_cmp_d(mp_int *a, mp_digit b); + +/* c = a + b */ +int mp_add_d(mp_int *a, mp_digit b, mp_int *c); + +/* c = a - b */ +int mp_sub_d(mp_int *a, mp_digit b, mp_int *c); + +/* c = a * b */ +int mp_mul_d(mp_int *a, mp_digit b, mp_int *c); + +/* a/b => cb + d == a */ +int mp_div_d(mp_int *a, mp_digit b, mp_int *c, mp_digit *d); + +/* c = a mod b */ +#define mp_mod_d(a,b,c) mp_div_d(a, b, NULL, c) + +/* ---> number theory <--- */ + +/* d = a + b (mod c) */ +int mp_addmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d); + +/* d = a - b (mod c) */ +int mp_submod(mp_int *a, mp_int *b, mp_int *c, mp_int *d); + +/* d = a * b (mod c) */ +int mp_mulmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d); + +/* c = a * a (mod b) */ +int mp_sqrmod(mp_int *a, mp_int *b, mp_int *c); + +/* c = 1/a (mod b) */ +int mp_invmod(mp_int *a, mp_int *b, mp_int *c); + +/* c = (a, b) */ +int mp_gcd(mp_int *a, mp_int *b, mp_int *c); + +/* c = [a, b] or (a*b)/(a, b) */ +int mp_lcm(mp_int *a, mp_int *b, mp_int *c); + +/* used to setup the Barrett reduction for a given modulus b */ +int mp_reduce_setup(mp_int *a, mp_int *b); + +/* Barrett Reduction, computes a (mod b) with a precomputed value c */ +int mp_reduce(mp_int *a, mp_int *b, mp_int *c); + +/* d = a^b (mod c) */ +int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d); + +/* ---> radix conversion <--- */ + +int mp_count_bits(mp_int *a); + +int mp_unsigned_bin_size(mp_int *a); +int mp_read_unsigned_bin(mp_int *a, unsigned char *b, int c); +int mp_to_unsigned_bin(mp_int *a, unsigned char *b); + +int mp_signed_bin_size(mp_int *a); +int mp_read_signed_bin(mp_int *a, unsigned char *b, int c); +int mp_to_signed_bin(mp_int *a, unsigned char *b); + +int mp_read_radix(mp_int *a, unsigned char *str, int radix); +int mp_toradix(mp_int *a, unsigned char *str, int radix); + +#define mp_tobinary(M, S) mp_toradix((M), (S), 2) +#define mp_tooctal(M, S) mp_toradix((M), (S), 8) +#define mp_todecimal(M, S) mp_toradix((M), (S), 10) +#define mp_tohex(M, S) mp_toradix((M), (S), 16) + +#endif + diff --git a/bn.pdf b/bn.pdf new file mode 100644 index 0000000..0550ee5 Binary files /dev/null and b/bn.pdf differ diff --git a/bn.tex b/bn.tex new file mode 100644 index 0000000..d1a49fe --- /dev/null +++ b/bn.tex @@ -0,0 +1,411 @@ +\documentclass{article} +\begin{document} + +\title{LibTomMath v0.01 \\ A Free Multiple Precision Integer Library} +\author{Tom St Denis \\ tomstdenis@iahu.ca} +\maketitle +\newpage + +\section{Introduction} +``LibTomMath'' is a free and open source library that provides multiple-precision integer functions required to form a basis +of a public key cryptosystem. LibTomMath is written entire in portable ISO C source code and designed to have an application +interface much like that of MPI from Michael Fromberger. + +LibTomMath was written from scratch by Tom St Denis but designed to be drop in replacement for the MPI package. The +algorithms within the library are derived from descriptions as provided in the Handbook of Applied Cryptography and Knuth's +``The Art of Computer Programming''. The library has been extensively optimized and should provide quite comparable +timings as compared to many free and commercial libraries. + +LibTomMath was designed with the following goals in mind: +\begin{enumerate} +\item Be a drop in replacement for MPI. +\item Be much faster than MPI. +\item Be written entirely in portable C. +\end{enumerate} + +All three goals have been achieved. Particularly the speed increase goal. For example, a 512-bit modular exponentiation is +four times faster\footnote{On an Athlon XP with GCC 3.2} with LibTomMath compared to MPI. + +Being compatible with MPI means that applications that already use it can be ported fairly quickly. Currently there are +a few differences but there are many similarities. In fact the average MPI based application can be ported in under 15 +minutes. + +Thanks goes to Michael Fromberger for answering a couple questions and Colin Percival for having the patience and courtesy to +help debug and suggest optimizations. They were both of great help! + +\section{Building Against LibTomMath} + +Building against LibTomMath is very simple because there is only one source file. Simply add ``bn.c'' to your project and +copy both ``bn.c'' and ``bn.h'' into your project directory. There is no configuration nor building required before hand. + +If you are porting an MPI application to LibTomMath the first step will be to remove all references to MPI and replace them +with references to LibTomMath. For example, substitute + +\begin{verbatim} +#include "mpi.h" +\end{verbatim} + +with + +\begin{verbatim} +#include "bn.h" +\end{verbatim} + +Remove ``mpi.c'' from your project and replace it with ``bn.c''. Note that currently MPI has a few more functions than +LibTomMath has (e.g. no square-root code and a few others). Those are planned for future releases. In the interim work +arounds can be sought. Note that LibTomMath doesn't lack any functions required to build a cryptosystem. + +\section{Programming with LibTomMath} + +\subsection{The mp\_int Structure} +All multiple precision integers are stored in a structure called \textbf{mp\_int}. A multiple precision integer is +essentially an array of \textbf{mp\_digit}. mp\_digit is defined at the top of bn.h. Its type can be changed to suit +a particular platform. + +For example, when \textbf{MP\_8BIT} is defined\footnote{When building bn.c.} a mp\_digit is a unsigned char and holds +seven bits. Similarly when \textbf{MP\_16BIT} is defined a mp\_digit is a unsigned short and holds 15 bits. +By default a mp\_digit is a unsigned long and holds 28 bits. + +The choice of digit is particular to the platform at hand and what available multipliers are provided. For +MP\_8BIT either a $8 \times 8 \Rightarrow 16$ or $16 \times 16 \Rightarrow 16$ multiplier is optimal. When +MP\_16BIT is defined either a $16 \times 16 \Rightarrow 32$ or $32 \times 32 \Rightarrow 32$ multiplier is optimal. By +default a $32 \times 32 \Rightarrow 64$ or $64 \times 64 \Rightarrow 64$ multiplier is optimal. + +This gives the library some flexibility. For example, a i8051 has a $8 \times 8 \Rightarrow 16$ multiplier. The +16-bit x86 instruction set has a $16 \times 16 \Rightarrow 32$ multiplier. In practice this library is not particularly +designed for small devices like an i8051 due to the size. It is possible to strip out functions which are not required +to drop the code size. More realistically the library is well suited to 32 and 64-bit processors that have decent +integer multipliers. The AMD Athlon XP and Intel Pentium 4 processors are examples of well suited processors. + +Throughout the discussions there will be references to a \textbf{used} and \textbf{alloc} members of an integer. The +used member refers to how many digits are actually used in the representation of the integer. The alloc member refers +to how many digits have been allocated off the heap. There is also the $\beta$ quantity which is equal to $2^W$ where +$W$ is the number of bits in a digit (default is 28). + +\subsection{Basic Functionality} +Essentially all LibTomMath functions return one of three values to indicate if the function worked as desired. A +function will return \textbf{MP\_OKAY} if the function was successful. A function will return \textbf{MP\_MEM} if +it ran out of memory and \textbf{MP\_VAL} if the input was invalid. + +Before an mp\_int can be used it must be initialized with + +\begin{verbatim} +int mp_init(mp_int *a); +\end{verbatim} + +For example, consider the following. + +\begin{verbatim} +#include "bn.h" +int main(void) +{ + mp_int num; + if (mp_init(&num) != MP_OKAY) { + printf("Error initializing a mp_int.\n"); + } + return 0; +} +\end{verbatim} + +A mp\_int can be freed from memory with + +\begin{verbatim} +void mp_clear(mp_int *a); +\end{verbatim} + +This will zero the memory and free the allocated data. There are a set of trivial functions to manipulate the +value of an mp\_int. + +\begin{verbatim} +/* set to zero */ +void mp_zero(mp_int *a); + +/* set to a digit */ +void mp_set(mp_int *a, mp_digit b); + +/* set a 32-bit const */ +int mp_set_int(mp_int *a, unsigned long b); + +/* init to a given number of digits */ +int mp_init_size(mp_int *a, int size); + +/* copy, b = a */ +int mp_copy(mp_int *a, mp_int *b); + +/* inits and copies, a = b */ +int mp_init_copy(mp_int *a, mp_int *b); +\end{verbatim} + +The \textbf{mp\_zero} function will clear the contents of a mp\_int and set it to positive. The \textbf{mp\_set} function +will zero the integer and set the first digit to a value specified. The \textbf{mp\_set\_int} function will zero the +integer and set the first 32-bits to a given value. It is important to note that using mp\_set can have unintended +side effects when either the MP\_8BIT or MP\_16BIT defines are enabled. By default the library will accept the +ranges of values MPI will (and more). + +The \textbf{mp\_init\_size} function will initialize the integer and set the allocated size to a given value. The +allocated digits are zero'ed by default but not marked as used. The \textbf{mp\_copy} function will copy the digits +(and sign) of the first parameter into the integer specified by the second parameter. The \textbf{mp\_init\_copy} will +initialize the first integer specified and copy the second one into it. Note that the order is reversed from that of +mp\_copy. This odd ``bug'' was kept to maintain compatibility with MPI. + +\subsection{Digit Manipulations} + +There are a class of functions that provide simple digit manipulations such as shifting and modulo reduction of powers +of two. + +\begin{verbatim} +/* right shift by "b" digits */ +void mp_rshd(mp_int *a, int b); + +/* left shift by "b" digits */ +int mp_lshd(mp_int *a, int b); + +/* c = a / 2^b */ +int mp_div_2d(mp_int *a, int b, mp_int *c); + +/* b = a/2 */ +int mp_div_2(mp_int *a, mp_int *b); + +/* c = a * 2^b */ +int mp_mul_2d(mp_int *a, int b, mp_int *c); + +/* b = a*2 */ +int mp_mul_2(mp_int *a, mp_int *b); + +/* c = a mod 2^d */ +int mp_mod_2d(mp_int *a, int b, mp_int *c); +\end{verbatim} + +Both the \textbf{mp\_rshd} and \textbf{mp\_lshd} functions provide shifting by whole digits. For example, +mp\_rshd($x$, $n$) is the same as $x \leftarrow \lfloor x / \beta^n \rfloor$ while mp\_lshd($x$, $n$) is equivalent +to $x \leftarrow x \cdot \beta^n$. Both functions are extremely fast as they merely copy digits within the array. + +Similarly the \textbf{mp\_div\_2d} and \textbf{mp\_mul\_2d} functions provide shifting but allow any bit count to +be specified. For example, mp\_div\_2d($x$, $n$, $y$) is the same as $y =\lfloor x / 2^n \rfloor$ while +mp\_mul\_2d($x$, $n$, $y$) is the same as $y = x \cdot 2^n$. The \textbf{mp\_div\_2} and \textbf{mp\_mul\_2} +functions are legacy functions that merely shift right or left one bit respectively. The \textbf{mp\_mod\_2d} function +reduces an integer mod a power of two. For example, mp\_mod\_2d($x$, $n$, $y$) is the same as +$y \equiv x \mbox{ (mod }2^n\mbox{)}$. + +\subsection{Basic Arithmetic} + +Next are the class of functions which provide basic arithmetic. + +\begin{verbatim} +/* compare a to b */ +int mp_cmp(mp_int *a, mp_int *b); + +/* c = a + b */ +int mp_add(mp_int *a, mp_int *b, mp_int *c); + +/* c = a - b */ +int mp_sub(mp_int *a, mp_int *b, mp_int *c); + +/* c = a * b */ +int mp_mul(mp_int *a, mp_int *b, mp_int *c); + +/* b = a^2 */ +int mp_sqr(mp_int *a, mp_int *b); + +/* a/b => cb + d == a */ +int mp_div(mp_int *a, mp_int *b, mp_int *c, mp_int *d); + +/* c == a mod b */ +#define mp_mod(a, b, c) mp_div(a, b, NULL, c) +\end{verbatim} + +The \textbf{mp\_cmp} will compare two integers. It will return \textbf{MP\_LT} if the first parameter is less than +the second, \textbf{MP\_GT} if it is greater or \textbf{MP\_EQ} if they are equal. These constants are the same as from +MPI. + +The \textbf{mp\_add}, \textbf{mp\_sub}, \textbf{mp\_mul}, \textbf{mp\_div}, \textbf{mp\_sqr} and \textbf{mp\_mod} are all +fairly straight forward to understand. Note that in mp\_div either $c$ (the quotient) or $d$ (the remainder) can be +passed as NULL to ignore it. For example, if you only want the quotient $z = \lfloor x/y \rfloor$ then a call such as +mp\_div(\&x, \&y, \&z, NULL) is acceptable. + +There is a related class of ``single digit'' functions that are like the above except they use a digit as the second +operand. + +\begin{verbatim} +/* compare against a single digit */ +int mp_cmp_d(mp_int *a, mp_digit b); + +/* c = a + b */ +int mp_add_d(mp_int *a, mp_digit b, mp_int *c); + +/* c = a - b */ +int mp_sub_d(mp_int *a, mp_digit b, mp_int *c); + +/* c = a * b */ +int mp_mul_d(mp_int *a, mp_digit b, mp_int *c); + +/* a/b => cb + d == a */ +int mp_div_d(mp_int *a, mp_digit b, mp_int *c, mp_digit *d); + +/* c = a mod b */ +#define mp_mod_d(a,b,c) mp_div_d(a, b, NULL, c) +\end{verbatim} + +Note that care should be taken for the value of the digit passed. By default, any 28-bit integer is a valid digit that can +be passed into the function. However, if MP\_8BIT or MP\_16BIT is defined only 7 or 15-bit (respectively) integers +can be passed into it. + +\subsection{Modular Arithmetic} + +There are some trivial modular arithmetic functions. + +\begin{verbatim} +/* d = a + b (mod c) */ +int mp_addmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d); + +/* d = a - b (mod c) */ +int mp_submod(mp_int *a, mp_int *b, mp_int *c, mp_int *d); + +/* d = a * b (mod c) */ +int mp_mulmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d); + +/* c = a * a (mod b) */ +int mp_sqrmod(mp_int *a, mp_int *b, mp_int *c); + +/* c = 1/a (mod b) */ +int mp_invmod(mp_int *a, mp_int *b, mp_int *c); + +/* c = (a, b) */ +int mp_gcd(mp_int *a, mp_int *b, mp_int *c); + +/* c = [a, b] or (a*b)/(a, b) */ +int mp_lcm(mp_int *a, mp_int *b, mp_int *c); + +/* d = a^b (mod c) */ +int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d); +\end{verbatim} + +These are all fairly simple to understand. The \textbf{mp\_invmod} is a modular multiplicative inverse. That is it +stores in the third parameter an integer such that $ac \equiv 1 \mbox{ (mod }b\mbox{)}$ provided such integer exists. If +there is no such integer the function returns \textbf{MP\_VAL}. + +\subsection{Radix Conversions} +To read or store integers in other formats there are the following functions. + +\begin{verbatim} +int mp_unsigned_bin_size(mp_int *a); +int mp_read_unsigned_bin(mp_int *a, unsigned char *b, int c); +int mp_to_unsigned_bin(mp_int *a, unsigned char *b); + +int mp_signed_bin_size(mp_int *a); +int mp_read_signed_bin(mp_int *a, unsigned char *b, int c); +int mp_to_signed_bin(mp_int *a, unsigned char *b); + +int mp_read_radix(mp_int *a, unsigned char *str, int radix); +int mp_toradix(mp_int *a, unsigned char *str, int radix); +\end{verbatim} + +The integers are stored in big endian format as most libraries (and MPI) expect. The \textbf{mp\_read\_radix} and +\textbf{mp\_toradix} functions read and write (respectively) null terminated ASCII strings in a given radix. Valid values +for the radix are between 2 and 64 (inclusively). + +\section{Timing Analysis} +\subsection{Observed Timings} +A simple test program ``demo.c'' was developed which builds with either MPI or LibTomMath (without modification). The +test was conducted on an AMD Athlon XP processor with 266Mhz DDR memory and the GCC 3.2 compiler\footnote{With build +options ``-O3 -fomit-frame-pointer -funroll-loops''}. The multiplications and squarings were repeated 10,000 times +each while the modular exponentiation (exptmod) were performed 10 times each. The RDTSC (Read Time Stamp Counter) instruction +was used to measure the time the entire iterations took and was divided by the number of iterations to get an +average. The following results were observed. + +\begin{small} +\begin{center} +\begin{tabular}{c|c|c|c} +\hline \textbf{Operation} & \textbf{Size (bits)} & \textbf{Time with MPI (cycles)} & \textbf{Time with LibTomMath (cycles)} \\ +\hline +Multiply & 128 & 1,394 & 915 \\ +Multiply & 256 & 2,559 & 1,893 \\ +Multiply & 512 & 7,919 & 3,770 \\ +Multiply & 1024 & 28,460 & 9,970 \\ +Multiply & 2048 & 109,637 & 32,264 \\ +Multiply & 4096 & 467,226 & 129,645 \\ +\hline +Square & 128 & 1,288 & 1,147 \\ +Square & 256 & 1,705 & 2,129 \\ +Square & 512 & 5,365 & 3,755 \\ +Square & 1024 & 18,836 & 9,267 \\ +Square & 2048 & 72,334 & 28,387 \\ +Square & 4096 & 306,252 & 112,391 \\ +\hline +Exptmod & 512 & 30,497,732 & 7,222,872 \\ +Exptmod & 768 & 98,943,020 & 16,474,567 \\ +Exptmod & 1024 & 221,123,749 & 30,070,883 \\ +Exptmod & 2048 & 1,694,796,907 & 154,697,320 \\ +Exptmod & 2560 & 3,262,360,107 & 318,998,183 \\ +Exptmod & 3072 & 5,647,243,373 & 494,313,122 \\ +Exptmod & 4096 & 13,345,194,048 & 1,036,254,558 + +\end{tabular} +\end{center} +\end{small} + +\subsection{Digit Size} +The first major constribution to the time savings is the fact that 28 bits are stored per digit instead of the MPI +defualt of 16. This means in many of the algorithms the savings can be considerable. Consider a baseline multiplier +with a 1024-bit input. With MPI the input would be 64 16-bit digits whereas in LibTomMath it would be 37 28-bit digits. +A savings of $64^2 - 37^2 = 2727$ single precision multiplications. + +\subsection{Multiplication Algorithms} +For most inputs a typical baseline $O(n^2)$ multiplier is used which is similar to that of MPI. There are two variants +of the baseline multiplier. The normal and the fast variants. The normal baseline multiplier is the exact same as the +algorithm from MPI. The fast baseline multiplier is optimized for cases where the number of input digits $N$ is less +than or equal to $2^{w}/\beta^2$. Where $w$ is the number of bits in a \textbf{mp\_word}. By default a mp\_word is +64-bits which means $N \le 256$ is allowed which represents numbers upto $7168$ bits. + +The fast baseline multiplier is optimized by removing the carry operations from the inner loop. This is often referred +to as the ``comba'' method since it computes the products a columns first then figures out the carries. This has the +effect of making a very simple and paralizable inner loop. + +For large inputs, typically 80 digits\footnote{By default that is 2240-bits or more.} or more the Karatsuba method is +used. This method has significant overhead but an asymptotic running time of $O(n^{1.584})$ which means for fairly large +inputs this method is faster. The Karatsuba implementation is recursive which means for extremely large inputs they +will benefit from the algorithm. + +MPI only implements the slower baseline multiplier where carries are dealt with in the inner loop. As a result even at +smaller numbers (below the Karatsuba cutoff) the LibTomCrypt multipliers are faster. + +\subsection{Squaring Algorithms} + +Similar to the multiplication algorithms there are two baseline squaring algorithms. Both have an asymptotic running +time of $O((t^2 + t)/2)$. The normal baseline squaring is the same from MPI and the fast is a ``comba'' squaring +algorithm. The comba method is used if the number of digits $N$ is less than $2^{w-1}/\beta^2$ which by default +covers numbers upto $3584$ bits. + +There is also a Karatsuba squaring method which achieves a running time of $O(n^{1.584})$ after considerably large +inputs. + +MPI only implements the slower baseline squaring algorithm. As a result LibTomMath is considerably faster at squaring +than MPI is. + +\subsection{Exponentiation Algorithms} + +LibTomMath implements a sliding window $k$-ary left to right exponentiation algorithm. For a given exponent size $L$ an +appropriate window size $k$ is chosen. There are always at most $L$ modular squarings and $\lfloor L/k \rfloor$ modular +multiplications. The $k$-ary method works by precomputing values $g(x) = b^x$ for $0 \le x < 2^k$ and a given base +$b$. Then the multiplications are grouped in windows of $k$ bits. The sliding window technique has the benefit +that it can skip multiplications if there are zero bits following or preceding a window. Consider the exponent +$e = 11110001_2$ if $k = 2$ then there will be a two squarings, a multiplication of $g(3)$, two squarings, a multiplication +of $g(3)$, four squarings and and a multiplication by $g(1)$. In total there are 8 squarings and 3 multiplications. + +MPI uses a binary square-multiply method. For the same exponent $e$ it would have had 8 squarings and 5 multiplications. +There is a precomputation phase for the method LibTomCrypt uses but it generally cuts down considerably on the number +of multiplications. Consider a 512-bit exponent. The worst case for the LibTomMath method results in 512 squarings and +124 multiplications. The MPI method would have 512 squarings and 512 multiplications. Randomly every $2k$ bits another +multiplication is saved via the sliding-window technique on top of the savings the $k$-ary method provides. + +Both LibTomMath and MPI use Barrett reduction instead of division to reduce the numbers modulo the modulus given. +However, LibTomMath can take advantage of the fact that the multiplications required within the Barrett reduction +do not to give full precision. As a result the reduction step is much faster and just as accurate. The LibTomMath code +will automatically determine at run-time (e.g. when its called) whether the faster multiplier can be used. The +faster multipliers have also been optimized into the two variants (baseline and fast baseline). + +As a result of all these changes exponentiation in LibTomMath is much faster than compared to MPI. + + + +\end{document} \ No newline at end of file diff --git a/changes.txt b/changes.txt new file mode 100644 index 0000000..83dd60d --- /dev/null +++ b/changes.txt @@ -0,0 +1,6 @@ +Dec 25th,2002 +v0.01 -- Initial release. Gimme a break. + -- Todo list, + add details to manual [e.g. algorithms] + more comments in code + example programs \ No newline at end of file diff --git a/demo.c b/demo.c new file mode 100644 index 0000000..b6f21db --- /dev/null +++ b/demo.c @@ -0,0 +1,238 @@ +#include + +#ifdef U_MPI +#include +#include +#include +#include +#include + #include "mpi.h" +#else + #include "bn.h" +#endif + +#ifdef TIMER +extern unsigned long long rdtsc(void); +extern void reset(void); +#endif + +static void draw(mp_int *a) +{ + char buf[4096]; + int x; + printf("a->used == %d\na->alloc == %d\na->sign == %d\n", a->used, a->alloc, a->sign); + mp_toradix(a, buf, 10); + printf("num == %s\n", buf); + printf("\n"); +} + +int main(void) +{ + mp_int a, b, c, d, e, f; + unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n; + unsigned char cmd[4096], buf[4096]; + int rr; + +#ifdef TIMER + int n; + unsigned long long tt; +#endif + + mp_init(&a); + mp_init(&b); + mp_init(&c); + mp_init(&d); + mp_init(&e); + mp_init(&f); + + +#ifdef TIMER + + mp_read_radix(&a, "340282366920938463463374607431768211455", 10); + while (a.used * DIGIT_BIT < 8192) { + reset(); + for (rr = 0; rr < 10000; rr++) { + mp_mul(&a, &a, &b); + } + tt = rdtsc(); + printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((unsigned long long)10000)); + mp_copy(&b, &a); + } + + + mp_read_radix(&a, "340282366920938463463374607431768211455", 10); + while (a.used * DIGIT_BIT < 8192) { + reset(); + for (rr = 0; rr < 10000; rr++) { + mp_sqr(&a, &b); + } + tt = rdtsc(); + printf("Squaring %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((unsigned long long)10000)); + mp_copy(&b, &a); + } + + { + char *primes[] = { + "17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203", + "2893527720709661239493896562339544088620375736490408468011883030469939904368086092336458298221245707898933583190713188177399401852627749210994595974791782790253946539043962213027074922559572312141181787434278708783207966459019479487", + "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136319", + "47266428956356393164697365098120418976400602706072312735924071745438532218237979333351774907308168340693326687317443721193266215155735814510792148768576498491199122744351399489453533553203833318691678263241941706256996197460424029012419012634671862283532342656309677173602509498417976091509154360039893165037637034737020327399910409885798185771003505320583967737293415979917317338985837385734747478364242020380416892056650841470869294527543597349250299539682430605173321029026555546832473048600327036845781970289288898317888427517364945316709081173840186150794397479045034008257793436817683392375274635794835245695887", + "436463808505957768574894870394349739623346440601945961161254440072143298152040105676491048248110146278752857839930515766167441407021501229924721335644557342265864606569000117714935185566842453630868849121480179691838399545644365571106757731317371758557990781880691336695584799313313687287468894148823761785582982549586183756806449017542622267874275103877481475534991201849912222670102069951687572917937634467778042874315463238062009202992087620963771759666448266532858079402669920025224220613419441069718482837399612644978839925207109870840278194042158748845445131729137117098529028886770063736487420613144045836803985635654192482395882603511950547826439092832800532152534003936926017612446606135655146445620623395788978726744728503058670046885876251527122350275750995227", + "11424167473351836398078306042624362277956429440521137061889702611766348760692206243140413411077394583180726863277012016602279290144126785129569474909173584789822341986742719230331946072730319555984484911716797058875905400999504305877245849119687509023232790273637466821052576859232452982061831009770786031785669030271542286603956118755585683996118896215213488875253101894663403069677745948305893849505434201763745232895780711972432011344857521691017896316861403206449421332243658855453435784006517202894181640562433575390821384210960117518650374602256601091379644034244332285065935413233557998331562749140202965844219336298970011513882564935538704289446968322281451907487362046511461221329799897350993370560697505809686438782036235372137015731304779072430260986460269894522159103008260495503005267165927542949439526272736586626709581721032189532726389643625590680105784844246152702670169304203783072275089194754889511973916207", + "1214855636816562637502584060163403830270705000634713483015101384881871978446801224798536155406895823305035467591632531067547890948695117172076954220727075688048751022421198712032848890056357845974246560748347918630050853933697792254955890439720297560693579400297062396904306270145886830719309296352765295712183040773146419022875165382778007040109957609739589875590885701126197906063620133954893216612678838507540777138437797705602453719559017633986486649523611975865005712371194067612263330335590526176087004421363598470302731349138773205901447704682181517904064735636518462452242791676541725292378925568296858010151852326316777511935037531017413910506921922450666933202278489024521263798482237150056835746454842662048692127173834433089016107854491097456725016327709663199738238442164843147132789153725513257167915555162094970853584447993125488607696008169807374736711297007473812256272245489405898470297178738029484459690836250560495461579533254473316340608217876781986188705928270735695752830825527963838355419762516246028680280988020401914551825487349990306976304093109384451438813251211051597392127491464898797406789175453067960072008590614886532333015881171367104445044718144312416815712216611576221546455968770801413440778423979", + NULL + }; + srand(time(NULL)); + for (n = 0; primes[n]; n++) { + mp_read_radix(&a, primes[n], 10); + mp_zero(&b); + for (rr = 0; rr < mp_count_bits(&a); rr++) { + mp_mul_2d(&b, 1, &b); + b.dp[0] |= (rand()&1); + } + mp_sub_d(&a, 1, &c); + mp_mod(&b, &c, &b); + mp_set(&c, 3); + reset(); + for (rr = 0; rr < 20; rr++) { + mp_exptmod(&c, &b, &a, &d); + } + tt = rdtsc(); + mp_sub_d(&a, 1, &e); + mp_sub(&e, &b, &b); + mp_exptmod(&c, &b, &a, &e); /* c^(p-1-b) mod a */ + mp_mulmod(&e, &d, &a, &d); /* c^b * c^(p-1-b) == c^p-1 == 1 */ + if (mp_cmp_d(&d, 1)) { + printf("Different (%d)!!!\n", mp_count_bits(&a)); + draw(&d); + exit(0); + } + printf("Exponentiating %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((unsigned long long)20)); + } + } + +#endif + + expt_n = lcm_n = gcd_n = add_n = sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = 0; + for (;;) { + printf("add=%7lu sub=%7lu mul=%7lu div=%7lu sqr=%7lu mul2d=%7lu div2d=%7lu gcd=%7lu lcm=%7lu expt=%7lu\r", add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, expt_n); + fgets(cmd, 4095, stdin); + cmd[strlen(cmd)-1] = 0; + printf("%s ]\r",cmd); + if (!strcmp(cmd, "mul2d")) { ++mul2d_n; + fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10); + fgets(buf, 4095, stdin); sscanf(buf, "%d", &rr); + fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10); + + mp_mul_2d(&a, rr, &a); + a.sign = b.sign; + if (mp_cmp(&a, &b) != MP_EQ) { + printf("mul2d failed, rr == %d\n",rr); + draw(&a); + draw(&b); + return 0; + } + } else if (!strcmp(cmd, "div2d")) { ++div2d_n; + fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10); + fgets(buf, 4095, stdin); sscanf(buf, "%d", &rr); + fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10); + + mp_div_2d(&a, rr, &a, &e); + a.sign = b.sign; + if (a.used == b.used && a.used == 0) { a.sign = b.sign = MP_ZPOS; } + if (mp_cmp(&a, &b) != MP_EQ) { + printf("div2d failed, rr == %d\n",rr); + draw(&a); + draw(&b); + return 0; + } + } else if (!strcmp(cmd, "add")) { ++add_n; + fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10); + fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10); + fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 10); + mp_add(&a, &b, &d); + if (mp_cmp(&c, &d) != MP_EQ) { + printf("add %lu failure!\n", add_n); +draw(&a);draw(&b);draw(&c);draw(&d); + return 0; + } + } else if (!strcmp(cmd, "sub")) { ++sub_n; + fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10); + fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10); + fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 10); + mp_sub(&a, &b, &d); + if (mp_cmp(&c, &d) != MP_EQ) { + printf("sub %lu failure!\n", sub_n); +draw(&a);draw(&b);draw(&c);draw(&d); + return 0; + } + } else if (!strcmp(cmd, "mul")) { ++mul_n; + fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10); + fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10); + fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 10); + mp_mul(&a, &b, &d); + if (mp_cmp(&c, &d) != MP_EQ) { + printf("mul %lu failure!\n", mul_n); +draw(&a);draw(&b);draw(&c);draw(&d); + return 0; + } + } else if (!strcmp(cmd, "div")) { ++div_n; + fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10); + fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10); + fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 10); + fgets(buf, 4095, stdin); mp_read_radix(&d, buf, 10); + + mp_div(&a, &b, &e, &f); + if (mp_cmp(&c, &e) != MP_EQ || mp_cmp(&d, &f) != MP_EQ) { + printf("div %lu failure!\n", div_n); +draw(&a);draw(&b);draw(&c);draw(&d); draw(&e); draw(&f); + return 0; + } + + } else if (!strcmp(cmd, "sqr")) { ++sqr_n; + fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10); + fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10); + mp_sqr(&a, &c); + if (mp_cmp(&b, &c) != MP_EQ) { + printf("sqr %lu failure!\n", sqr_n); +draw(&a);draw(&b);draw(&c); + return 0; + } + } else if (!strcmp(cmd, "gcd")) { ++gcd_n; + fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10); + fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10); + fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 10); + mp_gcd(&a, &b, &d); + d.sign = c.sign; + if (mp_cmp(&c, &d) != MP_EQ) { + printf("gcd %lu failure!\n", sqr_n); +draw(&a);draw(&b);draw(&c);draw(&d); + return 0; + } + } else if (!strcmp(cmd, "lcm")) { ++lcm_n; + fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10); + fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10); + fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 10); + mp_lcm(&a, &b, &d); + d.sign = c.sign; + if (mp_cmp(&c, &d) != MP_EQ) { + printf("lcm %lu failure!\n", sqr_n); + draw(&a);draw(&b);draw(&c);draw(&d); + return 0; + } + } else if (!strcmp(cmd, "expt")) { ++expt_n; + fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10); + fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10); + fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 10); + fgets(buf, 4095, stdin); mp_read_radix(&d, buf, 10); + mp_exptmod(&a, &b, &c, &e); + if (mp_cmp(&d, &e) != MP_EQ) { + printf("expt %lu failure!\n", sqr_n); + draw(&a);draw(&b);draw(&c);draw(&d); draw(&e); + return 0; + } + } + } + return 0; +} + diff --git a/makefile b/makefile new file mode 100644 index 0000000..7c85121 --- /dev/null +++ b/makefile @@ -0,0 +1,24 @@ +CC = gcc +CFLAGS += -Wall -W -O3 -funroll-loops + +VERSION=0.01 + +default: test + +test: bn.o demo.o + $(CC) bn.o demo.o -o demo + +docdvi: bn.tex + latex bn + +docs: docdvi + pdflatex bn + rm -f bn.log bn.aux bn.dvi + +clean: + rm -f *.o *.exe mtest/*.exe bn.log bn.aux bn.dvi *.s + +zipup: clean docs + chdir .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \ + cp -R ./libtommath/* ./libtommath-$(VERSION)/ ; tar -c libtommath-$(VERSION)/* > ltm-$(VERSION).tar ; \ + bzip2 -9vv ltm-$(VERSION).tar ; zip -9 -r ltm-$(VERSION).zip libtommath-$(VERSION)/* \ No newline at end of file diff --git a/timer.asm b/timer.asm new file mode 100644 index 0000000..e8b6383 --- /dev/null +++ b/timer.asm @@ -0,0 +1,28 @@ +; Simple RDTSC reader for NASM +; +; build with "nasm -f ___ timer.asm" where ___ is coff or elf [or whatever] +; +; Most *nix installs use elf so it would be "nasm -f elf timer.asm" +; +; Tom St Denis +[bits 32] +[section .data] +timer dd 0, 0 +[section .text] +[global _rdtsc] +_rdtsc: + rdtsc + sub eax,[timer] + sbb edx,[timer+4] + ret + +[global _reset] +_reset: + push eax + push edx + rdtsc + mov [timer],eax + mov [timer+4],edx + pop edx + pop eax + ret \ No newline at end of file