diff --git a/bn.c b/bn.c index 33d027f..040ff1c 100644 --- a/bn.c +++ b/bn.c @@ -796,7 +796,7 @@ int mp_mul_2(mp_int *a, mp_int *b) DECFUNC(); return res; } - b->used = b->alloc; + ++b->used; /* shift any bit count < DIGIT_BIT */ r = 0; @@ -1017,7 +1017,6 @@ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs) */ _W = W + ix; - /* the number of digits is limited by their placement. E.g. we avoid multiplying digits that will end up above the # of digits of precision requested @@ -2840,11 +2839,404 @@ int mp_reduce(mp_int *x, mp_int *m, mp_int *mu) return res; } +/* setups the montgomery reduction stuff */ +int mp_montgomery_setup(mp_int *a, mp_digit *mp) +{ + mp_int t, tt; + int res; + + if ((res = mp_init(&t)) != MP_OKAY) { + return res; + } + + if ((res = mp_init(&tt)) != MP_OKAY) { + goto __T; + } + + /* tt = b */ + tt.dp[0] = 0; + tt.dp[1] = 1; + tt.used = 2; + + /* t = m mod b */ + t.dp[0] = a->dp[0]; + t.used = 1; + + /* t = 1/m mod b */ + if ((res = mp_invmod(&t, &tt, &t)) != MP_OKAY) { + goto __TT; + } + + /* t = -1/m mod b */ + *mp = ((mp_digit)1 << ((mp_digit)DIGIT_BIT)) - t.dp[0]; + + res = MP_OKAY; +__TT: mp_clear(&tt); +__T: mp_clear(&t); + return res; +} + + +/* computes xR^-1 == x (mod N) via Montgomery Reduction (comba) */ +static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp) +{ + int ix, res, olduse; + mp_digit ui; + mp_word W[512]; + + /* get old used count */ + olduse = a->used; + + /* grow a as required */ + if (a->alloc < m->used*2+1) { + if ((res = mp_grow(a, m->used*2+1)) != MP_OKAY) { + return res; + } + } + + /* copy and clear */ + for (ix = 0; ix < a->used; ix++) { + W[ix] = a->dp[ix]; + } + for (; ix < m->used * 2 + 1; ix++) { + W[ix] = 0; + } + + for (ix = 0; ix < m->used; ix++) { + /* ui = ai * m' mod b */ + ui = (((mp_digit)(W[ix] & MP_MASK)) * mp) & MP_MASK; + + /* a = a + ui * m * b^i */ + { + register int iy; + register mp_digit *tmpx; + register mp_word *_W; + + /* aliases */ + tmpx = m->dp; + _W = W + ix; + + for (iy = 0; iy < m->used; iy++) { + *_W++ += ((mp_word)ui) * ((mp_word)*tmpx++); + } + + /* now fix carry for W[ix+1] */ + W[ix+1] += W[ix] >> ((mp_word)DIGIT_BIT); + W[ix] &= ((mp_word)MP_MASK); + } + } + + /* nox fix rest of carries */ + for (; ix <= m->used * 2 + 1; ix++) { + W[ix] += (W[ix-1] >> ((mp_word)DIGIT_BIT)); + W[ix-1] &= ((mp_word)MP_MASK); + } + + /* copy out */ + + /* A = A/b^n */ + for (ix = 0; ix < m->used + 1; ix++) { + a->dp[ix] = W[ix+m->used]; + } + + a->used = m->used + 1; + + /* zero oldused digits */ + for (; ix < olduse; ix++) { + a->dp[ix] = 0; + } + + mp_clamp(a); + + /* if A >= m then A = A - m */ + if (mp_cmp_mag(a, m) != MP_LT) { + if ((res = s_mp_sub(a, m, a)) != MP_OKAY) { + return res; + } + } + + return MP_OKAY; +} + +/* computes xR^-1 == x (mod N) via Montgomery Reduction */ +int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp) +{ + int ix, res, digs; + mp_digit ui; + + REGFUNC("mp_montgomery_reduce"); + VERIFY(a); + VERIFY(m); + + digs = m->used * 2 + 1; + if ((digs < 512) && digs < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT)))) { + res = fast_mp_montgomery_reduce(a, m, mp); + DECFUNC(); + return res; + } + + if (a->alloc < m->used*2+1) { + if ((res = mp_grow(a, m->used*2+1)) != MP_OKAY) { + DECFUNC(); + return res; + } + } + a->used = m->used * 2 + 1; + + for (ix = 0; ix < m->used; ix++) { + /* ui = ai * m' mod b */ + ui = (a->dp[ix] * mp) & MP_MASK; + + /* a = a + ui * m * b^i */ + { + register int iy; + register mp_digit *tmpx, *tmpy, mu; + register mp_word r; + + /* aliases */ + tmpx = m->dp; + tmpy = a->dp + ix; + + mu = 0; + for (iy = 0; iy < m->used; iy++) { + r = ((mp_word)ui) * ((mp_word)*tmpx++) + ((mp_word)mu) + ((mp_word)*tmpy); + mu = (r >> ((mp_word)DIGIT_BIT)); + *tmpy++ = (r & ((mp_word)MP_MASK)); + } + /* propagate carries */ + while (mu) { + *tmpy += mu; + mu = (*tmpy>>DIGIT_BIT)&1; + *tmpy++ &= MP_MASK; + } + } + } + + /* A = A/b^n */ + mp_rshd(a, m->used); + + /* if A >= m then A = A - m */ + if (mp_cmp_mag(a, m) != MP_LT) { + if ((res = s_mp_sub(a, m, a)) != MP_OKAY) { + DECFUNC(); + return res; + } + } + + DECFUNC(); + return MP_OKAY; +} + /* computes Y == G^X mod P, HAC pp.616, Algorithm 14.85 * * Uses a left-to-right k-ary sliding window to compute the modular exponentiation. * The value of k changes based on the size of the exponent. + * + * Uses Montgomery reduction */ +static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y) +{ + mp_int M[64], res; + mp_digit buf, mp; + int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize; + + REGFUNC("mp_exptmod_fast"); + VERIFY(G); + VERIFY(X); + VERIFY(P); + VERIFY(Y); + + /* find window size */ + x = mp_count_bits(X); + if (x <= 18) { winsize = 2; } + else if (x <= 84) { winsize = 3; } + else if (x <= 300) { winsize = 4; } + else if (x <= 930) { winsize = 5; } + else { winsize = 6; } + + /* init G array */ + for (x = 0; x < (1<used)) != MP_OKAY) { + goto __RES; + } + + /* res = R mod m */ + if ((err = mp_mod(&res, P, &res)) != MP_OKAY) { + goto __RES; + } + + /* create M table + * + * The M table contains powers of the input base, e.g. M[x] = G^x mod P + * + * The first half of the table is not computed though accept for M[0] and M[1] + */ + mp_set(&M[0], 1); + if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) { + goto __RES; + } + + /* now set M[1] to G * R mod m */ + if ((err = mp_mulmod(&M[1], &res, P, &M[1])) != MP_OKAY) { + goto __RES; + } + + /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */ + if ((err = mp_copy(&M[1], &M[1<<(winsize-1)])) != MP_OKAY) { + goto __RES; + } + + for (x = 0; x < (winsize-1); x++) { + if ((err = mp_sqr(&M[1<<(winsize-1)], &M[1<<(winsize-1)])) != MP_OKAY) { + goto __RES; + } + if ((err = mp_montgomery_reduce(&M[1<<(winsize-1)], P, mp)) != MP_OKAY) { + goto __RES; + } + } + + /* create upper table */ + for (x = (1<<(winsize-1))+1; x < (1 << winsize); x++) { + if ((err = mp_mul(&M[x-1], &M[1], &M[x])) != MP_OKAY) { + goto __RES; + } + if ((err = mp_montgomery_reduce(&M[x], P, mp)) != MP_OKAY) { + goto __RES; + } + } + + /* set initial mode and bit cnt */ + mode = 0; + bitcnt = 0; + buf = 0; + digidx = X->used - 1; + bitcpy = bitbuf = 0; + + bitcnt = 1; + for (;;) { + /* grab next digit as required */ + if (--bitcnt == 0) { + if (digidx == -1) { + break; + } + buf = X->dp[digidx--]; + bitcnt = (int)DIGIT_BIT; + } + + /* grab the next msb from the exponent */ + y = (buf >> (DIGIT_BIT - 1)) & 1; + buf <<= 1; + + /* if the bit is zero and mode == 0 then we ignore it + * These represent the leading zero bits before the first 1 bit + * in the exponent. Technically this opt is not required but it + * does lower the # of trivial squaring/reductions used + */ + if (mode == 0 && y == 0) continue; + + /* if the bit is zero and mode == 1 then we square */ + if (y == 0 && mode == 1) { + if ((err = mp_sqr(&res, &res)) != MP_OKAY) { + goto __RES; + } + if ((err = mp_montgomery_reduce(&res, P, mp)) != MP_OKAY) { + goto __RES; + } + continue; + } + + /* else we add it to the window */ + bitbuf |= (y<<(winsize-++bitcpy)); + mode = 2; + + if (bitcpy == winsize) { + /* ok window is filled so square as required and multiply multiply */ + /* square first */ + for (x = 0; x < winsize; x++) { + if ((err = mp_sqr(&res, &res)) != MP_OKAY) { + goto __RES; + } + if ((err = mp_montgomery_reduce(&res, P, mp)) != MP_OKAY) { + goto __RES; + } + } + + /* then multiply */ + if ((err = mp_mul(&res, &M[bitbuf], &res)) != MP_OKAY) { + goto __RES; + } + if ((err = mp_montgomery_reduce(&res, P, mp)) != MP_OKAY) { + goto __RES; + } + + /* empty window and reset */ + bitcpy = bitbuf = 0; + mode = 1; + } + } + + /* if bits remain then square/multiply */ + if (mode == 2 && bitcpy > 0) { + /* square then multiply if the bit is set */ + for (x = 0; x < bitcpy; x++) { + if ((err = mp_sqr(&res, &res)) != MP_OKAY) { + goto __RES; + } + if ((err = mp_montgomery_reduce(&res, P, mp)) != MP_OKAY) { + goto __RES; + } + + bitbuf <<= 1; + if ((bitbuf & (1<used > 4 && P->used < MONTGOMERY_EXPT_CUTOFF) { + err = mp_exptmod_fast(G, X, P, Y); + DECFUNC(); + return err; + } + /* find window size */ x = mp_count_bits(X); if (x <= 18) { winsize = 2; } @@ -2918,7 +3317,8 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y) goto __MU; } } - /* init result */ + + /* setup result */ if ((err = mp_init(&res)) != MP_OKAY) { goto __MU; } @@ -3009,10 +3409,10 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y) if ((bitbuf & (1<sign; a->sign = MP_ZPOS; - /* t2 = a */ - if ((res = mp_copy(a, &t2)) != MP_OKAY) { - goto __T3; - } + /* t2 = 2 */ + mp_set(&t2, 2); do { /* t1 = t2 */ @@ -3098,16 +3496,20 @@ int mp_n_root(mp_int *a, mp_digit b, mp_int *c) } } while (mp_cmp(&t1, &t2) != MP_EQ); - /* result can be at most off by one so check */ - if ((res = mp_expt_d(&t1, b, &t2)) != MP_OKAY) { - goto __T3; - } - - if (mp_cmp(&t2, a) == MP_GT) { - if ((res = mp_sub_d(&t1, 1, &t1)) != MP_OKAY) { + /* result can be off by a few so check */ + for (;;) { + if ((res = mp_expt_d(&t1, b, &t2)) != MP_OKAY) { goto __T3; } - } + + if (mp_cmp(&t2, a) == MP_GT) { + if ((res = mp_sub_d(&t1, 1, &t1)) != MP_OKAY) { + goto __T3; + } + } else { + break; + } + } /* reset the sign of a first */ a->sign = neg; @@ -3336,13 +3738,14 @@ int mp_to_signed_bin(mp_int *a, unsigned char *b) /* get the size for an unsigned equivalent */ int mp_unsigned_bin_size(mp_int *a) { - return (mp_count_bits(a)/8 + ((mp_count_bits(a)&7) != 0 ? 1 : 0)); + int size = mp_count_bits(a); + return (size/8 + ((size&7) != 0 ? 1 : 0)); } /* get the size for an signed equivalent */ int mp_signed_bin_size(mp_int *a) { - return 1 + (mp_count_bits(a)/8 + ((mp_count_bits(a)&7) != 0 ? 1 : 0)); + return 1 + mp_unsigned_bin_size(a); } /* read a string [ASCII] in a given radix */ @@ -3431,8 +3834,11 @@ int mp_radix_size(mp_int *a, int radix) mp_int t; mp_digit d; - digs = 0; - + /* special case for binary */ + if (radix == 2) { + return mp_count_bits(a) + (a->sign == MP_NEG ? 1 : 0) + 1; + } + if (radix < 2 || radix > 64) { return 0; } @@ -3441,6 +3847,7 @@ int mp_radix_size(mp_int *a, int radix) return 0; } + digs = 0; if (t.sign == MP_NEG) { ++digs; t.sign = MP_ZPOS; diff --git a/bn.h b/bn.h index 903e7d6..6e7bc85 100644 --- a/bn.h +++ b/bn.h @@ -84,6 +84,7 @@ typedef int mp_err; /* you'll have to tune these... */ #define KARATSUBA_MUL_CUTOFF 80 /* Min. number of digits before Karatsuba multiplication is used. */ #define KARATSUBA_SQR_CUTOFF 80 /* Min. number of digits before Karatsuba squaring is used. */ +#define MONTGOMERY_EXPT_CUTOFF 40 /* max. number of digits that montgomery reductions will help for */ #define MP_PREC 64 /* default digits of precision */ @@ -114,7 +115,7 @@ int mp_shrink(mp_int *a); #define mp_iszero(a) (((a)->used == 0) ? 1 : 0) #define mp_iseven(a) (((a)->used == 0 || (((a)->dp[0] & 1) == 0)) ? 1 : 0) -#define mp_isodd(a) (((a)->used > 0 || (((a)->dp[0] & 1) == 1)) ? 1 : 0) +#define mp_isodd(a) (((a)->used > 0 && (((a)->dp[0] & 1) == 1)) ? 1 : 0) /* set to zero */ void mp_zero(mp_int *a); @@ -256,6 +257,12 @@ int mp_reduce_setup(mp_int *a, mp_int *b); * compute the reduction as -1 * mp_reduce(mp_abs(a)) [pseudo code]. */ int mp_reduce(mp_int *a, mp_int *b, mp_int *c); + +/* setups the montgomery reduction */ +int mp_montgomery_setup(mp_int *a, mp_digit *mp); + +/* computes xR^-1 == x (mod N) via Montgomery Reduction */ +int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp); /* d = a^b (mod c) */ int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d); diff --git a/bn.pdf b/bn.pdf index 3bea8f2..f9c86f2 100644 Binary files a/bn.pdf and b/bn.pdf differ diff --git a/bn.tex b/bn.tex index ed2a46d..d2aab27 100644 --- a/bn.tex +++ b/bn.tex @@ -1,7 +1,7 @@ \documentclass{article} \begin{document} -\title{LibTomMath v0.09 \\ A Free Multiple Precision Integer Library} +\title{LibTomMath v0.10 \\ A Free Multiple Precision Integer Library} \author{Tom St Denis \\ tomstdenis@iahu.ca} \maketitle \newpage @@ -279,6 +279,22 @@ int mp_n_root(mp_int *a, mp_digit b, mp_int *c); /* computes the jacobi c = (a | n) (or Legendre if b is prime) */ int mp_jacobi(mp_int *a, mp_int *n, int *c); +/* used to setup the Barrett reduction for a given modulus b */ +int mp_reduce_setup(mp_int *a, mp_int *b); + +/* Barrett Reduction, computes a (mod b) with a precomputed value c + * + * Assumes that 0 < a <= b^2, note if 0 > a > -(b^2) then you can merely + * compute the reduction as -1 * mp_reduce(mp_abs(a)) [pseudo code]. + */ +int mp_reduce(mp_int *a, mp_int *b, mp_int *c); + +/* setups the montgomery reduction */ +int mp_montgomery_setup(mp_int *a, mp_digit *mp); + +/* computes xR^-1 == x (mod N) via Montgomery Reduction */ +int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp); + /* d = a^b (mod c) */ int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d); \end{verbatim} @@ -451,21 +467,38 @@ by $c$. Since the result of the Jacobi function $\left ( {a \over n} \right ) \ natural to store the result in a simple C style \textbf{int}. If $n$ is prime then the Jacobi function produces the same results as the Legendre function\footnote{Source: Handbook of Applied Cryptography, pp. 73}. This means if $n$ is prime then $\left ( {a \over n} \right )$ is equal to $1$ if $a$ is a quadratic residue modulo $n$ or $-1$ if -it is not. +it is not. \subsubsection{mp\_exptmod(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)} Computes $d = a^b \mbox{ (mod }c\mbox{)}$ using a sliding window $k$-ary exponentiation algorithm. For an $\alpha$-bit exponent it performs $\alpha$ squarings and at most $\lfloor \alpha/k \rfloor + k$ multiplications. The value of $k$ is -chosen to minimize the number of multiplications required for a given value of $\alpha$. Barrett reductions are used -to reduce the squared or multiplied temporary results modulo $c$. A Barrett reduction requires one division that is -performed only and two half multipliers of $N$ digit numbers resulting in approximation $O((N^2)/2)$ work. +chosen to minimize the number of multiplications required for a given value of $\alpha$. Barrett or Montgomery +reductions are used to reduce the squared or multiplied temporary results modulo $c$. -Let $\gamma = \lfloor \alpha/k \rfloor + k$ represent the total multiplications. The total work of a exponentiation is -therefore, $O(3 \cdot N^2 + (\alpha + \gamma) \cdot ((N^2)/2) + \alpha \cdot ((N^2 + N)/2) + \gamma \cdot N^2)$ which -simplies to $O(3 \cdot N^2 + \gamma N^2 + \alpha (N^2 + (N/2)))$. The bulk of the time is spent in the Barrett -reductions and the squaring algorithms. Since $\gamma < \alpha$ it makes sense to optimize first the Barrett and -squaring routines first. Significant improvements in the future will most likely stem from optimizing these instead -of optimizing the multipliers. +\subsection{Fast Modular Reductions} + +\subsubsection{mp\_reduce(mp\_int *a, mp\_int *b, mp\_int *c)} +Computes a Barrett reduction in-place of $a$ modulo $b$ with respect to $c$. In essence it computes +$a \equiv a \mbox{ (mod }b\mbox{)}$ provided $0 \le a \le b^2$. The value of $c$ is precomputed with the +function mp\_reduce\_setup(). + +The Barrett reduction function has been optimized to use partial multipliers which means compared to MPI it performs +have the number of single precision multipliers (\textit{provided they have the same size digits}). The partial +multipliers (\textit{one of which is shared with mp\_mul}) both have baseline and comba variants. Barrett reduction +can reduce a number modulo a $n-$digit modulus with approximately $2n^2$ single precision multiplications. + +\subsubsection{mp\_montgomery\_reduce(mp\_int *a, mp\_int *m, mp\_digit mp)} +Computes a Montgomery reduction in-place of $a$ modulo $b$ with respect to $mp$. If $b$ is some $n-$digit modulus then +$R = \beta^{n+1}$. The result of this function is $aR^{-1} \mbox{ (mod }b\mbox{)}$ provided that $0 \le a \le b^2$. +The value of $mp$ is precomputed with the function mp\_montgomery\_setup(). + +The Montgomery reduction comes in two variants. A standard baseline and a fast comba method. The baseline routine +is in fact slower than the Barrett reductions, however, the comba routine is much faster. Montomgery reduction can +reduce a number modulo a $n-$digit modulus with approximately $n^2 + n$ single precision multiplications. + +Note that the final result of a Montgomery reduction is not just the value reduced modulo $b$. You have to multiply +by $R$ modulo $b$ to get the real result. At first that may not seem like such a worthwhile routine, however, the +exptmod function can be made to take advantage of this such that only one normalization at the end is required. \section{Timing Analysis} \subsection{Observed Timings} @@ -503,9 +536,9 @@ Square & 1024 & 18,991 & 5,733 \\ Square & 2048 & 72,126 & 17,621 \\ Square & 4096 & 306,269 & 67,576 \\ \hline -Exptmod & 512 & 32,021,586 & 4,138,354 \\ -Exptmod & 768 & 97,595,492 & 9,840,233 \\ -Exptmod & 1024 & 223,302,532 & 20,624,553 \\ +Exptmod & 512 & 32,021,586 & 3,118,435 \\ +Exptmod & 768 & 97,595,492 & 8,493,633 \\ +Exptmod & 1024 & 223,302,532 & 17,715,899 \\ Exptmod & 2048 & 1,682,223,369 & 114,936,361 \\ Exptmod & 2560 & 3,268,615,571 & 229,402,426 \\ Exptmod & 3072 & 5,597,240,141 & 367,403,840 \\ diff --git a/changes.txt b/changes.txt index 2d84db9..d302ee6 100644 --- a/changes.txt +++ b/changes.txt @@ -1,3 +1,8 @@ +Jan 9th, 2003 +v0.10 -- Pekka Riikonen suggested fixes to the radix conversion code. + -- Added baseline montgomery and comba montgomery reductions, sped up exptmods + [to a point, see bn.h for MONTGOMERY_EXPT_CUTOFF] + Jan 6th, 2003 v0.09 -- Updated the manual to reflect recent changes. :-) -- Added Jacobi function (mp_jacobi) to supplement the number theory side of the lib diff --git a/demo.c b/demo.c index f671758..ab92707 100644 --- a/demo.c +++ b/demo.c @@ -21,12 +21,15 @@ #define TIMER extern ulong64 rdtsc(void); extern void reset(void); -#else +#endif +#ifdef TIMER +#ifndef TIMER_X86 ulong64 _tt; void reset(void) { _tt = clock(); } ulong64 rdtsc(void) { return clock() - _tt; } #endif +#endif #ifndef DEBUG int _ifuncs; @@ -82,6 +85,7 @@ int main(void) mp_int a, b, c, d, e, f; unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n; int rr; + mp_digit tom; #ifdef TIMER int n; @@ -94,29 +98,38 @@ int main(void) mp_init(&d); mp_init(&e); mp_init(&f); - + + mp_read_radix(&a, "59994534535345535344389423", 10); + mp_read_radix(&b, "49993453555234234565675534", 10); + mp_read_radix(&c, "62398923474472948723847281", 10); + + mp_mulmod(&a, &b, &c, &f); + + /* setup mont */ + mp_montgomery_setup(&c, &tom); + mp_mul(&a, &b, &a); + mp_montgomery_reduce(&a, &c, tom); + mp_montgomery_reduce(&a, &c, tom); + mp_lshd(&a, c.used*2); + mp_mod(&a, &c, &a); + + mp_toradix(&a, cmd, 10); + printf("%s\n\n", cmd); + mp_toradix(&f, cmd, 10); + printf("%s\n", cmd); + +/* return 0; */ + + mp_read_radix(&a, "V//////////////////////////////////////////////////////////////////////////////////////", 64); mp_reduce_setup(&b, &a); printf("\n\n----\n\n"); mp_toradix(&b, buf, 10); printf("b == %s\n\n\n", buf); - + mp_read_radix(&b, "4982748972349724892742", 10); mp_sub_d(&a, 1, &c); - -#ifdef DEBUG - mp_sqr(&a, &a);mp_sqr(&c, &c); - mp_sqr(&a, &a);mp_sqr(&c, &c); - mp_sqr(&a, &a);mp_sqr(&c, &c); - reset_timings(); -#endif mp_exptmod(&b, &c, &a, &d); -#ifdef DEBUG - dump_timings(); - return 0; - -#endif - mp_toradix(&d, buf, 10); printf("b^p-1 == %s\n", buf); @@ -169,7 +182,7 @@ int main(void) printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100000)); mp_copy(&b, &a); } - + { char *primes[] = { "17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203", diff --git a/etc/pprime.c b/etc/pprime.c index fb987e3..4943088 100644 --- a/etc/pprime.c +++ b/etc/pprime.c @@ -85,10 +85,11 @@ static mp_digit prime_digit() } /* makes a prime of at least k bits */ -int pprime(int k, mp_int *p, mp_int *q) +int pprime(int k, int li, mp_int *p, mp_int *q) { mp_int a, b, c, n, x, y, z, v; - int res; + int res, ii; + static const mp_digit bases[] = { 2, 3, 5, 7, 11, 13, 17, 19 }; /* single digit ? */ if (k <= (int)DIGIT_BIT) { @@ -167,55 +168,60 @@ int pprime(int k, mp_int *p, mp_int *q) if (mp_cmp_d(&y, 1) != MP_EQ) goto top; - /* now try base x=2 */ - mp_set(&x, 2); + /* now try base x=bases[ii] */ + for (ii = 0; ii < li; ii++) { + mp_set(&x, bases[ii]); - /* compute x^a mod n */ - if ((res = mp_exptmod(&x, &a, &n, &y)) != MP_OKAY) { /* y = x^a mod n */ - goto __Z; - } + /* compute x^a mod n */ + if ((res = mp_exptmod(&x, &a, &n, &y)) != MP_OKAY) { /* y = x^a mod n */ + goto __Z; + } - /* if y == 1 loop */ - if (mp_cmp_d(&y, 1) == MP_EQ) goto top; + /* if y == 1 loop */ + if (mp_cmp_d(&y, 1) == MP_EQ) continue; - /* now x^2a mod n */ - if ((res = mp_sqrmod(&y, &n, &y)) != MP_OKAY) { /* y = x^2a mod n */ - goto __Z; - } + /* now x^2a mod n */ + if ((res = mp_sqrmod(&y, &n, &y)) != MP_OKAY) { /* y = x^2a mod n */ + goto __Z; + } - if (mp_cmp_d(&y, 1) == MP_EQ) goto top; + if (mp_cmp_d(&y, 1) == MP_EQ) continue; - /* compute x^b mod n */ - if ((res = mp_exptmod(&x, &b, &n, &y)) != MP_OKAY) { /* y = x^b mod n */ - goto __Z; - } + /* compute x^b mod n */ + if ((res = mp_exptmod(&x, &b, &n, &y)) != MP_OKAY) { /* y = x^b mod n */ + goto __Z; + } - /* if y == 1 loop */ - if (mp_cmp_d(&y, 1) == MP_EQ) goto top; + /* if y == 1 loop */ + if (mp_cmp_d(&y, 1) == MP_EQ) continue; - /* now x^2b mod n */ - if ((res = mp_sqrmod(&y, &n, &y)) != MP_OKAY) { /* y = x^2b mod n */ - goto __Z; - } + /* now x^2b mod n */ + if ((res = mp_sqrmod(&y, &n, &y)) != MP_OKAY) { /* y = x^2b mod n */ + goto __Z; + } - if (mp_cmp_d(&y, 1) == MP_EQ) goto top; + if (mp_cmp_d(&y, 1) == MP_EQ) continue; + /* compute x^c mod n == x^ab mod n */ + if ((res = mp_exptmod(&x, &c, &n, &y)) != MP_OKAY) { /* y = x^ab mod n */ + goto __Z; + } - /* compute x^c mod n == x^ab mod n */ - if ((res = mp_exptmod(&x, &c, &n, &y)) != MP_OKAY) { /* y = x^ab mod n */ - goto __Z; + /* if y == 1 loop */ + if (mp_cmp_d(&y, 1) == MP_EQ) continue; + + /* now compute (x^c mod n)^2 */ + if ((res = mp_sqrmod(&y, &n, &y)) != MP_OKAY) { /* y = x^2ab mod n */ + goto __Z; + } + + /* y should be 1 */ + if (mp_cmp_d(&y, 1) != MP_EQ) continue; + break; } - /* if y == 1 loop */ - if (mp_cmp_d(&y, 1) == MP_EQ) goto top; - - /* now compute (x^c mod n)^2 */ - if ((res = mp_sqrmod(&y, &n, &y)) != MP_OKAY) { /* y = x^2ab mod n */ - goto __Z; - } - - /* y should be 1 */ - if (mp_cmp_d(&y, 1) != MP_EQ) goto top; + /* no bases worked? */ + if (ii == li) goto top; /* { @@ -232,10 +238,14 @@ int pprime(int k, mp_int *p, mp_int *q) */ /* a = n */ mp_copy(&n, &a); - } + } + + /* get q to be the order of the large prime subgroup */ + mp_sub_d(&n, 1, q); + mp_div_2(q, q); + mp_div(q, &b, q, NULL); mp_exch(&n, p); - mp_exch(&b, q); res = MP_OKAY; __Z: mp_clear(&z); @@ -254,19 +264,25 @@ int main(void) { mp_int p, q; char buf[4096]; - int k; + int k, li; clock_t t1; srand(time(NULL)); printf("Enter # of bits: \n"); - scanf("%d", &k); + fgets(buf, sizeof(buf), stdin); + sscanf(buf, "%d", &k); + + printf("Enter number of bases to try (1 to 8):\n"); + fgets(buf, sizeof(buf), stdin); + sscanf(buf, "%d", &li); + mp_init(&p); mp_init(&q); t1 = clock(); - pprime(k, &p, &q); + pprime(k, li, &p, &q); t1 = clock() - t1; printf("\n\nTook %ld ticks, %d bits\n", t1, mp_count_bits(&p)); diff --git a/makefile b/makefile index d448933..edaf773 100644 --- a/makefile +++ b/makefile @@ -1,7 +1,7 @@ CC = gcc CFLAGS += -Wall -W -Wshadow -ansi -O3 -fomit-frame-pointer -funroll-loops -VERSION=0.09 +VERSION=0.10 default: test diff --git a/poly.c b/poly.c new file mode 100644 index 0000000..0c85897 --- /dev/null +++ b/poly.c @@ -0,0 +1,302 @@ +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * This file "poly.c" provides GF(p^k) functionality on top of the + * libtommath library. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca + */ +#include "poly.h" + +#undef MIN +#define MIN(x,y) ((x)<(y)?(x):(y)) +#undef MAX +#define MAX(x,y) ((x)>(y)?(x):(y)) + +static void s_free(mp_poly *a) +{ + int k; + for (k = 0; k < a->alloc; k++) { + mp_clear(&(a->co[k])); + } +} + +static int s_setup(mp_poly *a, int low, int high) +{ + int res, k, j; + for (k = low; k < high; k++) { + if ((res = mp_init(&(a->co[k]))) != MP_OKAY) { + for (j = low; j < k; j++) { + mp_clear(&(a->co[j])); + } + return MP_MEM; + } + } + return MP_OKAY; +} + +int mp_poly_init(mp_poly *a, mp_int *cha) +{ + return mp_poly_init_size(a, cha, MP_POLY_PREC); +} + +/* init a poly of a given (size) degree */ +int mp_poly_init_size(mp_poly *a, mp_int *cha, int size) +{ + int res; + + /* allocate array of mp_ints for coefficients */ + a->co = malloc(size * sizeof(mp_int)); + if (a->co == NULL) { + return MP_MEM; + } + a->used = 0; + a->alloc = size; + + /* now init the range */ + if ((res = s_setup(a, 0, size)) != MP_OKAY) { + free(a->co); + return res; + } + + /* copy characteristic */ + if ((res = mp_init_copy(&(a->cha), cha)) != MP_OKAY) { + s_free(a); + free(a->co); + return res; + } + + /* return ok at this point */ + return MP_OKAY; +} + +/* grow the size of a poly */ +static int mp_poly_grow(mp_poly *a, int size) +{ + int res; + + if (size > a->alloc) { + /* resize the array of coefficients */ + a->co = realloc(a->co, sizeof(mp_int) * size); + if (a->co == NULL) { + return MP_MEM; + } + + /* now setup the coefficients */ + if ((res = s_setup(a, a->alloc, a->alloc + size)) != MP_OKAY) { + return res; + } + + a->alloc += size; + } + return MP_OKAY; +} + +/* copy, b = a */ +int mp_poly_copy(mp_poly *a, mp_poly *b) +{ + int res, k; + + /* resize b */ + if ((res = mp_poly_grow(b, a->used)) != MP_OKAY) { + return res; + } + + /* now copy the used part */ + b->used = a->used; + + /* now the cha */ + if ((res = mp_copy(&(a->cha), &(b->cha))) != MP_OKAY) { + return res; + } + + /* now all the coefficients */ + for (k = 0; k < b->used; k++) { + if ((res = mp_copy(&(a->co[k]), &(b->co[k]))) != MP_OKAY) { + return res; + } + } + + /* now zero the top */ + for (k = b->used; k < b->alloc; k++) { + mp_zero(&(b->co[k])); + } + + return MP_OKAY; +} + +/* init from a copy, a = b */ +int mp_poly_init_copy(mp_poly *a, mp_poly *b) +{ + int res; + + if ((res = mp_poly_init(a, &(b->cha))) != MP_OKAY) { + return res; + } + return mp_poly_copy(b, a); +} + +/* free a poly from ram */ +void mp_poly_clear(mp_poly *a) +{ + s_free(a); + mp_clear(&(a->cha)); + free(a->co); + + a->co = NULL; + a->used = a->alloc = 0; +} + +/* exchange two polys */ +void mp_poly_exch(mp_poly *a, mp_poly *b) +{ + mp_poly t; + t = *a; *a = *b; *b = t; +} + +/* clamp the # of used digits */ +static void mp_poly_clamp(mp_poly *a) +{ + while (a->used > 0 && mp_cmp_d(&(a->co[a->used]), 0) == MP_EQ) --(a->used); +} + +/* add two polynomials, c(x) = a(x) + b(x) */ +int mp_poly_add(mp_poly *a, mp_poly *b, mp_poly *c) +{ + mp_poly t, *x, *y; + int res, k; + + /* ensure char's are the same */ + if (mp_cmp(&(a->cha), &(b->cha)) != MP_EQ) { + return MP_VAL; + } + + /* now figure out the sizes such that x is the + largest degree poly and y is less or equal in degree + */ + if (a->used > b->used) { + x = a; + y = b; + } else { + x = b; + y = a; + } + + /* now init the result to be a copy of the largest */ + if ((res = mp_poly_init_copy(&t, x)) != MP_OKAY) { + return res; + } + + /* now add the coeffcients of the smaller one */ + for (k = 0; k < y->used; k++) { + if ((res = mp_addmod(&(a->co[k]), &(b->co[k]), &(a->cha), &(t.co[k]))) != MP_OKAY) { + goto __T; + } + } + + mp_poly_clamp(&t); + mp_poly_exch(&t, c); + res = MP_OKAY; + +__T: mp_poly_clear(&t); + return res; +} + +/* subtracts two polynomials, c(x) = a(x) - b(x) */ +int mp_poly_sub(mp_poly *a, mp_poly *b, mp_poly *c) +{ + mp_poly t, *x, *y; + int res, k; + + /* ensure char's are the same */ + if (mp_cmp(&(a->cha), &(b->cha)) != MP_EQ) { + return MP_VAL; + } + + /* now figure out the sizes such that x is the + largest degree poly and y is less or equal in degree + */ + if (a->used > b->used) { + x = a; + y = b; + } else { + x = b; + y = a; + } + + /* now init the result to be a copy of the largest */ + if ((res = mp_poly_init_copy(&t, x)) != MP_OKAY) { + return res; + } + + /* now add the coeffcients of the smaller one */ + for (k = 0; k < y->used; k++) { + if ((res = mp_submod(&(a->co[k]), &(b->co[k]), &(a->cha), &(t.co[k]))) != MP_OKAY) { + goto __T; + } + } + + mp_poly_clamp(&t); + mp_poly_exch(&t, c); + res = MP_OKAY; + +__T: mp_poly_clear(&t); + return res; +} + +/* multiplies two polynomials, c(x) = a(x) * b(x) */ +int mp_poly_mul(mp_poly *a, mp_poly *b, mp_poly *c) +{ + mp_poly t; + mp_int tt; + int res, pa, pb, ix, iy; + + /* ensure char's are the same */ + if (mp_cmp(&(a->cha), &(b->cha)) != MP_EQ) { + return MP_VAL; + } + + /* degrees of a and b */ + pa = a->used; + pb = b->used; + + /* now init the temp polynomial to be of degree pa+pb */ + if ((res = mp_poly_init_size(&t, &(a->cha), pa+pb)) != MP_OKAY) { + return res; + } + + /* now init our temp int */ + if ((res = mp_init(&tt)) != MP_OKAY) { + goto __T; + } + + /* now loop through all the digits */ + for (ix = 0; ix < pa; ix++) { + for (iy = 0; iy < pb; iy++) { + if ((res = mp_mul(&(a->co[ix]), &(b->co[iy]), &tt)) != MP_OKAY) { + goto __TT; + } + if ((res = mp_addmod(&tt, &(t.co[ix+iy]), &(a->cha), &(t.co[ix+iy]))) != MP_OKAY) { + goto __TT; + } + } + } + + mp_poly_clamp(&t); + mp_poly_exch(&t, c); + res = MP_OKAY; + +__TT: mp_clear(&tt); +__T: mp_poly_clear(&t); + return res; +} + diff --git a/poly.h b/poly.h new file mode 100644 index 0000000..f2e3212 --- /dev/null +++ b/poly.h @@ -0,0 +1,73 @@ +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is library that provides for multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * This file "poly.h" provides GF(p^k) functionality on top of the + * libtommath library. + * + * The library is designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca + */ + +#ifndef POLY_H_ +#define POLY_H_ + +#include "bn.h" + +/* a mp_poly is basically a derived "class" of a mp_int + * it uses the same technique of growing arrays via + * used/alloc parameters except the base unit or "digit" + * is in fact a mp_int. These hold the coefficients + * of the polynomial + */ +typedef struct { + int used, /* coefficients used */ + alloc; /* coefficients allocated (and initialized) */ + mp_int *co, /* coefficients */ + cha; /* characteristic */ + +} mp_poly; + + +#define MP_POLY_PREC 16 /* default coefficients allocated */ + +/* init a poly */ +int mp_poly_init(mp_poly *a, mp_int *cha); + +/* init a poly of a given (size) degree */ +int mp_poly_init_size(mp_poly *a, mp_int *cha, int size); + +/* copy, b = a */ +int mp_poly_copy(mp_poly *a, mp_poly *b); + +/* init from a copy, a = b */ +int mp_poly_init_copy(mp_poly *a, mp_poly *b); + +/* free a poly from ram */ +void mp_poly_clear(mp_poly *a); + +/* exchange two polys */ +void mp_poly_exch(mp_poly *a, mp_poly *b); + +/* ---> Basic Arithmetic <--- */ + +/* add two polynomials, c(x) = a(x) + b(x) */ +int mp_poly_add(mp_poly *a, mp_poly *b, mp_poly *c); + +/* subtracts two polynomials, c(x) = a(x) - b(x) */ +int mp_poly_sub(mp_poly *a, mp_poly *b, mp_poly *c); + +/* multiplies two polynomials, c(x) = a(x) * b(x) */ +int mp_poly_mul(mp_poly *a, mp_poly *b, mp_poly *c); + + + +#endif +