diff --git a/b.bat b/b.bat index 38c1120..1d0b900 100644 --- a/b.bat +++ b/b.bat @@ -1,3 +1,3 @@ nasm -f coff timer.asm -gcc -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c bn.c timer.o -o demo +gcc -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c bn.c timer.o -o ltmdemo gcc -I./mtest/ -DU_MPI -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c mtest/mpi.c timer.o -o mpidemo diff --git a/bn.c b/bn.c index 4a0f4f9..cf8a391 100644 --- a/bn.c +++ b/bn.c @@ -700,8 +700,8 @@ int mp_mul_2(mp_int *a, mp_int *b) /* low level addition */ static int s_mp_add(mp_int *a, mp_int *b, mp_int *c) { - mp_int t, *x; - int res, min, max, i; + mp_int *x; + int olduse, res, min, max, i; mp_digit u; REGFUNC("s_mp_add"); @@ -724,45 +724,52 @@ static int s_mp_add(mp_int *a, mp_int *b, mp_int *c) } /* init result */ - if ((res = mp_init_size(&t, max+1)) != MP_OKAY) { - DECFUNC(); - return res; + if (c->alloc < max+1) { + if ((res = mp_grow(c, max+1)) != MP_OKAY) { + DECFUNC(); + return res; + } } - t.used = max+1; + + olduse = c->used; + c->used = max + 1; /* add digits from lower part */ u = 0; for (i = 0; i < min; i++) { /* T[i] = A[i] + B[i] + U */ - t.dp[i] = a->dp[i] + b->dp[i] + u; + c->dp[i] = a->dp[i] + b->dp[i] + u; /* U = carry bit of T[i] */ - u = (t.dp[i] >> DIGIT_BIT) & 1; + u = (c->dp[i] >> DIGIT_BIT) & 1; /* take away carry bit from T[i] */ - t.dp[i] &= MP_MASK; + c->dp[i] &= MP_MASK; } /* now copy higher words if any, that is in A+B if A or B has more digits add those in */ if (min != max) { for (; i < max; i++) { /* T[i] = X[i] + U */ - t.dp[i] = x->dp[i] + u; + c->dp[i] = x->dp[i] + u; /* U = carry bit of T[i] */ - u = (t.dp[i] >> DIGIT_BIT) & 1; + u = (c->dp[i] >> DIGIT_BIT) & 1; /* take away carry bit from T[i] */ - t.dp[i] &= MP_MASK; + c->dp[i] &= MP_MASK; } } /* add carry */ - t.dp[i] = u; - - mp_clamp(&t); - mp_exch(&t, c); - mp_clear(&t); + c->dp[i] = u; + + /* clear digits above used (since we may not have grown result above) */ + for (i = c->used; i < olduse; i++) { + c->dp[i] = 0; + } + + mp_clamp(c); DECFUNC(); return MP_OKAY; } @@ -770,8 +777,7 @@ static int s_mp_add(mp_int *a, mp_int *b, mp_int *c) /* low level subtraction (assumes a > b) */ static int s_mp_sub(mp_int *a, mp_int *b, mp_int *c) { - mp_int t; - int res, min, max, i; + int olduse, res, min, max, i; mp_digit u; REGFUNC("s_mp_sub"); @@ -784,42 +790,48 @@ static int s_mp_sub(mp_int *a, mp_int *b, mp_int *c) max = a->used; /* init result */ - if ((res = mp_init_size(&t, max)) != MP_OKAY) { - DECFUNC(); - return res; + if (c->alloc < max) { + if ((res = mp_grow(c, max)) != MP_OKAY) { + DECFUNC(); + return res; + } } - t.used = max; + olduse = c->used; + c->used = max; /* sub digits from lower part */ u = 0; for (i = 0; i < min; i++) { /* T[i] = A[i] - B[i] - U */ - t.dp[i] = a->dp[i] - (b->dp[i] + u); + c->dp[i] = a->dp[i] - (b->dp[i] + u); /* U = carry bit of T[i] */ - u = (t.dp[i] >> DIGIT_BIT) & 1; + u = (c->dp[i] >> DIGIT_BIT) & 1; /* Clear carry from T[i] */ - t.dp[i] &= MP_MASK; + c->dp[i] &= MP_MASK; } /* now copy higher words if any, e.g. if A has more digits than B */ if (min != max) { for (; i < max; i++) { /* T[i] = A[i] - U */ - t.dp[i] = a->dp[i] - u; + c->dp[i] = a->dp[i] - u; /* U = carry bit of T[i] */ - u = (t.dp[i] >> DIGIT_BIT) & 1; + u = (c->dp[i] >> DIGIT_BIT) & 1; /* Clear carry from T[i] */ - t.dp[i] &= MP_MASK; + c->dp[i] &= MP_MASK; } } - mp_clamp(&t); - mp_exch(&t, c); - mp_clear(&t); + /* clear digits above used (since we may not have grown result above) */ + for (i = c->used; i < olduse; i++) { + c->dp[i] = 0; + } + + mp_clamp(c); DECFUNC(); return MP_OKAY; } @@ -2941,7 +2953,7 @@ int mp_toradix(mp_int *a, char *str, int radix) int res, digs; mp_int t; mp_digit d; - unsigned char *_s = str; + char *_s = str; if (radix < 2 || radix > 64) { return MP_VAL; @@ -2966,7 +2978,7 @@ int mp_toradix(mp_int *a, char *str, int radix) *str++ = s_rmap[d]; ++digs; } - reverse(_s, digs); + reverse((unsigned char *)_s, digs); *str++ = '\0'; mp_clear(&t); return MP_OKAY; diff --git a/bn.h b/bn.h index a87de8a..624c50d 100644 --- a/bn.h +++ b/bn.h @@ -12,7 +12,6 @@ * * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca */ - #ifndef BN_H_ #define BN_H_ @@ -39,13 +38,18 @@ #else #ifndef CRYPT #ifdef _MSC_VER - typedef __int64 ulong64; + typedef unsigned __int64 ulong64; + typedef signed __int64 long64; #else typedef unsigned long long ulong64; + typedef signed long long long64; #endif #endif + + /* default case */ typedef unsigned long mp_digit; typedef ulong64 mp_word; + #define DIGIT_BIT 28U #endif @@ -72,8 +76,14 @@ typedef int mp_err; -#define KARATSUBA_MUL_CUTOFF 80 /* Min. number of digits before Karatsuba multiplication is used. */ -#define KARATSUBA_SQR_CUTOFF 80 /* Min. number of digits before Karatsuba squaring is used. */ +/* you'll have to tune these... */ +#ifdef FAST_FPU + #define KARATSUBA_MUL_CUTOFF 100 /* Min. number of digits before Karatsuba multiplication is used. */ + #define KARATSUBA_SQR_CUTOFF 100 /* Min. number of digits before Karatsuba squaring is used. */ +#else + #define KARATSUBA_MUL_CUTOFF 80 /* Min. number of digits before Karatsuba multiplication is used. */ + #define KARATSUBA_SQR_CUTOFF 80 /* Min. number of digits before Karatsuba squaring is used. */ +#endif typedef struct { int used, alloc, sign; diff --git a/bn.pdf b/bn.pdf index 34156bc..d517dc4 100644 Binary files a/bn.pdf and b/bn.pdf differ diff --git a/bn.tex b/bn.tex index 9808a73..069f76b 100644 --- a/bn.tex +++ b/bn.tex @@ -1,7 +1,7 @@ \documentclass{article} \begin{document} -\title{LibTomMath v0.05 \\ A Free Multiple Precision Integer Library} +\title{LibTomMath v0.06 \\ A Free Multiple Precision Integer Library} \author{Tom St Denis \\ tomstdenis@iahu.ca} \maketitle \newpage @@ -460,34 +460,34 @@ were observed. \begin{tabular}{c|c|c|c} \hline \textbf{Operation} & \textbf{Size (bits)} & \textbf{Time with MPI (cycles)} & \textbf{Time with LibTomMath (cycles)} \\ \hline -Inversion & 128 & 264,083 & 159,194 \\ -Inversion & 256 & 549,370 & 355,914 \\ -Inversion & 512 & 1,675,975 & 842,538 \\ -Inversion & 1024 & 5,237,957 & 2,170,804 \\ -Inversion & 2048 & 17,871,944 & 6,250,876 \\ -Inversion & 4096 & 66,610,468 & 18,161,612 \\ +Inversion & 128 & 264,083 & 59,782 \\ +Inversion & 256 & 549,370 & 146,915 \\ +Inversion & 512 & 1,675,975 & 367,172 \\ +Inversion & 1024 & 5,237,957 & 1,054,158 \\ +Inversion & 2048 & 17,871,944 & 3,459,683 \\ +Inversion & 4096 & 66,610,468 & 11,834,556 \\ \hline Multiply & 128 & 1,426 & 828 \\ Multiply & 256 & 2,551 & 1,393 \\ Multiply & 512 & 7,913 & 2,926 \\ Multiply & 1024 & 28,496 & 8,620 \\ Multiply & 2048 & 109,897 & 28,967 \\ -Multiply & 4096 & 469,970 & 106,855 \\ +Multiply & 4096 & 469,970 & 105,387 \\ \hline Square & 128 & 1,319 & 869 \\ Square & 256 & 1,776 & 1,362 \\ Square & 512 & 5,399 & 2,571 \\ Square & 1024 & 18,991 & 6,332 \\ Square & 2048 & 72,126 & 18,426 \\ -Square & 4096 & 306,269 & 76,305 \\ +Square & 4096 & 306,269 & 74,908 \\ \hline -Exptmod & 512 & 32,021,586 & 5,709,468 \\ -Exptmod & 768 & 97,595,492 & 12,473,526 \\ -Exptmod & 1024 & 223,302,532 & 23,593,075 \\ -Exptmod & 2048 & 1,682,223,369 & 121,992,481 \\ -Exptmod & 2560 & 3,268,615,571 & 258,155,605 \\ -Exptmod & 3072 & 5,597,240,141 & 399,800,504 \\ -Exptmod & 4096 & 13,347,270,891 & 826,013,375 +Exptmod & 512 & 32,021,586 & 5,696,459 \\ +Exptmod & 768 & 97,595,492 & 12,428,274 \\ +Exptmod & 1024 & 223,302,532 & 22,834,316 \\ +Exptmod & 2048 & 1,682,223,369 & 119,888,049 \\ +Exptmod & 2560 & 3,268,615,571 & 250,901,263 \\ +Exptmod & 3072 & 5,597,240,141 & 391,716,431 \\ +Exptmod & 4096 & 13,347,270,891 & 814,429,647 \end{tabular} \end{center} diff --git a/changes.txt b/changes.txt index 7900121..e5b6294 100644 --- a/changes.txt +++ b/changes.txt @@ -1,3 +1,7 @@ +Dec 31st, 2002 +v0.06 -- Sped up the s_mp_add, s_mp_sub which inturn sped up mp_invmod, mp_exptmod, etc... + -- Cleaned up the header a bit more + Dec 30th, 2002 v0.05 -- Builds with MSVC out of the box -- Fixed a bug in mp_invmod w.r.t. even moduli diff --git a/demo.c b/demo.c index 5e29d1f..0a1943f 100644 --- a/demo.c +++ b/demo.c @@ -1,5 +1,6 @@ #include + #ifdef U_MPI #include #include @@ -12,7 +13,6 @@ #else typedef unsigned long long ulong64; #endif - #else #include "bn.h" #endif @@ -22,6 +22,8 @@ extern ulong64 rdtsc(void); extern void reset(void); #else + + ulong64 _tt; void reset(void) { _tt = clock(); } ulong64 rdtsc(void) { return clock() - _tt; } @@ -73,11 +75,11 @@ int mp_reduce_setup(mp_int *a, mp_int *b) } #endif + char cmd[4096], buf[4096]; int main(void) { mp_int a, b, c, d, e, f; unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n; - unsigned char cmd[4096], buf[4096]; int rr; #ifdef TIMER @@ -92,6 +94,7 @@ int main(void) mp_init(&e); mp_init(&f); + mp_read_radix(&a, "V//////////////////////////////////////////////////////////////////////////////////////", 64); mp_reduce_setup(&b, &a); printf("\n\n----\n\n"); @@ -102,12 +105,11 @@ int main(void) mp_sub_d(&a, 1, &c); mp_exptmod(&b, &c, &a, &d); mp_toradix(&d, buf, 10); - printf("b^p-1 == %s\n", buf); - + printf("b^p-1 == %s\n", buf); #ifdef TIMER - mp_read_radix(&a, "340282366920938463463374607431768211455", 10); + mp_read_radix(&a, "340282366920938463463374607431768211455", 10); mp_read_radix(&b, "340282366920938463463574607431768211455", 10); while (a.used * DIGIT_BIT < 8192) { reset(); @@ -132,30 +134,31 @@ int main(void) mp_sqr(&a, &a); mp_sqr(&b, &b); } - mp_read_radix(&a, "340282366920938463463374607431768211455", 10); while (a.used * DIGIT_BIT < 8192) { reset(); - for (rr = 0; rr < 10000; rr++) { + for (rr = 0; rr < 100000; rr++) { mp_sqr(&a, &b); } tt = rdtsc(); - printf("Squaring %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)10000)); + printf("Squaring %d-bit took %lu cycles\n", mp_count_bits(&a), tt / ((ulong64)100000)); mp_copy(&b, &a); } mp_read_radix(&a, "340282366920938463463374607431768211455", 10); while (a.used * DIGIT_BIT < 8192) { reset(); - for (rr = 0; rr < 10000; rr++) { + for (rr = 0; rr < 100000; rr++) { mp_mul(&a, &a, &b); } tt = rdtsc(); - printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)10000)); + printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100000)); mp_copy(&b, &a); } - + + + { char *primes[] = { @@ -213,6 +216,8 @@ int main(void) mp_sqr(&a, &a); mp_sqr(&b, &b); } + + return 0; #endif @@ -264,9 +269,9 @@ draw(&a);draw(&b);draw(&c);draw(&d); /* test the sign/unsigned storage functions */ rr = mp_signed_bin_size(&c); - mp_to_signed_bin(&c, cmd); + mp_to_signed_bin(&c, (unsigned char *)cmd); memset(cmd+rr, rand()&255, sizeof(cmd)-rr); - mp_read_signed_bin(&d, cmd, rr); + mp_read_signed_bin(&d, (unsigned char *)cmd, rr); if (mp_cmp(&c, &d) != MP_EQ) { printf("mp_signed_bin failure!\n"); draw(&c); @@ -276,9 +281,9 @@ draw(&a);draw(&b);draw(&c);draw(&d); rr = mp_unsigned_bin_size(&c); - mp_to_unsigned_bin(&c, cmd); + mp_to_unsigned_bin(&c, (unsigned char *)cmd); memset(cmd+rr, rand()&255, sizeof(cmd)-rr); - mp_read_unsigned_bin(&d, cmd, rr); + mp_read_unsigned_bin(&d, (unsigned char *)cmd, rr); if (mp_cmp_mag(&c, &d) != MP_EQ) { printf("mp_unsigned_bin failure!\n"); draw(&c); diff --git a/makefile b/makefile index 58b57ad..ec94b70 100644 --- a/makefile +++ b/makefile @@ -1,7 +1,7 @@ CC = gcc CFLAGS += -DDEBUG -Wall -W -O3 -fomit-frame-pointer -funroll-loops -VERSION=0.05 +VERSION=0.06 default: test