diff --git a/b.bat b/b.bat
index 38c1120..1d0b900 100644
--- a/b.bat
+++ b/b.bat
@@ -1,3 +1,3 @@
 nasm -f coff timer.asm
-gcc -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c bn.c timer.o -o demo
+gcc -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c bn.c timer.o -o ltmdemo
 gcc -I./mtest/ -DU_MPI -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c mtest/mpi.c timer.o -o mpidemo
diff --git a/bn.c b/bn.c
index 4a0f4f9..cf8a391 100644
--- a/bn.c
+++ b/bn.c
@@ -700,8 +700,8 @@ int mp_mul_2(mp_int *a, mp_int *b)
 /* low level addition */
 static int s_mp_add(mp_int *a, mp_int *b, mp_int *c)
 {
-   mp_int t, *x;
-   int res, min, max, i;
+   mp_int *x;
+   int olduse, res, min, max, i;
    mp_digit u;
    
    REGFUNC("s_mp_add");
@@ -724,45 +724,52 @@ static int s_mp_add(mp_int *a, mp_int *b, mp_int *c)
    }
    
    /* init result */
-   if ((res = mp_init_size(&t, max+1)) != MP_OKAY) {
-      DECFUNC();
-      return res;
+   if (c->alloc < max+1) {
+      if ((res = mp_grow(c, max+1)) != MP_OKAY) {
+         DECFUNC();
+         return res;
+      }
    }
-   t.used = max+1;
+ 
+   olduse  = c->used;
+   c->used = max + 1;
 
    /* add digits from lower part */
    u = 0;
    for (i = 0; i < min; i++) {
        /* T[i] = A[i] + B[i] + U */   
-       t.dp[i] = a->dp[i] + b->dp[i] + u;
+       c->dp[i] = a->dp[i] + b->dp[i] + u;
        
        /* U = carry bit of T[i] */       
-       u = (t.dp[i] >> DIGIT_BIT) & 1;
+       u = (c->dp[i] >> DIGIT_BIT) & 1;
        
        /* take away carry bit from T[i] */
-       t.dp[i] &= MP_MASK;
+       c->dp[i] &= MP_MASK;
    }
    
    /* now copy higher words if any, that is in A+B if A or B has more digits add those in */
    if (min != max) {
       for (; i < max; i++) { 
          /* T[i] = X[i] + U */
-         t.dp[i] = x->dp[i] + u;
+         c->dp[i] = x->dp[i] + u;
          
          /* U = carry bit of T[i] */
-         u = (t.dp[i] >> DIGIT_BIT) & 1;
+         u = (c->dp[i] >> DIGIT_BIT) & 1;
          
          /* take away carry bit from T[i] */
-         t.dp[i] &= MP_MASK;
+         c->dp[i] &= MP_MASK;
       }
    }
    
    /* add carry */
-   t.dp[i] = u;
-
-   mp_clamp(&t);
-   mp_exch(&t, c);
-   mp_clear(&t);
+   c->dp[i] = u;
+   
+   /* clear digits above used (since we may not have grown result above) */
+   for (i = c->used; i < olduse; i++) {
+      c->dp[i] = 0;
+   }
+   
+   mp_clamp(c);
    DECFUNC();
    return MP_OKAY;       
 }
@@ -770,8 +777,7 @@ static int s_mp_add(mp_int *a, mp_int *b, mp_int *c)
 /* low level subtraction (assumes a > b) */
 static int s_mp_sub(mp_int *a, mp_int *b, mp_int *c)
 {
-   mp_int t;
-   int res, min, max, i;
+   int olduse, res, min, max, i;
    mp_digit u;
    
    REGFUNC("s_mp_sub");
@@ -784,42 +790,48 @@ static int s_mp_sub(mp_int *a, mp_int *b, mp_int *c)
    max = a->used;
    
    /* init result */
-   if ((res = mp_init_size(&t, max)) != MP_OKAY) {
-      DECFUNC();
-      return res;
+   if (c->alloc < max) {
+      if ((res = mp_grow(c, max)) != MP_OKAY) {
+         DECFUNC();
+         return res;
+      }
    }
-   t.used = max;
+   olduse  = c->used;
+   c->used = max;
    
    /* sub digits from lower part */
    u = 0;
    for (i = 0; i < min; i++) {
        /* T[i] = A[i] - B[i] - U */
-       t.dp[i] = a->dp[i] - (b->dp[i] + u);
+       c->dp[i] = a->dp[i] - (b->dp[i] + u);
        
        /* U = carry bit of T[i] */
-       u = (t.dp[i] >> DIGIT_BIT) & 1;
+       u = (c->dp[i] >> DIGIT_BIT) & 1;
        
        /* Clear carry from T[i] */
-       t.dp[i] &= MP_MASK;
+       c->dp[i] &= MP_MASK;
    }
    
    /* now copy higher words if any, e.g. if A has more digits than B  */
    if (min != max) {
       for (; i < max; i++) { 
          /* T[i] = A[i] - U */
-         t.dp[i] = a->dp[i] - u;
+         c->dp[i] = a->dp[i] - u;
 
          /* U = carry bit of T[i] */
-         u = (t.dp[i] >> DIGIT_BIT) & 1;
+         u = (c->dp[i] >> DIGIT_BIT) & 1;
 
          /* Clear carry from T[i] */
-         t.dp[i] &= MP_MASK;
+         c->dp[i] &= MP_MASK;
       }
    }
    
-   mp_clamp(&t);
-   mp_exch(&t, c);
-   mp_clear(&t);
+   /* clear digits above used (since we may not have grown result above) */
+   for (i = c->used; i < olduse; i++) {
+      c->dp[i] = 0;
+   }
+
+   mp_clamp(c);
    DECFUNC();
    return MP_OKAY;       
 }
@@ -2941,7 +2953,7 @@ int mp_toradix(mp_int *a, char *str, int radix)
    int res, digs;
    mp_int t;
    mp_digit d;
-   unsigned char *_s = str;
+   char *_s = str;
    
    if (radix < 2 || radix > 64) {
       return MP_VAL;
@@ -2966,7 +2978,7 @@ int mp_toradix(mp_int *a, char *str, int radix)
        *str++ = s_rmap[d];
        ++digs;
    }
-   reverse(_s, digs);
+   reverse((unsigned char *)_s, digs);
    *str++ = '\0';
    mp_clear(&t);
    return MP_OKAY;
diff --git a/bn.h b/bn.h
index a87de8a..624c50d 100644
--- a/bn.h
+++ b/bn.h
@@ -12,7 +12,6 @@
  *
  * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
  */
-
 #ifndef BN_H_
 #define BN_H_
 
@@ -39,13 +38,18 @@
 #else
 #ifndef CRYPT
    #ifdef _MSC_VER
-      typedef __int64            ulong64;
+      typedef unsigned __int64   ulong64;
+      typedef signed __int64     long64;
    #else
       typedef unsigned long long ulong64;
+      typedef signed long long   long64;
    #endif   
 #endif   
+
+   /* default case */
    typedef unsigned long      mp_digit;
    typedef ulong64            mp_word;
+  
    #define DIGIT_BIT          28U
 #endif  
 
@@ -72,8 +76,14 @@
 
 typedef int           mp_err;
 
-#define KARATSUBA_MUL_CUTOFF    80                /* Min. number of digits before Karatsuba multiplication is used. */
-#define KARATSUBA_SQR_CUTOFF    80                /* Min. number of digits before Karatsuba squaring is used. */
+/* you'll have to tune these... */
+#ifdef FAST_FPU
+   #define KARATSUBA_MUL_CUTOFF    100     /* Min. number of digits before Karatsuba multiplication is used. */
+   #define KARATSUBA_SQR_CUTOFF    100     /* Min. number of digits before Karatsuba squaring is used. */
+#else
+   #define KARATSUBA_MUL_CUTOFF    80      /* Min. number of digits before Karatsuba multiplication is used. */
+   #define KARATSUBA_SQR_CUTOFF    80      /* Min. number of digits before Karatsuba squaring is used. */
+#endif
 
 typedef struct  {
     int used, alloc, sign;
diff --git a/bn.pdf b/bn.pdf
index 34156bc..d517dc4 100644
Binary files a/bn.pdf and b/bn.pdf differ
diff --git a/bn.tex b/bn.tex
index 9808a73..069f76b 100644
--- a/bn.tex
+++ b/bn.tex
@@ -1,7 +1,7 @@
 \documentclass{article}
 \begin{document}
 
-\title{LibTomMath v0.05 \\ A Free Multiple Precision Integer Library}
+\title{LibTomMath v0.06 \\ A Free Multiple Precision Integer Library}
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 \newpage
@@ -460,34 +460,34 @@ were observed.
 \begin{tabular}{c|c|c|c}
 \hline \textbf{Operation} & \textbf{Size (bits)} & \textbf{Time with MPI (cycles)} & \textbf{Time with LibTomMath (cycles)} \\
 \hline
-Inversion & 128 & 264,083  & 159,194  \\
-Inversion & 256 & 549,370  & 355,914  \\
-Inversion & 512 & 1,675,975  & 842,538  \\
-Inversion & 1024 & 5,237,957  & 2,170,804  \\
-Inversion & 2048 & 17,871,944  & 6,250,876  \\
-Inversion & 4096 & 66,610,468  & 18,161,612  \\
+Inversion & 128 & 264,083  & 59,782   \\
+Inversion & 256 & 549,370  & 146,915   \\
+Inversion & 512 & 1,675,975  & 367,172   \\
+Inversion & 1024 & 5,237,957  & 1,054,158   \\
+Inversion & 2048 & 17,871,944  & 3,459,683   \\
+Inversion & 4096 & 66,610,468  & 11,834,556   \\
 \hline
 Multiply & 128 & 1,426   & 828    \\
 Multiply & 256 & 2,551   & 1,393    \\
 Multiply & 512 & 7,913   & 2,926    \\
 Multiply & 1024 & 28,496   & 8,620  \\
 Multiply & 2048 & 109,897   & 28,967    \\
-Multiply & 4096 & 469,970   & 106,855    \\
+Multiply & 4096 & 469,970   & 105,387    \\
 \hline 
 Square & 128 & 1,319   & 869    \\
 Square & 256 & 1,776   & 1,362    \\
 Square & 512 & 5,399  & 2,571   \\
 Square & 1024 & 18,991  & 6,332    \\
 Square & 2048 & 72,126  & 18,426   \\
-Square & 4096 & 306,269  & 76,305  \\
+Square & 4096 & 306,269  & 74,908 \\
 \hline 
-Exptmod & 512 & 32,021,586  & 5,709,468 \\
-Exptmod & 768 & 97,595,492  & 12,473,526   \\
-Exptmod & 1024 & 223,302,532  & 23,593,075    \\
-Exptmod & 2048 & 1,682,223,369   & 121,992,481    \\
-Exptmod & 2560 & 3,268,615,571   & 258,155,605    \\
-Exptmod & 3072 & 5,597,240,141   & 399,800,504   \\
-Exptmod & 4096 & 13,347,270,891   & 826,013,375   
+Exptmod & 512 & 32,021,586  & 5,696,459  \\
+Exptmod & 768 & 97,595,492  & 12,428,274   \\
+Exptmod & 1024 & 223,302,532  & 22,834,316   \\
+Exptmod & 2048 & 1,682,223,369   & 119,888,049    \\
+Exptmod & 2560 & 3,268,615,571   & 250,901,263     \\
+Exptmod & 3072 & 5,597,240,141   & 391,716,431    \\
+Exptmod & 4096 & 13,347,270,891   & 814,429,647    
 
 \end{tabular}
 \end{center}
diff --git a/changes.txt b/changes.txt
index 7900121..e5b6294 100644
--- a/changes.txt
+++ b/changes.txt
@@ -1,3 +1,7 @@
+Dec 31st, 2002
+v0.06  -- Sped up the s_mp_add, s_mp_sub which inturn sped up mp_invmod, mp_exptmod, etc...
+       -- Cleaned up the header a bit more
+       
 Dec 30th, 2002
 v0.05  -- Builds with MSVC out of the box
        -- Fixed a bug in mp_invmod w.r.t. even moduli
diff --git a/demo.c b/demo.c
index 5e29d1f..0a1943f 100644
--- a/demo.c
+++ b/demo.c
@@ -1,5 +1,6 @@
 #include <time.h>
 
+
 #ifdef U_MPI
 #include <stdio.h>
 #include <string.h>
@@ -12,7 +13,6 @@
    #else
       typedef unsigned long long ulong64;
    #endif   
-   
 #else   
    #include "bn.h"
 #endif
@@ -22,6 +22,8 @@
 extern ulong64 rdtsc(void);
 extern void reset(void);
 #else 
+
+
 ulong64 _tt;
 void reset(void) { _tt = clock(); }
 ulong64 rdtsc(void) { return clock() - _tt; }
@@ -73,11 +75,11 @@ int mp_reduce_setup(mp_int *a, mp_int *b)
 }
 #endif
 
+   char cmd[4096], buf[4096];
 int main(void)
 {
    mp_int a, b, c, d, e, f;
    unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n;
-   unsigned char cmd[4096], buf[4096];
    int rr;
    
 #ifdef TIMER
@@ -92,6 +94,7 @@ int main(void)
    mp_init(&e);
    mp_init(&f);
    
+ 
    mp_read_radix(&a, "V//////////////////////////////////////////////////////////////////////////////////////", 64);
    mp_reduce_setup(&b, &a);
    printf("\n\n----\n\n");
@@ -102,12 +105,11 @@ int main(void)
    mp_sub_d(&a, 1, &c);
    mp_exptmod(&b, &c, &a, &d);
    mp_toradix(&d, buf, 10);
-   printf("b^p-1 == %s\n", buf);
-   
+   printf("b^p-1 == %s\n", buf); 
 
 #ifdef TIMER   
 
-   mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
+      mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
       mp_read_radix(&b, "340282366920938463463574607431768211455", 10);
       while (a.used * DIGIT_BIT < 8192) {
          reset();
@@ -132,30 +134,31 @@ int main(void)
          mp_sqr(&a, &a);
          mp_sqr(&b, &b);
       }
-   
 
    mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
    while (a.used * DIGIT_BIT < 8192) {
       reset();
-      for (rr = 0; rr < 10000; rr++) {
+      for (rr = 0; rr < 100000; rr++) {
           mp_sqr(&a, &b);
       }
       tt = rdtsc();
-      printf("Squaring %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)10000));
+      printf("Squaring %d-bit took %lu cycles\n", mp_count_bits(&a), tt / ((ulong64)100000));
       mp_copy(&b, &a);
    }
    
    mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
    while (a.used * DIGIT_BIT < 8192) {
       reset();
-      for (rr = 0; rr < 10000; rr++) {
+      for (rr = 0; rr < 100000; rr++) {
           mp_mul(&a, &a, &b);
       }
       tt = rdtsc();
-      printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)10000));
+      printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100000));
       mp_copy(&b, &a);
    }
-   
+
+  
+ 
    
    {
       char *primes[] = {
@@ -213,6 +216,8 @@ int main(void)
       mp_sqr(&a, &a);
       mp_sqr(&b, &b);
    }
+   
+   return 0;
   
 #endif   
 
@@ -264,9 +269,9 @@ draw(&a);draw(&b);draw(&c);draw(&d);
           /* test the sign/unsigned storage functions */
           
           rr = mp_signed_bin_size(&c);
-          mp_to_signed_bin(&c, cmd);
+          mp_to_signed_bin(&c, (unsigned char *)cmd);
           memset(cmd+rr, rand()&255, sizeof(cmd)-rr);
-          mp_read_signed_bin(&d, cmd, rr);
+          mp_read_signed_bin(&d, (unsigned char *)cmd, rr);
           if (mp_cmp(&c, &d) != MP_EQ) {
              printf("mp_signed_bin failure!\n");
              draw(&c);
@@ -276,9 +281,9 @@ draw(&a);draw(&b);draw(&c);draw(&d);
                     
           
           rr = mp_unsigned_bin_size(&c);
-          mp_to_unsigned_bin(&c, cmd);
+          mp_to_unsigned_bin(&c, (unsigned char *)cmd);
           memset(cmd+rr, rand()&255, sizeof(cmd)-rr);
-          mp_read_unsigned_bin(&d, cmd, rr);
+          mp_read_unsigned_bin(&d, (unsigned char *)cmd, rr);
           if (mp_cmp_mag(&c, &d) != MP_EQ) {
              printf("mp_unsigned_bin failure!\n");
              draw(&c);
diff --git a/makefile b/makefile
index 58b57ad..ec94b70 100644
--- a/makefile
+++ b/makefile
@@ -1,7 +1,7 @@
 CC = gcc
 CFLAGS  += -DDEBUG -Wall -W -O3 -fomit-frame-pointer -funroll-loops 
 
-VERSION=0.05
+VERSION=0.06
 
 default: test