diff --git a/bn.pdf b/bn.pdf
index 1a361ef..a002099 100644
Binary files a/bn.pdf and b/bn.pdf differ
diff --git a/bn.tex b/bn.tex
index 5341a4a..b5b17ec 100644
--- a/bn.tex
+++ b/bn.tex
@@ -1,7 +1,7 @@
 \documentclass[]{article}
 \begin{document}
 
-\title{LibTomMath v0.21 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org }
+\title{LibTomMath v0.22 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org }
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 \newpage
diff --git a/bn_fast_mp_invmod.c b/bn_fast_mp_invmod.c
index eb71601..e32c1ae 100644
--- a/bn_fast_mp_invmod.c
+++ b/bn_fast_mp_invmod.c
@@ -66,8 +66,8 @@ top:
     if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
       goto __ERR;
     }
-    /* 4.2 if A or B is odd then */
-    if (mp_iseven (&B) == 0) {
+    /* 4.2 if B is odd then */
+    if (mp_isodd (&B) == 1) {
       if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
         goto __ERR;
       }
@@ -84,8 +84,8 @@ top:
     if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
       goto __ERR;
     }
-    /* 5.2 if C,D are even then */
-    if (mp_iseven (&D) == 0) {
+    /* 5.2 if D is odd then */
+    if (mp_isodd (&D) == 1) {
       /* D = (D-x)/2 */
       if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
         goto __ERR;
diff --git a/bn_mp_cnt_lsb.c b/bn_mp_cnt_lsb.c
new file mode 100644
index 0000000..df0846f
--- /dev/null
+++ b/bn_mp_cnt_lsb.c
@@ -0,0 +1,40 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Counts the number of lsbs which are zero before the first zero bit */
+int mp_cnt_lsb(mp_int *a)
+{
+   int x;
+   mp_digit q;
+
+   if (mp_iszero(a) == 1) {
+      return 0;
+   }
+
+   /* scan lower digits until non-zero */
+   for (x = 0; x < a->used && a->dp[x] == 0; x++);
+   q = a->dp[x];
+   x *= DIGIT_BIT;
+
+   /* now scan this digit until a 1 is found */
+   while ((q & 1) == 0) {
+      q >>= 1;
+      x  += 1;
+   }
+
+   return x;
+}
+
diff --git a/bn_mp_div_2d.c b/bn_mp_div_2d.c
index 18bf904..9bc1fd6 100644
--- a/bn_mp_div_2d.c
+++ b/bn_mp_div_2d.c
@@ -58,11 +58,14 @@ mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
   /* shift any bit count < DIGIT_BIT */
   D = (mp_digit) (b % DIGIT_BIT);
   if (D != 0) {
-    register mp_digit *tmpc, mask;
+    register mp_digit *tmpc, mask, shift;
 
     /* mask */
     mask = (((mp_digit)1) << D) - 1;
 
+    /* shift for lsb */
+    shift = DIGIT_BIT - D;
+
     /* alias */
     tmpc = c->dp + (c->used - 1);
 
@@ -73,7 +76,7 @@ mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
       rr = *tmpc & mask;
 
       /* shift the current word and mix in the carry bits from the previous word */
-      *tmpc = (*tmpc >> D) | (r << (DIGIT_BIT - D));
+      *tmpc = (*tmpc >> D) | (r << shift);
       --tmpc;
 
       /* set the carry to the carry bits of the current word found above */
diff --git a/bn_mp_div_3.c b/bn_mp_div_3.c
index 524531e..d82f29f 100644
--- a/bn_mp_div_3.c
+++ b/bn_mp_div_3.c
@@ -1,64 +1,64 @@
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* divide by three (based on routine from MPI and the GMP manual) */
-int
-mp_div_3 (mp_int * a, mp_int *c, mp_digit * d)
-{
-  mp_int   q;
-  mp_word  w, t;
-  mp_digit b;
-  int      res, ix;
-  
-  /* b = 2**DIGIT_BIT / 3 */
-  b = (((mp_word)1) << ((mp_word)DIGIT_BIT)) / ((mp_word)3);
-
-  if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
-     return res;
-  }
-  
-  q.used = a->used;
-  q.sign = a->sign;
-  w = 0;
-  for (ix = a->used - 1; ix >= 0; ix--) {
-     w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
-     
-     if (w >= 3) {
-        t = (w * ((mp_word)b)) >> ((mp_word)DIGIT_BIT);
-        w -= (t << ((mp_word)1)) + t;
-        while (w >= 3) {
-           t += 1;
-           w -= 3;
-        }
-      } else {
-        t = 0;
-      }
-      q.dp[ix] = (mp_digit)t;
-  }
-  
-  if (d != NULL) {
-     *d = (mp_digit)w;
-  }
-  
-  if (c != NULL) {
-     mp_clamp(&q);
-     mp_exch(&q, c);
-  }
-  mp_clear(&q);
-  
-  return res;
-}
-
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* divide by three (based on routine from MPI and the GMP manual) */
+int
+mp_div_3 (mp_int * a, mp_int *c, mp_digit * d)
+{
+  mp_int   q;
+  mp_word  w, t;
+  mp_digit b;
+  int      res, ix;
+  
+  /* b = 2**DIGIT_BIT / 3 */
+  b = (((mp_word)1) << ((mp_word)DIGIT_BIT)) / ((mp_word)3);
+
+  if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
+     return res;
+  }
+  
+  q.used = a->used;
+  q.sign = a->sign;
+  w = 0;
+  for (ix = a->used - 1; ix >= 0; ix--) {
+     w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
+     
+     if (w >= 3) {
+        t = (w * ((mp_word)b)) >> ((mp_word)DIGIT_BIT);
+        w -= (t << ((mp_word)1)) + t;
+        while (w >= 3) {
+           t += 1;
+           w -= 3;
+        }
+      } else {
+        t = 0;
+      }
+      q.dp[ix] = (mp_digit)t;
+  }
+  
+  if (d != NULL) {
+     *d = (mp_digit)w;
+  }
+  
+  if (c != NULL) {
+     mp_clamp(&q);
+     mp_exch(&q, c);
+  }
+  mp_clear(&q);
+  
+  return res;
+}
+
diff --git a/bn_mp_exptmod_fast.c b/bn_mp_exptmod_fast.c
index 54de53d..567d614 100644
--- a/bn_mp_exptmod_fast.c
+++ b/bn_mp_exptmod_fast.c
@@ -21,10 +21,17 @@
  *
  * Uses Montgomery or Diminished Radix reduction [whichever appropriate]
  */
+
+#ifdef MP_LOW_MEM
+   #define TAB_SIZE 32
+#else
+   #define TAB_SIZE 256
+#endif
+
 int
 mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
 {
-  mp_int  M[256], res;
+  mp_int  M[TAB_SIZE], res;
   mp_digit buf, mp;
   int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
   
@@ -58,17 +65,24 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
   }
 #endif
 
+  /* init M array */
+  /* init first cell */
+  if ((err = mp_init(&M[1])) != MP_OKAY) {
+     return err; 
+  }
 
-  /* init G array */
-  for (x = 0; x < (1 << winsize); x++) {
-    if ((err = mp_init (&M[x])) != MP_OKAY) {
-      for (y = 0; y < x; y++) {
+  /* now init the second half of the array */
+  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
+    if ((err = mp_init(&M[x])) != MP_OKAY) {
+      for (y = 1<<(winsize-1); y < x; y++) {
         mp_clear (&M[y]);
       }
+      mp_clear(&M[1]);
       return err;
     }
   }
 
+
   /* determine and setup reduction code */
   if (redmode == 0) {
      /* now setup montgomery  */
@@ -257,7 +271,8 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
   err = MP_OKAY;
 __RES:mp_clear (&res);
 __M:
-  for (x = 0; x < (1 << winsize); x++) {
+  mp_clear(&M[1]);
+  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
     mp_clear (&M[x]);
   }
   return err;
diff --git a/bn_mp_fread.c b/bn_mp_fread.c
new file mode 100644
index 0000000..004b0f1
--- /dev/null
+++ b/bn_mp_fread.c
@@ -0,0 +1,61 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* read a bigint from a file stream in ASCII */
+int mp_fread(mp_int *a, int radix, FILE *stream)
+{
+   int err, ch, neg, y;
+   
+   /* clear a */
+   mp_zero(a);
+   
+   /* if first digit is - then set negative */
+   ch = fgetc(stream);
+   if (ch == '-') {
+      neg = MP_NEG;
+      ch = fgetc(stream);
+   } else {
+      neg = MP_ZPOS;
+   }
+   
+   for (;;) {
+      /* find y in the radix map */
+      for (y = 0; y < radix; y++) {
+          if (mp_s_rmap[y] == ch) {
+             break;
+          }
+      }
+      if (y == radix) {
+         break;
+      }
+      
+      /* shift up and add */
+      if ((err = mp_mul_d(a, radix, a)) != MP_OKAY) {
+         return err;
+      }
+      if ((err = mp_add_d(a, y, a)) != MP_OKAY) {
+         return err;
+      }
+      
+      ch = fgetc(stream);
+   }
+   if (mp_cmp_d(a, 0) != MP_EQ) {
+      a->sign = neg;
+   }
+   
+   return MP_OKAY;
+}
+
diff --git a/bn_mp_fwrite.c b/bn_mp_fwrite.c
new file mode 100644
index 0000000..93d51fd
--- /dev/null
+++ b/bn_mp_fwrite.c
@@ -0,0 +1,47 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+int mp_fwrite(mp_int *a, int radix, FILE *stream)
+{
+   char *buf;
+   int err, len, x;
+   
+   len = mp_radix_size(a, radix);
+   if (len == 0) {
+      return MP_VAL;
+   }
+   
+   buf = malloc(len);
+   if (buf == NULL) {
+      return MP_MEM;
+   }
+   
+   if ((err = mp_toradix(a, buf, radix)) != MP_OKAY) {
+      free(buf);
+      return err;
+   }
+   
+   for (x = 0; x < len; x++) {
+       if (fputc(buf[x], stream) == EOF) {
+          free(buf);
+          return MP_VAL;
+       }
+   }
+   
+   free(buf);
+   return MP_OKAY;
+}
+
diff --git a/bn_mp_gcd.c b/bn_mp_gcd.c
index 1c930c7..ff6ba70 100644
--- a/bn_mp_gcd.c
+++ b/bn_mp_gcd.c
@@ -14,13 +14,12 @@
  */
 #include <tommath.h>
 
-/* Greatest Common Divisor using the binary method [Algorithm B, page 338, vol2 of TAOCP]
- */
+/* Greatest Common Divisor using the binary method */
 int
 mp_gcd (mp_int * a, mp_int * b, mp_int * c)
 {
-  mp_int  u, v, t;
-  int     k, res, neg;
+  mp_int  u, v;
+  int     k, u_lsb, v_lsb, res;
 
   /* either zero than gcd is the largest */
   if (mp_iszero (a) == 1 && mp_iszero (b) == 0) {
@@ -34,9 +33,6 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c)
     return MP_OKAY;
   }
 
-  /* if both are negative they share (-1) as a common divisor */
-  neg = (a->sign == b->sign) ? a->sign : MP_ZPOS;
-
   if ((res = mp_init_copy (&u, a)) != MP_OKAY) {
     return res;
   }
@@ -48,71 +44,55 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c)
   /* must be positive for the remainder of the algorithm */
   u.sign = v.sign = MP_ZPOS;
 
-  if ((res = mp_init (&t)) != MP_OKAY) {
+  /* B1.  Find the common power of two for u and v */
+  u_lsb = mp_cnt_lsb(&u);
+  v_lsb = mp_cnt_lsb(&v);
+  k     = MIN(u_lsb, v_lsb);
+
+  if ((res = mp_div_2d(&u, k, &u, NULL)) != MP_OKAY) {
+     goto __V;
+  }
+
+  if ((res = mp_div_2d(&v, k, &v, NULL)) != MP_OKAY) {
+     goto __V;
+  }
+
+  /* divide any remaining factors of two out */
+  if (u_lsb != k) {
+     if ((res = mp_div_2d(&u, u_lsb - k, &u, NULL)) != MP_OKAY) {
+        goto __V;
+     }
+  }
+
+  if (v_lsb != k) {
+     if ((res = mp_div_2d(&v, v_lsb - k, &v, NULL)) != MP_OKAY) {
+        goto __V;
+     }
+  }
+  
+  while (mp_iszero(&v) == 0) {
+     /* make sure v is the largest */
+     if (mp_cmp_mag(&u, &v) == MP_GT) {
+        mp_exch(&u, &v);
+     }
+     
+     /* subtract smallest from largest */
+     if ((res = s_mp_sub(&v, &u, &v)) != MP_OKAY) {
+        goto __V;
+     }
+     
+     /* Divide out all factors of two */
+     if ((res = mp_div_2d(&v, mp_cnt_lsb(&v), &v, NULL)) != MP_OKAY) {
+        goto __V;
+     } 
+  } 
+  
+  /* multiply by 2**k which we divided out at the beginning */ 
+  if ((res = mp_mul_2d (&u, k, c)) != MP_OKAY) {
     goto __V;
   }
-
-  /* B1.  Find power of two */
-  k = 0;
-  while (mp_iseven(&u) == 1 && mp_iseven(&v) == 1) {
-    ++k;
-    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
-      goto __T;
-    }
-    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
-      goto __T;
-    }
-  }
-
-  /* B2.  Initialize */
-  if (mp_isodd(&u) == 1) {
-    /* t = -v */
-    if ((res = mp_copy (&v, &t)) != MP_OKAY) {
-      goto __T;
-    }
-    t.sign = MP_NEG;
-  } else {
-    /* t = u */
-    if ((res = mp_copy (&u, &t)) != MP_OKAY) {
-      goto __T;
-    }
-  }
-
-  do {
-    /* B3 (and B4).  Halve t, if even */
-    while (t.used != 0 && mp_iseven(&t) == 1) {
-      if ((res = mp_div_2 (&t, &t)) != MP_OKAY) {
-        goto __T;
-      }
-    }
-
-    /* B5.  if t>0 then u=t otherwise v=-t */
-    if (t.used != 0 && t.sign != MP_NEG) {
-      if ((res = mp_copy (&t, &u)) != MP_OKAY) {
-        goto __T;
-      }
-    } else {
-      if ((res = mp_copy (&t, &v)) != MP_OKAY) {
-        goto __T;
-      }
-      v.sign = (v.sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
-    }
-
-    /* B6.  t = u - v, if t != 0 loop otherwise terminate */
-    if ((res = mp_sub (&u, &v, &t)) != MP_OKAY) {
-      goto __T;
-    }
-  } while (mp_iszero(&t) == 0);
-
-  /* multiply by 2^k which we divided out at the beginning */ 
-  if ((res = mp_mul_2d (&u, k, &u)) != MP_OKAY) {
-    goto __T;
-  }
-
-  mp_exch (&u, c);
-  c->sign = neg;
+  c->sign = MP_ZPOS;
   res = MP_OKAY;
-__T:mp_clear (&t);
 __V:mp_clear (&u);
 __U:mp_clear (&v);
   return res;
diff --git a/bn_mp_invmod.c b/bn_mp_invmod.c
index 36ce092..2dd7a0c 100644
--- a/bn_mp_invmod.c
+++ b/bn_mp_invmod.c
@@ -14,6 +14,7 @@
  */
 #include <tommath.h>
 
+/* hac 14.61, pp608 */
 int
 mp_invmod (mp_int * a, mp_int * b, mp_int * c)
 {
@@ -21,17 +22,18 @@ mp_invmod (mp_int * a, mp_int * b, mp_int * c)
   int     res;
 
   /* b cannot be negative */
-  if (b->sign == MP_NEG) {
+  if (b->sign == MP_NEG || mp_iszero(b) == 1) {
     return MP_VAL;
   }
 
   /* if the modulus is odd we can use a faster routine instead */
-  if (mp_iseven (b) == 0) {
+  if (mp_isodd (b) == 1) {
     return fast_mp_invmod (a, b, c);
   }
   
   /* init temps */
-  if ((res = mp_init_multi(&x, &y, &u, &v, &A, &B, &C, &D, NULL)) != MP_OKAY) {
+  if ((res = mp_init_multi(&x, &y, &u, &v, 
+                           &A, &B, &C, &D, NULL)) != MP_OKAY) {
      return res;
   }
 
@@ -43,10 +45,6 @@ mp_invmod (mp_int * a, mp_int * b, mp_int * c)
     goto __ERR;
   }
 
-  if ((res = mp_abs (&x, &x)) != MP_OKAY) {
-    goto __ERR;
-  }
-
   /* 2. [modified] if x,y are both even then return an error! */
   if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
     res = MP_VAL;
@@ -63,7 +61,6 @@ mp_invmod (mp_int * a, mp_int * b, mp_int * c)
   mp_set (&A, 1);
   mp_set (&D, 1);
 
-
 top:
   /* 4.  while u is even do */
   while (mp_iseven (&u) == 1) {
@@ -72,13 +69,13 @@ top:
       goto __ERR;
     }
     /* 4.2 if A or B is odd then */
-    if (mp_iseven (&A) == 0 || mp_iseven (&B) == 0) {
+    if (mp_isodd (&A) == 1 || mp_isodd (&B) == 1) {
       /* A = (A+y)/2, B = (B-x)/2 */
       if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
-	goto __ERR;
+         goto __ERR;
       }
       if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
-	goto __ERR;
+         goto __ERR;
       }
     }
     /* A = A/2, B = B/2 */
@@ -90,21 +87,20 @@ top:
     }
   }
 
-
   /* 5.  while v is even do */
   while (mp_iseven (&v) == 1) {
     /* 5.1 v = v/2 */
     if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
       goto __ERR;
     }
-    /* 5.2 if C,D are even then */
-    if (mp_iseven (&C) == 0 || mp_iseven (&D) == 0) {
+    /* 5.2 if C or D is odd then */
+    if (mp_isodd (&C) == 1 || mp_isodd (&D) == 1) {
       /* C = (C+y)/2, D = (D-x)/2 */
       if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
-	goto __ERR;
+         goto __ERR;
       }
       if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
-	goto __ERR;
+         goto __ERR;
       }
     }
     /* C = C/2, D = D/2 */
@@ -157,10 +153,23 @@ top:
     goto __ERR;
   }
 
-  /* a is now the inverse */
+  /* if its too low */
+  while (mp_cmp_d(&C, 0) == MP_LT) {
+      if ((res = mp_add(&C, b, &C)) != MP_OKAY) {
+         goto __ERR;
+      }
+  }
+  
+  /* too big */
+  while (mp_cmp_mag(&C, b) != MP_LT) {
+      if ((res = mp_sub(&C, b, &C)) != MP_OKAY) {
+         goto __ERR;
+      }
+  }
+  
+  /* C is now the inverse */
   mp_exch (&C, c);
   res = MP_OKAY;
-
 __ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
   return res;
 }
diff --git a/bn_mp_jacobi.c b/bn_mp_jacobi.c
index 1a7573d..eb0c47b 100644
--- a/bn_mp_jacobi.c
+++ b/bn_mp_jacobi.c
@@ -18,10 +18,10 @@
  * HAC pp. 73 Algorithm 2.149
  */
 int
-mp_jacobi (mp_int * a, mp_int * n, int *c)
+mp_jacobi (mp_int * a, mp_int * p, int *c)
 {
-  mp_int  a1, n1, e;
-  int     s, r, res;
+  mp_int  a1, p1;
+  int     k, s, r, res;
   mp_digit residue;
 
   /* step 1.  if a == 0, return 0 */
@@ -37,39 +37,30 @@ mp_jacobi (mp_int * a, mp_int * n, int *c)
   }
 
   /* default */
-  s = 0;
+  k = s = 0;
 
-  /* step 3.  write a = a1 * 2^e  */
+  /* step 3.  write a = a1 * 2**k  */
   if ((res = mp_init_copy (&a1, a)) != MP_OKAY) {
     return res;
   }
 
-  if ((res = mp_init (&n1)) != MP_OKAY) {
+  if ((res = mp_init (&p1)) != MP_OKAY) {
     goto __A1;
   }
 
-  if ((res = mp_init (&e)) != MP_OKAY) {
-    goto __N1;
-  }
-
   while (mp_iseven (&a1) == 1) {
-    if ((res = mp_add_d (&e, 1, &e)) != MP_OKAY) {
-      goto __E;
-    }
-
+    k = k + 1;
     if ((res = mp_div_2 (&a1, &a1)) != MP_OKAY) {
-      goto __E;
+      goto __P1;
     }
   }
 
   /* step 4.  if e is even set s=1 */
-  if (mp_iseven (&e) == 1) {
+  if ((k & 1) == 0) {
     s = 1;
   } else {
-    /* else set s=1 if n = 1/7 (mod 8) or s=-1 if n = 3/5 (mod 8) */
-    if ((res = mp_mod_d (n, 8, &residue)) != MP_OKAY) {
-      goto __E;
-    }
+    /* else set s=1 if p = 1/7 (mod 8) or s=-1 if p = 3/5 (mod 8) */
+    residue = p->dp[0] & 7;
 
     if (residue == 1 || residue == 7) {
       s = 1;
@@ -78,17 +69,9 @@ mp_jacobi (mp_int * a, mp_int * n, int *c)
     }
   }
 
-  /* step 5.  if n == 3 (mod 4) *and* a1 == 3 (mod 4) then s = -s */
-  if ((res = mp_mod_d (n, 4, &residue)) != MP_OKAY) {
-    goto __E;
-  }
-  if (residue == 3) {
-    if ((res = mp_mod_d (&a1, 4, &residue)) != MP_OKAY) {
-      goto __E;
-    }
-    if (residue == 3) {
-      s = -s;
-    }
+  /* step 5.  if p == 3 (mod 4) *and* a1 == 3 (mod 4) then s = -s */
+  if ( ((p->dp[0] & 3) == 3) && ((a1.dp[0] & 3) == 3)) {
+    s = -s;
   }
 
   /* if a1 == 1 we're done */
@@ -96,19 +79,18 @@ mp_jacobi (mp_int * a, mp_int * n, int *c)
     *c = s;
   } else {
     /* n1 = n mod a1 */
-    if ((res = mp_mod (n, &a1, &n1)) != MP_OKAY) {
-      goto __E;
+    if ((res = mp_mod (p, &a1, &p1)) != MP_OKAY) {
+      goto __P1;
     }
-    if ((res = mp_jacobi (&n1, &a1, &r)) != MP_OKAY) {
-      goto __E;
+    if ((res = mp_jacobi (&p1, &a1, &r)) != MP_OKAY) {
+      goto __P1;
     }
     *c = s * r;
   }
 
   /* done */
   res = MP_OKAY;
-__E:mp_clear (&e);
-__N1:mp_clear (&n1);
+__P1:mp_clear (&p1);
 __A1:mp_clear (&a1);
   return res;
 }
diff --git a/bn_mp_mul_2d.c b/bn_mp_mul_2d.c
index ded3a3c..450f619 100644
--- a/bn_mp_mul_2d.c
+++ b/bn_mp_mul_2d.c
@@ -14,12 +14,6 @@
  */
 #include <tommath.h>
 
-/* NOTE:  This routine requires updating.  For instance the c->used = c->alloc bit
-   is wrong.  We should just shift c->used digits then set the carry as c->dp[c->used] = carry
- 
-   To be fixed for LTM 0.18
- */
-
 /* shift left by a certain bit count */
 int
 mp_mul_2d (mp_int * a, int b, mp_int * c)
@@ -34,8 +28,8 @@ mp_mul_2d (mp_int * a, int b, mp_int * c)
      }
   }
 
-  if (c->alloc < (int)(c->used + b/DIGIT_BIT + 2)) {
-     if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 2)) != MP_OKAY) {
+  if (c->alloc < (int)(c->used + b/DIGIT_BIT + 1)) {
+     if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) {
        return res;
      }
   }
@@ -46,17 +40,19 @@ mp_mul_2d (mp_int * a, int b, mp_int * c)
       return res;
     }
   }
-  c->used = c->alloc;
 
   /* shift any bit count < DIGIT_BIT */
   d = (mp_digit) (b % DIGIT_BIT);
   if (d != 0) {
-    register mp_digit *tmpc, mask, r, rr;
+    register mp_digit *tmpc, shift, mask, r, rr;
     register int x;
 
     /* bitmask for carries */
     mask = (((mp_digit)1) << d) - 1;
 
+    /* shift for msbs */
+    shift = DIGIT_BIT - d;
+
     /* alias */
     tmpc = c->dp;
 
@@ -64,7 +60,7 @@ mp_mul_2d (mp_int * a, int b, mp_int * c)
     r    = 0;
     for (x = 0; x < c->used; x++) {
       /* get the higher bits of the current word */
-      rr = (*tmpc >> (DIGIT_BIT - d)) & mask;
+      rr = (*tmpc >> shift) & mask;
 
       /* shift the current word and OR in the carry */
       *tmpc = ((*tmpc << d) | r) & MP_MASK;
@@ -73,6 +69,11 @@ mp_mul_2d (mp_int * a, int b, mp_int * c)
       /* set the carry to the carry bits of the current word */
       r = rr;
     }
+    
+    /* set final carry */
+    if (r != 0) {
+       c->dp[c->used++] = r;
+    }
   }
   mp_clamp (c);
   return MP_OKAY;
diff --git a/bn_mp_prime_fermat.c b/bn_mp_prime_fermat.c
index b218077..202f45a 100644
--- a/bn_mp_prime_fermat.c
+++ b/bn_mp_prime_fermat.c
@@ -16,9 +16,9 @@
 
 /* performs one Fermat test.
  * 
- * If "a" were prime then b^a == b (mod a) since the order of
+ * If "a" were prime then b**a == b (mod a) since the order of
  * the multiplicative sub-group would be phi(a) = a-1.  That means
- * it would be the same as b^(a mod (a-1)) == b^1 == b (mod a).
+ * it would be the same as b**(a mod (a-1)) == b**1 == b (mod a).
  *
  * Sets result to 1 if the congruence holds, or zero otherwise.
  */
@@ -36,7 +36,7 @@ mp_prime_fermat (mp_int * a, mp_int * b, int *result)
     return err;
   }
 
-  /* compute t = b^a mod a */
+  /* compute t = b**a mod a */
   if ((err = mp_exptmod (b, a, a, &t)) != MP_OKAY) {
     goto __T;
   }
diff --git a/bn_mp_prime_is_divisible.c b/bn_mp_prime_is_divisible.c
index 5b81104..fcad869 100644
--- a/bn_mp_prime_is_divisible.c
+++ b/bn_mp_prime_is_divisible.c
@@ -14,7 +14,8 @@
  */
 #include <tommath.h>
 
-/* determines if an integers is divisible by one of the first 256 primes or not
+/* determines if an integers is divisible by one 
+ * of the first PRIME_SIZE primes or not
  *
  * sets result to 0 if not, 1 if yes
  */
@@ -28,12 +29,6 @@ mp_prime_is_divisible (mp_int * a, int *result)
   *result = 0;
 
   for (ix = 0; ix < PRIME_SIZE; ix++) {
-    /* is it equal to the prime? */
-    if (mp_cmp_d (a, __prime_tab[ix]) == MP_EQ) {
-      *result = 1;
-      return MP_OKAY;
-    }
-
     /* what is a mod __prime_tab[ix] */
     if ((err = mp_mod_d (a, __prime_tab[ix], &res)) != MP_OKAY) {
       return err;
diff --git a/bn_mp_prime_miller_rabin.c b/bn_mp_prime_miller_rabin.c
index 422a5eb..4a96674 100644
--- a/bn_mp_prime_miller_rabin.c
+++ b/bn_mp_prime_miller_rabin.c
@@ -38,19 +38,17 @@ mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result)
     goto __N1;
   }
 
-  /* set 2^s * r = n1 */
+  /* set 2**s * r = n1 */
   if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) {
     goto __N1;
   }
-  s = 0;
-  while (mp_iseven (&r) == 1) {
-    ++s;
-    if ((err = mp_div_2 (&r, &r)) != MP_OKAY) {
-      goto __R;
-    }
+ 
+  s = mp_cnt_lsb(&r);
+  if ((err = mp_div_2d (&r, s, &r, NULL)) != MP_OKAY) {
+    goto __R;
   }
 
-  /* compute y = b^r mod a */
+  /* compute y = b**r mod a */
   if ((err = mp_init (&y)) != MP_OKAY) {
     goto __R;
   }
@@ -64,12 +62,12 @@ mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result)
     /* while j <= s-1 and y != n1 */
     while ((j <= (s - 1)) && mp_cmp (&y, &n1) != MP_EQ) {
       if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) {
-	goto __Y;
+         goto __Y;
       }
 
       /* if y == 1 then composite */
       if (mp_cmp_d (&y, 1) == MP_EQ) {
-	goto __Y;
+         goto __Y;
       }
 
       ++j;
diff --git a/bn_mp_radix_size.c b/bn_mp_radix_size.c
new file mode 100644
index 0000000..992743c
--- /dev/null
+++ b/bn_mp_radix_size.c
@@ -0,0 +1,54 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* returns size of ASCII reprensentation */
+int
+mp_radix_size (mp_int * a, int radix)
+{
+  int     res, digs;
+  mp_int  t;
+  mp_digit d;
+
+  /* special case for binary */
+  if (radix == 2) {
+    return mp_count_bits (a) + (a->sign == MP_NEG ? 1 : 0) + 1;
+  }
+
+  if (radix < 2 || radix > 64) {
+    return 0;
+  }
+
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return 0;
+  }
+
+  digs = 0;
+  if (t.sign == MP_NEG) {
+    ++digs;
+    t.sign = MP_ZPOS;
+  }
+
+  while (mp_iszero (&t) == 0) {
+    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
+      mp_clear (&t);
+      return 0;
+    }
+    ++digs;
+  }
+  mp_clear (&t);
+  return digs + 1;
+}
+
diff --git a/bn_mp_radix_smap.c b/bn_mp_radix_smap.c
new file mode 100644
index 0000000..45d02aa
--- /dev/null
+++ b/bn_mp_radix_smap.c
@@ -0,0 +1,18 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* chars used in radix conversions */
+const char *mp_s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
diff --git a/bn_mp_read_radix.c b/bn_mp_read_radix.c
new file mode 100644
index 0000000..e32823f
--- /dev/null
+++ b/bn_mp_read_radix.c
@@ -0,0 +1,77 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* read a string [ASCII] in a given radix */
+int
+mp_read_radix (mp_int * a, char *str, int radix)
+{
+  int     y, res, neg;
+  char    ch;
+
+  /* make sure the radix is ok */
+  if (radix < 2 || radix > 64) {
+    return MP_VAL;
+  }
+
+  /* if the leading digit is a 
+   * minus set the sign to negative. 
+   */
+  if (*str == '-') {
+    ++str;
+    neg = MP_NEG;
+  } else {
+    neg = MP_ZPOS;
+  }
+
+  /* set the integer to the default of zero */
+  mp_zero (a);
+  
+  /* process each digit of the string */
+  while (*str) {
+    /* if the radix < 36 the conversion is case insensitive
+     * this allows numbers like 1AB and 1ab to represent the same  value
+     * [e.g. in hex]
+     */
+    ch = (char) ((radix < 36) ? toupper (*str) : *str);
+    for (y = 0; y < 64; y++) {
+      if (ch == mp_s_rmap[y]) {
+         break;
+      }
+    }
+
+    /* if the char was found in the map 
+     * and is less than the given radix add it
+     * to the number, otherwise exit the loop. 
+     */
+    if (y < radix) {
+      if ((res = mp_mul_d (a, (mp_digit) radix, a)) != MP_OKAY) {
+         return res;
+      }
+      if ((res = mp_add_d (a, (mp_digit) y, a)) != MP_OKAY) {
+         return res;
+      }
+    } else {
+      break;
+    }
+    ++str;
+  }
+  
+  /* set the sign only if a != 0 */
+  if (mp_iszero(a) != 1) {
+     a->sign = neg;
+  }
+  return MP_OKAY;
+}
diff --git a/bn_mp_toradix.c b/bn_mp_toradix.c
new file mode 100644
index 0000000..52097b1
--- /dev/null
+++ b/bn_mp_toradix.c
@@ -0,0 +1,70 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* stores a bignum as a ASCII string in a given radix (2..64) */
+int
+mp_toradix (mp_int * a, char *str, int radix)
+{
+  int     res, digs;
+  mp_int  t;
+  mp_digit d;
+  char   *_s = str;
+
+  if (radix < 2 || radix > 64) {
+    return MP_VAL;
+  }
+  
+  /* quick out if its zero */
+  if (mp_iszero(a) == 1) {
+     *str++ = '0';
+     *str = '\0';
+     return MP_OKAY;
+  }
+  
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return res;
+  }
+
+  /* if it is negative output a - */
+  if (t.sign == MP_NEG) {
+    ++_s;
+    *str++ = '-';
+    t.sign = MP_ZPOS;
+  }
+
+  digs = 0;
+  while (mp_iszero (&t) == 0) {
+    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
+      mp_clear (&t);
+      return res;
+    }
+    *str++ = mp_s_rmap[d];
+    ++digs;
+  }
+
+  /* reverse the digits of the string.  In this case _s points
+   * to the first digit [exluding the sign] of the number]
+   */
+  bn_reverse ((unsigned char *)_s, digs);
+  
+  /* append a NULL so the string is properly terminated */
+  *str++ = '\0';
+  
+  
+  mp_clear (&t);
+  return MP_OKAY;
+}
+
diff --git a/bn_radix.c b/bn_radix.c
deleted file mode 100644
index 42419cd..0000000
--- a/bn_radix.c
+++ /dev/null
@@ -1,224 +0,0 @@
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* chars used in radix conversions */
-static const char *s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
-
-/* read a string [ASCII] in a given radix */
-int
-mp_read_radix (mp_int * a, char *str, int radix)
-{
-  int     y, res, neg;
-  char    ch;
-
-  if (radix < 2 || radix > 64) {
-    return MP_VAL;
-  }
-
-  if (*str == '-') {
-    ++str;
-    neg = MP_NEG;
-  } else {
-    neg = MP_ZPOS;
-  }
-
-  mp_zero (a);
-  while (*str) {
-    ch = (char) ((radix < 36) ? toupper (*str) : *str);
-    for (y = 0; y < 64; y++) {
-      if (ch == s_rmap[y]) {
-    break;
-      }
-    }
-
-    if (y < radix) {
-      if ((res = mp_mul_d (a, (mp_digit) radix, a)) != MP_OKAY) {
-    return res;
-      }
-      if ((res = mp_add_d (a, (mp_digit) y, a)) != MP_OKAY) {
-    return res;
-      }
-    } else {
-      break;
-    }
-    ++str;
-  }
-  if (mp_iszero(a) != 1) {
-     a->sign = neg;
-  }
-  return MP_OKAY;
-}
-
-/* stores a bignum as a ASCII string in a given radix (2..64) */
-int
-mp_toradix (mp_int * a, char *str, int radix)
-{
-  int     res, digs;
-  mp_int  t;
-  mp_digit d;
-  char   *_s = str;
-
-  if (radix < 2 || radix > 64) {
-    return MP_VAL;
-  }
-  
-  /* quick out if its zero */
-  if (mp_iszero(a) == 1) {
-     *str++ = '0';
-     *str = '\0';
-     return MP_OKAY;
-  }
-  
-
-  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-    return res;
-  }
-
-  if (t.sign == MP_NEG) {
-    ++_s;
-    *str++ = '-';
-    t.sign = MP_ZPOS;
-  }
-
-  digs = 0;
-  while (mp_iszero (&t) == 0) {
-    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
-      mp_clear (&t);
-      return res;
-    }
-    *str++ = s_rmap[d];
-    ++digs;
-  }
-  bn_reverse ((unsigned char *)_s, digs);
-  *str++ = '\0';
-  mp_clear (&t);
-  return MP_OKAY;
-}
-
-/* returns size of ASCII reprensentation */
-int
-mp_radix_size (mp_int * a, int radix)
-{
-  int     res, digs;
-  mp_int  t;
-  mp_digit d;
-
-  /* special case for binary */
-  if (radix == 2) {
-    return mp_count_bits (a) + (a->sign == MP_NEG ? 1 : 0) + 1;
-  }
-
-  if (radix < 2 || radix > 64) {
-    return 0;
-  }
-
-  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-    return 0;
-  }
-
-  digs = 0;
-  if (t.sign == MP_NEG) {
-    ++digs;
-    t.sign = MP_ZPOS;
-  }
-
-  while (mp_iszero (&t) == 0) {
-    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
-      mp_clear (&t);
-      return 0;
-    }
-    ++digs;
-  }
-  mp_clear (&t);
-  return digs + 1;
-}
-
-/* read a bigint from a file stream in ASCII */
-int mp_fread(mp_int *a, int radix, FILE *stream)
-{
-   int err, ch, neg, y;
-   
-   /* clear a */
-   mp_zero(a);
-   
-   /* if first digit is - then set negative */
-   ch = fgetc(stream);
-   if (ch == '-') {
-      neg = MP_NEG;
-      ch = fgetc(stream);
-   } else {
-      neg = MP_ZPOS;
-   }
-   
-   for (;;) {
-      /* find y in the radix map */
-      for (y = 0; y < radix; y++) {
-          if (s_rmap[y] == ch) {
-             break;
-          }
-      }
-      if (y == radix) {
-         break;
-      }
-      
-      /* shift up and add */
-      if ((err = mp_mul_d(a, radix, a)) != MP_OKAY) {
-         return err;
-      }
-      if ((err = mp_add_d(a, y, a)) != MP_OKAY) {
-         return err;
-      }
-      
-      ch = fgetc(stream);
-   }
-   if (mp_cmp_d(a, 0) != MP_EQ) {
-      a->sign = neg;
-   }
-   
-   return MP_OKAY;
-}
-
-int mp_fwrite(mp_int *a, int radix, FILE *stream)
-{
-   char *buf;
-   int err, len, x;
-   
-   len = mp_radix_size(a, radix);
-   if (len == 0) {
-      return MP_VAL;
-   }
-   
-   buf = malloc(len);
-   if (buf == NULL) {
-      return MP_MEM;
-   }
-   
-   if ((err = mp_toradix(a, buf, radix)) != MP_OKAY) {
-      free(buf);
-      return err;
-   }
-   
-   for (x = 0; x < len; x++) {
-       if (fputc(buf[x], stream) == EOF) {
-          free(buf);
-          return MP_VAL;
-       }
-   }
-   
-   free(buf);
-   return MP_OKAY;
-}
-
diff --git a/bn_s_mp_exptmod.c b/bn_s_mp_exptmod.c
index fcd0e01..931d49f 100644
--- a/bn_s_mp_exptmod.c
+++ b/bn_s_mp_exptmod.c
@@ -1,216 +1,230 @@
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-int
-s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
-{
-  mp_int  M[256], res, mu;
-  mp_digit buf;
-  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
-
-  /* find window size */
-  x = mp_count_bits (X);
-  if (x <= 7) {
-    winsize = 2;
-  } else if (x <= 36) {
-    winsize = 3;
-  } else if (x <= 140) {
-    winsize = 4;
-  } else if (x <= 450) {
-    winsize = 5;
-  } else if (x <= 1303) {
-    winsize = 6;
-  } else if (x <= 3529) {
-    winsize = 7;
-  } else {
-    winsize = 8;
-  }
-
-#ifdef MP_LOW_MEM
-    if (winsize > 5) {
-       winsize = 5;
-    }
-#endif
-
-  /* init M array */
-  for (x = 0; x < (1 << winsize); x++) {
-    if ((err = mp_init_size (&M[x], 1)) != MP_OKAY) {
-      for (y = 0; y < x; y++) {
-        mp_clear (&M[y]);
-      }
-      return err;
-    }
-  }
-
-  /* create mu, used for Barrett reduction */
-  if ((err = mp_init (&mu)) != MP_OKAY) {
-    goto __M;
-  }
-  if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
-    goto __MU;
-  }
-
-  /* create M table
-   *
-   * The M table contains powers of the base, 
-   * e.g. M[x] = G**x mod P
-   *
-   * The first half of the table is not 
-   * computed though accept for M[0] and M[1]
-   */
-  if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
-    goto __MU;
-  }
-
-  /* compute the value at M[1<<(winsize-1)] by squaring 
-   * M[1] (winsize-1) times 
-   */
-  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
-    goto __MU;
-  }
-
-  for (x = 0; x < (winsize - 1); x++) {
-    if ((err = mp_sqr (&M[1 << (winsize - 1)], 
-                       &M[1 << (winsize - 1)])) != MP_OKAY) {
-      goto __MU;
-    }
-    if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
-      goto __MU;
-    }
-  }
-
-  /* create upper table */
-  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
-    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
-      goto __MU;
-    }
-    if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) {
-      goto __MU;
-    }
-  }
-
-  /* setup result */
-  if ((err = mp_init (&res)) != MP_OKAY) {
-    goto __MU;
-  }
-  mp_set (&res, 1);
-
-  /* set initial mode and bit cnt */
-  mode   = 0;
-  bitcnt = 1;
-  buf    = 0;
-  digidx = X->used - 1;
-  bitcpy = 0;
-  bitbuf = 0;
-
-  for (;;) {
-    /* grab next digit as required */
-    if (--bitcnt == 0) {
-      if (digidx == -1) {
-        break;
-      }
-      buf = X->dp[digidx--];
-      bitcnt = (int) DIGIT_BIT;
-    }
-
-    /* grab the next msb from the exponent */
-    y = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
-    buf <<= (mp_digit)1;
-
-    /* if the bit is zero and mode == 0 then we ignore it
-     * These represent the leading zero bits before the first 1 bit
-     * in the exponent.  Technically this opt is not required but it
-     * does lower the # of trivial squaring/reductions used
-     */
-    if (mode == 0 && y == 0)
-      continue;
-
-    /* if the bit is zero and mode == 1 then we square */
-    if (mode == 1 && y == 0) {
-      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto __RES;
-      }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-        goto __RES;
-      }
-      continue;
-    }
-
-    /* else we add it to the window */
-    bitbuf |= (y << (winsize - ++bitcpy));
-    mode = 2;
-
-    if (bitcpy == winsize) {
-      /* ok window is filled so square as required and multiply  */
-      /* square first */
-      for (x = 0; x < winsize; x++) {
-        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-          goto __RES;
-        }
-        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-          goto __RES;
-        }
-      }
-
-      /* then multiply */
-      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
-        goto __MU;
-      }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-        goto __MU;
-      }
-
-      /* empty window and reset */
-      bitcpy = 0;
-      bitbuf = 0;
-      mode = 1;
-    }
-  }
-
-  /* if bits remain then square/multiply */
-  if (mode == 2 && bitcpy > 0) {
-    /* square then multiply if the bit is set */
-    for (x = 0; x < bitcpy; x++) {
-      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto __RES;
-      }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-        goto __RES;
-      }
-
-      bitbuf <<= 1;
-      if ((bitbuf & (1 << winsize)) != 0) {
-        /* then multiply */
-        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
-          goto __RES;
-        }
-        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-          goto __RES;
-        }
-      }
-    }
-  }
-
-  mp_exch (&res, Y);
-  err = MP_OKAY;
-__RES:mp_clear (&res);
-__MU:mp_clear (&mu);
-__M:
-  for (x = 0; x < (1 << winsize); x++) {
-    mp_clear (&M[x]);
-  }
-  return err;
-}
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+#ifdef MP_LOW_MEM
+   #define TAB_SIZE 32
+#else
+   #define TAB_SIZE 256
+#endif
+
+int
+s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
+{
+  mp_int  M[TAB_SIZE], res, mu;
+  mp_digit buf;
+  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+
+  /* find window size */
+  x = mp_count_bits (X);
+  if (x <= 7) {
+    winsize = 2;
+  } else if (x <= 36) {
+    winsize = 3;
+  } else if (x <= 140) {
+    winsize = 4;
+  } else if (x <= 450) {
+    winsize = 5;
+  } else if (x <= 1303) {
+    winsize = 6;
+  } else if (x <= 3529) {
+    winsize = 7;
+  } else {
+    winsize = 8;
+  }
+
+#ifdef MP_LOW_MEM
+    if (winsize > 5) {
+       winsize = 5;
+    }
+#endif
+
+  /* init M array */
+  /* init first cell */
+  if ((err = mp_init(&M[1])) != MP_OKAY) {
+     return err; 
+  }
+
+  /* now init the second half of the array */
+  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
+    if ((err = mp_init(&M[x])) != MP_OKAY) {
+      for (y = 1<<(winsize-1); y < x; y++) {
+        mp_clear (&M[y]);
+      }
+      mp_clear(&M[1]);
+      return err;
+    }
+  }
+
+  /* create mu, used for Barrett reduction */
+  if ((err = mp_init (&mu)) != MP_OKAY) {
+    goto __M;
+  }
+  if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
+    goto __MU;
+  }
+
+  /* create M table
+   *
+   * The M table contains powers of the base, 
+   * e.g. M[x] = G**x mod P
+   *
+   * The first half of the table is not 
+   * computed though accept for M[0] and M[1]
+   */
+  if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
+    goto __MU;
+  }
+
+  /* compute the value at M[1<<(winsize-1)] by squaring 
+   * M[1] (winsize-1) times 
+   */
+  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
+    goto __MU;
+  }
+
+  for (x = 0; x < (winsize - 1); x++) {
+    if ((err = mp_sqr (&M[1 << (winsize - 1)], 
+                       &M[1 << (winsize - 1)])) != MP_OKAY) {
+      goto __MU;
+    }
+    if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
+      goto __MU;
+    }
+  }
+
+  /* create upper table */
+  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
+    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
+      goto __MU;
+    }
+    if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) {
+      goto __MU;
+    }
+  }
+
+  /* setup result */
+  if ((err = mp_init (&res)) != MP_OKAY) {
+    goto __MU;
+  }
+  mp_set (&res, 1);
+
+  /* set initial mode and bit cnt */
+  mode   = 0;
+  bitcnt = 1;
+  buf    = 0;
+  digidx = X->used - 1;
+  bitcpy = 0;
+  bitbuf = 0;
+
+  for (;;) {
+    /* grab next digit as required */
+    if (--bitcnt == 0) {
+      if (digidx == -1) {
+        break;
+      }
+      buf = X->dp[digidx--];
+      bitcnt = (int) DIGIT_BIT;
+    }
+
+    /* grab the next msb from the exponent */
+    y = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
+    buf <<= (mp_digit)1;
+
+    /* if the bit is zero and mode == 0 then we ignore it
+     * These represent the leading zero bits before the first 1 bit
+     * in the exponent.  Technically this opt is not required but it
+     * does lower the # of trivial squaring/reductions used
+     */
+    if (mode == 0 && y == 0)
+      continue;
+
+    /* if the bit is zero and mode == 1 then we square */
+    if (mode == 1 && y == 0) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+        goto __RES;
+      }
+      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+        goto __RES;
+      }
+      continue;
+    }
+
+    /* else we add it to the window */
+    bitbuf |= (y << (winsize - ++bitcpy));
+    mode = 2;
+
+    if (bitcpy == winsize) {
+      /* ok window is filled so square as required and multiply  */
+      /* square first */
+      for (x = 0; x < winsize; x++) {
+        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+          goto __RES;
+        }
+        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+          goto __RES;
+        }
+      }
+
+      /* then multiply */
+      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
+        goto __MU;
+      }
+      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+        goto __MU;
+      }
+
+      /* empty window and reset */
+      bitcpy = 0;
+      bitbuf = 0;
+      mode = 1;
+    }
+  }
+
+  /* if bits remain then square/multiply */
+  if (mode == 2 && bitcpy > 0) {
+    /* square then multiply if the bit is set */
+    for (x = 0; x < bitcpy; x++) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+        goto __RES;
+      }
+      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+        goto __RES;
+      }
+
+      bitbuf <<= 1;
+      if ((bitbuf & (1 << winsize)) != 0) {
+        /* then multiply */
+        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
+          goto __RES;
+        }
+        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+          goto __RES;
+        }
+      }
+    }
+  }
+
+  mp_exch (&res, Y);
+  err = MP_OKAY;
+__RES:mp_clear (&res);
+__MU:mp_clear (&mu);
+__M:
+  mp_clear(&M[1]);
+  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
+    mp_clear (&M[x]);
+  }
+  return err;
+}
diff --git a/booker.pl b/booker.pl
index 5bc6645..4f4231f 100644
--- a/booker.pl
+++ b/booker.pl
@@ -75,7 +75,7 @@ while (<IN>) {
       $line = 0;
       $tmp = $m[1];
       $tmp =~ s/_/"\\_"/ge;
-      print OUT "\\index{$tmp}\n\\vspace{+3mm}\\begin{small}\n\\hspace{-5.1mm}{\\bf File}: $tmp\n\\vspace{-3mm}\n\\begin{alltt}\n";
+      print OUT "\\vspace{+3mm}\\begin{small}\n\\hspace{-5.1mm}{\\bf File}: $tmp\n\\vspace{-3mm}\n\\begin{alltt}\n";
       $wroteline += 5;
       
       if ($skipheader == 1) {
@@ -248,7 +248,7 @@ while (<IN>) {
       chomp($_);
       @m = split(",", $_);
       print OUT "\\begin{center}\n\\begin{figure}[here]\n\\includegraphics{pics/$m[1]$graph}\n";
-      print OUT "\\caption{$m[2]}\n\\end{figure}\n\\end{center}\n";
+      print OUT "\\caption{$m[2]}\n\\label{pic:$m[1]}\n\\end{figure}\n\\end{center}\n";
       $wroteline += 4;
    } else {
       print OUT $_;
diff --git a/changes.txt b/changes.txt
index fefe6fe..2dc77c8 100644
--- a/changes.txt
+++ b/changes.txt
@@ -1,3 +1,14 @@
+July 2nd, 2003
+v0.22  -- Fixed up mp_invmod so the result is properly in range now [was always congruent to the inverse...]
+       -- Fixed up s_mp_exptmod and mp_exptmod_fast so the lower half of the pre-computed table isn't allocated
+          which makes the algorithm use half as much ram.
+       -- Fixed the install script not to make the book :-) [which isn't included anyways]
+       -- added mp_cnt_lsb() which counts how many of the lsbs are zero
+       -- optimized mp_gcd() to use the new mp_cnt_lsb() to replace multiple divisions by two by a single division.
+       -- applied similar optimization to mp_prime_miller_rabin().
+       -- Fixed a bug in both mp_invmod() and fast_mp_invmod() which tested for odd
+          via "mp_iseven() == 0" which is not valid [since zero is not even either].
+
 June 19th, 2003
 v0.21  -- Fixed bug in mp_mul_d which would not handle sign correctly [would not always forward it]
        -- Removed the #line lines from gen.pl [was in violation of ISO C]       
diff --git a/demo/demo.c b/demo/demo.c
index 5f4736b..a60d112 100644
--- a/demo/demo.c
+++ b/demo/demo.c
@@ -64,6 +64,19 @@ int main(void)
    mp_init(&f);   
    
    srand(time(NULL));
+
+#if 0
+   /* test mp_cnt_lsb */
+   mp_set(&a, 1);
+   for (ix = 0; ix < 128; ix++) {
+       if (mp_cnt_lsb(&a) != ix) {
+          printf("Failed at %d\n", ix);
+          return 0;
+       }
+       mp_mul_2(&a, &a);
+   }
+#endif
+
 /* test mp_reduce_2k */
 #if 0
    for (cnt = 3; cnt <= 4096; ++cnt) {
@@ -325,8 +338,6 @@ int main(void)
    /* force KARA and TOOM to enable despite cutoffs */
    KARATSUBA_SQR_CUTOFF = KARATSUBA_MUL_CUTOFF = 110;
    TOOM_SQR_CUTOFF      = TOOM_MUL_CUTOFF      = 150;
-   
-   
 
    for (;;) {
        /* randomly clear and re-init one variable, this has the affect of triming the alloc space */
diff --git a/etc/2kprime.1 b/etc/2kprime.1
index eb12565..c41ded1 100644
--- a/etc/2kprime.1
+++ b/etc/2kprime.1
@@ -1,2 +1,2 @@
-256-bits (k = 36113) = 115792089237316195423570985008687907853269984665640564039457584007913129603823
-512-bits (k = 38117) = 13407807929942597099574024998205846127479365820592393377723561443721764030073546976801874298166903427690031858186486050853753882811946569946433649006045979
+256-bits (k = 36113) = 115792089237316195423570985008687907853269984665640564039457584007913129603823
+512-bits (k = 38117) = 13407807929942597099574024998205846127479365820592393377723561443721764030073546976801874298166903427690031858186486050853753882811946569946433649006045979
diff --git a/makefile b/makefile
index 8fce574..ddfb64c 100644
--- a/makefile
+++ b/makefile
@@ -1,6 +1,6 @@
 CFLAGS  +=  -I./ -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops
 
-VERSION=0.21
+VERSION=0.22
 
 default: libtommath.a
 
@@ -29,25 +29,25 @@ bn_mp_mulmod.o bn_mp_sqrmod.o bn_mp_gcd.o bn_mp_lcm.o bn_fast_mp_invmod.o bn_mp_
 bn_mp_reduce.o bn_mp_montgomery_setup.o bn_fast_mp_montgomery_reduce.o bn_mp_montgomery_reduce.o \
 bn_mp_exptmod_fast.o bn_mp_exptmod.o bn_mp_2expt.o bn_mp_n_root.o bn_mp_jacobi.o bn_reverse.o \
 bn_mp_count_bits.o bn_mp_read_unsigned_bin.o bn_mp_read_signed_bin.o bn_mp_to_unsigned_bin.o \
-bn_mp_to_signed_bin.o bn_mp_unsigned_bin_size.o bn_mp_signed_bin_size.o bn_radix.o \
+bn_mp_to_signed_bin.o bn_mp_unsigned_bin_size.o bn_mp_signed_bin_size.o  \
 bn_mp_xor.o bn_mp_and.o bn_mp_or.o bn_mp_rand.o bn_mp_montgomery_calc_normalization.o \
 bn_mp_prime_is_divisible.o bn_prime_tab.o bn_mp_prime_fermat.o bn_mp_prime_miller_rabin.o \
 bn_mp_prime_is_prime.o bn_mp_prime_next_prime.o bn_mp_dr_reduce.o bn_mp_multi.o \
 bn_mp_dr_is_modulus.o bn_mp_dr_setup.o bn_mp_reduce_setup.o \
 bn_mp_toom_mul.o bn_mp_toom_sqr.o bn_mp_div_3.o bn_s_mp_exptmod.o \
-bn_mp_reduce_2k.o bn_mp_reduce_is_2k.o bn_mp_reduce_2k_setup.o
+bn_mp_reduce_2k.o bn_mp_reduce_is_2k.o bn_mp_reduce_2k_setup.o \
+bn_mp_radix_smap.o bn_mp_read_radix.o bn_mp_toradix.o bn_mp_radix_size.o \
+bn_mp_fread.o bn_mp_fwrite.o bn_mp_cnt_lsb.o
 
 libtommath.a:  $(OBJECTS)
 	$(AR) $(ARFLAGS) libtommath.a $(OBJECTS)
 	ranlib libtommath.a
 
-install: libtommath.a docs
+install: libtommath.a
 	install -d -g root -o root $(DESTDIR)$(LIBPATH)
 	install -d -g root -o root $(DESTDIR)$(INCPATH)
-	install -d -g root -o root $(DESTDIR)$(DATAPATH)
 	install -g root -o root $(LIBNAME) $(DESTDIR)$(LIBPATH)
 	install -g root -o root $(HEADERS) $(DESTDIR)$(INCPATH)
-	install -g root -o root bn.pdf $(DESTDIR)$(DATAPATH)
 
 test: libtommath.a demo/demo.o
 	$(CC) demo/demo.o libtommath.a -o test
@@ -103,6 +103,6 @@ clean:
 zipup: clean manual poster
 	perl gen.pl ; mv mpi.c pre_gen/ ; \
 	cd .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \
-	cp -R ./libtommath/* ./libtommath-$(VERSION)/ ; cp tdcal.pdf ./libtommath-$(VERSION)/ ; cd ./libtommath-$(VERSION) ; rm -f tommath.src tommath.tex tommath.out ; cd pics ; rm -f * ; cd .. ; cd .. ; ls ; \
+	cp -R ./libtommath/* ./libtommath-$(VERSION)/ ; cp tdcal.pdf ./libtommath-$(VERSION)/ ; cd ./libtommath-$(VERSION) ; rm -f tommath.src tommath.tex tommath.out ; cd pics ; rm -f *.tif *.ps *.pdf ; cd .. ; cd .. ; ls ; \
 	tar -c libtommath-$(VERSION)/* > ltm-$(VERSION).tar ; \
 	bzip2 -9vv ltm-$(VERSION).tar ; zip -9 -r ltm-$(VERSION).zip libtommath-$(VERSION)/*
diff --git a/makefile.bcc b/makefile.bcc
index bc029fc..b227e8d 100644
--- a/makefile.bcc
+++ b/makefile.bcc
@@ -1,37 +1,39 @@
-#
-# Borland C++Builder Makefile (makefile.bcc)
-#
-
-
-LIB = tlib
-CC = bcc32
-CFLAGS = -c -O2 -I.
-
-OBJECTS=bncore.obj bn_mp_init.obj bn_mp_clear.obj bn_mp_exch.obj bn_mp_grow.obj bn_mp_shrink.obj \
-bn_mp_clamp.obj bn_mp_zero.obj  bn_mp_set.obj bn_mp_set_int.obj bn_mp_init_size.obj bn_mp_copy.obj \
-bn_mp_init_copy.obj bn_mp_abs.obj bn_mp_neg.obj bn_mp_cmp_mag.obj bn_mp_cmp.obj bn_mp_cmp_d.obj \
-bn_mp_rshd.obj bn_mp_lshd.obj bn_mp_mod_2d.obj bn_mp_div_2d.obj bn_mp_mul_2d.obj bn_mp_div_2.obj \
-bn_mp_mul_2.obj bn_s_mp_add.obj bn_s_mp_sub.obj bn_fast_s_mp_mul_digs.obj bn_s_mp_mul_digs.obj \
-bn_fast_s_mp_mul_high_digs.obj bn_s_mp_mul_high_digs.obj bn_fast_s_mp_sqr.obj bn_s_mp_sqr.obj \
-bn_mp_add.obj bn_mp_sub.obj bn_mp_karatsuba_mul.obj bn_mp_mul.obj bn_mp_karatsuba_sqr.obj \
-bn_mp_sqr.obj bn_mp_div.obj bn_mp_mod.obj bn_mp_add_d.obj bn_mp_sub_d.obj bn_mp_mul_d.obj \
-bn_mp_div_d.obj bn_mp_mod_d.obj bn_mp_expt_d.obj bn_mp_addmod.obj bn_mp_submod.obj \
-bn_mp_mulmod.obj bn_mp_sqrmod.obj bn_mp_gcd.obj bn_mp_lcm.obj bn_fast_mp_invmod.obj bn_mp_invmod.obj \
-bn_mp_reduce.obj bn_mp_montgomery_setup.obj bn_fast_mp_montgomery_reduce.obj bn_mp_montgomery_reduce.obj \
-bn_mp_exptmod_fast.obj bn_mp_exptmod.obj bn_mp_2expt.obj bn_mp_n_root.obj bn_mp_jacobi.obj bn_reverse.obj \
-bn_mp_count_bits.obj bn_mp_read_unsigned_bin.obj bn_mp_read_signed_bin.obj bn_mp_to_unsigned_bin.obj \
-bn_mp_to_signed_bin.obj bn_mp_unsigned_bin_size.obj bn_mp_signed_bin_size.obj bn_radix.obj \
-bn_mp_xor.obj bn_mp_and.obj bn_mp_or.obj bn_mp_rand.obj bn_mp_montgomery_calc_normalization.obj \
-bn_mp_prime_is_divisible.obj bn_prime_tab.obj bn_mp_prime_fermat.obj bn_mp_prime_miller_rabin.obj \
-bn_mp_prime_is_prime.obj bn_mp_prime_next_prime.obj bn_mp_dr_reduce.obj bn_mp_multi.obj \
-bn_mp_dr_is_modulus.obj bn_mp_dr_setup.obj bn_mp_reduce_setup.obj \
-bn_mp_toom_mul.obj bn_mp_toom_sqr.obj bn_mp_div_3.obj bn_s_mp_exptmod.obj \
-bn_mp_reduce_2k.obj bn_mp_reduce_is_2k.obj bn_mp_reduce_2k_setup.obj
-
-TARGET = libtommath.lib
-
-$(TARGET): $(OBJECTS)
-
-.c.obj:
-	$(CC) $(CFLAGS) $<
+#
+# Borland C++Builder Makefile (makefile.bcc)
+#
+
+
+LIB = tlib
+CC = bcc32
+CFLAGS = -c -O2 -I.
+
+OBJECTS=bncore.obj bn_mp_init.obj bn_mp_clear.obj bn_mp_exch.obj bn_mp_grow.obj bn_mp_shrink.obj \
+bn_mp_clamp.obj bn_mp_zero.obj  bn_mp_set.obj bn_mp_set_int.obj bn_mp_init_size.obj bn_mp_copy.obj \
+bn_mp_init_copy.obj bn_mp_abs.obj bn_mp_neg.obj bn_mp_cmp_mag.obj bn_mp_cmp.obj bn_mp_cmp_d.obj \
+bn_mp_rshd.obj bn_mp_lshd.obj bn_mp_mod_2d.obj bn_mp_div_2d.obj bn_mp_mul_2d.obj bn_mp_div_2.obj \
+bn_mp_mul_2.obj bn_s_mp_add.obj bn_s_mp_sub.obj bn_fast_s_mp_mul_digs.obj bn_s_mp_mul_digs.obj \
+bn_fast_s_mp_mul_high_digs.obj bn_s_mp_mul_high_digs.obj bn_fast_s_mp_sqr.obj bn_s_mp_sqr.obj \
+bn_mp_add.obj bn_mp_sub.obj bn_mp_karatsuba_mul.obj bn_mp_mul.obj bn_mp_karatsuba_sqr.obj \
+bn_mp_sqr.obj bn_mp_div.obj bn_mp_mod.obj bn_mp_add_d.obj bn_mp_sub_d.obj bn_mp_mul_d.obj \
+bn_mp_div_d.obj bn_mp_mod_d.obj bn_mp_expt_d.obj bn_mp_addmod.obj bn_mp_submod.obj \
+bn_mp_mulmod.obj bn_mp_sqrmod.obj bn_mp_gcd.obj bn_mp_lcm.obj bn_fast_mp_invmod.obj bn_mp_invmod.obj \
+bn_mp_reduce.obj bn_mp_montgomery_setup.obj bn_fast_mp_montgomery_reduce.obj bn_mp_montgomery_reduce.obj \
+bn_mp_exptmod_fast.obj bn_mp_exptmod.obj bn_mp_2expt.obj bn_mp_n_root.obj bn_mp_jacobi.obj bn_reverse.obj \
+bn_mp_count_bits.obj bn_mp_read_unsigned_bin.obj bn_mp_read_signed_bin.obj bn_mp_to_unsigned_bin.obj \
+bn_mp_to_signed_bin.obj bn_mp_unsigned_bin_size.obj bn_mp_signed_bin_size.obj  \
+bn_mp_xor.obj bn_mp_and.obj bn_mp_or.obj bn_mp_rand.obj bn_mp_montgomery_calc_normalization.obj \
+bn_mp_prime_is_divisible.obj bn_prime_tab.obj bn_mp_prime_fermat.obj bn_mp_prime_miller_rabin.obj \
+bn_mp_prime_is_prime.obj bn_mp_prime_next_prime.obj bn_mp_dr_reduce.obj bn_mp_multi.obj \
+bn_mp_dr_is_modulus.obj bn_mp_dr_setup.obj bn_mp_reduce_setup.obj \
+bn_mp_toom_mul.obj bn_mp_toom_sqr.obj bn_mp_div_3.obj bn_s_mp_exptmod.obj \
+bn_mp_reduce_2k.obj bn_mp_reduce_is_2k.obj bn_mp_reduce_2k_setup.obj \
+bn_mp_radix_smap.obj bn_mp_read_radix.obj bn_mp_toradix.obj bn_mp_radix_size.obj \
+bn_mp_fread.obj bn_mp_fwrite.obj bn_mp_cnt_lsb.obj
+
+TARGET = libtommath.lib
+
+$(TARGET): $(OBJECTS)
+
+.c.obj:
+	$(CC) $(CFLAGS) $<
 	$(LIB) $(TARGET) -+$@
\ No newline at end of file
diff --git a/makefile.msvc b/makefile.msvc
index 652c135..619e2f0 100644
--- a/makefile.msvc
+++ b/makefile.msvc
@@ -19,13 +19,15 @@ bn_mp_mulmod.obj bn_mp_sqrmod.obj bn_mp_gcd.obj bn_mp_lcm.obj bn_fast_mp_invmod.
 bn_mp_reduce.obj bn_mp_montgomery_setup.obj bn_fast_mp_montgomery_reduce.obj bn_mp_montgomery_reduce.obj \
 bn_mp_exptmod_fast.obj bn_mp_exptmod.obj bn_mp_2expt.obj bn_mp_n_root.obj bn_mp_jacobi.obj bn_reverse.obj \
 bn_mp_count_bits.obj bn_mp_read_unsigned_bin.obj bn_mp_read_signed_bin.obj bn_mp_to_unsigned_bin.obj \
-bn_mp_to_signed_bin.obj bn_mp_unsigned_bin_size.obj bn_mp_signed_bin_size.obj bn_radix.obj \
+bn_mp_to_signed_bin.obj bn_mp_unsigned_bin_size.obj bn_mp_signed_bin_size.obj  \
 bn_mp_xor.obj bn_mp_and.obj bn_mp_or.obj bn_mp_rand.obj bn_mp_montgomery_calc_normalization.obj \
 bn_mp_prime_is_divisible.obj bn_prime_tab.obj bn_mp_prime_fermat.obj bn_mp_prime_miller_rabin.obj \
 bn_mp_prime_is_prime.obj bn_mp_prime_next_prime.obj bn_mp_dr_reduce.obj bn_mp_multi.obj \
 bn_mp_dr_is_modulus.obj bn_mp_dr_setup.obj bn_mp_reduce_setup.obj \
 bn_mp_toom_mul.obj bn_mp_toom_sqr.obj bn_mp_div_3.obj bn_s_mp_exptmod.obj \
-bn_mp_reduce_2k.obj bn_mp_reduce_is_2k.obj bn_mp_reduce_2k_setup.obj
+bn_mp_reduce_2k.obj bn_mp_reduce_is_2k.obj bn_mp_reduce_2k_setup.obj \
+bn_mp_radix_smap.obj bn_mp_read_radix.obj bn_mp_toradix.obj bn_mp_radix_size.obj \
+bn_mp_fread.obj bn_mp_fwrite.obj bn_mp_cnt_lsb.obj
 
 
 
diff --git a/mtest/mtest.c b/mtest/mtest.c
index 5abc1a4..2c111c5 100644
--- a/mtest/mtest.c
+++ b/mtest/mtest.c
@@ -107,7 +107,7 @@ int main(void)
          sleep(1);
          t1 = clock();
       }
-   
+
        n = fgetc(rng) % 13;
 
    if (n == 0) {
diff --git a/pics/expt_state.sxd b/pics/expt_state.sxd
new file mode 100644
index 0000000..6518404
Binary files /dev/null and b/pics/expt_state.sxd differ
diff --git a/pics/makefile b/pics/makefile
new file mode 100644
index 0000000..07bbca9
--- /dev/null
+++ b/pics/makefile
@@ -0,0 +1,30 @@
+# makes the images... yeah
+
+default:  pses
+
+
+sliding_window.ps: sliding_window.tif
+	tiff2ps -c -e sliding_window.tif > sliding_window.ps
+	
+expt_state.ps: expt_state.tif
+	tiff2ps -c -e expt_state.tif > expt_state.ps
+
+primality.ps: primality.tif
+	tiff2ps -c -e primality.tif > primality.ps
+
+sliding_window.pdf: sliding_window.ps
+	epstopdf sliding_window.ps
+	
+expt_state.pdf: expt_state.ps
+	epstopdf expt_state.ps
+
+primality.pdf: primality.ps
+	epstopdf primality.ps
+
+
+pses: sliding_window.ps expt_state.ps primality.ps
+pdfes: sliding_window.pdf expt_state.pdf primality.pdf
+
+clean:
+	rm -rf *.ps *.pdf .xvpics
+   
\ No newline at end of file
diff --git a/pics/sliding_window.sxd b/pics/sliding_window.sxd
new file mode 100644
index 0000000..91e7c0d
Binary files /dev/null and b/pics/sliding_window.sxd differ
diff --git a/poster.pdf b/poster.pdf
index 0fd33d4..a37d2db 100644
Binary files a/poster.pdf and b/poster.pdf differ
diff --git a/pre_gen/mpi.c b/pre_gen/mpi.c
index 1115a5d..9818cbe 100644
--- a/pre_gen/mpi.c
+++ b/pre_gen/mpi.c
@@ -67,8 +67,8 @@ top:
     if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
       goto __ERR;
     }
-    /* 4.2 if A or B is odd then */
-    if (mp_iseven (&B) == 0) {
+    /* 4.2 if B is odd then */
+    if (mp_isodd (&B) == 1) {
       if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
         goto __ERR;
       }
@@ -85,8 +85,8 @@ top:
     if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
       goto __ERR;
     }
-    /* 5.2 if C,D are even then */
-    if (mp_iseven (&D) == 0) {
+    /* 5.2 if D is odd then */
+    if (mp_isodd (&D) == 1) {
       /* D = (D-x)/2 */
       if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
         goto __ERR;
@@ -1116,6 +1116,50 @@ mp_cmp_mag (mp_int * a, mp_int * b)
 
 /* End: bn_mp_cmp_mag.c */
 
+/* Start: bn_mp_cnt_lsb.c */
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Counts the number of lsbs which are zero before the first zero bit */
+int mp_cnt_lsb(mp_int *a)
+{
+   int x;
+   mp_digit q;
+
+   if (mp_iszero(a) == 1) {
+      return 0;
+   }
+
+   /* scan lower digits until non-zero */
+   for (x = 0; x < a->used && a->dp[x] == 0; x++);
+   q = a->dp[x];
+   x *= DIGIT_BIT;
+
+   /* now scan this digit until a 1 is found */
+   while ((q & 1) == 0) {
+      q >>= 1;
+      x  += 1;
+   }
+
+   return x;
+}
+
+
+/* End: bn_mp_cnt_lsb.c */
+
 /* Start: bn_mp_copy.c */
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
@@ -1560,11 +1604,14 @@ mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
   /* shift any bit count < DIGIT_BIT */
   D = (mp_digit) (b % DIGIT_BIT);
   if (D != 0) {
-    register mp_digit *tmpc, mask;
+    register mp_digit *tmpc, mask, shift;
 
     /* mask */
     mask = (((mp_digit)1) << D) - 1;
 
+    /* shift for lsb */
+    shift = DIGIT_BIT - D;
+
     /* alias */
     tmpc = c->dp + (c->used - 1);
 
@@ -1575,7 +1622,7 @@ mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
       rr = *tmpc & mask;
 
       /* shift the current word and mix in the carry bits from the previous word */
-      *tmpc = (*tmpc >> D) | (r << (DIGIT_BIT - D));
+      *tmpc = (*tmpc >> D) | (r << shift);
       --tmpc;
 
       /* set the carry to the carry bits of the current word found above */
@@ -1593,70 +1640,70 @@ mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
 /* End: bn_mp_div_2d.c */
 
 /* Start: bn_mp_div_3.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* divide by three (based on routine from MPI and the GMP manual) */
-int
-mp_div_3 (mp_int * a, mp_int *c, mp_digit * d)
-{
-  mp_int   q;
-  mp_word  w, t;
-  mp_digit b;
-  int      res, ix;
-  
-  /* b = 2**DIGIT_BIT / 3 */
-  b = (((mp_word)1) << ((mp_word)DIGIT_BIT)) / ((mp_word)3);
-
-  if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
-     return res;
-  }
-  
-  q.used = a->used;
-  q.sign = a->sign;
-  w = 0;
-  for (ix = a->used - 1; ix >= 0; ix--) {
-     w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
-     
-     if (w >= 3) {
-        t = (w * ((mp_word)b)) >> ((mp_word)DIGIT_BIT);
-        w -= (t << ((mp_word)1)) + t;
-        while (w >= 3) {
-           t += 1;
-           w -= 3;
-        }
-      } else {
-        t = 0;
-      }
-      q.dp[ix] = (mp_digit)t;
-  }
-  
-  if (d != NULL) {
-     *d = (mp_digit)w;
-  }
-  
-  if (c != NULL) {
-     mp_clamp(&q);
-     mp_exch(&q, c);
-  }
-  mp_clear(&q);
-  
-  return res;
-}
-
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* divide by three (based on routine from MPI and the GMP manual) */
+int
+mp_div_3 (mp_int * a, mp_int *c, mp_digit * d)
+{
+  mp_int   q;
+  mp_word  w, t;
+  mp_digit b;
+  int      res, ix;
+  
+  /* b = 2**DIGIT_BIT / 3 */
+  b = (((mp_word)1) << ((mp_word)DIGIT_BIT)) / ((mp_word)3);
+
+  if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
+     return res;
+  }
+  
+  q.used = a->used;
+  q.sign = a->sign;
+  w = 0;
+  for (ix = a->used - 1; ix >= 0; ix--) {
+     w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
+     
+     if (w >= 3) {
+        t = (w * ((mp_word)b)) >> ((mp_word)DIGIT_BIT);
+        w -= (t << ((mp_word)1)) + t;
+        while (w >= 3) {
+           t += 1;
+           w -= 3;
+        }
+      } else {
+        t = 0;
+      }
+      q.dp[ix] = (mp_digit)t;
+  }
+  
+  if (d != NULL) {
+     *d = (mp_digit)w;
+  }
+  
+  if (c != NULL) {
+     mp_clamp(&q);
+     mp_exch(&q, c);
+  }
+  mp_clear(&q);
+  
+  return res;
+}
+
 
 /* End: bn_mp_div_3.c */
 
@@ -2078,10 +2125,17 @@ mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
  *
  * Uses Montgomery or Diminished Radix reduction [whichever appropriate]
  */
+
+#ifdef MP_LOW_MEM
+   #define TAB_SIZE 32
+#else
+   #define TAB_SIZE 256
+#endif
+
 int
 mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
 {
-  mp_int  M[256], res;
+  mp_int  M[TAB_SIZE], res;
   mp_digit buf, mp;
   int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
   
@@ -2115,17 +2169,24 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
   }
 #endif
 
+  /* init M array */
+  /* init first cell */
+  if ((err = mp_init(&M[1])) != MP_OKAY) {
+     return err; 
+  }
 
-  /* init G array */
-  for (x = 0; x < (1 << winsize); x++) {
-    if ((err = mp_init (&M[x])) != MP_OKAY) {
-      for (y = 0; y < x; y++) {
+  /* now init the second half of the array */
+  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
+    if ((err = mp_init(&M[x])) != MP_OKAY) {
+      for (y = 1<<(winsize-1); y < x; y++) {
         mp_clear (&M[y]);
       }
+      mp_clear(&M[1]);
       return err;
     }
   }
 
+
   /* determine and setup reduction code */
   if (redmode == 0) {
      /* now setup montgomery  */
@@ -2314,7 +2375,8 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
   err = MP_OKAY;
 __RES:mp_clear (&res);
 __M:
-  for (x = 0; x < (1 << winsize); x++) {
+  mp_clear(&M[1]);
+  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
     mp_clear (&M[x]);
   }
   return err;
@@ -2322,6 +2384,122 @@ __M:
 
 /* End: bn_mp_exptmod_fast.c */
 
+/* Start: bn_mp_fread.c */
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* read a bigint from a file stream in ASCII */
+int mp_fread(mp_int *a, int radix, FILE *stream)
+{
+   int err, ch, neg, y;
+   
+   /* clear a */
+   mp_zero(a);
+   
+   /* if first digit is - then set negative */
+   ch = fgetc(stream);
+   if (ch == '-') {
+      neg = MP_NEG;
+      ch = fgetc(stream);
+   } else {
+      neg = MP_ZPOS;
+   }
+   
+   for (;;) {
+      /* find y in the radix map */
+      for (y = 0; y < radix; y++) {
+          if (mp_s_rmap[y] == ch) {
+             break;
+          }
+      }
+      if (y == radix) {
+         break;
+      }
+      
+      /* shift up and add */
+      if ((err = mp_mul_d(a, radix, a)) != MP_OKAY) {
+         return err;
+      }
+      if ((err = mp_add_d(a, y, a)) != MP_OKAY) {
+         return err;
+      }
+      
+      ch = fgetc(stream);
+   }
+   if (mp_cmp_d(a, 0) != MP_EQ) {
+      a->sign = neg;
+   }
+   
+   return MP_OKAY;
+}
+
+
+/* End: bn_mp_fread.c */
+
+/* Start: bn_mp_fwrite.c */
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+int mp_fwrite(mp_int *a, int radix, FILE *stream)
+{
+   char *buf;
+   int err, len, x;
+   
+   len = mp_radix_size(a, radix);
+   if (len == 0) {
+      return MP_VAL;
+   }
+   
+   buf = malloc(len);
+   if (buf == NULL) {
+      return MP_MEM;
+   }
+   
+   if ((err = mp_toradix(a, buf, radix)) != MP_OKAY) {
+      free(buf);
+      return err;
+   }
+   
+   for (x = 0; x < len; x++) {
+       if (fputc(buf[x], stream) == EOF) {
+          free(buf);
+          return MP_VAL;
+       }
+   }
+   
+   free(buf);
+   return MP_OKAY;
+}
+
+
+/* End: bn_mp_fwrite.c */
+
 /* Start: bn_mp_gcd.c */
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
@@ -2339,13 +2517,12 @@ __M:
  */
 #include <tommath.h>
 
-/* Greatest Common Divisor using the binary method [Algorithm B, page 338, vol2 of TAOCP]
- */
+/* Greatest Common Divisor using the binary method */
 int
 mp_gcd (mp_int * a, mp_int * b, mp_int * c)
 {
-  mp_int  u, v, t;
-  int     k, res, neg;
+  mp_int  u, v;
+  int     k, u_lsb, v_lsb, res;
 
   /* either zero than gcd is the largest */
   if (mp_iszero (a) == 1 && mp_iszero (b) == 0) {
@@ -2359,9 +2536,6 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c)
     return MP_OKAY;
   }
 
-  /* if both are negative they share (-1) as a common divisor */
-  neg = (a->sign == b->sign) ? a->sign : MP_ZPOS;
-
   if ((res = mp_init_copy (&u, a)) != MP_OKAY) {
     return res;
   }
@@ -2373,71 +2547,55 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c)
   /* must be positive for the remainder of the algorithm */
   u.sign = v.sign = MP_ZPOS;
 
-  if ((res = mp_init (&t)) != MP_OKAY) {
+  /* B1.  Find the common power of two for u and v */
+  u_lsb = mp_cnt_lsb(&u);
+  v_lsb = mp_cnt_lsb(&v);
+  k     = MIN(u_lsb, v_lsb);
+
+  if ((res = mp_div_2d(&u, k, &u, NULL)) != MP_OKAY) {
+     goto __V;
+  }
+
+  if ((res = mp_div_2d(&v, k, &v, NULL)) != MP_OKAY) {
+     goto __V;
+  }
+
+  /* divide any remaining factors of two out */
+  if (u_lsb != k) {
+     if ((res = mp_div_2d(&u, u_lsb - k, &u, NULL)) != MP_OKAY) {
+        goto __V;
+     }
+  }
+
+  if (v_lsb != k) {
+     if ((res = mp_div_2d(&v, v_lsb - k, &v, NULL)) != MP_OKAY) {
+        goto __V;
+     }
+  }
+  
+  while (mp_iszero(&v) == 0) {
+     /* make sure v is the largest */
+     if (mp_cmp_mag(&u, &v) == MP_GT) {
+        mp_exch(&u, &v);
+     }
+     
+     /* subtract smallest from largest */
+     if ((res = s_mp_sub(&v, &u, &v)) != MP_OKAY) {
+        goto __V;
+     }
+     
+     /* Divide out all factors of two */
+     if ((res = mp_div_2d(&v, mp_cnt_lsb(&v), &v, NULL)) != MP_OKAY) {
+        goto __V;
+     } 
+  } 
+  
+  /* multiply by 2**k which we divided out at the beginning */ 
+  if ((res = mp_mul_2d (&u, k, c)) != MP_OKAY) {
     goto __V;
   }
-
-  /* B1.  Find power of two */
-  k = 0;
-  while (mp_iseven(&u) == 1 && mp_iseven(&v) == 1) {
-    ++k;
-    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
-      goto __T;
-    }
-    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
-      goto __T;
-    }
-  }
-
-  /* B2.  Initialize */
-  if (mp_isodd(&u) == 1) {
-    /* t = -v */
-    if ((res = mp_copy (&v, &t)) != MP_OKAY) {
-      goto __T;
-    }
-    t.sign = MP_NEG;
-  } else {
-    /* t = u */
-    if ((res = mp_copy (&u, &t)) != MP_OKAY) {
-      goto __T;
-    }
-  }
-
-  do {
-    /* B3 (and B4).  Halve t, if even */
-    while (t.used != 0 && mp_iseven(&t) == 1) {
-      if ((res = mp_div_2 (&t, &t)) != MP_OKAY) {
-        goto __T;
-      }
-    }
-
-    /* B5.  if t>0 then u=t otherwise v=-t */
-    if (t.used != 0 && t.sign != MP_NEG) {
-      if ((res = mp_copy (&t, &u)) != MP_OKAY) {
-        goto __T;
-      }
-    } else {
-      if ((res = mp_copy (&t, &v)) != MP_OKAY) {
-        goto __T;
-      }
-      v.sign = (v.sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
-    }
-
-    /* B6.  t = u - v, if t != 0 loop otherwise terminate */
-    if ((res = mp_sub (&u, &v, &t)) != MP_OKAY) {
-      goto __T;
-    }
-  } while (mp_iszero(&t) == 0);
-
-  /* multiply by 2^k which we divided out at the beginning */ 
-  if ((res = mp_mul_2d (&u, k, &u)) != MP_OKAY) {
-    goto __T;
-  }
-
-  mp_exch (&u, c);
-  c->sign = neg;
+  c->sign = MP_ZPOS;
   res = MP_OKAY;
-__T:mp_clear (&t);
 __V:mp_clear (&u);
 __U:mp_clear (&v);
   return res;
@@ -2615,6 +2773,7 @@ mp_init_size (mp_int * a, int size)
  */
 #include <tommath.h>
 
+/* hac 14.61, pp608 */
 int
 mp_invmod (mp_int * a, mp_int * b, mp_int * c)
 {
@@ -2622,17 +2781,18 @@ mp_invmod (mp_int * a, mp_int * b, mp_int * c)
   int     res;
 
   /* b cannot be negative */
-  if (b->sign == MP_NEG) {
+  if (b->sign == MP_NEG || mp_iszero(b) == 1) {
     return MP_VAL;
   }
 
   /* if the modulus is odd we can use a faster routine instead */
-  if (mp_iseven (b) == 0) {
+  if (mp_isodd (b) == 1) {
     return fast_mp_invmod (a, b, c);
   }
   
   /* init temps */
-  if ((res = mp_init_multi(&x, &y, &u, &v, &A, &B, &C, &D, NULL)) != MP_OKAY) {
+  if ((res = mp_init_multi(&x, &y, &u, &v, 
+                           &A, &B, &C, &D, NULL)) != MP_OKAY) {
      return res;
   }
 
@@ -2644,10 +2804,6 @@ mp_invmod (mp_int * a, mp_int * b, mp_int * c)
     goto __ERR;
   }
 
-  if ((res = mp_abs (&x, &x)) != MP_OKAY) {
-    goto __ERR;
-  }
-
   /* 2. [modified] if x,y are both even then return an error! */
   if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
     res = MP_VAL;
@@ -2664,7 +2820,6 @@ mp_invmod (mp_int * a, mp_int * b, mp_int * c)
   mp_set (&A, 1);
   mp_set (&D, 1);
 
-
 top:
   /* 4.  while u is even do */
   while (mp_iseven (&u) == 1) {
@@ -2673,13 +2828,13 @@ top:
       goto __ERR;
     }
     /* 4.2 if A or B is odd then */
-    if (mp_iseven (&A) == 0 || mp_iseven (&B) == 0) {
+    if (mp_isodd (&A) == 1 || mp_isodd (&B) == 1) {
       /* A = (A+y)/2, B = (B-x)/2 */
       if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
-	goto __ERR;
+         goto __ERR;
       }
       if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
-	goto __ERR;
+         goto __ERR;
       }
     }
     /* A = A/2, B = B/2 */
@@ -2691,21 +2846,20 @@ top:
     }
   }
 
-
   /* 5.  while v is even do */
   while (mp_iseven (&v) == 1) {
     /* 5.1 v = v/2 */
     if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
       goto __ERR;
     }
-    /* 5.2 if C,D are even then */
-    if (mp_iseven (&C) == 0 || mp_iseven (&D) == 0) {
+    /* 5.2 if C or D is odd then */
+    if (mp_isodd (&C) == 1 || mp_isodd (&D) == 1) {
       /* C = (C+y)/2, D = (D-x)/2 */
       if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
-	goto __ERR;
+         goto __ERR;
       }
       if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
-	goto __ERR;
+         goto __ERR;
       }
     }
     /* C = C/2, D = D/2 */
@@ -2758,10 +2912,23 @@ top:
     goto __ERR;
   }
 
-  /* a is now the inverse */
+  /* if its too low */
+  while (mp_cmp_d(&C, 0) == MP_LT) {
+      if ((res = mp_add(&C, b, &C)) != MP_OKAY) {
+         goto __ERR;
+      }
+  }
+  
+  /* too big */
+  while (mp_cmp_mag(&C, b) != MP_LT) {
+      if ((res = mp_sub(&C, b, &C)) != MP_OKAY) {
+         goto __ERR;
+      }
+  }
+  
+  /* C is now the inverse */
   mp_exch (&C, c);
   res = MP_OKAY;
-
 __ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
   return res;
 }
@@ -2789,10 +2956,10 @@ __ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
  * HAC pp. 73 Algorithm 2.149
  */
 int
-mp_jacobi (mp_int * a, mp_int * n, int *c)
+mp_jacobi (mp_int * a, mp_int * p, int *c)
 {
-  mp_int  a1, n1, e;
-  int     s, r, res;
+  mp_int  a1, p1;
+  int     k, s, r, res;
   mp_digit residue;
 
   /* step 1.  if a == 0, return 0 */
@@ -2808,39 +2975,30 @@ mp_jacobi (mp_int * a, mp_int * n, int *c)
   }
 
   /* default */
-  s = 0;
+  k = s = 0;
 
-  /* step 3.  write a = a1 * 2^e  */
+  /* step 3.  write a = a1 * 2**k  */
   if ((res = mp_init_copy (&a1, a)) != MP_OKAY) {
     return res;
   }
 
-  if ((res = mp_init (&n1)) != MP_OKAY) {
+  if ((res = mp_init (&p1)) != MP_OKAY) {
     goto __A1;
   }
 
-  if ((res = mp_init (&e)) != MP_OKAY) {
-    goto __N1;
-  }
-
   while (mp_iseven (&a1) == 1) {
-    if ((res = mp_add_d (&e, 1, &e)) != MP_OKAY) {
-      goto __E;
-    }
-
+    k = k + 1;
     if ((res = mp_div_2 (&a1, &a1)) != MP_OKAY) {
-      goto __E;
+      goto __P1;
     }
   }
 
   /* step 4.  if e is even set s=1 */
-  if (mp_iseven (&e) == 1) {
+  if ((k & 1) == 0) {
     s = 1;
   } else {
-    /* else set s=1 if n = 1/7 (mod 8) or s=-1 if n = 3/5 (mod 8) */
-    if ((res = mp_mod_d (n, 8, &residue)) != MP_OKAY) {
-      goto __E;
-    }
+    /* else set s=1 if p = 1/7 (mod 8) or s=-1 if p = 3/5 (mod 8) */
+    residue = p->dp[0] & 7;
 
     if (residue == 1 || residue == 7) {
       s = 1;
@@ -2849,17 +3007,9 @@ mp_jacobi (mp_int * a, mp_int * n, int *c)
     }
   }
 
-  /* step 5.  if n == 3 (mod 4) *and* a1 == 3 (mod 4) then s = -s */
-  if ((res = mp_mod_d (n, 4, &residue)) != MP_OKAY) {
-    goto __E;
-  }
-  if (residue == 3) {
-    if ((res = mp_mod_d (&a1, 4, &residue)) != MP_OKAY) {
-      goto __E;
-    }
-    if (residue == 3) {
-      s = -s;
-    }
+  /* step 5.  if p == 3 (mod 4) *and* a1 == 3 (mod 4) then s = -s */
+  if ( ((p->dp[0] & 3) == 3) && ((a1.dp[0] & 3) == 3)) {
+    s = -s;
   }
 
   /* if a1 == 1 we're done */
@@ -2867,19 +3017,18 @@ mp_jacobi (mp_int * a, mp_int * n, int *c)
     *c = s;
   } else {
     /* n1 = n mod a1 */
-    if ((res = mp_mod (n, &a1, &n1)) != MP_OKAY) {
-      goto __E;
+    if ((res = mp_mod (p, &a1, &p1)) != MP_OKAY) {
+      goto __P1;
     }
-    if ((res = mp_jacobi (&n1, &a1, &r)) != MP_OKAY) {
-      goto __E;
+    if ((res = mp_jacobi (&p1, &a1, &r)) != MP_OKAY) {
+      goto __P1;
     }
     *c = s * r;
   }
 
   /* done */
   res = MP_OKAY;
-__E:mp_clear (&e);
-__N1:mp_clear (&n1);
+__P1:mp_clear (&p1);
 __A1:mp_clear (&a1);
   return res;
 }
@@ -3770,12 +3919,6 @@ mp_mul_2 (mp_int * a, mp_int * b)
  */
 #include <tommath.h>
 
-/* NOTE:  This routine requires updating.  For instance the c->used = c->alloc bit
-   is wrong.  We should just shift c->used digits then set the carry as c->dp[c->used] = carry
- 
-   To be fixed for LTM 0.18
- */
-
 /* shift left by a certain bit count */
 int
 mp_mul_2d (mp_int * a, int b, mp_int * c)
@@ -3790,8 +3933,8 @@ mp_mul_2d (mp_int * a, int b, mp_int * c)
      }
   }
 
-  if (c->alloc < (int)(c->used + b/DIGIT_BIT + 2)) {
-     if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 2)) != MP_OKAY) {
+  if (c->alloc < (int)(c->used + b/DIGIT_BIT + 1)) {
+     if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) {
        return res;
      }
   }
@@ -3802,17 +3945,19 @@ mp_mul_2d (mp_int * a, int b, mp_int * c)
       return res;
     }
   }
-  c->used = c->alloc;
 
   /* shift any bit count < DIGIT_BIT */
   d = (mp_digit) (b % DIGIT_BIT);
   if (d != 0) {
-    register mp_digit *tmpc, mask, r, rr;
+    register mp_digit *tmpc, shift, mask, r, rr;
     register int x;
 
     /* bitmask for carries */
     mask = (((mp_digit)1) << d) - 1;
 
+    /* shift for msbs */
+    shift = DIGIT_BIT - d;
+
     /* alias */
     tmpc = c->dp;
 
@@ -3820,7 +3965,7 @@ mp_mul_2d (mp_int * a, int b, mp_int * c)
     r    = 0;
     for (x = 0; x < c->used; x++) {
       /* get the higher bits of the current word */
-      rr = (*tmpc >> (DIGIT_BIT - d)) & mask;
+      rr = (*tmpc >> shift) & mask;
 
       /* shift the current word and OR in the carry */
       *tmpc = ((*tmpc << d) | r) & MP_MASK;
@@ -3829,6 +3974,11 @@ mp_mul_2d (mp_int * a, int b, mp_int * c)
       /* set the carry to the carry bits of the current word */
       r = rr;
     }
+    
+    /* set final carry */
+    if (r != 0) {
+       c->dp[c->used++] = r;
+    }
   }
   mp_clamp (c);
   return MP_OKAY;
@@ -4250,9 +4400,9 @@ mp_or (mp_int * a, mp_int * b, mp_int * c)
 
 /* performs one Fermat test.
  * 
- * If "a" were prime then b^a == b (mod a) since the order of
+ * If "a" were prime then b**a == b (mod a) since the order of
  * the multiplicative sub-group would be phi(a) = a-1.  That means
- * it would be the same as b^(a mod (a-1)) == b^1 == b (mod a).
+ * it would be the same as b**(a mod (a-1)) == b**1 == b (mod a).
  *
  * Sets result to 1 if the congruence holds, or zero otherwise.
  */
@@ -4270,7 +4420,7 @@ mp_prime_fermat (mp_int * a, mp_int * b, int *result)
     return err;
   }
 
-  /* compute t = b^a mod a */
+  /* compute t = b**a mod a */
   if ((err = mp_exptmod (b, a, a, &t)) != MP_OKAY) {
     goto __T;
   }
@@ -4304,7 +4454,8 @@ __T:mp_clear (&t);
  */
 #include <tommath.h>
 
-/* determines if an integers is divisible by one of the first 256 primes or not
+/* determines if an integers is divisible by one 
+ * of the first PRIME_SIZE primes or not
  *
  * sets result to 0 if not, 1 if yes
  */
@@ -4318,12 +4469,6 @@ mp_prime_is_divisible (mp_int * a, int *result)
   *result = 0;
 
   for (ix = 0; ix < PRIME_SIZE; ix++) {
-    /* is it equal to the prime? */
-    if (mp_cmp_d (a, __prime_tab[ix]) == MP_EQ) {
-      *result = 1;
-      return MP_OKAY;
-    }
-
     /* what is a mod __prime_tab[ix] */
     if ((err = mp_mod_d (a, __prime_tab[ix], &res)) != MP_OKAY) {
       return err;
@@ -4462,19 +4607,17 @@ mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result)
     goto __N1;
   }
 
-  /* set 2^s * r = n1 */
+  /* set 2**s * r = n1 */
   if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) {
     goto __N1;
   }
-  s = 0;
-  while (mp_iseven (&r) == 1) {
-    ++s;
-    if ((err = mp_div_2 (&r, &r)) != MP_OKAY) {
-      goto __R;
-    }
+ 
+  s = mp_cnt_lsb(&r);
+  if ((err = mp_div_2d (&r, s, &r, NULL)) != MP_OKAY) {
+    goto __R;
   }
 
-  /* compute y = b^r mod a */
+  /* compute y = b**r mod a */
   if ((err = mp_init (&y)) != MP_OKAY) {
     goto __R;
   }
@@ -4488,12 +4631,12 @@ mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result)
     /* while j <= s-1 and y != n1 */
     while ((j <= (s - 1)) && mp_cmp (&y, &n1) != MP_EQ) {
       if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) {
-	goto __Y;
+         goto __Y;
       }
 
       /* if y == 1 then composite */
       if (mp_cmp_d (&y, 1) == MP_EQ) {
-	goto __Y;
+         goto __Y;
       }
 
       ++j;
@@ -4573,6 +4716,86 @@ int mp_prime_next_prime(mp_int *a, int t)
 
 /* End: bn_mp_prime_next_prime.c */
 
+/* Start: bn_mp_radix_size.c */
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* returns size of ASCII reprensentation */
+int
+mp_radix_size (mp_int * a, int radix)
+{
+  int     res, digs;
+  mp_int  t;
+  mp_digit d;
+
+  /* special case for binary */
+  if (radix == 2) {
+    return mp_count_bits (a) + (a->sign == MP_NEG ? 1 : 0) + 1;
+  }
+
+  if (radix < 2 || radix > 64) {
+    return 0;
+  }
+
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return 0;
+  }
+
+  digs = 0;
+  if (t.sign == MP_NEG) {
+    ++digs;
+    t.sign = MP_ZPOS;
+  }
+
+  while (mp_iszero (&t) == 0) {
+    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
+      mp_clear (&t);
+      return 0;
+    }
+    ++digs;
+  }
+  mp_clear (&t);
+  return digs + 1;
+}
+
+
+/* End: bn_mp_radix_size.c */
+
+/* Start: bn_mp_radix_smap.c */
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* chars used in radix conversions */
+const char *mp_s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
+
+/* End: bn_mp_radix_smap.c */
+
 /* Start: bn_mp_rand.c */
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
@@ -4626,6 +4849,87 @@ mp_rand (mp_int * a, int digits)
 
 /* End: bn_mp_rand.c */
 
+/* Start: bn_mp_read_radix.c */
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* read a string [ASCII] in a given radix */
+int
+mp_read_radix (mp_int * a, char *str, int radix)
+{
+  int     y, res, neg;
+  char    ch;
+
+  /* make sure the radix is ok */
+  if (radix < 2 || radix > 64) {
+    return MP_VAL;
+  }
+
+  /* if the leading digit is a 
+   * minus set the sign to negative. 
+   */
+  if (*str == '-') {
+    ++str;
+    neg = MP_NEG;
+  } else {
+    neg = MP_ZPOS;
+  }
+
+  /* set the integer to the default of zero */
+  mp_zero (a);
+  
+  /* process each digit of the string */
+  while (*str) {
+    /* if the radix < 36 the conversion is case insensitive
+     * this allows numbers like 1AB and 1ab to represent the same  value
+     * [e.g. in hex]
+     */
+    ch = (char) ((radix < 36) ? toupper (*str) : *str);
+    for (y = 0; y < 64; y++) {
+      if (ch == mp_s_rmap[y]) {
+         break;
+      }
+    }
+
+    /* if the char was found in the map 
+     * and is less than the given radix add it
+     * to the number, otherwise exit the loop. 
+     */
+    if (y < radix) {
+      if ((res = mp_mul_d (a, (mp_digit) radix, a)) != MP_OKAY) {
+         return res;
+      }
+      if ((res = mp_add_d (a, (mp_digit) y, a)) != MP_OKAY) {
+         return res;
+      }
+    } else {
+      break;
+    }
+    ++str;
+  }
+  
+  /* set the sign only if a != 0 */
+  if (mp_iszero(a) != 1) {
+     a->sign = neg;
+  }
+  return MP_OKAY;
+}
+
+/* End: bn_mp_read_radix.c */
+
 /* Start: bn_mp_read_signed_bin.c */
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
@@ -5970,6 +6274,80 @@ ERR:
 
 /* End: bn_mp_toom_sqr.c */
 
+/* Start: bn_mp_toradix.c */
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* stores a bignum as a ASCII string in a given radix (2..64) */
+int
+mp_toradix (mp_int * a, char *str, int radix)
+{
+  int     res, digs;
+  mp_int  t;
+  mp_digit d;
+  char   *_s = str;
+
+  if (radix < 2 || radix > 64) {
+    return MP_VAL;
+  }
+  
+  /* quick out if its zero */
+  if (mp_iszero(a) == 1) {
+     *str++ = '0';
+     *str = '\0';
+     return MP_OKAY;
+  }
+  
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return res;
+  }
+
+  /* if it is negative output a - */
+  if (t.sign == MP_NEG) {
+    ++_s;
+    *str++ = '-';
+    t.sign = MP_ZPOS;
+  }
+
+  digs = 0;
+  while (mp_iszero (&t) == 0) {
+    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
+      mp_clear (&t);
+      return res;
+    }
+    *str++ = mp_s_rmap[d];
+    ++digs;
+  }
+
+  /* reverse the digits of the string.  In this case _s points
+   * to the first digit [exluding the sign] of the number]
+   */
+  bn_reverse ((unsigned char *)_s, digs);
+  
+  /* append a NULL so the string is properly terminated */
+  *str++ = '\0';
+  
+  
+  mp_clear (&t);
+  return MP_OKAY;
+}
+
+
+/* End: bn_mp_toradix.c */
+
 /* Start: bn_mp_unsigned_bin_size.c */
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
@@ -6133,234 +6511,6 @@ const mp_digit __prime_tab[] = {
 
 /* End: bn_prime_tab.c */
 
-/* Start: bn_radix.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* chars used in radix conversions */
-static const char *s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
-
-/* read a string [ASCII] in a given radix */
-int
-mp_read_radix (mp_int * a, char *str, int radix)
-{
-  int     y, res, neg;
-  char    ch;
-
-  if (radix < 2 || radix > 64) {
-    return MP_VAL;
-  }
-
-  if (*str == '-') {
-    ++str;
-    neg = MP_NEG;
-  } else {
-    neg = MP_ZPOS;
-  }
-
-  mp_zero (a);
-  while (*str) {
-    ch = (char) ((radix < 36) ? toupper (*str) : *str);
-    for (y = 0; y < 64; y++) {
-      if (ch == s_rmap[y]) {
-    break;
-      }
-    }
-
-    if (y < radix) {
-      if ((res = mp_mul_d (a, (mp_digit) radix, a)) != MP_OKAY) {
-    return res;
-      }
-      if ((res = mp_add_d (a, (mp_digit) y, a)) != MP_OKAY) {
-    return res;
-      }
-    } else {
-      break;
-    }
-    ++str;
-  }
-  if (mp_iszero(a) != 1) {
-     a->sign = neg;
-  }
-  return MP_OKAY;
-}
-
-/* stores a bignum as a ASCII string in a given radix (2..64) */
-int
-mp_toradix (mp_int * a, char *str, int radix)
-{
-  int     res, digs;
-  mp_int  t;
-  mp_digit d;
-  char   *_s = str;
-
-  if (radix < 2 || radix > 64) {
-    return MP_VAL;
-  }
-  
-  /* quick out if its zero */
-  if (mp_iszero(a) == 1) {
-     *str++ = '0';
-     *str = '\0';
-     return MP_OKAY;
-  }
-  
-
-  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-    return res;
-  }
-
-  if (t.sign == MP_NEG) {
-    ++_s;
-    *str++ = '-';
-    t.sign = MP_ZPOS;
-  }
-
-  digs = 0;
-  while (mp_iszero (&t) == 0) {
-    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
-      mp_clear (&t);
-      return res;
-    }
-    *str++ = s_rmap[d];
-    ++digs;
-  }
-  bn_reverse ((unsigned char *)_s, digs);
-  *str++ = '\0';
-  mp_clear (&t);
-  return MP_OKAY;
-}
-
-/* returns size of ASCII reprensentation */
-int
-mp_radix_size (mp_int * a, int radix)
-{
-  int     res, digs;
-  mp_int  t;
-  mp_digit d;
-
-  /* special case for binary */
-  if (radix == 2) {
-    return mp_count_bits (a) + (a->sign == MP_NEG ? 1 : 0) + 1;
-  }
-
-  if (radix < 2 || radix > 64) {
-    return 0;
-  }
-
-  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-    return 0;
-  }
-
-  digs = 0;
-  if (t.sign == MP_NEG) {
-    ++digs;
-    t.sign = MP_ZPOS;
-  }
-
-  while (mp_iszero (&t) == 0) {
-    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
-      mp_clear (&t);
-      return 0;
-    }
-    ++digs;
-  }
-  mp_clear (&t);
-  return digs + 1;
-}
-
-/* read a bigint from a file stream in ASCII */
-int mp_fread(mp_int *a, int radix, FILE *stream)
-{
-   int err, ch, neg, y;
-   
-   /* clear a */
-   mp_zero(a);
-   
-   /* if first digit is - then set negative */
-   ch = fgetc(stream);
-   if (ch == '-') {
-      neg = MP_NEG;
-      ch = fgetc(stream);
-   } else {
-      neg = MP_ZPOS;
-   }
-   
-   for (;;) {
-      /* find y in the radix map */
-      for (y = 0; y < radix; y++) {
-          if (s_rmap[y] == ch) {
-             break;
-          }
-      }
-      if (y == radix) {
-         break;
-      }
-      
-      /* shift up and add */
-      if ((err = mp_mul_d(a, radix, a)) != MP_OKAY) {
-         return err;
-      }
-      if ((err = mp_add_d(a, y, a)) != MP_OKAY) {
-         return err;
-      }
-      
-      ch = fgetc(stream);
-   }
-   if (mp_cmp_d(a, 0) != MP_EQ) {
-      a->sign = neg;
-   }
-   
-   return MP_OKAY;
-}
-
-int mp_fwrite(mp_int *a, int radix, FILE *stream)
-{
-   char *buf;
-   int err, len, x;
-   
-   len = mp_radix_size(a, radix);
-   if (len == 0) {
-      return MP_VAL;
-   }
-   
-   buf = malloc(len);
-   if (buf == NULL) {
-      return MP_MEM;
-   }
-   
-   if ((err = mp_toradix(a, buf, radix)) != MP_OKAY) {
-      free(buf);
-      return err;
-   }
-   
-   for (x = 0; x < len; x++) {
-       if (fputc(buf[x], stream) == EOF) {
-          free(buf);
-          return MP_VAL;
-       }
-   }
-   
-   free(buf);
-   return MP_OKAY;
-}
-
-
-/* End: bn_radix.c */
-
 /* Start: bn_reverse.c */
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
@@ -6506,222 +6656,236 @@ s_mp_add (mp_int * a, mp_int * b, mp_int * c)
 /* End: bn_s_mp_add.c */
 
 /* Start: bn_s_mp_exptmod.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-int
-s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
-{
-  mp_int  M[256], res, mu;
-  mp_digit buf;
-  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
-
-  /* find window size */
-  x = mp_count_bits (X);
-  if (x <= 7) {
-    winsize = 2;
-  } else if (x <= 36) {
-    winsize = 3;
-  } else if (x <= 140) {
-    winsize = 4;
-  } else if (x <= 450) {
-    winsize = 5;
-  } else if (x <= 1303) {
-    winsize = 6;
-  } else if (x <= 3529) {
-    winsize = 7;
-  } else {
-    winsize = 8;
-  }
-
-#ifdef MP_LOW_MEM
-    if (winsize > 5) {
-       winsize = 5;
-    }
-#endif
-
-  /* init M array */
-  for (x = 0; x < (1 << winsize); x++) {
-    if ((err = mp_init_size (&M[x], 1)) != MP_OKAY) {
-      for (y = 0; y < x; y++) {
-        mp_clear (&M[y]);
-      }
-      return err;
-    }
-  }
-
-  /* create mu, used for Barrett reduction */
-  if ((err = mp_init (&mu)) != MP_OKAY) {
-    goto __M;
-  }
-  if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
-    goto __MU;
-  }
-
-  /* create M table
-   *
-   * The M table contains powers of the base, 
-   * e.g. M[x] = G**x mod P
-   *
-   * The first half of the table is not 
-   * computed though accept for M[0] and M[1]
-   */
-  if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
-    goto __MU;
-  }
-
-  /* compute the value at M[1<<(winsize-1)] by squaring 
-   * M[1] (winsize-1) times 
-   */
-  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
-    goto __MU;
-  }
-
-  for (x = 0; x < (winsize - 1); x++) {
-    if ((err = mp_sqr (&M[1 << (winsize - 1)], 
-                       &M[1 << (winsize - 1)])) != MP_OKAY) {
-      goto __MU;
-    }
-    if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
-      goto __MU;
-    }
-  }
-
-  /* create upper table */
-  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
-    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
-      goto __MU;
-    }
-    if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) {
-      goto __MU;
-    }
-  }
-
-  /* setup result */
-  if ((err = mp_init (&res)) != MP_OKAY) {
-    goto __MU;
-  }
-  mp_set (&res, 1);
-
-  /* set initial mode and bit cnt */
-  mode   = 0;
-  bitcnt = 1;
-  buf    = 0;
-  digidx = X->used - 1;
-  bitcpy = 0;
-  bitbuf = 0;
-
-  for (;;) {
-    /* grab next digit as required */
-    if (--bitcnt == 0) {
-      if (digidx == -1) {
-        break;
-      }
-      buf = X->dp[digidx--];
-      bitcnt = (int) DIGIT_BIT;
-    }
-
-    /* grab the next msb from the exponent */
-    y = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
-    buf <<= (mp_digit)1;
-
-    /* if the bit is zero and mode == 0 then we ignore it
-     * These represent the leading zero bits before the first 1 bit
-     * in the exponent.  Technically this opt is not required but it
-     * does lower the # of trivial squaring/reductions used
-     */
-    if (mode == 0 && y == 0)
-      continue;
-
-    /* if the bit is zero and mode == 1 then we square */
-    if (mode == 1 && y == 0) {
-      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto __RES;
-      }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-        goto __RES;
-      }
-      continue;
-    }
-
-    /* else we add it to the window */
-    bitbuf |= (y << (winsize - ++bitcpy));
-    mode = 2;
-
-    if (bitcpy == winsize) {
-      /* ok window is filled so square as required and multiply  */
-      /* square first */
-      for (x = 0; x < winsize; x++) {
-        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-          goto __RES;
-        }
-        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-          goto __RES;
-        }
-      }
-
-      /* then multiply */
-      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
-        goto __MU;
-      }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-        goto __MU;
-      }
-
-      /* empty window and reset */
-      bitcpy = 0;
-      bitbuf = 0;
-      mode = 1;
-    }
-  }
-
-  /* if bits remain then square/multiply */
-  if (mode == 2 && bitcpy > 0) {
-    /* square then multiply if the bit is set */
-    for (x = 0; x < bitcpy; x++) {
-      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto __RES;
-      }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-        goto __RES;
-      }
-
-      bitbuf <<= 1;
-      if ((bitbuf & (1 << winsize)) != 0) {
-        /* then multiply */
-        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
-          goto __RES;
-        }
-        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-          goto __RES;
-        }
-      }
-    }
-  }
-
-  mp_exch (&res, Y);
-  err = MP_OKAY;
-__RES:mp_clear (&res);
-__MU:mp_clear (&mu);
-__M:
-  for (x = 0; x < (1 << winsize); x++) {
-    mp_clear (&M[x]);
-  }
-  return err;
-}
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+#ifdef MP_LOW_MEM
+   #define TAB_SIZE 32
+#else
+   #define TAB_SIZE 256
+#endif
+
+int
+s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
+{
+  mp_int  M[TAB_SIZE], res, mu;
+  mp_digit buf;
+  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+
+  /* find window size */
+  x = mp_count_bits (X);
+  if (x <= 7) {
+    winsize = 2;
+  } else if (x <= 36) {
+    winsize = 3;
+  } else if (x <= 140) {
+    winsize = 4;
+  } else if (x <= 450) {
+    winsize = 5;
+  } else if (x <= 1303) {
+    winsize = 6;
+  } else if (x <= 3529) {
+    winsize = 7;
+  } else {
+    winsize = 8;
+  }
+
+#ifdef MP_LOW_MEM
+    if (winsize > 5) {
+       winsize = 5;
+    }
+#endif
+
+  /* init M array */
+  /* init first cell */
+  if ((err = mp_init(&M[1])) != MP_OKAY) {
+     return err; 
+  }
+
+  /* now init the second half of the array */
+  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
+    if ((err = mp_init(&M[x])) != MP_OKAY) {
+      for (y = 1<<(winsize-1); y < x; y++) {
+        mp_clear (&M[y]);
+      }
+      mp_clear(&M[1]);
+      return err;
+    }
+  }
+
+  /* create mu, used for Barrett reduction */
+  if ((err = mp_init (&mu)) != MP_OKAY) {
+    goto __M;
+  }
+  if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
+    goto __MU;
+  }
+
+  /* create M table
+   *
+   * The M table contains powers of the base, 
+   * e.g. M[x] = G**x mod P
+   *
+   * The first half of the table is not 
+   * computed though accept for M[0] and M[1]
+   */
+  if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
+    goto __MU;
+  }
+
+  /* compute the value at M[1<<(winsize-1)] by squaring 
+   * M[1] (winsize-1) times 
+   */
+  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
+    goto __MU;
+  }
+
+  for (x = 0; x < (winsize - 1); x++) {
+    if ((err = mp_sqr (&M[1 << (winsize - 1)], 
+                       &M[1 << (winsize - 1)])) != MP_OKAY) {
+      goto __MU;
+    }
+    if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
+      goto __MU;
+    }
+  }
+
+  /* create upper table */
+  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
+    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
+      goto __MU;
+    }
+    if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) {
+      goto __MU;
+    }
+  }
+
+  /* setup result */
+  if ((err = mp_init (&res)) != MP_OKAY) {
+    goto __MU;
+  }
+  mp_set (&res, 1);
+
+  /* set initial mode and bit cnt */
+  mode   = 0;
+  bitcnt = 1;
+  buf    = 0;
+  digidx = X->used - 1;
+  bitcpy = 0;
+  bitbuf = 0;
+
+  for (;;) {
+    /* grab next digit as required */
+    if (--bitcnt == 0) {
+      if (digidx == -1) {
+        break;
+      }
+      buf = X->dp[digidx--];
+      bitcnt = (int) DIGIT_BIT;
+    }
+
+    /* grab the next msb from the exponent */
+    y = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
+    buf <<= (mp_digit)1;
+
+    /* if the bit is zero and mode == 0 then we ignore it
+     * These represent the leading zero bits before the first 1 bit
+     * in the exponent.  Technically this opt is not required but it
+     * does lower the # of trivial squaring/reductions used
+     */
+    if (mode == 0 && y == 0)
+      continue;
+
+    /* if the bit is zero and mode == 1 then we square */
+    if (mode == 1 && y == 0) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+        goto __RES;
+      }
+      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+        goto __RES;
+      }
+      continue;
+    }
+
+    /* else we add it to the window */
+    bitbuf |= (y << (winsize - ++bitcpy));
+    mode = 2;
+
+    if (bitcpy == winsize) {
+      /* ok window is filled so square as required and multiply  */
+      /* square first */
+      for (x = 0; x < winsize; x++) {
+        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+          goto __RES;
+        }
+        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+          goto __RES;
+        }
+      }
+
+      /* then multiply */
+      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
+        goto __MU;
+      }
+      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+        goto __MU;
+      }
+
+      /* empty window and reset */
+      bitcpy = 0;
+      bitbuf = 0;
+      mode = 1;
+    }
+  }
+
+  /* if bits remain then square/multiply */
+  if (mode == 2 && bitcpy > 0) {
+    /* square then multiply if the bit is set */
+    for (x = 0; x < bitcpy; x++) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+        goto __RES;
+      }
+      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+        goto __RES;
+      }
+
+      bitbuf <<= 1;
+      if ((bitbuf & (1 << winsize)) != 0) {
+        /* then multiply */
+        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
+          goto __RES;
+        }
+        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+          goto __RES;
+        }
+      }
+    }
+  }
+
+  mp_exch (&res, Y);
+  err = MP_OKAY;
+__RES:mp_clear (&res);
+__MU:mp_clear (&mu);
+__M:
+  mp_clear(&M[1]);
+  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
+    mp_clear (&M[x]);
+  }
+  return err;
+}
 
 /* End: bn_s_mp_exptmod.c */
 
diff --git a/test.c b/test.c
new file mode 100644
index 0000000..37f7e9c
--- /dev/null
+++ b/test.c
@@ -0,0 +1,46 @@
+#include <tommath.h>
+int main(int argc, char ** argv) {
+
+	const unsigned int a = 65537;
+	char b[] = 
+"272621192922230283305477639564135351471136149273956463844361347729298759183125368038593484043149128512765280523210111782526587388894777249539002925324108547349408624093466297893486263619517809026841716115227596170065100354451708345238523975900663359145770823068375223714001310312030819080370340176730626251422070";
+	char radix[1000];
+	mp_int vala, valb, valc;
+
+	if (mp_init(&vala) != MP_OKAY) {
+		fprintf(stderr, "failed to init vala\n");
+		exit(1);
+	}
+
+	if (mp_init(&valb) != MP_OKAY) {
+		fprintf(stderr, "failed to init valb\n");
+		exit(1);
+	}
+
+	if (mp_init(&valc) != MP_OKAY) {
+		fprintf(stderr, "failed to init valc\n");
+		exit(1);
+	}
+	if (mp_set_int(&vala, 65537) != MP_OKAY) {
+		fprintf(stderr, "failed to set vala to 65537\n");
+		exit(1);
+	}
+
+	if (mp_read_radix(&valb, b, 10) != MP_OKAY) {
+		fprintf(stderr, "failed to set valb to %s\n", b);
+		exit(1);
+	}
+
+	if (mp_invmod(&vala, &valb, &valc) != MP_OKAY) {
+		fprintf(stderr, "mp_invmod failed\n");
+		exit(1);
+	}
+
+	if (mp_toradix(&valc, radix, 10) != MP_OKAY) {
+		fprintf(stderr, "failed to convert value to radix 10\n");
+		exit(1);
+	}
+
+	fprintf(stderr, "a = %d\nb = %s\nc = %s\n", a, b, radix);
+	return 0;
+}
\ No newline at end of file
diff --git a/tommath.h b/tommath.h
index 8e43f6c..fe50906 100644
--- a/tommath.h
+++ b/tommath.h
@@ -120,7 +120,7 @@ extern int KARATSUBA_MUL_CUTOFF,
            TOOM_SQR_CUTOFF;
 
 /* various build options */
-#define MP_PREC                 64      /* default digits of precision (must be power of two) */
+#define MP_PREC                 64     /* default digits of precision (must be power of two) */
 
 /* define this to use lower memory usage routines (exptmods mostly) */
 /* #define MP_LOW_MEM */
@@ -166,7 +166,7 @@ int mp_init_size(mp_int *a, int size);
 /* ---> Basic Manipulations <--- */
 
 #define mp_iszero(a) (((a)->used == 0) ? 1 : 0)
-#define mp_iseven(a) (((a)->used == 0 || (((a)->dp[0] & 1) == 0)) ? 1 : 0)
+#define mp_iseven(a) (((a)->used > 0 && (((a)->dp[0] & 1) == 0)) ? 1 : 0)
 #define mp_isodd(a)  (((a)->used > 0 && (((a)->dp[0] & 1) == 1)) ? 1 : 0)
 
 /* set to zero */
@@ -213,6 +213,9 @@ int mp_mod_2d(mp_int *a, int b, mp_int *c);
 /* computes a = 2**b */
 int mp_2expt(mp_int *a, int b);
 
+/* Counts the number of lsbs which are zero before the first zero bit */
+int mp_cnt_lsb(mp_int *a);
+
 /* makes a pseudo-random int of a given size */
 int mp_rand(mp_int *a, int digits);
 
@@ -451,6 +454,8 @@ int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y, int mode);
 int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y);
 void bn_reverse(unsigned char *s, int len);
 
+extern const char *mp_s_rmap;
+
 #ifdef __cplusplus
    }
 #endif