diff --git a/bn.pdf b/bn.pdf
index a002099..8eac8d8 100644
Binary files a/bn.pdf and b/bn.pdf differ
diff --git a/bn.tex b/bn.tex
index b5b17ec..d243a70 100644
--- a/bn.tex
+++ b/bn.tex
@@ -1,7 +1,7 @@
 \documentclass[]{article}
 \begin{document}
 
-\title{LibTomMath v0.22 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org }
+\title{LibTomMath v0.23 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org }
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 \newpage
diff --git a/bn_fast_mp_montgomery_reduce.c b/bn_fast_mp_montgomery_reduce.c
index 5c003e3..7017455 100644
--- a/bn_fast_mp_montgomery_reduce.c
+++ b/bn_fast_mp_montgomery_reduce.c
@@ -64,7 +64,7 @@ fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
      * that W[ix-1] have  the carry cleared (see after the inner loop)
      */
     register mp_digit mu;
-    mu = (((mp_digit) (W[ix] & MP_MASK)) * rho) & MP_MASK;
+    mu = ((W[ix] & MP_MASK) * rho) & MP_MASK;
 
     /* a = a + mu * m * b**i
      *
@@ -93,7 +93,7 @@ fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
 
       /* inner loop */
       for (iy = 0; iy < n->used; iy++) {
-          *_W++ += ((mp_word) mu) * ((mp_word) * tmpn++);
+          *_W++ += ((mp_word)mu) * ((mp_word)*tmpn++);
       }
     }
 
@@ -101,7 +101,6 @@ fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
     W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
   }
 
-
   {
     register mp_digit *tmpx;
     register mp_word *_W, *_W1;
diff --git a/bn_fast_s_mp_mul_digs.c b/bn_fast_s_mp_mul_digs.c
index d09489d..bca2a71 100644
--- a/bn_fast_s_mp_mul_digs.c
+++ b/bn_fast_s_mp_mul_digs.c
@@ -81,7 +81,7 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
       pb = MIN (b->used, digs - ix);
 
       for (iy = 0; iy < pb; iy++) {
-        *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
+        *_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++);
       }
     }
 
@@ -104,20 +104,27 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
      * from N*(N+N*c)==N**2 + c*N**2 to N**2 + N*c where c is the 
      * cost of the shifting.  On very small numbers this is slower 
      * but on most cryptographic size numbers it is faster.
+     *
+     * In this particular implementation we feed the carries from
+     * behind which means when the loop terminates we still have one
+     * last digit to copy
      */
     tmpc = c->dp;
     for (ix = 1; ix < digs; ix++) {
+      /* forward the carry from the previous temp */
       W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
+
+      /* now extract the previous digit [below the carry] */
       *tmpc++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
     }
+    /* fetch the last digit */
     *tmpc++ = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK));
 
-    /* clear unused */
+    /* clear unused digits [that existed in the old copy of c] */
     for (; ix < olduse; ix++) {
       *tmpc++ = 0;
     }
   }
-
   mp_clamp (c);
   return MP_OKAY;
 }
diff --git a/bn_fast_s_mp_mul_high_digs.c b/bn_fast_s_mp_mul_high_digs.c
index 1cc1639..e0e9281 100644
--- a/bn_fast_s_mp_mul_high_digs.c
+++ b/bn_fast_s_mp_mul_high_digs.c
@@ -71,7 +71,7 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 
       /* compute column products for digits above the minimum */
       for (; iy < pb; iy++) {
-    *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
+         *_W++ += ((mp_word) tmpx) * ((mp_word)*tmpy++);
       }
     }
   }
@@ -80,12 +80,15 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
   oldused = c->used;
   c->used = newused;
 
-  /* now convert the array W downto what we need */
+  /* now convert the array W downto what we need
+   *
+   * See comments in bn_fast_s_mp_mul_digs.c
+   */
   for (ix = digs + 1; ix < newused; ix++) {
     W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
     c->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
   }
-  c->dp[(pa + pb + 1) - 1] = (mp_digit) (W[(pa + pb + 1) - 1] & ((mp_word) MP_MASK));
+  c->dp[newused - 1] = (mp_digit) (W[newused - 1] & ((mp_word) MP_MASK));
 
   for (; ix < oldused; ix++) {
     c->dp[ix] = 0;
diff --git a/bn_fast_s_mp_sqr.c b/bn_fast_s_mp_sqr.c
index 74179ee..2c01cd0 100644
--- a/bn_fast_s_mp_sqr.c
+++ b/bn_fast_s_mp_sqr.c
@@ -68,7 +68,7 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
      * for a particular column only once which means that
      * there is no need todo a double precision addition
      */
-    W2[ix + ix] = ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
+    W2[ix + ix] = ((mp_word)a->dp[ix]) * ((mp_word)a->dp[ix]);
 
     {
       register mp_digit tmpx, *tmpy;
@@ -86,7 +86,7 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
 
       /* inner products */
       for (iy = ix + 1; iy < pa; iy++) {
-          *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
+          *_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++);
       }
     }
   }
diff --git a/bn_mp_clear.c b/bn_mp_clear.c
index 8273ac9..bc31c42 100644
--- a/bn_mp_clear.c
+++ b/bn_mp_clear.c
@@ -19,7 +19,6 @@ void
 mp_clear (mp_int * a)
 {
   if (a->dp != NULL) {
-
     /* first zero the digits */
     memset (a->dp, 0, sizeof (mp_digit) * a->used);
 
@@ -27,7 +26,7 @@ mp_clear (mp_int * a)
     free (a->dp);
 
     /* reset members to make debugging easier */
-    a->dp = NULL;
+    a->dp    = NULL;
     a->alloc = a->used = 0;
   }
 }
diff --git a/bn_mp_div_d.c b/bn_mp_div_d.c
index c721e6e..0f683a5 100644
--- a/bn_mp_div_d.c
+++ b/bn_mp_div_d.c
@@ -14,6 +14,19 @@
  */
 #include <tommath.h>
 
+static int s_is_power_of_two(mp_digit b, int *p)
+{
+   int x;
+
+   for (x = 1; x < DIGIT_BIT; x++) {
+      if (b == (((mp_digit)1)<<x)) {
+         *p = x;
+         return 1;
+      }
+   }
+   return 0;
+}
+
 /* single digit division (based on routine from MPI) */
 int
 mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
@@ -22,15 +35,40 @@ mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
   mp_word w;
   mp_digit t;
   int     res, ix;
-  
+
+  /* cannot divide by zero */
   if (b == 0) {
      return MP_VAL;
   }
-  
+
+  /* quick outs */
+  if (b == 1 || mp_iszero(a) == 1) {
+     if (d != NULL) {
+        *d = 0;
+     }
+     if (c != NULL) {
+        return mp_copy(a, c);
+     }
+     return MP_OKAY;
+  }
+
+  /* power of two ? */
+  if (s_is_power_of_two(b, &ix) == 1) {
+     if (d != NULL) {
+        *d = a->dp[0] & ((1<<ix) - 1);
+     }
+     if (c != NULL) {
+        return mp_div_2d(a, ix, c, NULL);
+     }
+     return MP_OKAY;
+  }
+
+  /* three? */
   if (b == 3) {
      return mp_div_3(a, c, d);
   }
-  
+
+  /* no easy answer [c'est la vie].  Just division */
   if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
      return res;
   }
diff --git a/bn_mp_exptmod_fast.c b/bn_mp_exptmod_fast.c
index 567d614..8a5e565 100644
--- a/bn_mp_exptmod_fast.c
+++ b/bn_mp_exptmod_fast.c
@@ -82,7 +82,6 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
     }
   }
 
-
   /* determine and setup reduction code */
   if (redmode == 0) {
      /* now setup montgomery  */
diff --git a/bn_mp_montgomery_reduce.c b/bn_mp_montgomery_reduce.c
index e422cf3..99a8a55 100644
--- a/bn_mp_montgomery_reduce.c
+++ b/bn_mp_montgomery_reduce.c
@@ -44,7 +44,7 @@ mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
 
   for (ix = 0; ix < n->used; ix++) {
     /* mu = ai * m' mod b */
-    mu = (x->dp[ix] * rho) & MP_MASK;
+    mu = ((mp_word)x->dp[ix]) * ((mp_word)rho) & MP_MASK;
 
     /* a = a + mu * m * b**i */
     {
@@ -61,7 +61,7 @@ mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
       
       /* Multiply and add in place */
       for (iy = 0; iy < n->used; iy++) {
-        r       = ((mp_word) mu) * ((mp_word) * tmpn++) + 
+        r       = ((mp_word)mu) * ((mp_word)*tmpn++) +
                   ((mp_word) u) + ((mp_word) * tmpx);
         u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
         *tmpx++ = (mp_digit)(r & ((mp_word) MP_MASK));
diff --git a/bn_mp_mul_d.c b/bn_mp_mul_d.c
index 1c22208..8379035 100644
--- a/bn_mp_mul_d.c
+++ b/bn_mp_mul_d.c
@@ -50,7 +50,7 @@ mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
     u = 0;
     for (ix = 0; ix < pa; ix++) {
       /* compute product and carry sum for this term */
-      r = ((mp_word) u) + ((mp_word) * tmpa++) * ((mp_word) b);
+      r = ((mp_word) u) + ((mp_word)*tmpa++) * ((mp_word)b);
 
       /* mask off higher bits to get a single digit */
       *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
diff --git a/bn_mp_prime_fermat.c b/bn_mp_prime_fermat.c
index 202f45a..b0e9746 100644
--- a/bn_mp_prime_fermat.c
+++ b/bn_mp_prime_fermat.c
@@ -31,6 +31,11 @@ mp_prime_fermat (mp_int * a, mp_int * b, int *result)
   /* default to fail */
   *result = 0;
 
+  /* ensure b > 1 */
+  if (mp_cmp_d(b, 1) != MP_GT) {
+     return MP_VAL;
+  }
+
   /* init t */
   if ((err = mp_init (&t)) != MP_OKAY) {
     return err;
diff --git a/bn_mp_prime_is_prime.c b/bn_mp_prime_is_prime.c
index 1a782b3..f9cece9 100644
--- a/bn_mp_prime_is_prime.c
+++ b/bn_mp_prime_is_prime.c
@@ -17,7 +17,7 @@
 /* performs a variable number of rounds of Miller-Rabin
  *
  * Probability of error after t rounds is no more than
- * (1/4)^t when 1 <= t <= 256
+ * (1/4)^t when 1 <= t <= PRIME_SIZE
  *
  * Sets result to 1 if probably prime, 0 otherwise
  */
@@ -31,7 +31,7 @@ mp_prime_is_prime (mp_int * a, int t, int *result)
   *result = 0;
 
   /* valid value of t? */
-  if (t < 1 || t > PRIME_SIZE) {
+  if (t <= 0 || t > PRIME_SIZE) {
     return MP_VAL;
   }
 
@@ -47,6 +47,8 @@ mp_prime_is_prime (mp_int * a, int t, int *result)
   if ((err = mp_prime_is_divisible (a, &res)) != MP_OKAY) {
     return err;
   }
+
+  /* return if it was trivially divisible */
   if (res == 1) {
     return MP_OKAY;
   }
diff --git a/bn_mp_prime_miller_rabin.c b/bn_mp_prime_miller_rabin.c
index 4a96674..f68ba75 100644
--- a/bn_mp_prime_miller_rabin.c
+++ b/bn_mp_prime_miller_rabin.c
@@ -30,6 +30,11 @@ mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result)
   /* default */
   *result = 0;
 
+  /* ensure b > 1 */
+  if (mp_cmp_d(b, 1) != MP_GT) {
+     return MP_VAL;
+  }     
+
   /* get n1 = a - 1 */
   if ((err = mp_init_copy (&n1, a)) != MP_OKAY) {
     return err;
@@ -42,8 +47,13 @@ mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result)
   if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) {
     goto __N1;
   }
- 
+
+  /* count the number of least significant bits
+   * which are zero
+   */
   s = mp_cnt_lsb(&r);
+
+  /* now divide n - 1 by 2^s */
   if ((err = mp_div_2d (&r, s, &r, NULL)) != MP_OKAY) {
     goto __R;
   }
diff --git a/bn_mp_prime_next_prime.c b/bn_mp_prime_next_prime.c
index cfebbe5..0dde42a 100644
--- a/bn_mp_prime_next_prime.c
+++ b/bn_mp_prime_next_prime.c
@@ -16,39 +16,151 @@
 
 /* finds the next prime after the number "a" using "t" trials
  * of Miller-Rabin.
+ *
+ * bbs_style = 1 means the prime must be congruent to 3 mod 4
  */
-int mp_prime_next_prime(mp_int *a, int t)
+int mp_prime_next_prime(mp_int *a, int t, int bbs_style)
 {
-   int err, res;
+   int      err, res, x, y;
+   mp_digit res_tab[PRIME_SIZE], step, kstep;
+   mp_int   b;
 
-   if (mp_iseven(a) == 1) {
-      /* force odd */
-      if ((err = mp_add_d(a, 1, a)) != MP_OKAY) {
-         return err;
+   /* ensure t is valid */
+   if (t <= 0 || t > PRIME_SIZE) {
+      return MP_VAL;
+   }
+
+   /* force positive */
+   if (a->sign == MP_NEG) {
+      a->sign = MP_ZPOS;
+   }
+
+   /* simple algo if a is less than the largest prime in the table */
+   if (mp_cmp_d(a, __prime_tab[PRIME_SIZE-1]) == MP_LT) {
+      /* find which prime it is bigger than */
+      for (x = PRIME_SIZE - 2; x >= 0; x--) {
+          if (mp_cmp_d(a, __prime_tab[x]) != MP_LT) {
+             if (bbs_style == 1) {
+                /* ok we found a prime smaller or
+                 * equal [so the next is larger]
+                 *
+                 * however, the prime must be
+                 * congruent to 3 mod 4
+                 */
+                if ((__prime_tab[x + 1] & 3) != 3) {
+                   /* scan upwards for a prime congruent to 3 mod 4 */
+                   for (y = x + 1; y < PRIME_SIZE; y++) {
+                       if ((__prime_tab[y] & 3) == 3) {
+                          mp_set(a, __prime_tab[y]);
+                          return MP_OKAY;
+                       }
+                   }
+                }
+             } else {
+                mp_set(a, __prime_tab[x + 1]);
+                return MP_OKAY;
+             }
+          }
+      }
+      /* at this point a maybe 1 */
+      if (mp_cmp_d(a, 1) == MP_EQ) {
+         mp_set(a, 2);
+         return MP_OKAY;
+      }
+      /* fall through to the sieve */
+   }
+
+   /* generate a prime congruent to 3 mod 4 or 1/3 mod 4? */
+   if (bbs_style == 1) {
+      kstep   = 4;
+   } else {
+      kstep   = 2;
+   }
+
+   /* at this point we will use a combination of a sieve and Miller-Rabin */
+
+   if (bbs_style == 1) {
+      /* if a mod 4 != 3 subtract the correct value to make it so */
+      if ((a->dp[0] & 3) != 3) {
+         if ((err = mp_sub_d(a, (a->dp[0] & 3) + 1, a)) != MP_OKAY) { return err; };
       }
    } else {
-      /* force to next odd number */
-      if ((err = mp_add_d(a, 2, a)) != MP_OKAY) {
+      if (mp_iseven(a) == 1) {
+         /* force odd */
+         if ((err = mp_sub_d(a, 1, a)) != MP_OKAY) {
+            return err;
+         }
+      }
+   }
+
+   /* generate the restable */
+   for (x = 1; x < PRIME_SIZE; x++) {
+      if ((err = mp_mod_d(a, __prime_tab[x], res_tab + x)) != MP_OKAY) {
          return err;
       }
    }
 
+   /* init temp used for Miller-Rabin Testing */
+   if ((err = mp_init(&b)) != MP_OKAY) {
+      return err;
+   }
+
    for (;;) {
+      /* skip to the next non-trivially divisible candidate */
+      step = 0;
+      do {
+         /* y == 1 if any residue was zero [e.g. cannot be prime] */
+         y     =  0;
+
+         /* increase step to next odd */
+         step += kstep;
+
+         /* compute the new residue without using division */
+         for (x = 1; x < PRIME_SIZE; x++) {
+             /* add the step to each residue */
+             res_tab[x] += kstep;
+
+             /* subtract the modulus [instead of using division] */
+             if (res_tab[x] >= __prime_tab[x]) {
+                res_tab[x]  -= __prime_tab[x];
+             }
+
+             /* set flag if zero */
+             if (res_tab[x] == 0) {
+                y = 1;
+             }
+         }
+      } while (y == 1 && step < ((((mp_digit)1)<<DIGIT_BIT) - kstep));
+
+      /* add the step */
+      if ((err = mp_add_d(a, step, a)) != MP_OKAY) {
+         goto __ERR;
+      }
+
+      /* if step == MAX then skip test */
+      if (step >= ((((mp_digit)1)<<DIGIT_BIT) - kstep)) {
+         continue;
+      }
+
       /* is this prime? */
-      if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY) {
-         return err;
+      for (x = 0; x < t; x++) {
+          mp_set(&b, __prime_tab[t]);
+          if ((err = mp_prime_miller_rabin(a, &b, &res)) != MP_OKAY) {
+             goto __ERR;
+          }
+          if (res == 0) {
+             break;
+          }
       }
 
       if (res == 1) {
          break;
       }
-
-      /* add two, next candidate */
-      if ((err = mp_add_d(a, 2, a)) != MP_OKAY) {
-         return err;
-      }
    }
 
-   return MP_OKAY;
+   err = MP_OKAY;
+__ERR:
+   mp_clear(&b);
+   return err;
 }
 
diff --git a/bn_mp_read_unsigned_bin.c b/bn_mp_read_unsigned_bin.c
index 378d1fa..8ca2c11 100644
--- a/bn_mp_read_unsigned_bin.c
+++ b/bn_mp_read_unsigned_bin.c
@@ -25,14 +25,14 @@ mp_read_unsigned_bin (mp_int * a, unsigned char *b, int c)
       return res;
     }
 
-    if (DIGIT_BIT != 7) {
+#ifndef MP_8BIT
       a->dp[0] |= *b++;
       a->used += 1;
-    } else {
+#else
       a->dp[0] = (*b & MP_MASK);
       a->dp[1] |= ((*b++ >> 7U) & 1);
       a->used += 2;
-    }
+#endif
   }
   mp_clamp (a);
   return MP_OKAY;
diff --git a/bn_mp_to_unsigned_bin.c b/bn_mp_to_unsigned_bin.c
index 8f5eeb7..54e0739 100644
--- a/bn_mp_to_unsigned_bin.c
+++ b/bn_mp_to_unsigned_bin.c
@@ -27,11 +27,11 @@ mp_to_unsigned_bin (mp_int * a, unsigned char *b)
 
   x = 0;
   while (mp_iszero (&t) == 0) {
-    if (DIGIT_BIT != 7) {
+#ifndef MP_8BIT
       b[x++] = (unsigned char) (t.dp[0] & 255);
-    } else {
+#else
       b[x++] = (unsigned char) (t.dp[0] | ((t.dp[1] & 0x01) << 7));
-    }
+#endif
     if ((res = mp_div_2d (&t, 8, &t, NULL)) != MP_OKAY) {
       mp_clear (&t);
       return res;
diff --git a/bn_s_mp_mul_digs.c b/bn_s_mp_mul_digs.c
index c126a0c..cb3dbd7 100644
--- a/bn_s_mp_mul_digs.c
+++ b/bn_s_mp_mul_digs.c
@@ -62,7 +62,7 @@ s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
     for (iy = 0; iy < pb; iy++) {
       /* compute the column as a mp_word */
       r = ((mp_word) *tmpt) + 
-          ((mp_word) tmpx) * ((mp_word) * tmpy++) + 
+          ((mp_word)tmpx) * ((mp_word)*tmpy++) +
           ((mp_word) u);
 
       /* the new column is the lower part of the result */
diff --git a/bn_s_mp_mul_high_digs.c b/bn_s_mp_mul_high_digs.c
index bbe7378..a0c2c0e 100644
--- a/bn_s_mp_mul_high_digs.c
+++ b/bn_s_mp_mul_high_digs.c
@@ -55,7 +55,7 @@ s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 
     for (iy = digs - ix; iy < pb; iy++) {
       /* calculate the double precision result */
-      r = ((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) + ((mp_word) u);
+      r = ((mp_word) * tmpt) + ((mp_word)tmpx) * ((mp_word)*tmpy++) + ((mp_word) u);
 
       /* get the lower part */
       *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
diff --git a/bn_s_mp_sqr.c b/bn_s_mp_sqr.c
index bd4bc51..d45a00e 100644
--- a/bn_s_mp_sqr.c
+++ b/bn_s_mp_sqr.c
@@ -32,8 +32,8 @@ s_mp_sqr (mp_int * a, mp_int * b)
   for (ix = 0; ix < pa; ix++) {
     /* first calculate the digit at 2*ix */
     /* calculate double precision result */
-    r = ((mp_word) t.dp[2*ix]) + 
-        ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
+    r = ((mp_word) t.dp[2*ix]) +
+        ((mp_word)a->dp[ix])*((mp_word)a->dp[ix]);
 
     /* store lower part in result */
     t.dp[2*ix] = (mp_digit) (r & ((mp_word) MP_MASK));
@@ -49,12 +49,12 @@ s_mp_sqr (mp_int * a, mp_int * b)
     
     for (iy = ix + 1; iy < pa; iy++) {
       /* first calculate the product */
-      r = ((mp_word) tmpx) * ((mp_word) a->dp[iy]);
+      r = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
 
       /* now calculate the double precision result, note we use
        * addition instead of *2 since it's easier to optimize
        */
-      r = ((mp_word) * tmpt) + r + r + ((mp_word) u);
+      r = ((mp_word) *tmpt) + r + r + ((mp_word) u);
 
       /* store lower part */
       *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
diff --git a/changes.txt b/changes.txt
index 2dc77c8..be4869f 100644
--- a/changes.txt
+++ b/changes.txt
@@ -1,3 +1,21 @@
+July 12th, 2003
+v0.23  -- Optimized mp_prime_next_prime() to not use mp_mod [via is_divisible()] in each
+          iteration.  Instead now a smaller table is kept of the residues which can be updated
+          without division.
+       -- Fixed a bug in next_prime() where an input of zero would be treated as odd and
+          have two added to it [to move to the next odd].
+       -- fixed a bug in prime_fermat() and prime_miller_rabin() which allowed the base
+          to be negative, zero or one.  Normally the test is only valid if the base is
+          greater than one.
+       -- changed the next_prime() prototype to accept a new parameter "bbs_style" which
+          will find the next prime congruent to 3 mod 4.  The default [bbs_style==0] will
+          make primes which are either congruent to 1 or 3 mod 4.
+       -- fixed mp_read_unsigned_bin() so that it doesn't include both code for
+          the case DIGIT_BIT < 8 and >= 8
+       -- optimized div_d() to easy out on division by 1 [or if a == 0] and use
+          logical shifts if the divisor is a power of two.
+       -- the default DIGIT_BIT type was not int for non-default builds.  Fixed.
+
 July 2nd, 2003
 v0.22  -- Fixed up mp_invmod so the result is properly in range now [was always congruent to the inverse...]
        -- Fixed up s_mp_exptmod and mp_exptmod_fast so the lower half of the pre-computed table isn't allocated
diff --git a/demo/demo.c b/demo/demo.c
index a60d112..e7b3fdb 100644
--- a/demo/demo.c
+++ b/demo/demo.c
@@ -65,6 +65,31 @@ int main(void)
    
    srand(time(NULL));
 
+#if 0
+   for (;;) {
+      fgets(buf, sizeof(buf), stdin);
+      mp_read_radix(&a, buf, 10);
+      mp_prime_next_prime(&a, 5, 1);
+      mp_toradix(&a, buf, 10);
+      printf("%s, %lu\n", buf, a.dp[0] & 3);
+   }
+#endif
+
+#if 0
+{
+   mp_word aa, bb;
+
+   for (;;) {
+       aa = abs(rand()) & MP_MASK;
+       bb = abs(rand()) & MP_MASK;
+	  if (MULT(aa,bb) != (aa*bb)) {
+             printf("%llu * %llu == %llu or %llu?\n", aa, bb, (ulong64)MULT(aa,bb), (ulong64)(aa*bb));
+             return 0;
+          }
+   }
+}
+#endif
+
 #if 0
    /* test mp_cnt_lsb */
    mp_set(&a, 1);
@@ -122,7 +147,6 @@ int main(void)
 
 /* test the DR reduction */
 #if 0
-
    for (cnt = 2; cnt < 32; cnt++) {
        printf("%d digit modulus\n", cnt);
        mp_grow(&a, cnt);
@@ -175,8 +199,6 @@ int main(void)
          fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt);
       }
       fclose(log);
-      
-      return 0;
 
       log = fopen("logs/sub.log", "w");
       for (cnt = 8; cnt <= 128; cnt += 8) {
diff --git a/etc/2kprime.1 b/etc/2kprime.1
index c41ded1..e1384db 100644
--- a/etc/2kprime.1
+++ b/etc/2kprime.1
@@ -1,2 +1 @@
-256-bits (k = 36113) = 115792089237316195423570985008687907853269984665640564039457584007913129603823
-512-bits (k = 38117) = 13407807929942597099574024998205846127479365820592393377723561443721764030073546976801874298166903427690031858186486050853753882811946569946433649006045979
+259-bits (k = 17745) = 926336713898529563388567880069503262826159877325124512315660672063305037101743
diff --git a/etc/drprime.c b/etc/drprime.c
index 157e358..2b561e3 100644
--- a/etc/drprime.c
+++ b/etc/drprime.c
@@ -1,7 +1,7 @@
 /* Makes safe primes of a DR nature */
 #include <tommath.h>
 
-const int sizes[] = { 8, 19, 28, 37, 55, 74,  110, 147 };
+int sizes[] = { 256/DIGIT_BIT, 512/DIGIT_BIT, 768/DIGIT_BIT, 1024/DIGIT_BIT, 2048/DIGIT_BIT, 4096/DIGIT_BIT };
 int main(void)
 {
    int res, x, y;
@@ -14,6 +14,7 @@ int main(void)
    
    out = fopen("drprimes.txt", "w");
    for (x = 0; x < (int)(sizeof(sizes)/sizeof(sizes[0])); x++) {
+   top:
        printf("Seeking a %d-bit safe prime\n", sizes[x] * DIGIT_BIT);
        mp_grow(&a, sizes[x]);
        mp_zero(&a);
@@ -22,21 +23,26 @@ int main(void)
        }
        
        /* make a DR modulus */
-       a.dp[0] = 1;
+       a.dp[0] = -1;
        a.used = sizes[x];
        
        /* now loop */
-       do { 
-          fflush(stdout);
-          mp_prime_next_prime(&a, 3);
-          printf(".");
+       for (;;) { 
+          a.dp[0] += 4;
+          if (a.dp[0] >= MP_MASK) break;
+          mp_prime_is_prime(&a, 1, &res);
+          if (res == 0) continue;
+          printf("."); fflush(stdout);
           mp_sub_d(&a, 1, &b);
           mp_div_2(&b, &b);
           mp_prime_is_prime(&b, 3, &res);  
-	} while (res == 0);          
+          if (res == 0) continue;
+          mp_prime_is_prime(&a, 3, &res);
+          if (res == 1) break;
+	}
         
-        if (mp_dr_is_modulus(&a) != 1) {
-           printf("Error not DR modulus\n");
+        if (res != 1) {
+           printf("Error not DR modulus\n"); sizes[x] += 1; goto top;
         } else {
            mp_toradix(&a, buf, 10);
            printf("\n\np == %s\n\n", buf);
diff --git a/etc/drprimes.1 b/etc/drprimes.28
similarity index 99%
rename from etc/drprimes.1
rename to etc/drprimes.28
index e7cc366..9d438ad 100644
--- a/etc/drprimes.1
+++ b/etc/drprimes.28
@@ -1,3 +1,5 @@
+DR safe primes for 28-bit digits.
+
 224-bit prime:
 p == 26959946667150639794667015087019630673637144422540572481103341844143
 
diff --git a/etc/drprimes.txt b/etc/drprimes.txt
index 6593cd5..717420d 100644
--- a/etc/drprimes.txt
+++ b/etc/drprimes.txt
@@ -1,6 +1,15 @@
-224-bit prime:
-p == 26959946667150639794667015087019630673637144422540572481103341844143
+300-bit prime:
+p == 2037035976334486086268445688409378161051468393665936250636140449354381299763336706183393387
 
-532-bit prime:
-p == 14059105607947488696282932836518693308967803494693489478439861164411992439598399594747002144074658928593502845729752797260025831423419686528151609940203368691747
+510-bit prime:
+p == 3351951982485649274893506249551461531869841455148098344430890360930441007518386744200468574541725856922507964546621512713438470702986642486608412251494847
+
+765-bit prime:
+p == 194064761537588616893622436057812819407110752139587076392381504753256369085797110791359801103580809743810966337141384150771447505514351798930535909380147642400556872002606238193783160703949805603157874899214558593861605856727005843
+
+1740-bit prime:
+p == 61971563797913992479098926650774597592238975917324828616370329001490282756182680310375299496755876376552390992409906202402580445340335946188208371182877207703039791403230793200138374588682414508868522097839706723444887189794752005280474068640895359332705297533442094790319040758184131464298255306336601284054032615054089107503261218395204931877449590906016855549287497608058070532126514935495184332288660623518513755499687752752528983258754107553298994358814410594621086881204713587661301862918471291451469190214535690028223
+
+2145-bit prime:
+p == 5120834017984591518147028606005386392991070803233539296225079797126347381640561714282620018633786528684625023495254338414266418034876748837569635527008462887513799703364491256252208677097644781218029521545625387720450034199300257983090290650191518075514440227307582827991892955933645635564359934476985058395497772801264225688705417270604479898255105628816161764712152286804906915652283101897505006786990112535065979412882966109410722156057838063961993103028819293481078313367826492291911499907219457764211473530756735049840415233164976184864760203928986194694093688479274544786530457604655777313274555786861719645260099496565700321073395329400403
 
diff --git a/logs/add.log b/logs/add.log
index e69de29..1d02326 100644
--- a/logs/add.log
+++ b/logs/add.log
@@ -0,0 +1,16 @@
+224  12444616
+448  10412040
+672   8825112
+896   7519080
+1120   6428432
+1344   5794784
+1568   5242952
+1792   4737008
+2016   4434104
+2240   4132912
+2464   3827752
+2688   3589672
+2912   3350176
+3136   3180208
+3360   3014160
+3584   2847672
diff --git a/logs/sqr.log b/logs/sqr.log
index 2fb2e98..e69de29 100644
--- a/logs/sqr.log
+++ b/logs/sqr.log
@@ -1,17 +0,0 @@
-896    415472
-1344    223736
-1792    141232
-2240     97624
-2688     71400
-3136     54800
-3584     16904
-4032     13528
-4480     10968
-4928      9128
-5376      7784
-5824      6672
-6272      5760
-6720      5056
-7168      4440
-7616      3952
-8064      3512
diff --git a/logs/sub.log b/logs/sub.log
index 91c7d65..272c098 100644
--- a/logs/sub.log
+++ b/logs/sub.log
@@ -1,16 +1,16 @@
-224   9728504
-448   8573648
-672   7488096
-896   6714064
-1120   5950472
-1344   5457400
-1568   5038896
-1792   4683632
-2016   4384656
-2240   4105976
-2464   3871608
-2688   3650680
-2912   3463552
-3136   3290016
-3360   3135272
-3584   2993848
+224  10876088
+448   9103552
+672   7823536
+896   6724960
+1120   5993496
+1344   5278984
+1568   4947736
+1792   4478384
+2016   4108840
+2240   3838696
+2464   3604128
+2688   3402192
+2912   3166568
+3136   3090672
+3360   2946720
+3584   2781288
diff --git a/makefile b/makefile
index ddfb64c..5ff6957 100644
--- a/makefile
+++ b/makefile
@@ -1,6 +1,6 @@
 CFLAGS  +=  -I./ -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops
 
-VERSION=0.22
+VERSION=0.23
 
 default: libtommath.a
 
diff --git a/makefile.msvc b/makefile.msvc
index 619e2f0..0af7c89 100644
--- a/makefile.msvc
+++ b/makefile.msvc
@@ -2,7 +2,7 @@
 #
 #Tom St Denis
 
-CFLAGS = /I. /Ox /DWIN32 /W3 /WX
+CFLAGS = /I. /Ox /DWIN32 /W3
 
 default: library
 
@@ -29,7 +29,5 @@ bn_mp_reduce_2k.obj bn_mp_reduce_is_2k.obj bn_mp_reduce_2k_setup.obj \
 bn_mp_radix_smap.obj bn_mp_read_radix.obj bn_mp_toradix.obj bn_mp_radix_size.obj \
 bn_mp_fread.obj bn_mp_fwrite.obj bn_mp_cnt_lsb.obj
 
-
-
 library: $(OBJECTS)
 	lib /out:tommath.lib $(OBJECTS)
diff --git a/poster.pdf b/poster.pdf
index a37d2db..3d3377d 100644
Binary files a/poster.pdf and b/poster.pdf differ
diff --git a/poster.tex b/poster.tex
index 83ff45f..64af993 100644
--- a/poster.tex
+++ b/poster.tex
@@ -1,36 +1,36 @@
-\documentclass[landscape,11pt]{article}
-\usepackage{amsmath, amssymb}
-\usepackage{hyperref}
-\begin{document}
-
-\hspace*{-3in}
-\begin{tabular}{llllll}
-$c = a + b$  & {\tt mp\_add(\&a, \&b, \&c)} & $b = 2a$  & {\tt mp\_mul\_2(\&a, \&b)} & \\
-$c = a - b$  & {\tt mp\_sub(\&a, \&b, \&c)} & $b = a/2$ & {\tt mp\_div\_2(\&a, \&b)} & \\
-$c = ab $   & {\tt mp\_mul(\&a, \&b, \&c)}  & $c = 2^ba$  & {\tt mp\_mul\_2d(\&a, b, \&c)}  \\
-$b = a^2 $  & {\tt mp\_sqr(\&a, \&b)}       & $c = a/2^b, d = a \mod 2^b$ & {\tt mp\_div\_2d(\&a, b, \&c, \&d)} \\
-$c = \lfloor a/b \rfloor, d = a \mod b$ & {\tt mp\_div(\&a, \&b, \&c, \&d)} & $c = a \mod 2^b $  & {\tt mp\_mod\_2d(\&a, b, \&c)}  \\
- && \\
-$a = b $  & {\tt mp\_set\_int(\&a, b)}  & $c = a \vee b$  & {\tt mp\_or(\&a, \&b, \&c)}  \\
-$b = a $  & {\tt mp\_copy(\&a, \&b)} & $c = a \wedge b$  & {\tt mp\_and(\&a, \&b, \&c)}  \\
- && $c = a \oplus b$  & {\tt mp\_xor(\&a, \&b, \&c)}  \\
- & \\
-$b = -a $  & {\tt mp\_neg(\&a, \&b)}  & $d = a + b \mod c$  & {\tt mp\_addmod(\&a, \&b, \&c, \&d)}  \\
-$b = |a| $  & {\tt mp\_abs(\&a, \&b)} & $d = a - b \mod c$  & {\tt mp\_submod(\&a, \&b, \&c, \&d)}  \\
- && $d = ab \mod c$  & {\tt mp\_mulmod(\&a, \&b, \&c, \&d)}  \\
-Compare $a$ and $b$ & {\tt mp\_cmp(\&a, \&b)} & $c = a^2 \mod b$  & {\tt mp\_sqrmod(\&a, \&b, \&c)}  \\
-Is Zero? & {\tt mp\_iszero(\&a)} & $c = a^{-1} \mod b$  & {\tt mp\_invmod(\&a, \&b, \&c)} \\
-Is Even? & {\tt mp\_iseven(\&a)} & $d = a^b \mod c$ & {\tt mp\_exptmod(\&a, \&b, \&c, \&d)} \\
-Is Odd ? & {\tt mp\_isodd(\&a)} \\
-&\\
-$\vert \vert a \vert \vert$ & {\tt mp\_unsigned\_bin\_size(\&a)} & $res$ = 1 if $a$ prime to $t$ rounds? & {\tt mp\_prime\_is\_prime(\&a, t, \&res)} \\
-$buf \leftarrow a$          & {\tt mp\_to\_unsigned\_bin(\&a, buf)} & Next prime after $a$ to $t$ rounds. & {\tt mp\_prime\_next\_prime(\&a, t)} \\
-$a \leftarrow buf[0..len-1]$          & {\tt mp\_read\_unsigned\_bin(\&a, buf, len)} \\
-&\\
-$b = \sqrt{a}$ & {\tt mp\_sqrt(\&a, \&b)}  & $c = \mbox{gcd}(a, b)$ & {\tt mp\_gcd(\&a, \&b, \&c)} \\
-$c = a^{1/b}$ & {\tt mp\_n\_root(\&a, b, \&c)} & $c = \mbox{lcm}(a, b)$ & {\tt mp\_lcm(\&a, \&b, \&c)} \\
-&\\
-Greater Than & MP\_GT & Equal To & MP\_EQ \\
-Less Than & MP\_LT & Bits per digit & DIGIT\_BIT \\
-\end{tabular}
-\end{document}
\ No newline at end of file
+\documentclass[landscape,11pt]{article}
+\usepackage{amsmath, amssymb}
+\usepackage{hyperref}
+\begin{document}
+
+\hspace*{-3in}
+\begin{tabular}{llllll}
+$c = a + b$  & {\tt mp\_add(\&a, \&b, \&c)} & $b = 2a$  & {\tt mp\_mul\_2(\&a, \&b)} & \\
+$c = a - b$  & {\tt mp\_sub(\&a, \&b, \&c)} & $b = a/2$ & {\tt mp\_div\_2(\&a, \&b)} & \\
+$c = ab $   & {\tt mp\_mul(\&a, \&b, \&c)}  & $c = 2^ba$  & {\tt mp\_mul\_2d(\&a, b, \&c)}  \\
+$b = a^2 $  & {\tt mp\_sqr(\&a, \&b)}       & $c = a/2^b, d = a \mod 2^b$ & {\tt mp\_div\_2d(\&a, b, \&c, \&d)} \\
+$c = \lfloor a/b \rfloor, d = a \mod b$ & {\tt mp\_div(\&a, \&b, \&c, \&d)} & $c = a \mod 2^b $  & {\tt mp\_mod\_2d(\&a, b, \&c)}  \\
+ && \\
+$a = b $  & {\tt mp\_set\_int(\&a, b)}  & $c = a \vee b$  & {\tt mp\_or(\&a, \&b, \&c)}  \\
+$b = a $  & {\tt mp\_copy(\&a, \&b)} & $c = a \wedge b$  & {\tt mp\_and(\&a, \&b, \&c)}  \\
+ && $c = a \oplus b$  & {\tt mp\_xor(\&a, \&b, \&c)}  \\
+ & \\
+$b = -a $  & {\tt mp\_neg(\&a, \&b)}  & $d = a + b \mod c$  & {\tt mp\_addmod(\&a, \&b, \&c, \&d)}  \\
+$b = |a| $  & {\tt mp\_abs(\&a, \&b)} & $d = a - b \mod c$  & {\tt mp\_submod(\&a, \&b, \&c, \&d)}  \\
+ && $d = ab \mod c$  & {\tt mp\_mulmod(\&a, \&b, \&c, \&d)}  \\
+Compare $a$ and $b$ & {\tt mp\_cmp(\&a, \&b)} & $c = a^2 \mod b$  & {\tt mp\_sqrmod(\&a, \&b, \&c)}  \\
+Is Zero? & {\tt mp\_iszero(\&a)} & $c = a^{-1} \mod b$  & {\tt mp\_invmod(\&a, \&b, \&c)} \\
+Is Even? & {\tt mp\_iseven(\&a)} & $d = a^b \mod c$ & {\tt mp\_exptmod(\&a, \&b, \&c, \&d)} \\
+Is Odd ? & {\tt mp\_isodd(\&a)} \\
+&\\
+$\vert \vert a \vert \vert$ & {\tt mp\_unsigned\_bin\_size(\&a)} & $res$ = 1 if $a$ prime to $t$ rounds? & {\tt mp\_prime\_is\_prime(\&a, t, \&res)} \\
+$buf \leftarrow a$          & {\tt mp\_to\_unsigned\_bin(\&a, buf)} & Next prime after $a$ to $t$ rounds. & {\tt mp\_prime\_next\_prime(\&a, t, bbs\_style)} \\
+$a \leftarrow buf[0..len-1]$          & {\tt mp\_read\_unsigned\_bin(\&a, buf, len)} \\
+&\\
+$b = \sqrt{a}$ & {\tt mp\_sqrt(\&a, \&b)}  & $c = \mbox{gcd}(a, b)$ & {\tt mp\_gcd(\&a, \&b, \&c)} \\
+$c = a^{1/b}$ & {\tt mp\_n\_root(\&a, b, \&c)} & $c = \mbox{lcm}(a, b)$ & {\tt mp\_lcm(\&a, \&b, \&c)} \\
+&\\
+Greater Than & MP\_GT & Equal To & MP\_EQ \\
+Less Than & MP\_LT & Bits per digit & DIGIT\_BIT \\
+\end{tabular}
+\end{document}
diff --git a/pre_gen/mpi.c b/pre_gen/mpi.c
index 9818cbe..5c09fbf 100644
--- a/pre_gen/mpi.c
+++ b/pre_gen/mpi.c
@@ -216,7 +216,7 @@ fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
      * that W[ix-1] have  the carry cleared (see after the inner loop)
      */
     register mp_digit mu;
-    mu = (((mp_digit) (W[ix] & MP_MASK)) * rho) & MP_MASK;
+    mu = ((W[ix] & MP_MASK) * rho) & MP_MASK;
 
     /* a = a + mu * m * b**i
      *
@@ -245,7 +245,7 @@ fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
 
       /* inner loop */
       for (iy = 0; iy < n->used; iy++) {
-          *_W++ += ((mp_word) mu) * ((mp_word) * tmpn++);
+          *_W++ += ((mp_word)mu) * ((mp_word)*tmpn++);
       }
     }
 
@@ -253,7 +253,6 @@ fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
     W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
   }
 
-
   {
     register mp_digit *tmpx;
     register mp_word *_W, *_W1;
@@ -383,7 +382,7 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
       pb = MIN (b->used, digs - ix);
 
       for (iy = 0; iy < pb; iy++) {
-        *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
+        *_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++);
       }
     }
 
@@ -406,20 +405,27 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
      * from N*(N+N*c)==N**2 + c*N**2 to N**2 + N*c where c is the 
      * cost of the shifting.  On very small numbers this is slower 
      * but on most cryptographic size numbers it is faster.
+     *
+     * In this particular implementation we feed the carries from
+     * behind which means when the loop terminates we still have one
+     * last digit to copy
      */
     tmpc = c->dp;
     for (ix = 1; ix < digs; ix++) {
+      /* forward the carry from the previous temp */
       W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
+
+      /* now extract the previous digit [below the carry] */
       *tmpc++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
     }
+    /* fetch the last digit */
     *tmpc++ = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK));
 
-    /* clear unused */
+    /* clear unused digits [that existed in the old copy of c] */
     for (; ix < olduse; ix++) {
       *tmpc++ = 0;
     }
   }
-
   mp_clamp (c);
   return MP_OKAY;
 }
@@ -500,7 +506,7 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 
       /* compute column products for digits above the minimum */
       for (; iy < pb; iy++) {
-    *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
+         *_W++ += ((mp_word) tmpx) * ((mp_word)*tmpy++);
       }
     }
   }
@@ -509,12 +515,15 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
   oldused = c->used;
   c->used = newused;
 
-  /* now convert the array W downto what we need */
+  /* now convert the array W downto what we need
+   *
+   * See comments in bn_fast_s_mp_mul_digs.c
+   */
   for (ix = digs + 1; ix < newused; ix++) {
     W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
     c->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
   }
-  c->dp[(pa + pb + 1) - 1] = (mp_digit) (W[(pa + pb + 1) - 1] & ((mp_word) MP_MASK));
+  c->dp[newused - 1] = (mp_digit) (W[newused - 1] & ((mp_word) MP_MASK));
 
   for (; ix < oldused; ix++) {
     c->dp[ix] = 0;
@@ -596,7 +605,7 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
      * for a particular column only once which means that
      * there is no need todo a double precision addition
      */
-    W2[ix + ix] = ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
+    W2[ix + ix] = ((mp_word)a->dp[ix]) * ((mp_word)a->dp[ix]);
 
     {
       register mp_digit tmpx, *tmpy;
@@ -614,7 +623,7 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
 
       /* inner products */
       for (iy = ix + 1; iy < pa; iy++) {
-          *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
+          *_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++);
       }
     }
   }
@@ -972,7 +981,6 @@ void
 mp_clear (mp_int * a)
 {
   if (a->dp != NULL) {
-
     /* first zero the digits */
     memset (a->dp, 0, sizeof (mp_digit) * a->used);
 
@@ -980,7 +988,7 @@ mp_clear (mp_int * a)
     free (a->dp);
 
     /* reset members to make debugging easier */
-    a->dp = NULL;
+    a->dp    = NULL;
     a->alloc = a->used = 0;
   }
 }
@@ -1724,6 +1732,19 @@ mp_div_3 (mp_int * a, mp_int *c, mp_digit * d)
  */
 #include <tommath.h>
 
+static int s_is_power_of_two(mp_digit b, int *p)
+{
+   int x;
+
+   for (x = 1; x < DIGIT_BIT; x++) {
+      if (b == (((mp_digit)1)<<x)) {
+         *p = x;
+         return 1;
+      }
+   }
+   return 0;
+}
+
 /* single digit division (based on routine from MPI) */
 int
 mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
@@ -1732,15 +1753,40 @@ mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
   mp_word w;
   mp_digit t;
   int     res, ix;
-  
+
+  /* cannot divide by zero */
   if (b == 0) {
      return MP_VAL;
   }
-  
+
+  /* quick outs */
+  if (b == 1 || mp_iszero(a) == 1) {
+     if (d != NULL) {
+        *d = 0;
+     }
+     if (c != NULL) {
+        return mp_copy(a, c);
+     }
+     return MP_OKAY;
+  }
+
+  /* power of two ? */
+  if (s_is_power_of_two(b, &ix) == 1) {
+     if (d != NULL) {
+        *d = a->dp[0] & ((1<<ix) - 1);
+     }
+     if (c != NULL) {
+        return mp_div_2d(a, ix, c, NULL);
+     }
+     return MP_OKAY;
+  }
+
+  /* three? */
   if (b == 3) {
      return mp_div_3(a, c, d);
   }
-  
+
+  /* no easy answer [c'est la vie].  Just division */
   if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
      return res;
   }
@@ -2186,7 +2232,6 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
     }
   }
 
-
   /* determine and setup reduction code */
   if (redmode == 0) {
      /* now setup montgomery  */
@@ -3666,7 +3711,7 @@ mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
 
   for (ix = 0; ix < n->used; ix++) {
     /* mu = ai * m' mod b */
-    mu = (x->dp[ix] * rho) & MP_MASK;
+    mu = ((mp_word)x->dp[ix]) * ((mp_word)rho) & MP_MASK;
 
     /* a = a + mu * m * b**i */
     {
@@ -3683,7 +3728,7 @@ mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
       
       /* Multiply and add in place */
       for (iy = 0; iy < n->used; iy++) {
-        r       = ((mp_word) mu) * ((mp_word) * tmpn++) + 
+        r       = ((mp_word)mu) * ((mp_word)*tmpn++) +
                   ((mp_word) u) + ((mp_word) * tmpx);
         u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
         *tmpx++ = (mp_digit)(r & ((mp_word) MP_MASK));
@@ -4039,7 +4084,7 @@ mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
     u = 0;
     for (ix = 0; ix < pa; ix++) {
       /* compute product and carry sum for this term */
-      r = ((mp_word) u) + ((mp_word) * tmpa++) * ((mp_word) b);
+      r = ((mp_word) u) + ((mp_word)*tmpa++) * ((mp_word)b);
 
       /* mask off higher bits to get a single digit */
       *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
@@ -4415,6 +4460,11 @@ mp_prime_fermat (mp_int * a, mp_int * b, int *result)
   /* default to fail */
   *result = 0;
 
+  /* ensure b > 1 */
+  if (mp_cmp_d(b, 1) != MP_GT) {
+     return MP_VAL;
+  }
+
   /* init t */
   if ((err = mp_init (&t)) != MP_OKAY) {
     return err;
@@ -4506,7 +4556,7 @@ mp_prime_is_divisible (mp_int * a, int *result)
 /* performs a variable number of rounds of Miller-Rabin
  *
  * Probability of error after t rounds is no more than
- * (1/4)^t when 1 <= t <= 256
+ * (1/4)^t when 1 <= t <= PRIME_SIZE
  *
  * Sets result to 1 if probably prime, 0 otherwise
  */
@@ -4520,7 +4570,7 @@ mp_prime_is_prime (mp_int * a, int t, int *result)
   *result = 0;
 
   /* valid value of t? */
-  if (t < 1 || t > PRIME_SIZE) {
+  if (t <= 0 || t > PRIME_SIZE) {
     return MP_VAL;
   }
 
@@ -4536,6 +4586,8 @@ mp_prime_is_prime (mp_int * a, int t, int *result)
   if ((err = mp_prime_is_divisible (a, &res)) != MP_OKAY) {
     return err;
   }
+
+  /* return if it was trivially divisible */
   if (res == 1) {
     return MP_OKAY;
   }
@@ -4599,6 +4651,11 @@ mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result)
   /* default */
   *result = 0;
 
+  /* ensure b > 1 */
+  if (mp_cmp_d(b, 1) != MP_GT) {
+     return MP_VAL;
+  }     
+
   /* get n1 = a - 1 */
   if ((err = mp_init_copy (&n1, a)) != MP_OKAY) {
     return err;
@@ -4611,8 +4668,13 @@ mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result)
   if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) {
     goto __N1;
   }
- 
+
+  /* count the number of least significant bits
+   * which are zero
+   */
   s = mp_cnt_lsb(&r);
+
+  /* now divide n - 1 by 2^s */
   if ((err = mp_div_2d (&r, s, &r, NULL)) != MP_OKAY) {
     goto __R;
   }
@@ -4677,40 +4739,152 @@ __N1:mp_clear (&n1);
 
 /* finds the next prime after the number "a" using "t" trials
  * of Miller-Rabin.
+ *
+ * bbs_style = 1 means the prime must be congruent to 3 mod 4
  */
-int mp_prime_next_prime(mp_int *a, int t)
+int mp_prime_next_prime(mp_int *a, int t, int bbs_style)
 {
-   int err, res;
+   int      err, res, x, y;
+   mp_digit res_tab[PRIME_SIZE], step, kstep;
+   mp_int   b;
 
-   if (mp_iseven(a) == 1) {
-      /* force odd */
-      if ((err = mp_add_d(a, 1, a)) != MP_OKAY) {
-         return err;
+   /* ensure t is valid */
+   if (t <= 0 || t > PRIME_SIZE) {
+      return MP_VAL;
+   }
+
+   /* force positive */
+   if (a->sign == MP_NEG) {
+      a->sign = MP_ZPOS;
+   }
+
+   /* simple algo if a is less than the largest prime in the table */
+   if (mp_cmp_d(a, __prime_tab[PRIME_SIZE-1]) == MP_LT) {
+      /* find which prime it is bigger than */
+      for (x = PRIME_SIZE - 2; x >= 0; x--) {
+          if (mp_cmp_d(a, __prime_tab[x]) != MP_LT) {
+             if (bbs_style == 1) {
+                /* ok we found a prime smaller or
+                 * equal [so the next is larger]
+                 *
+                 * however, the prime must be
+                 * congruent to 3 mod 4
+                 */
+                if ((__prime_tab[x + 1] & 3) != 3) {
+                   /* scan upwards for a prime congruent to 3 mod 4 */
+                   for (y = x + 1; y < PRIME_SIZE; y++) {
+                       if ((__prime_tab[y] & 3) == 3) {
+                          mp_set(a, __prime_tab[y]);
+                          return MP_OKAY;
+                       }
+                   }
+                }
+             } else {
+                mp_set(a, __prime_tab[x + 1]);
+                return MP_OKAY;
+             }
+          }
+      }
+      /* at this point a maybe 1 */
+      if (mp_cmp_d(a, 1) == MP_EQ) {
+         mp_set(a, 2);
+         return MP_OKAY;
+      }
+      /* fall through to the sieve */
+   }
+
+   /* generate a prime congruent to 3 mod 4 or 1/3 mod 4? */
+   if (bbs_style == 1) {
+      kstep   = 4;
+   } else {
+      kstep   = 2;
+   }
+
+   /* at this point we will use a combination of a sieve and Miller-Rabin */
+
+   if (bbs_style == 1) {
+      /* if a mod 4 != 3 subtract the correct value to make it so */
+      if ((a->dp[0] & 3) != 3) {
+         if ((err = mp_sub_d(a, (a->dp[0] & 3) + 1, a)) != MP_OKAY) { return err; };
       }
    } else {
-      /* force to next odd number */
-      if ((err = mp_add_d(a, 2, a)) != MP_OKAY) {
+      if (mp_iseven(a) == 1) {
+         /* force odd */
+         if ((err = mp_sub_d(a, 1, a)) != MP_OKAY) {
+            return err;
+         }
+      }
+   }
+
+   /* generate the restable */
+   for (x = 1; x < PRIME_SIZE; x++) {
+      if ((err = mp_mod_d(a, __prime_tab[x], res_tab + x)) != MP_OKAY) {
          return err;
       }
    }
 
+   /* init temp used for Miller-Rabin Testing */
+   if ((err = mp_init(&b)) != MP_OKAY) {
+      return err;
+   }
+
    for (;;) {
+      /* skip to the next non-trivially divisible candidate */
+      step = 0;
+      do {
+         /* y == 1 if any residue was zero [e.g. cannot be prime] */
+         y     =  0;
+
+         /* increase step to next odd */
+         step += kstep;
+
+         /* compute the new residue without using division */
+         for (x = 1; x < PRIME_SIZE; x++) {
+             /* add the step to each residue */
+             res_tab[x] += kstep;
+
+             /* subtract the modulus [instead of using division] */
+             if (res_tab[x] >= __prime_tab[x]) {
+                res_tab[x]  -= __prime_tab[x];
+             }
+
+             /* set flag if zero */
+             if (res_tab[x] == 0) {
+                y = 1;
+             }
+         }
+      } while (y == 1 && step < ((((mp_digit)1)<<DIGIT_BIT) - kstep));
+
+      /* add the step */
+      if ((err = mp_add_d(a, step, a)) != MP_OKAY) {
+         goto __ERR;
+      }
+
+      /* if step == MAX then skip test */
+      if (step >= ((((mp_digit)1)<<DIGIT_BIT) - kstep)) {
+         continue;
+      }
+
       /* is this prime? */
-      if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY) {
-         return err;
+      for (x = 0; x < t; x++) {
+          mp_set(&b, __prime_tab[t]);
+          if ((err = mp_prime_miller_rabin(a, &b, &res)) != MP_OKAY) {
+             goto __ERR;
+          }
+          if (res == 0) {
+             break;
+          }
       }
 
       if (res == 1) {
          break;
       }
-
-      /* add two, next candidate */
-      if ((err = mp_add_d(a, 2, a)) != MP_OKAY) {
-         return err;
-      }
    }
 
-   return MP_OKAY;
+   err = MP_OKAY;
+__ERR:
+   mp_clear(&b);
+   return err;
 }
 
 
@@ -4990,14 +5164,14 @@ mp_read_unsigned_bin (mp_int * a, unsigned char *b, int c)
       return res;
     }
 
-    if (DIGIT_BIT != 7) {
+#ifndef MP_8BIT
       a->dp[0] |= *b++;
       a->used += 1;
-    } else {
+#else
       a->dp[0] = (*b & MP_MASK);
       a->dp[1] |= ((*b++ >> 7U) & 1);
       a->used += 2;
-    }
+#endif
   }
   mp_clamp (a);
   return MP_OKAY;
@@ -5756,11 +5930,11 @@ mp_to_unsigned_bin (mp_int * a, unsigned char *b)
 
   x = 0;
   while (mp_iszero (&t) == 0) {
-    if (DIGIT_BIT != 7) {
+#ifndef MP_8BIT
       b[x++] = (unsigned char) (t.dp[0] & 255);
-    } else {
+#else
       b[x++] = (unsigned char) (t.dp[0] | ((t.dp[1] & 0x01) << 7));
-    }
+#endif
     if ((res = mp_div_2d (&t, 8, &t, NULL)) != MP_OKAY) {
       mp_clear (&t);
       return res;
@@ -6954,7 +7128,7 @@ s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
     for (iy = 0; iy < pb; iy++) {
       /* compute the column as a mp_word */
       r = ((mp_word) *tmpt) + 
-          ((mp_word) tmpx) * ((mp_word) * tmpy++) + 
+          ((mp_word)tmpx) * ((mp_word)*tmpy++) +
           ((mp_word) u);
 
       /* the new column is the lower part of the result */
@@ -7036,7 +7210,7 @@ s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 
     for (iy = digs - ix; iy < pb; iy++) {
       /* calculate the double precision result */
-      r = ((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) + ((mp_word) u);
+      r = ((mp_word) * tmpt) + ((mp_word)tmpx) * ((mp_word)*tmpy++) + ((mp_word) u);
 
       /* get the lower part */
       *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
@@ -7089,8 +7263,8 @@ s_mp_sqr (mp_int * a, mp_int * b)
   for (ix = 0; ix < pa; ix++) {
     /* first calculate the digit at 2*ix */
     /* calculate double precision result */
-    r = ((mp_word) t.dp[2*ix]) + 
-        ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
+    r = ((mp_word) t.dp[2*ix]) +
+        ((mp_word)a->dp[ix])*((mp_word)a->dp[ix]);
 
     /* store lower part in result */
     t.dp[2*ix] = (mp_digit) (r & ((mp_word) MP_MASK));
@@ -7106,12 +7280,12 @@ s_mp_sqr (mp_int * a, mp_int * b)
     
     for (iy = ix + 1; iy < pa; iy++) {
       /* first calculate the product */
-      r = ((mp_word) tmpx) * ((mp_word) a->dp[iy]);
+      r = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
 
       /* now calculate the double precision result, note we use
        * addition instead of *2 since it's easier to optimize
        */
-      r = ((mp_word) * tmpt) + r + r + ((mp_word) u);
+      r = ((mp_word) *tmpt) + r + r + ((mp_word) u);
 
       /* store lower part */
       *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
diff --git a/test.c b/test.c
deleted file mode 100644
index 37f7e9c..0000000
--- a/test.c
+++ /dev/null
@@ -1,46 +0,0 @@
-#include <tommath.h>
-int main(int argc, char ** argv) {
-
-	const unsigned int a = 65537;
-	char b[] = 
-"272621192922230283305477639564135351471136149273956463844361347729298759183125368038593484043149128512765280523210111782526587388894777249539002925324108547349408624093466297893486263619517809026841716115227596170065100354451708345238523975900663359145770823068375223714001310312030819080370340176730626251422070";
-	char radix[1000];
-	mp_int vala, valb, valc;
-
-	if (mp_init(&vala) != MP_OKAY) {
-		fprintf(stderr, "failed to init vala\n");
-		exit(1);
-	}
-
-	if (mp_init(&valb) != MP_OKAY) {
-		fprintf(stderr, "failed to init valb\n");
-		exit(1);
-	}
-
-	if (mp_init(&valc) != MP_OKAY) {
-		fprintf(stderr, "failed to init valc\n");
-		exit(1);
-	}
-	if (mp_set_int(&vala, 65537) != MP_OKAY) {
-		fprintf(stderr, "failed to set vala to 65537\n");
-		exit(1);
-	}
-
-	if (mp_read_radix(&valb, b, 10) != MP_OKAY) {
-		fprintf(stderr, "failed to set valb to %s\n", b);
-		exit(1);
-	}
-
-	if (mp_invmod(&vala, &valb, &valc) != MP_OKAY) {
-		fprintf(stderr, "mp_invmod failed\n");
-		exit(1);
-	}
-
-	if (mp_toradix(&valc, radix, 10) != MP_OKAY) {
-		fprintf(stderr, "failed to convert value to radix 10\n");
-		exit(1);
-	}
-
-	fprintf(stderr, "a = %d\nb = %s\nc = %s\n", a, b, radix);
-	return 0;
-}
\ No newline at end of file
diff --git a/tommath.h b/tommath.h
index fe50906..e5e166b 100644
--- a/tommath.h
+++ b/tommath.h
@@ -85,12 +85,13 @@ extern "C" {
    #define DIGIT_BIT          31
 #else
    #define DIGIT_BIT          28
+   #define MP_28BIT
 #endif   
 #endif
 
 /* otherwise the bits per digit is calculated automatically from the size of a mp_digit */
 #ifndef DIGIT_BIT
-   #define DIGIT_BIT     ((CHAR_BIT * sizeof(mp_digit) - 1))  /* bits per digit */
+   #define DIGIT_BIT     ((int)((CHAR_BIT * sizeof(mp_digit) - 1)))  /* bits per digit */
 #endif
 
 
@@ -400,9 +401,10 @@ int mp_prime_is_prime(mp_int *a, int t, int *result);
 
 /* finds the next prime after the number "a" using "t" trials
  * of Miller-Rabin.
+ *
+ * bbs_style = 1 means the prime must be congruent to 3 mod 4
  */
-int mp_prime_next_prime(mp_int *a, int t);
-
+int mp_prime_next_prime(mp_int *a, int t, int bbs_style);
 
 /* ---> radix conversion <--- */
 int mp_count_bits(mp_int *a);