diff --git a/bn.c b/bn.c
index 33d027f..040ff1c 100644
--- a/bn.c
+++ b/bn.c
@@ -796,7 +796,7 @@ int mp_mul_2(mp_int *a, mp_int *b)
       DECFUNC();
       return res;
    }
-   b->used = b->alloc;
+   ++b->used;
    
    /* shift any bit count < DIGIT_BIT */
    r = 0;
@@ -1017,7 +1017,6 @@ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
            */
           _W   = W + ix;
           
-          
           /* the number of digits is limited by their placement.  E.g. 
              we avoid multiplying digits that will end up above the # of
              digits of precision requested
@@ -2840,11 +2839,404 @@ int mp_reduce(mp_int *x, mp_int *m, mp_int *mu)
   return res;
 }
 
+/* setups the montgomery reduction stuff */
+int mp_montgomery_setup(mp_int *a, mp_digit *mp)
+{
+   mp_int t, tt;
+   int res;
+   
+   if ((res = mp_init(&t)) != MP_OKAY) {
+      return res;
+   }
+   
+   if ((res = mp_init(&tt)) != MP_OKAY) {
+      goto __T;
+   }
+   
+   /* tt = b */
+   tt.dp[0] = 0;
+   tt.dp[1] = 1;
+   tt.used  = 2;
+
+   /* t = m mod b */
+   t.dp[0] = a->dp[0];
+   t.used  = 1;
+
+   /* t = 1/m mod b */
+   if ((res = mp_invmod(&t, &tt, &t)) != MP_OKAY) {
+      goto __TT;
+   }
+   
+   /* t = -1/m mod b */
+   *mp = ((mp_digit)1 << ((mp_digit)DIGIT_BIT)) - t.dp[0];
+   
+   res = MP_OKAY;
+__TT: mp_clear(&tt);   
+__T:  mp_clear(&t);
+   return res;
+}   
+
+
+/* computes xR^-1 == x (mod N) via Montgomery Reduction (comba) */
+static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
+{
+   int ix, res, olduse;
+   mp_digit ui;
+   mp_word  W[512];
+   
+   /* get old used count */
+   olduse = a->used;
+   
+   /* grow a as required */
+   if (a->alloc < m->used*2+1) {
+      if ((res = mp_grow(a, m->used*2+1)) != MP_OKAY) {
+         return res;
+      }
+   }
+   
+   /* copy and clear */
+   for (ix = 0; ix < a->used; ix++) {
+       W[ix] = a->dp[ix];
+   }
+   for (; ix < m->used * 2 + 1; ix++) {
+       W[ix] = 0;
+   }
+   
+   for (ix = 0; ix < m->used; ix++) {
+       /* ui = ai * m' mod b */
+       ui = (((mp_digit)(W[ix] & MP_MASK)) * mp) & MP_MASK;
+       
+       /* a = a + ui * m * b^i */
+       { 
+          register int      iy;
+          register mp_digit *tmpx;
+          register mp_word  *_W;
+          
+          /* aliases */
+          tmpx = m->dp;
+          _W   = W + ix;
+          
+          for (iy = 0; iy < m->used; iy++) {
+              *_W++        += ((mp_word)ui) * ((mp_word)*tmpx++);
+          }
+          
+          /* now fix carry for W[ix+1] */
+          W[ix+1] += W[ix] >> ((mp_word)DIGIT_BIT);
+          W[ix]   &= ((mp_word)MP_MASK);
+       }
+   }
+   
+   /* nox fix rest of carries */
+   for (; ix <= m->used * 2 + 1; ix++) {
+       W[ix]   += (W[ix-1] >> ((mp_word)DIGIT_BIT));
+       W[ix-1] &= ((mp_word)MP_MASK);
+   }
+   
+   /* copy out */
+
+   /* A = A/b^n */
+   for (ix = 0; ix < m->used + 1; ix++) { 
+       a->dp[ix] = W[ix+m->used];
+   }
+   
+   a->used = m->used + 1;
+
+   /* zero oldused digits */  
+   for (; ix < olduse; ix++) {
+       a->dp[ix] = 0;
+   }
+
+   mp_clamp(a);
+   
+   /* if A >= m then A = A - m */
+   if (mp_cmp_mag(a, m) != MP_LT) {
+      if ((res = s_mp_sub(a, m, a)) != MP_OKAY) {
+         return res;
+      }
+   }
+   
+   return MP_OKAY;
+}
+
+/* computes xR^-1 == x (mod N) via Montgomery Reduction */
+int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
+{
+   int ix, res, digs;
+   mp_digit ui;
+   
+   REGFUNC("mp_montgomery_reduce");
+   VERIFY(a);
+   VERIFY(m);
+   
+   digs = m->used * 2 + 1;
+   if ((digs < 512) && digs < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT)))) {
+      res = fast_mp_montgomery_reduce(a, m, mp);
+      DECFUNC();
+      return res;
+   }  
+
+   if (a->alloc < m->used*2+1) {
+      if ((res = mp_grow(a, m->used*2+1)) != MP_OKAY) {
+         DECFUNC();
+         return res;
+      }
+   }
+   a->used = m->used * 2 + 1;
+      
+   for (ix = 0; ix < m->used; ix++) {
+       /* ui = ai * m' mod b */
+       ui = (a->dp[ix] * mp) & MP_MASK;
+       
+       /* a = a + ui * m * b^i */
+       { 
+          register int      iy;
+          register mp_digit *tmpx, *tmpy, mu;
+          register mp_word  r;
+          
+          /* aliases */
+          tmpx = m->dp;
+          tmpy = a->dp + ix;
+          
+          mu  = 0;
+          for (iy = 0; iy < m->used; iy++) {
+              r             = ((mp_word)ui) * ((mp_word)*tmpx++) + ((mp_word)mu) + ((mp_word)*tmpy);
+              mu            = (r >> ((mp_word)DIGIT_BIT));
+              *tmpy++       = (r & ((mp_word)MP_MASK));
+          }
+          /* propagate carries */
+          while (mu) {
+             *tmpy            += mu;
+             mu                = (*tmpy>>DIGIT_BIT)&1;
+             *tmpy++          &= MP_MASK;
+          }
+       }
+   }
+   
+   /* A = A/b^n */
+   mp_rshd(a, m->used);
+   
+   /* if A >= m then A = A - m */
+   if (mp_cmp_mag(a, m) != MP_LT) {
+      if ((res = s_mp_sub(a, m, a)) != MP_OKAY) {
+         DECFUNC();
+         return res;
+      }
+   }
+   
+   DECFUNC();
+   return MP_OKAY;
+}
+
 /* computes Y == G^X mod P, HAC pp.616, Algorithm 14.85 
  *
  * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
  * The value of k changes based on the size of the exponent.
+ *
+ * Uses Montgomery reduction 
  */
+static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
+{
+   mp_int M[64], res;
+   mp_digit buf, mp;
+   int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+   
+   REGFUNC("mp_exptmod_fast");
+   VERIFY(G);
+   VERIFY(X);
+   VERIFY(P);
+   VERIFY(Y);
+   
+   /* find window size */
+   x = mp_count_bits(X);
+        if (x <= 18)    { winsize = 2; }
+   else if (x <= 84)    { winsize = 3; }
+   else if (x <= 300)   { winsize = 4; }
+   else if (x <= 930)   { winsize = 5; }
+   else                 { winsize = 6; }
+   
+   /* init G array */
+   for (x = 0; x < (1<<winsize); x++) {
+      if ((err = mp_init_size(&M[x], 1)) != MP_OKAY) {
+         for (y = 0; y < x; y++) {
+            mp_clear(&M[y]);
+         }
+         DECFUNC();
+         return err;
+      }
+   }
+   
+   /* now setup montgomery  */
+   if ((err = mp_montgomery_setup(P, &mp)) != MP_OKAY) {
+      goto __M;
+   }
+   
+   /* setup result */
+   if ((err = mp_init(&res)) != MP_OKAY) {
+      goto __M;
+   }
+
+   /* now we need R mod m */
+   mp_set(&res, 1);           
+   if ((err = mp_lshd(&res, P->used)) != MP_OKAY) {
+      goto __RES;
+   }
+   
+   /* res = R mod m */
+   if ((err = mp_mod(&res, P, &res)) != MP_OKAY) {
+      goto __RES;
+   }   
+   
+   /* create M table 
+    *
+    * The M table contains powers of the input base, e.g. M[x] = G^x mod P
+    *
+    * The first half of the table is not computed though accept for M[0] and M[1]
+    */
+   mp_set(&M[0], 1);
+   if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
+      goto __RES;
+   }
+   
+   /* now set M[1] to G * R mod m */
+   if ((err = mp_mulmod(&M[1], &res, P, &M[1])) != MP_OKAY) {
+      goto __RES;
+   }
+   
+   /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
+   if ((err = mp_copy(&M[1], &M[1<<(winsize-1)])) != MP_OKAY) {
+      goto __RES;
+   }
+   
+   for (x = 0; x < (winsize-1); x++) {
+       if ((err = mp_sqr(&M[1<<(winsize-1)], &M[1<<(winsize-1)])) != MP_OKAY) {
+          goto __RES;
+       }
+       if ((err = mp_montgomery_reduce(&M[1<<(winsize-1)], P, mp)) != MP_OKAY) {
+          goto __RES;
+       }
+   }  
+   
+   /* create upper table */
+   for (x = (1<<(winsize-1))+1; x < (1 << winsize); x++) {
+       if ((err = mp_mul(&M[x-1], &M[1], &M[x])) != MP_OKAY) {
+          goto __RES;
+       }
+       if ((err = mp_montgomery_reduce(&M[x], P, mp)) != MP_OKAY) {
+          goto __RES;
+       }
+   }
+
+   /* set initial mode and bit cnt */
+   mode   = 0;
+   bitcnt = 0;
+   buf    = 0;
+   digidx = X->used - 1;
+   bitcpy = bitbuf = 0;
+  
+   bitcnt = 1;
+   for (;;) {
+      /* grab next digit as required */
+      if (--bitcnt == 0) {
+         if (digidx == -1) {
+            break;
+         }
+         buf = X->dp[digidx--];
+         bitcnt = (int)DIGIT_BIT;
+      }
+      
+      /* grab the next msb from the exponent */
+      y = (buf >> (DIGIT_BIT - 1)) & 1;
+      buf <<= 1;
+    
+      /* if the bit is zero and mode == 0 then we ignore it 
+       * These represent the leading zero bits before the first 1 bit
+       * in the exponent.  Technically this opt is not required but it 
+       * does lower the # of trivial squaring/reductions used
+       */
+      if (mode == 0 && y == 0) continue;
+      
+      /* if the bit is zero and mode == 1 then we square */
+      if (y == 0 && mode == 1) {
+         if ((err = mp_sqr(&res, &res)) != MP_OKAY) {
+            goto __RES;
+         }
+         if ((err = mp_montgomery_reduce(&res, P, mp)) != MP_OKAY) {
+            goto __RES;
+         }
+         continue;
+      }
+      
+      /* else we add it to the window */
+      bitbuf  |= (y<<(winsize-++bitcpy));
+      mode     = 2;
+      
+      if (bitcpy == winsize) {
+         /* ok window is filled so square as required and multiply multiply */
+         /* square first */
+         for (x = 0; x < winsize; x++) {
+            if ((err = mp_sqr(&res, &res)) != MP_OKAY) {
+               goto __RES;
+            }
+            if ((err = mp_montgomery_reduce(&res, P, mp)) != MP_OKAY) {
+               goto __RES;
+            }
+         }
+         
+         /* then multiply */
+         if ((err = mp_mul(&res, &M[bitbuf], &res)) != MP_OKAY) {
+            goto __RES;
+         }
+         if ((err = mp_montgomery_reduce(&res, P, mp)) != MP_OKAY) {
+            goto __RES;
+         }
+         
+         /* empty window and reset */
+         bitcpy = bitbuf = 0;
+         mode   = 1;
+      }
+   }
+   
+   /* if bits remain then square/multiply */
+   if (mode == 2 && bitcpy > 0) {
+      /* square then multiply if the bit is set */
+      for (x = 0; x < bitcpy; x++) {
+         if ((err = mp_sqr(&res, &res)) != MP_OKAY) {
+            goto __RES;
+         }
+         if ((err = mp_montgomery_reduce(&res, P, mp)) != MP_OKAY) {
+            goto __RES;
+         }
+         
+         bitbuf <<= 1;
+         if ((bitbuf & (1<<winsize)) != 0) {
+            /* then multiply */
+            if ((err = mp_mul(&res, &M[1], &res)) != MP_OKAY) {
+               goto __RES;
+            }
+            if ((err = mp_montgomery_reduce(&res, P, mp)) != MP_OKAY) {
+               goto __RES;
+            }
+         }
+      }
+   }
+   
+   /* fixup result */
+   if ((err = mp_montgomery_reduce(&res, P, mp)) != MP_OKAY) {
+      goto __RES;
+   } 
+   
+   mp_exch(&res, Y);
+   err = MP_OKAY;
+__RES: mp_clear(&res);
+__M  :
+   for (x = 0; x < (1<<winsize); x++) {
+      mp_clear(&M[x]);
+   }
+   DECFUNC();
+   return err;
+}
+
+
 int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
 {
    mp_int M[64], res, mu;
@@ -2857,6 +3249,13 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
    VERIFY(P);
    VERIFY(Y);
    
+   /* if the modulus is odd use the fast method */
+   if (mp_isodd(P) == 1 && P->used > 4 && P->used < MONTGOMERY_EXPT_CUTOFF) {
+      err = mp_exptmod_fast(G, X, P, Y);
+      DECFUNC();
+      return err;
+   }   
+
    /* find window size */
    x = mp_count_bits(X);
         if (x <= 18)    { winsize = 2; }
@@ -2918,7 +3317,8 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
           goto __MU;
        }
    }
-   /* init result */
+   
+   /* setup result */
    if ((err = mp_init(&res)) != MP_OKAY) {
       goto __MU;
    }
@@ -3009,10 +3409,10 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
          if ((bitbuf & (1<<winsize)) != 0) {
             /* then multiply */
             if ((err = mp_mul(&res, &M[1], &res)) != MP_OKAY) {
-               goto __MU;
+               goto __RES;
             }
             if ((err = mp_reduce(&res, P, &mu)) != MP_OKAY) {
-               goto __MU;
+               goto __RES;
             }
          }
       }
@@ -3060,10 +3460,8 @@ int mp_n_root(mp_int *a, mp_digit b, mp_int *c)
    neg     = a->sign;
    a->sign = MP_ZPOS;
 
-   /* t2 = a */
-   if ((res = mp_copy(a, &t2)) != MP_OKAY) {
-      goto __T3;
-   }
+   /* t2 = 2 */
+   mp_set(&t2, 2);
   
    do {
       /* t1 = t2 */
@@ -3098,16 +3496,20 @@ int mp_n_root(mp_int *a, mp_digit b, mp_int *c)
       }
    } while (mp_cmp(&t1, &t2) != MP_EQ);
    
-   /* result can be at most off by one so check */
-   if ((res = mp_expt_d(&t1, b, &t2)) != MP_OKAY) {
-      goto __T3;
-   }
-   
-   if (mp_cmp(&t2, a) == MP_GT) {
-      if ((res = mp_sub_d(&t1, 1, &t1)) != MP_OKAY) {
+   /* result can be off by a few so check */
+   for (;;) {
+      if ((res = mp_expt_d(&t1, b, &t2)) != MP_OKAY) {
          goto __T3;
       }
-   }
+   
+      if (mp_cmp(&t2, a) == MP_GT) {
+         if ((res = mp_sub_d(&t1, 1, &t1)) != MP_OKAY) {
+            goto __T3;
+         }
+      } else {
+         break;
+      }
+   }      
    
    /* reset the sign of a first */
    a->sign = neg;
@@ -3336,13 +3738,14 @@ int mp_to_signed_bin(mp_int *a, unsigned char *b)
 /* get the size for an unsigned equivalent */
 int mp_unsigned_bin_size(mp_int *a)
 {
-   return (mp_count_bits(a)/8 + ((mp_count_bits(a)&7) != 0 ? 1 : 0));
+   int size = mp_count_bits(a);
+   return (size/8 + ((size&7) != 0 ? 1 : 0));
 }
 
 /* get the size for an signed equivalent */
 int mp_signed_bin_size(mp_int *a)
 {
-   return 1 + (mp_count_bits(a)/8 + ((mp_count_bits(a)&7) != 0 ? 1 : 0));
+   return 1 + mp_unsigned_bin_size(a);
 }
 
 /* read a string [ASCII] in a given radix */
@@ -3431,8 +3834,11 @@ int mp_radix_size(mp_int *a, int radix)
    mp_int t;
    mp_digit d;
    
-   digs = 0;
-
+   /* special case for binary */
+   if (radix == 2) {
+      return mp_count_bits(a) + (a->sign == MP_NEG ? 1 : 0) + 1;
+   }
+      
    if (radix < 2 || radix > 64) {
       return 0;
    }
@@ -3441,6 +3847,7 @@ int mp_radix_size(mp_int *a, int radix)
       return 0;
    }
    
+   digs = 0;
    if (t.sign == MP_NEG) { 
       ++digs;
       t.sign = MP_ZPOS;
diff --git a/bn.h b/bn.h
index 903e7d6..6e7bc85 100644
--- a/bn.h
+++ b/bn.h
@@ -84,6 +84,7 @@ typedef int           mp_err;
 /* you'll have to tune these... */
 #define KARATSUBA_MUL_CUTOFF    80      /* Min. number of digits before Karatsuba multiplication is used. */
 #define KARATSUBA_SQR_CUTOFF    80      /* Min. number of digits before Karatsuba squaring is used. */
+#define MONTGOMERY_EXPT_CUTOFF  40      /* max. number of digits that montgomery reductions will help for */
 
 #define MP_PREC                 64      /* default digits of precision */
 
@@ -114,7 +115,7 @@ int mp_shrink(mp_int *a);
 
 #define mp_iszero(a) (((a)->used == 0) ? 1 : 0)
 #define mp_iseven(a) (((a)->used == 0 || (((a)->dp[0] & 1) == 0)) ? 1 : 0)
-#define mp_isodd(a)  (((a)->used > 0 || (((a)->dp[0] & 1) == 1)) ? 1 : 0)
+#define mp_isodd(a)  (((a)->used > 0 && (((a)->dp[0] & 1) == 1)) ? 1 : 0)
 
 /* set to zero */
 void mp_zero(mp_int *a);
@@ -256,6 +257,12 @@ int mp_reduce_setup(mp_int *a, mp_int *b);
  * compute the reduction as -1 * mp_reduce(mp_abs(a)) [pseudo code].
  */
 int mp_reduce(mp_int *a, mp_int *b, mp_int *c);
+
+/* setups the montgomery reduction */
+int mp_montgomery_setup(mp_int *a, mp_digit *mp);
+
+/* computes xR^-1 == x (mod N) via Montgomery Reduction */
+int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp);
  
 /* d = a^b (mod c) */
 int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
diff --git a/bn.pdf b/bn.pdf
index 3bea8f2..f9c86f2 100644
Binary files a/bn.pdf and b/bn.pdf differ
diff --git a/bn.tex b/bn.tex
index ed2a46d..d2aab27 100644
--- a/bn.tex
+++ b/bn.tex
@@ -1,7 +1,7 @@
 \documentclass{article}
 \begin{document}
 
-\title{LibTomMath v0.09 \\ A Free Multiple Precision Integer Library}
+\title{LibTomMath v0.10 \\ A Free Multiple Precision Integer Library}
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 \newpage
@@ -279,6 +279,22 @@ int mp_n_root(mp_int *a, mp_digit b, mp_int *c);
 /* computes the jacobi c = (a | n) (or Legendre if b is prime)  */
 int mp_jacobi(mp_int *a, mp_int *n, int *c);
 
+/* used to setup the Barrett reduction for a given modulus b */
+int mp_reduce_setup(mp_int *a, mp_int *b);
+
+/* Barrett Reduction, computes a (mod b) with a precomputed value c  
+ *
+ * Assumes that 0 < a <= b^2, note if 0 > a > -(b^2) then you can merely
+ * compute the reduction as -1 * mp_reduce(mp_abs(a)) [pseudo code].
+ */
+int mp_reduce(mp_int *a, mp_int *b, mp_int *c);
+
+/* setups the montgomery reduction */
+int mp_montgomery_setup(mp_int *a, mp_digit *mp);
+
+/* computes xR^-1 == x (mod N) via Montgomery Reduction */
+int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp);
+
 /* d = a^b (mod c) */
 int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
 \end{verbatim}
@@ -451,21 +467,38 @@ by $c$.  Since the result of the Jacobi function $\left ( {a \over n} \right ) \
 natural to store the result in a simple C style \textbf{int}.  If $n$ is prime then the Jacobi function produces
 the same results as the Legendre function\footnote{Source: Handbook of Applied Cryptography, pp. 73}.  This means if
 $n$ is prime then $\left ( {a \over n} \right )$ is equal to $1$ if $a$ is a quadratic residue modulo $n$ or $-1$ if 
-it is not.  
+it is not.
 
 \subsubsection{mp\_exptmod(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)}
 Computes $d = a^b \mbox{ (mod }c\mbox{)}$ using a sliding window $k$-ary exponentiation algorithm.  For an $\alpha$-bit
 exponent it performs $\alpha$ squarings and at most $\lfloor \alpha/k \rfloor + k$ multiplications.  The value of $k$ is
-chosen to minimize the number of multiplications required for a given value of $\alpha$.  Barrett reductions are used
-to reduce the squared or multiplied temporary results modulo $c$.  A Barrett reduction requires one division that is
-performed only and two half multipliers of $N$ digit numbers resulting in approximation $O((N^2)/2)$ work.  
+chosen to minimize the number of multiplications required for a given value of $\alpha$.  Barrett or Montgomery 
+reductions are used to reduce the squared or multiplied temporary results modulo $c$.
 
-Let $\gamma = \lfloor \alpha/k \rfloor + k$ represent the total multiplications.  The total work of a exponentiation is
-therefore, $O(3 \cdot N^2 + (\alpha + \gamma) \cdot ((N^2)/2) + \alpha \cdot ((N^2 + N)/2) + \gamma \cdot N^2)$ which 
-simplies to $O(3 \cdot N^2 + \gamma N^2 + \alpha (N^2 + (N/2)))$.  The bulk of the time is spent in the Barrett 
-reductions and the squaring algorithms.  Since $\gamma < \alpha$ it makes sense to optimize first the Barrett and
-squaring routines first.  Significant improvements in the future will most likely stem from optimizing these instead
-of optimizing the multipliers.
+\subsection{Fast Modular Reductions}
+
+\subsubsection{mp\_reduce(mp\_int *a, mp\_int *b, mp\_int *c)}
+Computes a Barrett reduction in-place of $a$ modulo $b$ with respect to $c$.  In essence it computes 
+$a \equiv a \mbox{ (mod }b\mbox{)}$ provided $0 \le a \le b^2$.  The value of $c$ is precomputed with the 
+function mp\_reduce\_setup().
+
+The Barrett reduction function has been optimized to use partial multipliers which means compared to MPI it performs
+have the number of single precision multipliers (\textit{provided they have the same size digits}).  The partial
+multipliers (\textit{one of which is shared with mp\_mul}) both have baseline and comba variants.  Barrett reduction 
+can reduce a number modulo a $n-$digit modulus with approximately $2n^2$ single precision multiplications.  
+
+\subsubsection{mp\_montgomery\_reduce(mp\_int *a, mp\_int *m, mp\_digit mp)}
+Computes a Montgomery reduction in-place of $a$ modulo $b$ with respect to $mp$.  If $b$ is some $n-$digit modulus then
+$R = \beta^{n+1}$.  The result of this function is $aR^{-1} \mbox{ (mod }b\mbox{)}$ provided that $0 \le a \le b^2$.
+The value of $mp$ is precomputed with the function mp\_montgomery\_setup().
+
+The Montgomery reduction comes in two variants.  A standard baseline and a fast comba method.  The baseline routine
+is in fact slower than the Barrett reductions, however, the comba routine is much faster.  Montomgery reduction can 
+reduce a number modulo a $n-$digit modulus with approximately $n^2 + n$ single precision multiplications.  
+
+Note that the final result of a Montgomery reduction is not just the value reduced modulo $b$.  You have to multiply
+by $R$ modulo $b$ to get the real result.  At first that may not seem like such a worthwhile routine, however, the
+exptmod function can be made to take advantage of this such that only one normalization at the end is required.  
 
 \section{Timing Analysis}
 \subsection{Observed Timings}
@@ -503,9 +536,9 @@ Square & 1024 & 18,991  & 5,733     \\
 Square & 2048 & 72,126  & 17,621    \\
 Square & 4096 & 306,269  & 67,576   \\
 \hline 
-Exptmod & 512 & 32,021,586  & 4,138,354 \\
-Exptmod & 768 & 97,595,492  & 9,840,233 \\
-Exptmod & 1024 & 223,302,532  & 20,624,553     \\
+Exptmod & 512 & 32,021,586  & 3,118,435 \\
+Exptmod & 768 & 97,595,492  & 8,493,633 \\
+Exptmod & 1024 & 223,302,532  & 17,715,899     \\
 Exptmod & 2048 & 1,682,223,369   & 114,936,361      \\
 Exptmod & 2560 & 3,268,615,571   & 229,402,426       \\
 Exptmod & 3072 & 5,597,240,141   & 367,403,840      \\
diff --git a/changes.txt b/changes.txt
index 2d84db9..d302ee6 100644
--- a/changes.txt
+++ b/changes.txt
@@ -1,3 +1,8 @@
+Jan 9th, 2003
+v0.10  -- Pekka Riikonen suggested fixes to the radix conversion code.  
+       -- Added baseline montgomery and comba montgomery reductions, sped up exptmods
+          [to a point, see bn.h for MONTGOMERY_EXPT_CUTOFF]
+       
 Jan 6th, 2003
 v0.09  -- Updated the manual to reflect recent changes.  :-)
        -- Added Jacobi function (mp_jacobi) to supplement the number theory side of the lib
diff --git a/demo.c b/demo.c
index f671758..ab92707 100644
--- a/demo.c
+++ b/demo.c
@@ -21,12 +21,15 @@
 #define TIMER
 extern ulong64 rdtsc(void);
 extern void reset(void);
-#else 
+#endif
 
+#ifdef TIMER
+#ifndef TIMER_X86
 ulong64 _tt;
 void reset(void) { _tt = clock(); }
 ulong64 rdtsc(void) { return clock() - _tt; }
 #endif
+#endif
 
 #ifndef DEBUG
 int _ifuncs;
@@ -82,6 +85,7 @@ int main(void)
    mp_int a, b, c, d, e, f;
    unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n;
    int rr;
+   mp_digit tom;
    
 #ifdef TIMER
    int n;
@@ -94,29 +98,38 @@ int main(void)
    mp_init(&d);
    mp_init(&e);
    mp_init(&f);
- 
+   
+   mp_read_radix(&a, "59994534535345535344389423", 10);
+   mp_read_radix(&b, "49993453555234234565675534", 10);
+   mp_read_radix(&c, "62398923474472948723847281", 10);
+    
+   mp_mulmod(&a, &b, &c, &f);
+   
+   /* setup mont */
+   mp_montgomery_setup(&c, &tom);
+   mp_mul(&a, &b, &a);
+   mp_montgomery_reduce(&a, &c, tom);
+   mp_montgomery_reduce(&a, &c, tom);
+   mp_lshd(&a, c.used*2);
+   mp_mod(&a, &c, &a);
+   
+   mp_toradix(&a, cmd, 10);
+   printf("%s\n\n", cmd);
+   mp_toradix(&f, cmd, 10);
+   printf("%s\n", cmd);
+   
+/*   return 0; */
+   
+   
    mp_read_radix(&a, "V//////////////////////////////////////////////////////////////////////////////////////", 64);
    mp_reduce_setup(&b, &a);
    printf("\n\n----\n\n");
    mp_toradix(&b, buf, 10);
    printf("b == %s\n\n\n", buf);
-   
+
    mp_read_radix(&b, "4982748972349724892742", 10);
    mp_sub_d(&a, 1, &c);
-   
-#ifdef DEBUG
-   mp_sqr(&a, &a);mp_sqr(&c, &c);
-   mp_sqr(&a, &a);mp_sqr(&c, &c);
-   mp_sqr(&a, &a);mp_sqr(&c, &c);
-   reset_timings();
-#endif   
    mp_exptmod(&b, &c, &a, &d);
-#ifdef DEBUG   
-   dump_timings();
-   return 0;
-
-#endif   
-   
    mp_toradix(&d, buf, 10);
    printf("b^p-1 == %s\n", buf);
    
@@ -169,7 +182,7 @@ int main(void)
       printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100000));
       mp_copy(&b, &a);
    }
-   
+
    {
       char *primes[] = {
          "17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
diff --git a/etc/pprime.c b/etc/pprime.c
index fb987e3..4943088 100644
--- a/etc/pprime.c
+++ b/etc/pprime.c
@@ -85,10 +85,11 @@ static mp_digit prime_digit()
 }
 
 /* makes a prime of at least k bits */
-int pprime(int k, mp_int *p, mp_int *q)
+int pprime(int k, int li, mp_int *p, mp_int *q)
 {
    mp_int a, b, c, n, x, y, z, v;
-   int res;
+   int res, ii;
+   static const mp_digit bases[] = { 2, 3, 5, 7, 11, 13, 17, 19 };
    
    /* single digit ? */
    if (k <= (int)DIGIT_BIT) {
@@ -167,55 +168,60 @@ int pprime(int k, mp_int *p, mp_int *q)
       
       if (mp_cmp_d(&y, 1) != MP_EQ) goto top;
       
-      /* now try base x=2  */
-      mp_set(&x, 2);
+      /* now try base x=bases[ii]  */
+      for (ii = 0; ii < li; ii++) {
+         mp_set(&x, bases[ii]);
        
-      /* compute x^a mod n */
-      if ((res = mp_exptmod(&x, &a, &n, &y)) != MP_OKAY) {             /* y = x^a mod n */
-         goto __Z;
-      }
+         /* compute x^a mod n */
+         if ((res = mp_exptmod(&x, &a, &n, &y)) != MP_OKAY) {             /* y = x^a mod n */
+            goto __Z;
+         }
       
-      /* if y == 1 loop */
-      if (mp_cmp_d(&y, 1) == MP_EQ) goto top;
+         /* if y == 1 loop */
+         if (mp_cmp_d(&y, 1) == MP_EQ) continue;
    
-      /* now x^2a mod n */
-      if ((res = mp_sqrmod(&y, &n, &y)) != MP_OKAY) {                  /* y = x^2a mod n */
-         goto __Z;
-      }
+         /* now x^2a mod n */
+         if ((res = mp_sqrmod(&y, &n, &y)) != MP_OKAY) {                  /* y = x^2a mod n */
+           goto __Z;
+         }
    
-      if (mp_cmp_d(&y, 1) == MP_EQ) goto top;
+         if (mp_cmp_d(&y, 1) == MP_EQ) continue;
    
-      /* compute x^b mod n */
-      if ((res = mp_exptmod(&x, &b, &n, &y)) != MP_OKAY) {             /* y = x^b mod n */
-          goto __Z;
-      }
+         /* compute x^b mod n */
+         if ((res = mp_exptmod(&x, &b, &n, &y)) != MP_OKAY) {             /* y = x^b mod n */
+            goto __Z;
+         }
       
-      /* if y == 1 loop */
-      if (mp_cmp_d(&y, 1) == MP_EQ) goto top;
+         /* if y == 1 loop */
+         if (mp_cmp_d(&y, 1) == MP_EQ) continue;
    
-      /* now x^2b mod n */
-      if ((res = mp_sqrmod(&y, &n, &y)) != MP_OKAY) {                  /* y = x^2b mod n */
-         goto __Z;
-      }
+         /* now x^2b mod n */
+         if ((res = mp_sqrmod(&y, &n, &y)) != MP_OKAY) {                  /* y = x^2b mod n */
+            goto __Z;
+         }
    
-      if (mp_cmp_d(&y, 1) == MP_EQ) goto top;
+         if (mp_cmp_d(&y, 1) == MP_EQ) continue;
 
+         /* compute x^c mod n == x^ab mod n */
+         if ((res = mp_exptmod(&x, &c, &n, &y)) != MP_OKAY) {             /* y = x^ab mod n */
+             goto __Z;
+         }
       
-      /* compute x^c mod n == x^ab mod n */
-      if ((res = mp_exptmod(&x, &c, &n, &y)) != MP_OKAY) {             /* y = x^ab mod n */
-          goto __Z;
+         /* if y == 1 loop */
+         if (mp_cmp_d(&y, 1) == MP_EQ) continue;
+   
+         /* now compute (x^c mod n)^2 */
+         if ((res = mp_sqrmod(&y, &n, &y)) != MP_OKAY) {                  /* y = x^2ab mod n */
+            goto __Z;
+         }
+   
+         /* y should be 1 */
+         if (mp_cmp_d(&y, 1) != MP_EQ) continue;
+         break;
       }
       
-      /* if y == 1 loop */
-      if (mp_cmp_d(&y, 1) == MP_EQ) goto top;
-   
-      /* now compute (x^c mod n)^2 */
-      if ((res = mp_sqrmod(&y, &n, &y)) != MP_OKAY) {                  /* y = x^2ab mod n */
-         goto __Z;
-      }
-   
-      /* y should be 1 */
-      if (mp_cmp_d(&y, 1) != MP_EQ) goto top;
+      /* no bases worked? */
+      if (ii == li) goto top;
 
 /*
 {
@@ -232,10 +238,14 @@ int pprime(int k, mp_int *p, mp_int *q)
 */
      /* a = n */
      mp_copy(&n, &a);
-  }     
+  }
+  
+  /* get q to be the order of the large prime subgroup */
+  mp_sub_d(&n, 1, q);
+  mp_div_2(q, q);
+  mp_div(q, &b, q, NULL);
 
   mp_exch(&n, p);
-  mp_exch(&b, q);
    
    res = MP_OKAY;
 __Z: mp_clear(&z); 
@@ -254,19 +264,25 @@ int main(void)
 {
    mp_int p, q;
    char buf[4096];
-   int k;
+   int k, li;
    clock_t t1;
       
    srand(time(NULL));
    
    printf("Enter # of bits: \n");
-   scanf("%d", &k);
+   fgets(buf, sizeof(buf), stdin);
+   sscanf(buf, "%d", &k);
+   
+   printf("Enter number of bases to try (1 to 8):\n");
+   fgets(buf, sizeof(buf), stdin);
+   sscanf(buf, "%d", &li);
+   
    
    mp_init(&p);
    mp_init(&q);
    
    t1 = clock();   
-   pprime(k, &p, &q);
+   pprime(k, li, &p, &q);
    t1 = clock() - t1;
    
    printf("\n\nTook %ld ticks, %d bits\n", t1, mp_count_bits(&p));
diff --git a/makefile b/makefile
index d448933..edaf773 100644
--- a/makefile
+++ b/makefile
@@ -1,7 +1,7 @@
 CC = gcc
 CFLAGS  +=  -Wall -W -Wshadow -ansi -O3 -fomit-frame-pointer -funroll-loops 
 
-VERSION=0.09
+VERSION=0.10
 
 default: test
 
diff --git a/poly.c b/poly.c
new file mode 100644
index 0000000..0c85897
--- /dev/null
+++ b/poly.c
@@ -0,0 +1,302 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision 
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * This file "poly.c" provides GF(p^k) functionality on top of the 
+ * libtommath library.
+ * 
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with 
+ * additional optimizations in place.  
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include "poly.h"
+
+#undef MIN
+#define MIN(x,y) ((x)<(y)?(x):(y))
+#undef MAX
+#define MAX(x,y) ((x)>(y)?(x):(y))
+
+static void s_free(mp_poly *a)
+{
+   int k;
+   for (k = 0; k < a->alloc; k++) {
+       mp_clear(&(a->co[k]));
+   }
+}
+
+static int s_setup(mp_poly *a, int low, int high)
+{
+   int res, k, j;
+   for (k = low; k < high; k++) {
+       if ((res = mp_init(&(a->co[k]))) != MP_OKAY) {
+          for (j = low; j < k; j++) {
+             mp_clear(&(a->co[j]));
+          }
+          return MP_MEM;
+       }
+   }
+   return MP_OKAY;
+}   
+
+int mp_poly_init(mp_poly *a, mp_int *cha)
+{
+   return mp_poly_init_size(a, cha, MP_POLY_PREC);
+}
+
+/* init a poly of a given (size) degree */
+int mp_poly_init_size(mp_poly *a, mp_int *cha, int size)
+{
+   int res;
+   
+   /* allocate array of mp_ints for coefficients */
+   a->co = malloc(size * sizeof(mp_int));
+   if (a->co == NULL) {
+      return MP_MEM;
+   }
+   a->used  = 0;
+   a->alloc = size;
+   
+   /* now init the range */
+   if ((res = s_setup(a, 0, size)) != MP_OKAY) {
+      free(a->co);
+      return res;
+   }
+   
+   /* copy characteristic */
+   if ((res = mp_init_copy(&(a->cha), cha)) != MP_OKAY) {
+      s_free(a);
+      free(a->co);
+      return res;
+   }
+   
+   /* return ok at this point */
+   return MP_OKAY;
+}
+
+/* grow the size of a poly */
+static int mp_poly_grow(mp_poly *a, int size)
+{
+  int res;
+  
+  if (size > a->alloc) {
+     /* resize the array of coefficients */
+     a->co = realloc(a->co, sizeof(mp_int) * size);
+     if (a->co == NULL) {
+        return MP_MEM;
+     }
+     
+     /* now setup the coefficients */
+     if ((res = s_setup(a, a->alloc, a->alloc + size)) != MP_OKAY) {
+        return res;
+     }
+     
+     a->alloc += size;
+  }
+  return MP_OKAY;
+}
+
+/* copy, b = a */
+int mp_poly_copy(mp_poly *a, mp_poly *b)
+{
+   int res, k;
+   
+   /* resize b */
+   if ((res = mp_poly_grow(b, a->used)) != MP_OKAY) {
+      return res;
+   }
+   
+   /* now copy the used part */
+   b->used = a->used;
+   
+   /* now the cha */
+   if ((res = mp_copy(&(a->cha), &(b->cha))) != MP_OKAY) {
+      return res;
+   }
+   
+   /* now all the coefficients */
+   for (k = 0; k < b->used; k++) {
+       if ((res = mp_copy(&(a->co[k]), &(b->co[k]))) != MP_OKAY) {
+          return res;
+       }
+   }
+   
+   /* now zero the top */
+   for (k = b->used; k < b->alloc; k++) {
+       mp_zero(&(b->co[k]));
+   }
+   
+   return MP_OKAY;
+}
+
+/* init from a copy, a = b */
+int mp_poly_init_copy(mp_poly *a, mp_poly *b)
+{
+   int res;
+   
+   if ((res = mp_poly_init(a, &(b->cha))) != MP_OKAY) {
+      return res;
+   }
+   return mp_poly_copy(b, a);
+}
+
+/* free a poly from ram */
+void mp_poly_clear(mp_poly *a)
+{
+   s_free(a);
+   mp_clear(&(a->cha));
+   free(a->co);
+   
+   a->co = NULL;
+   a->used = a->alloc = 0;
+}
+
+/* exchange two polys */
+void mp_poly_exch(mp_poly *a, mp_poly *b)
+{
+   mp_poly t;
+   t = *a; *a = *b; *b = t;
+}
+
+/* clamp the # of used digits */
+static void mp_poly_clamp(mp_poly *a)
+{
+   while (a->used > 0 && mp_cmp_d(&(a->co[a->used]), 0) == MP_EQ) --(a->used);
+}  
+
+/* add two polynomials, c(x) = a(x) + b(x) */
+int mp_poly_add(mp_poly *a, mp_poly *b, mp_poly *c)
+{
+   mp_poly t, *x, *y;
+   int res, k;
+   
+   /* ensure char's are the same */
+   if (mp_cmp(&(a->cha), &(b->cha)) != MP_EQ) {
+      return MP_VAL;
+   }
+   
+   /* now figure out the sizes such that x is the 
+      largest degree poly and y is less or equal in degree 
+    */
+   if (a->used > b->used) {
+      x = a;
+      y = b;
+   } else {
+      x = b;
+      y = a;
+   }
+   
+   /* now init the result to be a copy of the largest */
+   if ((res = mp_poly_init_copy(&t, x)) != MP_OKAY) {
+      return res;
+   }
+   
+   /* now add the coeffcients of the smaller one */
+   for (k = 0; k < y->used; k++) {
+       if ((res = mp_addmod(&(a->co[k]), &(b->co[k]), &(a->cha), &(t.co[k]))) != MP_OKAY) {
+          goto __T;
+       }
+   }
+   
+   mp_poly_clamp(&t);
+   mp_poly_exch(&t, c);
+   res = MP_OKAY;
+       
+__T:  mp_poly_clear(&t);
+   return res;
+}
+
+/* subtracts two polynomials, c(x) = a(x) - b(x) */
+int mp_poly_sub(mp_poly *a, mp_poly *b, mp_poly *c)
+{
+   mp_poly t, *x, *y;
+   int res, k;
+   
+   /* ensure char's are the same */
+   if (mp_cmp(&(a->cha), &(b->cha)) != MP_EQ) {
+      return MP_VAL;
+   }
+   
+   /* now figure out the sizes such that x is the 
+      largest degree poly and y is less or equal in degree 
+    */
+   if (a->used > b->used) {
+      x = a;
+      y = b;
+   } else {
+      x = b;
+      y = a;
+   }
+   
+   /* now init the result to be a copy of the largest */
+   if ((res = mp_poly_init_copy(&t, x)) != MP_OKAY) {
+      return res;
+   }
+   
+   /* now add the coeffcients of the smaller one */
+   for (k = 0; k < y->used; k++) {
+       if ((res = mp_submod(&(a->co[k]), &(b->co[k]), &(a->cha), &(t.co[k]))) != MP_OKAY) {
+          goto __T;
+       }
+   }
+   
+   mp_poly_clamp(&t);
+   mp_poly_exch(&t, c);
+   res = MP_OKAY;
+       
+__T:  mp_poly_clear(&t);
+   return res;
+}
+
+/* multiplies two polynomials, c(x) = a(x) * b(x) */
+int mp_poly_mul(mp_poly *a, mp_poly *b, mp_poly *c)
+{
+   mp_poly t;
+   mp_int  tt;
+   int res, pa, pb, ix, iy;
+   
+   /* ensure char's are the same */
+   if (mp_cmp(&(a->cha), &(b->cha)) != MP_EQ) {
+      return MP_VAL;
+   }
+   
+   /* degrees of a and b */
+   pa = a->used;
+   pb = b->used;
+   
+   /* now init the temp polynomial to be of degree pa+pb */
+   if ((res = mp_poly_init_size(&t, &(a->cha), pa+pb)) != MP_OKAY) {
+      return res;
+   }
+   
+   /* now init our temp int */
+   if ((res = mp_init(&tt)) != MP_OKAY) {
+      goto __T;
+   }
+   
+   /* now loop through all the digits */
+   for (ix = 0; ix < pa; ix++) {
+       for (iy = 0; iy < pb; iy++) {
+          if ((res = mp_mul(&(a->co[ix]), &(b->co[iy]), &tt)) != MP_OKAY) {
+             goto __TT;
+          }
+          if ((res = mp_addmod(&tt, &(t.co[ix+iy]), &(a->cha), &(t.co[ix+iy]))) != MP_OKAY) {
+             goto __TT;
+          }
+       }
+   }
+   
+   mp_poly_clamp(&t);
+   mp_poly_exch(&t, c);
+   res = MP_OKAY;
+   
+__TT: mp_clear(&tt);
+__T:  mp_poly_clear(&t);
+   return res;
+}
+
diff --git a/poly.h b/poly.h
new file mode 100644
index 0000000..f2e3212
--- /dev/null
+++ b/poly.h
@@ -0,0 +1,73 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision 
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * This file "poly.h" provides GF(p^k) functionality on top of the 
+ * libtommath library.
+ * 
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with 
+ * additional optimizations in place.  
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+
+#ifndef POLY_H_
+#define POLY_H_
+
+#include "bn.h"
+
+/* a mp_poly is basically a derived "class" of a mp_int
+ * it uses the same technique of growing arrays via 
+ * used/alloc parameters except the base unit or "digit"
+ * is in fact a mp_int.  These hold the coefficients
+ * of the polynomial 
+ */
+typedef struct {
+    int    used,    /* coefficients used */
+           alloc;   /* coefficients allocated (and initialized) */
+    mp_int *co,     /* coefficients */
+           cha;     /* characteristic */
+    
+} mp_poly;
+
+
+#define MP_POLY_PREC     16             /* default coefficients allocated */
+
+/* init a poly */
+int mp_poly_init(mp_poly *a, mp_int *cha);
+
+/* init a poly of a given (size) degree  */
+int mp_poly_init_size(mp_poly *a, mp_int *cha, int size);
+
+/* copy, b = a */
+int mp_poly_copy(mp_poly *a, mp_poly *b);
+
+/* init from a copy, a = b */
+int mp_poly_init_copy(mp_poly *a, mp_poly *b);
+
+/* free a poly from ram */
+void mp_poly_clear(mp_poly *a);
+
+/* exchange two polys */
+void mp_poly_exch(mp_poly *a, mp_poly *b);
+
+/* ---> Basic Arithmetic <--- */
+
+/* add two polynomials, c(x) = a(x) + b(x) */
+int mp_poly_add(mp_poly *a, mp_poly *b, mp_poly *c);
+
+/* subtracts two polynomials, c(x) = a(x) - b(x) */
+int mp_poly_sub(mp_poly *a, mp_poly *b, mp_poly *c);
+
+/* multiplies two polynomials, c(x) = a(x) * b(x) */
+int mp_poly_mul(mp_poly *a, mp_poly *b, mp_poly *c);
+
+
+
+#endif
+