diff --git a/bn.pdf b/bn.pdf
index 596c440..d047a83 100644
Binary files a/bn.pdf and b/bn.pdf differ
diff --git a/bn.tex b/bn.tex
index 79547f3..980d6b9 100644
--- a/bn.tex
+++ b/bn.tex
@@ -1,7 +1,7 @@
-\documentclass{article}
+\documentclass[]{report}
 \begin{document}
 
-\title{LibTomMath v0.15 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org }
+\title{LibTomMath v0.16 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org }
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 \newpage
@@ -286,6 +286,9 @@ int mp_mul_d(mp_int *a, mp_digit b, mp_int *c);
 /* a/b => cb + d == a */
 int mp_div_d(mp_int *a, mp_digit b, mp_int *c, mp_digit *d);
 
+/* c = a^b */
+int mp_expt_d(mp_int *a, mp_digit b, mp_int *c);
+
 /* c = a mod b, 0 <= c < b  */
 int mp_mod_d(mp_int *a, mp_digit b, mp_digit *c);
 \end{verbatim}
@@ -1027,6 +1030,32 @@ p == 10951211157166778028568112903923951285881685924091094949001780089679
 \end{verbatim}
 \end{small}
 
+%\newpage
+%\section*{Appendix B - Function Quick Sheet}
 
+%The following table gives a quick summary of the functions provided within LibTomMath.
+
+%\begin{flushleft}
+%\begin{tiny}
+%\begin{tabular}{|l|l|l|}
+%\hline \textbf{Function Name} & \textbf{Purpose} & \textbf{Notes} \\
+%\hline mp\_init(mp\_int *a) & Initializes a mp\_int & Allocates runtime memory required for an integer \\
+%\hline mp\_clear(mp\_int *a) & Frees the ram used by an mp\_int & \\
+%\hline mp\_exch(mp\_int *a, mp\_int *b) & Swaps two mp\_int structures contents & \\
+%\hline mp\_shrink(mp\_int *a) & Frees unused memory & The mp\_int is still valid and not cleared. \\
+%\hline mp\_grow(mp\_int *a, int size) & Ensures that a has at least $size$ digits allocated & \\
+%\hline mp\_init\_size(mp\_int a, int size) & Inits and ensures it has at least $size$ digits & \\
+%\hline &&\\
+%\hline mp\_zero(mp\_int *a) & $a \leftarrow 0$ & \\
+%\hline mp\_set(mp\_int *a, mp\_digit b) & $a \leftarrow b$ & \\
+%\hline mp\_set\_int(mp\_int *a, unsigned long b) & $a \leftarrow b$ & Only reads upto 32 bits from $b$ \\
+%\hline &&\\
+%\hline mp\_rshd(mp\_int *a, int b) & $a \leftarrow a/\beta^b$ & \\
+%\hline mp\_lshd(mp\_int *a, int b) & $a \leftarrow a \cdot \beta^b$ &\\
+%\hline mp\_div\_2d(mp\_int *a, int b, mp\_int *c, mp\_int *d) & &\\
+%\hline
+%\end{tabular}
+%\end{tiny}
+%\end{flushleft}
 
 \end{document}
diff --git a/bn_fast_mp_invmod.c b/bn_fast_mp_invmod.c
index 38c265e..68cdf1c 100644
--- a/bn_fast_mp_invmod.c
+++ b/bn_fast_mp_invmod.c
@@ -26,6 +26,7 @@ fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
   mp_int  x, y, u, v, B, D;
   int     res, neg;
 
+  /* init all our temps */
   if ((res = mp_init (&x)) != MP_OKAY) {
     goto __ERR;
   }
@@ -58,6 +59,7 @@ fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
     goto __D;
   }
 
+  /* we need |y| */
   if ((res = mp_abs (&y, &y)) != MP_OKAY) {
     goto __D;
   }
@@ -93,13 +95,12 @@ top:
 	goto __D;
       }
     }
-    /* A = A/2, B = B/2 */
+    /* B = B/2 */
     if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
       goto __D;
     }
   }
 
-
   /* 5.  while v is even do */
   while (mp_iseven (&v) == 1) {
     /* 5.1 v = v/2 */
@@ -108,12 +109,12 @@ top:
     }
     /* 5.2 if C,D are even then */
     if (mp_iseven (&D) == 0) {
-      /* C = (C+y)/2, D = (D-x)/2 */
+      /* D = (D-x)/2 */
       if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
 	goto __D;
       }
     }
-    /* C = C/2, D = D/2 */
+    /* D = D/2 */
     if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
       goto __D;
     }
@@ -121,7 +122,7 @@ top:
 
   /* 6.  if u >= v then */
   if (mp_cmp (&u, &v) != MP_LT) {
-    /* u = u - v, A = A - C, B = B - D */
+    /* u = u - v, B = B - D */
     if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
       goto __D;
     }
@@ -130,7 +131,7 @@ top:
       goto __D;
     }
   } else {
-    /* v - v - u, C = C - A, D = D - B */
+    /* v - v - u, D = D - B */
     if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
       goto __D;
     }
diff --git a/bn_fast_mp_montgomery_reduce.c b/bn_fast_mp_montgomery_reduce.c
index 17be2e4..031b410 100644
--- a/bn_fast_mp_montgomery_reduce.c
+++ b/bn_fast_mp_montgomery_reduce.c
@@ -45,12 +45,12 @@ fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
     _W = W;
     tmpa = a->dp;
 
-    /* copy the digits of a */
+    /* copy the digits of a into W[0..a->used-1] */
     for (ix = 0; ix < a->used; ix++) {
       *_W++ = *tmpa++;
     }
 
-    /* zero the high words */
+    /* zero the high words of W[a->used..m->used*2] */
     for (; ix < m->used * 2 + 1; ix++) {
       *_W++ = 0;
     }
diff --git a/bn_mp_add_d.c b/bn_mp_add_d.c
index 0fe5ad3..66d1f07 100644
--- a/bn_mp_add_d.c
+++ b/bn_mp_add_d.c
@@ -21,7 +21,7 @@ mp_add_d (mp_int * a, mp_digit b, mp_int * c)
   mp_int  t;
   int     res;
 
-  if ((res = mp_init (&t)) != MP_OKAY) {
+  if ((res = mp_init_size(&t, 1)) != MP_OKAY) {
     return res;
   }
   mp_set (&t, b);
diff --git a/bn_mp_clamp.c b/bn_mp_clamp.c
index f2839a1..c349f7c 100644
--- a/bn_mp_clamp.c
+++ b/bn_mp_clamp.c
@@ -24,8 +24,9 @@
 void
 mp_clamp (mp_int * a)
 {
-  while (a->used > 0 && a->dp[a->used - 1] == 0)
+  while (a->used > 0 && a->dp[a->used - 1] == 0) {
     --(a->used);
+  }
   if (a->used == 0) {
     a->sign = MP_ZPOS;
   }
diff --git a/bn_mp_copy.c b/bn_mp_copy.c
index 10ab6a6..1bf5f12 100644
--- a/bn_mp_copy.c
+++ b/bn_mp_copy.c
@@ -37,6 +37,7 @@ mp_copy (mp_int * a, mp_int * b)
   {
     register mp_digit *tmpa, *tmpb;
 
+    /* point aliases */
     tmpa = a->dp;
     tmpb = b->dp;
 
diff --git a/bn_mp_div.c b/bn_mp_div.c
index 8eceec8..3888a4b 100644
--- a/bn_mp_div.c
+++ b/bn_mp_div.c
@@ -74,17 +74,19 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
   x.sign = y.sign = MP_ZPOS;
 
   /* normalize both x and y, ensure that y >= b/2, [b == 2^DIGIT_BIT] */
-  norm = 0;
-  while ((y.dp[y.used - 1] & (((mp_digit) 1) << (DIGIT_BIT - 1))) == ((mp_digit) 0)) {
-    ++norm;
-    if ((res = mp_mul_2 (&x, &x)) != MP_OKAY) {
-      goto __Y;
-    }
-    if ((res = mp_mul_2 (&y, &y)) != MP_OKAY) {
-      goto __Y;
-    }
+  norm = mp_count_bits(&y) % DIGIT_BIT;
+  if (norm < (DIGIT_BIT-1)) {
+     norm = (DIGIT_BIT-1) - norm;
+     if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
+       goto __Y;
+     }
+     if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
+       goto __Y;
+     }
+  } else {
+     norm = 0;
   }
-
+     
   /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
   n = x.used - 1;
   t = y.used - 1;
diff --git a/bn_mp_div_2.c b/bn_mp_div_2.c
index 787f87a..858e8a4 100644
--- a/bn_mp_div_2.c
+++ b/bn_mp_div_2.c
@@ -32,15 +32,26 @@ mp_div_2 (mp_int * a, mp_int * b)
   {
     register mp_digit r, rr, *tmpa, *tmpb;
 
+    /* source alias */
     tmpa = a->dp + b->used - 1;
+    
+    /* dest alias */
     tmpb = b->dp + b->used - 1;
+    
+    /* carry */
     r = 0;
     for (x = b->used - 1; x >= 0; x--) {
+      /* get the carry for the next iteration */
       rr = *tmpa & 1;
+      
+      /* shift the current digit, add in carry and store */
       *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
+      
+      /* forward carry to next iteration */
       r = rr;
     }
 
+    /* zero excess digits */
     tmpb = b->dp + b->used;
     for (x = b->used; x < oldused; x++) {
       *tmpb++ = 0;
diff --git a/bn_mp_div_2d.c b/bn_mp_div_2d.c
index 4258c05..75501a4 100644
--- a/bn_mp_div_2d.c
+++ b/bn_mp_div_2d.c
@@ -58,13 +58,23 @@ mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
   /* shift any bit count < DIGIT_BIT */
   D = (mp_digit) (b % DIGIT_BIT);
   if (D != 0) {
+    register mp_digit *tmpc, mask;
+    
+    /* mask */
+    mask = (1U << D) - 1U;
+    
+    /* alias */
+    tmpc = c->dp + (c->used - 1);
+    
+    /* carry */
     r = 0;
     for (x = c->used - 1; x >= 0; x--) {
       /* get the lower  bits of this word in a temp */
-      rr = c->dp[x] & ((mp_digit) ((1U << D) - 1U));
+      rr = *tmpc & mask;
 
       /* shift the current word and mix in the carry bits from the previous word */
-      c->dp[x] = (c->dp[x] >> D) | (r << (DIGIT_BIT - D));
+      *tmpc = (*tmpc >> D) | (r << (DIGIT_BIT - D));
+      --tmpc;
 
       /* set the carry to the carry bits of the current word found above */
       r = rr;
diff --git a/bn_mp_div_d.c b/bn_mp_div_d.c
index 4c25a74..4b33a42 100644
--- a/bn_mp_div_d.c
+++ b/bn_mp_div_d.c
@@ -33,6 +33,7 @@ mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
   mp_set (&t, b);
   res = mp_div (a, &t, c, &t2);
 
+  /* set remainder if not null */
   if (d != NULL) {
     *d = t2.dp[0];
   }
diff --git a/bn_mp_exch.c b/bn_mp_exch.c
index 526c4c9..b2cdab7 100644
--- a/bn_mp_exch.c
+++ b/bn_mp_exch.c
@@ -14,6 +14,9 @@
  */
 #include <tommath.h>
 
+/* swap the elements of two integers, for cases where you can't simply swap the 
+ * mp_int pointers around 
+ */
 void
 mp_exch (mp_int * a, mp_int * b)
 {
diff --git a/bn_mp_expt_d.c b/bn_mp_expt_d.c
index 602785b..144ae07 100644
--- a/bn_mp_expt_d.c
+++ b/bn_mp_expt_d.c
@@ -14,13 +14,13 @@
  */
 #include <tommath.h>
 
+/* calculate c = a^b  using a square-multiply algorithm */
 int
 mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
 {
   int     res, x;
   mp_int  g;
 
-
   if ((res = mp_init_copy (&g, a)) != MP_OKAY) {
     return res;
   }
@@ -29,11 +29,13 @@ mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
   mp_set (c, 1);
 
   for (x = 0; x < (int) DIGIT_BIT; x++) {
+    /* square */
     if ((res = mp_sqr (c, c)) != MP_OKAY) {
       mp_clear (&g);
       return res;
     }
 
+    /* if the bit is set multiply */    
     if ((b & (mp_digit) (1 << (DIGIT_BIT - 1))) != 0) {
       if ((res = mp_mul (c, &g, c)) != MP_OKAY) {
 	mp_clear (&g);
@@ -41,6 +43,7 @@ mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
       }
     }
 
+    /* shift to next bit */
     b <<= 1;
   }
 
diff --git a/bn_mp_exptmod.c b/bn_mp_exptmod.c
index a780dbc..b6635f5 100644
--- a/bn_mp_exptmod.c
+++ b/bn_mp_exptmod.c
@@ -164,7 +164,7 @@ f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
     mode = 2;
 
     if (bitcpy == winsize) {
-      /* ok window is filled so square as required and multiply multiply */
+      /* ok window is filled so square as required and multiply  */
       /* square first */
       for (x = 0; x < winsize; x++) {
 	if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
diff --git a/bn_mp_exptmod_fast.c b/bn_mp_exptmod_fast.c
index 83c7b7a..0906f27 100644
--- a/bn_mp_exptmod_fast.c
+++ b/bn_mp_exptmod_fast.c
@@ -19,7 +19,7 @@
  * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
  * The value of k changes based on the size of the exponent.
  *
- * Uses Montgomery reduction 
+ * Uses Montgomery or Diminished Radix reduction [whichever appropriate] 
  */
 int
 mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
@@ -29,7 +29,6 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
   int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
   int     (*redux)(mp_int*,mp_int*,mp_digit);
   
-
   /* find window size */
   x = mp_count_bits (X);
   if (x <= 7) {
@@ -169,7 +168,7 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
     mode = 2;
 
     if (bitcpy == winsize) {
-      /* ok window is filled so square as required and multiply multiply */
+      /* ok window is filled so square as required and multiply  */
       /* square first */
       for (x = 0; x < winsize; x++) {
 	if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
diff --git a/bn_mp_gcd.c b/bn_mp_gcd.c
index df68841..d7cc1d4 100644
--- a/bn_mp_gcd.c
+++ b/bn_mp_gcd.c
@@ -22,7 +22,6 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c)
   mp_int  u, v, t;
   int     k, res, neg;
 
-
   /* either zero than gcd is the largest */
   if (mp_iszero (a) == 1 && mp_iszero (b) == 0) {
     return mp_copy (b, c);
@@ -55,7 +54,7 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c)
 
   /* B1.  Find power of two */
   k = 0;
-  while ((u.dp[0] & 1) == 0 && (v.dp[0] & 1) == 0) {
+  while (mp_iseven(&u) == 1 && mp_iseven(&v) == 1) {
     ++k;
     if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
       goto __T;
@@ -66,12 +65,14 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c)
   }
 
   /* B2.  Initialize */
-  if ((u.dp[0] & 1) == 1) {
+  if (mp_isodd(&u) == 1) {
+    /* t = -v */
     if ((res = mp_copy (&v, &t)) != MP_OKAY) {
       goto __T;
     }
     t.sign = MP_NEG;
   } else {
+    /* t = u */
     if ((res = mp_copy (&u, &t)) != MP_OKAY) {
       goto __T;
     }
@@ -79,7 +80,7 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c)
 
   do {
     /* B3 (and B4).  Halve t, if even */
-    while (t.used != 0 && (t.dp[0] & 1) == 0) {
+    while (t.used != 0 && mp_iseven(&t) == 1) {
       if ((res = mp_div_2 (&t, &t)) != MP_OKAY) {
 	goto __T;
       }
diff --git a/bn_mp_grow.c b/bn_mp_grow.c
index 369fb4e..9bd5118 100644
--- a/bn_mp_grow.c
+++ b/bn_mp_grow.c
@@ -22,13 +22,15 @@ mp_grow (mp_int * a, int size)
 
   /* if the alloc size is smaller alloc more ram */
   if (a->alloc < size) {
-    size += (MP_PREC * 2) - (size & (MP_PREC - 1));	/* ensure there are always at least MP_PREC digits extra on top */
+    /* ensure there are always at least MP_PREC digits extra on top */
+    size += (MP_PREC * 2) - (size & (MP_PREC - 1));	
 
     a->dp = OPT_CAST realloc (a->dp, sizeof (mp_digit) * size);
     if (a->dp == NULL) {
       return MP_MEM;
     }
 
+    /* zero excess digits */
     n = a->alloc;
     a->alloc = size;
     for (i = n; i < a->alloc; i++) {
diff --git a/bn_mp_init.c b/bn_mp_init.c
index 7c3ee01..b96e6d9 100644
--- a/bn_mp_init.c
+++ b/bn_mp_init.c
@@ -27,9 +27,9 @@ mp_init (mp_int * a)
 
   /* set the used to zero, allocated digit to the default precision
    * and sign to positive */
-  a->used = 0;
+  a->used  = 0;
   a->alloc = MP_PREC;
-  a->sign = MP_ZPOS;
+  a->sign  = MP_ZPOS;
 
   return MP_OKAY;
 }
diff --git a/bn_mp_init_size.c b/bn_mp_init_size.c
index 45d8dc5..2408c1a 100644
--- a/bn_mp_init_size.c
+++ b/bn_mp_init_size.c
@@ -19,8 +19,10 @@ int
 mp_init_size (mp_int * a, int size)
 {
 
-  /* pad up so there are at least 16 zero digits */
-  size += (MP_PREC * 2) - (size & (MP_PREC - 1));	/* ensure there are always at least 16 digits extra on top */
+  /* pad size so there are always extra digits */
+  size += (MP_PREC * 2) - (size & (MP_PREC - 1));	
+  
+  /* alloc mem */
   a->dp = OPT_CAST calloc (sizeof (mp_digit), size);
   if (a->dp == NULL) {
     return MP_MEM;
diff --git a/bn_mp_mul_2.c b/bn_mp_mul_2.c
index fca7125..fd8db1f 100644
--- a/bn_mp_mul_2.c
+++ b/bn_mp_mul_2.c
@@ -35,17 +35,29 @@ mp_mul_2 (mp_int * a, mp_int * b)
   {
     register mp_digit r, rr, *tmpa, *tmpb;
 
-    r = 0;
+    /* alias for source */
     tmpa = a->dp;
+    
+    /* alias for dest */
     tmpb = b->dp;
+
+    /* carry */
+    r = 0;
     for (x = 0; x < b->used; x++) {
+    
+      /* get what will be the *next* carry bit from the MSB of the current digit */
       rr = *tmpa >> (DIGIT_BIT - 1);
+      
+      /* now shift up this digit, add in the carry [from the previous] */
       *tmpb++ = ((*tmpa++ << 1) | r) & MP_MASK;
+      
+      /* copy the carry that would be from the source digit into the next iteration */
       r = rr;
     }
 
     /* new leading digit? */
     if (r != 0) {
+      /* do we have to grow to accomodate the new digit? */
       if (b->alloc == b->used) {
 	if ((res = mp_grow (b, b->used + 1)) != MP_OKAY) {
 	  return res;
@@ -56,11 +68,12 @@ mp_mul_2 (mp_int * a, mp_int * b)
 	 */
 	tmpb = b->dp + b->used;
       }
-      /* add a MSB of 1 */
+      /* add a MSB which is always 1 at this point */
       *tmpb = 1;
       ++b->used;
     }
 
+    /* now zero any excess digits on the destination that we didn't write to */
     tmpb = b->dp + b->used;
     for (x = b->used; x < oldused; x++) {
       *tmpb++ = 0;
diff --git a/bn_mp_mul_2d.c b/bn_mp_mul_2d.c
index 3b336d1..137df30 100644
--- a/bn_mp_mul_2d.c
+++ b/bn_mp_mul_2d.c
@@ -21,7 +21,6 @@ mp_mul_2d (mp_int * a, int b, mp_int * c)
   mp_digit d, r, rr;
   int     x, res;
 
-
   /* copy */
   if ((res = mp_copy (a, c)) != MP_OKAY) {
     return res;
@@ -42,13 +41,23 @@ mp_mul_2d (mp_int * a, int b, mp_int * c)
   /* shift any bit count < DIGIT_BIT */
   d = (mp_digit) (b % DIGIT_BIT);
   if (d != 0) {
-    r = 0;
+    register mp_digit *tmpc, mask;
+    
+    /* bitmask for carries */
+    mask = (1U << d) - 1U;
+    
+    /* alias */
+    tmpc = c->dp;
+    
+    /* carry */
+    r    = 0;
     for (x = 0; x < c->used; x++) {
       /* get the higher bits of the current word */
-      rr = (c->dp[x] >> (DIGIT_BIT - d)) & ((mp_digit) ((1U << d) - 1U));
+      rr = (*tmpc >> (DIGIT_BIT - d)) & mask;
 
       /* shift the current word and OR in the carry */
-      c->dp[x] = ((c->dp[x] << d) | r) & MP_MASK;
+      *tmpc = ((*tmpc << d) | r) & MP_MASK;
+      ++tmpc;
 
       /* set the carry to the carry bits of the current word */
       r = rr;
diff --git a/bn_radix.c b/bn_radix.c
index 6aeda17..3b4b639 100644
--- a/bn_radix.c
+++ b/bn_radix.c
@@ -17,7 +17,6 @@
 /* chars used in radix conversions */
 static const char *s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
 
-
 /* read a string [ASCII] in a given radix */
 int
 mp_read_radix (mp_int * a, char *str, int radix)
diff --git a/changes.txt b/changes.txt
index df7ac4e..6833bdc 100644
--- a/changes.txt
+++ b/changes.txt
@@ -1,3 +1,9 @@
+Mar 29th, 2003
+v0.16  -- Sped up mp_div by making normalization one shift call
+       -- Sped up mp_mul_2d/mp_div_2d by aliasing pointers :-)
+       -- Cleaned up mp_gcd to use the macros for odd/even detection
+       -- Added comments here and there, mostly there but occasionally here too.
+
 Mar 22nd, 2003
 v0.15  -- Added series of prime testing routines to lib
        -- Fixed up etc/tune.c
diff --git a/demo/demo.c b/demo/demo.c
index 8cf6dfe..ff85903 100644
--- a/demo/demo.c
+++ b/demo/demo.c
@@ -140,6 +140,7 @@ int main(void)
 
 #ifdef TIMER
       printf("CLOCKS_PER_SEC == %lu\n", CLOCKS_PER_SEC);
+goto sqrtime;      
 
       log = fopen("add.log", "w");
       for (cnt = 4; cnt <= 128; cnt += 4) {
@@ -170,6 +171,7 @@ int main(void)
       fclose(log);
       
 
+sqrtime:   
    log = fopen("sqr.log", "w");
    for (cnt = 4; cnt <= 128; cnt += 4) {
       mp_rand(&a, cnt);
@@ -197,6 +199,7 @@ int main(void)
    }
    fclose(log);
 
+expttime:
    {
       char *primes[] = {
          /* DR moduli */
diff --git a/makefile b/makefile
index 4219e6b..8466163 100644
--- a/makefile
+++ b/makefile
@@ -1,6 +1,6 @@
 CFLAGS  +=  -I./ -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops
 
-VERSION=0.15
+VERSION=0.16
 
 default: libtommath.a
 
diff --git a/pre_gen/mpi.c b/pre_gen/mpi.c
index d659761..3921dc4 100644
--- a/pre_gen/mpi.c
+++ b/pre_gen/mpi.c
@@ -53,6 +53,7 @@ fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
   mp_int  x, y, u, v, B, D;
   int     res, neg;
 
+  /* init all our temps */
   if ((res = mp_init (&x)) != MP_OKAY) {
     goto __ERR;
   }
@@ -85,6 +86,7 @@ fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
     goto __D;
   }
 
+  /* we need |y| */
   if ((res = mp_abs (&y, &y)) != MP_OKAY) {
     goto __D;
   }
@@ -120,13 +122,12 @@ top:
 	goto __D;
       }
     }
-    /* A = A/2, B = B/2 */
+    /* B = B/2 */
     if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
       goto __D;
     }
   }
 
-
   /* 5.  while v is even do */
   while (mp_iseven (&v) == 1) {
     /* 5.1 v = v/2 */
@@ -135,12 +136,12 @@ top:
     }
     /* 5.2 if C,D are even then */
     if (mp_iseven (&D) == 0) {
-      /* C = (C+y)/2, D = (D-x)/2 */
+      /* D = (D-x)/2 */
       if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
 	goto __D;
       }
     }
-    /* C = C/2, D = D/2 */
+    /* D = D/2 */
     if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
       goto __D;
     }
@@ -148,7 +149,7 @@ top:
 
   /* 6.  if u >= v then */
   if (mp_cmp (&u, &v) != MP_LT) {
-    /* u = u - v, A = A - C, B = B - D */
+    /* u = u - v, B = B - D */
     if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
       goto __D;
     }
@@ -157,7 +158,7 @@ top:
       goto __D;
     }
   } else {
-    /* v - v - u, C = C - A, D = D - B */
+    /* v - v - u, D = D - B */
     if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
       goto __D;
     }
@@ -251,12 +252,12 @@ fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
     _W = W;
     tmpa = a->dp;
 
-    /* copy the digits of a */
+    /* copy the digits of a into W[0..a->used-1] */
     for (ix = 0; ix < a->used; ix++) {
       *_W++ = *tmpa++;
     }
 
-    /* zero the high words */
+    /* zero the high words of W[a->used..m->used*2] */
     for (; ix < m->used * 2 + 1; ix++) {
       *_W++ = 0;
     }
@@ -901,7 +902,7 @@ mp_add_d (mp_int * a, mp_digit b, mp_int * c)
   mp_int  t;
   int     res;
 
-  if ((res = mp_init (&t)) != MP_OKAY) {
+  if ((res = mp_init_size(&t, 1)) != MP_OKAY) {
     return res;
   }
   mp_set (&t, b);
@@ -995,8 +996,9 @@ mp_and (mp_int * a, mp_int * b, mp_int * c)
 void
 mp_clamp (mp_int * a)
 {
-  while (a->used > 0 && a->dp[a->used - 1] == 0)
+  while (a->used > 0 && a->dp[a->used - 1] == 0) {
     --(a->used);
+  }
   if (a->used == 0) {
     a->sign = MP_ZPOS;
   }
@@ -1197,6 +1199,7 @@ mp_copy (mp_int * a, mp_int * b)
   {
     register mp_digit *tmpa, *tmpb;
 
+    /* point aliases */
     tmpa = a->dp;
     tmpb = b->dp;
 
@@ -1331,17 +1334,19 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
   x.sign = y.sign = MP_ZPOS;
 
   /* normalize both x and y, ensure that y >= b/2, [b == 2^DIGIT_BIT] */
-  norm = 0;
-  while ((y.dp[y.used - 1] & (((mp_digit) 1) << (DIGIT_BIT - 1))) == ((mp_digit) 0)) {
-    ++norm;
-    if ((res = mp_mul_2 (&x, &x)) != MP_OKAY) {
-      goto __Y;
-    }
-    if ((res = mp_mul_2 (&y, &y)) != MP_OKAY) {
-      goto __Y;
-    }
+  norm = mp_count_bits(&y) % DIGIT_BIT;
+  if (norm < (DIGIT_BIT-1)) {
+     norm = (DIGIT_BIT-1) - norm;
+     if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
+       goto __Y;
+     }
+     if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
+       goto __Y;
+     }
+  } else {
+     norm = 0;
   }
-
+     
   /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
   n = x.used - 1;
   t = y.used - 1;
@@ -1491,15 +1496,26 @@ mp_div_2 (mp_int * a, mp_int * b)
   {
     register mp_digit r, rr, *tmpa, *tmpb;
 
+    /* source alias */
     tmpa = a->dp + b->used - 1;
+    
+    /* dest alias */
     tmpb = b->dp + b->used - 1;
+    
+    /* carry */
     r = 0;
     for (x = b->used - 1; x >= 0; x--) {
+      /* get the carry for the next iteration */
       rr = *tmpa & 1;
+      
+      /* shift the current digit, add in carry and store */
       *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
+      
+      /* forward carry to next iteration */
       r = rr;
     }
 
+    /* zero excess digits */
     tmpb = b->dp + b->used;
     for (x = b->used; x < oldused; x++) {
       *tmpb++ = 0;
@@ -1573,13 +1589,23 @@ mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
   /* shift any bit count < DIGIT_BIT */
   D = (mp_digit) (b % DIGIT_BIT);
   if (D != 0) {
+    register mp_digit *tmpc, mask;
+    
+    /* mask */
+    mask = (1U << D) - 1U;
+    
+    /* alias */
+    tmpc = c->dp + (c->used - 1);
+    
+    /* carry */
     r = 0;
     for (x = c->used - 1; x >= 0; x--) {
       /* get the lower  bits of this word in a temp */
-      rr = c->dp[x] & ((mp_digit) ((1U << D) - 1U));
+      rr = *tmpc & mask;
 
       /* shift the current word and mix in the carry bits from the previous word */
-      c->dp[x] = (c->dp[x] >> D) | (r << (DIGIT_BIT - D));
+      *tmpc = (*tmpc >> D) | (r << (DIGIT_BIT - D));
+      --tmpc;
 
       /* set the carry to the carry bits of the current word found above */
       r = rr;
@@ -1632,6 +1658,7 @@ mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
   mp_set (&t, b);
   res = mp_div (a, &t, c, &t2);
 
+  /* set remainder if not null */
   if (d != NULL) {
     *d = t2.dp[0];
   }
@@ -1814,6 +1841,9 @@ void mp_dr_setup(mp_int *a, mp_digit *d)
  */
 #include <tommath.h>
 
+/* swap the elements of two integers, for cases where you can't simply swap the 
+ * mp_int pointers around 
+ */
 void
 mp_exch (mp_int * a, mp_int * b)
 {
@@ -1993,7 +2023,7 @@ f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
     mode = 2;
 
     if (bitcpy == winsize) {
-      /* ok window is filled so square as required and multiply multiply */
+      /* ok window is filled so square as required and multiply  */
       /* square first */
       for (x = 0; x < winsize; x++) {
 	if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
@@ -2077,7 +2107,7 @@ __M:
  * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
  * The value of k changes based on the size of the exponent.
  *
- * Uses Montgomery reduction 
+ * Uses Montgomery or Diminished Radix reduction [whichever appropriate] 
  */
 int
 mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
@@ -2087,7 +2117,6 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
   int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
   int     (*redux)(mp_int*,mp_int*,mp_digit);
   
-
   /* find window size */
   x = mp_count_bits (X);
   if (x <= 7) {
@@ -2227,7 +2256,7 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
     mode = 2;
 
     if (bitcpy == winsize) {
-      /* ok window is filled so square as required and multiply multiply */
+      /* ok window is filled so square as required and multiply  */
       /* square first */
       for (x = 0; x < winsize; x++) {
 	if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
@@ -2312,13 +2341,13 @@ __M:
  */
 #include <tommath.h>
 
+/* calculate c = a^b  using a square-multiply algorithm */
 int
 mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
 {
   int     res, x;
   mp_int  g;
 
-
   if ((res = mp_init_copy (&g, a)) != MP_OKAY) {
     return res;
   }
@@ -2327,11 +2356,13 @@ mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
   mp_set (c, 1);
 
   for (x = 0; x < (int) DIGIT_BIT; x++) {
+    /* square */
     if ((res = mp_sqr (c, c)) != MP_OKAY) {
       mp_clear (&g);
       return res;
     }
 
+    /* if the bit is set multiply */    
     if ((b & (mp_digit) (1 << (DIGIT_BIT - 1))) != 0) {
       if ((res = mp_mul (c, &g, c)) != MP_OKAY) {
 	mp_clear (&g);
@@ -2339,6 +2370,7 @@ mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
       }
     }
 
+    /* shift to next bit */
     b <<= 1;
   }
 
@@ -2373,7 +2405,6 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c)
   mp_int  u, v, t;
   int     k, res, neg;
 
-
   /* either zero than gcd is the largest */
   if (mp_iszero (a) == 1 && mp_iszero (b) == 0) {
     return mp_copy (b, c);
@@ -2406,7 +2437,7 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c)
 
   /* B1.  Find power of two */
   k = 0;
-  while ((u.dp[0] & 1) == 0 && (v.dp[0] & 1) == 0) {
+  while (mp_iseven(&u) == 1 && mp_iseven(&v) == 1) {
     ++k;
     if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
       goto __T;
@@ -2417,12 +2448,14 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c)
   }
 
   /* B2.  Initialize */
-  if ((u.dp[0] & 1) == 1) {
+  if (mp_isodd(&u) == 1) {
+    /* t = -v */
     if ((res = mp_copy (&v, &t)) != MP_OKAY) {
       goto __T;
     }
     t.sign = MP_NEG;
   } else {
+    /* t = u */
     if ((res = mp_copy (&u, &t)) != MP_OKAY) {
       goto __T;
     }
@@ -2430,7 +2463,7 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c)
 
   do {
     /* B3 (and B4).  Halve t, if even */
-    while (t.used != 0 && (t.dp[0] & 1) == 0) {
+    while (t.used != 0 && mp_iseven(&t) == 1) {
       if ((res = mp_div_2 (&t, &t)) != MP_OKAY) {
 	goto __T;
       }
@@ -2495,13 +2528,15 @@ mp_grow (mp_int * a, int size)
 
   /* if the alloc size is smaller alloc more ram */
   if (a->alloc < size) {
-    size += (MP_PREC * 2) - (size & (MP_PREC - 1));	/* ensure there are always at least MP_PREC digits extra on top */
+    /* ensure there are always at least MP_PREC digits extra on top */
+    size += (MP_PREC * 2) - (size & (MP_PREC - 1));	
 
     a->dp = OPT_CAST realloc (a->dp, sizeof (mp_digit) * size);
     if (a->dp == NULL) {
       return MP_MEM;
     }
 
+    /* zero excess digits */
     n = a->alloc;
     a->alloc = size;
     for (i = n; i < a->alloc; i++) {
@@ -2543,9 +2578,9 @@ mp_init (mp_int * a)
 
   /* set the used to zero, allocated digit to the default precision
    * and sign to positive */
-  a->used = 0;
+  a->used  = 0;
   a->alloc = MP_PREC;
-  a->sign = MP_ZPOS;
+  a->sign  = MP_ZPOS;
 
   return MP_OKAY;
 }
@@ -2605,8 +2640,10 @@ int
 mp_init_size (mp_int * a, int size)
 {
 
-  /* pad up so there are at least 16 zero digits */
-  size += (MP_PREC * 2) - (size & (MP_PREC - 1));	/* ensure there are always at least 16 digits extra on top */
+  /* pad size so there are always extra digits */
+  size += (MP_PREC * 2) - (size & (MP_PREC - 1));	
+  
+  /* alloc mem */
   a->dp = OPT_CAST calloc (sizeof (mp_digit), size);
   if (a->dp == NULL) {
     return MP_MEM;
@@ -3795,17 +3832,29 @@ mp_mul_2 (mp_int * a, mp_int * b)
   {
     register mp_digit r, rr, *tmpa, *tmpb;
 
-    r = 0;
+    /* alias for source */
     tmpa = a->dp;
+    
+    /* alias for dest */
     tmpb = b->dp;
+
+    /* carry */
+    r = 0;
     for (x = 0; x < b->used; x++) {
+    
+      /* get what will be the *next* carry bit from the MSB of the current digit */
       rr = *tmpa >> (DIGIT_BIT - 1);
+      
+      /* now shift up this digit, add in the carry [from the previous] */
       *tmpb++ = ((*tmpa++ << 1) | r) & MP_MASK;
+      
+      /* copy the carry that would be from the source digit into the next iteration */
       r = rr;
     }
 
     /* new leading digit? */
     if (r != 0) {
+      /* do we have to grow to accomodate the new digit? */
       if (b->alloc == b->used) {
 	if ((res = mp_grow (b, b->used + 1)) != MP_OKAY) {
 	  return res;
@@ -3816,11 +3865,12 @@ mp_mul_2 (mp_int * a, mp_int * b)
 	 */
 	tmpb = b->dp + b->used;
       }
-      /* add a MSB of 1 */
+      /* add a MSB which is always 1 at this point */
       *tmpb = 1;
       ++b->used;
     }
 
+    /* now zero any excess digits on the destination that we didn't write to */
     tmpb = b->dp + b->used;
     for (x = b->used; x < oldused; x++) {
       *tmpb++ = 0;
@@ -3856,7 +3906,6 @@ mp_mul_2d (mp_int * a, int b, mp_int * c)
   mp_digit d, r, rr;
   int     x, res;
 
-
   /* copy */
   if ((res = mp_copy (a, c)) != MP_OKAY) {
     return res;
@@ -3877,13 +3926,23 @@ mp_mul_2d (mp_int * a, int b, mp_int * c)
   /* shift any bit count < DIGIT_BIT */
   d = (mp_digit) (b % DIGIT_BIT);
   if (d != 0) {
-    r = 0;
+    register mp_digit *tmpc, mask;
+    
+    /* bitmask for carries */
+    mask = (1U << d) - 1U;
+    
+    /* alias */
+    tmpc = c->dp;
+    
+    /* carry */
+    r    = 0;
     for (x = 0; x < c->used; x++) {
       /* get the higher bits of the current word */
-      rr = (c->dp[x] >> (DIGIT_BIT - d)) & ((mp_digit) ((1U << d) - 1U));
+      rr = (*tmpc >> (DIGIT_BIT - d)) & mask;
 
       /* shift the current word and OR in the carry */
-      c->dp[x] = ((c->dp[x] << d) | r) & MP_MASK;
+      *tmpc = ((*tmpc << d) | r) & MP_MASK;
+      ++tmpc;
 
       /* set the carry to the carry bits of the current word */
       r = rr;
@@ -5401,7 +5460,6 @@ const mp_digit __prime_tab[] = {
 /* chars used in radix conversions */
 static const char *s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
 
-
 /* read a string [ASCII] in a given radix */
 int
 mp_read_radix (mp_int * a, char *str, int radix)
diff --git a/tommath.h b/tommath.h
index d8f8d9d..cfd9da1 100644
--- a/tommath.h
+++ b/tommath.h
@@ -124,6 +124,12 @@ void mp_exch(mp_int *a, mp_int *b);
 /* shrink ram required for a bignum */
 int mp_shrink(mp_int *a);
 
+/* grow an int to a given size */
+int mp_grow(mp_int *a, int size);
+
+/* init to a given number of digits */
+int mp_init_size(mp_int *a, int size);
+
 /* ---> Basic Manipulations <--- */
 
 #define mp_iszero(a) (((a)->used == 0) ? 1 : 0)
@@ -139,12 +145,6 @@ void mp_set(mp_int *a, mp_digit b);
 /* set a 32-bit const */
 int mp_set_int(mp_int *a, unsigned long b);
 
-/* grow an int to a given size */
-int mp_grow(mp_int *a, int size);
-
-/* init to a given number of digits */
-int mp_init_size(mp_int *a, int size);
-
 /* copy, b = a */
 int mp_copy(mp_int *a, mp_int *b);