diff --git a/TODO b/TODO
deleted file mode 100644
index deffba1..0000000
--- a/TODO
+++ /dev/null
@@ -1,16 +0,0 @@
-things for book in order of importance...
-
-- Fix up pseudo-code [only] for combas that are not consistent with source
-- Start in chapter 3 [basics] and work up...
-   - re-write to prose [less abrupt]
-   - clean up pseudo code [spacing]
-   - more examples where appropriate and figures
-
-Goal:
-   - Get sync done by mid January [roughly 8-12 hours work]
-   - Finish ch3-6 by end of January [roughly 12-16 hours of work]
-   - Finish ch7-end by mid Feb [roughly 20-24 hours of work].
-
-Goal isn't "first edition" but merely cleaner to read.
-
-
diff --git a/bn.pdf b/bn.pdf
index 615ff4e..b8b8f8e 100644
Binary files a/bn.pdf and b/bn.pdf differ
diff --git a/bn.tex b/bn.tex
index 244bd6f..8b37766 100644
--- a/bn.tex
+++ b/bn.tex
@@ -49,7 +49,7 @@
 \begin{document}
 \frontmatter
 \pagestyle{empty}
-\title{LibTomMath User Manual \\ v0.35}
+\title{LibTomMath User Manual \\ v0.36}
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 This text, the library and the accompanying textbook are all hereby placed in the public domain.  This book has been 
diff --git a/bn_error.c b/bn_error.c
index 1546784..f58387d 100644
--- a/bn_error.c
+++ b/bn_error.c
@@ -41,3 +41,7 @@ char *mp_error_to_string(int code)
 }
 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_fast_mp_invmod.c b/bn_fast_mp_invmod.c
index acc8364..aa89dd7 100644
--- a/bn_fast_mp_invmod.c
+++ b/bn_fast_mp_invmod.c
@@ -142,3 +142,7 @@ LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
   return res;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_fast_mp_montgomery_reduce.c b/bn_fast_mp_montgomery_reduce.c
index 14f307f..518f2d4 100644
--- a/bn_fast_mp_montgomery_reduce.c
+++ b/bn_fast_mp_montgomery_reduce.c
@@ -166,3 +166,7 @@ int fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_fast_s_mp_mul_digs.c b/bn_fast_s_mp_mul_digs.c
index df3da26..a0ae08c 100644
--- a/bn_fast_s_mp_mul_digs.c
+++ b/bn_fast_s_mp_mul_digs.c
@@ -70,6 +70,7 @@ int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
       /* execute loop */
       for (iz = 0; iz < iy; ++iz) {
          _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
+
       }
 
       /* store term */
@@ -103,3 +104,7 @@ int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_fast_s_mp_mul_high_digs.c b/bn_fast_s_mp_mul_high_digs.c
index ee657f9..61d42dd 100644
--- a/bn_fast_s_mp_mul_high_digs.c
+++ b/bn_fast_s_mp_mul_high_digs.c
@@ -95,3 +95,7 @@ int fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_fast_s_mp_sqr.c b/bn_fast_s_mp_sqr.c
index 66a2942..7a5642c 100644
--- a/bn_fast_s_mp_sqr.c
+++ b/bn_fast_s_mp_sqr.c
@@ -108,3 +108,7 @@ int fast_s_mp_sqr (mp_int * a, mp_int * b)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_2expt.c b/bn_mp_2expt.c
index 45a6818..e24cae4 100644
--- a/bn_mp_2expt.c
+++ b/bn_mp_2expt.c
@@ -42,3 +42,7 @@ mp_2expt (mp_int * a, int b)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_abs.c b/bn_mp_abs.c
index 34f810f..f8d50d8 100644
--- a/bn_mp_abs.c
+++ b/bn_mp_abs.c
@@ -37,3 +37,7 @@ mp_abs (mp_int * a, mp_int * b)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_add.c b/bn_mp_add.c
index 554b7f7..bfd827b 100644
--- a/bn_mp_add.c
+++ b/bn_mp_add.c
@@ -47,3 +47,7 @@ int mp_add (mp_int * a, mp_int * b, mp_int * c)
 }
 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_add_d.c b/bn_mp_add_d.c
index bdd0280..0300fe0 100644
--- a/bn_mp_add_d.c
+++ b/bn_mp_add_d.c
@@ -103,3 +103,7 @@ mp_add_d (mp_int * a, mp_digit b, mp_int * c)
 }
 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_addmod.c b/bn_mp_addmod.c
index 13eb33f..d1d07d8 100644
--- a/bn_mp_addmod.c
+++ b/bn_mp_addmod.c
@@ -35,3 +35,7 @@ mp_addmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
   return res;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_and.c b/bn_mp_and.c
index 61dc386..cb8e54c 100644
--- a/bn_mp_and.c
+++ b/bn_mp_and.c
@@ -51,3 +51,7 @@ mp_and (mp_int * a, mp_int * b, mp_int * c)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_clamp.c b/bn_mp_clamp.c
index c172611..3fc6b4d 100644
--- a/bn_mp_clamp.c
+++ b/bn_mp_clamp.c
@@ -38,3 +38,7 @@ mp_clamp (mp_int * a)
   }
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_clear.c b/bn_mp_clear.c
index 5342648..46aa421 100644
--- a/bn_mp_clear.c
+++ b/bn_mp_clear.c
@@ -38,3 +38,7 @@ mp_clear (mp_int * a)
   }
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_clear_multi.c b/bn_mp_clear_multi.c
index 24cbe73..2fdf125 100644
--- a/bn_mp_clear_multi.c
+++ b/bn_mp_clear_multi.c
@@ -28,3 +28,7 @@ void mp_clear_multi(mp_int *mp, ...)
     va_end(args);
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_cmp.c b/bn_mp_cmp.c
index 583b5f8..2348066 100644
--- a/bn_mp_cmp.c
+++ b/bn_mp_cmp.c
@@ -37,3 +37,7 @@ mp_cmp (mp_int * a, mp_int * b)
   }
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_cmp_d.c b/bn_mp_cmp_d.c
index 882b1c9..3843911 100644
--- a/bn_mp_cmp_d.c
+++ b/bn_mp_cmp_d.c
@@ -38,3 +38,7 @@ int mp_cmp_d(mp_int * a, mp_digit b)
   }
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_cmp_mag.c b/bn_mp_cmp_mag.c
index a0f351c..45e9c6b 100644
--- a/bn_mp_cmp_mag.c
+++ b/bn_mp_cmp_mag.c
@@ -49,3 +49,7 @@ int mp_cmp_mag (mp_int * a, mp_int * b)
   return MP_EQ;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_cnt_lsb.c b/bn_mp_cnt_lsb.c
index 571f03f..03d694b 100644
--- a/bn_mp_cnt_lsb.c
+++ b/bn_mp_cnt_lsb.c
@@ -47,3 +47,7 @@ int mp_cnt_lsb(mp_int *a)
 }
 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_copy.c b/bn_mp_copy.c
index 183ec9b..701b489 100644
--- a/bn_mp_copy.c
+++ b/bn_mp_copy.c
@@ -62,3 +62,7 @@ mp_copy (mp_int * a, mp_int * b)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_count_bits.c b/bn_mp_count_bits.c
index f3f85ac..52a9907 100644
--- a/bn_mp_count_bits.c
+++ b/bn_mp_count_bits.c
@@ -39,3 +39,7 @@ mp_count_bits (mp_int * a)
   return r;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_div.c b/bn_mp_div.c
index 6b2b8f0..f031f53 100644
--- a/bn_mp_div.c
+++ b/bn_mp_div.c
@@ -286,3 +286,7 @@ LBL_Q:mp_clear (&q);
 #endif
 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_div_2.c b/bn_mp_div_2.c
index 5777997..4566580 100644
--- a/bn_mp_div_2.c
+++ b/bn_mp_div_2.c
@@ -62,3 +62,7 @@ int mp_div_2(mp_int * a, mp_int * b)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_div_2d.c b/bn_mp_div_2d.c
index cf103f2..4c9bbdd 100644
--- a/bn_mp_div_2d.c
+++ b/bn_mp_div_2d.c
@@ -91,3 +91,7 @@ int mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_div_3.c b/bn_mp_div_3.c
index 7cbafc1..f386109 100644
--- a/bn_mp_div_3.c
+++ b/bn_mp_div_3.c
@@ -73,3 +73,7 @@ mp_div_3 (mp_int * a, mp_int *c, mp_digit * d)
 }
 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_div_d.c b/bn_mp_div_d.c
index 9b58aa6..e93bfda 100644
--- a/bn_mp_div_d.c
+++ b/bn_mp_div_d.c
@@ -104,3 +104,7 @@ int mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
 }
 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_dr_is_modulus.c b/bn_mp_dr_is_modulus.c
index 5ef78a3..6dfd9b6 100644
--- a/bn_mp_dr_is_modulus.c
+++ b/bn_mp_dr_is_modulus.c
@@ -37,3 +37,7 @@ int mp_dr_is_modulus(mp_int *a)
 }
 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_dr_reduce.c b/bn_mp_dr_reduce.c
index 9bb7ad7..988c08e 100644
--- a/bn_mp_dr_reduce.c
+++ b/bn_mp_dr_reduce.c
@@ -88,3 +88,7 @@ top:
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_dr_setup.c b/bn_mp_dr_setup.c
index 029d310..e17c052 100644
--- a/bn_mp_dr_setup.c
+++ b/bn_mp_dr_setup.c
@@ -26,3 +26,7 @@ void mp_dr_setup(mp_int *a, mp_digit *d)
 }
 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_exch.c b/bn_mp_exch.c
index 0ef485a..f1cbb1d 100644
--- a/bn_mp_exch.c
+++ b/bn_mp_exch.c
@@ -28,3 +28,7 @@ mp_exch (mp_int * a, mp_int * b)
   *b = t;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_expt_d.c b/bn_mp_expt_d.c
index fdb8bd9..908bea1 100644
--- a/bn_mp_expt_d.c
+++ b/bn_mp_expt_d.c
@@ -51,3 +51,7 @@ int mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_exptmod.c b/bn_mp_exptmod.c
index 7c4e2f8..2514e2c 100644
--- a/bn_mp_exptmod.c
+++ b/bn_mp_exptmod.c
@@ -66,7 +66,7 @@ int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
   }
 
 /* modified diminished radix reduction */
-#if defined(BN_MP_REDUCE_IS_2K_L_C) && defined(BN_MP_REDUCE_2K_L_C)
+#if defined(BN_MP_REDUCE_IS_2K_L_C) && defined(BN_MP_REDUCE_2K_L_C) && defined(BN_S_MP_EXPTMOD_C)
   if (mp_reduce_is_2k_l(P) == MP_YES) {
      return s_mp_exptmod(G, X, P, Y, 1);
   }
@@ -106,3 +106,7 @@ int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
 }
 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_exptmod_fast.c b/bn_mp_exptmod_fast.c
index 82be9ac..7073dee 100644
--- a/bn_mp_exptmod_fast.c
+++ b/bn_mp_exptmod_fast.c
@@ -315,3 +315,7 @@ LBL_M:
 }
 #endif
 
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_exteuclid.c b/bn_mp_exteuclid.c
index c4ebab4..9a1f16c 100644
--- a/bn_mp_exteuclid.c
+++ b/bn_mp_exteuclid.c
@@ -76,3 +76,7 @@ _ERR: mp_clear_multi(&u1, &u2, &u3, &v1, &v2, &v3, &t1, &t2, &t3, &q, &tmp, NULL
    return err;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_fread.c b/bn_mp_fread.c
index 293df3f..5fa23f9 100644
--- a/bn_mp_fread.c
+++ b/bn_mp_fread.c
@@ -61,3 +61,7 @@ int mp_fread(mp_int *a, int radix, FILE *stream)
 }
 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_fwrite.c b/bn_mp_fwrite.c
index 8fa3129..e70e155 100644
--- a/bn_mp_fwrite.c
+++ b/bn_mp_fwrite.c
@@ -46,3 +46,7 @@ int mp_fwrite(mp_int *a, int radix, FILE *stream)
 }
 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_gcd.c b/bn_mp_gcd.c
index 6265df1..db03dbb 100644
--- a/bn_mp_gcd.c
+++ b/bn_mp_gcd.c
@@ -107,3 +107,7 @@ LBL_U:mp_clear (&v);
   return res;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_get_int.c b/bn_mp_get_int.c
index 034467b..25942db 100644
--- a/bn_mp_get_int.c
+++ b/bn_mp_get_int.c
@@ -39,3 +39,7 @@ unsigned long mp_get_int(mp_int * a)
   return res & 0xFFFFFFFFUL;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_grow.c b/bn_mp_grow.c
index 12a78a8..af987a3 100644
--- a/bn_mp_grow.c
+++ b/bn_mp_grow.c
@@ -51,3 +51,7 @@ int mp_grow (mp_int * a, int size)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_init.c b/bn_mp_init.c
index 9d70554..6e935de 100644
--- a/bn_mp_init.c
+++ b/bn_mp_init.c
@@ -40,3 +40,7 @@ int mp_init (mp_int * a)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_init_copy.c b/bn_mp_init_copy.c
index b1b0fa2..1d00607 100644
--- a/bn_mp_init_copy.c
+++ b/bn_mp_init_copy.c
@@ -26,3 +26,7 @@ int mp_init_copy (mp_int * a, mp_int * b)
   return mp_copy (b, a);
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_init_multi.c b/bn_mp_init_multi.c
index 8cb123a..de23432 100644
--- a/bn_mp_init_multi.c
+++ b/bn_mp_init_multi.c
@@ -53,3 +53,7 @@ int mp_init_multi(mp_int *mp, ...)
 }
 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_init_set.c b/bn_mp_init_set.c
index 0251e61..a7380a8 100644
--- a/bn_mp_init_set.c
+++ b/bn_mp_init_set.c
@@ -26,3 +26,7 @@ int mp_init_set (mp_int * a, mp_digit b)
   return err;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_init_set_int.c b/bn_mp_init_set_int.c
index f59fd19..793ad86 100644
--- a/bn_mp_init_set_int.c
+++ b/bn_mp_init_set_int.c
@@ -25,3 +25,7 @@ int mp_init_set_int (mp_int * a, unsigned long b)
   return mp_set_int(a, b);
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_init_size.c b/bn_mp_init_size.c
index 845ce2c..3e5724a 100644
--- a/bn_mp_init_size.c
+++ b/bn_mp_init_size.c
@@ -42,3 +42,7 @@ int mp_init_size (mp_int * a, int size)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_invmod.c b/bn_mp_invmod.c
index 46118ad..d4e4e5f 100644
--- a/bn_mp_invmod.c
+++ b/bn_mp_invmod.c
@@ -37,3 +37,7 @@ int mp_invmod (mp_int * a, mp_int * b, mp_int * c)
   return MP_VAL;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_invmod_slow.c b/bn_mp_invmod_slow.c
index c048655..325282f 100644
--- a/bn_mp_invmod_slow.c
+++ b/bn_mp_invmod_slow.c
@@ -169,3 +169,7 @@ LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
   return res;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_is_square.c b/bn_mp_is_square.c
index 969d237..42be22d 100644
--- a/bn_mp_is_square.c
+++ b/bn_mp_is_square.c
@@ -103,3 +103,7 @@ ERR:mp_clear(&t);
   return res;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_jacobi.c b/bn_mp_jacobi.c
index 74cbbf3..e21ee4b 100644
--- a/bn_mp_jacobi.c
+++ b/bn_mp_jacobi.c
@@ -99,3 +99,7 @@ LBL_A1:mp_clear (&a1);
   return res;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_karatsuba_mul.c b/bn_mp_karatsuba_mul.c
index daa78c7..fe49694 100644
--- a/bn_mp_karatsuba_mul.c
+++ b/bn_mp_karatsuba_mul.c
@@ -26,12 +26,12 @@
  * b = b1 * B**n + b0
  *
  * Then, a * b => 
-   a1b1 * B**2n + ((a1 - a0)(b1 - b0) + a0b0 + a1b1) * B + a0b0
+   a1b1 * B**2n + ((a1 + a0)(b1 + b0) - (a0b0 + a1b1)) * B**n + a0b0
  *
  * Note that a1b1 and a0b0 are used twice and only need to be 
  * computed once.  So in total three half size (half # of 
  * digit) multiplications are performed, a0b0, a1b1 and 
- * (a1-b1)(a0-b0)
+ * (a1+a0)(b1+b0)
  *
  * Note that a multiplication of half the digits requires
  * 1/4th the number of single precision multiplications so in 
@@ -122,19 +122,19 @@ int mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
   if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
     goto X1Y1;          /* x1y1 = x1*y1 */
 
-  /* now calc x1-x0 and y1-y0 */
-  if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
+  /* now calc x1+x0 and y1+y0 */
+  if (s_mp_add (&x1, &x0, &t1) != MP_OKAY)
     goto X1Y1;          /* t1 = x1 - x0 */
-  if (mp_sub (&y1, &y0, &x0) != MP_OKAY)
+  if (s_mp_add (&y1, &y0, &x0) != MP_OKAY)
     goto X1Y1;          /* t2 = y1 - y0 */
   if (mp_mul (&t1, &x0, &t1) != MP_OKAY)
-    goto X1Y1;          /* t1 = (x1 - x0) * (y1 - y0) */
+    goto X1Y1;          /* t1 = (x1 + x0) * (y1 + y0) */
 
   /* add x0y0 */
   if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY)
     goto X1Y1;          /* t2 = x0y0 + x1y1 */
-  if (mp_sub (&x0, &t1, &t1) != MP_OKAY)
-    goto X1Y1;          /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
+  if (s_mp_sub (&t1, &x0, &t1) != MP_OKAY)
+    goto X1Y1;          /* t1 = (x1+x0)*(y1+y0) - (x1y1 + x0y0) */
 
   /* shift by B */
   if (mp_lshd (&t1, B) != MP_OKAY)
@@ -161,3 +161,7 @@ ERR:
   return err;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_karatsuba_sqr.c b/bn_mp_karatsuba_sqr.c
index 315ceab..ff8a1f6 100644
--- a/bn_mp_karatsuba_sqr.c
+++ b/bn_mp_karatsuba_sqr.c
@@ -80,8 +80,8 @@ int mp_karatsuba_sqr (mp_int * a, mp_int * b)
   if (mp_sqr (&x1, &x1x1) != MP_OKAY)
     goto X1X1;           /* x1x1 = x1*x1 */
 
-  /* now calc (x1-x0)**2 */
-  if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
+  /* now calc (x1+x0)**2 */
+  if (s_mp_add (&x1, &x0, &t1) != MP_OKAY)
     goto X1X1;           /* t1 = x1 - x0 */
   if (mp_sqr (&t1, &t1) != MP_OKAY)
     goto X1X1;           /* t1 = (x1 - x0) * (x1 - x0) */
@@ -89,8 +89,8 @@ int mp_karatsuba_sqr (mp_int * a, mp_int * b)
   /* add x0y0 */
   if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
     goto X1X1;           /* t2 = x0x0 + x1x1 */
-  if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
-    goto X1X1;           /* t1 = x0x0 + x1x1 - (x1-x0)*(x1-x0) */
+  if (s_mp_sub (&t1, &t2, &t1) != MP_OKAY)
+    goto X1X1;           /* t1 = (x1+x0)**2 - (x0x0 + x1x1) */
 
   /* shift by B */
   if (mp_lshd (&t1, B) != MP_OKAY)
@@ -115,3 +115,7 @@ ERR:
   return err;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_lcm.c b/bn_mp_lcm.c
index 8e3a759..66c2c8e 100644
--- a/bn_mp_lcm.c
+++ b/bn_mp_lcm.c
@@ -54,3 +54,7 @@ LBL_T:
   return res;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_lshd.c b/bn_mp_lshd.c
index 398b648..79e1e21 100644
--- a/bn_mp_lshd.c
+++ b/bn_mp_lshd.c
@@ -61,3 +61,7 @@ int mp_lshd (mp_int * a, int b)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_mod.c b/bn_mp_mod.c
index 75779bb..364b1f9 100644
--- a/bn_mp_mod.c
+++ b/bn_mp_mod.c
@@ -42,3 +42,7 @@ mp_mod (mp_int * a, mp_int * b, mp_int * c)
   return res;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_mod_2d.c b/bn_mp_mod_2d.c
index 589e4ba..f191008 100644
--- a/bn_mp_mod_2d.c
+++ b/bn_mp_mod_2d.c
@@ -49,3 +49,7 @@ mp_mod_2d (mp_int * a, int b, mp_int * c)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_mod_d.c b/bn_mp_mod_d.c
index 8a2ad24..5ac6fff 100644
--- a/bn_mp_mod_d.c
+++ b/bn_mp_mod_d.c
@@ -21,3 +21,7 @@ mp_mod_d (mp_int * a, mp_digit b, mp_digit * c)
   return mp_div_d(a, b, NULL, c);
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_montgomery_calc_normalization.c b/bn_mp_montgomery_calc_normalization.c
index e2efc34..a8c4582 100644
--- a/bn_mp_montgomery_calc_normalization.c
+++ b/bn_mp_montgomery_calc_normalization.c
@@ -53,3 +53,7 @@ int mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_montgomery_reduce.c b/bn_mp_montgomery_reduce.c
index 3095fa7..fedfbb7 100644
--- a/bn_mp_montgomery_reduce.c
+++ b/bn_mp_montgomery_reduce.c
@@ -112,3 +112,7 @@ mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_montgomery_setup.c b/bn_mp_montgomery_setup.c
index 9dfc087..28a3716 100644
--- a/bn_mp_montgomery_setup.c
+++ b/bn_mp_montgomery_setup.c
@@ -53,3 +53,7 @@ mp_montgomery_setup (mp_int * n, mp_digit * rho)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_mul.c b/bn_mp_mul.c
index f9cfa09..e13e4c9 100644
--- a/bn_mp_mul.c
+++ b/bn_mp_mul.c
@@ -60,3 +60,7 @@ int mp_mul (mp_int * a, mp_int * b, mp_int * c)
   return res;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_mul_2.c b/bn_mp_mul_2.c
index 6936681..65416a2 100644
--- a/bn_mp_mul_2.c
+++ b/bn_mp_mul_2.c
@@ -76,3 +76,7 @@ int mp_mul_2(mp_int * a, mp_int * b)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_mul_2d.c b/bn_mp_mul_2d.c
index 04cb8dd..671b31e 100644
--- a/bn_mp_mul_2d.c
+++ b/bn_mp_mul_2d.c
@@ -79,3 +79,7 @@ int mp_mul_2d (mp_int * a, int b, mp_int * c)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_mul_d.c b/bn_mp_mul_d.c
index 9e11eef..7944d9c 100644
--- a/bn_mp_mul_d.c
+++ b/bn_mp_mul_d.c
@@ -73,3 +73,7 @@ mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_mulmod.c b/bn_mp_mulmod.c
index d34e90a..83b3449 100644
--- a/bn_mp_mulmod.c
+++ b/bn_mp_mulmod.c
@@ -16,8 +16,7 @@
  */
 
 /* d = a * b (mod c) */
-int
-mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+int mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
 {
   int     res;
   mp_int  t;
@@ -35,3 +34,7 @@ mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
   return res;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_n_root.c b/bn_mp_n_root.c
index 7b11aa2..fef65e3 100644
--- a/bn_mp_n_root.c
+++ b/bn_mp_n_root.c
@@ -126,3 +126,7 @@ LBL_T1:mp_clear (&t1);
   return res;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_neg.c b/bn_mp_neg.c
index 159cd74..587fe60 100644
--- a/bn_mp_neg.c
+++ b/bn_mp_neg.c
@@ -34,3 +34,7 @@ int mp_neg (mp_int * a, mp_int * b)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_or.c b/bn_mp_or.c
index dccee7e..1655e39 100644
--- a/bn_mp_or.c
+++ b/bn_mp_or.c
@@ -44,3 +44,7 @@ int mp_or (mp_int * a, mp_int * b, mp_int * c)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_prime_fermat.c b/bn_mp_prime_fermat.c
index fd74dbe..59bcb86 100644
--- a/bn_mp_prime_fermat.c
+++ b/bn_mp_prime_fermat.c
@@ -56,3 +56,7 @@ LBL_T:mp_clear (&t);
   return err;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_prime_is_divisible.c b/bn_mp_prime_is_divisible.c
index f85fe7c..1d30653 100644
--- a/bn_mp_prime_is_divisible.c
+++ b/bn_mp_prime_is_divisible.c
@@ -44,3 +44,7 @@ int mp_prime_is_divisible (mp_int * a, int *result)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_prime_is_prime.c b/bn_mp_prime_is_prime.c
index 188053a..d45bf58 100644
--- a/bn_mp_prime_is_prime.c
+++ b/bn_mp_prime_is_prime.c
@@ -77,3 +77,7 @@ LBL_B:mp_clear (&b);
   return err;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_prime_miller_rabin.c b/bn_mp_prime_miller_rabin.c
index 758a2c3..fbe055b 100644
--- a/bn_mp_prime_miller_rabin.c
+++ b/bn_mp_prime_miller_rabin.c
@@ -97,3 +97,7 @@ LBL_N1:mp_clear (&n1);
   return err;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_prime_next_prime.c b/bn_mp_prime_next_prime.c
index 24f93c4..7b2be90 100644
--- a/bn_mp_prime_next_prime.c
+++ b/bn_mp_prime_next_prime.c
@@ -164,3 +164,7 @@ LBL_ERR:
 }
 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_prime_rabin_miller_trials.c b/bn_mp_prime_rabin_miller_trials.c
index d1d0867..4bcf74e 100644
--- a/bn_mp_prime_rabin_miller_trials.c
+++ b/bn_mp_prime_rabin_miller_trials.c
@@ -46,3 +46,7 @@ int mp_prime_rabin_miller_trials(int size)
 
 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_prime_random_ex.c b/bn_mp_prime_random_ex.c
index 78c0583..98f3dbb 100644
--- a/bn_mp_prime_random_ex.c
+++ b/bn_mp_prime_random_ex.c
@@ -62,10 +62,8 @@ int mp_prime_random_ex(mp_int *a, int t, int size, int flags, ltm_prime_callback
    maskOR_msb        = 0;
    maskOR_msb_offset = ((size & 7) == 1) ? 1 : 0;
    if (flags & LTM_PRIME_2MSB_ON) {
-      maskOR_msb     |= 1 << ((size - 2) & 7);
-   } else if (flags & LTM_PRIME_2MSB_OFF) {
-      maskAND        &= ~(1 << ((size - 2) & 7));
-   } 
+      maskOR_msb       |= 0x80 >> ((9 - size) & 7);
+   }  
 
    /* get the maskOR_lsb */
    maskOR_lsb         = 1;
@@ -121,3 +119,7 @@ error:
 
 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_radix_size.c b/bn_mp_radix_size.c
index 3d423ba..346ec41 100644
--- a/bn_mp_radix_size.c
+++ b/bn_mp_radix_size.c
@@ -72,3 +72,7 @@ int mp_radix_size (mp_int * a, int radix, int *size)
 }
 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_radix_smap.c b/bn_mp_radix_smap.c
index bc7517d..7a8aa49 100644
--- a/bn_mp_radix_smap.c
+++ b/bn_mp_radix_smap.c
@@ -18,3 +18,7 @@
 /* chars used in radix conversions */
 const char *mp_s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_rand.c b/bn_mp_rand.c
index 0dc7019..6dc2abe 100644
--- a/bn_mp_rand.c
+++ b/bn_mp_rand.c
@@ -49,3 +49,7 @@ mp_rand (mp_int * a, int digits)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_read_radix.c b/bn_mp_read_radix.c
index 1ec3937..25aed05 100644
--- a/bn_mp_read_radix.c
+++ b/bn_mp_read_radix.c
@@ -76,3 +76,7 @@ int mp_read_radix (mp_int * a, const char *str, int radix)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_read_signed_bin.c b/bn_mp_read_signed_bin.c
index 814d6c1..0b913fd 100644
--- a/bn_mp_read_signed_bin.c
+++ b/bn_mp_read_signed_bin.c
@@ -16,8 +16,7 @@
  */
 
 /* read signed bin, big endian, first byte is 0==positive or 1==negative */
-int
-mp_read_signed_bin (mp_int * a, unsigned char *b, int c)
+int mp_read_signed_bin (mp_int * a, const unsigned char *b, int c)
 {
   int     res;
 
@@ -36,3 +35,7 @@ mp_read_signed_bin (mp_int * a, unsigned char *b, int c)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_read_unsigned_bin.c b/bn_mp_read_unsigned_bin.c
index 946457d..84b996f 100644
--- a/bn_mp_read_unsigned_bin.c
+++ b/bn_mp_read_unsigned_bin.c
@@ -16,8 +16,7 @@
  */
 
 /* reads a unsigned char array, assumes the msb is stored first [big endian] */
-int
-mp_read_unsigned_bin (mp_int * a, unsigned char *b, int c)
+int mp_read_unsigned_bin (mp_int * a, const unsigned char *b, int c)
 {
   int     res;
 
@@ -50,3 +49,7 @@ mp_read_unsigned_bin (mp_int * a, unsigned char *b, int c)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_reduce.c b/bn_mp_reduce.c
index d746445..aa18eab 100644
--- a/bn_mp_reduce.c
+++ b/bn_mp_reduce.c
@@ -94,3 +94,7 @@ CLEANUP:
   return res;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_reduce_2k.c b/bn_mp_reduce_2k.c
index 28c3a00..a23fd20 100644
--- a/bn_mp_reduce_2k.c
+++ b/bn_mp_reduce_2k.c
@@ -55,3 +55,7 @@ ERR:
 }
 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_reduce_2k_l.c b/bn_mp_reduce_2k_l.c
index 1d7e1f0..638caf4 100644
--- a/bn_mp_reduce_2k_l.c
+++ b/bn_mp_reduce_2k_l.c
@@ -56,3 +56,7 @@ ERR:
 }
 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_reduce_2k_setup.c b/bn_mp_reduce_2k_setup.c
index 585e1b7..30b6ff9 100644
--- a/bn_mp_reduce_2k_setup.c
+++ b/bn_mp_reduce_2k_setup.c
@@ -41,3 +41,7 @@ int mp_reduce_2k_setup(mp_int *a, mp_digit *d)
    return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_reduce_2k_setup_l.c b/bn_mp_reduce_2k_setup_l.c
index 810a456..8e21c0e 100644
--- a/bn_mp_reduce_2k_setup_l.c
+++ b/bn_mp_reduce_2k_setup_l.c
@@ -38,3 +38,7 @@ ERR:
    return res;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_reduce_is_2k.c b/bn_mp_reduce_is_2k.c
index 0fb8384..c34dcf1 100644
--- a/bn_mp_reduce_is_2k.c
+++ b/bn_mp_reduce_is_2k.c
@@ -46,3 +46,7 @@ int mp_reduce_is_2k(mp_int *a)
 }
 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_reduce_is_2k_l.c b/bn_mp_reduce_is_2k_l.c
index ceba0ed..e3a7fae 100644
--- a/bn_mp_reduce_is_2k_l.c
+++ b/bn_mp_reduce_is_2k_l.c
@@ -38,3 +38,7 @@ int mp_reduce_is_2k_l(mp_int *a)
 }
 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_reduce_setup.c b/bn_mp_reduce_setup.c
index 99f158a..46ae229 100644
--- a/bn_mp_reduce_setup.c
+++ b/bn_mp_reduce_setup.c
@@ -28,3 +28,7 @@ int mp_reduce_setup (mp_int * a, mp_int * b)
   return mp_div (a, b, a, NULL);
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_rshd.c b/bn_mp_rshd.c
index 913dda6..37ff66d 100644
--- a/bn_mp_rshd.c
+++ b/bn_mp_rshd.c
@@ -66,3 +66,7 @@ void mp_rshd (mp_int * a, int b)
   a->used -= b;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_set.c b/bn_mp_set.c
index 078fd5f..eec7bfb 100644
--- a/bn_mp_set.c
+++ b/bn_mp_set.c
@@ -23,3 +23,7 @@ void mp_set (mp_int * a, mp_digit b)
   a->used  = (a->dp[0] != 0) ? 1 : 0;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_set_int.c b/bn_mp_set_int.c
index bd47136..202c70c 100644
--- a/bn_mp_set_int.c
+++ b/bn_mp_set_int.c
@@ -42,3 +42,7 @@ int mp_set_int (mp_int * a, unsigned long b)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_shrink.c b/bn_mp_shrink.c
index b31f9d2..b6cda9a 100644
--- a/bn_mp_shrink.c
+++ b/bn_mp_shrink.c
@@ -29,3 +29,7 @@ int mp_shrink (mp_int * a)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_signed_bin_size.c b/bn_mp_signed_bin_size.c
index 30048cb..178187a 100644
--- a/bn_mp_signed_bin_size.c
+++ b/bn_mp_signed_bin_size.c
@@ -21,3 +21,7 @@ int mp_signed_bin_size (mp_int * a)
   return 1 + mp_unsigned_bin_size (a);
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_sqr.c b/bn_mp_sqr.c
index b1fdb57..675a87b 100644
--- a/bn_mp_sqr.c
+++ b/bn_mp_sqr.c
@@ -52,3 +52,7 @@ if (a->used >= KARATSUBA_SQR_CUTOFF) {
   return res;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_sqrmod.c b/bn_mp_sqrmod.c
index 1923be4..3cf6ab7 100644
--- a/bn_mp_sqrmod.c
+++ b/bn_mp_sqrmod.c
@@ -35,3 +35,7 @@ mp_sqrmod (mp_int * a, mp_int * b, mp_int * c)
   return res;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_sqrt.c b/bn_mp_sqrt.c
index 76cec87..7785737 100644
--- a/bn_mp_sqrt.c
+++ b/bn_mp_sqrt.c
@@ -75,3 +75,7 @@ E2: mp_clear(&t1);
 }
 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_sub.c b/bn_mp_sub.c
index 97495f4..4714aaf 100644
--- a/bn_mp_sub.c
+++ b/bn_mp_sub.c
@@ -53,3 +53,7 @@ mp_sub (mp_int * a, mp_int * b, mp_int * c)
 }
 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_sub_d.c b/bn_mp_sub_d.c
index 4923dde..1bba3d0 100644
--- a/bn_mp_sub_d.c
+++ b/bn_mp_sub_d.c
@@ -83,3 +83,7 @@ mp_sub_d (mp_int * a, mp_digit b, mp_int * c)
 }
 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_submod.c b/bn_mp_submod.c
index b999c85..79fa787 100644
--- a/bn_mp_submod.c
+++ b/bn_mp_submod.c
@@ -36,3 +36,7 @@ mp_submod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
   return res;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_to_signed_bin.c b/bn_mp_to_signed_bin.c
index b0a597e..6365659 100644
--- a/bn_mp_to_signed_bin.c
+++ b/bn_mp_to_signed_bin.c
@@ -27,3 +27,7 @@ int mp_to_signed_bin (mp_int * a, unsigned char *b)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_to_signed_bin_n.c b/bn_mp_to_signed_bin_n.c
index 0f765ee..bea0762 100644
--- a/bn_mp_to_signed_bin_n.c
+++ b/bn_mp_to_signed_bin_n.c
@@ -25,3 +25,7 @@ int mp_to_signed_bin_n (mp_int * a, unsigned char *b, unsigned long *outlen)
    return mp_to_signed_bin(a, b);
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_to_unsigned_bin.c b/bn_mp_to_unsigned_bin.c
index 000967e..18e3d97 100644
--- a/bn_mp_to_unsigned_bin.c
+++ b/bn_mp_to_unsigned_bin.c
@@ -42,3 +42,7 @@ int mp_to_unsigned_bin (mp_int * a, unsigned char *b)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_to_unsigned_bin_n.c b/bn_mp_to_unsigned_bin_n.c
index d0256b4..4a1778b 100644
--- a/bn_mp_to_unsigned_bin_n.c
+++ b/bn_mp_to_unsigned_bin_n.c
@@ -25,3 +25,7 @@ int mp_to_unsigned_bin_n (mp_int * a, unsigned char *b, unsigned long *outlen)
    return mp_to_unsigned_bin(a, b);
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_toom_mul.c b/bn_mp_toom_mul.c
index 125331b..69de0da 100644
--- a/bn_mp_toom_mul.c
+++ b/bn_mp_toom_mul.c
@@ -278,3 +278,7 @@ ERR:
 }     
      
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_toom_sqr.c b/bn_mp_toom_sqr.c
index 8c46fea..871c75f 100644
--- a/bn_mp_toom_sqr.c
+++ b/bn_mp_toom_sqr.c
@@ -220,3 +220,7 @@ ERR:
 }
 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_toradix.c b/bn_mp_toradix.c
index a206d5e..4caeccc 100644
--- a/bn_mp_toradix.c
+++ b/bn_mp_toradix.c
@@ -69,3 +69,7 @@ int mp_toradix (mp_int * a, char *str, int radix)
 }
 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_toradix_n.c b/bn_mp_toradix_n.c
index 7d43558..48456c3 100644
--- a/bn_mp_toradix_n.c
+++ b/bn_mp_toradix_n.c
@@ -83,3 +83,7 @@ int mp_toradix_n(mp_int * a, char *str, int radix, int maxlen)
 }
 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_unsigned_bin_size.c b/bn_mp_unsigned_bin_size.c
index 091f406..21be05c 100644
--- a/bn_mp_unsigned_bin_size.c
+++ b/bn_mp_unsigned_bin_size.c
@@ -22,3 +22,7 @@ int mp_unsigned_bin_size (mp_int * a)
   return (size / 8 + ((size & 7) != 0 ? 1 : 0));
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_xor.c b/bn_mp_xor.c
index de7e62c..56becb4 100644
--- a/bn_mp_xor.c
+++ b/bn_mp_xor.c
@@ -45,3 +45,7 @@ mp_xor (mp_int * a, mp_int * b, mp_int * c)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_mp_zero.c b/bn_mp_zero.c
index c8d8907..7e18317 100644
--- a/bn_mp_zero.c
+++ b/bn_mp_zero.c
@@ -30,3 +30,7 @@ void mp_zero (mp_int * a)
   }
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_prime_tab.c b/bn_prime_tab.c
index 14306c2..ce130ef 100644
--- a/bn_prime_tab.c
+++ b/bn_prime_tab.c
@@ -55,3 +55,7 @@ const mp_digit ltm_prime_tab[] = {
 #endif
 };
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_reverse.c b/bn_reverse.c
index 851a6e8..bcd0649 100644
--- a/bn_reverse.c
+++ b/bn_reverse.c
@@ -33,3 +33,7 @@ bn_reverse (unsigned char *s, int len)
   }
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_s_mp_add.c b/bn_s_mp_add.c
index 2b378ae..6976e62 100644
--- a/bn_s_mp_add.c
+++ b/bn_s_mp_add.c
@@ -103,3 +103,7 @@ s_mp_add (mp_int * a, mp_int * b, mp_int * c)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_s_mp_exptmod.c b/bn_s_mp_exptmod.c
index 597e877..12d981b 100644
--- a/bn_s_mp_exptmod.c
+++ b/bn_s_mp_exptmod.c
@@ -14,7 +14,6 @@
  *
  * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
  */
-
 #ifdef MP_LOW_MEM
    #define TAB_SIZE 32
 #else
@@ -247,3 +246,7 @@ LBL_M:
   return err;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_s_mp_mul_digs.c b/bn_s_mp_mul_digs.c
index b40ae2e..a925e12 100644
--- a/bn_s_mp_mul_digs.c
+++ b/bn_s_mp_mul_digs.c
@@ -84,3 +84,7 @@ int s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_s_mp_mul_high_digs.c b/bn_s_mp_mul_high_digs.c
index a060248..e9505c8 100644
--- a/bn_s_mp_mul_high_digs.c
+++ b/bn_s_mp_mul_high_digs.c
@@ -75,3 +75,7 @@ s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_s_mp_sqr.c b/bn_s_mp_sqr.c
index 9cdb563..4648296 100644
--- a/bn_s_mp_sqr.c
+++ b/bn_s_mp_sqr.c
@@ -78,3 +78,7 @@ int s_mp_sqr (mp_int * a, mp_int * b)
   return MP_OKAY;
 }
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bn_s_mp_sub.c b/bn_s_mp_sub.c
index 5b7aef9..4f7d47d 100644
--- a/bn_s_mp_sub.c
+++ b/bn_s_mp_sub.c
@@ -83,3 +83,7 @@ s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
 }
 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/bncore.c b/bncore.c
index 82e3132..fa4e64e 100644
--- a/bncore.c
+++ b/bncore.c
@@ -20,13 +20,17 @@
  CPU                    /Compiler     /MUL CUTOFF/SQR CUTOFF
 -------------------------------------------------------------
  Intel P4 Northwood     /GCC v3.4.1   /        88/       128/LTM 0.32 ;-)
- AMD Athlon64           /GCC v3.4.4   /        74/       124/LTM 0.34
+ AMD Athlon64           /GCC v3.4.4   /        80/       120/LTM 0.35
  
 */
 
-int     KARATSUBA_MUL_CUTOFF = 74,      /* Min. number of digits before Karatsuba multiplication is used. */
-        KARATSUBA_SQR_CUTOFF = 124,     /* Min. number of digits before Karatsuba squaring is used. */
+int     KARATSUBA_MUL_CUTOFF = 80,      /* Min. number of digits before Karatsuba multiplication is used. */
+        KARATSUBA_SQR_CUTOFF = 120,     /* Min. number of digits before Karatsuba squaring is used. */
         
         TOOM_MUL_CUTOFF      = 350,      /* no optimal values of these are known yet so set em high */
         TOOM_SQR_CUTOFF      = 400; 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/booker.pl b/booker.pl
index 5c77e53..f419ab4 100644
--- a/booker.pl
+++ b/booker.pl
@@ -89,6 +89,9 @@ while (<IN>) {
       
       $inline = 0;
       while (<SRC>) {
+      next if ($_ =~ /\$Source/);
+      next if ($_ =~ /\$Revision/);
+      next if ($_ =~ /\$Date/);
          $text[$line++] = $_;
          ++$inline;
          chomp($_);
diff --git a/changes.txt b/changes.txt
index 99e40c1..4f27d63 100644
--- a/changes.txt
+++ b/changes.txt
@@ -1,3 +1,15 @@
+August 1st, 2005
+v0.36  -- LTM_PRIME_2MSB_ON was fixed and the "OFF" flag was removed.
+       -- [Peter LaDow] found a typo in the XREALLOC macro
+       -- [Peter LaDow] pointed out that mp_read_(un)signed_bin should have "const" on the input
+       -- Ported LTC patch to fix the prime_random_ex() function to get the bitsize correct [and the maskOR flags]
+       -- Kevin Kenny pointed out a stray //
+       -- David Hulton pointed out a typo in the textbook [mp_montgomery_setup() pseudo-code]
+       -- Neal Hamilton (Elliptic Semiconductor) pointed out that my Karatsuba notation was backwards and that I could use 
+          unsigned operations in the routine.  
+       -- Paul Schmidt pointed out a linking error in mp_exptmod() when BN_S_MP_EXPTMOD_C is undefined (and another for read_radix)
+       -- Updated makefiles to be way more flexible
+
 March 12th, 2005
 v0.35  -- Stupid XOR function missing line again... oops.
        -- Fixed bug in invmod not handling negative inputs correctly [Wolfgang Ehrhardt]
diff --git a/demo/demo.c b/demo/demo.c
index 0a6115a..b406845 100644
--- a/demo/demo.c
+++ b/demo/demo.c
@@ -389,8 +389,8 @@ printf("compare no compare!\n"); exit(EXIT_FAILURE); }
       sub_d_n = 0;
 
    /* force KARA and TOOM to enable despite cutoffs */
-   KARATSUBA_SQR_CUTOFF = KARATSUBA_MUL_CUTOFF = 110;
-   TOOM_SQR_CUTOFF = TOOM_MUL_CUTOFF = 150;
+   KARATSUBA_SQR_CUTOFF = KARATSUBA_MUL_CUTOFF = 8;
+   TOOM_SQR_CUTOFF = TOOM_MUL_CUTOFF = 16;
 
    for (;;) {
       /* randomly clear and re-init one variable, this has the affect of triming the alloc space */
@@ -734,3 +734,7 @@ printf("compare no compare!\n"); exit(EXIT_FAILURE); }
    }
    return 0;
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/demo/timing.c b/demo/timing.c
index bb3be52..12f30e3 100644
--- a/demo/timing.c
+++ b/demo/timing.c
@@ -313,3 +313,7 @@ int main(void)
 
    return 0;
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/etc/2kprime.c b/etc/2kprime.c
index d48b83e..9450283 100644
--- a/etc/2kprime.c
+++ b/etc/2kprime.c
@@ -78,3 +78,7 @@ int main(void)
             
             
           
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/etc/drprime.c b/etc/drprime.c
index 0ab8ea6..c7d253f 100644
--- a/etc/drprime.c
+++ b/etc/drprime.c
@@ -58,3 +58,7 @@ int main(void)
    return 0;
 }
 
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/etc/makefile.icc b/etc/makefile.icc
index 0a50728..8a1ffff 100644
--- a/etc/makefile.icc
+++ b/etc/makefile.icc
@@ -16,7 +16,7 @@ CFLAGS += -I../
 #   B - Blend of P4 and PM [mobile]
 #
 # Default to just generic max opts
-CFLAGS += -O3 -xN -ip
+CFLAGS += -O3 -xP -ip
 
 # default lib name (requires install with root)
 # LIBNAME=-ltommath
diff --git a/etc/mersenne.c b/etc/mersenne.c
index 1cd5b50..5697559 100644
--- a/etc/mersenne.c
+++ b/etc/mersenne.c
@@ -138,3 +138,7 @@ main (void)
   }
   return 0;
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/etc/mont.c b/etc/mont.c
index dbf1735..45cf3fd 100644
--- a/etc/mont.c
+++ b/etc/mont.c
@@ -44,3 +44,7 @@ int main(void)
 
 
 
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/etc/pprime.c b/etc/pprime.c
index 26e0d84..d3a4afe 100644
--- a/etc/pprime.c
+++ b/etc/pprime.c
@@ -394,3 +394,7 @@ main (void)
 
   return 0;
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/etc/tune.c b/etc/tune.c
index d054d10..15a977b 100644
--- a/etc/tune.c
+++ b/etc/tune.c
@@ -136,3 +136,7 @@ main (void)
 
   return 0;
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/logs/expt.log b/logs/expt.log
index 920ba55..70932ab 100644
--- a/logs/expt.log
+++ b/logs/expt.log
@@ -1,7 +1,7 @@
-513   1489160
-769   3688476
-1025   8162061
-2049  49260015
-2561  89579052
-3073 148797060
-4097 324449263
+513   1435869
+769   3544970
+1025   7791638
+2049  46902238
+2561  85334899
+3073 141451412
+4097 308770310
diff --git a/logs/expt_2k.log b/logs/expt_2k.log
index 56b50db..97d325f 100644
--- a/logs/expt_2k.log
+++ b/logs/expt_2k.log
@@ -1,5 +1,5 @@
-607   2272809
-1279   9557382
-2203  36250309
-3217  87666486
-4253 174168369
+607   2109225
+1279  10148314
+2203  34126877
+3217  82716424
+4253 161569606
diff --git a/logs/expt_2kl.log b/logs/expt_2kl.log
index b2eb8c2..d9ad4be 100644
--- a/logs/expt_2kl.log
+++ b/logs/expt_2kl.log
@@ -1,4 +1,4 @@
-1024   6954080
-2048  35993987
-4096 176068521
-521   1683720
+1024   7705271
+2048  34286851
+4096 165207491
+521   1618631
diff --git a/logs/expt_dr.log b/logs/expt_dr.log
index eb93fc9..c6bbe07 100644
--- a/logs/expt_dr.log
+++ b/logs/expt_dr.log
@@ -1,7 +1,7 @@
-532   1989592
-784   3898697
-1036   6519700
-1540  15676650
-2072  33128187
-3080  82963362
-4116 168358337
+532   1928550
+784   3763908
+1036   7564221
+1540  16566059
+2072  32283784
+3080  79851565
+4116 157843530
diff --git a/logs/index.html b/logs/index.html
index 19fe403..4b68c25 100644
--- a/logs/index.html
+++ b/logs/index.html
@@ -21,4 +21,7 @@
 <hr>
 
 </body>
-</html>
\ No newline at end of file
+</html>
+<!-- $Source: /cvs/libtom/libtommath/logs/index.html,v $ -->
+<!-- $Revision: 1.2 $ -->
+<!-- $Date: 2005/05/05 14:38:47 $ -->
diff --git a/logs/sqr.old b/logs/sqr.old
deleted file mode 100644
index 3c85882..0000000
--- a/logs/sqr.old
+++ /dev/null
@@ -1,17 +0,0 @@
-896    382617
-1344    207161
-1792    131522
-2240     90775
-2688     66652
-3136     50955
-3584     11678
-4032      9342
-4480      7684
-4928      6382
-5376      5399
-5824      4545
-6272      3994
-6720      3490
-7168      3075
-7616      2733
-8064      2428
diff --git a/makefile b/makefile
index 17873ee..a4697d4 100644
--- a/makefile
+++ b/makefile
@@ -3,12 +3,14 @@
 #Tom St Denis
 
 #version of library 
-VERSION=0.35
+VERSION=0.36
 
 CFLAGS  +=  -I./ -Wall -W -Wshadow -Wsign-compare
 
+ifndef IGNORE_SPEED
+
 #for speed 
-CFLAGS += -O3 -funroll-all-loops
+CFLAGS += -O3 -funroll-loops
 
 #for size 
 #CFLAGS += -Os
@@ -19,14 +21,27 @@ CFLAGS  += -fomit-frame-pointer
 #debug
 #CFLAGS += -g3
 
+endif
+
 #install as this user
-USER=root
-GROUP=root
+ifndef INSTALL_GROUP
+   GROUP=wheel
+else
+   GROUP=$(INSTALL_GROUP)
+endif
+
+ifndef INSTALL_USER
+   USER=root
+else
+   USER=$(INSTALL_USER)
+endif
 
 default: libtommath.a
 
 #default files to install
-LIBNAME=libtommath.a
+ifndef LIBNAME
+   LIBNAME=libtommath.a
+endif
 HEADERS=tommath.h tommath_class.h tommath_superclass.h
 
 #LIBPATH-The directory for libtommath to be installed to.
@@ -65,9 +80,9 @@ bn_mp_prime_random_ex.o bn_mp_get_int.o bn_mp_sqrt.o bn_mp_is_square.o bn_mp_ini
 bn_mp_init_set_int.o bn_mp_invmod_slow.o bn_mp_prime_rabin_miller_trials.o \
 bn_mp_to_signed_bin_n.o bn_mp_to_unsigned_bin_n.o
 
-libtommath.a:  $(OBJECTS)
-	$(AR) $(ARFLAGS) libtommath.a $(OBJECTS)
-	ranlib libtommath.a
+$(LIBNAME):  $(OBJECTS)
+	$(AR) $(ARFLAGS) $@ $(OBJECTS)
+	ranlib $@
 
 #make a profiled library (takes a while!!!)
 #
@@ -89,23 +104,23 @@ profiled_single:
 	./ltmtest
 	rm -f *.o ltmtest
 	$(CC) $(CFLAGS) -fbranch-probabilities -DTESTING -c mpi.c -o mpi.o
-	$(AR) $(ARFLAGS) libtommath.a mpi.o
-	ranlib libtommath.a	
+	$(AR) $(ARFLAGS) $(LIBNAME) mpi.o
+	ranlib $(LIBNAME)	
 
-install: libtommath.a
+install: $(LIBNAME)
 	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(LIBPATH)
 	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(INCPATH)
 	install -g $(GROUP) -o $(USER) $(LIBNAME) $(DESTDIR)$(LIBPATH)
 	install -g $(GROUP) -o $(USER) $(HEADERS) $(DESTDIR)$(INCPATH)
 
-test: libtommath.a demo/demo.o
-	$(CC) $(CFLAGS) demo/demo.o libtommath.a -o test
+test: $(LIBNAME) demo/demo.o
+	$(CC) $(CFLAGS) demo/demo.o $(LIBNAME) -o test
 	
 mtest: test	
 	cd mtest ; $(CC) $(CFLAGS) mtest.c -o mtest
         
-timing: libtommath.a
-	$(CC) $(CFLAGS) -DTIMER demo/timing.c libtommath.a -o ltmtest
+timing: $(LIBNAME)
+	$(CC) $(CFLAGS) -DTIMER demo/timing.c $(LIBNAME) -o ltmtest
 
 # makes the LTM book DVI file, requires tetex, perl and makeindex [part of tetex I think]
 docdvi: tommath.src
@@ -151,6 +166,12 @@ clean:
 	cd etc ; make clean
 	cd pics ; make clean
 
+#commit to CVS, then scan the tree for scratch/dirty files that are not under version control
+no_oops: clean
+	cd .. ; cvs commit 
+	echo Scanning for scratch/dirty files
+	find . -type f | grep -v CVS | xargs -n 1 bash mess.sh
+
 zipup: clean manual poster docs
 	perl gen.pl ; mv mpi.c pre_gen/ ; \
 	cd .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \
diff --git a/makefile.cygwin_dll b/makefile.cygwin_dll
index 85b10c7..85a9b20 100644
--- a/makefile.cygwin_dll
+++ b/makefile.cygwin_dll
@@ -49,3 +49,7 @@ windll:  $(OBJECTS)
 test: $(OBJECTS) windll
 	gcc $(CFLAGS) demo/demo.c libtommath.dll.a -Wl,--enable-auto-import -o test -s
 	cd mtest ; $(CC) -O3 -fomit-frame-pointer -funroll-loops mtest.c -o mtest -s
+
+# $Source: /cvs/libtom/libtommath/makefile.cygwin_dll,v $
+# $Revision: 1.2 $
+# $Date: 2005/05/05 14:38:45 $
diff --git a/makefile.icc b/makefile.icc
index e764253..cf70ab0 100644
--- a/makefile.icc
+++ b/makefile.icc
@@ -19,7 +19,7 @@ CFLAGS  +=  -I./
 #   B - Blend of P4 and PM [mobile]
 #
 # Default to just generic max opts
-CFLAGS += -O3 -xN
+CFLAGS += -O3 -xP -ip
 
 #install as this user
 USER=root
diff --git a/makefile.msvc b/makefile.msvc
index dbbf9f3..5edebec 100644
--- a/makefile.msvc
+++ b/makefile.msvc
@@ -2,7 +2,7 @@
 #
 #Tom St Denis
 
-CFLAGS = /I. /Ox /DWIN32 /W4
+CFLAGS = /I. /Ox /DWIN32 /W3 /Fo$@
 
 default: library
 
@@ -34,5 +34,7 @@ bn_mp_prime_random_ex.obj bn_mp_get_int.obj bn_mp_sqrt.obj bn_mp_is_square.obj \
 bn_mp_init_set.obj bn_mp_init_set_int.obj bn_mp_invmod_slow.obj bn_mp_prime_rabin_miller_trials.obj \
 bn_mp_to_signed_bin_n.obj bn_mp_to_unsigned_bin_n.obj
 
+HEADERS=tommath.h tommath_class.h tommath_superclass.h
+
 library: $(OBJECTS)
 	lib /out:tommath.lib $(OBJECTS)
diff --git a/makefile.shared b/makefile.shared
index 7c35881..821558c 100644
--- a/makefile.shared
+++ b/makefile.shared
@@ -1,11 +1,14 @@
 #Makefile for GCC
 #
 #Tom St Denis
-VERSION=0:35
+VERSION=0:36
 
 CC = libtool --mode=compile gcc
+
 CFLAGS  +=  -I./ -Wall -W -Wshadow -Wsign-compare
 
+ifndef IGNORE_SPEED
+
 #for speed 
 CFLAGS += -O3 -funroll-loops
 
@@ -15,14 +18,30 @@ CFLAGS += -O3 -funroll-loops
 #x86 optimizations [should be valid for any GCC install though]
 CFLAGS  += -fomit-frame-pointer
 
+endif
+
 #install as this user
-USER=root
-GROUP=root
+ifndef INSTALL_GROUP
+   GROUP=wheel
+else
+   GROUP=$(INSTALL_GROUP)
+endif
+
+ifndef INSTALL_USER
+   USER=root
+else
+   USER=$(INSTALL_USER)
+endif
 
 default: libtommath.la
 
 #default files to install
-LIBNAME=libtommath.la
+ifndef LIBNAME
+   LIBNAME=libtommath.la
+endif
+ifndef LIBNAME_S
+   LIBNAME_S=libtommath.a
+endif
 HEADERS=tommath.h tommath_class.h tommath_superclass.h
 
 #LIBPATH-The directory for libtommath to be installed to.
@@ -61,20 +80,20 @@ bn_mp_prime_random_ex.o bn_mp_get_int.o bn_mp_sqrt.o bn_mp_is_square.o bn_mp_ini
 bn_mp_init_set_int.o bn_mp_invmod_slow.o bn_mp_prime_rabin_miller_trials.o \
 bn_mp_to_signed_bin_n.o bn_mp_to_unsigned_bin_n.o
 
-
-libtommath.la:  $(OBJECTS)
-	libtool --mode=link gcc *.lo -o libtommath.la -rpath $(LIBPATH) -version-info $(VERSION)
-	libtool --mode=link gcc *.o -o libtommath.a 
-	libtool --mode=install install -c libtommath.la $(LIBPATH)/libtommath.la
+$(LIBNAME):  $(OBJECTS)
+	libtool --mode=link gcc *.lo -o $(LIBNAME) -rpath $(LIBPATH) -version-info $(VERSION)
+	libtool --mode=link gcc *.o -o $(LIBNAME_S)
+	ranlib $(LIBNAME_S)
+	libtool --mode=install install -c $(LIBNAME) $(LIBPATH)/$@
 	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(INCPATH)
 	install -g $(GROUP) -o $(USER) $(HEADERS) $(DESTDIR)$(INCPATH)
 
-test: libtommath.a demo/demo.o
+test: $(LIBNAME) demo/demo.o
 	gcc $(CFLAGS) -c demo/demo.c -o demo/demo.o
-	libtool --mode=link gcc -o test demo/demo.o libtommath.la
+	libtool --mode=link gcc -o test demo/demo.o $(LIBNAME_S)
 	
 mtest: test	
-	cd mtest ; gcc $(CFLAGS) mtest.c -o mtest -s
+	cd mtest ; gcc $(CFLAGS) mtest.c -o mtest
         
-timing: libtommath.la
-	gcc $(CFLAGS) -DTIMER demo/timing.c libtommath.a -o ltmtest -s
+timing: $(LIBNAME)
+	gcc $(CFLAGS) -DTIMER demo/timing.c $(LIBNAME_S) -o ltmtest
diff --git a/mess.sh b/mess.sh
new file mode 100644
index 0000000..bf639ce
--- /dev/null
+++ b/mess.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+# Exit 0 if "cvs log" succeeds for the file named by $1; otherwise report the file and exit 1.
+if cvs log "$1" >/dev/null 2>/dev/null; then exit 0; else echo "$1 shouldn't be here" ; exit 1; fi
+
diff --git a/mtest/logtab.h b/mtest/logtab.h
index 68462bd..751111e 100644
--- a/mtest/logtab.h
+++ b/mtest/logtab.h
@@ -18,3 +18,7 @@ const float s_logv_2[] = {
    0.166666667
 };
 
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/mtest/mpi-config.h b/mtest/mpi-config.h
index 9277dfb..f83a646 100644
--- a/mtest/mpi-config.h
+++ b/mtest/mpi-config.h
@@ -84,3 +84,7 @@
 
 
 /* crc==3287762869, version==2, Sat Feb 02 06:43:53 2002 */
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/mtest/mpi-types.h b/mtest/mpi-types.h
index e097188..f99d7ee 100644
--- a/mtest/mpi-types.h
+++ b/mtest/mpi-types.h
@@ -14,3 +14,7 @@ typedef int                mp_err;
 #define DIGIT_FMT          "%04X"
 #define RADIX              (MP_DIGIT_MAX+1)
 
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/mtest/mpi.c b/mtest/mpi.c
index 94019ef..a4b382b 100644
--- a/mtest/mpi.c
+++ b/mtest/mpi.c
@@ -3979,3 +3979,7 @@ int      s_mp_outlen(int bits, int r)
 /*------------------------------------------------------------------------*/
 /* HERE THERE BE DRAGONS                                                  */
 /* crc==4242132123, version==2, Sat Feb 02 06:43:52 2002 */
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/mtest/mpi.h b/mtest/mpi.h
index e19ecf8..d84435b 100644
--- a/mtest/mpi.h
+++ b/mtest/mpi.h
@@ -225,3 +225,7 @@ int    mp_char2value(char ch, int r);
 const  char  *mp_strerror(mp_err ec);
 
 #endif /* end _H_MPI_ */
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/mtest/mtest.c b/mtest/mtest.c
index d46f456..6ac2c81 100644
--- a/mtest/mtest.c
+++ b/mtest/mtest.c
@@ -302,3 +302,7 @@ int main(void)
    fclose(rng);
    return 0;
 }
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/poster.pdf b/poster.pdf
index 4c3e365..faceef1 100644
Binary files a/poster.pdf and b/poster.pdf differ
diff --git a/pre_gen/mpi.c b/pre_gen/mpi.c
index 8ec8a10..af6523d 100644
--- a/pre_gen/mpi.c
+++ b/pre_gen/mpi.c
@@ -43,6 +43,10 @@ char *mp_error_to_string(int code)
 
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_error.c */
 
 /* Start: bn_fast_mp_invmod.c */
@@ -191,6 +195,10 @@ LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_fast_mp_invmod.c */
 
 /* Start: bn_fast_mp_montgomery_reduce.c */
@@ -363,6 +371,10 @@ int fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_fast_mp_montgomery_reduce.c */
 
 /* Start: bn_fast_s_mp_mul_digs.c */
@@ -438,6 +450,7 @@ int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
       /* execute loop */
       for (iz = 0; iz < iy; ++iz) {
          _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
+
       }
 
       /* store term */
@@ -472,6 +485,10 @@ int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_fast_s_mp_mul_digs.c */
 
 /* Start: bn_fast_s_mp_mul_high_digs.c */
@@ -573,6 +590,10 @@ int fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_fast_s_mp_mul_high_digs.c */
 
 /* Start: bn_fast_s_mp_sqr.c */
@@ -687,6 +708,10 @@ int fast_s_mp_sqr (mp_int * a, mp_int * b)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_fast_s_mp_sqr.c */
 
 /* Start: bn_mp_2expt.c */
@@ -735,6 +760,10 @@ mp_2expt (mp_int * a, int b)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_2expt.c */
 
 /* Start: bn_mp_abs.c */
@@ -778,6 +807,10 @@ mp_abs (mp_int * a, mp_int * b)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_abs.c */
 
 /* Start: bn_mp_add.c */
@@ -831,6 +864,10 @@ int mp_add (mp_int * a, mp_int * b, mp_int * c)
 
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_add.c */
 
 /* Start: bn_mp_add_d.c */
@@ -940,6 +977,10 @@ mp_add_d (mp_int * a, mp_digit b, mp_int * c)
 
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_add_d.c */
 
 /* Start: bn_mp_addmod.c */
@@ -981,6 +1022,10 @@ mp_addmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_addmod.c */
 
 /* Start: bn_mp_and.c */
@@ -1038,6 +1083,10 @@ mp_and (mp_int * a, mp_int * b, mp_int * c)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_and.c */
 
 /* Start: bn_mp_clamp.c */
@@ -1082,6 +1131,10 @@ mp_clamp (mp_int * a)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_clamp.c */
 
 /* Start: bn_mp_clear.c */
@@ -1126,6 +1179,10 @@ mp_clear (mp_int * a)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_clear.c */
 
 /* Start: bn_mp_clear_multi.c */
@@ -1160,6 +1217,10 @@ void mp_clear_multi(mp_int *mp, ...)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_clear_multi.c */
 
 /* Start: bn_mp_cmp.c */
@@ -1203,6 +1264,10 @@ mp_cmp (mp_int * a, mp_int * b)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_cmp.c */
 
 /* Start: bn_mp_cmp_d.c */
@@ -1247,6 +1312,10 @@ int mp_cmp_d(mp_int * a, mp_digit b)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_cmp_d.c */
 
 /* Start: bn_mp_cmp_mag.c */
@@ -1302,6 +1371,10 @@ int mp_cmp_mag (mp_int * a, mp_int * b)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_cmp_mag.c */
 
 /* Start: bn_mp_cnt_lsb.c */
@@ -1355,6 +1428,10 @@ int mp_cnt_lsb(mp_int *a)
 
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_cnt_lsb.c */
 
 /* Start: bn_mp_copy.c */
@@ -1423,6 +1500,10 @@ mp_copy (mp_int * a, mp_int * b)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_copy.c */
 
 /* Start: bn_mp_count_bits.c */
@@ -1468,6 +1549,10 @@ mp_count_bits (mp_int * a)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_count_bits.c */
 
 /* Start: bn_mp_div.c */
@@ -1760,6 +1845,10 @@ LBL_Q:mp_clear (&q);
 
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_div.c */
 
 /* Start: bn_mp_div_2.c */
@@ -1828,6 +1917,10 @@ int mp_div_2(mp_int * a, mp_int * b)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_div_2.c */
 
 /* Start: bn_mp_div_2d.c */
@@ -1925,6 +2018,10 @@ int mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_div_2d.c */
 
 /* Start: bn_mp_div_3.c */
@@ -2004,6 +2101,10 @@ mp_div_3 (mp_int * a, mp_int *c, mp_digit * d)
 
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_div_3.c */
 
 /* Start: bn_mp_div_d.c */
@@ -2114,6 +2215,10 @@ int mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
 
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_div_d.c */
 
 /* Start: bn_mp_dr_is_modulus.c */
@@ -2157,6 +2262,10 @@ int mp_dr_is_modulus(mp_int *a)
 
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_dr_is_modulus.c */
 
 /* Start: bn_mp_dr_reduce.c */
@@ -2251,6 +2360,10 @@ top:
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_dr_reduce.c */
 
 /* Start: bn_mp_dr_setup.c */
@@ -2283,6 +2396,10 @@ void mp_dr_setup(mp_int *a, mp_digit *d)
 
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_dr_setup.c */
 
 /* Start: bn_mp_exch.c */
@@ -2317,6 +2434,10 @@ mp_exch (mp_int * a, mp_int * b)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_exch.c */
 
 /* Start: bn_mp_expt_d.c */
@@ -2374,6 +2495,10 @@ int mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_expt_d.c */
 
 /* Start: bn_mp_exptmod.c */
@@ -2445,7 +2570,7 @@ int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
   }
 
 /* modified diminished radix reduction */
-#if defined(BN_MP_REDUCE_IS_2K_L_C) && defined(BN_MP_REDUCE_2K_L_C)
+#if defined(BN_MP_REDUCE_IS_2K_L_C) && defined(BN_MP_REDUCE_2K_L_C) && defined(BN_S_MP_EXPTMOD_C)
   if (mp_reduce_is_2k_l(P) == MP_YES) {
      return s_mp_exptmod(G, X, P, Y, 1);
   }
@@ -2486,6 +2611,10 @@ int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
 
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_exptmod.c */
 
 /* Start: bn_mp_exptmod_fast.c */
@@ -2807,6 +2936,10 @@ LBL_M:
 #endif
 
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_exptmod_fast.c */
 
 /* Start: bn_mp_exteuclid.c */
@@ -2889,6 +3022,10 @@ _ERR: mp_clear_multi(&u1, &u2, &u3, &v1, &v2, &v3, &t1, &t2, &t3, &q, &tmp, NULL
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_exteuclid.c */
 
 /* Start: bn_mp_fread.c */
@@ -2956,6 +3093,10 @@ int mp_fread(mp_int *a, int radix, FILE *stream)
 
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_fread.c */
 
 /* Start: bn_mp_fwrite.c */
@@ -3008,6 +3149,10 @@ int mp_fwrite(mp_int *a, int radix, FILE *stream)
 
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_fwrite.c */
 
 /* Start: bn_mp_gcd.c */
@@ -3121,6 +3266,10 @@ LBL_U:mp_clear (&v);
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_gcd.c */
 
 /* Start: bn_mp_get_int.c */
@@ -3166,6 +3315,10 @@ unsigned long mp_get_int(mp_int * a)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_get_int.c */
 
 /* Start: bn_mp_grow.c */
@@ -3223,6 +3376,10 @@ int mp_grow (mp_int * a, int size)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_grow.c */
 
 /* Start: bn_mp_init.c */
@@ -3269,6 +3426,10 @@ int mp_init (mp_int * a)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_init.c */
 
 /* Start: bn_mp_init_copy.c */
@@ -3301,6 +3462,10 @@ int mp_init_copy (mp_int * a, mp_int * b)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_init_copy.c */
 
 /* Start: bn_mp_init_multi.c */
@@ -3360,6 +3525,10 @@ int mp_init_multi(mp_int *mp, ...)
 
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_init_multi.c */
 
 /* Start: bn_mp_init_set.c */
@@ -3392,6 +3561,10 @@ int mp_init_set (mp_int * a, mp_digit b)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_init_set.c */
 
 /* Start: bn_mp_init_set_int.c */
@@ -3423,6 +3596,10 @@ int mp_init_set_int (mp_int * a, unsigned long b)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_init_set_int.c */
 
 /* Start: bn_mp_init_size.c */
@@ -3471,6 +3648,10 @@ int mp_init_size (mp_int * a, int size)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_init_size.c */
 
 /* Start: bn_mp_invmod.c */
@@ -3514,6 +3695,10 @@ int mp_invmod (mp_int * a, mp_int * b, mp_int * c)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_invmod.c */
 
 /* Start: bn_mp_invmod_slow.c */
@@ -3689,6 +3874,10 @@ LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_invmod_slow.c */
 
 /* Start: bn_mp_is_square.c */
@@ -3798,6 +3987,10 @@ ERR:mp_clear(&t);
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_is_square.c */
 
 /* Start: bn_mp_jacobi.c */
@@ -3903,6 +4096,10 @@ LBL_A1:mp_clear (&a1);
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_jacobi.c */
 
 /* Start: bn_mp_karatsuba_mul.c */
@@ -3934,12 +4131,12 @@ LBL_A1:mp_clear (&a1);
  * b = b1 * B**n + b0
  *
  * Then, a * b => 
-   a1b1 * B**2n + ((a1 - a0)(b1 - b0) + a0b0 + a1b1) * B + a0b0
+   a1b1 * B**2n + ((a1 + a0)(b1 + b0) - (a0b0 + a1b1)) * B + a0b0
  *
  * Note that a1b1 and a0b0 are used twice and only need to be 
  * computed once.  So in total three half size (half # of 
  * digit) multiplications are performed, a0b0, a1b1 and 
- * (a1-b1)(a0-b0)
+ * (a1+b1)(a0+b0)
  *
  * Note that a multiplication of half the digits requires
  * 1/4th the number of single precision multiplications so in 
@@ -4030,19 +4227,19 @@ int mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
   if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
     goto X1Y1;          /* x1y1 = x1*y1 */
 
-  /* now calc x1-x0 and y1-y0 */
-  if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
+  /* now calc x1+x0 and y1+y0 */
+  if (s_mp_add (&x1, &x0, &t1) != MP_OKAY)
     goto X1Y1;          /* t1 = x1 - x0 */
-  if (mp_sub (&y1, &y0, &x0) != MP_OKAY)
+  if (s_mp_add (&y1, &y0, &x0) != MP_OKAY)
     goto X1Y1;          /* t2 = y1 - y0 */
   if (mp_mul (&t1, &x0, &t1) != MP_OKAY)
-    goto X1Y1;          /* t1 = (x1 - x0) * (y1 - y0) */
+    goto X1Y1;          /* t1 = (x1 + x0) * (y1 + y0) */
 
   /* add x0y0 */
   if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY)
     goto X1Y1;          /* t2 = x0y0 + x1y1 */
-  if (mp_sub (&x0, &t1, &t1) != MP_OKAY)
-    goto X1Y1;          /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
+  if (s_mp_sub (&t1, &x0, &t1) != MP_OKAY)
+    goto X1Y1;          /* t1 = (x1+x0)*(y1+y0) - (x1y1 + x0y0) */
 
   /* shift by B */
   if (mp_lshd (&t1, B) != MP_OKAY)
@@ -4070,6 +4267,10 @@ ERR:
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_karatsuba_mul.c */
 
 /* Start: bn_mp_karatsuba_sqr.c */
@@ -4155,8 +4356,8 @@ int mp_karatsuba_sqr (mp_int * a, mp_int * b)
   if (mp_sqr (&x1, &x1x1) != MP_OKAY)
     goto X1X1;           /* x1x1 = x1*x1 */
 
-  /* now calc (x1-x0)**2 */
-  if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
+  /* now calc (x1+x0)**2 */
+  if (s_mp_add (&x1, &x0, &t1) != MP_OKAY)
     goto X1X1;           /* t1 = x1 - x0 */
   if (mp_sqr (&t1, &t1) != MP_OKAY)
     goto X1X1;           /* t1 = (x1 - x0) * (x1 - x0) */
@@ -4164,8 +4365,8 @@ int mp_karatsuba_sqr (mp_int * a, mp_int * b)
   /* add x0y0 */
   if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
     goto X1X1;           /* t2 = x0x0 + x1x1 */
-  if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
-    goto X1X1;           /* t1 = x0x0 + x1x1 - (x1-x0)*(x1-x0) */
+  if (s_mp_sub (&t1, &t2, &t1) != MP_OKAY)
+    goto X1X1;           /* t1 = (x1+x0)**2 - (x0x0 + x1x1) */
 
   /* shift by B */
   if (mp_lshd (&t1, B) != MP_OKAY)
@@ -4191,6 +4392,10 @@ ERR:
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_karatsuba_sqr.c */
 
 /* Start: bn_mp_lcm.c */
@@ -4251,6 +4456,10 @@ LBL_T:
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_lcm.c */
 
 /* Start: bn_mp_lshd.c */
@@ -4318,6 +4527,10 @@ int mp_lshd (mp_int * a, int b)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_lshd.c */
 
 /* Start: bn_mp_mod.c */
@@ -4366,6 +4579,10 @@ mp_mod (mp_int * a, mp_int * b, mp_int * c)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_mod.c */
 
 /* Start: bn_mp_mod_2d.c */
@@ -4421,6 +4638,10 @@ mp_mod_2d (mp_int * a, int b, mp_int * c)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_mod_2d.c */
 
 /* Start: bn_mp_mod_d.c */
@@ -4448,6 +4669,10 @@ mp_mod_d (mp_int * a, mp_digit b, mp_digit * c)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_mod_d.c */
 
 /* Start: bn_mp_montgomery_calc_normalization.c */
@@ -4507,6 +4732,10 @@ int mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_montgomery_calc_normalization.c */
 
 /* Start: bn_mp_montgomery_reduce.c */
@@ -4625,6 +4854,10 @@ mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_montgomery_reduce.c */
 
 /* Start: bn_mp_montgomery_setup.c */
@@ -4684,6 +4917,10 @@ mp_montgomery_setup (mp_int * n, mp_digit * rho)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_montgomery_setup.c */
 
 /* Start: bn_mp_mul.c */
@@ -4750,6 +4987,10 @@ int mp_mul (mp_int * a, mp_int * b, mp_int * c)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_mul.c */
 
 /* Start: bn_mp_mul_2.c */
@@ -4832,6 +5073,10 @@ int mp_mul_2(mp_int * a, mp_int * b)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_mul_2.c */
 
 /* Start: bn_mp_mul_2d.c */
@@ -4917,6 +5162,10 @@ int mp_mul_2d (mp_int * a, int b, mp_int * c)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_mul_2d.c */
 
 /* Start: bn_mp_mul_d.c */
@@ -4996,6 +5245,10 @@ mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_mul_d.c */
 
 /* Start: bn_mp_mulmod.c */
@@ -5017,8 +5270,7 @@ mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
  */
 
 /* d = a * b (mod c) */
-int
-mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+int mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
 {
   int     res;
   mp_int  t;
@@ -5037,6 +5289,10 @@ mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_mulmod.c */
 
 /* Start: bn_mp_n_root.c */
@@ -5169,6 +5425,10 @@ LBL_T1:mp_clear (&t1);
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_n_root.c */
 
 /* Start: bn_mp_neg.c */
@@ -5209,6 +5469,10 @@ int mp_neg (mp_int * a, mp_int * b)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_neg.c */
 
 /* Start: bn_mp_or.c */
@@ -5259,6 +5523,10 @@ int mp_or (mp_int * a, mp_int * b, mp_int * c)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_or.c */
 
 /* Start: bn_mp_prime_fermat.c */
@@ -5321,6 +5589,10 @@ LBL_T:mp_clear (&t);
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_prime_fermat.c */
 
 /* Start: bn_mp_prime_is_divisible.c */
@@ -5371,6 +5643,10 @@ int mp_prime_is_divisible (mp_int * a, int *result)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_prime_is_divisible.c */
 
 /* Start: bn_mp_prime_is_prime.c */
@@ -5454,6 +5730,10 @@ LBL_B:mp_clear (&b);
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_prime_is_prime.c */
 
 /* Start: bn_mp_prime_miller_rabin.c */
@@ -5557,6 +5837,10 @@ LBL_N1:mp_clear (&n1);
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_prime_miller_rabin.c */
 
 /* Start: bn_mp_prime_next_prime.c */
@@ -5727,6 +6011,10 @@ LBL_ERR:
 
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_prime_next_prime.c */
 
 /* Start: bn_mp_prime_rabin_miller_trials.c */
@@ -5779,6 +6067,10 @@ int mp_prime_rabin_miller_trials(int size)
 
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_prime_rabin_miller_trials.c */
 
 /* Start: bn_mp_prime_random_ex.c */
@@ -5846,10 +6138,8 @@ int mp_prime_random_ex(mp_int *a, int t, int size, int flags, ltm_prime_callback
    maskOR_msb        = 0;
    maskOR_msb_offset = ((size & 7) == 1) ? 1 : 0;
    if (flags & LTM_PRIME_2MSB_ON) {
-      maskOR_msb     |= 1 << ((size - 2) & 7);
-   } else if (flags & LTM_PRIME_2MSB_OFF) {
-      maskAND        &= ~(1 << ((size - 2) & 7));
-   } 
+      maskOR_msb       |= 0x80 >> ((9 - size) & 7);
+   }  
 
    /* get the maskOR_lsb */
    maskOR_lsb         = 1;
@@ -5906,6 +6196,10 @@ error:
 
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_prime_random_ex.c */
 
 /* Start: bn_mp_radix_size.c */
@@ -5984,6 +6278,10 @@ int mp_radix_size (mp_int * a, int radix, int *size)
 
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_radix_size.c */
 
 /* Start: bn_mp_radix_smap.c */
@@ -6008,6 +6306,10 @@ int mp_radix_size (mp_int * a, int radix, int *size)
 const char *mp_s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_radix_smap.c */
 
 /* Start: bn_mp_rand.c */
@@ -6063,6 +6365,10 @@ mp_rand (mp_int * a, int digits)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_rand.c */
 
 /* Start: bn_mp_read_radix.c */
@@ -6145,6 +6451,10 @@ int mp_read_radix (mp_int * a, const char *str, int radix)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_read_radix.c */
 
 /* Start: bn_mp_read_signed_bin.c */
@@ -6166,8 +6476,7 @@ int mp_read_radix (mp_int * a, const char *str, int radix)
  */
 
 /* read signed bin, big endian, first byte is 0==positive or 1==negative */
-int
-mp_read_signed_bin (mp_int * a, unsigned char *b, int c)
+int mp_read_signed_bin (mp_int * a, const unsigned char *b, int c)
 {
   int     res;
 
@@ -6187,6 +6496,10 @@ mp_read_signed_bin (mp_int * a, unsigned char *b, int c)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_read_signed_bin.c */
 
 /* Start: bn_mp_read_unsigned_bin.c */
@@ -6208,8 +6521,7 @@ mp_read_signed_bin (mp_int * a, unsigned char *b, int c)
  */
 
 /* reads a unsigned char array, assumes the msb is stored first [big endian] */
-int
-mp_read_unsigned_bin (mp_int * a, unsigned char *b, int c)
+int mp_read_unsigned_bin (mp_int * a, const unsigned char *b, int c)
 {
   int     res;
 
@@ -6243,6 +6555,10 @@ mp_read_unsigned_bin (mp_int * a, unsigned char *b, int c)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_read_unsigned_bin.c */
 
 /* Start: bn_mp_reduce.c */
@@ -6343,6 +6659,10 @@ CLEANUP:
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_reduce.c */
 
 /* Start: bn_mp_reduce_2k.c */
@@ -6404,6 +6724,10 @@ ERR:
 
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_reduce_2k.c */
 
 /* Start: bn_mp_reduce_2k_l.c */
@@ -6466,6 +6790,10 @@ ERR:
 
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_reduce_2k_l.c */
 
 /* Start: bn_mp_reduce_2k_setup.c */
@@ -6513,6 +6841,10 @@ int mp_reduce_2k_setup(mp_int *a, mp_digit *d)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_reduce_2k_setup.c */
 
 /* Start: bn_mp_reduce_2k_setup_l.c */
@@ -6557,6 +6889,10 @@ ERR:
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_reduce_2k_setup_l.c */
 
 /* Start: bn_mp_reduce_is_2k.c */
@@ -6609,6 +6945,10 @@ int mp_reduce_is_2k(mp_int *a)
 
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_reduce_is_2k.c */
 
 /* Start: bn_mp_reduce_is_2k_l.c */
@@ -6653,6 +6993,10 @@ int mp_reduce_is_2k_l(mp_int *a)
 
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_reduce_is_2k_l.c */
 
 /* Start: bn_mp_reduce_setup.c */
@@ -6687,6 +7031,10 @@ int mp_reduce_setup (mp_int * a, mp_int * b)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_reduce_setup.c */
 
 /* Start: bn_mp_rshd.c */
@@ -6759,6 +7107,10 @@ void mp_rshd (mp_int * a, int b)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_rshd.c */
 
 /* Start: bn_mp_set.c */
@@ -6788,6 +7140,10 @@ void mp_set (mp_int * a, mp_digit b)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_set.c */
 
 /* Start: bn_mp_set_int.c */
@@ -6836,6 +7192,10 @@ int mp_set_int (mp_int * a, unsigned long b)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_set_int.c */
 
 /* Start: bn_mp_shrink.c */
@@ -6871,6 +7231,10 @@ int mp_shrink (mp_int * a)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_shrink.c */
 
 /* Start: bn_mp_signed_bin_size.c */
@@ -6898,6 +7262,10 @@ int mp_signed_bin_size (mp_int * a)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_signed_bin_size.c */
 
 /* Start: bn_mp_sqr.c */
@@ -6956,6 +7324,10 @@ if (a->used >= KARATSUBA_SQR_CUTOFF) {
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_sqr.c */
 
 /* Start: bn_mp_sqrmod.c */
@@ -6997,6 +7369,10 @@ mp_sqrmod (mp_int * a, mp_int * b, mp_int * c)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_sqrmod.c */
 
 /* Start: bn_mp_sqrt.c */
@@ -7078,6 +7454,10 @@ E2: mp_clear(&t1);
 
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_sqrt.c */
 
 /* Start: bn_mp_sub.c */
@@ -7137,6 +7517,10 @@ mp_sub (mp_int * a, mp_int * b, mp_int * c)
 
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_sub.c */
 
 /* Start: bn_mp_sub_d.c */
@@ -7226,6 +7610,10 @@ mp_sub_d (mp_int * a, mp_digit b, mp_int * c)
 
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_sub_d.c */
 
 /* Start: bn_mp_submod.c */
@@ -7268,6 +7656,10 @@ mp_submod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_submod.c */
 
 /* Start: bn_mp_to_signed_bin.c */
@@ -7301,6 +7693,10 @@ int mp_to_signed_bin (mp_int * a, unsigned char *b)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_to_signed_bin.c */
 
 /* Start: bn_mp_to_signed_bin_n.c */
@@ -7332,6 +7728,10 @@ int mp_to_signed_bin_n (mp_int * a, unsigned char *b, unsigned long *outlen)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_to_signed_bin_n.c */
 
 /* Start: bn_mp_to_unsigned_bin.c */
@@ -7380,6 +7780,10 @@ int mp_to_unsigned_bin (mp_int * a, unsigned char *b)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_to_unsigned_bin.c */
 
 /* Start: bn_mp_to_unsigned_bin_n.c */
@@ -7411,6 +7815,10 @@ int mp_to_unsigned_bin_n (mp_int * a, unsigned char *b, unsigned long *outlen)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_to_unsigned_bin_n.c */
 
 /* Start: bn_mp_toom_mul.c */
@@ -7695,6 +8103,10 @@ ERR:
      
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_toom_mul.c */
 
 /* Start: bn_mp_toom_sqr.c */
@@ -7921,6 +8333,10 @@ ERR:
 
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_toom_sqr.c */
 
 /* Start: bn_mp_toradix.c */
@@ -7996,6 +8412,10 @@ int mp_toradix (mp_int * a, char *str, int radix)
 
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_toradix.c */
 
 /* Start: bn_mp_toradix_n.c */
@@ -8085,6 +8505,10 @@ int mp_toradix_n(mp_int * a, char *str, int radix, int maxlen)
 
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_toradix_n.c */
 
 /* Start: bn_mp_unsigned_bin_size.c */
@@ -8113,6 +8537,10 @@ int mp_unsigned_bin_size (mp_int * a)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_unsigned_bin_size.c */
 
 /* Start: bn_mp_xor.c */
@@ -8164,6 +8592,10 @@ mp_xor (mp_int * a, mp_int * b, mp_int * c)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_xor.c */
 
 /* Start: bn_mp_zero.c */
@@ -8200,6 +8632,10 @@ void mp_zero (mp_int * a)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_mp_zero.c */
 
 /* Start: bn_prime_tab.c */
@@ -8261,6 +8697,10 @@ const mp_digit ltm_prime_tab[] = {
 };
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_prime_tab.c */
 
 /* Start: bn_reverse.c */
@@ -8300,6 +8740,10 @@ bn_reverse (unsigned char *s, int len)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_reverse.c */
 
 /* Start: bn_s_mp_add.c */
@@ -8409,6 +8853,10 @@ s_mp_add (mp_int * a, mp_int * b, mp_int * c)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_s_mp_add.c */
 
 /* Start: bn_s_mp_exptmod.c */
@@ -8428,7 +8876,6 @@ s_mp_add (mp_int * a, mp_int * b, mp_int * c)
  *
  * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
  */
-
 #ifdef MP_LOW_MEM
    #define TAB_SIZE 32
 #else
@@ -8662,6 +9109,10 @@ LBL_M:
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_s_mp_exptmod.c */
 
 /* Start: bn_s_mp_mul_digs.c */
@@ -8752,6 +9203,10 @@ int s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_s_mp_mul_digs.c */
 
 /* Start: bn_s_mp_mul_high_digs.c */
@@ -8833,6 +9288,10 @@ s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_s_mp_mul_high_digs.c */
 
 /* Start: bn_s_mp_sqr.c */
@@ -8917,6 +9376,10 @@ int s_mp_sqr (mp_int * a, mp_int * b)
 }
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_s_mp_sqr.c */
 
 /* Start: bn_s_mp_sub.c */
@@ -9006,6 +9469,10 @@ s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
 
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bn_s_mp_sub.c */
 
 /* Start: bncore.c */
@@ -9031,17 +9498,21 @@ s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
  CPU                    /Compiler     /MUL CUTOFF/SQR CUTOFF
 -------------------------------------------------------------
  Intel P4 Northwood     /GCC v3.4.1   /        88/       128/LTM 0.32 ;-)
- AMD Athlon64           /GCC v3.4.4   /        74/       124/LTM 0.34
+ AMD Athlon64           /GCC v3.4.4   /        80/       120/LTM 0.35
  
 */
 
-int     KARATSUBA_MUL_CUTOFF = 74,      /* Min. number of digits before Karatsuba multiplication is used. */
-        KARATSUBA_SQR_CUTOFF = 124,     /* Min. number of digits before Karatsuba squaring is used. */
+int     KARATSUBA_MUL_CUTOFF = 80,      /* Min. number of digits before Karatsuba multiplication is used. */
+        KARATSUBA_SQR_CUTOFF = 120,     /* Min. number of digits before Karatsuba squaring is used. */
         
         TOOM_MUL_CUTOFF      = 350,      /* no optimal values of these are known yet so set em high */
         TOOM_SQR_CUTOFF      = 400; 
 #endif
 
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+
 /* End: bncore.c */
 
 
diff --git a/tommath.h b/tommath.h
index bcb9d86..80db43c 100644
--- a/tommath.h
+++ b/tommath.h
@@ -23,10 +23,13 @@
 
 #include <tommath_class.h>
 
-#undef MIN
-#define MIN(x,y) ((x)<(y)?(x):(y))
-#undef MAX
-#define MAX(x,y) ((x)>(y)?(x):(y))
+#ifndef MIN
+   #define MIN(x,y) ((x)<(y)?(x):(y))
+#endif
+
+#ifndef MAX
+   #define MAX(x,y) ((x)>(y)?(x):(y))
+#endif
 
 #ifdef __cplusplus
 extern "C" {
@@ -112,7 +115,7 @@ extern "C" {
    #else
       /* prototypes for our heap functions */
       extern void *XMALLOC(size_t n);
-      extern void *REALLOC(void *p, size_t n);
+      extern void *XREALLOC(void *p, size_t n);
       extern void *XCALLOC(size_t n, size_t s);
       extern void XFREE(void *p);
    #endif
@@ -147,7 +150,6 @@ extern "C" {
 /* Primality generation flags */
 #define LTM_PRIME_BBS      0x0001 /* BBS style prime */
 #define LTM_PRIME_SAFE     0x0002 /* Safe prime (p-1)/2 == prime */
-#define LTM_PRIME_2MSB_OFF 0x0004 /* force 2nd MSB to 0 */
 #define LTM_PRIME_2MSB_ON  0x0008 /* force 2nd MSB to 1 */
 
 typedef int           mp_err;
@@ -164,7 +166,7 @@ extern int KARATSUBA_MUL_CUTOFF,
 /* default precision */
 #ifndef MP_PREC
    #ifndef MP_LOW_MEM
-      #define MP_PREC                 64     /* default digits of precision */
+      #define MP_PREC                 32     /* default digits of precision */
    #else
       #define MP_PREC                 8      /* default digits of precision */
    #endif   
@@ -518,13 +520,13 @@ int mp_prime_random_ex(mp_int *a, int t, int size, int flags, ltm_prime_callback
 int mp_count_bits(mp_int *a);
 
 int mp_unsigned_bin_size(mp_int *a);
-int mp_read_unsigned_bin(mp_int *a, unsigned char *b, int c);
+int mp_read_unsigned_bin(mp_int *a, const unsigned char *b, int c);
 int mp_to_unsigned_bin(mp_int *a, unsigned char *b);
 int mp_to_unsigned_bin_n (mp_int * a, unsigned char *b, unsigned long *outlen);
 
 int mp_signed_bin_size(mp_int *a);
-int mp_read_signed_bin(mp_int *a, unsigned char *b, int c);
-int mp_to_signed_bin(mp_int *a, unsigned char *b);
+int mp_read_signed_bin(mp_int *a, const unsigned char *b, int c);
+int mp_to_signed_bin(mp_int *a,  unsigned char *b);
 int mp_to_signed_bin_n (mp_int * a, unsigned char *b, unsigned long *outlen);
 
 int mp_read_radix(mp_int *a, const char *str, int radix);
@@ -576,3 +578,7 @@ extern const char *mp_s_rmap;
 
 #endif
 
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/tommath.pdf b/tommath.pdf
index c486d29..08f6a1e 100644
Binary files a/tommath.pdf and b/tommath.pdf differ
diff --git a/tommath.src b/tommath.src
index 7a53860..b392ead 100644
--- a/tommath.src
+++ b/tommath.src
@@ -66,7 +66,7 @@ QUALCOMM Australia \\
 }
 }
 \maketitle
-This text has been placed in the public domain.  This text corresponds to the v0.35 release of the 
+This text has been placed in the public domain.  This text corresponds to the v0.36 release of the 
 LibTomMath project.
 
 \begin{alltt}
@@ -2775,26 +2775,25 @@ general purpose multiplication.  Given two polynomial basis representations $f(x
 light algebra \cite{KARAP} that the following polynomial is equivalent to multiplication of the two integers the polynomials represent.
 
 \begin{equation}
-f(x) \cdot g(x) = acx^2 + ((a - b)(c - d) - (ac + bd))x + bd
+f(x) \cdot g(x) = acx^2 + ((a + b)(c + d) - (ac + bd))x + bd
 \end{equation}
 
 Using the observation that $ac$ and $bd$ could be re-used only three half sized multiplications would be required to produce the product.  Applying
 this algorithm recursively, the work factor becomes $O(n^{lg(3)})$ which is substantially better than the work factor $O(n^2)$ of the Comba technique.  It turns 
 out what Karatsuba did not know or at least did not publish was that this is simply polynomial basis multiplication with the points 
-$\zeta_0$, $\zeta_{\infty}$ and $-\zeta_{-1}$.  Consider the resultant system of equations.
+$\zeta_0$, $\zeta_{\infty}$ and $\zeta_{1}$.  Consider the resultant system of equations.
 
 \begin{center}
 \begin{tabular}{rcrcrcrc}
 $\zeta_{0}$ &      $=$ &  &  &  & & $w_0$ \\
-$-\zeta_{-1}$ &    $=$ & $-w_2$ & $+$ & $w_1$ & $-$ & $w_0$ \\
+$\zeta_{1}$ &      $=$ & $w_2$ & $+$ & $w_1$ & $+$ & $w_0$ \\
 $\zeta_{\infty}$ & $=$ & $w_2$ &  & &  & \\
 \end{tabular}
 \end{center}
 
 By adding the first and last equation to the equation in the middle the term $w_1$ can be isolated and all three coefficients solved for.  The simplicity
 of this system of equations has made Karatsuba fairly popular.  In fact the cutoff point is often fairly low\footnote{With LibTomMath 0.18 it is 70 and 109 digits for the Intel P4 and AMD Athlon respectively.}
-making it an ideal algorithm to speed up certain public key cryptosystems such as RSA and Diffie-Hellman.  It is worth noting that the point 
-$\zeta_1$ could be substituted for $-\zeta_{-1}$.  In this case the first and third row are subtracted instead of added to the second row.  
+making it an ideal algorithm to speed up certain public key cryptosystems such as RSA and Diffie-Hellman.  
 
 \newpage\begin{figure}[!here]
 \begin{small}
@@ -2817,13 +2816,13 @@ Split the input.  e.g. $a = x1 \cdot \beta^B + x0$ \\
 Calculate the three products. \\
 8.  $x0y0 \leftarrow x0 \cdot y0$ (\textit{mp\_mul}) \\
 9.  $x1y1 \leftarrow x1 \cdot y1$ \\
-10.  $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\
-11.  $x0 \leftarrow y1 - y0$ \\
+10.  $t1 \leftarrow x1 + x0$ (\textit{s\_mp\_add}) \\
+11.  $x0 \leftarrow y1 + y0$ \\
 12.  $t1 \leftarrow t1 \cdot x0$ \\
 \\
 Calculate the middle term. \\
 13.  $x0 \leftarrow x0y0 + x1y1$ \\
-14.  $t1 \leftarrow x0 - t1$ \\
+14.  $t1 \leftarrow t1 - x0$ (\textit{s\_mp\_sub}) \\
 \\
 Calculate the final product. \\
 15.  $t1 \leftarrow t1 \cdot \beta^B$ (\textit{mp\_lshd}) \\
@@ -2850,7 +2849,7 @@ smallest input \textbf{used} count.  After the radix point is chosen the inputs
 compute the lower halves.  Step 6 and 7 computer the upper halves.  
 
 After the halves have been computed the three intermediate half-size products must be computed.  Step 8 and 9 compute the trivial products
-$x0 \cdot y0$ and $x1 \cdot y1$.  The mp\_int $x0$ is used as a temporary variable after $x1 - x0$ has been computed.  By using $x0$ instead
+$x0 \cdot y0$ and $x1 \cdot y1$.  The mp\_int $x0$ is used as a temporary variable after $x1 + x0$ has been computed.  By using $x0$ instead
 of an additional temporary variable, the algorithm can avoid an addition memory allocation operation.
 
 The remaining steps 13 through 18 compute the Karatsuba polynomial through a variety of digit shifting and addition operations.
@@ -3246,10 +3245,10 @@ Let $h(x) = \left ( f(x) \right )^2$ represent the square of the polynomial.  Th
 number with the following equation.
 
 \begin{equation}
-h(x) = a^2x^2 + \left (a^2 + b^2 - (a - b)^2 \right )x + b^2
+h(x) = a^2x^2 + \left ((a + b)^2 - (a^2 + b^2) \right )x + b^2
 \end{equation}
 
-Upon closer inspection this equation only requires the calculation of three half-sized squares: $a^2$, $b^2$ and $(a - b)^2$.  As in 
+Upon closer inspection this equation only requires the calculation of three half-sized squares: $a^2$, $b^2$ and $(a + b)^2$.  As in 
 Karatsuba multiplication, this algorithm can be applied recursively on the input and will achieve an asymptotic running time of 
 $O \left ( n^{lg(3)} \right )$.
 
@@ -3281,12 +3280,12 @@ Split the input.  e.g. $a = x1\beta^B + x0$ \\
 Calculate the three squares. \\
 6.  $x0x0 \leftarrow x0^2$ (\textit{mp\_sqr}) \\
 7.  $x1x1 \leftarrow x1^2$ \\
-8.  $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\
+8.  $t1 \leftarrow x1 + x0$ (\textit{s\_mp\_add}) \\
 9.  $t1 \leftarrow t1^2$ \\
 \\
 Compute the middle term. \\
 10.  $t2 \leftarrow x0x0 + x1x1$ (\textit{s\_mp\_add}) \\
-11.  $t1 \leftarrow t2 - t1$ \\
+11.  $t1 \leftarrow t1 - t2$ \\
 \\
 Compute final product. \\
 12.  $t1 \leftarrow t1\beta^B$ (\textit{mp\_lshd}) \\
@@ -3309,7 +3308,7 @@ The radix point for squaring is simply placed exactly in the middle of the digit
 placed just below the middle.  Step 3, 4 and 5 compute the two halves required using $B$
 as the radix point.  The first two squares in steps 6 and 7 are rather straightforward while the last square is of a more compact form.
 
-By expanding $\left (x1 - x0 \right )^2$, the $x1^2$ and $x0^2$ terms in the middle disappear, that is $x1^2 + x0^2 - (x1 - x0)^2 = 2 \cdot x0 \cdot x1$.
+By expanding $\left (x1 + x0 \right )^2$, the $x1^2$ and $x0^2$ terms in the middle disappear, that is $(x1 + x0)^2 - (x1^2 + x0^2) = 2 \cdot x0 \cdot x1$.
 Now if $5n$ single precision additions and a squaring of $n$-digits is faster than multiplying two $n$-digit numbers and doubling then
 this method is faster.  Assuming no further recursions occur, the difference can be estimated with the following inequality.
 
@@ -4035,7 +4034,7 @@ To calculate the variable $\rho$ a relatively simple algorithm will be required.
 \hline \\
 1.  $b \leftarrow n_0$ \\
 2.  If $b$ is even return(\textit{MP\_VAL}) \\
-3.  $x \leftarrow ((b + 2) \mbox{ AND } 4) << 1) + b$ \\
+3.  $x \leftarrow (((b + 2) \mbox{ AND } 4) << 1) + b$ \\
 4.  for $k$ from 0 to $\lceil lg(lg(\beta)) \rceil - 2$ do \\
 \hspace{3mm}4.1  $x \leftarrow x \cdot (2 - bx)$ \\
 5.  $\rho \leftarrow \beta - x \mbox{ (mod }\beta\mbox{)}$ \\
diff --git a/tommath.tex b/tommath.tex
index b016010..b69421b 100644
--- a/tommath.tex
+++ b/tommath.tex
@@ -66,7 +66,7 @@ QUALCOMM Australia \\
 }
 }
 \maketitle
-This text has been placed in the public domain.  This text corresponds to the v0.35 release of the 
+This text has been placed in the public domain.  This text corresponds to the v0.36 release of the 
 LibTomMath project.
 
 \begin{alltt}
@@ -814,6 +814,7 @@ decrementally.
 039     return MP_OKAY;
 040   \}
 041   #endif
+042   
 \end{alltt}
 \end{small}
 
@@ -902,6 +903,7 @@ with the exception of algorithms mp\_init, mp\_init\_copy, mp\_init\_size and mp
 037     \}
 038   \}
 039   #endif
+040   
 \end{alltt}
 \end{small}
 
@@ -1008,6 +1010,7 @@ assumed to contain undefined values they are initially set to zero.
 050     return MP_OKAY;
 051   \}
 052   #endif
+053   
 \end{alltt}
 \end{small}
 
@@ -1096,6 +1099,7 @@ correct no further memory re-allocations are required to work with the mp\_int.
 041     return MP_OKAY;
 042   \}
 043   #endif
+044   
 \end{alltt}
 \end{small}
 
@@ -1183,6 +1187,7 @@ initialization which allows for quick recovery from runtime errors.
 052   \}
 053   
 054   #endif
+055   
 \end{alltt}
 \end{small}
 
@@ -1268,6 +1273,7 @@ when all of the digits are zero to ensure that the mp\_int is valid at all times
 037     \}
 038   \}
 039   #endif
+040   
 \end{alltt}
 \end{small}
 
@@ -1405,6 +1411,7 @@ implement the pseudo-code.
 061     return MP_OKAY;
 062   \}
 063   #endif
+064   
 \end{alltt}
 \end{small}
 
@@ -1519,6 +1526,7 @@ such this algorithm will perform two operations in one step.
 025     return mp_copy (b, a);
 026   \}
 027   #endif
+028   
 \end{alltt}
 \end{small}
 
@@ -1570,6 +1578,7 @@ This algorithm simply resets a mp\_int to the default state.
 029     \}
 030   \}
 031   #endif
+032   
 \end{alltt}
 \end{small}
 
@@ -1631,6 +1640,7 @@ logic to handle it.
 036     return MP_OKAY;
 037   \}
 038   #endif
+039   
 \end{alltt}
 \end{small}
 
@@ -1692,6 +1702,7 @@ zero as negative.
 033     return MP_OKAY;
 034   \}
 035   #endif
+036   
 \end{alltt}
 \end{small}
 
@@ -1739,6 +1750,7 @@ single digit is set (\textit{modulo $\beta$}) and the \textbf{used} count is adj
 022     a->used  = (a->dp[0] != 0) ? 1 : 0;
 023   \}
 024   #endif
+025   
 \end{alltt}
 \end{small}
 
@@ -1819,6 +1831,7 @@ Excess zero digits are trimmed in steps 2.1 and 3 by using higher level algorith
 041     return MP_OKAY;
 042   \}
 043   #endif
+044   
 \end{alltt}
 \end{small}
 
@@ -1921,6 +1934,7 @@ the zero'th digit.  If after all of the digits have been compared, no difference
 048     return MP_EQ;
 049   \}
 050   #endif
+051   
 \end{alltt}
 \end{small}
 
@@ -1987,6 +2001,7 @@ $\vert a \vert < \vert b \vert$.  Step number four will compare the two when the
 036     \}
 037   \}
 038   #endif
+039   
 \end{alltt}
 \end{small}
 
@@ -2205,6 +2220,7 @@ The final carry is stored in $c_{max}$ and digits above $max$ upto $oldused$ are
 102     return MP_OKAY;
 103   \}
 104   #endif
+105   
 \end{alltt}
 \end{small}
 
@@ -2376,6 +2392,7 @@ If $b$ has a smaller magnitude than $a$ then step 9 will force the carry and cop
 082   \}
 083   
 084   #endif
+085   
 \end{alltt}
 \end{small}
 
@@ -2511,6 +2528,7 @@ within algorithm s\_mp\_add will force $-0$ to become $0$.
 046   \}
 047   
 048   #endif
+049   
 \end{alltt}
 \end{small}
 
@@ -2623,6 +2641,7 @@ algorithm from producing $-a - -a = -0$ as a result.
 052   \}
 053   
 054   #endif
+055   
 \end{alltt}
 \end{small}
 
@@ -2757,6 +2776,7 @@ Step 8 clears any leading digits of $b$ in case it originally had a larger magni
 075     return MP_OKAY;
 076   \}
 077   #endif
+078   
 \end{alltt}
 \end{small}
 
@@ -2857,6 +2877,7 @@ least significant bit not the most significant bit.
 061     return MP_OKAY;
 062   \}
 063   #endif
+064   
 \end{alltt}
 \end{small}
 
@@ -2977,6 +2998,7 @@ step 8 sets the lower $b$ digits to zero.
 060     return MP_OKAY;
 061   \}
 062   #endif
+063   
 \end{alltt}
 \end{small}
 
@@ -3088,6 +3110,7 @@ Once the window copy is complete the upper digits must be zeroed and the \textbf
 065     a->used -= b;
 066   \}
 067   #endif
+068   
 \end{alltt}
 \end{small}
 
@@ -3221,6 +3244,7 @@ complete.  It is possible to optimize this algorithm down to a $O(n)$ algorithm
 078     return MP_OKAY;
 079   \}
 080   #endif
+081   
 \end{alltt}
 \end{small}
 
@@ -3357,6 +3381,7 @@ by using algorithm mp\_mod\_2d.
 090     return MP_OKAY;
 091   \}
 092   #endif
+093   
 \end{alltt}
 \end{small}
 
@@ -3448,6 +3473,7 @@ is copied to $b$, leading digits are removed and the remaining leading digit is
 048     return MP_OKAY;
 049   \}
 050   #endif
+051   
 \end{alltt}
 \end{small}
 
@@ -3687,6 +3713,7 @@ exceed the precision requested.
 083     return MP_OKAY;
 084   \}
 085   #endif
+086   
 \end{alltt}
 \end{small}
 
@@ -3942,39 +3969,41 @@ and addition operations in the nested loop in parallel.
 069         /* execute loop */
 070         for (iz = 0; iz < iy; ++iz) \{
 071            _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
-072         \}
-073   
-074         /* store term */
-075         W[ix] = ((mp_digit)_W) & MP_MASK;
-076   
-077         /* make next carry */
-078         _W = _W >> ((mp_word)DIGIT_BIT);
-079     \}
-080   
-081     /* store final carry */
-082     W[ix] = (mp_digit)(_W & MP_MASK);
-083   
-084     /* setup dest */
-085     olduse  = c->used;
-086     c->used = pa;
-087   
-088     \{
-089       register mp_digit *tmpc;
-090       tmpc = c->dp;
-091       for (ix = 0; ix < pa+1; ix++) \{
-092         /* now extract the previous digit [below the carry] */
-093         *tmpc++ = W[ix];
-094       \}
-095   
-096       /* clear unused digits [that existed in the old copy of c] */
-097       for (; ix < olduse; ix++) \{
-098         *tmpc++ = 0;
-099       \}
-100     \}
-101     mp_clamp (c);
-102     return MP_OKAY;
-103   \}
-104   #endif
+072   
+073         \}
+074   
+075         /* store term */
+076         W[ix] = ((mp_digit)_W) & MP_MASK;
+077   
+078         /* make next carry */
+079         _W = _W >> ((mp_word)DIGIT_BIT);
+080     \}
+081   
+082     /* store final carry */
+083     W[ix] = (mp_digit)(_W & MP_MASK);
+084   
+085     /* setup dest */
+086     olduse  = c->used;
+087     c->used = pa;
+088   
+089     \{
+090       register mp_digit *tmpc;
+091       tmpc = c->dp;
+092       for (ix = 0; ix < pa+1; ix++) \{
+093         /* now extract the previous digit [below the carry] */
+094         *tmpc++ = W[ix];
+095       \}
+096   
+097       /* clear unused digits [that existed in the old copy of c] */
+098       for (; ix < olduse; ix++) \{
+099         *tmpc++ = 0;
+100       \}
+101     \}
+102     mp_clamp (c);
+103     return MP_OKAY;
+104   \}
+105   #endif
+106   
 \end{alltt}
 \end{small}
 
@@ -3982,7 +4011,7 @@ As per the pseudo--code we first calculate $pa$ (line 47) as the number of digit
 to produce the individual columns of the product.  We use the two aliases $tmpx$ and $tmpy$ (lines 61, 62) to point
 inside the two multiplicands quickly.  
 
-The inner loop (lines 70 to 72) of this implementation is where the tradeoff come into play.  Originally this comba 
+The inner loop (lines 70 to 73) of this implementation is where the tradeoff comes into play.  Originally this comba 
 implementation was ``row--major'' which means it adds to each of the columns in each pass.  After the outer loop it would then fix 
 the carries.  This was very fast except it had an annoying drawback.  You had to read a mp\_word and two mp\_digits and write 
 one mp\_word per iteration.  On processors such as the Athlon XP and P4 this did not matter much since the cache bandwidth 
@@ -3990,8 +4019,8 @@ is very high and it can keep the ALU fed with data.  It did, however, matter on
 slower and also often doesn't exist.  This new algorithm only performs two reads per iteration under the assumption that the 
 compiler has aliased $\_ \hat W$ to a CPU register.
 
-After the inner loop we store the current accumulator in $W$ and shift $\_ \hat W$ (lines 75, 78) to forward it as 
-a carry for the next pass.  After the outer loop we use the final carry (line 82) as the last digit of the product.  
+After the inner loop we store the current accumulator in $W$ and shift $\_ \hat W$ (lines 76, 79) to forward it as 
+a carry for the next pass.  After the outer loop we use the final carry (line 83) as the last digit of the product.  
 
 \subsection{Polynomial Basis Multiplication}
 To break the $O(n^2)$ barrier in multiplication requires a completely different look at integer multiplication.  In the following algorithms
@@ -4095,26 +4124,25 @@ general purpose multiplication.  Given two polynomial basis representations $f(x
 light algebra \cite{KARAP} that the following polynomial is equivalent to multiplication of the two integers the polynomials represent.
 
 \begin{equation}
-f(x) \cdot g(x) = acx^2 + ((a - b)(c - d) - (ac + bd))x + bd
+f(x) \cdot g(x) = acx^2 + ((a + b)(c + d) - (ac + bd))x + bd
 \end{equation}
 
 Using the observation that $ac$ and $bd$ could be re-used only three half sized multiplications would be required to produce the product.  Applying
 this algorithm recursively, the work factor becomes $O(n^{lg(3)})$ which is substantially better than the work factor $O(n^2)$ of the Comba technique.  It turns 
 out what Karatsuba did not know or at least did not publish was that this is simply polynomial basis multiplication with the points 
-$\zeta_0$, $\zeta_{\infty}$ and $-\zeta_{-1}$.  Consider the resultant system of equations.
+$\zeta_0$, $\zeta_{\infty}$ and $\zeta_{1}$.  Consider the resultant system of equations.
 
 \begin{center}
 \begin{tabular}{rcrcrcrc}
 $\zeta_{0}$ &      $=$ &  &  &  & & $w_0$ \\
-$-\zeta_{-1}$ &    $=$ & $-w_2$ & $+$ & $w_1$ & $-$ & $w_0$ \\
+$\zeta_{1}$ &      $=$ & $w_2$ & $+$ & $w_1$ & $+$ & $w_0$ \\
 $\zeta_{\infty}$ & $=$ & $w_2$ &  & &  & \\
 \end{tabular}
 \end{center}
 
 By adding the first and last equation to the equation in the middle the term $w_1$ can be isolated and all three coefficients solved for.  The simplicity
 of this system of equations has made Karatsuba fairly popular.  In fact the cutoff point is often fairly low\footnote{With LibTomMath 0.18 it is 70 and 109 digits for the Intel P4 and AMD Athlon respectively.}
-making it an ideal algorithm to speed up certain public key cryptosystems such as RSA and Diffie-Hellman.  It is worth noting that the point 
-$\zeta_1$ could be substituted for $-\zeta_{-1}$.  In this case the first and third row are subtracted instead of added to the second row.  
+making it an ideal algorithm to speed up certain public key cryptosystems such as RSA and Diffie-Hellman.  
 
 \newpage\begin{figure}[!here]
 \begin{small}
@@ -4137,13 +4165,13 @@ Split the input.  e.g. $a = x1 \cdot \beta^B + x0$ \\
 Calculate the three products. \\
 8.  $x0y0 \leftarrow x0 \cdot y0$ (\textit{mp\_mul}) \\
 9.  $x1y1 \leftarrow x1 \cdot y1$ \\
-10.  $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\
-11.  $x0 \leftarrow y1 - y0$ \\
+10.  $t1 \leftarrow x1 + x0$ (\textit{s\_mp\_add}) \\
+11.  $x0 \leftarrow y1 + y0$ \\
 12.  $t1 \leftarrow t1 \cdot x0$ \\
 \\
 Calculate the middle term. \\
 13.  $x0 \leftarrow x0y0 + x1y1$ \\
-14.  $t1 \leftarrow x0 - t1$ \\
+14.  $t1 \leftarrow t1 - x0$ (\textit{s\_mp\_sub}) \\
 \\
 Calculate the final product. \\
 15.  $t1 \leftarrow t1 \cdot \beta^B$ (\textit{mp\_lshd}) \\
@@ -4170,7 +4198,7 @@ smallest input \textbf{used} count.  After the radix point is chosen the inputs
 compute the lower halves.  Step 6 and 7 computer the upper halves.  
 
 After the halves have been computed the three intermediate half-size products must be computed.  Step 8 and 9 compute the trivial products
-$x0 \cdot y0$ and $x1 \cdot y1$.  The mp\_int $x0$ is used as a temporary variable after $x1 - x0$ has been computed.  By using $x0$ instead
+$x0 \cdot y0$ and $x1 \cdot y1$.  The mp\_int $x0$ is used as a temporary variable after $x1 + x0$ has been computed.  By using $x0$ instead
 of an additional temporary variable, the algorithm can avoid an addition memory allocation operation.
 
 The remaining steps 13 through 18 compute the Karatsuba polynomial through a variety of digit shifting and addition operations.
@@ -4191,12 +4219,12 @@ The remaining steps 13 through 18 compute the Karatsuba polynomial through a var
 025    * b = b1 * B**n + b0
 026    *
 027    * Then, a * b => 
-028      a1b1 * B**2n + ((a1 - a0)(b1 - b0) + a0b0 + a1b1) * B + a0b0
+028      a1b1 * B**2n + ((a1 + a0)(b1 + b0) - (a0b0 + a1b1)) * B**n + a0b0
 029    *
 030    * Note that a1b1 and a0b0 are used twice and only need to be 
 031    * computed once.  So in total three half size (half # of 
 032    * digit) multiplications are performed, a0b0, a1b1 and 
-033    * (a1-b1)(a0-b0)
+033    * (a1+a0)(b1+b0)
 034    *
 035    * Note that a multiplication of half the digits requires
 036    * 1/4th the number of single precision multiplications so in 
@@ -4287,19 +4315,19 @@ The remaining steps 13 through 18 compute the Karatsuba polynomial through a var
 121     if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
 122       goto X1Y1;          /* x1y1 = x1*y1 */
 123   
-124     /* now calc x1-x0 and y1-y0 */
-125     if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
+124     /* now calc x1+x0 and y1+y0 */
+125     if (s_mp_add (&x1, &x0, &t1) != MP_OKAY)
 126       goto X1Y1;          /* t1 = x1 - x0 */
-127     if (mp_sub (&y1, &y0, &x0) != MP_OKAY)
+127     if (s_mp_add (&y1, &y0, &x0) != MP_OKAY)
 128       goto X1Y1;          /* t2 = y1 - y0 */
 129     if (mp_mul (&t1, &x0, &t1) != MP_OKAY)
-130       goto X1Y1;          /* t1 = (x1 - x0) * (y1 - y0) */
+130       goto X1Y1;          /* t1 = (x1 + x0) * (y1 + y0) */
 131   
 132     /* add x0y0 */
 133     if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY)
 134       goto X1Y1;          /* t2 = x0y0 + x1y1 */
-135     if (mp_sub (&x0, &t1, &t1) != MP_OKAY)
-136       goto X1Y1;          /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
+135     if (s_mp_sub (&t1, &x0, &t1) != MP_OKAY)
+136       goto X1Y1;          /* t1 = (x1+x0)*(y1+y0) - (x1y1 + x0y0) */
 137   
 138     /* shift by B */
 139     if (mp_lshd (&t1, B) != MP_OKAY)
@@ -4326,6 +4354,7 @@ The remaining steps 13 through 18 compute the Karatsuba polynomial through a var
 160     return err;
 161   \}
 162   #endif
+163   
 \end{alltt}
 \end{small}
 
@@ -4729,6 +4758,7 @@ result $a \cdot b$ is produced.
 277   \}     
 278        
 279   #endif
+280   
 \end{alltt}
 \end{small}
 
@@ -4837,6 +4867,7 @@ s\_mp\_mul\_digs will clear it.
 059     return res;
 060   \}
 061   #endif
+062   
 \end{alltt}
 \end{small}
 
@@ -5006,6 +5037,7 @@ results calculated so far.  This involves expensive carry propagation which will
 077     return MP_OKAY;
 078   \}
 079   #endif
+080   
 \end{alltt}
 \end{small}
 
@@ -5188,6 +5220,7 @@ only to even outputs and it is the square of the term at the $\lfloor ix / 2 \rf
 107     return MP_OKAY;
 108   \}
 109   #endif
+110   
 \end{alltt}
 \end{small}
 
@@ -5205,10 +5238,10 @@ Let $h(x) = \left ( f(x) \right )^2$ represent the square of the polynomial.  Th
 number with the following equation.
 
 \begin{equation}
-h(x) = a^2x^2 + \left (a^2 + b^2 - (a - b)^2 \right )x + b^2
+h(x) = a^2x^2 + \left ((a + b)^2 - (a^2 + b^2) \right )x + b^2
 \end{equation}
 
-Upon closer inspection this equation only requires the calculation of three half-sized squares: $a^2$, $b^2$ and $(a - b)^2$.  As in 
+Upon closer inspection this equation only requires the calculation of three half-sized squares: $a^2$, $b^2$ and $(a + b)^2$.  As in 
 Karatsuba multiplication, this algorithm can be applied recursively on the input and will achieve an asymptotic running time of 
 $O \left ( n^{lg(3)} \right )$.
 
@@ -5240,12 +5273,12 @@ Split the input.  e.g. $a = x1\beta^B + x0$ \\
 Calculate the three squares. \\
 6.  $x0x0 \leftarrow x0^2$ (\textit{mp\_sqr}) \\
 7.  $x1x1 \leftarrow x1^2$ \\
-8.  $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\
+8.  $t1 \leftarrow x1 + x0$ (\textit{s\_mp\_add}) \\
 9.  $t1 \leftarrow t1^2$ \\
 \\
 Compute the middle term. \\
 10.  $t2 \leftarrow x0x0 + x1x1$ (\textit{s\_mp\_add}) \\
-11.  $t1 \leftarrow t2 - t1$ \\
+11.  $t1 \leftarrow t1 - t2$ (\textit{s\_mp\_sub}) \\
 \\
 Compute final product. \\
 12.  $t1 \leftarrow t1\beta^B$ (\textit{mp\_lshd}) \\
@@ -5268,7 +5301,7 @@ The radix point for squaring is simply placed exactly in the middle of the digit
 placed just below the middle.  Step 3, 4 and 5 compute the two halves required using $B$
 as the radix point.  The first two squares in steps 6 and 7 are rather straightforward while the last square is of a more compact form.
 
-By expanding $\left (x1 - x0 \right )^2$, the $x1^2$ and $x0^2$ terms in the middle disappear, that is $x1^2 + x0^2 - (x1 - x0)^2 = 2 \cdot x0 \cdot x1$.
+By expanding $\left (x1 + x0 \right )^2$, the $x1^2$ and $x0^2$ terms in the middle disappear, that is $(x1 + x0)^2 - (x1^2 + x0^2)  = 2 \cdot x0 \cdot x1$.
 Now if $5n$ single precision additions and a squaring of $n$-digits is faster than multiplying two $n$-digit numbers and doubling then
 this method is faster.  Assuming no further recursions occur, the difference can be estimated with the following inequality.
 
@@ -5363,8 +5396,8 @@ ratio of 1:7.  } than simpler operations such as addition.
 079     if (mp_sqr (&x1, &x1x1) != MP_OKAY)
 080       goto X1X1;           /* x1x1 = x1*x1 */
 081   
-082     /* now calc (x1-x0)**2 */
-083     if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
+082     /* now calc (x1+x0)**2 */
+083     if (s_mp_add (&x1, &x0, &t1) != MP_OKAY)
 084       goto X1X1;           /* t1 = x1 - x0 */
 085     if (mp_sqr (&t1, &t1) != MP_OKAY)
 086       goto X1X1;           /* t1 = (x1 - x0) * (x1 - x0) */
@@ -5372,8 +5405,8 @@ ratio of 1:7.  } than simpler operations such as addition.
 088     /* add x0y0 */
 089     if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
 090       goto X1X1;           /* t2 = x0x0 + x1x1 */
-091     if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
-092       goto X1X1;           /* t1 = x0x0 + x1x1 - (x1-x0)*(x1-x0) */
+091     if (s_mp_sub (&t1, &t2, &t1) != MP_OKAY)
+092       goto X1X1;           /* t1 = (x1+x0)**2 - (x0x0 + x1x1) */
 093   
 094     /* shift by B */
 095     if (mp_lshd (&t1, B) != MP_OKAY)
@@ -5398,6 +5431,7 @@ ratio of 1:7.  } than simpler operations such as addition.
 114     return err;
 115   \}
 116   #endif
+117   
 \end{alltt}
 \end{small}
 
@@ -5494,6 +5528,7 @@ neither of the polynomial basis algorithms should be used then either the Comba
 051     return res;
 052   \}
 053   #endif
+054   
 \end{alltt}
 \end{small}
 
@@ -5827,6 +5862,7 @@ performed at most twice, and on average once. However, if $a \ge b^2$ than it wi
 093     return res;
 094   \}
 095   #endif
+096   
 \end{alltt}
 \end{small}
 
@@ -5879,6 +5915,7 @@ is equivalent and much faster.  The final value is computed by taking the intege
 027     return mp_div (a, b, a, NULL);
 028   \}
 029   #endif
+030   
 \end{alltt}
 \end{small}
 
@@ -6234,6 +6271,7 @@ multiplications.
 111     return MP_OKAY;
 112   \}
 113   #endif
+114   
 \end{alltt}
 \end{small}
 
@@ -6478,6 +6516,7 @@ stored in the destination $x$.
 165     return MP_OKAY;
 166   \}
 167   #endif
+168   
 \end{alltt}
 \end{small}
 
@@ -6505,7 +6544,7 @@ To calculate the variable $\rho$ a relatively simple algorithm will be required.
 \hline \\
 1.  $b \leftarrow n_0$ \\
 2.  If $b$ is even return(\textit{MP\_VAL}) \\
-3.  $x \leftarrow ((b + 2) \mbox{ AND } 4) << 1) + b$ \\
+3.  $x \leftarrow (((b + 2) \mbox{ AND } 4) << 1) + b$ \\
 4.  for $k$ from 0 to $\lceil lg(lg(\beta)) \rceil - 2$ do \\
 \hspace{3mm}4.1  $x \leftarrow x \cdot (2 - bx)$ \\
 5.  $\rho \leftarrow \beta - x \mbox{ (mod }\beta\mbox{)}$ \\
@@ -6564,6 +6603,7 @@ to calculate $1/n_0$ when $\beta$ is a power of two.
 052     return MP_OKAY;
 053   \}
 054   #endif
+055   
 \end{alltt}
 \end{small}
 
@@ -6830,6 +6870,7 @@ at step 3.
 087     return MP_OKAY;
 088   \}
 089   #endif
+090   
 \end{alltt}
 \end{small}
 
@@ -6885,6 +6926,7 @@ completeness.
 025   \}
 026   
 027   #endif
+028   
 \end{alltt}
 \end{small}
 
@@ -6943,6 +6985,7 @@ step 3 then $n$ must be of Diminished Radix form.
 036   \}
 037   
 038   #endif
+039   
 \end{alltt}
 \end{small}
 
@@ -7027,6 +7070,7 @@ shift which makes the algorithm fairly inexpensive to use.
 054   \}
 055   
 056   #endif
+057   
 \end{alltt}
 \end{small}
 
@@ -7096,6 +7140,7 @@ is sufficient to solve for $k$.  Alternatively if $n$ has more than one digit th
 040      return MP_OKAY;
 041   \}
 042   #endif
+043   
 \end{alltt}
 \end{small}
 
@@ -7172,6 +7217,7 @@ This algorithm quickly determines if a modulus is of the form required for algor
 045   \}
 046   
 047   #endif
+048   
 \end{alltt}
 \end{small}
 
@@ -7381,6 +7427,7 @@ iteration of the loop moves the bits of the exponent $b$ upwards to the most sig
 050     return MP_OKAY;
 051   \}
 052   #endif
+053   
 \end{alltt}
 \end{small}
 
@@ -7620,7 +7667,8 @@ algorithm since their arguments are essentially the same (\textit{two mp\_ints a
 065     \}
 066   
 067   /* modified diminished radix reduction */
-068   #if defined(BN_MP_REDUCE_IS_2K_L_C) && defined(BN_MP_REDUCE_2K_L_C)
+068   #if defined(BN_MP_REDUCE_IS_2K_L_C) && defined(BN_MP_REDUCE_2K_L_C) && defin
+      ed(BN_S_MP_EXPTMOD_C)
 069     if (mp_reduce_is_2k_l(P) == MP_YES) \{
 070        return s_mp_exptmod(G, X, P, Y, 1);
 071     \}
@@ -7660,6 +7708,7 @@ algorithm since their arguments are essentially the same (\textit{two mp\_ints a
 105   \}
 106   
 107   #endif
+108   
 \end{alltt}
 \end{small}
 
@@ -7839,251 +7888,251 @@ a Left-to-Right algorithm is used to process the remaining few bits.
 \hspace{-5.1mm}{\bf File}: bn\_s\_mp\_exptmod.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   #ifdef MP_LOW_MEM
-018      #define TAB_SIZE 32
-019   #else
-020      #define TAB_SIZE 256
-021   #endif
-022   
-023   int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmod
+016   #ifdef MP_LOW_MEM
+017      #define TAB_SIZE 32
+018   #else
+019      #define TAB_SIZE 256
+020   #endif
+021   
+022   int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmod
       e)
-024   \{
-025     mp_int  M[TAB_SIZE], res, mu;
-026     mp_digit buf;
-027     int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
-028     int (*redux)(mp_int*,mp_int*,mp_int*);
-029   
-030     /* find window size */
-031     x = mp_count_bits (X);
-032     if (x <= 7) \{
-033       winsize = 2;
-034     \} else if (x <= 36) \{
-035       winsize = 3;
-036     \} else if (x <= 140) \{
-037       winsize = 4;
-038     \} else if (x <= 450) \{
-039       winsize = 5;
-040     \} else if (x <= 1303) \{
-041       winsize = 6;
-042     \} else if (x <= 3529) \{
-043       winsize = 7;
-044     \} else \{
-045       winsize = 8;
-046     \}
-047   
-048   #ifdef MP_LOW_MEM
-049       if (winsize > 5) \{
-050          winsize = 5;
-051       \}
-052   #endif
-053   
-054     /* init M array */
-055     /* init first cell */
-056     if ((err = mp_init(&M[1])) != MP_OKAY) \{
-057        return err; 
-058     \}
-059   
-060     /* now init the second half of the array */
-061     for (x = 1<<(winsize-1); x < (1 << winsize); x++) \{
-062       if ((err = mp_init(&M[x])) != MP_OKAY) \{
-063         for (y = 1<<(winsize-1); y < x; y++) \{
-064           mp_clear (&M[y]);
-065         \}
-066         mp_clear(&M[1]);
-067         return err;
-068       \}
-069     \}
-070   
-071     /* create mu, used for Barrett reduction */
-072     if ((err = mp_init (&mu)) != MP_OKAY) \{
-073       goto LBL_M;
-074     \}
-075     
-076     if (redmode == 0) \{
-077        if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) \{
-078           goto LBL_MU;
-079        \}
-080        redux = mp_reduce;
-081     \} else \{
-082        if ((err = mp_reduce_2k_setup_l (P, &mu)) != MP_OKAY) \{
-083           goto LBL_MU;
-084        \}
-085        redux = mp_reduce_2k_l;
-086     \}    
-087   
-088     /* create M table
-089      *
-090      * The M table contains powers of the base, 
-091      * e.g. M[x] = G**x mod P
-092      *
-093      * The first half of the table is not 
-094      * computed though accept for M[0] and M[1]
-095      */
-096     if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) \{
-097       goto LBL_MU;
-098     \}
-099   
-100     /* compute the value at M[1<<(winsize-1)] by squaring 
-101      * M[1] (winsize-1) times 
-102      */
-103     if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) \{
-104       goto LBL_MU;
-105     \}
-106   
-107     for (x = 0; x < (winsize - 1); x++) \{
-108       /* square it */
-109       if ((err = mp_sqr (&M[1 << (winsize - 1)], 
-110                          &M[1 << (winsize - 1)])) != MP_OKAY) \{
-111         goto LBL_MU;
-112       \}
-113   
-114       /* reduce modulo P */
-115       if ((err = redux (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) \{
-116         goto LBL_MU;
-117       \}
-118     \}
-119   
-120     /* create upper table, that is M[x] = M[x-1] * M[1] (mod P)
-121      * for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
-122      */
-123     for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) \{
-124       if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) \{
-125         goto LBL_MU;
-126       \}
-127       if ((err = redux (&M[x], P, &mu)) != MP_OKAY) \{
-128         goto LBL_MU;
-129       \}
-130     \}
-131   
-132     /* setup result */
-133     if ((err = mp_init (&res)) != MP_OKAY) \{
-134       goto LBL_MU;
-135     \}
-136     mp_set (&res, 1);
-137   
-138     /* set initial mode and bit cnt */
-139     mode   = 0;
-140     bitcnt = 1;
-141     buf    = 0;
-142     digidx = X->used - 1;
-143     bitcpy = 0;
-144     bitbuf = 0;
-145   
-146     for (;;) \{
-147       /* grab next digit as required */
-148       if (--bitcnt == 0) \{
-149         /* if digidx == -1 we are out of digits */
-150         if (digidx == -1) \{
-151           break;
-152         \}
-153         /* read next digit and reset the bitcnt */
-154         buf    = X->dp[digidx--];
-155         bitcnt = (int) DIGIT_BIT;
-156       \}
-157   
-158       /* grab the next msb from the exponent */
-159       y     = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
-160       buf <<= (mp_digit)1;
-161   
-162       /* if the bit is zero and mode == 0 then we ignore it
-163        * These represent the leading zero bits before the first 1 bit
-164        * in the exponent.  Technically this opt is not required but it
-165        * does lower the # of trivial squaring/reductions used
-166        */
-167       if (mode == 0 && y == 0) \{
-168         continue;
-169       \}
-170   
-171       /* if the bit is zero and mode == 1 then we square */
-172       if (mode == 1 && y == 0) \{
-173         if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
-174           goto LBL_RES;
-175         \}
-176         if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
-177           goto LBL_RES;
-178         \}
-179         continue;
-180       \}
-181   
-182       /* else we add it to the window */
-183       bitbuf |= (y << (winsize - ++bitcpy));
-184       mode    = 2;
-185   
-186       if (bitcpy == winsize) \{
-187         /* ok window is filled so square as required and multiply  */
-188         /* square first */
-189         for (x = 0; x < winsize; x++) \{
-190           if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
-191             goto LBL_RES;
-192           \}
-193           if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
-194             goto LBL_RES;
-195           \}
-196         \}
-197   
-198         /* then multiply */
-199         if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) \{
-200           goto LBL_RES;
-201         \}
-202         if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
-203           goto LBL_RES;
-204         \}
-205   
-206         /* empty window and reset */
-207         bitcpy = 0;
-208         bitbuf = 0;
-209         mode   = 1;
-210       \}
-211     \}
-212   
-213     /* if bits remain then square/multiply */
-214     if (mode == 2 && bitcpy > 0) \{
-215       /* square then multiply if the bit is set */
-216       for (x = 0; x < bitcpy; x++) \{
-217         if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
-218           goto LBL_RES;
-219         \}
-220         if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
-221           goto LBL_RES;
-222         \}
-223   
-224         bitbuf <<= 1;
-225         if ((bitbuf & (1 << winsize)) != 0) \{
-226           /* then multiply */
-227           if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) \{
-228             goto LBL_RES;
-229           \}
-230           if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
-231             goto LBL_RES;
-232           \}
-233         \}
-234       \}
-235     \}
-236   
-237     mp_exch (&res, Y);
-238     err = MP_OKAY;
-239   LBL_RES:mp_clear (&res);
-240   LBL_MU:mp_clear (&mu);
-241   LBL_M:
-242     mp_clear(&M[1]);
-243     for (x = 1<<(winsize-1); x < (1 << winsize); x++) \{
-244       mp_clear (&M[x]);
-245     \}
-246     return err;
-247   \}
-248   #endif
+023   \{
+024     mp_int  M[TAB_SIZE], res, mu;
+025     mp_digit buf;
+026     int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+027     int (*redux)(mp_int*,mp_int*,mp_int*);
+028   
+029     /* find window size */
+030     x = mp_count_bits (X);
+031     if (x <= 7) \{
+032       winsize = 2;
+033     \} else if (x <= 36) \{
+034       winsize = 3;
+035     \} else if (x <= 140) \{
+036       winsize = 4;
+037     \} else if (x <= 450) \{
+038       winsize = 5;
+039     \} else if (x <= 1303) \{
+040       winsize = 6;
+041     \} else if (x <= 3529) \{
+042       winsize = 7;
+043     \} else \{
+044       winsize = 8;
+045     \}
+046   
+047   #ifdef MP_LOW_MEM
+048       if (winsize > 5) \{
+049          winsize = 5;
+050       \}
+051   #endif
+052   
+053     /* init M array */
+054     /* init first cell */
+055     if ((err = mp_init(&M[1])) != MP_OKAY) \{
+056        return err; 
+057     \}
+058   
+059     /* now init the second half of the array */
+060     for (x = 1<<(winsize-1); x < (1 << winsize); x++) \{
+061       if ((err = mp_init(&M[x])) != MP_OKAY) \{
+062         for (y = 1<<(winsize-1); y < x; y++) \{
+063           mp_clear (&M[y]);
+064         \}
+065         mp_clear(&M[1]);
+066         return err;
+067       \}
+068     \}
+069   
+070     /* create mu, used for Barrett reduction */
+071     if ((err = mp_init (&mu)) != MP_OKAY) \{
+072       goto LBL_M;
+073     \}
+074     
+075     if (redmode == 0) \{
+076        if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) \{
+077           goto LBL_MU;
+078        \}
+079        redux = mp_reduce;
+080     \} else \{
+081        if ((err = mp_reduce_2k_setup_l (P, &mu)) != MP_OKAY) \{
+082           goto LBL_MU;
+083        \}
+084        redux = mp_reduce_2k_l;
+085     \}    
+086   
+087     /* create M table
+088      *
+089      * The M table contains powers of the base, 
+090      * e.g. M[x] = G**x mod P
+091      *
+092      * The first half of the table is not 
+093      * computed though accept for M[0] and M[1]
+094      */
+095     if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) \{
+096       goto LBL_MU;
+097     \}
+098   
+099     /* compute the value at M[1<<(winsize-1)] by squaring 
+100      * M[1] (winsize-1) times 
+101      */
+102     if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) \{
+103       goto LBL_MU;
+104     \}
+105   
+106     for (x = 0; x < (winsize - 1); x++) \{
+107       /* square it */
+108       if ((err = mp_sqr (&M[1 << (winsize - 1)], 
+109                          &M[1 << (winsize - 1)])) != MP_OKAY) \{
+110         goto LBL_MU;
+111       \}
+112   
+113       /* reduce modulo P */
+114       if ((err = redux (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) \{
+115         goto LBL_MU;
+116       \}
+117     \}
+118   
+119     /* create upper table, that is M[x] = M[x-1] * M[1] (mod P)
+120      * for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
+121      */
+122     for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) \{
+123       if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) \{
+124         goto LBL_MU;
+125       \}
+126       if ((err = redux (&M[x], P, &mu)) != MP_OKAY) \{
+127         goto LBL_MU;
+128       \}
+129     \}
+130   
+131     /* setup result */
+132     if ((err = mp_init (&res)) != MP_OKAY) \{
+133       goto LBL_MU;
+134     \}
+135     mp_set (&res, 1);
+136   
+137     /* set initial mode and bit cnt */
+138     mode   = 0;
+139     bitcnt = 1;
+140     buf    = 0;
+141     digidx = X->used - 1;
+142     bitcpy = 0;
+143     bitbuf = 0;
+144   
+145     for (;;) \{
+146       /* grab next digit as required */
+147       if (--bitcnt == 0) \{
+148         /* if digidx == -1 we are out of digits */
+149         if (digidx == -1) \{
+150           break;
+151         \}
+152         /* read next digit and reset the bitcnt */
+153         buf    = X->dp[digidx--];
+154         bitcnt = (int) DIGIT_BIT;
+155       \}
+156   
+157       /* grab the next msb from the exponent */
+158       y     = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
+159       buf <<= (mp_digit)1;
+160   
+161       /* if the bit is zero and mode == 0 then we ignore it
+162        * These represent the leading zero bits before the first 1 bit
+163        * in the exponent.  Technically this opt is not required but it
+164        * does lower the # of trivial squaring/reductions used
+165        */
+166       if (mode == 0 && y == 0) \{
+167         continue;
+168       \}
+169   
+170       /* if the bit is zero and mode == 1 then we square */
+171       if (mode == 1 && y == 0) \{
+172         if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
+173           goto LBL_RES;
+174         \}
+175         if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
+176           goto LBL_RES;
+177         \}
+178         continue;
+179       \}
+180   
+181       /* else we add it to the window */
+182       bitbuf |= (y << (winsize - ++bitcpy));
+183       mode    = 2;
+184   
+185       if (bitcpy == winsize) \{
+186         /* ok window is filled so square as required and multiply  */
+187         /* square first */
+188         for (x = 0; x < winsize; x++) \{
+189           if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
+190             goto LBL_RES;
+191           \}
+192           if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
+193             goto LBL_RES;
+194           \}
+195         \}
+196   
+197         /* then multiply */
+198         if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) \{
+199           goto LBL_RES;
+200         \}
+201         if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
+202           goto LBL_RES;
+203         \}
+204   
+205         /* empty window and reset */
+206         bitcpy = 0;
+207         bitbuf = 0;
+208         mode   = 1;
+209       \}
+210     \}
+211   
+212     /* if bits remain then square/multiply */
+213     if (mode == 2 && bitcpy > 0) \{
+214       /* square then multiply if the bit is set */
+215       for (x = 0; x < bitcpy; x++) \{
+216         if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
+217           goto LBL_RES;
+218         \}
+219         if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
+220           goto LBL_RES;
+221         \}
+222   
+223         bitbuf <<= 1;
+224         if ((bitbuf & (1 << winsize)) != 0) \{
+225           /* then multiply */
+226           if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) \{
+227             goto LBL_RES;
+228           \}
+229           if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
+230             goto LBL_RES;
+231           \}
+232         \}
+233       \}
+234     \}
+235   
+236     mp_exch (&res, Y);
+237     err = MP_OKAY;
+238   LBL_RES:mp_clear (&res);
+239   LBL_MU:mp_clear (&mu);
+240   LBL_M:
+241     mp_clear(&M[1]);
+242     for (x = 1<<(winsize-1); x < (1 << winsize); x++) \{
+243       mp_clear (&M[x]);
+244     \}
+245     return err;
+246   \}
+247   #endif
+248   
 \end{alltt}
 \end{small}
 
-Lines 21 through 40 determine the optimal window size based on the length of the exponent in bits.  The window divisions are sorted
+Lines 31 through 45 determine the optimal window size based on the length of the exponent in bits.  The window divisions are sorted
 from smallest to greatest so that in each \textbf{if} statement only one condition must be tested.  For example, by the \textbf{if} statement 
-on line 32 the value of $x$ is already known to be greater than $140$.  
+on line 37 the value of $x$ is already known to be greater than $140$.  
 
-The conditional piece of code beginning on line 48 allows the window size to be restricted to five bits.  This logic is used to ensure
+The conditional piece of code beginning on line 47 allows the window size to be restricted to five bits.  This logic is used to ensure
 the table of precomputed powers of $G$ remains relatively small.  
 
-The for loop on line 61 initializes the $M$ array while lines 62 and 77 compute the value of $\mu$ required for
+The for loop on line 60 initializes the $M$ array while lines 71 and 76 compute the value of $\mu$ required for
 Barrett reduction.  
 
 -- More later.
@@ -8146,6 +8195,7 @@ equivalent to $m \cdot 2^k$.  By this logic when $m = 1$ a quick power of two ca
 041     return MP_OKAY;
 042   \}
 043   #endif
+044   
 \end{alltt}
 \end{small}
 
@@ -8666,6 +8716,7 @@ respectively be replaced with a zero.
 285   #endif
 286   
 287   #endif
+288   
 \end{alltt}
 \end{small}
 
@@ -8820,6 +8871,7 @@ This algorithm initiates a temporary mp\_int with the value of the single digit
 102   \}
 103   
 104   #endif
+105   
 \end{alltt}
 \end{small}
 
@@ -8929,6 +8981,7 @@ Unlike the full multiplication algorithms this algorithm does not require any si
 072     return MP_OKAY;
 073   \}
 074   #endif
+075   
 \end{alltt}
 \end{small}
 
@@ -9074,6 +9127,7 @@ from chapter seven.
 103   \}
 104   
 105   #endif
+106   
 \end{alltt}
 \end{small}
 
@@ -9260,6 +9314,7 @@ root.  Ideally this algorithm is meant to find the $n$'th root of an input where
 125     return res;
 126   \}
 127   #endif
+128   
 \end{alltt}
 \end{small}
 
@@ -9336,6 +9391,7 @@ the integers from $0$ to $\beta - 1$.
 048     return MP_OKAY;
 049   \}
 050   #endif
+051   
 \end{alltt}
 \end{small}
 
@@ -9480,6 +9536,7 @@ as part of larger input without any significant problem.
 075     return MP_OKAY;
 076   \}
 077   #endif
+078   
 \end{alltt}
 \end{small}
 
@@ -9599,6 +9656,7 @@ are required instead of a series of $n \times k$ divisions.  One design flaw of
 068   \}
 069   
 070   #endif
+071   
 \end{alltt}
 \end{small}
 
@@ -9879,6 +9937,7 @@ must be adjusted by multiplying by the common factors of two ($2^k$) removed ear
 106     return res;
 107   \}
 108   #endif
+109   
 \end{alltt}
 \end{small}
 
@@ -9974,6 +10033,7 @@ dividing the product of the two inputs by their greatest common divisor.
 053     return res;
 054   \}
 055   #endif
+056   
 \end{alltt}
 \end{small}
 
@@ -10218,6 +10278,7 @@ $\left ( {p' \over a'} \right )$ which is multiplied against the current Jacobi
 098     return res;
 099   \}
 100   #endif
+101   
 \end{alltt}
 \end{small}
 
@@ -10366,6 +10427,7 @@ then only a couple of additions or subtractions will be required to adjust the i
 036     return MP_VAL;
 037   \}
 038   #endif
+039   
 \end{alltt}
 \end{small}
 
@@ -10467,6 +10529,7 @@ This algorithm attempts to determine if a candidate integer $n$ is composite by
 043     return MP_OKAY;
 044   \}
 045   #endif
+046   
 \end{alltt}
 \end{small}
 
@@ -10518,6 +10581,7 @@ mp\_digit.  The table \_\_prime\_tab is defined in the following file.
 054   #endif
 055   \};
 056   #endif
+057   
 \end{alltt}
 \end{small}
 
@@ -10606,6 +10670,7 @@ determine the result.
 055     return err;
 056   \}
 057   #endif
+058   
 \end{alltt}
 \end{small}
 
@@ -10741,6 +10806,7 @@ composite then it is \textit{probably} prime.
 096     return err;
 097   \}
 098   #endif
+099   
 \end{alltt}
 \end{small}
 
diff --git a/tommath_class.h b/tommath_class.h
index 6d05b7b..68b88b9 100644
--- a/tommath_class.h
+++ b/tommath_class.h
@@ -687,6 +687,7 @@
 #if defined(BN_MP_READ_RADIX_C)
    #define BN_MP_ZERO_C
    #define BN_MP_S_RMAP_C
+   #define BN_MP_RADIX_SMAP_C
    #define BN_MP_MUL_D_C
    #define BN_MP_ADD_D_C
    #define BN_MP_ISZERO_C
@@ -992,3 +993,7 @@
 #else
 #define LTM_LAST
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/tommath_superclass.h b/tommath_superclass.h
index b50ecb0..1b26841 100644
--- a/tommath_superclass.h
+++ b/tommath_superclass.h
@@ -4,7 +4,7 @@
 #define LTM_ALL
 
 /* RSA only (does not support DH/DSA/ECC) */
-// #define SC_RSA_1
+/* #define SC_RSA_1 */
 
 /* For reference.... On an Athlon64 optimizing for speed...
 
@@ -70,3 +70,7 @@
 #endif
 
 #endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */