added libtommath-0.27

2003-09-19 22:43:07 +00:00 · 2003-09-19 22:43:07 +00:00 · c343371bb2
commit c343371bb2
parent 6e732340a9
20 changed files with 294 additions and 515 deletions
--- a/bn.pdf
+++ b/bn.pdf
--- a/bn.tex
+++ b/bn.tex
@ -1,7 +1,7 @@
 \documentclass[]{article}
 \begin{document}

-\title{LibTomMath v0.26 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org }
+\title{LibTomMath v0.27 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org }
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 \newpage
--- a/bn_mp_add_d.c
+++ b/bn_mp_add_d.c
@ -56,9 +56,6 @@ mp_add_d (mp_int * a, mp_digit b, mp_int * c)

  /* if a is positive */
  if (a->sign == MP_ZPOS) {
-     /* setup size */
-     c->used = a->used + 1;
-
     /* add digit, after this we're propagating
      * the carry.
      */
@ -75,6 +72,9 @@ mp_add_d (mp_int * a, mp_digit b, mp_int * c)
     /* set final carry */
     ix++;
     *tmpc++  = mu;
+
+     /* setup size */
+     c->used = a->used + 1;
  } else {
     /* a was negative and |a| < b */
     c->used  = 1;
--- a/bn_mp_dr_reduce.c
+++ b/bn_mp_dr_reduce.c
@ -26,7 +26,7 @@
 *
 * Has been modified to use algorithm 7.10 from the LTM book instead
 *
- * Input x must be in the range 0 <= x <= (n-1)^2
+ * Input x must be in the range 0 <= x <= (n-1)**2
 */
 int
 mp_dr_reduce (mp_int * x, mp_int * n, mp_digit k)
@ -34,10 +34,10 @@ mp_dr_reduce (mp_int * x, mp_int * n, mp_digit k)
  int      err, i, m;
  mp_word  r;
  mp_digit mu, *tmpx1, *tmpx2;
-  
+
  /* m = digits in modulus */
  m = n->used;
-  
+
  /* ensure that "x" has at least 2m digits */
  if (x->alloc < m + m) {
    if ((err = mp_grow (x, m + m)) != MP_OKAY) {
@ -45,20 +45,20 @@ mp_dr_reduce (mp_int * x, mp_int * n, mp_digit k)
    }
  }

-/* top of loop, this is where the code resumes if 
+/* top of loop, this is where the code resumes if
 * another reduction pass is required.
 */
 top:
  /* aliases for digits */
  /* alias for lower half of x */
  tmpx1 = x->dp;
-  
+
  /* alias for upper half of x, or x/B**m */
  tmpx2 = x->dp + m;
-  
+
  /* set carry to zero */
  mu = 0;
-  
+
  /* compute (x mod B**m) + k * [x/B**m] inline and inplace */
  for (i = 0; i < m; i++) {
      r         = ((mp_word)*tmpx2++) * ((mp_word)k) + *tmpx1 + mu;
@ -77,7 +77,7 @@ top:
  /* clamp, sub and return */
  mp_clamp (x);

-  /* if x >= n then subtract and reduce again 
+  /* if x >= n then subtract and reduce again
   * Each successive "recursion" makes the input smaller and smaller.
   */
  if (mp_cmp_mag (x, n) != MP_LT) {
--- a/bn_mp_exptmod_fast.c
+++ b/bn_mp_exptmod_fast.c
@ -14,7 +14,7 @@
 */
 #include <tommath.h>

-/* computes Y == G^X mod P, HAC pp.616, Algorithm 14.85
+/* computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
 *
 * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
 * The value of k changes based on the size of the exponent.
@ -34,10 +34,10 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
  mp_int  M[TAB_SIZE], res;
  mp_digit buf, mp;
  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
-  
+
  /* use a pointer to the reduction algorithm.  This allows us to use
   * one of many reduction algorithms without modding the guts of
-   * the code with if statements everywhere.  
+   * the code with if statements everywhere.
   */
  int     (*redux)(mp_int*,mp_int*,mp_digit);

@ -68,7 +68,7 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
  /* init M array */
  /* init first cell */
  if ((err = mp_init(&M[1])) != MP_OKAY) {
-     return err; 
+     return err;
  }

  /* now init the second half of the array */
@ -88,7 +88,7 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
     if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
        goto __M;
     }
-     
+
     /* automatically pick the comba one if available (saves quite a few calls/ifs) */
     if (((P->used * 2 + 1) < MP_WARRAY) &&
          P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
--- a/bn_mp_grow.c
+++ b/bn_mp_grow.c
@ -19,17 +19,29 @@ int
 mp_grow (mp_int * a, int size)
 {
  int     i;
+  mp_digit *tmp;
+

  /* if the alloc size is smaller alloc more ram */
  if (a->alloc < size) {
    /* ensure there are always at least MP_PREC digits extra on top */
-    size += (MP_PREC * 2) - (size % MP_PREC);     
+    size += (MP_PREC * 2) - (size % MP_PREC);

-    a->dp = OPT_CAST realloc (a->dp, sizeof (mp_digit) * size);
-    if (a->dp == NULL) {
+    /* reallocate the array a->dp
+     *
+     * We store the return in a temporary variable
+     * in case the operation failed we don't want
+     * to overwrite the dp member of a.
+     */
+    tmp = OPT_CAST realloc (a->dp, sizeof (mp_digit) * size);
+    if (tmp == NULL) {
+      /* reallocation failed but "a" is still valid [can be freed] */
      return MP_MEM;
    }

+    /* reallocation succeeded so set a->dp */
+    a->dp = tmp;
+
    /* zero excess digits */
    i        = a->alloc;
    a->alloc = size;
--- a/bn_mp_mod_2d.c
+++ b/bn_mp_mod_2d.c
@ -14,7 +14,7 @@
 */
 #include <tommath.h>

-/* calc a value mod 2^b */
+/* calc a value mod 2**b */
 int
 mp_mod_2d (mp_int * a, int b, mp_int * c)
 {
--- a/bn_mp_mul_d.c
+++ b/bn_mp_mul_d.c
@ -18,12 +18,13 @@
 int
 mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
 {
-  int     res, pa, olduse;
+  mp_digit u, *tmpa, *tmpc;
+  mp_word  r;
+  int      ix, res, olduse;

  /* make sure c is big enough to hold a*b */
-  pa = a->used;
-  if (c->alloc < pa + 1) {
-    if ((res = mp_grow (c, pa + 1)) != MP_OKAY) {
+  if (c->alloc < a->used + 1) {
+    if ((res = mp_grow (c, a->used + 1)) != MP_OKAY) {
      return res;
    }
  }
@ -31,42 +32,41 @@ mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
  /* get the original destinations used count */
  olduse = c->used;

-  /* set the new temporary used count */
-  c->used = pa + 1;
+  /* set the sign */
  c->sign = a->sign;

-  {
-    register mp_digit u, *tmpa, *tmpc;
-    register mp_word r;
-    register int ix;
+  /* alias for a->dp [source] */
+  tmpa = a->dp;

-    /* alias for a->dp [source] */
-    tmpa = a->dp;
+  /* alias for c->dp [dest] */
+  tmpc = c->dp;

-    /* alias for c->dp [dest] */
-    tmpc = c->dp;
+  /* zero carry */
+  u = 0;

-    /* zero carry */
-    u = 0;
-    for (ix = 0; ix < pa; ix++) {
-      /* compute product and carry sum for this term */
-      r = ((mp_word) u) + ((mp_word)*tmpa++) * ((mp_word)b);
+  /* compute columns */
+  for (ix = 0; ix < a->used; ix++) {
+    /* compute product and carry sum for this term */
+    r       = ((mp_word) u) + ((mp_word)*tmpa++) * ((mp_word)b);

-      /* mask off higher bits to get a single digit */
-      *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
+    /* mask off higher bits to get a single digit */
+    *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));

-      /* send carry into next iteration */
-      u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
-    }
-    /* store final carry [if any] */
-    *tmpc++ = u;
-
-    /* now zero digits above the top */
-    for (; pa < olduse; pa++) {
-       *tmpc++ = 0;
-    }
+    /* send carry into next iteration */
+    u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
  }

-  mp_clamp (c);
+  /* store final carry [if any] */
+  *tmpc++ = u;
+
+  /* now zero digits above the top */
+  while (ix++ < olduse) {
+     *tmpc++ = 0;
+  }
+
+  /* set used count */
+  c->used = a->used + 1;
+  mp_clamp(c);
+
  return MP_OKAY;
 }
--- a/bn_mp_sub_d.c
+++ b/bn_mp_sub_d.c
@ -73,7 +73,8 @@ mp_sub_d (mp_int * a, mp_digit b, mp_int * c)
     }
  }

-  for (; ix < oldused; ix++) {
+  /* zero excess digits */
+  while (ix++ < oldused) {
     *tmpc++ = 0;
  }
  mp_clamp(c);
--- a/bn_mp_toom_sqr.c
+++ b/bn_mp_toom_sqr.c
@ -15,12 +15,12 @@
 #include <tommath.h>

 /* squaring using Toom-Cook 3-way algorithm */
-int 
+int
 mp_toom_sqr(mp_int *a, mp_int *b)
 {
    mp_int w0, w1, w2, w3, w4, tmp1, a0, a1, a2;
    int res, B;
-        
+
    /* init temps */
    if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4, &a0, &a1, &a2, &tmp1, NULL)) != MP_OKAY) {
       return res;
@ -28,8 +28,8 @@ mp_toom_sqr(mp_int *a, mp_int *b)

    /* B */
    B = a->used / 3;
-    
-    /* a = a2 * B^2 + a1 * B + a0 */
+
+    /* a = a2 * B**2 + a1 * B + a0 */
    if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) {
       goto ERR;
    }
@ -44,17 +44,17 @@ mp_toom_sqr(mp_int *a, mp_int *b)
       goto ERR;
    }
    mp_rshd(&a2, B*2);
-        
+
    /* w0 = a0*a0 */
    if ((res = mp_sqr(&a0, &w0)) != MP_OKAY) {
       goto ERR;
    }
-    
+
    /* w4 = a2 * a2 */
    if ((res = mp_sqr(&a2, &w4)) != MP_OKAY) {
       goto ERR;
    }
-    
+
    /* w1 = (a2 + 2(a1 + 2a0))**2 */
    if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) {
       goto ERR;
@ -68,11 +68,11 @@ mp_toom_sqr(mp_int *a, mp_int *b)
    if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) {
       goto ERR;
    }
-    
+
    if ((res = mp_sqr(&tmp1, &w1)) != MP_OKAY) {
       goto ERR;
    }
-    
+
    /* w3 = (a0 + 2(a1 + 2a2))**2 */
    if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) {
       goto ERR;
@ -86,11 +86,11 @@ mp_toom_sqr(mp_int *a, mp_int *b)
    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
       goto ERR;
    }
-    
+
    if ((res = mp_sqr(&tmp1, &w3)) != MP_OKAY) {
       goto ERR;
    }
-    
+

    /* w2 = (a2 + a1 + a0)**2 */
    if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) {
@ -102,18 +102,18 @@ mp_toom_sqr(mp_int *a, mp_int *b)
    if ((res = mp_sqr(&tmp1, &w2)) != MP_OKAY) {
       goto ERR;
    }
-    
-    /* now solve the matrix 
-    
+
+    /* now solve the matrix
+
       0  0  0  0  1
       1  2  4  8  16
       1  1  1  1  1
       16 8  4  2  1
       1  0  0  0  0
-       
+
       using 12 subtractions, 4 shifts, 2 small divisions and 1 small multiplication.
     */
-     
+
     /* r1 - r4 */
     if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) {
        goto ERR;
@ -185,7 +185,7 @@ mp_toom_sqr(mp_int *a, mp_int *b)
     if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) {
        goto ERR;
     }
-     
+
     /* at this point shift W[n] by B*n */
     if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) {
        goto ERR;
@ -198,8 +198,8 @@ mp_toom_sqr(mp_int *a, mp_int *b)
     }
     if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) {
        goto ERR;
-     }     
-     
+     }
+
     if ((res = mp_add(&w0, &w1, b)) != MP_OKAY) {
        goto ERR;
     }
@ -211,10 +211,10 @@ mp_toom_sqr(mp_int *a, mp_int *b)
     }
     if ((res = mp_add(&tmp1, b, b)) != MP_OKAY) {
        goto ERR;
-     }     
-     
+     }
+
 ERR:
     mp_clear_multi(&w0, &w1, &w2, &w3, &w4, &a0, &a1, &a2, &tmp1, NULL);
     return res;
-}     
-     
+}
+
--- a/changes.txt
+++ b/changes.txt
@ -1,3 +1,12 @@
+Sept 19th, 2003
+v0.27  -- Removed changes.txt~ which was made by accident since "kate" decided it was
+          a good time to re-enable backups... [kde is fun!]
+       -- In mp_grow() "a->dp" is not overwritten by realloc call [re: memory leak]
+          Now if mp_grow() fails the mp_int is still valid and can be cleared via
+          mp_clear() to reclaim the memory.
+       -- Henrik Goldman found a buffer overflow bug in mp_add_d().  Fixed.
+       -- Cleaned up mp_mul_d() to be much easier to read and follow.
+
 Aug 29th, 2003
 v0.26  -- Fixed typo that caused warning with GCC 3.2
       -- Martin Marcel noticed a bug in mp_neg() that allowed negative zeroes.  
--- a/changes.txt~
+++ b/changes.txt~
@ -1,244 +0,0 @@
-Aug 11th, 2003
-v0.26  -- Fixed typo that caused warning with GCC 3.2
-       -- Martin Marcel noticed a bug in mp_neg() that allowed negative zeroes.  
-          Also, Martin is the fellow who noted the bugs in mp_gcd() of 0.24/0.25.
-       -- Martin Marcel noticed an optimization [and slight bug] in mp_lcm().
-       -- Added fix to mp_read_unsigned_bin to prevent a buffer overflow.
-       -- Beefed up the comments in the baseline multipliers [and montgomery]
-       -- Added "mont" demo to the makefile.msvc in etc/
-       -- Optimized sign compares in mp_cmp from 4 to 2 cases.
-
-Aug 4th, 2003
-v0.25  -- Fix to mp_gcd again... oops (0,-a) == (-a, 0) == a
-       -- Fix to mp_clear which didn't reset the sign  [Greg Rose]
-       -- Added mp_error_to_string() to convert return codes to strings.  [Greg Rose]
-       -- Optimized fast_mp_invmod() to do the test for invalid inputs [both even]
-          first so temps don't have to be initialized if it's going to fail.
-       -- Optimized mp_gcd() by removing mp_div_2d calls for when one of the inputs
-          is odd.
-       -- Tons of new comments, some indentation fixups, etc.
-       -- mp_jacobi() returns MP_VAL if the modulus is less than or equal to zero.
-       -- fixed two typos in the header of each file :-)
-       -- LibTomMath is officially Public Domain [see LICENSE]
-
-July 15th, 2003
-v0.24  -- Optimized mp_add_d and mp_sub_d to not allocate temporary variables
-       -- Fixed mp_gcd() so the gcd of 0,0 is 0.  Allows the gcd operation to be chained
-          e.g. (0,0,a) == a [instead of 1]
-       -- Should be one of the last release for a while.  Working on LibTomMath book now.
-       -- optimized the pprime demo [/etc/pprime.c] to first make a huge table of single
-          digit primes then it reads them randomly instead of randomly choosing/testing single
-          digit primes.
-
-July 12th, 2003
-v0.23  -- Optimized mp_prime_next_prime() to not use mp_mod [via is_divisible()] in each
-          iteration.  Instead now a smaller table is kept of the residues which can be updated
-          without division.
-       -- Fixed a bug in next_prime() where an input of zero would be treated as odd and
-          have two added to it [to move to the next odd].
-       -- fixed a bug in prime_fermat() and prime_miller_rabin() which allowed the base
-          to be negative, zero or one.  Normally the test is only valid if the base is
-          greater than one.
-       -- changed the next_prime() prototype to accept a new parameter "bbs_style" which
-          will find the next prime congruent to 3 mod 4.  The default [bbs_style==0] will
-          make primes which are either congruent to 1 or 3 mod 4.
-       -- fixed mp_read_unsigned_bin() so that it doesn't include both code for
-          the case DIGIT_BIT < 8 and >= 8
-       -- optimized div_d() to easy out on division by 1 [or if a == 0] and use
-          logical shifts if the divisor is a power of two.
-       -- the default DIGIT_BIT type was not int for non-default builds.  Fixed.
-
-July 2nd, 2003
-v0.22  -- Fixed up mp_invmod so the result is properly in range now [was always congruent to the inverse...]
-       -- Fixed up s_mp_exptmod and mp_exptmod_fast so the lower half of the pre-computed table isn't allocated
-          which makes the algorithm use half as much ram.
-       -- Fixed the install script not to make the book :-) [which isn't included anyways]
-       -- added mp_cnt_lsb() which counts how many of the lsbs are zero
-       -- optimized mp_gcd() to use the new mp_cnt_lsb() to replace multiple divisions by two by a single division.
-       -- applied similar optimization to mp_prime_miller_rabin().
-       -- Fixed a bug in both mp_invmod() and fast_mp_invmod() which tested for odd
-          via "mp_iseven() == 0" which is not valid [since zero is not even either].
-
-June 19th, 2003
-v0.21  -- Fixed bug in mp_mul_d which would not handle sign correctly [would not always forward it]
-       -- Removed the #line lines from gen.pl [was in violation of ISO C]       
-
-June 8th, 2003
-v0.20  -- Removed the book from the package.  Added the TDCAL license document.  
-       -- This release is officially pure-bred TDCAL again [last officially TDCAL based release was v0.16]
-
-June 6th, 2003
-v0.19  -- Fixed a bug in mp_montgomery_reduce() which was introduced when I tweaked mp_rshd() in the previous release.
-          Essentially the digits were not trimmed before the compare which cause a subtraction to occur all the time.
-       -- Fixed up etc/tune.c a bit to stop testing new cutoffs after 16 failures [to find more optimal points]. 
-          Brute force ho!
-          
-
-May 29th, 2003
-v0.18  -- Fixed a bug in s_mp_sqr which would handle carries properly just not very elegantly.
-          (e.g. correct result, just bad looking code)
-       -- Fixed bug in mp_sqr which still had a 512 constant instead of MP_WARRAY
-       -- Added Toom-Cook multipliers [needs tuning!]
-       -- Added efficient divide by 3 algorithm mp_div_3
-       -- Re-wrote mp_div_d to be faster than calling mp_div 
-       -- Added in a donated BCC makefile and a single page LTM poster (ahalhabsi@sbcglobal.net)
-       -- Added mp_reduce_2k which reduces an input modulo n = 2**p - k for any single digit k
-       -- Made the exptmod system be aware of the 2k reduction algorithms.
-       -- Rewrote mp_dr_reduce to be smaller, simpler and easier to understand.
-
-May 17th, 2003
-v0.17  -- Benjamin Goldberg submitted optimized mp_add and mp_sub routines.  A new gen.pl as well
-          as several smaller suggestions.  Thanks!
-       -- removed call to mp_cmp in inner loop of mp_div and put mp_cmp_mag in its place :-)
-       -- Fixed bug in mp_exptmod that would cause it to fail for odd moduli when DIGIT_BIT != 28
-       -- mp_exptmod now also returns errors if the modulus is negative and will handle negative exponents
-       -- mp_prime_is_prime will now return true if the input is one of the primes in the prime table
-       -- Damian M Gryski (dgryski@uwaterloo.ca) found a index out of bounds error in the 
-          mp_fast_s_mp_mul_high_digs function which didn't come up before.  (fixed) 
-       -- Refactored the DR reduction code so there is only one function per file.
-       -- Fixed bug in the mp_mul() which would erroneously avoid the faster multiplier [comba] when it was
-          allowed.  The bug would not cause the incorrect value to be produced just less efficient (fixed)
-       -- Fixed similar bug in the Montgomery reduction code.
-       -- Added tons of (mp_digit) casts so the 7/15/28/31 bit digit code will work flawlessly out of the box. 
-          Also added limited support for 64-bit machines with a 60-bit digit.  Both thanks to Tom Wu (tom@arcot.com)
-       -- Added new comments here and there, cleaned up some code [style stuff]
-       -- Fixed a lingering typo in mp_exptmod* that would set bitcnt to zero then one.  Very silly stuff :-)
-       -- Fixed up mp_exptmod_fast so it would set "redux" to the comba Montgomery reduction if allowed.  This
-          saves quite a few calls and if statements.
-       -- Added etc/mont.c a test of the Montgomery reduction [assuming all else works :-| ]
-       -- Fixed up etc/tune.c to use a wider test range [more appropriate] also added a x86 based addition which
-          uses RDTSC for high precision timing.  
-       -- Updated demo/demo.c to remove MPI stuff [won't work anyways], made the tests run for 2 seconds each so its 
-          not so insanely slow.  Also made the output space delimited [and fixed up various errors]
-       -- Added logs directory, logs/graph.dem which will use gnuplot to make a series of PNG files 
-          that go with the pre-made index.html.  You have to build [via make timing] and run ltmtest first in the 
-          root of the package.
-       -- Fixed a bug in mp_sub and mp_add where "-a - -a" or "-a + a" would produce -0 as the result [obviously invalid].  
-       -- Fixed a bug in mp_rshd.  If the count == a.used it should zero/return [instead of shifting]
-       -- Fixed a "off-by-one" bug in mp_mul2d.  The initial size check on alloc would be off by one if the residue
-          shifting caused a carry.  
-       -- Fixed a bug where s_mp_mul_digs() would not call the Comba based routine if allowed.  This made Barrett reduction
-          slower than it had to be.
-          
-Mar 29th, 2003
-v0.16  -- Sped up mp_div by making normalization one shift call
-       -- Sped up mp_mul_2d/mp_div_2d by aliasing pointers :-)
-       -- Cleaned up mp_gcd to use the macros for odd/even detection
-       -- Added comments here and there, mostly there but occasionally here too.
-
-Mar 22nd, 2003
-v0.15  -- Added series of prime testing routines to lib
-       -- Fixed up etc/tune.c
-       -- Added DR reduction algorithm
-       -- Beefed up the manual more.
-       -- Fixed up demo/demo.c so it doesn't have so many warnings and it does the full series of
-          tests
-       -- Added "pre-gen" directory which will hold a "gen.pl"'ed copy of the entire lib [done at
-          zipup time so its always the latest]
-       -- Added conditional casts for C++ users [boo!]
-
-Mar 15th, 2003
-v0.14  -- Tons of manual updates
-       -- cleaned up the directory
-       -- added MSVC makefiles
-       -- source changes [that I don't recall]
-       -- Fixed up the lshd/rshd code to use pointer aliasing
-       -- Fixed up the mul_2d and div_2d to not call rshd/lshd unless needed
-       -- Fixed up etc/tune.c a tad
-       -- fixed up demo/demo.c to output comma-delimited results of timing
-          also fixed up timing demo to use a finer granularity for various functions
-       -- fixed up demo/demo.c testing to pause during testing so my Duron won't catch on fire
-          [stays around 31-35C during testing :-)]
-       
-Feb 13th, 2003
-v0.13  -- tons of minor speed-ups in low level add, sub, mul_2 and div_2 which propagate 
-          to other functions like mp_invmod, mp_div, etc...
-       -- Sped up mp_exptmod_fast by using new code to find R mod m [e.g. B^n mod m]
-       -- minor fixes
-
-Jan 17th, 2003
-v0.12  -- re-wrote the majority of the makefile so its more portable and will
-          install via "make install" on most *nix platforms
-       -- Re-packaged all the source as seperate files.  Means the library a single
-          file packagage any more.  Instead of just adding "bn.c" you have to add
-          libtommath.a
-       -- Renamed "bn.h" to "tommath.h"
-       -- Changes to the manual to reflect all of this
-       -- Used GNU Indent to clean up the source
-
-Jan 15th, 2003
-v0.11  -- More subtle fixes
-       -- Moved to gentoo linux [hurrah!] so made *nix specific fixes to the make process
-       -- Sped up the montgomery reduction code quite a bit
-       -- fixed up demo so when building timing for the x86 it assumes ELF format now
-       
-Jan 9th, 2003
-v0.10  -- Pekka Riikonen suggested fixes to the radix conversion code.  
-       -- Added baseline montgomery and comba montgomery reductions, sped up exptmods
-          [to a point, see bn.h for MONTGOMERY_EXPT_CUTOFF]
-       
-Jan 6th, 2003
-v0.09  -- Updated the manual to reflect recent changes.  :-)
-       -- Added Jacobi function (mp_jacobi) to supplement the number theory side of the lib
-       -- Added a Mersenne prime finder demo in ./etc/mersenne.c
-
-Jan 2nd, 2003
-v0.08  -- Sped up the multipliers by moving the inner loop variables into a smaller scope
-       -- Corrected a bunch of small "warnings"
-       -- Added more comments
-       -- Made "mtest" be able to use /dev/random, /dev/urandom or stdin for RNG data
-       -- Corrected some bugs where error messages were potentially ignored
-       -- add etc/pprime.c program which makes numbers which are provably prime.
-       
-Jan 1st, 2003
-v0.07  -- Removed alot of heap operations from core functions to speed them up
-       -- Added a root finding function [and mp_sqrt macro like from MPI]
-       -- Added more to manual 
-
-Dec 31st, 2002
-v0.06  -- Sped up the s_mp_add, s_mp_sub which inturn sped up mp_invmod, mp_exptmod, etc...
-       -- Cleaned up the header a bit more
-       
-Dec 30th, 2002
-v0.05  -- Builds with MSVC out of the box
-       -- Fixed a bug in mp_invmod w.r.t. even moduli
-       -- Made mp_toradix and mp_read_radix use char instead of unsigned char arrays
-       -- Fixed up exptmod to use fewer multiplications
-       -- Fixed up mp_init_size to use only one heap operation
-          -- Note there is a slight "off-by-one" bug in the library somewhere
-             without the padding (see the source for comment) the library 
-             crashes in libtomcrypt.  Anyways a reasonable workaround is to pad the
-             numbers which will always correct it since as the numbers grow the padding
-             will still be beyond the end of the number
-       -- Added more to the manual
-       
-Dec 29th, 2002
-v0.04  -- Fixed a memory leak in mp_to_unsigned_bin
-       -- optimized invmod code
-       -- Fixed bug in mp_div
-       -- use exchange instead of copy for results
-       -- added a bit more to the manual
-       
-Dec 27th, 2002
-v0.03  -- Sped up s_mp_mul_high_digs by not computing the carries of the lower digits
-       -- Fixed a bug where mp_set_int wouldn't zero the value first and set the used member.
-       -- fixed a bug in s_mp_mul_high_digs where the limit placed on the result digits was not calculated properly
-       -- fixed bugs in add/sub/mul/sqr_mod functions where if the modulus and dest were the same it wouldn't work
-       -- fixed a bug in mp_mod and mp_mod_d concerning negative inputs
-       -- mp_mul_d didn't preserve sign
-       -- Many many many many fixes
-       -- Works in LibTomCrypt now :-)
-       -- Added iterations to the timing demos... more accurate.
-       -- Tom needs a job.       
-
-Dec 26th, 2002
-v0.02  -- Fixed a few "slips" in the manual.  This is "LibTomMath" afterall :-)
-       -- Added mp_cmp_mag, mp_neg, mp_abs and mp_radix_size that were missing.
-       -- Sped up the fast [comba] multipliers more [yahoo!]
-
-Dec 25th,2002
-v0.01  -- Initial release.  Gimme a break.
-       -- Todo list, 
-           add details to manual [e.g. algorithms]
-           more comments in code
-           example programs
--- a/demo/demo.c
+++ b/demo/demo.c
@ -68,8 +68,8 @@ int main(void)
   mp_init(&c);
   mp_init(&d);
   mp_init(&e);
-   mp_init(&f);   
-   
+   mp_init(&f);
+
   srand(time(NULL));

 #if 0
@ -89,7 +89,7 @@ int main(void)
   for (;;) {
       aa = abs(rand()) & MP_MASK;
       bb = abs(rand()) & MP_MASK;
-	  if (MULT(aa,bb) != (aa*bb)) {
+      if (MULT(aa,bb) != (aa*bb)) {
             printf("%llu * %llu == %llu or %llu?\n", aa, bb, (ulong64)MULT(aa,bb), (ulong64)(aa*bb));
             return 0;
          }
@ -111,18 +111,18 @@ int main(void)

 /* test mp_reduce_2k */
 #if 0
-   for (cnt = 3; cnt <= 4096; ++cnt) {
+   for (cnt = 3; cnt <= 256; ++cnt) {
       mp_digit tmp;
       mp_2expt(&a, cnt);
       mp_sub_d(&a, 1, &a);  /* a = 2**cnt - 1 */
-       
-       
+
+
       printf("\nTesting %4d bits", cnt);
       printf("(%d)", mp_reduce_is_2k(&a));
       mp_reduce_2k_setup(&a, &tmp);
       printf("(%d)", tmp);
-       for (ix = 0; ix < 100000; ix++) {
-           if (!(ix & 1023)) {printf("."); fflush(stdout); }
+       for (ix = 0; ix < 10000; ix++) {
+           if (!(ix & 127)) {printf("."); fflush(stdout); }
           mp_rand(&b, (cnt/DIGIT_BIT  + 1) * 2);
           mp_copy(&c, &b);
           mp_mod(&c, &a, &c);
@ -135,22 +135,23 @@ int main(void)
    }
 #endif

-           
+
 /* test mp_div_3  */
 #if 0
-   for (cnt = 0; cnt < 1000000; ) {
+   for (cnt = 0; cnt < 10000; ) {
      mp_digit r1, r2;
-      
+
      if (!(++cnt & 127)) printf("%9d\r", cnt);
      mp_rand(&a, abs(rand()) % 32 + 1);
      mp_div_d(&a, 3, &b, &r1);
      mp_div_3(&a, &c, &r2);
-      
+
      if (mp_cmp(&b, &c) || r1 != r2) {
-         printf("Failure\n");
+         printf("\n\nmp_div_3 => Failure\n");
      }
   }
-#endif     
+   printf("\n\nPassed div_3 testing\n");
+#endif

 /* test the DR reduction */
 #if 0
@ -162,7 +163,7 @@ int main(void)
           a.dp[ix] = MP_MASK;
       }
       a.used = cnt;
-       mp_prime_next_prime(&a, 3);
+       mp_prime_next_prime(&a, 3, 0);

       mp_rand(&b, cnt - 1);
       mp_copy(&b, &c);
@ -178,9 +179,9 @@ int main(void)

         if (mp_cmp(&b, &c) != MP_EQ) {
            printf("Failed on trial %lu\n", rr); exit(-1);
-                       
+
         }
-      } while (++rr < 1000000);
+      } while (++rr < 10000);
      printf("Passed DR test for %d digits\n", cnt);
   }
 #endif
@ -369,7 +370,7 @@ int main(void)

   div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n =
   sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = add_d_n = sub_d_n= 0;
-   
+
   /* force KARA and TOOM to enable despite cutoffs */
   KARATSUBA_SQR_CUTOFF = KARATSUBA_MUL_CUTOFF = 110;
   TOOM_SQR_CUTOFF      = TOOM_MUL_CUTOFF      = 150;
--- a/etc/2kprime.1
+++ b/etc/2kprime.1
@ -1,2 +0,0 @@
-256-bits (k = 36113) = 115792089237316195423570985008687907853269984665640564039457584007913129603823
-512-bits (k = 38117) = 13407807929942597099574024998205846127479365820592393377723561443721764030073546976801874298166903427690031858186486050853753882811946569946433649006045979
--- a/etc/2kprime.c
+++ b/etc/2kprime.c
@ -1,80 +1,80 @@
-/* Makes safe primes of a 2k nature */
-#include <tommath.h>
-#include <time.h>
-
-int sizes[] = {256, 512, 768, 1024, 1536, 2048, 3072, 4096};
-
-int main(void)
-{
-   char buf[2000];
-   int x, y;
-   mp_int q, p;
-   FILE *out;
-   clock_t t1;
-   mp_digit z;
-   
-   mp_init_multi(&q, &p, NULL);
-   
-   out = fopen("2kprime.1", "w");
-   for (x = 0; x < (int)(sizeof(sizes) / sizeof(sizes[0])); x++) {
-   top:
-       mp_2expt(&q, sizes[x]);
-       mp_add_d(&q, 3, &q);
-       z = -3;
-       
-       t1 = clock();
-       for(;;) {
-         mp_sub_d(&q, 4, &q);
-         z += 4;
-
-         if (z > MP_MASK) {
-            printf("No primes of size %d found\n", sizes[x]);
-            break;
-         }
-         
-         if (clock() - t1 > CLOCKS_PER_SEC) { 
-            printf("."); fflush(stdout);
-//            sleep((clock() - t1 + CLOCKS_PER_SEC/2)/CLOCKS_PER_SEC);
-            t1 = clock();
-         }
-         
-         /* quick test on q */
-         mp_prime_is_prime(&q, 1, &y);
-         if (y == 0) {
-            continue;
-         }
-
-         /* find (q-1)/2 */
-         mp_sub_d(&q, 1, &p);
-         mp_div_2(&p, &p);
-         mp_prime_is_prime(&p, 3, &y);
-         if (y == 0) {
-            continue;
-         }
-
-         /* test on q */
-         mp_prime_is_prime(&q, 3, &y);
-         if (y == 0) {
-            continue;
-         }
-
-         break;
-       }
-       
-       if (y == 0) {
-          ++sizes[x];
-          goto top;
-       }
-       
-       mp_toradix(&q, buf, 10);
-       printf("\n\n%d-bits (k = %lu) = %s\n", sizes[x], z, buf);
-       fprintf(out, "%d-bits (k = %lu) = %s\n", sizes[x], z, buf); fflush(out);
-   }
-   
-   return 0;
-}   
-       
-         
-            
-            
-          
+/* Makes safe primes of a 2k nature */
+#include <tommath.h>
+#include <time.h>
+
+int sizes[] = {256, 512, 768, 1024, 1536, 2048, 3072, 4096};
+
+int main(void)
+{
+   char buf[2000];
+   int x, y;
+   mp_int q, p;
+   FILE *out;
+   clock_t t1;
+   mp_digit z;
+   
+   mp_init_multi(&q, &p, NULL);
+   
+   out = fopen("2kprime.1", "w");
+   for (x = 0; x < (int)(sizeof(sizes) / sizeof(sizes[0])); x++) {
+   top:
+       mp_2expt(&q, sizes[x]);
+       mp_add_d(&q, 3, &q);
+       z = -3;
+       
+       t1 = clock();
+       for(;;) {
+         mp_sub_d(&q, 4, &q);
+         z += 4;
+
+         if (z > MP_MASK) {
+            printf("No primes of size %d found\n", sizes[x]);
+            break;
+         }
+         
+         if (clock() - t1 > CLOCKS_PER_SEC) { 
+            printf("."); fflush(stdout);
+//            sleep((clock() - t1 + CLOCKS_PER_SEC/2)/CLOCKS_PER_SEC);
+            t1 = clock();
+         }
+         
+         /* quick test on q */
+         mp_prime_is_prime(&q, 1, &y);
+         if (y == 0) {
+            continue;
+         }
+
+         /* find (q-1)/2 */
+         mp_sub_d(&q, 1, &p);
+         mp_div_2(&p, &p);
+         mp_prime_is_prime(&p, 3, &y);
+         if (y == 0) {
+            continue;
+         }
+
+         /* test on q */
+         mp_prime_is_prime(&q, 3, &y);
+         if (y == 0) {
+            continue;
+         }
+
+         break;
+       }
+       
+       if (y == 0) {
+          ++sizes[x];
+          goto top;
+       }
+       
+       mp_toradix(&q, buf, 10);
+       printf("\n\n%d-bits (k = %lu) = %s\n", sizes[x], z, buf);
+       fprintf(out, "%d-bits (k = %lu) = %s\n", sizes[x], z, buf); fflush(out);
+   }
+   
+   return 0;
+}   
+       
+         
+            
+            
+          
--- a/etc/drprime.c
+++ b/etc/drprime.c
@ -1,7 +1,7 @@
 /* Makes safe primes of a DR nature */
 #include <tommath.h>

-int sizes[] = { 256/DIGIT_BIT, 512/DIGIT_BIT, 768/DIGIT_BIT, 1024/DIGIT_BIT, 2048/DIGIT_BIT, 4096/DIGIT_BIT };
+int sizes[] = { 1+256/DIGIT_BIT, 1+512/DIGIT_BIT, 1+768/DIGIT_BIT, 1+1024/DIGIT_BIT, 1+2048/DIGIT_BIT, 1+4096/DIGIT_BIT };
 int main(void)
 {
   int res, x, y;
@ -27,6 +27,7 @@ int main(void)
       a.used = sizes[x];
       
       /* now loop */
+       res = 0;
       for (;;) { 
          a.dp[0] += 4;
          if (a.dp[0] >= MP_MASK) break;
--- a/etc/drprimes.txt
+++ b/etc/drprimes.txt
@ -1,15 +1,3 @@
-300-bit prime:
-p == 2037035976334486086268445688409378161051468393665936250636140449354381299763336706183393387
-
-510-bit prime:
-p == 3351951982485649274893506249551461531869841455148098344430890360930441007518386744200468574541725856922507964546621512713438470702986642486608412251494847
-
-765-bit prime:
-p == 194064761537588616893622436057812819407110752139587076392381504753256369085797110791359801103580809743810966337141384150771447505514351798930535909380147642400556872002606238193783160703949805603157874899214558593861605856727005843
-
-1740-bit prime:
-p == 61971563797913992479098926650774597592238975917324828616370329001490282756182680310375299496755876376552390992409906202402580445340335946188208371182877207703039791403230793200138374588682414508868522097839706723444887189794752005280474068640895359332705297533442094790319040758184131464298255306336601284054032615054089107503261218395204931877449590906016855549287497608058070532126514935495184332288660623518513755499687752752528983258754107553298994358814410594621086881204713587661301862918471291451469190214535690028223
-
-2145-bit prime:
-p == 5120834017984591518147028606005386392991070803233539296225079797126347381640561714282620018633786528684625023495254338414266418034876748837569635527008462887513799703364491256252208677097644781218029521545625387720450034199300257983090290650191518075514440227307582827991892955933645635564359934476985058395497772801264225688705417270604479898255105628816161764712152286804906915652283101897505006786990112535065979412882966109410722156057838063961993103028819293481078313367826492291911499907219457764211473530756735049840415233164976184864760203928986194694093688479274544786530457604655777313274555786861719645260099496565700321073395329400403
-
+280-bit prime:
+p == 1942668892225729070919461906823518906642406839052139521251812409738904285204940164839
+
--- a/4
+++ b/4
@ -6,7 +6,7 @@ CFLAGS  +=  -I./ -Wall -W -Wshadow -O3 -funroll-loops
 #x86 optimizations [should be valid for any GCC install though]
 CFLAGS  += -fomit-frame-pointer 

-VERSION=0.26
+VERSION=0.27

 default: libtommath.a

@ -95,7 +95,7 @@ manual:
 	rm -f bn.aux bn.dvi bn.log
 	
 clean:
-	rm -f *.pdf *.o *.a *.obj *.lib *.exe etclib/*.o demo/demo.o test ltmtest mpitest mtest/mtest mtest/mtest.exe \
+	rm -f *.bat *.pdf *.o *.a *.obj *.lib *.exe etclib/*.o demo/demo.o test ltmtest mpitest mtest/mtest mtest/mtest.exe \
        tommath.idx tommath.toc tommath.log tommath.aux tommath.dvi tommath.lof tommath.ind tommath.ilg *.ps *.pdf *.log *.s mpi.c \
        poster.aux poster.dvi poster.log
 	cd etc ; make clean
--- a/poster.pdf
+++ b/poster.pdf
--- a/pre_gen/mpi.c
+++ b/pre_gen/mpi.c
@ -942,9 +942,6 @@ mp_add_d (mp_int * a, mp_digit b, mp_int * c)

  /* if a is positive */
  if (a->sign == MP_ZPOS) {
-     /* setup size */
-     c->used = a->used + 1;
-
     /* add digit, after this we're propagating
      * the carry.
      */
@ -961,6 +958,9 @@ mp_add_d (mp_int * a, mp_digit b, mp_int * c)
     /* set final carry */
     ix++;
     *tmpc++  = mu;
+
+     /* setup size */
+     c->used = a->used + 1;
  } else {
     /* a was negative and |a| < b */
     c->used  = 1;
@ -2121,7 +2121,7 @@ int mp_dr_is_modulus(mp_int *a)
 *
 * Has been modified to use algorithm 7.10 from the LTM book instead
 *
- * Input x must be in the range 0 <= x <= (n-1)^2
+ * Input x must be in the range 0 <= x <= (n-1)**2
 */
 int
 mp_dr_reduce (mp_int * x, mp_int * n, mp_digit k)
@ -2129,10 +2129,10 @@ mp_dr_reduce (mp_int * x, mp_int * n, mp_digit k)
  int      err, i, m;
  mp_word  r;
  mp_digit mu, *tmpx1, *tmpx2;
-  
+
  /* m = digits in modulus */
  m = n->used;
-  
+
  /* ensure that "x" has at least 2m digits */
  if (x->alloc < m + m) {
    if ((err = mp_grow (x, m + m)) != MP_OKAY) {
@ -2140,20 +2140,20 @@ mp_dr_reduce (mp_int * x, mp_int * n, mp_digit k)
    }
  }

-/* top of loop, this is where the code resumes if 
+/* top of loop, this is where the code resumes if
 * another reduction pass is required.
 */
 top:
  /* aliases for digits */
  /* alias for lower half of x */
  tmpx1 = x->dp;
-  
+
  /* alias for upper half of x, or x/B**m */
  tmpx2 = x->dp + m;
-  
+
  /* set carry to zero */
  mu = 0;
-  
+
  /* compute (x mod B**m) + k * [x/B**m] inline and inplace */
  for (i = 0; i < m; i++) {
      r         = ((mp_word)*tmpx2++) * ((mp_word)k) + *tmpx1 + mu;
@ -2172,7 +2172,7 @@ top:
  /* clamp, sub and return */
  mp_clamp (x);

-  /* if x >= n then subtract and reduce again 
+  /* if x >= n then subtract and reduce again
   * Each successive "recursion" makes the input smaller and smaller.
   */
  if (mp_cmp_mag (x, n) != MP_LT) {
@ -2402,7 +2402,7 @@ mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
 */
 #include <tommath.h>

-/* computes Y == G^X mod P, HAC pp.616, Algorithm 14.85
+/* computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
 *
 * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
 * The value of k changes based on the size of the exponent.
@ -2422,10 +2422,10 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
  mp_int  M[TAB_SIZE], res;
  mp_digit buf, mp;
  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
-  
+
  /* use a pointer to the reduction algorithm.  This allows us to use
   * one of many reduction algorithms without modding the guts of
-   * the code with if statements everywhere.  
+   * the code with if statements everywhere.
   */
  int     (*redux)(mp_int*,mp_int*,mp_digit);

@ -2456,7 +2456,7 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
  /* init M array */
  /* init first cell */
  if ((err = mp_init(&M[1])) != MP_OKAY) {
-     return err; 
+     return err;
  }

  /* now init the second half of the array */
@ -2476,7 +2476,7 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
     if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
        goto __M;
     }
-     
+
     /* automatically pick the comba one if available (saves quite a few calls/ifs) */
     if (((P->used * 2 + 1) < MP_WARRAY) &&
          P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
@ -2926,17 +2926,29 @@ int
 mp_grow (mp_int * a, int size)
 {
  int     i;
+  mp_digit *tmp;
+

  /* if the alloc size is smaller alloc more ram */
  if (a->alloc < size) {
    /* ensure there are always at least MP_PREC digits extra on top */
-    size += (MP_PREC * 2) - (size % MP_PREC);     
+    size += (MP_PREC * 2) - (size % MP_PREC);

-    a->dp = OPT_CAST realloc (a->dp, sizeof (mp_digit) * size);
-    if (a->dp == NULL) {
+    /* reallocate the array a->dp
+     *
+     * We store the return in a temporary variable
+     * in case the operation failed we don't want
+     * to overwrite the dp member of a.
+     */
+    tmp = OPT_CAST realloc (a->dp, sizeof (mp_digit) * size);
+    if (tmp == NULL) {
+      /* reallocation failed but "a" is still valid [can be freed] */
      return MP_MEM;
    }

+    /* reallocation succeeded so set a->dp */
+    a->dp = tmp;
+
    /* zero excess digits */
    i        = a->alloc;
    a->alloc = size;
@ -3874,7 +3886,7 @@ mp_mod (mp_int * a, mp_int * b, mp_int * c)
 */
 #include <tommath.h>

-/* calc a value mod 2^b */
+/* calc a value mod 2**b */
 int
 mp_mod_2d (mp_int * a, int b, mp_int * c)
 {
@ -4405,12 +4417,13 @@ mp_mul_2d (mp_int * a, int b, mp_int * c)
 int
 mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
 {
-  int     res, pa, olduse;
+  mp_digit u, *tmpa, *tmpc;
+  mp_word  r;
+  int      ix, res, olduse;

  /* make sure c is big enough to hold a*b */
-  pa = a->used;
-  if (c->alloc < pa + 1) {
-    if ((res = mp_grow (c, pa + 1)) != MP_OKAY) {
+  if (c->alloc < a->used + 1) {
+    if ((res = mp_grow (c, a->used + 1)) != MP_OKAY) {
      return res;
    }
  }
@ -4418,43 +4431,42 @@ mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
  /* get the original destinations used count */
  olduse = c->used;

-  /* set the new temporary used count */
-  c->used = pa + 1;
+  /* set the sign */
  c->sign = a->sign;

-  {
-    register mp_digit u, *tmpa, *tmpc;
-    register mp_word r;
-    register int ix;
+  /* alias for a->dp [source] */
+  tmpa = a->dp;

-    /* alias for a->dp [source] */
-    tmpa = a->dp;
+  /* alias for c->dp [dest] */
+  tmpc = c->dp;

-    /* alias for c->dp [dest] */
-    tmpc = c->dp;
+  /* zero carry */
+  u = 0;

-    /* zero carry */
-    u = 0;
-    for (ix = 0; ix < pa; ix++) {
-      /* compute product and carry sum for this term */
-      r = ((mp_word) u) + ((mp_word)*tmpa++) * ((mp_word)b);
+  /* compute columns */
+  for (ix = 0; ix < a->used; ix++) {
+    /* compute product and carry sum for this term */
+    r       = ((mp_word) u) + ((mp_word)*tmpa++) * ((mp_word)b);

-      /* mask off higher bits to get a single digit */
-      *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
+    /* mask off higher bits to get a single digit */
+    *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));

-      /* send carry into next iteration */
-      u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
-    }
-    /* store final carry [if any] */
-    *tmpc++ = u;
-
-    /* now zero digits above the top */
-    for (; pa < olduse; pa++) {
-       *tmpc++ = 0;
-    }
+    /* send carry into next iteration */
+    u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
  }

-  mp_clamp (c);
+  /* store final carry [if any] */
+  *tmpc++ = u;
+
+  /* now zero digits above the top */
+  while (ix++ < olduse) {
+     *tmpc++ = 0;
+  }
+
+  /* set used count */
+  c->used = a->used + 1;
+  mp_clamp(c);
+
  return MP_OKAY;
 }

@ -6172,7 +6184,8 @@ mp_sub_d (mp_int * a, mp_digit b, mp_int * c)
     }
  }

-  for (; ix < oldused; ix++) {
+  /* zero excess digits */
+  while (ix++ < oldused) {
     *tmpc++ = 0;
  }
  mp_clamp(c);
@ -6596,12 +6609,12 @@ ERR:
 #include <tommath.h>

 /* squaring using Toom-Cook 3-way algorithm */
-int 
+int
 mp_toom_sqr(mp_int *a, mp_int *b)
 {
    mp_int w0, w1, w2, w3, w4, tmp1, a0, a1, a2;
    int res, B;
-        
+
    /* init temps */
    if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4, &a0, &a1, &a2, &tmp1, NULL)) != MP_OKAY) {
       return res;
@ -6609,8 +6622,8 @@ mp_toom_sqr(mp_int *a, mp_int *b)

    /* B */
    B = a->used / 3;
-    
-    /* a = a2 * B^2 + a1 * B + a0 */
+
+    /* a = a2 * B**2 + a1 * B + a0 */
    if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) {
       goto ERR;
    }
@ -6625,17 +6638,17 @@ mp_toom_sqr(mp_int *a, mp_int *b)
       goto ERR;
    }
    mp_rshd(&a2, B*2);
-        
+
    /* w0 = a0*a0 */
    if ((res = mp_sqr(&a0, &w0)) != MP_OKAY) {
       goto ERR;
    }
-    
+
    /* w4 = a2 * a2 */
    if ((res = mp_sqr(&a2, &w4)) != MP_OKAY) {
       goto ERR;
    }
-    
+
    /* w1 = (a2 + 2(a1 + 2a0))**2 */
    if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) {
       goto ERR;
@ -6649,11 +6662,11 @@ mp_toom_sqr(mp_int *a, mp_int *b)
    if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) {
       goto ERR;
    }
-    
+
    if ((res = mp_sqr(&tmp1, &w1)) != MP_OKAY) {
       goto ERR;
    }
-    
+
    /* w3 = (a0 + 2(a1 + 2a2))**2 */
    if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) {
       goto ERR;
@ -6667,11 +6680,11 @@ mp_toom_sqr(mp_int *a, mp_int *b)
    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
       goto ERR;
    }
-    
+
    if ((res = mp_sqr(&tmp1, &w3)) != MP_OKAY) {
       goto ERR;
    }
-    
+

    /* w2 = (a2 + a1 + a0)**2 */
    if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) {
@ -6683,18 +6696,18 @@ mp_toom_sqr(mp_int *a, mp_int *b)
    if ((res = mp_sqr(&tmp1, &w2)) != MP_OKAY) {
       goto ERR;
    }
-    
-    /* now solve the matrix 
-    
+
+    /* now solve the matrix
+
       0  0  0  0  1
       1  2  4  8  16
       1  1  1  1  1
       16 8  4  2  1
       1  0  0  0  0
-       
+
       using 12 subtractions, 4 shifts, 2 small divisions and 1 small multiplication.
     */
-     
+
     /* r1 - r4 */
     if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) {
        goto ERR;
@ -6766,7 +6779,7 @@ mp_toom_sqr(mp_int *a, mp_int *b)
     if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) {
        goto ERR;
     }
-     
+
     /* at this point shift W[n] by B*n */
     if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) {
        goto ERR;
@ -6779,8 +6792,8 @@ mp_toom_sqr(mp_int *a, mp_int *b)
     }
     if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) {
        goto ERR;
-     }     
-     
+     }
+
     if ((res = mp_add(&w0, &w1, b)) != MP_OKAY) {
        goto ERR;
     }
@ -6792,13 +6805,13 @@ mp_toom_sqr(mp_int *a, mp_int *b)
     }
     if ((res = mp_add(&tmp1, b, b)) != MP_OKAY) {
        goto ERR;
-     }     
-     
+     }
+
 ERR:
     mp_clear_multi(&w0, &w1, &w2, &w3, &w4, &a0, &a1, &a2, &tmp1, NULL);
     return res;
-}     
-     
+}
+

 /* End: bn_mp_toom_sqr.c */