diff --git a/aes.c b/aes.c
index 4d63992..01b2447 100644
--- a/aes.c
+++ b/aes.c
@@ -1,19 +1,19 @@
-/* This is an independent implementation of the encryption algorithm:   */
-/*                                                                      */
-/*         RIJNDAEL by Joan Daemen and Vincent Rijmen                   */
-/*                                                                      */
-/* which is a candidate algorithm in the Advanced Encryption Standard   */
-/* programme of the US National Institute of Standards and Technology.  */
-/*                                                                      */
-/* Copyright in this implementation is held by Dr B R Gladman but I     */
-/* hereby give permission for its free direct or derivative use subject */
-/* to acknowledgment of its origin and compliance with any conditions   */
-/* that the originators of the algorithm place on its exploitation.     */
-/*                                                                      */
-/* Dr Brian Gladman (gladman@seven77.demon.co.uk) 14th January 1999     */
-
-
-/* This code has been modified by Tom St Denis for libtomcrypt.a */
+/* AES implementation by Tom St Denis 
+ *
+ * Derived from the Public Domain source code by
+ 
+---  
+  * rijndael-alg-fst.c
+  *
+  * @version 3.0 (December 2000)
+  *
+  * Optimised ANSI C code for the Rijndael cipher (now AES)
+  *
+  * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
+  * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
+  * @author Paulo Barreto <paulo.barreto@terra.com.br>
+---
+ */
 
 #include "mycrypt.h"
 
@@ -45,285 +45,517 @@ const struct _cipher_descriptor aes_desc =
 
 #include "aes_tab.c"
 
-#define byte(x, y) (((x)>>(8*(y)))&255)
-
-#define f_rn(bo, bi, n, k)                          \
-    bo[n] =  ft_tab[0][byte(bi[n],0)] ^             \
-             ft_tab[1][byte(bi[(n + 1) & 3],1)] ^   \
-             ft_tab[2][byte(bi[(n + 2) & 3],2)] ^   \
-             ft_tab[3][byte(bi[(n + 3) & 3],3)] ^ *(k + n)
-
-#define i_rn(bo, bi, n, k)                          \
-    bo[n] =  it_tab[0][byte(bi[n],0)] ^             \
-             it_tab[1][byte(bi[(n + 3) & 3],1)] ^   \
-             it_tab[2][byte(bi[(n + 2) & 3],2)] ^   \
-             it_tab[3][byte(bi[(n + 1) & 3],3)] ^ *(k + n)
-
-#define ls_box(x)                \
-    ( fl_tab[0][byte(x, 0)] ^    \
-      fl_tab[1][byte(x, 1)] ^    \
-      fl_tab[2][byte(x, 2)] ^    \
-      fl_tab[3][byte(x, 3)] )
-
-#define f_rl(bo, bi, n, k)                          \
-    bo[n] =  fl_tab[0][byte(bi[n],0)] ^             \
-             fl_tab[1][byte(bi[(n + 1) & 3],1)] ^   \
-             fl_tab[2][byte(bi[(n + 2) & 3],2)] ^   \
-             fl_tab[3][byte(bi[(n + 3) & 3],3)] ^ *(k + n)
-
-#define i_rl(bo, bi, n, k)                          \
-    bo[n] =  il_tab[0][byte(bi[n],0)] ^             \
-             il_tab[1][byte(bi[(n + 3) & 3],1)] ^   \
-             il_tab[2][byte(bi[(n + 2) & 3],2)] ^   \
-             il_tab[3][byte(bi[(n + 1) & 3],3)] ^ *(k + n)
-
-#define star_x(x) (((x) & 0x7f7f7f7fUL) << 1) ^ ((((x) & 0x80808080UL) >> 7) * 0x1bUL)
-
-#define imix_col(y,x)       \
-    u   = star_x(x);        \
-    v   = star_x(u);        \
-    w   = star_x(v);        \
-    t   = w ^ (x);          \
-   (y)  = u ^ v ^ w;        \
-   (y) ^= ROR(u ^ t,  8) ^  \
-          ROR(v ^ t, 16) ^  \
-          ROR(t,24)
-
-#ifdef CLEAN_STACK
-static int _rijndael_setup(const unsigned char *key, int keylen, int numrounds, symmetric_key *skey)
-#else
-int rijndael_setup(const unsigned char *key, int keylen, int numrounds, symmetric_key *skey)
-#endif
+int rijndael_setup(const unsigned char *key, int keylen, int rounds, symmetric_key *skey)
 {
-    unsigned long  t, u, v, w, in_key[8];
-    int i, k_len;
-
-    /* check arguments */
-    _ARGCHK(key  != NULL);
+    int i = 0, j;
+    unsigned long temp, *rk;
+    
+    _ARGCHK(key != NULL);
     _ARGCHK(skey != NULL);
-
-    if (numrounds == 0) { 
-       numrounds = 10 + (2 * ((keylen/8)-2));
-    }       
-
+    
     if (keylen != 16 && keylen != 24 && keylen != 32) {
        return CRYPT_INVALID_KEYSIZE;
     }
-
-    if (numrounds != (10 + (2 * ((keylen/8)-2)))) {
+    
+    if (rounds != 0 && rounds != (10 + ((keylen/8)-2)*2)) {
        return CRYPT_INVALID_ROUNDS;
     }
-
-    k_len = keylen / 4;
-    for (i = 0; i < k_len; i++) {
-        LOAD32L(in_key[i], key+(4*i));
-    }
-
-    skey->rijndael.k_len = k_len;
-    skey->rijndael.eK[0] = in_key[0]; skey->rijndael.eK[1] = in_key[1];
-    skey->rijndael.eK[2] = in_key[2]; skey->rijndael.eK[3] = in_key[3];
-
-    switch(k_len) {
-    case 4: t = skey->rijndael.eK[3];
-            for(i = 0; i < 10; ++i) {
-               t = ls_box(ROR(t,  8)) ^ rco_tab[i];
-               t ^= skey->rijndael.eK[4 * i];     skey->rijndael.eK[4 * i + 4] = t;
-               t ^= skey->rijndael.eK[4 * i + 1]; skey->rijndael.eK[4 * i + 5] = t;
-               t ^= skey->rijndael.eK[4 * i + 2]; skey->rijndael.eK[4 * i + 6] = t;
-               t ^= skey->rijndael.eK[4 * i + 3]; skey->rijndael.eK[4 * i + 7] = t;
+    
+    skey->rijndael.Nr = 10 + ((keylen/8)-2)*2;
+        
+    /* setup the forward key */
+    rk                = skey->rijndael.eK;
+    LOAD32H(rk[0], key     );
+    LOAD32H(rk[1], key +  4);
+    LOAD32H(rk[2], key +  8);
+    LOAD32H(rk[3], key + 12);
+    if (keylen == 16) {
+        for (;;) {
+            temp  = rk[3];
+            rk[4] = rk[0] ^
+                (Te4[(temp >> 16) & 0xff] & 0xff000000) ^
+                (Te4[(temp >>  8) & 0xff] & 0x00ff0000) ^
+                (Te4[(temp      ) & 0xff] & 0x0000ff00) ^
+                (Te4[(temp >> 24)       ] & 0x000000ff) ^
+                rcon[i];
+            rk[5] = rk[1] ^ rk[4];
+            rk[6] = rk[2] ^ rk[5];
+            rk[7] = rk[3] ^ rk[6];
+            if (++i == 10) {
+                break;
             }
-            break;
-    case 6: skey->rijndael.eK[4]     = in_key[4]; 
-            t = skey->rijndael.eK[5] = in_key[5];
-            for(i = 0; i < 8; ++i) {
-              t = ls_box(ROR(t,  8)) ^ rco_tab[i];
-              t ^= skey->rijndael.eK[6 * i];     skey->rijndael.eK[6 * i + 6] = t;
-              t ^= skey->rijndael.eK[6 * i + 1]; skey->rijndael.eK[6 * i + 7] = t;
-              t ^= skey->rijndael.eK[6 * i + 2]; skey->rijndael.eK[6 * i + 8] = t;
-              t ^= skey->rijndael.eK[6 * i + 3]; skey->rijndael.eK[6 * i + 9] = t;
-              t ^= skey->rijndael.eK[6 * i + 4]; skey->rijndael.eK[6 * i + 10] = t;
-              t ^= skey->rijndael.eK[6 * i + 5]; skey->rijndael.eK[6 * i + 11] = t;
+            rk += 4;
+        }
+    } else if (keylen == 24) {
+        LOAD32H(rk[4], key + 16);
+        LOAD32H(rk[5], key + 20);
+        for (;;) {
+            temp = rk[ 5];
+            rk[ 6] = rk[ 0] ^
+                (Te4[(temp >> 16) & 0xff] & 0xff000000) ^
+                (Te4[(temp >>  8) & 0xff] & 0x00ff0000) ^
+                (Te4[(temp      ) & 0xff] & 0x0000ff00) ^
+                (Te4[(temp >> 24)       ] & 0x000000ff) ^
+                rcon[i];
+            rk[ 7] = rk[ 1] ^ rk[ 6];
+            rk[ 8] = rk[ 2] ^ rk[ 7];
+            rk[ 9] = rk[ 3] ^ rk[ 8];
+            if (++i == 8) {
+                break;
             }
-            break;
-    case 8: skey->rijndael.eK[4]     = in_key[4]; 
-            skey->rijndael.eK[5]     = in_key[5];
-            skey->rijndael.eK[6]     = in_key[6]; 
-            t = skey->rijndael.eK[7] = in_key[7];
-            for(i = 0; i < 7; ++i) {
-               t = ls_box(ROR(t,  8)) ^ rco_tab[i];
-               t ^= skey->rijndael.eK[8 * i];     skey->rijndael.eK[8 * i + 8] = t;
-               t ^= skey->rijndael.eK[8 * i + 1]; skey->rijndael.eK[8 * i + 9] = t;
-               t ^= skey->rijndael.eK[8 * i + 2]; skey->rijndael.eK[8 * i + 10] = t;
-               t ^= skey->rijndael.eK[8 * i + 3]; skey->rijndael.eK[8 * i + 11] = t;
-
-               t  = skey->rijndael.eK[8 * i + 4] ^ ls_box(t); skey->rijndael.eK[8 * i + 12] = t;
-               t ^= skey->rijndael.eK[8 * i + 5]; skey->rijndael.eK[8 * i + 13] = t;
-               t ^= skey->rijndael.eK[8 * i + 6]; skey->rijndael.eK[8 * i + 14] = t;
-               t ^= skey->rijndael.eK[8 * i + 7]; skey->rijndael.eK[8 * i + 15] = t;
+            rk[10] = rk[ 4] ^ rk[ 9];
+            rk[11] = rk[ 5] ^ rk[10];
+            rk += 6;
+        }
+    } else if (keylen == 32) {
+        LOAD32H(rk[4], key + 16);
+        LOAD32H(rk[5], key + 20);
+        LOAD32H(rk[6], key + 24);
+        LOAD32H(rk[7], key + 28);
+        for (;;) {
+            temp = rk[ 7];
+            rk[ 8] = rk[ 0] ^
+                (Te4[(temp >> 16) & 0xff] & 0xff000000) ^
+                (Te4[(temp >>  8) & 0xff] & 0x00ff0000) ^
+                (Te4[(temp      ) & 0xff] & 0x0000ff00) ^
+                (Te4[(temp >> 24)       ] & 0x000000ff) ^
+                rcon[i];
+            rk[ 9] = rk[ 1] ^ rk[ 8];
+            rk[10] = rk[ 2] ^ rk[ 9];
+            rk[11] = rk[ 3] ^ rk[10];
+            if (++i == 7) {
+                break;
             }
-            break;
+            temp = rk[11];
+            rk[12] = rk[ 4] ^
+                (Te4[(temp >> 24)       ] & 0xff000000) ^
+                (Te4[(temp >> 16) & 0xff] & 0x00ff0000) ^
+                (Te4[(temp >>  8) & 0xff] & 0x0000ff00) ^
+                (Te4[(temp      ) & 0xff] & 0x000000ff);
+            rk[13] = rk[ 5] ^ rk[12];
+            rk[14] = rk[ 6] ^ rk[13];
+            rk[15] = rk[ 7] ^ rk[14];
+            rk += 8;
+        }
     }
-
-    skey->rijndael.dK[0] = skey->rijndael.eK[0];
-    skey->rijndael.dK[1] = skey->rijndael.eK[1];
-    skey->rijndael.dK[2] = skey->rijndael.eK[2];
-    skey->rijndael.dK[3] = skey->rijndael.eK[3];
-    for(i = 4; i < 4 * k_len + 24; ++i) {
-        imix_col(skey->rijndael.dK[i], skey->rijndael.eK[i]);
+    
+    /* setup the inverse key now */
+    memcpy(skey->rijndael.dK, skey->rijndael.eK, sizeof(skey->rijndael.eK));
+    rk = skey->rijndael.dK;
+    
+    for (i = 0, j = 4*skey->rijndael.Nr; i < j; i += 4, j -= 4) {
+        temp = rk[i    ]; rk[i    ] = rk[j    ]; rk[j    ] = temp;
+        temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
+        temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
+        temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
     }
-    return CRYPT_OK;
-};
-
-#ifdef CLEAN_STACK
-int rijndael_setup(const unsigned char *key, int keylen, int numrounds, symmetric_key *skey)
-{
-   int x;
-   x = _rijndael_setup(key, keylen, numrounds, skey);
-   burn_stack(sizeof(unsigned long) * 12 + sizeof(int) * 2);
-   return x;
+    /* apply the inverse MixColumn transform to all round keys but the first and the last: */
+    for (i = 1; i < skey->rijndael.Nr; i++) {
+        rk += 4;
+        rk[0] =
+            Td0[Te4[(rk[0] >> 24)       ] & 0xff] ^
+            Td1[Te4[(rk[0] >> 16) & 0xff] & 0xff] ^
+            Td2[Te4[(rk[0] >>  8) & 0xff] & 0xff] ^
+            Td3[Te4[(rk[0]      ) & 0xff] & 0xff];
+        rk[1] =
+            Td0[Te4[(rk[1] >> 24)       ] & 0xff] ^
+            Td1[Te4[(rk[1] >> 16) & 0xff] & 0xff] ^
+            Td2[Te4[(rk[1] >>  8) & 0xff] & 0xff] ^
+            Td3[Te4[(rk[1]      ) & 0xff] & 0xff];
+        rk[2] =
+            Td0[Te4[(rk[2] >> 24)       ] & 0xff] ^
+            Td1[Te4[(rk[2] >> 16) & 0xff] & 0xff] ^
+            Td2[Te4[(rk[2] >>  8) & 0xff] & 0xff] ^
+            Td3[Te4[(rk[2]      ) & 0xff] & 0xff];
+        rk[3] =
+            Td0[Te4[(rk[3] >> 24)       ] & 0xff] ^
+            Td1[Te4[(rk[3] >> 16) & 0xff] & 0xff] ^
+            Td2[Te4[(rk[3] >>  8) & 0xff] & 0xff] ^
+            Td3[Te4[(rk[3]      ) & 0xff] & 0xff];
+    }
+    
+    return CRYPT_OK;   
 }
-#endif
 
-/* encrypt a block of text  */
-
-#define f_nround(bo, bi, k) \
-    f_rn(bo, bi, 0, k);     \
-    f_rn(bo, bi, 1, k);     \
-    f_rn(bo, bi, 2, k);     \
-    f_rn(bo, bi, 3, k);     \
-    k += 4
-
-#define f_lround(bo, bi, k) \
-    f_rl(bo, bi, 0, k);     \
-    f_rl(bo, bi, 1, k);     \
-    f_rl(bo, bi, 2, k);     \
-    f_rl(bo, bi, 3, k)
+void rijndael_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_key *skey) 
+{
+    unsigned long s0, s1, s2, s3, t0, t1, t2, t3, *rk;
+    int Nr;
     
 #ifdef SMALL_CODE
-
-static void _fnround(unsigned long *bo, unsigned long *bi, unsigned long *k)
-{
-   f_nround(bo, bi, k);
-}
-
-static void _flround(unsigned long *bo, unsigned long *bi, unsigned long *k)
-{
-   f_lround(bo, bi, k);
-} 
-
-#undef   f_nround
-#define  f_nround(bo, bi, k) { _fnround(bo, bi, k); k += 4; }
-
-#undef   f_lround
-#define  f_lround(bo, bi, k) _flround(bo, bi, k)
-
-#endif
-
-void rijndael_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_key *skey)
-{   
-    unsigned long  b0[4], b1[4], *kp;
+    int r;
+#endif 
 
     _ARGCHK(pt != NULL);
     _ARGCHK(ct != NULL);
     _ARGCHK(skey != NULL);
-
-    LOAD32L(b0[0], &pt[0]); LOAD32L(b0[1], &pt[4]);
-    LOAD32L(b0[2], &pt[8]); LOAD32L(b0[3], &pt[12]);
-    b0[0] ^= skey->rijndael.eK[0]; b0[1] ^= skey->rijndael.eK[1];
-    b0[2] ^= skey->rijndael.eK[2]; b0[3] ^= skey->rijndael.eK[3];
-    kp = skey->rijndael.eK + 4;
-
-    if (skey->rijndael.k_len > 6) {
-        f_nround(b1, b0, kp); f_nround(b0, b1, kp);
-        f_nround(b1, b0, kp); f_nround(b0, b1, kp);
-    } else if (skey->rijndael.k_len > 4) {
-        f_nround(b1, b0, kp); f_nround(b0, b1, kp);
-    }
-
-    f_nround(b1, b0, kp); f_nround(b0, b1, kp);
-    f_nround(b1, b0, kp); f_nround(b0, b1, kp);
-    f_nround(b1, b0, kp); f_nround(b0, b1, kp);
-    f_nround(b1, b0, kp); f_nround(b0, b1, kp);
-    f_nround(b1, b0, kp); f_lround(b0, b1, kp);
-
-    STORE32L(b0[0], &ct[0]); STORE32L(b0[1], &ct[4]);
-    STORE32L(b0[2], &ct[8]); STORE32L(b0[3], &ct[12]);
-#ifdef CLEAN_STACK
-    zeromem(b0, sizeof(b0));
-    zeromem(b1, sizeof(b1));
-#endif
-};
-
-/* decrypt a block of text  */
-#define i_nround(bo, bi, k) \
-    i_rn(bo, bi, 0, k);     \
-    i_rn(bo, bi, 1, k);     \
-    i_rn(bo, bi, 2, k);     \
-    i_rn(bo, bi, 3, k);     \
-    k -= 4
-
-#define i_lround(bo, bi, k) \
-    i_rl(bo, bi, 0, k);     \
-    i_rl(bo, bi, 1, k);     \
-    i_rl(bo, bi, 2, k);     \
-    i_rl(bo, bi, 3, k)
     
-#ifdef SMALL_CODE
+    Nr = skey->rijndael.Nr;
+    rk = skey->rijndael.eK;
+    
+    /*
+     * map byte array block to cipher state
+     * and add initial round key:
+     */
+    LOAD32H(s0, pt      ); s0 ^= rk[0];
+    LOAD32H(s1, pt  +  4); s1 ^= rk[1];
+    LOAD32H(s2, pt  +  8); s2 ^= rk[2];
+    LOAD32H(s3, pt  + 12); s3 ^= rk[3];
+#ifndef SMALL_CODE
+    /* round 1: */
+    t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[ 4];
+    t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[ 5];
+    t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[ 6];
+    t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[ 7];
+    /* round 2: */
+    s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[ 8];
+    s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[ 9];
+    s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[10];
+    s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[11];
+    /* round 3: */
+    t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[12];
+    t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[13];
+    t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[14];
+    t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[15];
+    /* round 4: */
+    s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[16];
+    s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[17];
+    s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[18];
+    s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[19];
+    /* round 5: */
+    t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[20];
+    t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[21];
+    t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[22];
+    t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[23];
+    /* round 6: */
+    s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[24];
+    s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[25];
+    s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[26];
+    s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[27];
+    /* round 7: */
+    t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[28];
+    t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[29];
+    t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[30];
+    t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[31];
+    /* round 8: */
+    s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[32];
+    s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[33];
+    s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[34];
+    s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[35];
+    /* round 9: */
+    t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[36];
+    t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[37];
+    t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[38];
+    t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[39];
+    if (Nr > 10) {
+        /* round 10: */
+        s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[40];
+        s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[41];
+        s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[42];
+        s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[43];
+        /* round 11: */
+        t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[44];
+        t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[45];
+        t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[46];
+        t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[47];
+        if (Nr > 12) {
+            /* round 12: */
+            s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[48];
+            s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[49];
+            s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[50];
+            s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[51];
+            /* round 13: */
+            t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[52];
+            t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[53];
+            t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[54];
+            t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[55];
+        }
+    }
+    rk += Nr << 2;
+#else  /* SMALL_CODE */
+    /*
+     * Nr - 1 full rounds:
+     */
+    r = Nr >> 1;
+    for (;;) {
+        t0 =
+            Te0[(s0 >> 24)       ] ^
+            Te1[(s1 >> 16) & 0xff] ^
+            Te2[(s2 >>  8) & 0xff] ^
+            Te3[(s3      ) & 0xff] ^
+            rk[4];
+        t1 =
+            Te0[(s1 >> 24)       ] ^
+            Te1[(s2 >> 16) & 0xff] ^
+            Te2[(s3 >>  8) & 0xff] ^
+            Te3[(s0      ) & 0xff] ^
+            rk[5];
+        t2 =
+            Te0[(s2 >> 24)       ] ^
+            Te1[(s3 >> 16) & 0xff] ^
+            Te2[(s0 >>  8) & 0xff] ^
+            Te3[(s1      ) & 0xff] ^
+            rk[6];
+        t3 =
+            Te0[(s3 >> 24)       ] ^
+            Te1[(s0 >> 16) & 0xff] ^
+            Te2[(s1 >>  8) & 0xff] ^
+            Te3[(s2      ) & 0xff] ^
+            rk[7];
 
-static void _inround(unsigned long *bo, unsigned long *bi, unsigned long *k)
-{
-   i_nround(bo, bi, k);
+        rk += 8;
+        if (--r == 0) {
+            break;
+        }
+
+        s0 =
+            Te0[(t0 >> 24)       ] ^
+            Te1[(t1 >> 16) & 0xff] ^
+            Te2[(t2 >>  8) & 0xff] ^
+            Te3[(t3      ) & 0xff] ^
+            rk[0];
+        s1 =
+            Te0[(t1 >> 24)       ] ^
+            Te1[(t2 >> 16) & 0xff] ^
+            Te2[(t3 >>  8) & 0xff] ^
+            Te3[(t0      ) & 0xff] ^
+            rk[1];
+        s2 =
+            Te0[(t2 >> 24)       ] ^
+            Te1[(t3 >> 16) & 0xff] ^
+            Te2[(t0 >>  8) & 0xff] ^
+            Te3[(t1      ) & 0xff] ^
+            rk[2];
+        s3 =
+            Te0[(t3 >> 24)       ] ^
+            Te1[(t0 >> 16) & 0xff] ^
+            Te2[(t1 >>  8) & 0xff] ^
+            Te3[(t2      ) & 0xff] ^
+            rk[3];
+    }
+#endif /* SMALL_CODE */
+    /*
+     * apply last round and
+     * map cipher state to byte array block:
+     */
+    s0 =
+        (Te4[(t0 >> 24)       ] & 0xff000000) ^
+        (Te4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
+        (Te4[(t2 >>  8) & 0xff] & 0x0000ff00) ^
+        (Te4[(t3      ) & 0xff] & 0x000000ff) ^
+        rk[0];
+    STORE32H(s0, ct);
+    s1 =
+        (Te4[(t1 >> 24)       ] & 0xff000000) ^
+        (Te4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
+        (Te4[(t3 >>  8) & 0xff] & 0x0000ff00) ^
+        (Te4[(t0      ) & 0xff] & 0x000000ff) ^
+        rk[1];
+    STORE32H(s1, ct+4);
+    s2 =
+        (Te4[(t2 >> 24)       ] & 0xff000000) ^
+        (Te4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
+        (Te4[(t0 >>  8) & 0xff] & 0x0000ff00) ^
+        (Te4[(t1      ) & 0xff] & 0x000000ff) ^
+        rk[2];
+    STORE32H(s2, ct+8);
+    s3 =
+        (Te4[(t3 >> 24)       ] & 0xff000000) ^
+        (Te4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
+        (Te4[(t1 >>  8) & 0xff] & 0x0000ff00) ^
+        (Te4[(t2      ) & 0xff] & 0x000000ff) ^
+        rk[3];
+    STORE32H(s3, ct+12);
 }
 
-static void _ilround(unsigned long *bo, unsigned long *bi, unsigned long *k)
-{
-   i_lround(bo, bi, k);
-} 
-
-#undef   i_nround
-#define  i_nround(bo, bi, k) { _inround(bo, bi, k); k -= 4; }
-
-#undef   i_lround
-#define  i_lround(bo, bi, k) _ilround(bo, bi, k)
-
-#endif    
-
-void rijndael_ecb_decrypt(const unsigned char *ct, unsigned char *pt, symmetric_key *skey)
-{   
-    unsigned long b0[4], b1[4], *kp;
+void rijndael_ecb_decrypt(const unsigned char *ct, unsigned char *pt, symmetric_key *skey) {
+    unsigned long s0, s1, s2, s3, t0, t1, t2, t3, *rk;
+    int Nr;
+#ifdef SMALL_CODE
+    int r;
+#endif /* SMALL_CODE */
 
     _ARGCHK(pt != NULL);
     _ARGCHK(ct != NULL);
     _ARGCHK(skey != NULL);
+    
+    Nr = skey->rijndael.Nr;
+    rk = skey->rijndael.dK;
 
-    LOAD32L(b0[0], &ct[0]); LOAD32L(b0[1], &ct[4]);
-    LOAD32L(b0[2], &ct[8]); LOAD32L(b0[3], &ct[12]);
-    b0[0] ^= skey->rijndael.eK[4 * skey->rijndael.k_len + 24]; 
-    b0[1] ^= skey->rijndael.eK[4 * skey->rijndael.k_len + 25];
-    b0[2] ^= skey->rijndael.eK[4 * skey->rijndael.k_len + 26]; 
-    b0[3] ^= skey->rijndael.eK[4 * skey->rijndael.k_len + 27];
-    kp = skey->rijndael.dK + 4 * (skey->rijndael.k_len + 5);
-
-    if(skey->rijndael.k_len > 6) {
-        i_nround(b1, b0, kp); i_nround(b0, b1, kp);
-        i_nround(b1, b0, kp); i_nround(b0, b1, kp);
-    } else if(skey->rijndael.k_len > 4) {
-        i_nround(b1, b0, kp); i_nround(b0, b1, kp);
+    /*
+     * map byte array block to cipher state
+     * and add initial round key:
+     */
+    LOAD32H(s0, ct      ); s0 ^= rk[0];
+    LOAD32H(s1, ct  +  4); s1 ^= rk[1];
+    LOAD32H(s2, ct  +  8); s2 ^= rk[2];
+    LOAD32H(s3, ct  + 12); s3 ^= rk[3];
+#ifndef SMALL_CODE
+    /* round 1: */
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[ 4];
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[ 5];
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[ 6];
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[ 7];
+    /* round 2: */
+    s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[ 8];
+    s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[ 9];
+    s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[10];
+    s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[11];
+    /* round 3: */
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[12];
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[13];
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[14];
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[15];
+    /* round 4: */
+    s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[16];
+    s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[17];
+    s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[18];
+    s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[19];
+    /* round 5: */
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[20];
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[21];
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[22];
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[23];
+    /* round 6: */
+    s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[24];
+    s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[25];
+    s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[26];
+    s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[27];
+    /* round 7: */
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[28];
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[29];
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[30];
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[31];
+    /* round 8: */
+    s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[32];
+    s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[33];
+    s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[34];
+    s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[35];
+    /* round 9: */
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[36];
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[37];
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[38];
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[39];
+    if (Nr > 10) {
+        /* round 10: */
+        s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[40];
+        s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[41];
+        s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[42];
+        s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[43];
+        /* round 11: */
+        t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[44];
+        t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[45];
+        t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[46];
+        t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[47];
+        if (Nr > 12) {
+            /* round 12: */
+            s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[48];
+            s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[49];
+            s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[50];
+            s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[51];
+            /* round 13: */
+            t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[52];
+            t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[53];
+            t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[54];
+            t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[55];
+        }
     }
+    rk += Nr << 2;
+#else  /* SMALL_CODE */
+    /*
+     * Nr - 1 full rounds:
+     */
+    r = Nr >> 1;
+    for (;;) {
+        t0 =
+            Td0[(s0 >> 24)       ] ^
+            Td1[(s3 >> 16) & 0xff] ^
+            Td2[(s2 >>  8) & 0xff] ^
+            Td3[(s1      ) & 0xff] ^
+            rk[4];
+        t1 =
+            Td0[(s1 >> 24)       ] ^
+            Td1[(s0 >> 16) & 0xff] ^
+            Td2[(s3 >>  8) & 0xff] ^
+            Td3[(s2      ) & 0xff] ^
+            rk[5];
+        t2 =
+            Td0[(s2 >> 24)       ] ^
+            Td1[(s1 >> 16) & 0xff] ^
+            Td2[(s0 >>  8) & 0xff] ^
+            Td3[(s3      ) & 0xff] ^
+            rk[6];
+        t3 =
+            Td0[(s3 >> 24)       ] ^
+            Td1[(s2 >> 16) & 0xff] ^
+            Td2[(s1 >>  8) & 0xff] ^
+            Td3[(s0      ) & 0xff] ^
+            rk[7];
 
-    i_nround(b1, b0, kp); i_nround(b0, b1, kp);
-    i_nround(b1, b0, kp); i_nround(b0, b1, kp);
-    i_nround(b1, b0, kp); i_nround(b0, b1, kp);
-    i_nround(b1, b0, kp); i_nround(b0, b1, kp);
-    i_nround(b1, b0, kp); i_lround(b0, b1, kp);
+        rk += 8;
+        if (--r == 0) {
+            break;
+        }
 
-    STORE32L(b0[0], &pt[0]); STORE32L(b0[1], &pt[4]);
-    STORE32L(b0[2], &pt[8]); STORE32L(b0[3], &pt[12]);
-#ifdef CLEAN_STACK
-    zeromem(b0, sizeof(b0));
-    zeromem(b1, sizeof(b1));
-#endif    
-};
+        s0 =
+            Td0[(t0 >> 24)       ] ^
+            Td1[(t3 >> 16) & 0xff] ^
+            Td2[(t2 >>  8) & 0xff] ^
+            Td3[(t1      ) & 0xff] ^
+            rk[0];
+        s1 =
+            Td0[(t1 >> 24)       ] ^
+            Td1[(t0 >> 16) & 0xff] ^
+            Td2[(t3 >>  8) & 0xff] ^
+            Td3[(t2      ) & 0xff] ^
+            rk[1];
+        s2 =
+            Td0[(t2 >> 24)       ] ^
+            Td1[(t1 >> 16) & 0xff] ^
+            Td2[(t0 >>  8) & 0xff] ^
+            Td3[(t3      ) & 0xff] ^
+            rk[2];
+        s3 =
+            Td0[(t3 >> 24)       ] ^
+            Td1[(t2 >> 16) & 0xff] ^
+            Td2[(t1 >>  8) & 0xff] ^
+            Td3[(t0      ) & 0xff] ^
+            rk[3];
+    }
+#endif /* SMALL_CODE */
+    /*
+     * apply last round and
+     * map cipher state to byte array block:
+     */
+    s0 =
+        (Td4[(t0 >> 24)       ] & 0xff000000) ^
+        (Td4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
+        (Td4[(t2 >>  8) & 0xff] & 0x0000ff00) ^
+        (Td4[(t1      ) & 0xff] & 0x000000ff) ^
+        rk[0];
+    STORE32H(s0, pt);
+    s1 =
+        (Td4[(t1 >> 24)       ] & 0xff000000) ^
+        (Td4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
+        (Td4[(t3 >>  8) & 0xff] & 0x0000ff00) ^
+        (Td4[(t2      ) & 0xff] & 0x000000ff) ^
+        rk[1];
+    STORE32H(s1, pt+4);
+    s2 =
+        (Td4[(t2 >> 24)       ] & 0xff000000) ^
+        (Td4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
+        (Td4[(t0 >>  8) & 0xff] & 0x0000ff00) ^
+        (Td4[(t3      ) & 0xff] & 0x000000ff) ^
+        rk[2];
+    STORE32H(s2, pt+8);
+    s3 =
+        (Td4[(t3 >> 24)       ] & 0xff000000) ^
+        (Td4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
+        (Td4[(t1 >>  8) & 0xff] & 0x0000ff00) ^
+        (Td4[(t0      ) & 0xff] & 0x000000ff) ^
+        rk[3];
+    STORE32H(s3, pt+12);
+}
 
 int rijndael_test(void)
 {
@@ -337,7 +569,7 @@ int rijndael_test(void)
  } tests[] = {
     { 16,
       { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 
-        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
+        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }, 
       { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
         0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff },
       { 0x69, 0xc4, 0xe0, 0xd8, 0x6a, 0x7b, 0x04, 0x30, 
@@ -369,14 +601,31 @@ int rijndael_test(void)
  int i;
  
  for (i = 0; i < (int)(sizeof(tests)/sizeof(tests[0])); i++) {
-     if ((errno = rijndael_setup(tests[i].key, tests[i].keylen, 0, &key)) != CRYPT_OK) { 
+    zeromem(&key, sizeof(key));
+    if ((errno = rijndael_setup(tests[i].key, tests[i].keylen, 0, &key)) != CRYPT_OK) { 
        return errno;
     }
-
+  
     rijndael_ecb_encrypt(tests[i].pt, tmp[0], &key);
     rijndael_ecb_decrypt(tmp[0], tmp[1], &key);
     if (memcmp(tmp[0], tests[i].ct, 16) || memcmp(tmp[1], tests[i].pt, 16)) { 
-       return CRYPT_FAIL_TESTVECTOR;
+#if 0
+       printf("\n\nTest %d failed\n", i);
+       if (memcmp(tmp[0], tests[i].ct, 16)) {
+          printf("CT: ");
+          for (i = 0; i < 16; i++) {
+             printf("%02x ", tmp[0][i]);
+          }
+          printf("\n");
+       } else {
+          printf("PT: ");
+          for (i = 0; i < 16; i++) {
+             printf("%02x ", tmp[1][i]);
+          }
+          printf("\n");
+       }
+#endif       
+        return CRYPT_FAIL_TESTVECTOR;
     }
  }       
  return CRYPT_OK;
diff --git a/aes_tab.c b/aes_tab.c
index 398bf84..98cf63b 100644
--- a/aes_tab.c
+++ b/aes_tab.c
@@ -1,851 +1,682 @@
 /* The precomputed tables for AES */
+/*
+Te0[x] = S [x].[02, 01, 01, 03];
+Te1[x] = S [x].[03, 02, 01, 01];
+Te2[x] = S [x].[01, 03, 02, 01];
+Te3[x] = S [x].[01, 01, 03, 02];
+Te4[x] = S [x].[01, 01, 01, 01];
 
-static const unsigned long ft_tab[4][256] = {
-{0xa56363c6UL, 0x847c7cf8UL, 0x997777eeUL, 0x8d7b7bf6UL, 0x0df2f2ffUL, 
- 0xbd6b6bd6UL, 0xb16f6fdeUL, 0x54c5c591UL, 0x50303060UL, 0x03010102UL, 
- 0xa96767ceUL, 0x7d2b2b56UL, 0x19fefee7UL, 0x62d7d7b5UL, 0xe6abab4dUL, 
- 0x9a7676ecUL, 0x45caca8fUL, 0x9d82821fUL, 0x40c9c989UL, 0x877d7dfaUL, 
- 0x15fafaefUL, 0xeb5959b2UL, 0xc947478eUL, 0x0bf0f0fbUL, 0xecadad41UL, 
- 0x67d4d4b3UL, 0xfda2a25fUL, 0xeaafaf45UL, 0xbf9c9c23UL, 0xf7a4a453UL, 
- 0x967272e4UL, 0x5bc0c09bUL, 0xc2b7b775UL, 0x1cfdfde1UL, 0xae93933dUL, 
- 0x6a26264cUL, 0x5a36366cUL, 0x413f3f7eUL, 0x02f7f7f5UL, 0x4fcccc83UL, 
- 0x5c343468UL, 0xf4a5a551UL, 0x34e5e5d1UL, 0x08f1f1f9UL, 0x937171e2UL, 
- 0x73d8d8abUL, 0x53313162UL, 0x3f15152aUL, 0x0c040408UL, 0x52c7c795UL, 
- 0x65232346UL, 0x5ec3c39dUL, 0x28181830UL, 0xa1969637UL, 0x0f05050aUL, 
- 0xb59a9a2fUL, 0x0907070eUL, 0x36121224UL, 0x9b80801bUL, 0x3de2e2dfUL, 
- 0x26ebebcdUL, 0x6927274eUL, 0xcdb2b27fUL, 0x9f7575eaUL, 0x1b090912UL, 
- 0x9e83831dUL, 0x742c2c58UL, 0x2e1a1a34UL, 0x2d1b1b36UL, 0xb26e6edcUL, 
- 0xee5a5ab4UL, 0xfba0a05bUL, 0xf65252a4UL, 0x4d3b3b76UL, 0x61d6d6b7UL, 
- 0xceb3b37dUL, 0x7b292952UL, 0x3ee3e3ddUL, 0x712f2f5eUL, 0x97848413UL, 
- 0xf55353a6UL, 0x68d1d1b9UL, 0x00000000UL, 0x2cededc1UL, 0x60202040UL, 
- 0x1ffcfce3UL, 0xc8b1b179UL, 0xed5b5bb6UL, 0xbe6a6ad4UL, 0x46cbcb8dUL, 
- 0xd9bebe67UL, 0x4b393972UL, 0xde4a4a94UL, 0xd44c4c98UL, 0xe85858b0UL, 
- 0x4acfcf85UL, 0x6bd0d0bbUL, 0x2aefefc5UL, 0xe5aaaa4fUL, 0x16fbfbedUL, 
- 0xc5434386UL, 0xd74d4d9aUL, 0x55333366UL, 0x94858511UL, 0xcf45458aUL, 
- 0x10f9f9e9UL, 0x06020204UL, 0x817f7ffeUL, 0xf05050a0UL, 0x443c3c78UL, 
- 0xba9f9f25UL, 0xe3a8a84bUL, 0xf35151a2UL, 0xfea3a35dUL, 0xc0404080UL, 
- 0x8a8f8f05UL, 0xad92923fUL, 0xbc9d9d21UL, 0x48383870UL, 0x04f5f5f1UL, 
- 0xdfbcbc63UL, 0xc1b6b677UL, 0x75dadaafUL, 0x63212142UL, 0x30101020UL, 
- 0x1affffe5UL, 0x0ef3f3fdUL, 0x6dd2d2bfUL, 0x4ccdcd81UL, 0x140c0c18UL, 
- 0x35131326UL, 0x2fececc3UL, 0xe15f5fbeUL, 0xa2979735UL, 0xcc444488UL, 
- 0x3917172eUL, 0x57c4c493UL, 0xf2a7a755UL, 0x827e7efcUL, 0x473d3d7aUL, 
- 0xac6464c8UL, 0xe75d5dbaUL, 0x2b191932UL, 0x957373e6UL, 0xa06060c0UL, 
- 0x98818119UL, 0xd14f4f9eUL, 0x7fdcdca3UL, 0x66222244UL, 0x7e2a2a54UL, 
- 0xab90903bUL, 0x8388880bUL, 0xca46468cUL, 0x29eeeec7UL, 0xd3b8b86bUL, 
- 0x3c141428UL, 0x79dedea7UL, 0xe25e5ebcUL, 0x1d0b0b16UL, 0x76dbdbadUL, 
- 0x3be0e0dbUL, 0x56323264UL, 0x4e3a3a74UL, 0x1e0a0a14UL, 0xdb494992UL, 
- 0x0a06060cUL, 0x6c242448UL, 0xe45c5cb8UL, 0x5dc2c29fUL, 0x6ed3d3bdUL, 
- 0xefacac43UL, 0xa66262c4UL, 0xa8919139UL, 0xa4959531UL, 0x37e4e4d3UL, 
- 0x8b7979f2UL, 0x32e7e7d5UL, 0x43c8c88bUL, 0x5937376eUL, 0xb76d6ddaUL, 
- 0x8c8d8d01UL, 0x64d5d5b1UL, 0xd24e4e9cUL, 0xe0a9a949UL, 0xb46c6cd8UL, 
- 0xfa5656acUL, 0x07f4f4f3UL, 0x25eaeacfUL, 0xaf6565caUL, 0x8e7a7af4UL, 
- 0xe9aeae47UL, 0x18080810UL, 0xd5baba6fUL, 0x887878f0UL, 0x6f25254aUL, 
- 0x722e2e5cUL, 0x241c1c38UL, 0xf1a6a657UL, 0xc7b4b473UL, 0x51c6c697UL, 
- 0x23e8e8cbUL, 0x7cdddda1UL, 0x9c7474e8UL, 0x211f1f3eUL, 0xdd4b4b96UL, 
- 0xdcbdbd61UL, 0x868b8b0dUL, 0x858a8a0fUL, 0x907070e0UL, 0x423e3e7cUL, 
- 0xc4b5b571UL, 0xaa6666ccUL, 0xd8484890UL, 0x05030306UL, 0x01f6f6f7UL, 
- 0x120e0e1cUL, 0xa36161c2UL, 0x5f35356aUL, 0xf95757aeUL, 0xd0b9b969UL, 
- 0x91868617UL, 0x58c1c199UL, 0x271d1d3aUL, 0xb99e9e27UL, 0x38e1e1d9UL, 
- 0x13f8f8ebUL, 0xb398982bUL, 0x33111122UL, 0xbb6969d2UL, 0x70d9d9a9UL, 
- 0x898e8e07UL, 0xa7949433UL, 0xb69b9b2dUL, 0x221e1e3cUL, 0x92878715UL, 
- 0x20e9e9c9UL, 0x49cece87UL, 0xff5555aaUL, 0x78282850UL, 0x7adfdfa5UL, 
- 0x8f8c8c03UL, 0xf8a1a159UL, 0x80898909UL, 0x170d0d1aUL, 0xdabfbf65UL, 
- 0x31e6e6d7UL, 0xc6424284UL, 0xb86868d0UL, 0xc3414182UL, 0xb0999929UL, 
- 0x772d2d5aUL, 0x110f0f1eUL, 0xcbb0b07bUL, 0xfc5454a8UL, 0xd6bbbb6dUL, 
- 0x3a16162cUL}, 
-{0x6363c6a5UL, 0x7c7cf884UL, 0x7777ee99UL, 0x7b7bf68dUL, 0xf2f2ff0dUL,
- 0x6b6bd6bdUL, 0x6f6fdeb1UL, 0xc5c59154UL, 0x30306050UL, 0x01010203UL, 
- 0x6767cea9UL, 0x2b2b567dUL, 0xfefee719UL, 0xd7d7b562UL, 0xabab4de6UL, 
- 0x7676ec9aUL, 0xcaca8f45UL, 0x82821f9dUL, 0xc9c98940UL, 0x7d7dfa87UL, 
- 0xfafaef15UL, 0x5959b2ebUL, 0x47478ec9UL, 0xf0f0fb0bUL, 0xadad41ecUL, 
- 0xd4d4b367UL, 0xa2a25ffdUL, 0xafaf45eaUL, 0x9c9c23bfUL, 0xa4a453f7UL, 
- 0x7272e496UL, 0xc0c09b5bUL, 0xb7b775c2UL, 0xfdfde11cUL, 0x93933daeUL, 
- 0x26264c6aUL, 0x36366c5aUL, 0x3f3f7e41UL, 0xf7f7f502UL, 0xcccc834fUL, 
- 0x3434685cUL, 0xa5a551f4UL, 0xe5e5d134UL, 0xf1f1f908UL, 0x7171e293UL, 
- 0xd8d8ab73UL, 0x31316253UL, 0x15152a3fUL, 0x0404080cUL, 0xc7c79552UL, 
- 0x23234665UL, 0xc3c39d5eUL, 0x18183028UL, 0x969637a1UL, 0x05050a0fUL, 
- 0x9a9a2fb5UL, 0x07070e09UL, 0x12122436UL, 0x80801b9bUL, 0xe2e2df3dUL, 
- 0xebebcd26UL, 0x27274e69UL, 0xb2b27fcdUL, 0x7575ea9fUL, 0x0909121bUL, 
- 0x83831d9eUL, 0x2c2c5874UL, 0x1a1a342eUL, 0x1b1b362dUL, 0x6e6edcb2UL, 
- 0x5a5ab4eeUL, 0xa0a05bfbUL, 0x5252a4f6UL, 0x3b3b764dUL, 0xd6d6b761UL, 
- 0xb3b37dceUL, 0x2929527bUL, 0xe3e3dd3eUL, 0x2f2f5e71UL, 0x84841397UL, 
- 0x5353a6f5UL, 0xd1d1b968UL, 0x00000000UL, 0xededc12cUL, 0x20204060UL, 
- 0xfcfce31fUL, 0xb1b179c8UL, 0x5b5bb6edUL, 0x6a6ad4beUL, 0xcbcb8d46UL, 
- 0xbebe67d9UL, 0x3939724bUL, 0x4a4a94deUL, 0x4c4c98d4UL, 0x5858b0e8UL, 
- 0xcfcf854aUL, 0xd0d0bb6bUL, 0xefefc52aUL, 0xaaaa4fe5UL, 0xfbfbed16UL, 
- 0x434386c5UL, 0x4d4d9ad7UL, 0x33336655UL, 0x85851194UL, 0x45458acfUL, 
- 0xf9f9e910UL, 0x02020406UL, 0x7f7ffe81UL, 0x5050a0f0UL, 0x3c3c7844UL, 
- 0x9f9f25baUL, 0xa8a84be3UL, 0x5151a2f3UL, 0xa3a35dfeUL, 0x404080c0UL, 
- 0x8f8f058aUL, 0x92923fadUL, 0x9d9d21bcUL, 0x38387048UL, 0xf5f5f104UL, 
- 0xbcbc63dfUL, 0xb6b677c1UL, 0xdadaaf75UL, 0x21214263UL, 0x10102030UL, 
- 0xffffe51aUL, 0xf3f3fd0eUL, 0xd2d2bf6dUL, 0xcdcd814cUL, 0x0c0c1814UL, 
- 0x13132635UL, 0xececc32fUL, 0x5f5fbee1UL, 0x979735a2UL, 0x444488ccUL, 
- 0x17172e39UL, 0xc4c49357UL, 0xa7a755f2UL, 0x7e7efc82UL, 0x3d3d7a47UL, 
- 0x6464c8acUL, 0x5d5dbae7UL, 0x1919322bUL, 0x7373e695UL, 0x6060c0a0UL, 
- 0x81811998UL, 0x4f4f9ed1UL, 0xdcdca37fUL, 0x22224466UL, 0x2a2a547eUL, 
- 0x90903babUL, 0x88880b83UL, 0x46468ccaUL, 0xeeeec729UL, 0xb8b86bd3UL, 
- 0x1414283cUL, 0xdedea779UL, 0x5e5ebce2UL, 0x0b0b161dUL, 0xdbdbad76UL, 
- 0xe0e0db3bUL, 0x32326456UL, 0x3a3a744eUL, 0x0a0a141eUL, 0x494992dbUL, 
- 0x06060c0aUL, 0x2424486cUL, 0x5c5cb8e4UL, 0xc2c29f5dUL, 0xd3d3bd6eUL, 
- 0xacac43efUL, 0x6262c4a6UL, 0x919139a8UL, 0x959531a4UL, 0xe4e4d337UL, 
- 0x7979f28bUL, 0xe7e7d532UL, 0xc8c88b43UL, 0x37376e59UL, 0x6d6ddab7UL, 
- 0x8d8d018cUL, 0xd5d5b164UL, 0x4e4e9cd2UL, 0xa9a949e0UL, 0x6c6cd8b4UL, 
- 0x5656acfaUL, 0xf4f4f307UL, 0xeaeacf25UL, 0x6565caafUL, 0x7a7af48eUL, 
- 0xaeae47e9UL, 0x08081018UL, 0xbaba6fd5UL, 0x7878f088UL, 0x25254a6fUL, 
- 0x2e2e5c72UL, 0x1c1c3824UL, 0xa6a657f1UL, 0xb4b473c7UL, 0xc6c69751UL, 
- 0xe8e8cb23UL, 0xdddda17cUL, 0x7474e89cUL, 0x1f1f3e21UL, 0x4b4b96ddUL, 
- 0xbdbd61dcUL, 0x8b8b0d86UL, 0x8a8a0f85UL, 0x7070e090UL, 0x3e3e7c42UL, 
- 0xb5b571c4UL, 0x6666ccaaUL, 0x484890d8UL, 0x03030605UL, 0xf6f6f701UL, 
- 0x0e0e1c12UL, 0x6161c2a3UL, 0x35356a5fUL, 0x5757aef9UL, 0xb9b969d0UL, 
- 0x86861791UL, 0xc1c19958UL, 0x1d1d3a27UL, 0x9e9e27b9UL, 0xe1e1d938UL, 
- 0xf8f8eb13UL, 0x98982bb3UL, 0x11112233UL, 0x6969d2bbUL, 0xd9d9a970UL, 
- 0x8e8e0789UL, 0x949433a7UL, 0x9b9b2db6UL, 0x1e1e3c22UL, 0x87871592UL, 
- 0xe9e9c920UL, 0xcece8749UL, 0x5555aaffUL, 0x28285078UL, 0xdfdfa57aUL, 
- 0x8c8c038fUL, 0xa1a159f8UL, 0x89890980UL, 0x0d0d1a17UL, 0xbfbf65daUL, 
- 0xe6e6d731UL, 0x424284c6UL, 0x6868d0b8UL, 0x414182c3UL, 0x999929b0UL, 
- 0x2d2d5a77UL, 0x0f0f1e11UL, 0xb0b07bcbUL, 0x5454a8fcUL, 0xbbbb6dd6UL, 
- 0x16162c3aUL}, 
-{0x63c6a563UL, 0x7cf8847cUL, 0x77ee9977UL, 0x7bf68d7bUL, 0xf2ff0df2UL, 
- 0x6bd6bd6bUL, 0x6fdeb16fUL, 0xc59154c5UL, 0x30605030UL, 0x01020301UL, 
- 0x67cea967UL, 0x2b567d2bUL, 0xfee719feUL, 0xd7b562d7UL, 0xab4de6abUL, 
- 0x76ec9a76UL, 0xca8f45caUL, 0x821f9d82UL, 0xc98940c9UL, 0x7dfa877dUL,
- 0xfaef15faUL, 0x59b2eb59UL, 0x478ec947UL, 0xf0fb0bf0UL, 0xad41ecadUL, 
- 0xd4b367d4UL, 0xa25ffda2UL, 0xaf45eaafUL, 0x9c23bf9cUL, 0xa453f7a4UL, 
- 0x72e49672UL, 0xc09b5bc0UL, 0xb775c2b7UL, 0xfde11cfdUL, 0x933dae93UL, 
- 0x264c6a26UL, 0x366c5a36UL, 0x3f7e413fUL, 0xf7f502f7UL, 0xcc834fccUL, 
- 0x34685c34UL, 0xa551f4a5UL, 0xe5d134e5UL, 0xf1f908f1UL, 0x71e29371UL, 
- 0xd8ab73d8UL, 0x31625331UL, 0x152a3f15UL, 0x04080c04UL, 0xc79552c7UL, 
- 0x23466523UL, 0xc39d5ec3UL, 0x18302818UL, 0x9637a196UL, 0x050a0f05UL, 
- 0x9a2fb59aUL, 0x070e0907UL, 0x12243612UL, 0x801b9b80UL, 0xe2df3de2UL, 
- 0xebcd26ebUL, 0x274e6927UL, 0xb27fcdb2UL, 0x75ea9f75UL, 0x09121b09UL, 
- 0x831d9e83UL, 0x2c58742cUL, 0x1a342e1aUL, 0x1b362d1bUL, 0x6edcb26eUL, 
- 0x5ab4ee5aUL, 0xa05bfba0UL, 0x52a4f652UL, 0x3b764d3bUL, 0xd6b761d6UL, 
- 0xb37dceb3UL, 0x29527b29UL, 0xe3dd3ee3UL, 0x2f5e712fUL, 0x84139784UL, 
- 0x53a6f553UL, 0xd1b968d1UL, 0x00000000UL, 0xedc12cedUL, 0x20406020UL, 
- 0xfce31ffcUL, 0xb179c8b1UL, 0x5bb6ed5bUL, 0x6ad4be6aUL, 0xcb8d46cbUL, 
- 0xbe67d9beUL, 0x39724b39UL, 0x4a94de4aUL, 0x4c98d44cUL, 0x58b0e858UL, 
- 0xcf854acfUL, 0xd0bb6bd0UL, 0xefc52aefUL, 0xaa4fe5aaUL, 0xfbed16fbUL, 
- 0x4386c543UL, 0x4d9ad74dUL, 0x33665533UL, 0x85119485UL, 0x458acf45UL, 
- 0xf9e910f9UL, 0x02040602UL, 0x7ffe817fUL, 0x50a0f050UL, 0x3c78443cUL, 
- 0x9f25ba9fUL, 0xa84be3a8UL, 0x51a2f351UL, 0xa35dfea3UL, 0x4080c040UL, 
- 0x8f058a8fUL, 0x923fad92UL, 0x9d21bc9dUL, 0x38704838UL, 0xf5f104f5UL, 
- 0xbc63dfbcUL, 0xb677c1b6UL, 0xdaaf75daUL, 0x21426321UL, 0x10203010UL, 
- 0xffe51affUL, 0xf3fd0ef3UL, 0xd2bf6dd2UL, 0xcd814ccdUL, 0x0c18140cUL, 
- 0x13263513UL, 0xecc32fecUL, 0x5fbee15fUL, 0x9735a297UL, 0x4488cc44UL, 
- 0x172e3917UL, 0xc49357c4UL, 0xa755f2a7UL, 0x7efc827eUL, 0x3d7a473dUL, 
- 0x64c8ac64UL, 0x5dbae75dUL, 0x19322b19UL, 0x73e69573UL, 0x60c0a060UL, 
- 0x81199881UL, 0x4f9ed14fUL, 0xdca37fdcUL, 0x22446622UL, 0x2a547e2aUL, 
- 0x903bab90UL, 0x880b8388UL, 0x468cca46UL, 0xeec729eeUL, 0xb86bd3b8UL, 
- 0x14283c14UL, 0xdea779deUL, 0x5ebce25eUL, 0x0b161d0bUL, 0xdbad76dbUL, 
- 0xe0db3be0UL, 0x32645632UL, 0x3a744e3aUL, 0x0a141e0aUL, 0x4992db49UL, 
- 0x060c0a06UL, 0x24486c24UL, 0x5cb8e45cUL, 0xc29f5dc2UL, 0xd3bd6ed3UL, 
- 0xac43efacUL, 0x62c4a662UL, 0x9139a891UL, 0x9531a495UL, 0xe4d337e4UL, 
- 0x79f28b79UL, 0xe7d532e7UL, 0xc88b43c8UL, 0x376e5937UL, 0x6ddab76dUL, 
- 0x8d018c8dUL, 0xd5b164d5UL, 0x4e9cd24eUL, 0xa949e0a9UL, 0x6cd8b46cUL, 
- 0x56acfa56UL, 0xf4f307f4UL, 0xeacf25eaUL, 0x65caaf65UL, 0x7af48e7aUL, 
- 0xae47e9aeUL, 0x08101808UL, 0xba6fd5baUL, 0x78f08878UL, 0x254a6f25UL, 
- 0x2e5c722eUL, 0x1c38241cUL, 0xa657f1a6UL, 0xb473c7b4UL, 0xc69751c6UL, 
- 0xe8cb23e8UL, 0xdda17cddUL, 0x74e89c74UL, 0x1f3e211fUL, 0x4b96dd4bUL, 
- 0xbd61dcbdUL, 0x8b0d868bUL, 0x8a0f858aUL, 0x70e09070UL, 0x3e7c423eUL, 
- 0xb571c4b5UL, 0x66ccaa66UL, 0x4890d848UL, 0x03060503UL, 0xf6f701f6UL, 
- 0x0e1c120eUL, 0x61c2a361UL, 0x356a5f35UL, 0x57aef957UL, 0xb969d0b9UL, 
- 0x86179186UL, 0xc19958c1UL, 0x1d3a271dUL, 0x9e27b99eUL, 0xe1d938e1UL, 
- 0xf8eb13f8UL, 0x982bb398UL, 0x11223311UL, 0x69d2bb69UL, 0xd9a970d9UL, 
- 0x8e07898eUL, 0x9433a794UL, 0x9b2db69bUL, 0x1e3c221eUL, 0x87159287UL, 
- 0xe9c920e9UL, 0xce8749ceUL, 0x55aaff55UL, 0x28507828UL, 0xdfa57adfUL, 
- 0x8c038f8cUL, 0xa159f8a1UL, 0x89098089UL, 0x0d1a170dUL, 0xbf65dabfUL, 
- 0xe6d731e6UL, 0x4284c642UL, 0x68d0b868UL, 0x4182c341UL, 0x9929b099UL, 
- 0x2d5a772dUL, 0x0f1e110fUL, 0xb07bcbb0UL, 0x54a8fc54UL, 0xbb6dd6bbUL, 
- 0x162c3a16UL}, 
-{0xc6a56363UL, 0xf8847c7cUL, 0xee997777UL, 0xf68d7b7bUL, 0xff0df2f2UL, 
- 0xd6bd6b6bUL, 0xdeb16f6fUL, 0x9154c5c5UL, 0x60503030UL, 0x02030101UL, 
- 0xcea96767UL, 0x567d2b2bUL, 0xe719fefeUL, 0xb562d7d7UL, 0x4de6ababUL, 
- 0xec9a7676UL, 0x8f45cacaUL, 0x1f9d8282UL, 0x8940c9c9UL, 0xfa877d7dUL, 
- 0xef15fafaUL, 0xb2eb5959UL, 0x8ec94747UL, 0xfb0bf0f0UL, 0x41ecadadUL, 
- 0xb367d4d4UL, 0x5ffda2a2UL, 0x45eaafafUL, 0x23bf9c9cUL, 0x53f7a4a4UL, 
- 0xe4967272UL, 0x9b5bc0c0UL, 0x75c2b7b7UL, 0xe11cfdfdUL, 0x3dae9393UL, 
- 0x4c6a2626UL, 0x6c5a3636UL, 0x7e413f3fUL, 0xf502f7f7UL, 0x834fccccUL, 
- 0x685c3434UL, 0x51f4a5a5UL, 0xd134e5e5UL, 0xf908f1f1UL, 0xe2937171UL, 
- 0xab73d8d8UL, 0x62533131UL, 0x2a3f1515UL, 0x080c0404UL, 0x9552c7c7UL, 
- 0x46652323UL, 0x9d5ec3c3UL, 0x30281818UL, 0x37a19696UL, 0x0a0f0505UL, 
- 0x2fb59a9aUL, 0x0e090707UL, 0x24361212UL, 0x1b9b8080UL, 0xdf3de2e2UL, 
- 0xcd26ebebUL, 0x4e692727UL, 0x7fcdb2b2UL, 0xea9f7575UL, 0x121b0909UL, 
- 0x1d9e8383UL, 0x58742c2cUL, 0x342e1a1aUL, 0x362d1b1bUL, 0xdcb26e6eUL, 
- 0xb4ee5a5aUL, 0x5bfba0a0UL, 0xa4f65252UL, 0x764d3b3bUL, 0xb761d6d6UL, 
- 0x7dceb3b3UL, 0x527b2929UL, 0xdd3ee3e3UL, 0x5e712f2fUL, 0x13978484UL, 
- 0xa6f55353UL, 0xb968d1d1UL, 0x00000000UL, 0xc12cededUL, 0x40602020UL, 
- 0xe31ffcfcUL, 0x79c8b1b1UL, 0xb6ed5b5bUL, 0xd4be6a6aUL, 0x8d46cbcbUL, 
- 0x67d9bebeUL, 0x724b3939UL, 0x94de4a4aUL, 0x98d44c4cUL, 0xb0e85858UL, 
- 0x854acfcfUL, 0xbb6bd0d0UL, 0xc52aefefUL, 0x4fe5aaaaUL, 0xed16fbfbUL, 
- 0x86c54343UL, 0x9ad74d4dUL, 0x66553333UL, 0x11948585UL, 0x8acf4545UL, 
- 0xe910f9f9UL, 0x04060202UL, 0xfe817f7fUL, 0xa0f05050UL, 0x78443c3cUL, 
- 0x25ba9f9fUL, 0x4be3a8a8UL, 0xa2f35151UL, 0x5dfea3a3UL, 0x80c04040UL, 
- 0x058a8f8fUL, 0x3fad9292UL, 0x21bc9d9dUL, 0x70483838UL, 0xf104f5f5UL, 
- 0x63dfbcbcUL, 0x77c1b6b6UL, 0xaf75dadaUL, 0x42632121UL, 0x20301010UL, 
- 0xe51affffUL, 0xfd0ef3f3UL, 0xbf6dd2d2UL, 0x814ccdcdUL, 0x18140c0cUL, 
- 0x26351313UL, 0xc32fececUL, 0xbee15f5fUL, 0x35a29797UL, 0x88cc4444UL, 
- 0x2e391717UL, 0x9357c4c4UL, 0x55f2a7a7UL, 0xfc827e7eUL, 0x7a473d3dUL, 
- 0xc8ac6464UL, 0xbae75d5dUL, 0x322b1919UL, 0xe6957373UL, 0xc0a06060UL, 
- 0x19988181UL, 0x9ed14f4fUL, 0xa37fdcdcUL, 0x44662222UL, 0x547e2a2aUL, 
- 0x3bab9090UL, 0x0b838888UL, 0x8cca4646UL, 0xc729eeeeUL, 0x6bd3b8b8UL, 
- 0x283c1414UL, 0xa779dedeUL, 0xbce25e5eUL, 0x161d0b0bUL, 0xad76dbdbUL, 
- 0xdb3be0e0UL, 0x64563232UL, 0x744e3a3aUL, 0x141e0a0aUL, 0x92db4949UL, 
- 0x0c0a0606UL, 0x486c2424UL, 0xb8e45c5cUL, 0x9f5dc2c2UL, 0xbd6ed3d3UL, 
- 0x43efacacUL, 0xc4a66262UL, 0x39a89191UL, 0x31a49595UL, 0xd337e4e4UL, 
- 0xf28b7979UL, 0xd532e7e7UL, 0x8b43c8c8UL, 0x6e593737UL, 0xdab76d6dUL, 
- 0x018c8d8dUL, 0xb164d5d5UL, 0x9cd24e4eUL, 0x49e0a9a9UL, 0xd8b46c6cUL, 
- 0xacfa5656UL, 0xf307f4f4UL, 0xcf25eaeaUL, 0xcaaf6565UL, 0xf48e7a7aUL, 
- 0x47e9aeaeUL, 0x10180808UL, 0x6fd5babaUL, 0xf0887878UL, 0x4a6f2525UL, 
- 0x5c722e2eUL, 0x38241c1cUL, 0x57f1a6a6UL, 0x73c7b4b4UL, 0x9751c6c6UL, 
- 0xcb23e8e8UL, 0xa17cddddUL, 0xe89c7474UL, 0x3e211f1fUL, 0x96dd4b4bUL, 
- 0x61dcbdbdUL, 0x0d868b8bUL, 0x0f858a8aUL, 0xe0907070UL, 0x7c423e3eUL, 
- 0x71c4b5b5UL, 0xccaa6666UL, 0x90d84848UL, 0x06050303UL, 0xf701f6f6UL, 
- 0x1c120e0eUL, 0xc2a36161UL, 0x6a5f3535UL, 0xaef95757UL, 0x69d0b9b9UL, 
- 0x17918686UL, 0x9958c1c1UL, 0x3a271d1dUL, 0x27b99e9eUL, 0xd938e1e1UL, 
- 0xeb13f8f8UL, 0x2bb39898UL, 0x22331111UL, 0xd2bb6969UL, 0xa970d9d9UL, 
- 0x07898e8eUL, 0x33a79494UL, 0x2db69b9bUL, 0x3c221e1eUL, 0x15928787UL, 
- 0xc920e9e9UL, 0x8749ceceUL, 0xaaff5555UL, 0x50782828UL, 0xa57adfdfUL, 
- 0x038f8c8cUL, 0x59f8a1a1UL, 0x09808989UL, 0x1a170d0dUL, 0x65dabfbfUL, 
- 0xd731e6e6UL, 0x84c64242UL, 0xd0b86868UL, 0x82c34141UL, 0x29b09999UL, 
- 0x5a772d2dUL, 0x1e110f0fUL, 0x7bcbb0b0UL, 0xa8fc5454UL, 0x6dd6bbbbUL, 
- 0x2c3a1616UL}
- };
+Td0[x] = Si[x].[0e, 09, 0d, 0b];
+Td1[x] = Si[x].[0b, 0e, 09, 0d];
+Td2[x] = Si[x].[0d, 0b, 0e, 09];
+Td3[x] = Si[x].[09, 0d, 0b, 0e];
+Td4[x] = Si[x].[01, 01, 01, 01];
+*/
 
-static const unsigned long it_tab[4][256] = {
-{0x50a7f451UL, 0x5365417eUL, 0xc3a4171aUL, 0x965e273aUL, 0xcb6bab3bUL, 
- 0xf1459d1fUL, 0xab58faacUL, 0x9303e34bUL, 0x55fa3020UL, 0xf66d76adUL, 
- 0x9176cc88UL, 0x254c02f5UL, 0xfcd7e54fUL, 0xd7cb2ac5UL, 0x80443526UL, 
- 0x8fa362b5UL, 0x495ab1deUL, 0x671bba25UL, 0x980eea45UL, 0xe1c0fe5dUL, 
- 0x02752fc3UL, 0x12f04c81UL, 0xa397468dUL, 0xc6f9d36bUL, 0xe75f8f03UL, 
- 0x959c9215UL, 0xeb7a6dbfUL, 0xda595295UL, 0x2d83bed4UL, 0xd3217458UL, 
- 0x2969e049UL, 0x44c8c98eUL, 0x6a89c275UL, 0x78798ef4UL, 0x6b3e5899UL, 
- 0xdd71b927UL, 0xb64fe1beUL, 0x17ad88f0UL, 0x66ac20c9UL, 0xb43ace7dUL, 
- 0x184adf63UL, 0x82311ae5UL, 0x60335197UL, 0x457f5362UL, 0xe07764b1UL, 
- 0x84ae6bbbUL, 0x1ca081feUL, 0x942b08f9UL, 0x58684870UL, 0x19fd458fUL, 
- 0x876cde94UL, 0xb7f87b52UL, 0x23d373abUL, 0xe2024b72UL, 0x578f1fe3UL, 
- 0x2aab5566UL, 0x0728ebb2UL, 0x03c2b52fUL, 0x9a7bc586UL, 0xa50837d3UL, 
- 0xf2872830UL, 0xb2a5bf23UL, 0xba6a0302UL, 0x5c8216edUL, 0x2b1ccf8aUL, 
- 0x92b479a7UL, 0xf0f207f3UL, 0xa1e2694eUL, 0xcdf4da65UL, 0xd5be0506UL, 
- 0x1f6234d1UL, 0x8afea6c4UL, 0x9d532e34UL, 0xa055f3a2UL, 0x32e18a05UL, 
- 0x75ebf6a4UL, 0x39ec830bUL, 0xaaef6040UL, 0x069f715eUL, 0x51106ebdUL, 
- 0xf98a213eUL, 0x3d06dd96UL, 0xae053eddUL, 0x46bde64dUL, 0xb58d5491UL, 
- 0x055dc471UL, 0x6fd40604UL, 0xff155060UL, 0x24fb9819UL, 0x97e9bdd6UL, 
- 0xcc434089UL, 0x779ed967UL, 0xbd42e8b0UL, 0x888b8907UL, 0x385b19e7UL, 
- 0xdbeec879UL, 0x470a7ca1UL, 0xe90f427cUL, 0xc91e84f8UL, 0x00000000UL, 
- 0x83868009UL, 0x48ed2b32UL, 0xac70111eUL, 0x4e725a6cUL, 0xfbff0efdUL, 
- 0x5638850fUL, 0x1ed5ae3dUL, 0x27392d36UL, 0x64d90f0aUL, 0x21a65c68UL, 
- 0xd1545b9bUL, 0x3a2e3624UL, 0xb1670a0cUL, 0x0fe75793UL, 0xd296eeb4UL, 
- 0x9e919b1bUL, 0x4fc5c080UL, 0xa220dc61UL, 0x694b775aUL, 0x161a121cUL, 
- 0x0aba93e2UL, 0xe52aa0c0UL, 0x43e0223cUL, 0x1d171b12UL, 0x0b0d090eUL, 
- 0xadc78bf2UL, 0xb9a8b62dUL, 0xc8a91e14UL, 0x8519f157UL, 0x4c0775afUL, 
- 0xbbdd99eeUL, 0xfd607fa3UL, 0x9f2601f7UL, 0xbcf5725cUL, 0xc53b6644UL, 
- 0x347efb5bUL, 0x7629438bUL, 0xdcc623cbUL, 0x68fcedb6UL, 0x63f1e4b8UL, 
- 0xcadc31d7UL, 0x10856342UL, 0x40229713UL, 0x2011c684UL, 0x7d244a85UL, 
- 0xf83dbbd2UL, 0x1132f9aeUL, 0x6da129c7UL, 0x4b2f9e1dUL, 0xf330b2dcUL, 
- 0xec52860dUL, 0xd0e3c177UL, 0x6c16b32bUL, 0x99b970a9UL, 0xfa489411UL, 
- 0x2264e947UL, 0xc48cfca8UL, 0x1a3ff0a0UL, 0xd82c7d56UL, 0xef903322UL, 
- 0xc74e4987UL, 0xc1d138d9UL, 0xfea2ca8cUL, 0x360bd498UL, 0xcf81f5a6UL, 
- 0x28de7aa5UL, 0x268eb7daUL, 0xa4bfad3fUL, 0xe49d3a2cUL, 0x0d927850UL, 
- 0x9bcc5f6aUL, 0x62467e54UL, 0xc2138df6UL, 0xe8b8d890UL, 0x5ef7392eUL, 
- 0xf5afc382UL, 0xbe805d9fUL, 0x7c93d069UL, 0xa92dd56fUL, 0xb31225cfUL, 
- 0x3b99acc8UL, 0xa77d1810UL, 0x6e639ce8UL, 0x7bbb3bdbUL, 0x097826cdUL, 
- 0xf418596eUL, 0x01b79aecUL, 0xa89a4f83UL, 0x656e95e6UL, 0x7ee6ffaaUL, 
- 0x08cfbc21UL, 0xe6e815efUL, 0xd99be7baUL, 0xce366f4aUL, 0xd4099feaUL, 
- 0xd67cb029UL, 0xafb2a431UL, 0x31233f2aUL, 0x3094a5c6UL, 0xc066a235UL, 
- 0x37bc4e74UL, 0xa6ca82fcUL, 0xb0d090e0UL, 0x15d8a733UL, 0x4a9804f1UL, 
- 0xf7daec41UL, 0x0e50cd7fUL, 0x2ff69117UL, 0x8dd64d76UL, 0x4db0ef43UL, 
- 0x544daaccUL, 0xdf0496e4UL, 0xe3b5d19eUL, 0x1b886a4cUL, 0xb81f2cc1UL, 
- 0x7f516546UL, 0x04ea5e9dUL, 0x5d358c01UL, 0x737487faUL, 0x2e410bfbUL, 
- 0x5a1d67b3UL, 0x52d2db92UL, 0x335610e9UL, 0x1347d66dUL, 0x8c61d79aUL, 
- 0x7a0ca137UL, 0x8e14f859UL, 0x893c13ebUL, 0xee27a9ceUL, 0x35c961b7UL, 
- 0xede51ce1UL, 0x3cb1477aUL, 0x59dfd29cUL, 0x3f73f255UL, 0x79ce1418UL, 
- 0xbf37c773UL, 0xeacdf753UL, 0x5baafd5fUL, 0x146f3ddfUL, 0x86db4478UL, 
- 0x81f3afcaUL, 0x3ec468b9UL, 0x2c342438UL, 0x5f40a3c2UL, 0x72c31d16UL, 
- 0x0c25e2bcUL, 0x8b493c28UL, 0x41950dffUL, 0x7101a839UL, 0xdeb30c08UL, 
- 0x9ce4b4d8UL, 0x90c15664UL, 0x6184cb7bUL, 0x70b632d5UL, 0x745c6c48UL, 
- 0x4257b8d0UL}, 
-{0xa7f45150UL, 0x65417e53UL, 0xa4171ac3UL, 0x5e273a96UL, 0x6bab3bcbUL, 
- 0x459d1ff1UL, 0x58faacabUL, 0x03e34b93UL, 0xfa302055UL, 0x6d76adf6UL, 
- 0x76cc8891UL, 0x4c02f525UL, 0xd7e54ffcUL, 0xcb2ac5d7UL, 0x44352680UL, 
- 0xa362b58fUL, 0x5ab1de49UL, 0x1bba2567UL, 0x0eea4598UL, 0xc0fe5de1UL, 
- 0x752fc302UL, 0xf04c8112UL, 0x97468da3UL, 0xf9d36bc6UL, 0x5f8f03e7UL, 
- 0x9c921595UL, 0x7a6dbfebUL, 0x595295daUL, 0x83bed42dUL, 0x217458d3UL, 
- 0x69e04929UL, 0xc8c98e44UL, 0x89c2756aUL, 0x798ef478UL, 0x3e58996bUL, 
- 0x71b927ddUL, 0x4fe1beb6UL, 0xad88f017UL, 0xac20c966UL, 0x3ace7db4UL, 
- 0x4adf6318UL, 0x311ae582UL, 0x33519760UL, 0x7f536245UL, 0x7764b1e0UL, 
- 0xae6bbb84UL, 0xa081fe1cUL, 0x2b08f994UL, 0x68487058UL, 0xfd458f19UL, 
- 0x6cde9487UL, 0xf87b52b7UL, 0xd373ab23UL, 0x024b72e2UL, 0x8f1fe357UL, 
- 0xab55662aUL, 0x28ebb207UL, 0xc2b52f03UL, 0x7bc5869aUL, 0x0837d3a5UL, 
- 0x872830f2UL, 0xa5bf23b2UL, 0x6a0302baUL, 0x8216ed5cUL, 0x1ccf8a2bUL, 
- 0xb479a792UL, 0xf207f3f0UL, 0xe2694ea1UL, 0xf4da65cdUL, 0xbe0506d5UL, 
- 0x6234d11fUL, 0xfea6c48aUL, 0x532e349dUL, 0x55f3a2a0UL, 0xe18a0532UL, 
- 0xebf6a475UL, 0xec830b39UL, 0xef6040aaUL, 0x9f715e06UL, 0x106ebd51UL, 
- 0x8a213ef9UL, 0x06dd963dUL, 0x053eddaeUL, 0xbde64d46UL, 0x8d5491b5UL, 
- 0x5dc47105UL, 0xd406046fUL, 0x155060ffUL, 0xfb981924UL, 0xe9bdd697UL, 
- 0x434089ccUL, 0x9ed96777UL, 0x42e8b0bdUL, 0x8b890788UL, 0x5b19e738UL, 
- 0xeec879dbUL, 0x0a7ca147UL, 0x0f427ce9UL, 0x1e84f8c9UL, 0x00000000UL, 
- 0x86800983UL, 0xed2b3248UL, 0x70111eacUL, 0x725a6c4eUL, 0xff0efdfbUL, 
- 0x38850f56UL, 0xd5ae3d1eUL, 0x392d3627UL, 0xd90f0a64UL, 0xa65c6821UL, 
- 0x545b9bd1UL, 0x2e36243aUL, 0x670a0cb1UL, 0xe757930fUL, 0x96eeb4d2UL, 
- 0x919b1b9eUL, 0xc5c0804fUL, 0x20dc61a2UL, 0x4b775a69UL, 0x1a121c16UL, 
- 0xba93e20aUL, 0x2aa0c0e5UL, 0xe0223c43UL, 0x171b121dUL, 0x0d090e0bUL, 
- 0xc78bf2adUL, 0xa8b62db9UL, 0xa91e14c8UL, 0x19f15785UL, 0x0775af4cUL, 
- 0xdd99eebbUL, 0x607fa3fdUL, 0x2601f79fUL, 0xf5725cbcUL, 0x3b6644c5UL, 
- 0x7efb5b34UL, 0x29438b76UL, 0xc623cbdcUL, 0xfcedb668UL, 0xf1e4b863UL, 
- 0xdc31d7caUL, 0x85634210UL, 0x22971340UL, 0x11c68420UL, 0x244a857dUL, 
- 0x3dbbd2f8UL, 0x32f9ae11UL, 0xa129c76dUL, 0x2f9e1d4bUL, 0x30b2dcf3UL, 
- 0x52860decUL, 0xe3c177d0UL, 0x16b32b6cUL, 0xb970a999UL, 0x489411faUL, 
- 0x64e94722UL, 0x8cfca8c4UL, 0x3ff0a01aUL, 0x2c7d56d8UL, 0x903322efUL, 
- 0x4e4987c7UL, 0xd138d9c1UL, 0xa2ca8cfeUL, 0x0bd49836UL, 0x81f5a6cfUL, 
- 0xde7aa528UL, 0x8eb7da26UL, 0xbfad3fa4UL, 0x9d3a2ce4UL, 0x9278500dUL, 
- 0xcc5f6a9bUL, 0x467e5462UL, 0x138df6c2UL, 0xb8d890e8UL, 0xf7392e5eUL, 
- 0xafc382f5UL, 0x805d9fbeUL, 0x93d0697cUL, 0x2dd56fa9UL, 0x1225cfb3UL, 
- 0x99acc83bUL, 0x7d1810a7UL, 0x639ce86eUL, 0xbb3bdb7bUL, 0x7826cd09UL, 
- 0x18596ef4UL, 0xb79aec01UL, 0x9a4f83a8UL, 0x6e95e665UL, 0xe6ffaa7eUL, 
- 0xcfbc2108UL, 0xe815efe6UL, 0x9be7bad9UL, 0x366f4aceUL, 0x099fead4UL, 
- 0x7cb029d6UL, 0xb2a431afUL, 0x233f2a31UL, 0x94a5c630UL, 0x66a235c0UL, 
- 0xbc4e7437UL, 0xca82fca6UL, 0xd090e0b0UL, 0xd8a73315UL, 0x9804f14aUL, 
- 0xdaec41f7UL, 0x50cd7f0eUL, 0xf691172fUL, 0xd64d768dUL, 0xb0ef434dUL, 
- 0x4daacc54UL, 0x0496e4dfUL, 0xb5d19ee3UL, 0x886a4c1bUL, 0x1f2cc1b8UL, 
- 0x5165467fUL, 0xea5e9d04UL, 0x358c015dUL, 0x7487fa73UL, 0x410bfb2eUL, 
- 0x1d67b35aUL, 0xd2db9252UL, 0x5610e933UL, 0x47d66d13UL, 0x61d79a8cUL, 
- 0x0ca1377aUL, 0x14f8598eUL, 0x3c13eb89UL, 0x27a9ceeeUL, 0xc961b735UL, 
- 0xe51ce1edUL, 0xb1477a3cUL, 0xdfd29c59UL, 0x73f2553fUL, 0xce141879UL, 
- 0x37c773bfUL, 0xcdf753eaUL, 0xaafd5f5bUL, 0x6f3ddf14UL, 0xdb447886UL, 
- 0xf3afca81UL, 0xc468b93eUL, 0x3424382cUL, 0x40a3c25fUL, 0xc31d1672UL, 
- 0x25e2bc0cUL, 0x493c288bUL, 0x950dff41UL, 0x01a83971UL, 0xb30c08deUL, 
- 0xe4b4d89cUL, 0xc1566490UL, 0x84cb7b61UL, 0xb632d570UL, 0x5c6c4874UL, 
- 0x57b8d042UL}, 
-{0xf45150a7UL, 0x417e5365UL, 0x171ac3a4UL, 0x273a965eUL, 0xab3bcb6bUL, 
- 0x9d1ff145UL, 0xfaacab58UL, 0xe34b9303UL, 0x302055faUL, 0x76adf66dUL, 
- 0xcc889176UL, 0x02f5254cUL, 0xe54ffcd7UL, 0x2ac5d7cbUL, 0x35268044UL, 
- 0x62b58fa3UL, 0xb1de495aUL, 0xba25671bUL, 0xea45980eUL, 0xfe5de1c0UL, 
- 0x2fc30275UL, 0x4c8112f0UL, 0x468da397UL, 0xd36bc6f9UL, 0x8f03e75fUL, 
- 0x9215959cUL, 0x6dbfeb7aUL, 0x5295da59UL, 0xbed42d83UL, 0x7458d321UL, 
- 0xe0492969UL, 0xc98e44c8UL, 0xc2756a89UL, 0x8ef47879UL, 0x58996b3eUL, 
- 0xb927dd71UL, 0xe1beb64fUL, 0x88f017adUL, 0x20c966acUL, 0xce7db43aUL, 
- 0xdf63184aUL, 0x1ae58231UL, 0x51976033UL, 0x5362457fUL, 0x64b1e077UL, 
- 0x6bbb84aeUL, 0x81fe1ca0UL, 0x08f9942bUL, 0x48705868UL, 0x458f19fdUL, 
- 0xde94876cUL, 0x7b52b7f8UL, 0x73ab23d3UL, 0x4b72e202UL, 0x1fe3578fUL, 
- 0x55662aabUL, 0xebb20728UL, 0xb52f03c2UL, 0xc5869a7bUL, 0x37d3a508UL, 
- 0x2830f287UL, 0xbf23b2a5UL, 0x0302ba6aUL, 0x16ed5c82UL, 0xcf8a2b1cUL, 
- 0x79a792b4UL, 0x07f3f0f2UL, 0x694ea1e2UL, 0xda65cdf4UL, 0x0506d5beUL, 
- 0x34d11f62UL, 0xa6c48afeUL, 0x2e349d53UL, 0xf3a2a055UL, 0x8a0532e1UL, 
- 0xf6a475ebUL, 0x830b39ecUL, 0x6040aaefUL, 0x715e069fUL, 0x6ebd5110UL, 
- 0x213ef98aUL, 0xdd963d06UL, 0x3eddae05UL, 0xe64d46bdUL, 0x5491b58dUL, 
- 0xc471055dUL, 0x06046fd4UL, 0x5060ff15UL, 0x981924fbUL, 0xbdd697e9UL, 
- 0x4089cc43UL, 0xd967779eUL, 0xe8b0bd42UL, 0x8907888bUL, 0x19e7385bUL, 
- 0xc879dbeeUL, 0x7ca1470aUL, 0x427ce90fUL, 0x84f8c91eUL, 0x00000000UL, 
- 0x80098386UL, 0x2b3248edUL, 0x111eac70UL, 0x5a6c4e72UL, 0x0efdfbffUL, 
- 0x850f5638UL, 0xae3d1ed5UL, 0x2d362739UL, 0x0f0a64d9UL, 0x5c6821a6UL, 
- 0x5b9bd154UL, 0x36243a2eUL, 0x0a0cb167UL, 0x57930fe7UL, 0xeeb4d296UL, 
- 0x9b1b9e91UL, 0xc0804fc5UL, 0xdc61a220UL, 0x775a694bUL, 0x121c161aUL, 
- 0x93e20abaUL, 0xa0c0e52aUL, 0x223c43e0UL, 0x1b121d17UL, 0x090e0b0dUL, 
- 0x8bf2adc7UL, 0xb62db9a8UL, 0x1e14c8a9UL, 0xf1578519UL, 0x75af4c07UL, 
- 0x99eebbddUL, 0x7fa3fd60UL, 0x01f79f26UL, 0x725cbcf5UL, 0x6644c53bUL, 
- 0xfb5b347eUL, 0x438b7629UL, 0x23cbdcc6UL, 0xedb668fcUL, 0xe4b863f1UL, 
- 0x31d7cadcUL, 0x63421085UL, 0x97134022UL, 0xc6842011UL, 0x4a857d24UL, 
- 0xbbd2f83dUL, 0xf9ae1132UL, 0x29c76da1UL, 0x9e1d4b2fUL, 0xb2dcf330UL, 
- 0x860dec52UL, 0xc177d0e3UL, 0xb32b6c16UL, 0x70a999b9UL, 0x9411fa48UL, 
- 0xe9472264UL, 0xfca8c48cUL, 0xf0a01a3fUL, 0x7d56d82cUL, 0x3322ef90UL, 
- 0x4987c74eUL, 0x38d9c1d1UL, 0xca8cfea2UL, 0xd498360bUL, 0xf5a6cf81UL, 
- 0x7aa528deUL, 0xb7da268eUL, 0xad3fa4bfUL, 0x3a2ce49dUL, 0x78500d92UL, 
- 0x5f6a9bccUL, 0x7e546246UL, 0x8df6c213UL, 0xd890e8b8UL, 0x392e5ef7UL, 
- 0xc382f5afUL, 0x5d9fbe80UL, 0xd0697c93UL, 0xd56fa92dUL, 0x25cfb312UL, 
- 0xacc83b99UL, 0x1810a77dUL, 0x9ce86e63UL, 0x3bdb7bbbUL, 0x26cd0978UL, 
- 0x596ef418UL, 0x9aec01b7UL, 0x4f83a89aUL, 0x95e6656eUL, 0xffaa7ee6UL, 
- 0xbc2108cfUL, 0x15efe6e8UL, 0xe7bad99bUL, 0x6f4ace36UL, 0x9fead409UL, 
- 0xb029d67cUL, 0xa431afb2UL, 0x3f2a3123UL, 0xa5c63094UL, 0xa235c066UL, 
- 0x4e7437bcUL, 0x82fca6caUL, 0x90e0b0d0UL, 0xa73315d8UL, 0x04f14a98UL, 
- 0xec41f7daUL, 0xcd7f0e50UL, 0x91172ff6UL, 0x4d768dd6UL, 0xef434db0UL, 
- 0xaacc544dUL, 0x96e4df04UL, 0xd19ee3b5UL, 0x6a4c1b88UL, 0x2cc1b81fUL, 
- 0x65467f51UL, 0x5e9d04eaUL, 0x8c015d35UL, 0x87fa7374UL, 0x0bfb2e41UL, 
- 0x67b35a1dUL, 0xdb9252d2UL, 0x10e93356UL, 0xd66d1347UL, 0xd79a8c61UL, 
- 0xa1377a0cUL, 0xf8598e14UL, 0x13eb893cUL, 0xa9ceee27UL, 0x61b735c9UL, 
- 0x1ce1ede5UL, 0x477a3cb1UL, 0xd29c59dfUL, 0xf2553f73UL, 0x141879ceUL, 
- 0xc773bf37UL, 0xf753eacdUL, 0xfd5f5baaUL, 0x3ddf146fUL, 0x447886dbUL, 
- 0xafca81f3UL, 0x68b93ec4UL, 0x24382c34UL, 0xa3c25f40UL, 0x1d1672c3UL, 
- 0xe2bc0c25UL, 0x3c288b49UL, 0x0dff4195UL, 0xa8397101UL, 0x0c08deb3UL, 
- 0xb4d89ce4UL, 0x566490c1UL, 0xcb7b6184UL, 0x32d570b6UL, 0x6c48745cUL, 
- 0xb8d04257UL}, 
-{0x5150a7f4UL, 0x7e536541UL, 0x1ac3a417UL, 0x3a965e27UL, 0x3bcb6babUL, 
- 0x1ff1459dUL, 0xacab58faUL, 0x4b9303e3UL, 0x2055fa30UL, 0xadf66d76UL, 
- 0x889176ccUL, 0xf5254c02UL, 0x4ffcd7e5UL, 0xc5d7cb2aUL, 0x26804435UL, 
- 0xb58fa362UL, 0xde495ab1UL, 0x25671bbaUL, 0x45980eeaUL, 0x5de1c0feUL, 
- 0xc302752fUL, 0x8112f04cUL, 0x8da39746UL, 0x6bc6f9d3UL, 0x03e75f8fUL, 
- 0x15959c92UL, 0xbfeb7a6dUL, 0x95da5952UL, 0xd42d83beUL, 0x58d32174UL, 
- 0x492969e0UL, 0x8e44c8c9UL, 0x756a89c2UL, 0xf478798eUL, 0x996b3e58UL, 
- 0x27dd71b9UL, 0xbeb64fe1UL, 0xf017ad88UL, 0xc966ac20UL, 0x7db43aceUL, 
- 0x63184adfUL, 0xe582311aUL, 0x97603351UL, 0x62457f53UL, 0xb1e07764UL, 
- 0xbb84ae6bUL, 0xfe1ca081UL, 0xf9942b08UL, 0x70586848UL, 0x8f19fd45UL, 
- 0x94876cdeUL, 0x52b7f87bUL, 0xab23d373UL, 0x72e2024bUL, 0xe3578f1fUL, 
- 0x662aab55UL, 0xb20728ebUL, 0x2f03c2b5UL, 0x869a7bc5UL, 0xd3a50837UL, 
- 0x30f28728UL, 0x23b2a5bfUL, 0x02ba6a03UL, 0xed5c8216UL, 0x8a2b1ccfUL, 
- 0xa792b479UL, 0xf3f0f207UL, 0x4ea1e269UL, 0x65cdf4daUL, 0x06d5be05UL, 
- 0xd11f6234UL, 0xc48afea6UL, 0x349d532eUL, 0xa2a055f3UL, 0x0532e18aUL, 
- 0xa475ebf6UL, 0x0b39ec83UL, 0x40aaef60UL, 0x5e069f71UL, 0xbd51106eUL, 
- 0x3ef98a21UL, 0x963d06ddUL, 0xddae053eUL, 0x4d46bde6UL, 0x91b58d54UL, 
- 0x71055dc4UL, 0x046fd406UL, 0x60ff1550UL, 0x1924fb98UL, 0xd697e9bdUL, 
- 0x89cc4340UL, 0x67779ed9UL, 0xb0bd42e8UL, 0x07888b89UL, 0xe7385b19UL, 
- 0x79dbeec8UL, 0xa1470a7cUL, 0x7ce90f42UL, 0xf8c91e84UL, 0x00000000UL, 
- 0x09838680UL, 0x3248ed2bUL, 0x1eac7011UL, 0x6c4e725aUL, 0xfdfbff0eUL, 
- 0x0f563885UL, 0x3d1ed5aeUL, 0x3627392dUL, 0x0a64d90fUL, 0x6821a65cUL, 
- 0x9bd1545bUL, 0x243a2e36UL, 0x0cb1670aUL, 0x930fe757UL, 0xb4d296eeUL, 
- 0x1b9e919bUL, 0x804fc5c0UL, 0x61a220dcUL, 0x5a694b77UL, 0x1c161a12UL, 
- 0xe20aba93UL, 0xc0e52aa0UL, 0x3c43e022UL, 0x121d171bUL, 0x0e0b0d09UL, 
- 0xf2adc78bUL, 0x2db9a8b6UL, 0x14c8a91eUL, 0x578519f1UL, 0xaf4c0775UL, 
- 0xeebbdd99UL, 0xa3fd607fUL, 0xf79f2601UL, 0x5cbcf572UL, 0x44c53b66UL, 
- 0x5b347efbUL, 0x8b762943UL, 0xcbdcc623UL, 0xb668fcedUL, 0xb863f1e4UL, 
- 0xd7cadc31UL, 0x42108563UL, 0x13402297UL, 0x842011c6UL, 0x857d244aUL, 
- 0xd2f83dbbUL, 0xae1132f9UL, 0xc76da129UL, 0x1d4b2f9eUL, 0xdcf330b2UL, 
- 0x0dec5286UL, 0x77d0e3c1UL, 0x2b6c16b3UL, 0xa999b970UL, 0x11fa4894UL, 
- 0x472264e9UL, 0xa8c48cfcUL, 0xa01a3ff0UL, 0x56d82c7dUL, 0x22ef9033UL, 
- 0x87c74e49UL, 0xd9c1d138UL, 0x8cfea2caUL, 0x98360bd4UL, 0xa6cf81f5UL, 
- 0xa528de7aUL, 0xda268eb7UL, 0x3fa4bfadUL, 0x2ce49d3aUL, 0x500d9278UL, 
- 0x6a9bcc5fUL, 0x5462467eUL, 0xf6c2138dUL, 0x90e8b8d8UL, 0x2e5ef739UL, 
- 0x82f5afc3UL, 0x9fbe805dUL, 0x697c93d0UL, 0x6fa92dd5UL, 0xcfb31225UL, 
- 0xc83b99acUL, 0x10a77d18UL, 0xe86e639cUL, 0xdb7bbb3bUL, 0xcd097826UL, 
- 0x6ef41859UL, 0xec01b79aUL, 0x83a89a4fUL, 0xe6656e95UL, 0xaa7ee6ffUL, 
- 0x2108cfbcUL, 0xefe6e815UL, 0xbad99be7UL, 0x4ace366fUL, 0xead4099fUL, 
- 0x29d67cb0UL, 0x31afb2a4UL, 0x2a31233fUL, 0xc63094a5UL, 0x35c066a2UL, 
- 0x7437bc4eUL, 0xfca6ca82UL, 0xe0b0d090UL, 0x3315d8a7UL, 0xf14a9804UL, 
- 0x41f7daecUL, 0x7f0e50cdUL, 0x172ff691UL, 0x768dd64dUL, 0x434db0efUL, 
- 0xcc544daaUL, 0xe4df0496UL, 0x9ee3b5d1UL, 0x4c1b886aUL, 0xc1b81f2cUL, 
- 0x467f5165UL, 0x9d04ea5eUL, 0x015d358cUL, 0xfa737487UL, 0xfb2e410bUL, 
- 0xb35a1d67UL, 0x9252d2dbUL, 0xe9335610UL, 0x6d1347d6UL, 0x9a8c61d7UL, 
- 0x377a0ca1UL, 0x598e14f8UL, 0xeb893c13UL, 0xceee27a9UL, 0xb735c961UL, 
- 0xe1ede51cUL, 0x7a3cb147UL, 0x9c59dfd2UL, 0x553f73f2UL, 0x1879ce14UL, 
- 0x73bf37c7UL, 0x53eacdf7UL, 0x5f5baafdUL, 0xdf146f3dUL, 0x7886db44UL, 
- 0xca81f3afUL, 0xb93ec468UL, 0x382c3424UL, 0xc25f40a3UL, 0x1672c31dUL, 
- 0xbc0c25e2UL, 0x288b493cUL, 0xff41950dUL, 0x397101a8UL, 0x08deb30cUL, 
- 0xd89ce4b4UL, 0x6490c156UL, 0x7b6184cbUL, 0xd570b632UL, 0x48745c6cUL, 
- 0xd04257b8UL}
- };
+static const unsigned long Te0[256] = {
+    0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU,
+    0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U,
+    0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU,
+    0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU,
+    0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U,
+    0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU,
+    0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU,
+    0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU,
+    0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU,
+    0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU,
+    0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U,
+    0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU,
+    0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU,
+    0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U,
+    0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU,
+    0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU,
+    0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU,
+    0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU,
+    0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU,
+    0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U,
+    0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU,
+    0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU,
+    0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU,
+    0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU,
+    0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U,
+    0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U,
+    0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U,
+    0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U,
+    0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU,
+    0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U,
+    0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U,
+    0x20101030U, 0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU,
+    0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU,
+    0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U,
+    0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U,
+    0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U,
+    0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU,
+    0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U,
+    0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU,
+    0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U,
+    0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU,
+    0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U,
+    0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 0xc46262a6U,
+    0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU,
+    0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U,
+    0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U,
+    0xd86c6cb4U, 0xac5656faU, 0xf3f4f407U, 0xcfeaea25U,
+    0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U,
+    0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U,
+    0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U,
+    0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U,
+    0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U,
+    0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU,
+    0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U,
+    0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U,
+    0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U,
+    0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U,
+    0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U,
+    0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U,
+    0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU,
+    0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U,
+    0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U,
+    0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U,
+    0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU,
+};
+static const unsigned long Te1[256] = {
+    0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU,
+    0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U,
+    0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU,
+    0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U,
+    0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU,
+    0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U,
+    0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU,
+    0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U,
+    0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U,
+    0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU,
+    0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U,
+    0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U,
+    0x0c080404U, 0x5295c7c7U, 0x65462323U, 0x5e9dc3c3U,
+    0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU,
+    0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U,
+    0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U,
+    0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU,
+    0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U,
+    0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U, 0xce7db3b3U,
+    0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U,
+    0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU,
+    0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU,
+    0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U,
+    0xde944a4aU, 0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU,
+    0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU,
+    0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U,
+    0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU,
+    0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U,
+    0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU,
+    0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U,
+    0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U,
+    0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U,
+    0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU,
+    0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U,
+    0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU,
+    0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U,
+    0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU,
+    0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U,
+    0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U,
+    0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU,
+    0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU,
+    0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU,
+    0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U,
+    0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U,
+    0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU,
+    0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U,
+    0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU,
+    0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U,
+    0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU,
+    0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U,
+    0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU,
+    0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU,
+    0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U,
+    0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU,
+    0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U,
+    0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU,
+    0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U,
+    0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U,
+    0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U,
+    0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU,
+    0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU,
+    0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U,
+    0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU,
+    0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U,
+};
+static const unsigned long Te2[256] = {
+    0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU,
+    0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U,
+    0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU,
+    0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U,
+    0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU,
+    0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U,
+    0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU,
+    0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U,
+    0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U,
+    0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU,
+    0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U,
+    0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U,
+    0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U,
+    0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU,
+    0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U,
+    0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U,
+    0x091b1209U, 0x839e1d83U, 0x2c74582cU, 0x1a2e341aU,
+    0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U,
+    0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U,
+    0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U,
+    0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU,
+    0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU,
+    0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U,
+    0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU,
+    0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU,
+    0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U,
+    0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 0x7f81fe7fU,
+    0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 0xa8e34ba8U,
+    0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU,
+    0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U,
+    0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U,
+    0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U,
+    0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU,
+    0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U,
+    0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU,
+    0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U,
+    0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU,
+    0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U,
+    0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U,
+    0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU,
+    0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU,
+    0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU,
+    0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U,
+    0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U,
+    0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU,
+    0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U,
+    0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU,
+    0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U,
+    0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU,
+    0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U,
+    0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU,
+    0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU,
+    0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U,
+    0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU,
+    0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U,
+    0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU,
+    0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U,
+    0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U,
+    0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U,
+    0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU,
+    0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU,
+    0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 0x68b8d068U,
+    0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU,
+    0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U,
+};
+static const unsigned long Te3[256] = {
 
-static const unsigned long fl_tab[4][256] = {
-{0x00000063UL, 0x0000007cUL, 0x00000077UL, 0x0000007bUL, 0x000000f2UL, 
- 0x0000006bUL, 0x0000006fUL, 0x000000c5UL, 0x00000030UL, 0x00000001UL, 
- 0x00000067UL, 0x0000002bUL, 0x000000feUL, 0x000000d7UL, 0x000000abUL, 
- 0x00000076UL, 0x000000caUL, 0x00000082UL, 0x000000c9UL, 0x0000007dUL, 
- 0x000000faUL, 0x00000059UL, 0x00000047UL, 0x000000f0UL, 0x000000adUL, 
- 0x000000d4UL, 0x000000a2UL, 0x000000afUL, 0x0000009cUL, 0x000000a4UL, 
- 0x00000072UL, 0x000000c0UL, 0x000000b7UL, 0x000000fdUL, 0x00000093UL, 
- 0x00000026UL, 0x00000036UL, 0x0000003fUL, 0x000000f7UL, 0x000000ccUL, 
- 0x00000034UL, 0x000000a5UL, 0x000000e5UL, 0x000000f1UL, 0x00000071UL, 
- 0x000000d8UL, 0x00000031UL, 0x00000015UL, 0x00000004UL, 0x000000c7UL, 
- 0x00000023UL, 0x000000c3UL, 0x00000018UL, 0x00000096UL, 0x00000005UL, 
- 0x0000009aUL, 0x00000007UL, 0x00000012UL, 0x00000080UL, 0x000000e2UL, 
- 0x000000ebUL, 0x00000027UL, 0x000000b2UL, 0x00000075UL, 0x00000009UL, 
- 0x00000083UL, 0x0000002cUL, 0x0000001aUL, 0x0000001bUL, 0x0000006eUL, 
- 0x0000005aUL, 0x000000a0UL, 0x00000052UL, 0x0000003bUL, 0x000000d6UL, 
- 0x000000b3UL, 0x00000029UL, 0x000000e3UL, 0x0000002fUL, 0x00000084UL, 
- 0x00000053UL, 0x000000d1UL, 0x00000000UL, 0x000000edUL, 0x00000020UL, 
- 0x000000fcUL, 0x000000b1UL, 0x0000005bUL, 0x0000006aUL, 0x000000cbUL, 
- 0x000000beUL, 0x00000039UL, 0x0000004aUL, 0x0000004cUL, 0x00000058UL, 
- 0x000000cfUL, 0x000000d0UL, 0x000000efUL, 0x000000aaUL, 0x000000fbUL, 
- 0x00000043UL, 0x0000004dUL, 0x00000033UL, 0x00000085UL, 0x00000045UL, 
- 0x000000f9UL, 0x00000002UL, 0x0000007fUL, 0x00000050UL, 0x0000003cUL, 
- 0x0000009fUL, 0x000000a8UL, 0x00000051UL, 0x000000a3UL, 0x00000040UL, 
- 0x0000008fUL, 0x00000092UL, 0x0000009dUL, 0x00000038UL, 0x000000f5UL, 
- 0x000000bcUL, 0x000000b6UL, 0x000000daUL, 0x00000021UL, 0x00000010UL, 
- 0x000000ffUL, 0x000000f3UL, 0x000000d2UL, 0x000000cdUL, 0x0000000cUL, 
- 0x00000013UL, 0x000000ecUL, 0x0000005fUL, 0x00000097UL, 0x00000044UL, 
- 0x00000017UL, 0x000000c4UL, 0x000000a7UL, 0x0000007eUL, 0x0000003dUL, 
- 0x00000064UL, 0x0000005dUL, 0x00000019UL, 0x00000073UL, 0x00000060UL, 
- 0x00000081UL, 0x0000004fUL, 0x000000dcUL, 0x00000022UL, 0x0000002aUL, 
- 0x00000090UL, 0x00000088UL, 0x00000046UL, 0x000000eeUL, 0x000000b8UL, 
- 0x00000014UL, 0x000000deUL, 0x0000005eUL, 0x0000000bUL, 0x000000dbUL, 
- 0x000000e0UL, 0x00000032UL, 0x0000003aUL, 0x0000000aUL, 0x00000049UL, 
- 0x00000006UL, 0x00000024UL, 0x0000005cUL, 0x000000c2UL, 0x000000d3UL, 
- 0x000000acUL, 0x00000062UL, 0x00000091UL, 0x00000095UL, 0x000000e4UL, 
- 0x00000079UL, 0x000000e7UL, 0x000000c8UL, 0x00000037UL, 0x0000006dUL, 
- 0x0000008dUL, 0x000000d5UL, 0x0000004eUL, 0x000000a9UL, 0x0000006cUL, 
- 0x00000056UL, 0x000000f4UL, 0x000000eaUL, 0x00000065UL, 0x0000007aUL, 
- 0x000000aeUL, 0x00000008UL, 0x000000baUL, 0x00000078UL, 0x00000025UL, 
- 0x0000002eUL, 0x0000001cUL, 0x000000a6UL, 0x000000b4UL, 0x000000c6UL, 
- 0x000000e8UL, 0x000000ddUL, 0x00000074UL, 0x0000001fUL, 0x0000004bUL, 
- 0x000000bdUL, 0x0000008bUL, 0x0000008aUL, 0x00000070UL, 0x0000003eUL, 
- 0x000000b5UL, 0x00000066UL, 0x00000048UL, 0x00000003UL, 0x000000f6UL, 
- 0x0000000eUL, 0x00000061UL, 0x00000035UL, 0x00000057UL, 0x000000b9UL, 
- 0x00000086UL, 0x000000c1UL, 0x0000001dUL, 0x0000009eUL, 0x000000e1UL, 
- 0x000000f8UL, 0x00000098UL, 0x00000011UL, 0x00000069UL, 0x000000d9UL, 
- 0x0000008eUL, 0x00000094UL, 0x0000009bUL, 0x0000001eUL, 0x00000087UL, 
- 0x000000e9UL, 0x000000ceUL, 0x00000055UL, 0x00000028UL, 0x000000dfUL, 
- 0x0000008cUL, 0x000000a1UL, 0x00000089UL, 0x0000000dUL, 0x000000bfUL, 
- 0x000000e6UL, 0x00000042UL, 0x00000068UL, 0x00000041UL, 0x00000099UL, 
- 0x0000002dUL, 0x0000000fUL, 0x000000b0UL, 0x00000054UL, 0x000000bbUL, 
- 0x00000016UL}, 
-{0x00006300UL, 0x00007c00UL, 0x00007700UL, 0x00007b00UL, 0x0000f200UL, 
- 0x00006b00UL, 0x00006f00UL, 0x0000c500UL, 0x00003000UL, 0x00000100UL, 
- 0x00006700UL, 0x00002b00UL, 0x0000fe00UL, 0x0000d700UL, 0x0000ab00UL, 
- 0x00007600UL, 0x0000ca00UL, 0x00008200UL, 0x0000c900UL, 0x00007d00UL, 
- 0x0000fa00UL, 0x00005900UL, 0x00004700UL, 0x0000f000UL, 0x0000ad00UL, 
- 0x0000d400UL, 0x0000a200UL, 0x0000af00UL, 0x00009c00UL, 0x0000a400UL, 
- 0x00007200UL, 0x0000c000UL, 0x0000b700UL, 0x0000fd00UL, 0x00009300UL, 
- 0x00002600UL, 0x00003600UL, 0x00003f00UL, 0x0000f700UL, 0x0000cc00UL, 
- 0x00003400UL, 0x0000a500UL, 0x0000e500UL, 0x0000f100UL, 0x00007100UL, 
- 0x0000d800UL, 0x00003100UL, 0x00001500UL, 0x00000400UL, 0x0000c700UL, 
- 0x00002300UL, 0x0000c300UL, 0x00001800UL, 0x00009600UL, 0x00000500UL, 
- 0x00009a00UL, 0x00000700UL, 0x00001200UL, 0x00008000UL, 0x0000e200UL, 
- 0x0000eb00UL, 0x00002700UL, 0x0000b200UL, 0x00007500UL, 0x00000900UL, 
- 0x00008300UL, 0x00002c00UL, 0x00001a00UL, 0x00001b00UL, 0x00006e00UL, 
- 0x00005a00UL, 0x0000a000UL, 0x00005200UL, 0x00003b00UL, 0x0000d600UL, 
- 0x0000b300UL, 0x00002900UL, 0x0000e300UL, 0x00002f00UL, 0x00008400UL, 
- 0x00005300UL, 0x0000d100UL, 0x00000000UL, 0x0000ed00UL, 0x00002000UL, 
- 0x0000fc00UL, 0x0000b100UL, 0x00005b00UL, 0x00006a00UL, 0x0000cb00UL, 
- 0x0000be00UL, 0x00003900UL, 0x00004a00UL, 0x00004c00UL, 0x00005800UL, 
- 0x0000cf00UL, 0x0000d000UL, 0x0000ef00UL, 0x0000aa00UL, 0x0000fb00UL, 
- 0x00004300UL, 0x00004d00UL, 0x00003300UL, 0x00008500UL, 0x00004500UL, 
- 0x0000f900UL, 0x00000200UL, 0x00007f00UL, 0x00005000UL, 0x00003c00UL, 
- 0x00009f00UL, 0x0000a800UL, 0x00005100UL, 0x0000a300UL, 0x00004000UL, 
- 0x00008f00UL, 0x00009200UL, 0x00009d00UL, 0x00003800UL, 0x0000f500UL, 
- 0x0000bc00UL, 0x0000b600UL, 0x0000da00UL, 0x00002100UL, 0x00001000UL, 
- 0x0000ff00UL, 0x0000f300UL, 0x0000d200UL, 0x0000cd00UL, 0x00000c00UL, 
- 0x00001300UL, 0x0000ec00UL, 0x00005f00UL, 0x00009700UL, 0x00004400UL, 
- 0x00001700UL, 0x0000c400UL, 0x0000a700UL, 0x00007e00UL, 0x00003d00UL, 
- 0x00006400UL, 0x00005d00UL, 0x00001900UL, 0x00007300UL, 0x00006000UL, 
- 0x00008100UL, 0x00004f00UL, 0x0000dc00UL, 0x00002200UL, 0x00002a00UL, 
- 0x00009000UL, 0x00008800UL, 0x00004600UL, 0x0000ee00UL, 0x0000b800UL, 
- 0x00001400UL, 0x0000de00UL, 0x00005e00UL, 0x00000b00UL, 0x0000db00UL, 
- 0x0000e000UL, 0x00003200UL, 0x00003a00UL, 0x00000a00UL, 0x00004900UL, 
- 0x00000600UL, 0x00002400UL, 0x00005c00UL, 0x0000c200UL, 0x0000d300UL, 
- 0x0000ac00UL, 0x00006200UL, 0x00009100UL, 0x00009500UL, 0x0000e400UL, 
- 0x00007900UL, 0x0000e700UL, 0x0000c800UL, 0x00003700UL, 0x00006d00UL, 
- 0x00008d00UL, 0x0000d500UL, 0x00004e00UL, 0x0000a900UL, 0x00006c00UL, 
- 0x00005600UL, 0x0000f400UL, 0x0000ea00UL, 0x00006500UL, 0x00007a00UL, 
- 0x0000ae00UL, 0x00000800UL, 0x0000ba00UL, 0x00007800UL, 0x00002500UL, 
- 0x00002e00UL, 0x00001c00UL, 0x0000a600UL, 0x0000b400UL, 0x0000c600UL, 
- 0x0000e800UL, 0x0000dd00UL, 0x00007400UL, 0x00001f00UL, 0x00004b00UL, 
- 0x0000bd00UL, 0x00008b00UL, 0x00008a00UL, 0x00007000UL, 0x00003e00UL, 
- 0x0000b500UL, 0x00006600UL, 0x00004800UL, 0x00000300UL, 0x0000f600UL, 
- 0x00000e00UL, 0x00006100UL, 0x00003500UL, 0x00005700UL, 0x0000b900UL, 
- 0x00008600UL, 0x0000c100UL, 0x00001d00UL, 0x00009e00UL, 0x0000e100UL, 
- 0x0000f800UL, 0x00009800UL, 0x00001100UL, 0x00006900UL, 0x0000d900UL, 
- 0x00008e00UL, 0x00009400UL, 0x00009b00UL, 0x00001e00UL, 0x00008700UL, 
- 0x0000e900UL, 0x0000ce00UL, 0x00005500UL, 0x00002800UL, 0x0000df00UL, 
- 0x00008c00UL, 0x0000a100UL, 0x00008900UL, 0x00000d00UL, 0x0000bf00UL, 
- 0x0000e600UL, 0x00004200UL, 0x00006800UL, 0x00004100UL, 0x00009900UL, 
- 0x00002d00UL, 0x00000f00UL, 0x0000b000UL, 0x00005400UL, 0x0000bb00UL, 
- 0x00001600UL}, 
-{0x00630000UL, 0x007c0000UL, 0x00770000UL, 0x007b0000UL, 0x00f20000UL, 
- 0x006b0000UL, 0x006f0000UL, 0x00c50000UL, 0x00300000UL, 0x00010000UL, 
- 0x00670000UL, 0x002b0000UL, 0x00fe0000UL, 0x00d70000UL, 0x00ab0000UL, 
- 0x00760000UL, 0x00ca0000UL, 0x00820000UL, 0x00c90000UL, 0x007d0000UL, 
- 0x00fa0000UL, 0x00590000UL, 0x00470000UL, 0x00f00000UL, 0x00ad0000UL, 
- 0x00d40000UL, 0x00a20000UL, 0x00af0000UL, 0x009c0000UL, 0x00a40000UL, 
- 0x00720000UL, 0x00c00000UL, 0x00b70000UL, 0x00fd0000UL, 0x00930000UL, 
- 0x00260000UL, 0x00360000UL, 0x003f0000UL, 0x00f70000UL, 0x00cc0000UL, 
- 0x00340000UL, 0x00a50000UL, 0x00e50000UL, 0x00f10000UL, 0x00710000UL, 
- 0x00d80000UL, 0x00310000UL, 0x00150000UL, 0x00040000UL, 0x00c70000UL, 
- 0x00230000UL, 0x00c30000UL, 0x00180000UL, 0x00960000UL, 0x00050000UL, 
- 0x009a0000UL, 0x00070000UL, 0x00120000UL, 0x00800000UL, 0x00e20000UL, 
- 0x00eb0000UL, 0x00270000UL, 0x00b20000UL, 0x00750000UL, 0x00090000UL, 
- 0x00830000UL, 0x002c0000UL, 0x001a0000UL, 0x001b0000UL, 0x006e0000UL, 
- 0x005a0000UL, 0x00a00000UL, 0x00520000UL, 0x003b0000UL, 0x00d60000UL, 
- 0x00b30000UL, 0x00290000UL, 0x00e30000UL, 0x002f0000UL, 0x00840000UL, 
- 0x00530000UL, 0x00d10000UL, 0x00000000UL, 0x00ed0000UL, 0x00200000UL, 
- 0x00fc0000UL, 0x00b10000UL, 0x005b0000UL, 0x006a0000UL, 0x00cb0000UL, 
- 0x00be0000UL, 0x00390000UL, 0x004a0000UL, 0x004c0000UL, 0x00580000UL, 
- 0x00cf0000UL, 0x00d00000UL, 0x00ef0000UL, 0x00aa0000UL, 0x00fb0000UL, 
- 0x00430000UL, 0x004d0000UL, 0x00330000UL, 0x00850000UL, 0x00450000UL, 
- 0x00f90000UL, 0x00020000UL, 0x007f0000UL, 0x00500000UL, 0x003c0000UL, 
- 0x009f0000UL, 0x00a80000UL, 0x00510000UL, 0x00a30000UL, 0x00400000UL, 
- 0x008f0000UL, 0x00920000UL, 0x009d0000UL, 0x00380000UL, 0x00f50000UL, 
- 0x00bc0000UL, 0x00b60000UL, 0x00da0000UL, 0x00210000UL, 0x00100000UL, 
- 0x00ff0000UL, 0x00f30000UL, 0x00d20000UL, 0x00cd0000UL, 0x000c0000UL, 
- 0x00130000UL, 0x00ec0000UL, 0x005f0000UL, 0x00970000UL, 0x00440000UL, 
- 0x00170000UL, 0x00c40000UL, 0x00a70000UL, 0x007e0000UL, 0x003d0000UL, 
- 0x00640000UL, 0x005d0000UL, 0x00190000UL, 0x00730000UL, 0x00600000UL, 
- 0x00810000UL, 0x004f0000UL, 0x00dc0000UL, 0x00220000UL, 0x002a0000UL, 
- 0x00900000UL, 0x00880000UL, 0x00460000UL, 0x00ee0000UL, 0x00b80000UL, 
- 0x00140000UL, 0x00de0000UL, 0x005e0000UL, 0x000b0000UL, 0x00db0000UL, 
- 0x00e00000UL, 0x00320000UL, 0x003a0000UL, 0x000a0000UL, 0x00490000UL, 
- 0x00060000UL, 0x00240000UL, 0x005c0000UL, 0x00c20000UL, 0x00d30000UL, 
- 0x00ac0000UL, 0x00620000UL, 0x00910000UL, 0x00950000UL, 0x00e40000UL, 
- 0x00790000UL, 0x00e70000UL, 0x00c80000UL, 0x00370000UL, 0x006d0000UL, 
- 0x008d0000UL, 0x00d50000UL, 0x004e0000UL, 0x00a90000UL, 0x006c0000UL, 
- 0x00560000UL, 0x00f40000UL, 0x00ea0000UL, 0x00650000UL, 0x007a0000UL, 
- 0x00ae0000UL, 0x00080000UL, 0x00ba0000UL, 0x00780000UL, 0x00250000UL, 
- 0x002e0000UL, 0x001c0000UL, 0x00a60000UL, 0x00b40000UL, 0x00c60000UL, 
- 0x00e80000UL, 0x00dd0000UL, 0x00740000UL, 0x001f0000UL, 0x004b0000UL, 
- 0x00bd0000UL, 0x008b0000UL, 0x008a0000UL, 0x00700000UL, 0x003e0000UL, 
- 0x00b50000UL, 0x00660000UL, 0x00480000UL, 0x00030000UL, 0x00f60000UL, 
- 0x000e0000UL, 0x00610000UL, 0x00350000UL, 0x00570000UL, 0x00b90000UL, 
- 0x00860000UL, 0x00c10000UL, 0x001d0000UL, 0x009e0000UL, 0x00e10000UL, 
- 0x00f80000UL, 0x00980000UL, 0x00110000UL, 0x00690000UL, 0x00d90000UL, 
- 0x008e0000UL, 0x00940000UL, 0x009b0000UL, 0x001e0000UL, 0x00870000UL, 
- 0x00e90000UL, 0x00ce0000UL, 0x00550000UL, 0x00280000UL, 0x00df0000UL, 
- 0x008c0000UL, 0x00a10000UL, 0x00890000UL, 0x000d0000UL, 0x00bf0000UL, 
- 0x00e60000UL, 0x00420000UL, 0x00680000UL, 0x00410000UL, 0x00990000UL, 
- 0x002d0000UL, 0x000f0000UL, 0x00b00000UL, 0x00540000UL, 0x00bb0000UL, 
- 0x00160000UL}, 
-{0x63000000UL, 0x7c000000UL, 0x77000000UL, 0x7b000000UL, 0xf2000000UL, 
- 0x6b000000UL, 0x6f000000UL, 0xc5000000UL, 0x30000000UL, 0x01000000UL, 
- 0x67000000UL, 0x2b000000UL, 0xfe000000UL, 0xd7000000UL, 0xab000000UL, 
- 0x76000000UL, 0xca000000UL, 0x82000000UL, 0xc9000000UL, 0x7d000000UL, 
- 0xfa000000UL, 0x59000000UL, 0x47000000UL, 0xf0000000UL, 0xad000000UL, 
- 0xd4000000UL, 0xa2000000UL, 0xaf000000UL, 0x9c000000UL, 0xa4000000UL, 
- 0x72000000UL, 0xc0000000UL, 0xb7000000UL, 0xfd000000UL, 0x93000000UL, 
- 0x26000000UL, 0x36000000UL, 0x3f000000UL, 0xf7000000UL, 0xcc000000UL, 
- 0x34000000UL, 0xa5000000UL, 0xe5000000UL, 0xf1000000UL, 0x71000000UL, 
- 0xd8000000UL, 0x31000000UL, 0x15000000UL, 0x04000000UL, 0xc7000000UL, 
- 0x23000000UL, 0xc3000000UL, 0x18000000UL, 0x96000000UL, 0x05000000UL, 
- 0x9a000000UL, 0x07000000UL, 0x12000000UL, 0x80000000UL, 0xe2000000UL, 
- 0xeb000000UL, 0x27000000UL, 0xb2000000UL, 0x75000000UL, 0x09000000UL, 
- 0x83000000UL, 0x2c000000UL, 0x1a000000UL, 0x1b000000UL, 0x6e000000UL, 
- 0x5a000000UL, 0xa0000000UL, 0x52000000UL, 0x3b000000UL, 0xd6000000UL, 
- 0xb3000000UL, 0x29000000UL, 0xe3000000UL, 0x2f000000UL, 0x84000000UL, 
- 0x53000000UL, 0xd1000000UL, 0x00000000UL, 0xed000000UL, 0x20000000UL, 
- 0xfc000000UL, 0xb1000000UL, 0x5b000000UL, 0x6a000000UL, 0xcb000000UL, 
- 0xbe000000UL, 0x39000000UL, 0x4a000000UL, 0x4c000000UL, 0x58000000UL, 
- 0xcf000000UL, 0xd0000000UL, 0xef000000UL, 0xaa000000UL, 0xfb000000UL, 
- 0x43000000UL, 0x4d000000UL, 0x33000000UL, 0x85000000UL, 0x45000000UL, 
- 0xf9000000UL, 0x02000000UL, 0x7f000000UL, 0x50000000UL, 0x3c000000UL, 
- 0x9f000000UL, 0xa8000000UL, 0x51000000UL, 0xa3000000UL, 0x40000000UL, 
- 0x8f000000UL, 0x92000000UL, 0x9d000000UL, 0x38000000UL, 0xf5000000UL, 
- 0xbc000000UL, 0xb6000000UL, 0xda000000UL, 0x21000000UL, 0x10000000UL, 
- 0xff000000UL, 0xf3000000UL, 0xd2000000UL, 0xcd000000UL, 0x0c000000UL, 
- 0x13000000UL, 0xec000000UL, 0x5f000000UL, 0x97000000UL, 0x44000000UL, 
- 0x17000000UL, 0xc4000000UL, 0xa7000000UL, 0x7e000000UL, 0x3d000000UL, 
- 0x64000000UL, 0x5d000000UL, 0x19000000UL, 0x73000000UL, 0x60000000UL, 
- 0x81000000UL, 0x4f000000UL, 0xdc000000UL, 0x22000000UL, 0x2a000000UL, 
- 0x90000000UL, 0x88000000UL, 0x46000000UL, 0xee000000UL, 0xb8000000UL, 
- 0x14000000UL, 0xde000000UL, 0x5e000000UL, 0x0b000000UL, 0xdb000000UL, 
- 0xe0000000UL, 0x32000000UL, 0x3a000000UL, 0x0a000000UL, 0x49000000UL, 
- 0x06000000UL, 0x24000000UL, 0x5c000000UL, 0xc2000000UL, 0xd3000000UL, 
- 0xac000000UL, 0x62000000UL, 0x91000000UL, 0x95000000UL, 0xe4000000UL, 
- 0x79000000UL, 0xe7000000UL, 0xc8000000UL, 0x37000000UL, 0x6d000000UL, 
- 0x8d000000UL, 0xd5000000UL, 0x4e000000UL, 0xa9000000UL, 0x6c000000UL, 
- 0x56000000UL, 0xf4000000UL, 0xea000000UL, 0x65000000UL, 0x7a000000UL, 
- 0xae000000UL, 0x08000000UL, 0xba000000UL, 0x78000000UL, 0x25000000UL, 
- 0x2e000000UL, 0x1c000000UL, 0xa6000000UL, 0xb4000000UL, 0xc6000000UL, 
- 0xe8000000UL, 0xdd000000UL, 0x74000000UL, 0x1f000000UL, 0x4b000000UL, 
- 0xbd000000UL, 0x8b000000UL, 0x8a000000UL, 0x70000000UL, 0x3e000000UL, 
- 0xb5000000UL, 0x66000000UL, 0x48000000UL, 0x03000000UL, 0xf6000000UL, 
- 0x0e000000UL, 0x61000000UL, 0x35000000UL, 0x57000000UL, 0xb9000000UL, 
- 0x86000000UL, 0xc1000000UL, 0x1d000000UL, 0x9e000000UL, 0xe1000000UL, 
- 0xf8000000UL, 0x98000000UL, 0x11000000UL, 0x69000000UL, 0xd9000000UL, 
- 0x8e000000UL, 0x94000000UL, 0x9b000000UL, 0x1e000000UL, 0x87000000UL, 
- 0xe9000000UL, 0xce000000UL, 0x55000000UL, 0x28000000UL, 0xdf000000UL, 
- 0x8c000000UL, 0xa1000000UL, 0x89000000UL, 0x0d000000UL, 0xbf000000UL, 
- 0xe6000000UL, 0x42000000UL, 0x68000000UL, 0x41000000UL, 0x99000000UL, 
- 0x2d000000UL, 0x0f000000UL, 0xb0000000UL, 0x54000000UL, 0xbb000000UL, 
- 0x16000000UL}
- };
-
-static const unsigned long il_tab[4][256] = {
-{0x00000052UL, 0x00000009UL, 0x0000006aUL, 0x000000d5UL, 0x00000030UL, 
- 0x00000036UL, 0x000000a5UL, 0x00000038UL, 0x000000bfUL, 0x00000040UL, 
- 0x000000a3UL, 0x0000009eUL, 0x00000081UL, 0x000000f3UL, 0x000000d7UL, 
- 0x000000fbUL, 0x0000007cUL, 0x000000e3UL, 0x00000039UL, 0x00000082UL, 
- 0x0000009bUL, 0x0000002fUL, 0x000000ffUL, 0x00000087UL, 0x00000034UL, 
- 0x0000008eUL, 0x00000043UL, 0x00000044UL, 0x000000c4UL, 0x000000deUL, 
- 0x000000e9UL, 0x000000cbUL, 0x00000054UL, 0x0000007bUL, 0x00000094UL, 
- 0x00000032UL, 0x000000a6UL, 0x000000c2UL, 0x00000023UL, 0x0000003dUL, 
- 0x000000eeUL, 0x0000004cUL, 0x00000095UL, 0x0000000bUL, 0x00000042UL, 
- 0x000000faUL, 0x000000c3UL, 0x0000004eUL, 0x00000008UL, 0x0000002eUL, 
- 0x000000a1UL, 0x00000066UL, 0x00000028UL, 0x000000d9UL, 0x00000024UL, 
- 0x000000b2UL, 0x00000076UL, 0x0000005bUL, 0x000000a2UL, 0x00000049UL, 
- 0x0000006dUL, 0x0000008bUL, 0x000000d1UL, 0x00000025UL, 0x00000072UL, 
- 0x000000f8UL, 0x000000f6UL, 0x00000064UL, 0x00000086UL, 0x00000068UL, 
- 0x00000098UL, 0x00000016UL, 0x000000d4UL, 0x000000a4UL, 0x0000005cUL, 
- 0x000000ccUL, 0x0000005dUL, 0x00000065UL, 0x000000b6UL, 0x00000092UL, 
- 0x0000006cUL, 0x00000070UL, 0x00000048UL, 0x00000050UL, 0x000000fdUL, 
- 0x000000edUL, 0x000000b9UL, 0x000000daUL, 0x0000005eUL, 0x00000015UL, 
- 0x00000046UL, 0x00000057UL, 0x000000a7UL, 0x0000008dUL, 0x0000009dUL, 
- 0x00000084UL, 0x00000090UL, 0x000000d8UL, 0x000000abUL, 0x00000000UL, 
- 0x0000008cUL, 0x000000bcUL, 0x000000d3UL, 0x0000000aUL, 0x000000f7UL, 
- 0x000000e4UL, 0x00000058UL, 0x00000005UL, 0x000000b8UL, 0x000000b3UL, 
- 0x00000045UL, 0x00000006UL, 0x000000d0UL, 0x0000002cUL, 0x0000001eUL, 
- 0x0000008fUL, 0x000000caUL, 0x0000003fUL, 0x0000000fUL, 0x00000002UL, 
- 0x000000c1UL, 0x000000afUL, 0x000000bdUL, 0x00000003UL, 0x00000001UL, 
- 0x00000013UL, 0x0000008aUL, 0x0000006bUL, 0x0000003aUL, 0x00000091UL, 
- 0x00000011UL, 0x00000041UL, 0x0000004fUL, 0x00000067UL, 0x000000dcUL, 
- 0x000000eaUL, 0x00000097UL, 0x000000f2UL, 0x000000cfUL, 0x000000ceUL, 
- 0x000000f0UL, 0x000000b4UL, 0x000000e6UL, 0x00000073UL, 0x00000096UL, 
- 0x000000acUL, 0x00000074UL, 0x00000022UL, 0x000000e7UL, 0x000000adUL, 
- 0x00000035UL, 0x00000085UL, 0x000000e2UL, 0x000000f9UL, 0x00000037UL, 
- 0x000000e8UL, 0x0000001cUL, 0x00000075UL, 0x000000dfUL, 0x0000006eUL, 
- 0x00000047UL, 0x000000f1UL, 0x0000001aUL, 0x00000071UL, 0x0000001dUL, 
- 0x00000029UL, 0x000000c5UL, 0x00000089UL, 0x0000006fUL, 0x000000b7UL, 
- 0x00000062UL, 0x0000000eUL, 0x000000aaUL, 0x00000018UL, 0x000000beUL, 
- 0x0000001bUL, 0x000000fcUL, 0x00000056UL, 0x0000003eUL, 0x0000004bUL, 
- 0x000000c6UL, 0x000000d2UL, 0x00000079UL, 0x00000020UL, 0x0000009aUL, 
- 0x000000dbUL, 0x000000c0UL, 0x000000feUL, 0x00000078UL, 0x000000cdUL, 
- 0x0000005aUL, 0x000000f4UL, 0x0000001fUL, 0x000000ddUL, 0x000000a8UL, 
- 0x00000033UL, 0x00000088UL, 0x00000007UL, 0x000000c7UL, 0x00000031UL, 
- 0x000000b1UL, 0x00000012UL, 0x00000010UL, 0x00000059UL, 0x00000027UL, 
- 0x00000080UL, 0x000000ecUL, 0x0000005fUL, 0x00000060UL, 0x00000051UL, 
- 0x0000007fUL, 0x000000a9UL, 0x00000019UL, 0x000000b5UL, 0x0000004aUL, 
- 0x0000000dUL, 0x0000002dUL, 0x000000e5UL, 0x0000007aUL, 0x0000009fUL, 
- 0x00000093UL, 0x000000c9UL, 0x0000009cUL, 0x000000efUL, 0x000000a0UL, 
- 0x000000e0UL, 0x0000003bUL, 0x0000004dUL, 0x000000aeUL, 0x0000002aUL, 
- 0x000000f5UL, 0x000000b0UL, 0x000000c8UL, 0x000000ebUL, 0x000000bbUL, 
- 0x0000003cUL, 0x00000083UL, 0x00000053UL, 0x00000099UL, 0x00000061UL, 
- 0x00000017UL, 0x0000002bUL, 0x00000004UL, 0x0000007eUL, 0x000000baUL, 
- 0x00000077UL, 0x000000d6UL, 0x00000026UL, 0x000000e1UL, 0x00000069UL, 
- 0x00000014UL, 0x00000063UL, 0x00000055UL, 0x00000021UL, 0x0000000cUL, 
- 0x0000007dUL}, 
-{0x00005200UL, 0x00000900UL, 0x00006a00UL, 0x0000d500UL, 0x00003000UL, 
- 0x00003600UL, 0x0000a500UL, 0x00003800UL, 0x0000bf00UL, 0x00004000UL, 
- 0x0000a300UL, 0x00009e00UL, 0x00008100UL, 0x0000f300UL, 0x0000d700UL, 
- 0x0000fb00UL, 0x00007c00UL, 0x0000e300UL, 0x00003900UL, 0x00008200UL, 
- 0x00009b00UL, 0x00002f00UL, 0x0000ff00UL, 0x00008700UL, 0x00003400UL, 
- 0x00008e00UL, 0x00004300UL, 0x00004400UL, 0x0000c400UL, 0x0000de00UL, 
- 0x0000e900UL, 0x0000cb00UL, 0x00005400UL, 0x00007b00UL, 0x00009400UL, 
- 0x00003200UL, 0x0000a600UL, 0x0000c200UL, 0x00002300UL, 0x00003d00UL, 
- 0x0000ee00UL, 0x00004c00UL, 0x00009500UL, 0x00000b00UL, 0x00004200UL, 
- 0x0000fa00UL, 0x0000c300UL, 0x00004e00UL, 0x00000800UL, 0x00002e00UL, 
- 0x0000a100UL, 0x00006600UL, 0x00002800UL, 0x0000d900UL, 0x00002400UL, 
- 0x0000b200UL, 0x00007600UL, 0x00005b00UL, 0x0000a200UL, 0x00004900UL, 
- 0x00006d00UL, 0x00008b00UL, 0x0000d100UL, 0x00002500UL, 0x00007200UL, 
- 0x0000f800UL, 0x0000f600UL, 0x00006400UL, 0x00008600UL, 0x00006800UL, 
- 0x00009800UL, 0x00001600UL, 0x0000d400UL, 0x0000a400UL, 0x00005c00UL, 
- 0x0000cc00UL, 0x00005d00UL, 0x00006500UL, 0x0000b600UL, 0x00009200UL, 
- 0x00006c00UL, 0x00007000UL, 0x00004800UL, 0x00005000UL, 0x0000fd00UL, 
- 0x0000ed00UL, 0x0000b900UL, 0x0000da00UL, 0x00005e00UL, 0x00001500UL, 
- 0x00004600UL, 0x00005700UL, 0x0000a700UL, 0x00008d00UL, 0x00009d00UL, 
- 0x00008400UL, 0x00009000UL, 0x0000d800UL, 0x0000ab00UL, 0x00000000UL, 
- 0x00008c00UL, 0x0000bc00UL, 0x0000d300UL, 0x00000a00UL, 0x0000f700UL, 
- 0x0000e400UL, 0x00005800UL, 0x00000500UL, 0x0000b800UL, 0x0000b300UL, 
- 0x00004500UL, 0x00000600UL, 0x0000d000UL, 0x00002c00UL, 0x00001e00UL, 
- 0x00008f00UL, 0x0000ca00UL, 0x00003f00UL, 0x00000f00UL, 0x00000200UL, 
- 0x0000c100UL, 0x0000af00UL, 0x0000bd00UL, 0x00000300UL, 0x00000100UL, 
- 0x00001300UL, 0x00008a00UL, 0x00006b00UL, 0x00003a00UL, 0x00009100UL, 
- 0x00001100UL, 0x00004100UL, 0x00004f00UL, 0x00006700UL, 0x0000dc00UL, 
- 0x0000ea00UL, 0x00009700UL, 0x0000f200UL, 0x0000cf00UL, 0x0000ce00UL, 
- 0x0000f000UL, 0x0000b400UL, 0x0000e600UL, 0x00007300UL, 0x00009600UL, 
- 0x0000ac00UL, 0x00007400UL, 0x00002200UL, 0x0000e700UL, 0x0000ad00UL, 
- 0x00003500UL, 0x00008500UL, 0x0000e200UL, 0x0000f900UL, 0x00003700UL, 
- 0x0000e800UL, 0x00001c00UL, 0x00007500UL, 0x0000df00UL, 0x00006e00UL, 
- 0x00004700UL, 0x0000f100UL, 0x00001a00UL, 0x00007100UL, 0x00001d00UL, 
- 0x00002900UL, 0x0000c500UL, 0x00008900UL, 0x00006f00UL, 0x0000b700UL, 
- 0x00006200UL, 0x00000e00UL, 0x0000aa00UL, 0x00001800UL, 0x0000be00UL, 
- 0x00001b00UL, 0x0000fc00UL, 0x00005600UL, 0x00003e00UL, 0x00004b00UL, 
- 0x0000c600UL, 0x0000d200UL, 0x00007900UL, 0x00002000UL, 0x00009a00UL, 
- 0x0000db00UL, 0x0000c000UL, 0x0000fe00UL, 0x00007800UL, 0x0000cd00UL, 
- 0x00005a00UL, 0x0000f400UL, 0x00001f00UL, 0x0000dd00UL, 0x0000a800UL, 
- 0x00003300UL, 0x00008800UL, 0x00000700UL, 0x0000c700UL, 0x00003100UL, 
- 0x0000b100UL, 0x00001200UL, 0x00001000UL, 0x00005900UL, 0x00002700UL, 
- 0x00008000UL, 0x0000ec00UL, 0x00005f00UL, 0x00006000UL, 0x00005100UL, 
- 0x00007f00UL, 0x0000a900UL, 0x00001900UL, 0x0000b500UL, 0x00004a00UL, 
- 0x00000d00UL, 0x00002d00UL, 0x0000e500UL, 0x00007a00UL, 0x00009f00UL, 
- 0x00009300UL, 0x0000c900UL, 0x00009c00UL, 0x0000ef00UL, 0x0000a000UL, 
- 0x0000e000UL, 0x00003b00UL, 0x00004d00UL, 0x0000ae00UL, 0x00002a00UL, 
- 0x0000f500UL, 0x0000b000UL, 0x0000c800UL, 0x0000eb00UL, 0x0000bb00UL, 
- 0x00003c00UL, 0x00008300UL, 0x00005300UL, 0x00009900UL, 0x00006100UL, 
- 0x00001700UL, 0x00002b00UL, 0x00000400UL, 0x00007e00UL, 0x0000ba00UL, 
- 0x00007700UL, 0x0000d600UL, 0x00002600UL, 0x0000e100UL, 0x00006900UL, 
- 0x00001400UL, 0x00006300UL, 0x00005500UL, 0x00002100UL, 0x00000c00UL, 
- 0x00007d00UL}, 
-{0x00520000UL, 0x00090000UL, 0x006a0000UL, 0x00d50000UL, 0x00300000UL, 
- 0x00360000UL, 0x00a50000UL, 0x00380000UL, 0x00bf0000UL, 0x00400000UL, 
- 0x00a30000UL, 0x009e0000UL, 0x00810000UL, 0x00f30000UL, 0x00d70000UL, 
- 0x00fb0000UL, 0x007c0000UL, 0x00e30000UL, 0x00390000UL, 0x00820000UL, 
- 0x009b0000UL, 0x002f0000UL, 0x00ff0000UL, 0x00870000UL, 0x00340000UL, 
- 0x008e0000UL, 0x00430000UL, 0x00440000UL, 0x00c40000UL, 0x00de0000UL, 
- 0x00e90000UL, 0x00cb0000UL, 0x00540000UL, 0x007b0000UL, 0x00940000UL, 
- 0x00320000UL, 0x00a60000UL, 0x00c20000UL, 0x00230000UL, 0x003d0000UL, 
- 0x00ee0000UL, 0x004c0000UL, 0x00950000UL, 0x000b0000UL, 0x00420000UL, 
- 0x00fa0000UL, 0x00c30000UL, 0x004e0000UL, 0x00080000UL, 0x002e0000UL, 
- 0x00a10000UL, 0x00660000UL, 0x00280000UL, 0x00d90000UL, 0x00240000UL, 
- 0x00b20000UL, 0x00760000UL, 0x005b0000UL, 0x00a20000UL, 0x00490000UL, 
- 0x006d0000UL, 0x008b0000UL, 0x00d10000UL, 0x00250000UL, 0x00720000UL, 
- 0x00f80000UL, 0x00f60000UL, 0x00640000UL, 0x00860000UL, 0x00680000UL, 
- 0x00980000UL, 0x00160000UL, 0x00d40000UL, 0x00a40000UL, 0x005c0000UL, 
- 0x00cc0000UL, 0x005d0000UL, 0x00650000UL, 0x00b60000UL, 0x00920000UL, 
- 0x006c0000UL, 0x00700000UL, 0x00480000UL, 0x00500000UL, 0x00fd0000UL, 
- 0x00ed0000UL, 0x00b90000UL, 0x00da0000UL, 0x005e0000UL, 0x00150000UL, 
- 0x00460000UL, 0x00570000UL, 0x00a70000UL, 0x008d0000UL, 0x009d0000UL, 
- 0x00840000UL, 0x00900000UL, 0x00d80000UL, 0x00ab0000UL, 0x00000000UL, 
- 0x008c0000UL, 0x00bc0000UL, 0x00d30000UL, 0x000a0000UL, 0x00f70000UL, 
- 0x00e40000UL, 0x00580000UL, 0x00050000UL, 0x00b80000UL, 0x00b30000UL, 
- 0x00450000UL, 0x00060000UL, 0x00d00000UL, 0x002c0000UL, 0x001e0000UL, 
- 0x008f0000UL, 0x00ca0000UL, 0x003f0000UL, 0x000f0000UL, 0x00020000UL, 
- 0x00c10000UL, 0x00af0000UL, 0x00bd0000UL, 0x00030000UL, 0x00010000UL, 
- 0x00130000UL, 0x008a0000UL, 0x006b0000UL, 0x003a0000UL, 0x00910000UL, 
- 0x00110000UL, 0x00410000UL, 0x004f0000UL, 0x00670000UL, 0x00dc0000UL, 
- 0x00ea0000UL, 0x00970000UL, 0x00f20000UL, 0x00cf0000UL, 0x00ce0000UL, 
- 0x00f00000UL, 0x00b40000UL, 0x00e60000UL, 0x00730000UL, 0x00960000UL, 
- 0x00ac0000UL, 0x00740000UL, 0x00220000UL, 0x00e70000UL, 0x00ad0000UL, 
- 0x00350000UL, 0x00850000UL, 0x00e20000UL, 0x00f90000UL, 0x00370000UL, 
- 0x00e80000UL, 0x001c0000UL, 0x00750000UL, 0x00df0000UL, 0x006e0000UL, 
- 0x00470000UL, 0x00f10000UL, 0x001a0000UL, 0x00710000UL, 0x001d0000UL, 
- 0x00290000UL, 0x00c50000UL, 0x00890000UL, 0x006f0000UL, 0x00b70000UL, 
- 0x00620000UL, 0x000e0000UL, 0x00aa0000UL, 0x00180000UL, 0x00be0000UL, 
- 0x001b0000UL, 0x00fc0000UL, 0x00560000UL, 0x003e0000UL, 0x004b0000UL, 
- 0x00c60000UL, 0x00d20000UL, 0x00790000UL, 0x00200000UL, 0x009a0000UL, 
- 0x00db0000UL, 0x00c00000UL, 0x00fe0000UL, 0x00780000UL, 0x00cd0000UL, 
- 0x005a0000UL, 0x00f40000UL, 0x001f0000UL, 0x00dd0000UL, 0x00a80000UL, 
- 0x00330000UL, 0x00880000UL, 0x00070000UL, 0x00c70000UL, 0x00310000UL, 
- 0x00b10000UL, 0x00120000UL, 0x00100000UL, 0x00590000UL, 0x00270000UL, 
- 0x00800000UL, 0x00ec0000UL, 0x005f0000UL, 0x00600000UL, 0x00510000UL, 
- 0x007f0000UL, 0x00a90000UL, 0x00190000UL, 0x00b50000UL, 0x004a0000UL, 
- 0x000d0000UL, 0x002d0000UL, 0x00e50000UL, 0x007a0000UL, 0x009f0000UL, 
- 0x00930000UL, 0x00c90000UL, 0x009c0000UL, 0x00ef0000UL, 0x00a00000UL, 
- 0x00e00000UL, 0x003b0000UL, 0x004d0000UL, 0x00ae0000UL, 0x002a0000UL, 
- 0x00f50000UL, 0x00b00000UL, 0x00c80000UL, 0x00eb0000UL, 0x00bb0000UL, 
- 0x003c0000UL, 0x00830000UL, 0x00530000UL, 0x00990000UL, 0x00610000UL, 
- 0x00170000UL, 0x002b0000UL, 0x00040000UL, 0x007e0000UL, 0x00ba0000UL, 
- 0x00770000UL, 0x00d60000UL, 0x00260000UL, 0x00e10000UL, 0x00690000UL, 
- 0x00140000UL, 0x00630000UL, 0x00550000UL, 0x00210000UL, 0x000c0000UL, 
- 0x007d0000UL}, 
-{0x52000000UL, 0x09000000UL, 0x6a000000UL, 0xd5000000UL, 0x30000000UL, 
- 0x36000000UL, 0xa5000000UL, 0x38000000UL, 0xbf000000UL, 0x40000000UL, 
- 0xa3000000UL, 0x9e000000UL, 0x81000000UL, 0xf3000000UL, 0xd7000000UL, 
- 0xfb000000UL, 0x7c000000UL, 0xe3000000UL, 0x39000000UL, 0x82000000UL, 
- 0x9b000000UL, 0x2f000000UL, 0xff000000UL, 0x87000000UL, 0x34000000UL, 
- 0x8e000000UL, 0x43000000UL, 0x44000000UL, 0xc4000000UL, 0xde000000UL, 
- 0xe9000000UL, 0xcb000000UL, 0x54000000UL, 0x7b000000UL, 0x94000000UL, 
- 0x32000000UL, 0xa6000000UL, 0xc2000000UL, 0x23000000UL, 0x3d000000UL, 
- 0xee000000UL, 0x4c000000UL, 0x95000000UL, 0x0b000000UL, 0x42000000UL, 
- 0xfa000000UL, 0xc3000000UL, 0x4e000000UL, 0x08000000UL, 0x2e000000UL, 
- 0xa1000000UL, 0x66000000UL, 0x28000000UL, 0xd9000000UL, 0x24000000UL, 
- 0xb2000000UL, 0x76000000UL, 0x5b000000UL, 0xa2000000UL, 0x49000000UL, 
- 0x6d000000UL, 0x8b000000UL, 0xd1000000UL, 0x25000000UL, 0x72000000UL, 
- 0xf8000000UL, 0xf6000000UL, 0x64000000UL, 0x86000000UL, 0x68000000UL, 
- 0x98000000UL, 0x16000000UL, 0xd4000000UL, 0xa4000000UL, 0x5c000000UL, 
- 0xcc000000UL, 0x5d000000UL, 0x65000000UL, 0xb6000000UL, 0x92000000UL, 
- 0x6c000000UL, 0x70000000UL, 0x48000000UL, 0x50000000UL, 0xfd000000UL, 
- 0xed000000UL, 0xb9000000UL, 0xda000000UL, 0x5e000000UL, 0x15000000UL, 
- 0x46000000UL, 0x57000000UL, 0xa7000000UL, 0x8d000000UL, 0x9d000000UL, 
- 0x84000000UL, 0x90000000UL, 0xd8000000UL, 0xab000000UL, 0x00000000UL, 
- 0x8c000000UL, 0xbc000000UL, 0xd3000000UL, 0x0a000000UL, 0xf7000000UL, 
- 0xe4000000UL, 0x58000000UL, 0x05000000UL, 0xb8000000UL, 0xb3000000UL, 
- 0x45000000UL, 0x06000000UL, 0xd0000000UL, 0x2c000000UL, 0x1e000000UL, 
- 0x8f000000UL, 0xca000000UL, 0x3f000000UL, 0x0f000000UL, 0x02000000UL, 
- 0xc1000000UL, 0xaf000000UL, 0xbd000000UL, 0x03000000UL, 0x01000000UL, 
- 0x13000000UL, 0x8a000000UL, 0x6b000000UL, 0x3a000000UL, 0x91000000UL, 
- 0x11000000UL, 0x41000000UL, 0x4f000000UL, 0x67000000UL, 0xdc000000UL, 
- 0xea000000UL, 0x97000000UL, 0xf2000000UL, 0xcf000000UL, 0xce000000UL, 
- 0xf0000000UL, 0xb4000000UL, 0xe6000000UL, 0x73000000UL, 0x96000000UL, 
- 0xac000000UL, 0x74000000UL, 0x22000000UL, 0xe7000000UL, 0xad000000UL, 
- 0x35000000UL, 0x85000000UL, 0xe2000000UL, 0xf9000000UL, 0x37000000UL, 
- 0xe8000000UL, 0x1c000000UL, 0x75000000UL, 0xdf000000UL, 0x6e000000UL, 
- 0x47000000UL, 0xf1000000UL, 0x1a000000UL, 0x71000000UL, 0x1d000000UL, 
- 0x29000000UL, 0xc5000000UL, 0x89000000UL, 0x6f000000UL, 0xb7000000UL, 
- 0x62000000UL, 0x0e000000UL, 0xaa000000UL, 0x18000000UL, 0xbe000000UL, 
- 0x1b000000UL, 0xfc000000UL, 0x56000000UL, 0x3e000000UL, 0x4b000000UL, 
- 0xc6000000UL, 0xd2000000UL, 0x79000000UL, 0x20000000UL, 0x9a000000UL, 
- 0xdb000000UL, 0xc0000000UL, 0xfe000000UL, 0x78000000UL, 0xcd000000UL, 
- 0x5a000000UL, 0xf4000000UL, 0x1f000000UL, 0xdd000000UL, 0xa8000000UL, 
- 0x33000000UL, 0x88000000UL, 0x07000000UL, 0xc7000000UL, 0x31000000UL, 
- 0xb1000000UL, 0x12000000UL, 0x10000000UL, 0x59000000UL, 0x27000000UL, 
- 0x80000000UL, 0xec000000UL, 0x5f000000UL, 0x60000000UL, 0x51000000UL, 
- 0x7f000000UL, 0xa9000000UL, 0x19000000UL, 0xb5000000UL, 0x4a000000UL, 
- 0x0d000000UL, 0x2d000000UL, 0xe5000000UL, 0x7a000000UL, 0x9f000000UL, 
- 0x93000000UL, 0xc9000000UL, 0x9c000000UL, 0xef000000UL, 0xa0000000UL, 
- 0xe0000000UL, 0x3b000000UL, 0x4d000000UL, 0xae000000UL, 0x2a000000UL, 
- 0xf5000000UL, 0xb0000000UL, 0xc8000000UL, 0xeb000000UL, 0xbb000000UL, 
- 0x3c000000UL, 0x83000000UL, 0x53000000UL, 0x99000000UL, 0x61000000UL, 
- 0x17000000UL, 0x2b000000UL, 0x04000000UL, 0x7e000000UL, 0xba000000UL, 
- 0x77000000UL, 0xd6000000UL, 0x26000000UL, 0xe1000000UL, 0x69000000UL, 
- 0x14000000UL, 0x63000000UL, 0x55000000UL, 0x21000000UL, 0x0c000000UL, 
- 0x7d000000UL}
- };
-
-static const unsigned long rco_tab[10] = {
- 0x00000001UL, 0x00000002UL, 0x00000004UL, 0x00000008UL, 0x00000010UL, 
- 0x00000020UL, 0x00000040UL, 0x00000080UL, 0x0000001bUL, 0x00000036UL
- };
+    0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U,
+    0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U,
+    0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U,
+    0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU,
+    0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU,
+    0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU,
+    0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U,
+    0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU,
+    0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU,
+    0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U,
+    0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U,
+    0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU,
+    0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU,
+    0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU,
+    0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU,
+    0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU,
+    0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U,
+    0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU,
+    0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU,
+    0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U,
+    0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U,
+    0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U,
+    0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U,
+    0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U,
+    0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU,
+    0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U,
+    0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU,
+    0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU,
+    0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U,
+    0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U,
+    0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U,
+    0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU,
+    0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U,
+    0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU,
+    0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU,
+    0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U,
+    0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U,
+    0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU,
+    0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U,
+    0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU,
+    0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U,
+    0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U,
+    0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U,
+    0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U,
+    0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU,
+    0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U,
+    0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU,
+    0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U,
+    0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU,
+    0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U,
+    0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU,
+    0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU,
+    0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU,
+    0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU,
+    0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U,
+    0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U,
+    0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U,
+    0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U,
+    0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U,
+    0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U,
+    0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU,
+    0xbfbfda65U, 0xe6e631d7U, 0x4242c684U, 0x6868b8d0U,
+    0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU,
+    0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU,
+};
+static const unsigned long Te4[256] = {
+    0x63636363U, 0x7c7c7c7cU, 0x77777777U, 0x7b7b7b7bU,
+    0xf2f2f2f2U, 0x6b6b6b6bU, 0x6f6f6f6fU, 0xc5c5c5c5U,
+    0x30303030U, 0x01010101U, 0x67676767U, 0x2b2b2b2bU,
+    0xfefefefeU, 0xd7d7d7d7U, 0xababababU, 0x76767676U,
+    0xcacacacaU, 0x82828282U, 0xc9c9c9c9U, 0x7d7d7d7dU,
+    0xfafafafaU, 0x59595959U, 0x47474747U, 0xf0f0f0f0U,
+    0xadadadadU, 0xd4d4d4d4U, 0xa2a2a2a2U, 0xafafafafU,
+    0x9c9c9c9cU, 0xa4a4a4a4U, 0x72727272U, 0xc0c0c0c0U,
+    0xb7b7b7b7U, 0xfdfdfdfdU, 0x93939393U, 0x26262626U,
+    0x36363636U, 0x3f3f3f3fU, 0xf7f7f7f7U, 0xccccccccU,
+    0x34343434U, 0xa5a5a5a5U, 0xe5e5e5e5U, 0xf1f1f1f1U,
+    0x71717171U, 0xd8d8d8d8U, 0x31313131U, 0x15151515U,
+    0x04040404U, 0xc7c7c7c7U, 0x23232323U, 0xc3c3c3c3U,
+    0x18181818U, 0x96969696U, 0x05050505U, 0x9a9a9a9aU,
+    0x07070707U, 0x12121212U, 0x80808080U, 0xe2e2e2e2U,
+    0xebebebebU, 0x27272727U, 0xb2b2b2b2U, 0x75757575U,
+    0x09090909U, 0x83838383U, 0x2c2c2c2cU, 0x1a1a1a1aU,
+    0x1b1b1b1bU, 0x6e6e6e6eU, 0x5a5a5a5aU, 0xa0a0a0a0U,
+    0x52525252U, 0x3b3b3b3bU, 0xd6d6d6d6U, 0xb3b3b3b3U,
+    0x29292929U, 0xe3e3e3e3U, 0x2f2f2f2fU, 0x84848484U,
+    0x53535353U, 0xd1d1d1d1U, 0x00000000U, 0xededededU,
+    0x20202020U, 0xfcfcfcfcU, 0xb1b1b1b1U, 0x5b5b5b5bU,
+    0x6a6a6a6aU, 0xcbcbcbcbU, 0xbebebebeU, 0x39393939U,
+    0x4a4a4a4aU, 0x4c4c4c4cU, 0x58585858U, 0xcfcfcfcfU,
+    0xd0d0d0d0U, 0xefefefefU, 0xaaaaaaaaU, 0xfbfbfbfbU,
+    0x43434343U, 0x4d4d4d4dU, 0x33333333U, 0x85858585U,
+    0x45454545U, 0xf9f9f9f9U, 0x02020202U, 0x7f7f7f7fU,
+    0x50505050U, 0x3c3c3c3cU, 0x9f9f9f9fU, 0xa8a8a8a8U,
+    0x51515151U, 0xa3a3a3a3U, 0x40404040U, 0x8f8f8f8fU,
+    0x92929292U, 0x9d9d9d9dU, 0x38383838U, 0xf5f5f5f5U,
+    0xbcbcbcbcU, 0xb6b6b6b6U, 0xdadadadaU, 0x21212121U,
+    0x10101010U, 0xffffffffU, 0xf3f3f3f3U, 0xd2d2d2d2U,
+    0xcdcdcdcdU, 0x0c0c0c0cU, 0x13131313U, 0xececececU,
+    0x5f5f5f5fU, 0x97979797U, 0x44444444U, 0x17171717U,
+    0xc4c4c4c4U, 0xa7a7a7a7U, 0x7e7e7e7eU, 0x3d3d3d3dU,
+    0x64646464U, 0x5d5d5d5dU, 0x19191919U, 0x73737373U,
+    0x60606060U, 0x81818181U, 0x4f4f4f4fU, 0xdcdcdcdcU,
+    0x22222222U, 0x2a2a2a2aU, 0x90909090U, 0x88888888U,
+    0x46464646U, 0xeeeeeeeeU, 0xb8b8b8b8U, 0x14141414U,
+    0xdedededeU, 0x5e5e5e5eU, 0x0b0b0b0bU, 0xdbdbdbdbU,
+    0xe0e0e0e0U, 0x32323232U, 0x3a3a3a3aU, 0x0a0a0a0aU,
+    0x49494949U, 0x06060606U, 0x24242424U, 0x5c5c5c5cU,
+    0xc2c2c2c2U, 0xd3d3d3d3U, 0xacacacacU, 0x62626262U,
+    0x91919191U, 0x95959595U, 0xe4e4e4e4U, 0x79797979U,
+    0xe7e7e7e7U, 0xc8c8c8c8U, 0x37373737U, 0x6d6d6d6dU,
+    0x8d8d8d8dU, 0xd5d5d5d5U, 0x4e4e4e4eU, 0xa9a9a9a9U,
+    0x6c6c6c6cU, 0x56565656U, 0xf4f4f4f4U, 0xeaeaeaeaU,
+    0x65656565U, 0x7a7a7a7aU, 0xaeaeaeaeU, 0x08080808U,
+    0xbabababaU, 0x78787878U, 0x25252525U, 0x2e2e2e2eU,
+    0x1c1c1c1cU, 0xa6a6a6a6U, 0xb4b4b4b4U, 0xc6c6c6c6U,
+    0xe8e8e8e8U, 0xddddddddU, 0x74747474U, 0x1f1f1f1fU,
+    0x4b4b4b4bU, 0xbdbdbdbdU, 0x8b8b8b8bU, 0x8a8a8a8aU,
+    0x70707070U, 0x3e3e3e3eU, 0xb5b5b5b5U, 0x66666666U,
+    0x48484848U, 0x03030303U, 0xf6f6f6f6U, 0x0e0e0e0eU,
+    0x61616161U, 0x35353535U, 0x57575757U, 0xb9b9b9b9U,
+    0x86868686U, 0xc1c1c1c1U, 0x1d1d1d1dU, 0x9e9e9e9eU,
+    0xe1e1e1e1U, 0xf8f8f8f8U, 0x98989898U, 0x11111111U,
+    0x69696969U, 0xd9d9d9d9U, 0x8e8e8e8eU, 0x94949494U,
+    0x9b9b9b9bU, 0x1e1e1e1eU, 0x87878787U, 0xe9e9e9e9U,
+    0xcecececeU, 0x55555555U, 0x28282828U, 0xdfdfdfdfU,
+    0x8c8c8c8cU, 0xa1a1a1a1U, 0x89898989U, 0x0d0d0d0dU,
+    0xbfbfbfbfU, 0xe6e6e6e6U, 0x42424242U, 0x68686868U,
+    0x41414141U, 0x99999999U, 0x2d2d2d2dU, 0x0f0f0f0fU,
+    0xb0b0b0b0U, 0x54545454U, 0xbbbbbbbbU, 0x16161616U,
+};
+static const unsigned long Td0[256] = {
+    0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U,
+    0x3bab6bcbU, 0x1f9d45f1U, 0xacfa58abU, 0x4be30393U,
+    0x2030fa55U, 0xad766df6U, 0x88cc7691U, 0xf5024c25U,
+    0x4fe5d7fcU, 0xc52acbd7U, 0x26354480U, 0xb562a38fU,
+    0xdeb15a49U, 0x25ba1b67U, 0x45ea0e98U, 0x5dfec0e1U,
+    0xc32f7502U, 0x814cf012U, 0x8d4697a3U, 0x6bd3f9c6U,
+    0x038f5fe7U, 0x15929c95U, 0xbf6d7aebU, 0x955259daU,
+    0xd4be832dU, 0x587421d3U, 0x49e06929U, 0x8ec9c844U,
+    0x75c2896aU, 0xf48e7978U, 0x99583e6bU, 0x27b971ddU,
+    0xbee14fb6U, 0xf088ad17U, 0xc920ac66U, 0x7dce3ab4U,
+    0x63df4a18U, 0xe51a3182U, 0x97513360U, 0x62537f45U,
+    0xb16477e0U, 0xbb6bae84U, 0xfe81a01cU, 0xf9082b94U,
+    0x70486858U, 0x8f45fd19U, 0x94de6c87U, 0x527bf8b7U,
+    0xab73d323U, 0x724b02e2U, 0xe31f8f57U, 0x6655ab2aU,
+    0xb2eb2807U, 0x2fb5c203U, 0x86c57b9aU, 0xd33708a5U,
+    0x302887f2U, 0x23bfa5b2U, 0x02036abaU, 0xed16825cU,
+    0x8acf1c2bU, 0xa779b492U, 0xf307f2f0U, 0x4e69e2a1U,
+    0x65daf4cdU, 0x0605bed5U, 0xd134621fU, 0xc4a6fe8aU,
+    0x342e539dU, 0xa2f355a0U, 0x058ae132U, 0xa4f6eb75U,
+    0x0b83ec39U, 0x4060efaaU, 0x5e719f06U, 0xbd6e1051U,
+    0x3e218af9U, 0x96dd063dU, 0xdd3e05aeU, 0x4de6bd46U,
+    0x91548db5U, 0x71c45d05U, 0x0406d46fU, 0x605015ffU,
+    0x1998fb24U, 0xd6bde997U, 0x894043ccU, 0x67d99e77U,
+    0xb0e842bdU, 0x07898b88U, 0xe7195b38U, 0x79c8eedbU,
+    0xa17c0a47U, 0x7c420fe9U, 0xf8841ec9U, 0x00000000U,
+    0x09808683U, 0x322bed48U, 0x1e1170acU, 0x6c5a724eU,
+    0xfd0efffbU, 0x0f853856U, 0x3daed51eU, 0x362d3927U,
+    0x0a0fd964U, 0x685ca621U, 0x9b5b54d1U, 0x24362e3aU,
+    0x0c0a67b1U, 0x9357e70fU, 0xb4ee96d2U, 0x1b9b919eU,
+    0x80c0c54fU, 0x61dc20a2U, 0x5a774b69U, 0x1c121a16U,
+    0xe293ba0aU, 0xc0a02ae5U, 0x3c22e043U, 0x121b171dU,
+    0x0e090d0bU, 0xf28bc7adU, 0x2db6a8b9U, 0x141ea9c8U,
+    0x57f11985U, 0xaf75074cU, 0xee99ddbbU, 0xa37f60fdU,
+    0xf701269fU, 0x5c72f5bcU, 0x44663bc5U, 0x5bfb7e34U,
+    0x8b432976U, 0xcb23c6dcU, 0xb6edfc68U, 0xb8e4f163U,
+    0xd731dccaU, 0x42638510U, 0x13972240U, 0x84c61120U,
+    0x854a247dU, 0xd2bb3df8U, 0xaef93211U, 0xc729a16dU,
+    0x1d9e2f4bU, 0xdcb230f3U, 0x0d8652ecU, 0x77c1e3d0U,
+    0x2bb3166cU, 0xa970b999U, 0x119448faU, 0x47e96422U,
+    0xa8fc8cc4U, 0xa0f03f1aU, 0x567d2cd8U, 0x223390efU,
+    0x87494ec7U, 0xd938d1c1U, 0x8ccaa2feU, 0x98d40b36U,
+    0xa6f581cfU, 0xa57ade28U, 0xdab78e26U, 0x3fadbfa4U,
+    0x2c3a9de4U, 0x5078920dU, 0x6a5fcc9bU, 0x547e4662U,
+    0xf68d13c2U, 0x90d8b8e8U, 0x2e39f75eU, 0x82c3aff5U,
+    0x9f5d80beU, 0x69d0937cU, 0x6fd52da9U, 0xcf2512b3U,
+    0xc8ac993bU, 0x10187da7U, 0xe89c636eU, 0xdb3bbb7bU,
+    0xcd267809U, 0x6e5918f4U, 0xec9ab701U, 0x834f9aa8U,
+    0xe6956e65U, 0xaaffe67eU, 0x21bccf08U, 0xef15e8e6U,
+    0xbae79bd9U, 0x4a6f36ceU, 0xea9f09d4U, 0x29b07cd6U,
+    0x31a4b2afU, 0x2a3f2331U, 0xc6a59430U, 0x35a266c0U,
+    0x744ebc37U, 0xfc82caa6U, 0xe090d0b0U, 0x33a7d815U,
+    0xf104984aU, 0x41ecdaf7U, 0x7fcd500eU, 0x1791f62fU,
+    0x764dd68dU, 0x43efb04dU, 0xccaa4d54U, 0xe49604dfU,
+    0x9ed1b5e3U, 0x4c6a881bU, 0xc12c1fb8U, 0x4665517fU,
+    0x9d5eea04U, 0x018c355dU, 0xfa877473U, 0xfb0b412eU,
+    0xb3671d5aU, 0x92dbd252U, 0xe9105633U, 0x6dd64713U,
+    0x9ad7618cU, 0x37a10c7aU, 0x59f8148eU, 0xeb133c89U,
+    0xcea927eeU, 0xb761c935U, 0xe11ce5edU, 0x7a47b13cU,
+    0x9cd2df59U, 0x55f2733fU, 0x1814ce79U, 0x73c737bfU,
+    0x53f7cdeaU, 0x5ffdaa5bU, 0xdf3d6f14U, 0x7844db86U,
+    0xcaaff381U, 0xb968c43eU, 0x3824342cU, 0xc2a3405fU,
+    0x161dc372U, 0xbce2250cU, 0x283c498bU, 0xff0d9541U,
+    0x39a80171U, 0x080cb3deU, 0xd8b4e49cU, 0x6456c190U,
+    0x7bcb8461U, 0xd532b670U, 0x486c5c74U, 0xd0b85742U,
+};
+static const unsigned long Td1[256] = {
+    0x5051f4a7U, 0x537e4165U, 0xc31a17a4U, 0x963a275eU,
+    0xcb3bab6bU, 0xf11f9d45U, 0xabacfa58U, 0x934be303U,
+    0x552030faU, 0xf6ad766dU, 0x9188cc76U, 0x25f5024cU,
+    0xfc4fe5d7U, 0xd7c52acbU, 0x80263544U, 0x8fb562a3U,
+    0x49deb15aU, 0x6725ba1bU, 0x9845ea0eU, 0xe15dfec0U,
+    0x02c32f75U, 0x12814cf0U, 0xa38d4697U, 0xc66bd3f9U,
+    0xe7038f5fU, 0x9515929cU, 0xebbf6d7aU, 0xda955259U,
+    0x2dd4be83U, 0xd3587421U, 0x2949e069U, 0x448ec9c8U,
+    0x6a75c289U, 0x78f48e79U, 0x6b99583eU, 0xdd27b971U,
+    0xb6bee14fU, 0x17f088adU, 0x66c920acU, 0xb47dce3aU,
+    0x1863df4aU, 0x82e51a31U, 0x60975133U, 0x4562537fU,
+    0xe0b16477U, 0x84bb6baeU, 0x1cfe81a0U, 0x94f9082bU,
+    0x58704868U, 0x198f45fdU, 0x8794de6cU, 0xb7527bf8U,
+    0x23ab73d3U, 0xe2724b02U, 0x57e31f8fU, 0x2a6655abU,
+    0x07b2eb28U, 0x032fb5c2U, 0x9a86c57bU, 0xa5d33708U,
+    0xf2302887U, 0xb223bfa5U, 0xba02036aU, 0x5ced1682U,
+    0x2b8acf1cU, 0x92a779b4U, 0xf0f307f2U, 0xa14e69e2U,
+    0xcd65daf4U, 0xd50605beU, 0x1fd13462U, 0x8ac4a6feU,
+    0x9d342e53U, 0xa0a2f355U, 0x32058ae1U, 0x75a4f6ebU,
+    0x390b83ecU, 0xaa4060efU, 0x065e719fU, 0x51bd6e10U,
+    0xf93e218aU, 0x3d96dd06U, 0xaedd3e05U, 0x464de6bdU,
+    0xb591548dU, 0x0571c45dU, 0x6f0406d4U, 0xff605015U,
+    0x241998fbU, 0x97d6bde9U, 0xcc894043U, 0x7767d99eU,
+    0xbdb0e842U, 0x8807898bU, 0x38e7195bU, 0xdb79c8eeU,
+    0x47a17c0aU, 0xe97c420fU, 0xc9f8841eU, 0x00000000U,
+    0x83098086U, 0x48322bedU, 0xac1e1170U, 0x4e6c5a72U,
+    0xfbfd0effU, 0x560f8538U, 0x1e3daed5U, 0x27362d39U,
+    0x640a0fd9U, 0x21685ca6U, 0xd19b5b54U, 0x3a24362eU,
+    0xb10c0a67U, 0x0f9357e7U, 0xd2b4ee96U, 0x9e1b9b91U,
+    0x4f80c0c5U, 0xa261dc20U, 0x695a774bU, 0x161c121aU,
+    0x0ae293baU, 0xe5c0a02aU, 0x433c22e0U, 0x1d121b17U,
+    0x0b0e090dU, 0xadf28bc7U, 0xb92db6a8U, 0xc8141ea9U,
+    0x8557f119U, 0x4caf7507U, 0xbbee99ddU, 0xfda37f60U,
+    0x9ff70126U, 0xbc5c72f5U, 0xc544663bU, 0x345bfb7eU,
+    0x768b4329U, 0xdccb23c6U, 0x68b6edfcU, 0x63b8e4f1U,
+    0xcad731dcU, 0x10426385U, 0x40139722U, 0x2084c611U,
+    0x7d854a24U, 0xf8d2bb3dU, 0x11aef932U, 0x6dc729a1U,
+    0x4b1d9e2fU, 0xf3dcb230U, 0xec0d8652U, 0xd077c1e3U,
+    0x6c2bb316U, 0x99a970b9U, 0xfa119448U, 0x2247e964U,
+    0xc4a8fc8cU, 0x1aa0f03fU, 0xd8567d2cU, 0xef223390U,
+    0xc787494eU, 0xc1d938d1U, 0xfe8ccaa2U, 0x3698d40bU,
+    0xcfa6f581U, 0x28a57adeU, 0x26dab78eU, 0xa43fadbfU,
+    0xe42c3a9dU, 0x0d507892U, 0x9b6a5fccU, 0x62547e46U,
+    0xc2f68d13U, 0xe890d8b8U, 0x5e2e39f7U, 0xf582c3afU,
+    0xbe9f5d80U, 0x7c69d093U, 0xa96fd52dU, 0xb3cf2512U,
+    0x3bc8ac99U, 0xa710187dU, 0x6ee89c63U, 0x7bdb3bbbU,
+    0x09cd2678U, 0xf46e5918U, 0x01ec9ab7U, 0xa8834f9aU,
+    0x65e6956eU, 0x7eaaffe6U, 0x0821bccfU, 0xe6ef15e8U,
+    0xd9bae79bU, 0xce4a6f36U, 0xd4ea9f09U, 0xd629b07cU,
+    0xaf31a4b2U, 0x312a3f23U, 0x30c6a594U, 0xc035a266U,
+    0x37744ebcU, 0xa6fc82caU, 0xb0e090d0U, 0x1533a7d8U,
+    0x4af10498U, 0xf741ecdaU, 0x0e7fcd50U, 0x2f1791f6U,
+    0x8d764dd6U, 0x4d43efb0U, 0x54ccaa4dU, 0xdfe49604U,
+    0xe39ed1b5U, 0x1b4c6a88U, 0xb8c12c1fU, 0x7f466551U,
+    0x049d5eeaU, 0x5d018c35U, 0x73fa8774U, 0x2efb0b41U,
+    0x5ab3671dU, 0x5292dbd2U, 0x33e91056U, 0x136dd647U,
+    0x8c9ad761U, 0x7a37a10cU, 0x8e59f814U, 0x89eb133cU,
+    0xeecea927U, 0x35b761c9U, 0xede11ce5U, 0x3c7a47b1U,
+    0x599cd2dfU, 0x3f55f273U, 0x791814ceU, 0xbf73c737U,
+    0xea53f7cdU, 0x5b5ffdaaU, 0x14df3d6fU, 0x867844dbU,
+    0x81caaff3U, 0x3eb968c4U, 0x2c382434U, 0x5fc2a340U,
+    0x72161dc3U, 0x0cbce225U, 0x8b283c49U, 0x41ff0d95U,
+    0x7139a801U, 0xde080cb3U, 0x9cd8b4e4U, 0x906456c1U,
+    0x617bcb84U, 0x70d532b6U, 0x74486c5cU, 0x42d0b857U,
+};
+static const unsigned long Td2[256] = {
+    0xa75051f4U, 0x65537e41U, 0xa4c31a17U, 0x5e963a27U,
+    0x6bcb3babU, 0x45f11f9dU, 0x58abacfaU, 0x03934be3U,
+    0xfa552030U, 0x6df6ad76U, 0x769188ccU, 0x4c25f502U,
+    0xd7fc4fe5U, 0xcbd7c52aU, 0x44802635U, 0xa38fb562U,
+    0x5a49deb1U, 0x1b6725baU, 0x0e9845eaU, 0xc0e15dfeU,
+    0x7502c32fU, 0xf012814cU, 0x97a38d46U, 0xf9c66bd3U,
+    0x5fe7038fU, 0x9c951592U, 0x7aebbf6dU, 0x59da9552U,
+    0x832dd4beU, 0x21d35874U, 0x692949e0U, 0xc8448ec9U,
+    0x896a75c2U, 0x7978f48eU, 0x3e6b9958U, 0x71dd27b9U,
+    0x4fb6bee1U, 0xad17f088U, 0xac66c920U, 0x3ab47dceU,
+    0x4a1863dfU, 0x3182e51aU, 0x33609751U, 0x7f456253U,
+    0x77e0b164U, 0xae84bb6bU, 0xa01cfe81U, 0x2b94f908U,
+    0x68587048U, 0xfd198f45U, 0x6c8794deU, 0xf8b7527bU,
+    0xd323ab73U, 0x02e2724bU, 0x8f57e31fU, 0xab2a6655U,
+    0x2807b2ebU, 0xc2032fb5U, 0x7b9a86c5U, 0x08a5d337U,
+    0x87f23028U, 0xa5b223bfU, 0x6aba0203U, 0x825ced16U,
+    0x1c2b8acfU, 0xb492a779U, 0xf2f0f307U, 0xe2a14e69U,
+    0xf4cd65daU, 0xbed50605U, 0x621fd134U, 0xfe8ac4a6U,
+    0x539d342eU, 0x55a0a2f3U, 0xe132058aU, 0xeb75a4f6U,
+    0xec390b83U, 0xefaa4060U, 0x9f065e71U, 0x1051bd6eU,
 
+    0x8af93e21U, 0x063d96ddU, 0x05aedd3eU, 0xbd464de6U,
+    0x8db59154U, 0x5d0571c4U, 0xd46f0406U, 0x15ff6050U,
+    0xfb241998U, 0xe997d6bdU, 0x43cc8940U, 0x9e7767d9U,
+    0x42bdb0e8U, 0x8b880789U, 0x5b38e719U, 0xeedb79c8U,
+    0x0a47a17cU, 0x0fe97c42U, 0x1ec9f884U, 0x00000000U,
+    0x86830980U, 0xed48322bU, 0x70ac1e11U, 0x724e6c5aU,
+    0xfffbfd0eU, 0x38560f85U, 0xd51e3daeU, 0x3927362dU,
+    0xd9640a0fU, 0xa621685cU, 0x54d19b5bU, 0x2e3a2436U,
+    0x67b10c0aU, 0xe70f9357U, 0x96d2b4eeU, 0x919e1b9bU,
+    0xc54f80c0U, 0x20a261dcU, 0x4b695a77U, 0x1a161c12U,
+    0xba0ae293U, 0x2ae5c0a0U, 0xe0433c22U, 0x171d121bU,
+    0x0d0b0e09U, 0xc7adf28bU, 0xa8b92db6U, 0xa9c8141eU,
+    0x198557f1U, 0x074caf75U, 0xddbbee99U, 0x60fda37fU,
+    0x269ff701U, 0xf5bc5c72U, 0x3bc54466U, 0x7e345bfbU,
+    0x29768b43U, 0xc6dccb23U, 0xfc68b6edU, 0xf163b8e4U,
+    0xdccad731U, 0x85104263U, 0x22401397U, 0x112084c6U,
+    0x247d854aU, 0x3df8d2bbU, 0x3211aef9U, 0xa16dc729U,
+    0x2f4b1d9eU, 0x30f3dcb2U, 0x52ec0d86U, 0xe3d077c1U,
+    0x166c2bb3U, 0xb999a970U, 0x48fa1194U, 0x642247e9U,
+    0x8cc4a8fcU, 0x3f1aa0f0U, 0x2cd8567dU, 0x90ef2233U,
+    0x4ec78749U, 0xd1c1d938U, 0xa2fe8ccaU, 0x0b3698d4U,
+    0x81cfa6f5U, 0xde28a57aU, 0x8e26dab7U, 0xbfa43fadU,
+    0x9de42c3aU, 0x920d5078U, 0xcc9b6a5fU, 0x4662547eU,
+    0x13c2f68dU, 0xb8e890d8U, 0xf75e2e39U, 0xaff582c3U,
+    0x80be9f5dU, 0x937c69d0U, 0x2da96fd5U, 0x12b3cf25U,
+    0x993bc8acU, 0x7da71018U, 0x636ee89cU, 0xbb7bdb3bU,
+    0x7809cd26U, 0x18f46e59U, 0xb701ec9aU, 0x9aa8834fU,
+    0x6e65e695U, 0xe67eaaffU, 0xcf0821bcU, 0xe8e6ef15U,
+    0x9bd9bae7U, 0x36ce4a6fU, 0x09d4ea9fU, 0x7cd629b0U,
+    0xb2af31a4U, 0x23312a3fU, 0x9430c6a5U, 0x66c035a2U,
+    0xbc37744eU, 0xcaa6fc82U, 0xd0b0e090U, 0xd81533a7U,
+    0x984af104U, 0xdaf741ecU, 0x500e7fcdU, 0xf62f1791U,
+    0xd68d764dU, 0xb04d43efU, 0x4d54ccaaU, 0x04dfe496U,
+    0xb5e39ed1U, 0x881b4c6aU, 0x1fb8c12cU, 0x517f4665U,
+    0xea049d5eU, 0x355d018cU, 0x7473fa87U, 0x412efb0bU,
+    0x1d5ab367U, 0xd25292dbU, 0x5633e910U, 0x47136dd6U,
+    0x618c9ad7U, 0x0c7a37a1U, 0x148e59f8U, 0x3c89eb13U,
+    0x27eecea9U, 0xc935b761U, 0xe5ede11cU, 0xb13c7a47U,
+    0xdf599cd2U, 0x733f55f2U, 0xce791814U, 0x37bf73c7U,
+    0xcdea53f7U, 0xaa5b5ffdU, 0x6f14df3dU, 0xdb867844U,
+    0xf381caafU, 0xc43eb968U, 0x342c3824U, 0x405fc2a3U,
+    0xc372161dU, 0x250cbce2U, 0x498b283cU, 0x9541ff0dU,
+    0x017139a8U, 0xb3de080cU, 0xe49cd8b4U, 0xc1906456U,
+    0x84617bcbU, 0xb670d532U, 0x5c74486cU, 0x5742d0b8U,
+};
+static const unsigned long Td3[256] = {
+    0xf4a75051U, 0x4165537eU, 0x17a4c31aU, 0x275e963aU,
+    0xab6bcb3bU, 0x9d45f11fU, 0xfa58abacU, 0xe303934bU,
+    0x30fa5520U, 0x766df6adU, 0xcc769188U, 0x024c25f5U,
+    0xe5d7fc4fU, 0x2acbd7c5U, 0x35448026U, 0x62a38fb5U,
+    0xb15a49deU, 0xba1b6725U, 0xea0e9845U, 0xfec0e15dU,
+    0x2f7502c3U, 0x4cf01281U, 0x4697a38dU, 0xd3f9c66bU,
+    0x8f5fe703U, 0x929c9515U, 0x6d7aebbfU, 0x5259da95U,
+    0xbe832dd4U, 0x7421d358U, 0xe0692949U, 0xc9c8448eU,
+    0xc2896a75U, 0x8e7978f4U, 0x583e6b99U, 0xb971dd27U,
+    0xe14fb6beU, 0x88ad17f0U, 0x20ac66c9U, 0xce3ab47dU,
+    0xdf4a1863U, 0x1a3182e5U, 0x51336097U, 0x537f4562U,
+    0x6477e0b1U, 0x6bae84bbU, 0x81a01cfeU, 0x082b94f9U,
+    0x48685870U, 0x45fd198fU, 0xde6c8794U, 0x7bf8b752U,
+    0x73d323abU, 0x4b02e272U, 0x1f8f57e3U, 0x55ab2a66U,
+    0xeb2807b2U, 0xb5c2032fU, 0xc57b9a86U, 0x3708a5d3U,
+    0x2887f230U, 0xbfa5b223U, 0x036aba02U, 0x16825cedU,
+    0xcf1c2b8aU, 0x79b492a7U, 0x07f2f0f3U, 0x69e2a14eU,
+    0xdaf4cd65U, 0x05bed506U, 0x34621fd1U, 0xa6fe8ac4U,
+    0x2e539d34U, 0xf355a0a2U, 0x8ae13205U, 0xf6eb75a4U,
+    0x83ec390bU, 0x60efaa40U, 0x719f065eU, 0x6e1051bdU,
+    0x218af93eU, 0xdd063d96U, 0x3e05aeddU, 0xe6bd464dU,
+    0x548db591U, 0xc45d0571U, 0x06d46f04U, 0x5015ff60U,
+    0x98fb2419U, 0xbde997d6U, 0x4043cc89U, 0xd99e7767U,
+    0xe842bdb0U, 0x898b8807U, 0x195b38e7U, 0xc8eedb79U,
+    0x7c0a47a1U, 0x420fe97cU, 0x841ec9f8U, 0x00000000U,
+    0x80868309U, 0x2bed4832U, 0x1170ac1eU, 0x5a724e6cU,
+    0x0efffbfdU, 0x8538560fU, 0xaed51e3dU, 0x2d392736U,
+    0x0fd9640aU, 0x5ca62168U, 0x5b54d19bU, 0x362e3a24U,
+    0x0a67b10cU, 0x57e70f93U, 0xee96d2b4U, 0x9b919e1bU,
+    0xc0c54f80U, 0xdc20a261U, 0x774b695aU, 0x121a161cU,
+    0x93ba0ae2U, 0xa02ae5c0U, 0x22e0433cU, 0x1b171d12U,
+    0x090d0b0eU, 0x8bc7adf2U, 0xb6a8b92dU, 0x1ea9c814U,
+    0xf1198557U, 0x75074cafU, 0x99ddbbeeU, 0x7f60fda3U,
+    0x01269ff7U, 0x72f5bc5cU, 0x663bc544U, 0xfb7e345bU,
+    0x4329768bU, 0x23c6dccbU, 0xedfc68b6U, 0xe4f163b8U,
+    0x31dccad7U, 0x63851042U, 0x97224013U, 0xc6112084U,
+    0x4a247d85U, 0xbb3df8d2U, 0xf93211aeU, 0x29a16dc7U,
+    0x9e2f4b1dU, 0xb230f3dcU, 0x8652ec0dU, 0xc1e3d077U,
+    0xb3166c2bU, 0x70b999a9U, 0x9448fa11U, 0xe9642247U,
+    0xfc8cc4a8U, 0xf03f1aa0U, 0x7d2cd856U, 0x3390ef22U,
+    0x494ec787U, 0x38d1c1d9U, 0xcaa2fe8cU, 0xd40b3698U,
+    0xf581cfa6U, 0x7ade28a5U, 0xb78e26daU, 0xadbfa43fU,
+    0x3a9de42cU, 0x78920d50U, 0x5fcc9b6aU, 0x7e466254U,
+    0x8d13c2f6U, 0xd8b8e890U, 0x39f75e2eU, 0xc3aff582U,
+    0x5d80be9fU, 0xd0937c69U, 0xd52da96fU, 0x2512b3cfU,
+    0xac993bc8U, 0x187da710U, 0x9c636ee8U, 0x3bbb7bdbU,
+    0x267809cdU, 0x5918f46eU, 0x9ab701ecU, 0x4f9aa883U,
+    0x956e65e6U, 0xffe67eaaU, 0xbccf0821U, 0x15e8e6efU,
+    0xe79bd9baU, 0x6f36ce4aU, 0x9f09d4eaU, 0xb07cd629U,
+    0xa4b2af31U, 0x3f23312aU, 0xa59430c6U, 0xa266c035U,
+    0x4ebc3774U, 0x82caa6fcU, 0x90d0b0e0U, 0xa7d81533U,
+    0x04984af1U, 0xecdaf741U, 0xcd500e7fU, 0x91f62f17U,
+    0x4dd68d76U, 0xefb04d43U, 0xaa4d54ccU, 0x9604dfe4U,
+    0xd1b5e39eU, 0x6a881b4cU, 0x2c1fb8c1U, 0x65517f46U,
+    0x5eea049dU, 0x8c355d01U, 0x877473faU, 0x0b412efbU,
+    0x671d5ab3U, 0xdbd25292U, 0x105633e9U, 0xd647136dU,
+    0xd7618c9aU, 0xa10c7a37U, 0xf8148e59U, 0x133c89ebU,
+    0xa927eeceU, 0x61c935b7U, 0x1ce5ede1U, 0x47b13c7aU,
+    0xd2df599cU, 0xf2733f55U, 0x14ce7918U, 0xc737bf73U,
+    0xf7cdea53U, 0xfdaa5b5fU, 0x3d6f14dfU, 0x44db8678U,
+    0xaff381caU, 0x68c43eb9U, 0x24342c38U, 0xa3405fc2U,
+    0x1dc37216U, 0xe2250cbcU, 0x3c498b28U, 0x0d9541ffU,
+    0xa8017139U, 0x0cb3de08U, 0xb4e49cd8U, 0x56c19064U,
+    0xcb84617bU, 0x32b670d5U, 0x6c5c7448U, 0xb85742d0U,
+};
+static const unsigned long Td4[256] = {
+    0x52525252U, 0x09090909U, 0x6a6a6a6aU, 0xd5d5d5d5U,
+    0x30303030U, 0x36363636U, 0xa5a5a5a5U, 0x38383838U,
+    0xbfbfbfbfU, 0x40404040U, 0xa3a3a3a3U, 0x9e9e9e9eU,
+    0x81818181U, 0xf3f3f3f3U, 0xd7d7d7d7U, 0xfbfbfbfbU,
+    0x7c7c7c7cU, 0xe3e3e3e3U, 0x39393939U, 0x82828282U,
+    0x9b9b9b9bU, 0x2f2f2f2fU, 0xffffffffU, 0x87878787U,
+    0x34343434U, 0x8e8e8e8eU, 0x43434343U, 0x44444444U,
+    0xc4c4c4c4U, 0xdedededeU, 0xe9e9e9e9U, 0xcbcbcbcbU,
+    0x54545454U, 0x7b7b7b7bU, 0x94949494U, 0x32323232U,
+    0xa6a6a6a6U, 0xc2c2c2c2U, 0x23232323U, 0x3d3d3d3dU,
+    0xeeeeeeeeU, 0x4c4c4c4cU, 0x95959595U, 0x0b0b0b0bU,
+    0x42424242U, 0xfafafafaU, 0xc3c3c3c3U, 0x4e4e4e4eU,
+    0x08080808U, 0x2e2e2e2eU, 0xa1a1a1a1U, 0x66666666U,
+    0x28282828U, 0xd9d9d9d9U, 0x24242424U, 0xb2b2b2b2U,
+    0x76767676U, 0x5b5b5b5bU, 0xa2a2a2a2U, 0x49494949U,
+    0x6d6d6d6dU, 0x8b8b8b8bU, 0xd1d1d1d1U, 0x25252525U,
+    0x72727272U, 0xf8f8f8f8U, 0xf6f6f6f6U, 0x64646464U,
+    0x86868686U, 0x68686868U, 0x98989898U, 0x16161616U,
+    0xd4d4d4d4U, 0xa4a4a4a4U, 0x5c5c5c5cU, 0xccccccccU,
+    0x5d5d5d5dU, 0x65656565U, 0xb6b6b6b6U, 0x92929292U,
+    0x6c6c6c6cU, 0x70707070U, 0x48484848U, 0x50505050U,
+    0xfdfdfdfdU, 0xededededU, 0xb9b9b9b9U, 0xdadadadaU,
+    0x5e5e5e5eU, 0x15151515U, 0x46464646U, 0x57575757U,
+    0xa7a7a7a7U, 0x8d8d8d8dU, 0x9d9d9d9dU, 0x84848484U,
+    0x90909090U, 0xd8d8d8d8U, 0xababababU, 0x00000000U,
+    0x8c8c8c8cU, 0xbcbcbcbcU, 0xd3d3d3d3U, 0x0a0a0a0aU,
+    0xf7f7f7f7U, 0xe4e4e4e4U, 0x58585858U, 0x05050505U,
+    0xb8b8b8b8U, 0xb3b3b3b3U, 0x45454545U, 0x06060606U,
+    0xd0d0d0d0U, 0x2c2c2c2cU, 0x1e1e1e1eU, 0x8f8f8f8fU,
+    0xcacacacaU, 0x3f3f3f3fU, 0x0f0f0f0fU, 0x02020202U,
+    0xc1c1c1c1U, 0xafafafafU, 0xbdbdbdbdU, 0x03030303U,
+    0x01010101U, 0x13131313U, 0x8a8a8a8aU, 0x6b6b6b6bU,
+    0x3a3a3a3aU, 0x91919191U, 0x11111111U, 0x41414141U,
+    0x4f4f4f4fU, 0x67676767U, 0xdcdcdcdcU, 0xeaeaeaeaU,
+    0x97979797U, 0xf2f2f2f2U, 0xcfcfcfcfU, 0xcecececeU,
+    0xf0f0f0f0U, 0xb4b4b4b4U, 0xe6e6e6e6U, 0x73737373U,
+    0x96969696U, 0xacacacacU, 0x74747474U, 0x22222222U,
+    0xe7e7e7e7U, 0xadadadadU, 0x35353535U, 0x85858585U,
+    0xe2e2e2e2U, 0xf9f9f9f9U, 0x37373737U, 0xe8e8e8e8U,
+    0x1c1c1c1cU, 0x75757575U, 0xdfdfdfdfU, 0x6e6e6e6eU,
+    0x47474747U, 0xf1f1f1f1U, 0x1a1a1a1aU, 0x71717171U,
+    0x1d1d1d1dU, 0x29292929U, 0xc5c5c5c5U, 0x89898989U,
+    0x6f6f6f6fU, 0xb7b7b7b7U, 0x62626262U, 0x0e0e0e0eU,
+    0xaaaaaaaaU, 0x18181818U, 0xbebebebeU, 0x1b1b1b1bU,
+    0xfcfcfcfcU, 0x56565656U, 0x3e3e3e3eU, 0x4b4b4b4bU,
+    0xc6c6c6c6U, 0xd2d2d2d2U, 0x79797979U, 0x20202020U,
+    0x9a9a9a9aU, 0xdbdbdbdbU, 0xc0c0c0c0U, 0xfefefefeU,
+    0x78787878U, 0xcdcdcdcdU, 0x5a5a5a5aU, 0xf4f4f4f4U,
+    0x1f1f1f1fU, 0xddddddddU, 0xa8a8a8a8U, 0x33333333U,
+    0x88888888U, 0x07070707U, 0xc7c7c7c7U, 0x31313131U,
+    0xb1b1b1b1U, 0x12121212U, 0x10101010U, 0x59595959U,
+    0x27272727U, 0x80808080U, 0xececececU, 0x5f5f5f5fU,
+    0x60606060U, 0x51515151U, 0x7f7f7f7fU, 0xa9a9a9a9U,
+    0x19191919U, 0xb5b5b5b5U, 0x4a4a4a4aU, 0x0d0d0d0dU,
+    0x2d2d2d2dU, 0xe5e5e5e5U, 0x7a7a7a7aU, 0x9f9f9f9fU,
+    0x93939393U, 0xc9c9c9c9U, 0x9c9c9c9cU, 0xefefefefU,
+    0xa0a0a0a0U, 0xe0e0e0e0U, 0x3b3b3b3bU, 0x4d4d4d4dU,
+    0xaeaeaeaeU, 0x2a2a2a2aU, 0xf5f5f5f5U, 0xb0b0b0b0U,
+    0xc8c8c8c8U, 0xebebebebU, 0xbbbbbbbbU, 0x3c3c3c3cU,
+    0x83838383U, 0x53535353U, 0x99999999U, 0x61616161U,
+    0x17171717U, 0x2b2b2b2bU, 0x04040404U, 0x7e7e7e7eU,
+    0xbabababaU, 0x77777777U, 0xd6d6d6d6U, 0x26262626U,
+    0xe1e1e1e1U, 0x69696969U, 0x14141414U, 0x63636363U,
+    0x55555555U, 0x21212121U, 0x0c0c0c0cU, 0x7d7d7d7dU,
+};
+static const unsigned long rcon[] = {
+    0x01000000, 0x02000000, 0x04000000, 0x08000000,
+    0x10000000, 0x20000000, 0x40000000, 0x80000000,
+    0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
+};
diff --git a/changes b/changes
index e2cf92d..c2cbf35 100644
--- a/changes
+++ b/changes
@@ -1,3 +1,8 @@
+Jun 11th, 2003
+v0.85  -- Swapped in a new AES routine
+       -- Removed Serpent
+       -- Added TDCAL policy document
+       
 Jun 1st, 2003
 v0.84  -- Removed a 4KB buffer from rsa_decrypt_key that wasn't being used no more
        -- Fixed another potential buffer problem.  Not an overflow but could cause the 
diff --git a/config.pl b/config.pl
index cee3453..c00fcb5 100644
--- a/config.pl
+++ b/config.pl
@@ -30,7 +30,6 @@
    "RC2,Include RC2 block cipher,y",
    "RC5,Include RC5 block cipher,y",
    "RC6,Include RC6 block cipher,y",
-   "SERPENT,Include Serpent block cipher,y",
    "SAFERP,Include Safer+ block cipher,y",
    "SAFER,Include Safer-64 block ciphers,y",
    "RIJNDAEL,Include Rijndael (AES) block cipher,y",
@@ -145,7 +144,7 @@ for (@settings) {
 
 # output objects
 print OUT "\ndefault: library\n\n";
-print OUT "OBJECTS = keyring.o gf.o mem.o sprng.o ecc.o base64.o dh.o rsa.o bits.o yarrow.o cfb.o ofb.o ecb.o ctr.o cbc.o hash.o tiger.o sha1.o md5.o md4.o md2.o sha256.o sha512.o xtea.o aes.o serpent.o des.o safer_tab.o safer.o safer+.o rc4.o rc2.o rc6.o rc5.o cast5.o noekeon.o blowfish.o crypt.o mpi.o prime.o twofish.o packet.o hmac.o strings.o\n\n";
+print OUT "OBJECTS = keyring.o gf.o mem.o sprng.o ecc.o base64.o dh.o rsa.o bits.o yarrow.o cfb.o ofb.o ecb.o ctr.o cbc.o hash.o tiger.o sha1.o md5.o md4.o md2.o sha256.o sha512.o xtea.o aes.o des.o safer_tab.o safer.o safer+.o rc4.o rc2.o rc6.o rc5.o cast5.o noekeon.o blowfish.o crypt.o mpi.o prime.o twofish.o packet.o hmac.o strings.o \n\n";
 
 # some depends
 print OUT "rsa.o: rsa_sys.c\ndh.o: dh_sys.c\necc.o: ecc_sys.c\n\n";
diff --git a/crypt.c b/crypt.c
index 09d7797..f3f36dd 100644
--- a/crypt.c
+++ b/crypt.c
@@ -384,9 +384,6 @@ const char *crypt_build_settings =
 #if defined(RC6)
    "   RC6\n"
 #endif
-#if defined(SERPENT)
-   "   Serpent\n"
-#endif
 #if defined(SAFERP)
    "   Safer+\n"
 #endif
diff --git a/crypt.out b/crypt.out
new file mode 100644
index 0000000..7471011
--- /dev/null
+++ b/crypt.out
@@ -0,0 +1,76 @@
+\BOOKMARK [0][-]{chapter.1}{Introduction}{}
+\BOOKMARK [1][-]{section.1.1}{What is the LibTomCrypt?}{chapter.1}
+\BOOKMARK [2][-]{subsection.1.1.1}{What the library IS for?}{section.1.1}
+\BOOKMARK [2][-]{subsection.1.1.2}{What the library IS NOT for?}{section.1.1}
+\BOOKMARK [1][-]{section.1.2}{Why did I write it?}{chapter.1}
+\BOOKMARK [2][-]{subsection.1.2.1}{Modular}{section.1.2}
+\BOOKMARK [1][-]{section.1.3}{License}{chapter.1}
+\BOOKMARK [1][-]{section.1.4}{Patent Disclosure}{chapter.1}
+\BOOKMARK [1][-]{section.1.5}{Building the library}{chapter.1}
+\BOOKMARK [1][-]{section.1.6}{Building against the library}{chapter.1}
+\BOOKMARK [1][-]{section.1.7}{Thanks}{chapter.1}
+\BOOKMARK [0][-]{chapter.2}{The Application Programming Interface \(API\)}{}
+\BOOKMARK [1][-]{section.2.1}{Introduction}{chapter.2}
+\BOOKMARK [1][-]{section.2.2}{Macros}{chapter.2}
+\BOOKMARK [1][-]{section.2.3}{Functions with Variable Length Output}{chapter.2}
+\BOOKMARK [1][-]{section.2.4}{Functions that need a PRNG}{chapter.2}
+\BOOKMARK [1][-]{section.2.5}{Functions that use Arrays of Octets}{chapter.2}
+\BOOKMARK [0][-]{chapter.3}{Symmetric Block Ciphers}{}
+\BOOKMARK [1][-]{section.3.1}{Core Functions}{chapter.3}
+\BOOKMARK [1][-]{section.3.2}{Key Sizes and Number of Rounds}{chapter.3}
+\BOOKMARK [1][-]{section.3.3}{The Cipher Descriptors}{chapter.3}
+\BOOKMARK [2][-]{subsection.3.3.1}{Notes}{section.3.3}
+\BOOKMARK [1][-]{section.3.4}{Symmetric Modes of Operations}{chapter.3}
+\BOOKMARK [2][-]{subsection.3.4.1}{Background}{section.3.4}
+\BOOKMARK [2][-]{subsection.3.4.2}{Choice of Mode}{section.3.4}
+\BOOKMARK [2][-]{subsection.3.4.3}{Implementation}{section.3.4}
+\BOOKMARK [0][-]{chapter.4}{One-Way Cryptographic Hash Functions}{}
+\BOOKMARK [1][-]{section.4.1}{Core Functions}{chapter.4}
+\BOOKMARK [1][-]{section.4.2}{Hash Descriptors}{chapter.4}
+\BOOKMARK [2][-]{subsection.4.2.1}{Notice}{section.4.2}
+\BOOKMARK [1][-]{section.4.3}{Hash based Message Authenication Codes}{chapter.4}
+\BOOKMARK [0][-]{chapter.5}{Pseudo-Random Number Generators}{}
+\BOOKMARK [1][-]{section.5.1}{Core Functions}{chapter.5}
+\BOOKMARK [2][-]{subsection.5.1.1}{Remarks}{section.5.1}
+\BOOKMARK [2][-]{subsection.5.1.2}{Example}{section.5.1}
+\BOOKMARK [1][-]{section.5.2}{PRNG Descriptors}{chapter.5}
+\BOOKMARK [1][-]{section.5.3}{The Secure RNG}{chapter.5}
+\BOOKMARK [2][-]{subsection.5.3.1}{The Secure PRNG Interface}{section.5.3}
+\BOOKMARK [0][-]{chapter.6}{RSA Routines}{}
+\BOOKMARK [1][-]{section.6.1}{Background}{chapter.6}
+\BOOKMARK [1][-]{section.6.2}{Core Functions}{chapter.6}
+\BOOKMARK [1][-]{section.6.3}{Packet Routines}{chapter.6}
+\BOOKMARK [1][-]{section.6.4}{Remarks}{chapter.6}
+\BOOKMARK [0][-]{chapter.7}{Diffie-Hellman Key Exchange}{}
+\BOOKMARK [1][-]{section.7.1}{Background}{chapter.7}
+\BOOKMARK [1][-]{section.7.2}{Core Functions}{chapter.7}
+\BOOKMARK [2][-]{subsection.7.2.1}{Remarks on Usage}{section.7.2}
+\BOOKMARK [2][-]{subsection.7.2.2}{Remarks on The Snippet}{section.7.2}
+\BOOKMARK [1][-]{section.7.3}{Other Diffie-Hellman Functions}{chapter.7}
+\BOOKMARK [1][-]{section.7.4}{DH Packet}{chapter.7}
+\BOOKMARK [0][-]{chapter.8}{Elliptic Curve Cryptography}{}
+\BOOKMARK [1][-]{section.8.1}{Background}{chapter.8}
+\BOOKMARK [1][-]{section.8.2}{Core Functions}{chapter.8}
+\BOOKMARK [1][-]{section.8.3}{ECC Packet}{chapter.8}
+\BOOKMARK [1][-]{section.8.4}{ECC Keysizes}{chapter.8}
+\BOOKMARK [0][-]{chapter.9}{Public Keyrings}{}
+\BOOKMARK [1][-]{section.9.1}{Introduction}{chapter.9}
+\BOOKMARK [1][-]{section.9.2}{The Keyring API}{chapter.9}
+\BOOKMARK [0][-]{chapter.10}{GF\(2w\) Math Routines}{}
+\BOOKMARK [0][-]{chapter.11}{Miscellaneous}{}
+\BOOKMARK [1][-]{section.11.1}{Base64 Encoding and Decoding}{chapter.11}
+\BOOKMARK [1][-]{section.11.2}{The Multiple Precision Integer Library \(MPI\)}{chapter.11}
+\BOOKMARK [2][-]{subsection.11.2.1}{Binary Forms of ``mp\137int'' Variables}{section.11.2}
+\BOOKMARK [2][-]{subsection.11.2.2}{Primality Testing}{section.11.2}
+\BOOKMARK [0][-]{chapter.12}{Programming Guidelines}{}
+\BOOKMARK [1][-]{section.12.1}{Secure Pseudo Random Number Generators}{chapter.12}
+\BOOKMARK [1][-]{section.12.2}{Preventing Trivial Errors}{chapter.12}
+\BOOKMARK [1][-]{section.12.3}{Registering Your Algorithms}{chapter.12}
+\BOOKMARK [1][-]{section.12.4}{Key Sizes}{chapter.12}
+\BOOKMARK [2][-]{subsection.12.4.1}{Symmetric Ciphers}{section.12.4}
+\BOOKMARK [2][-]{subsection.12.4.2}{Assymetric Ciphers}{section.12.4}
+\BOOKMARK [1][-]{section.12.5}{Thread Safety}{chapter.12}
+\BOOKMARK [0][-]{chapter.13}{Configuring the Library}{}
+\BOOKMARK [1][-]{section.13.1}{Introduction}{chapter.13}
+\BOOKMARK [1][-]{section.13.2}{mycrypt\137cfg.h}{chapter.13}
+\BOOKMARK [1][-]{section.13.3}{The Configure Script}{chapter.13}
diff --git a/crypt.pdf b/crypt.pdf
index 0fe991a..9612855 100644
Binary files a/crypt.pdf and b/crypt.pdf differ
diff --git a/crypt.tex b/crypt.tex
index 3b64457..7141629 100644
--- a/crypt.tex
+++ b/crypt.tex
@@ -1,4 +1,5 @@
 \documentclass{book}
+\usepackage{hyperref}
 \usepackage{makeidx}
 \usepackage{amssymb}
 \usepackage{color}
@@ -41,14 +42,12 @@
 \def\C{{\mathbb C}}
 \def\Q{{\mathbb Q}}
 
-\newcommand{\url}[1]{\mbox{$<${#1}$>$}}
-\newcommand{\emailaddr}[1]{\mbox{$<${#1}$>$}}
 \def\twiddle{\raisebox{0.3ex}{\mbox{\tiny $\sim$}}}
 
 \def\gap{\vspace{0.5ex}}
 \makeindex
 \begin{document}
-\title{A Tiny Crypto Library, \\ LibTomCrypt \\ Version 0.84}
+\title{A Tiny Crypto Library, \\ LibTomCrypt \\ Version 0.85}
 \author{Tom St Denis \\
 Algonquin College \\
 \\
@@ -162,14 +161,11 @@ All of the source code except for the following files have been written by the a
 under the TDCAL license:
 
 \begin{enumerate}
-   \item aes.c
    \item rc2.c
-   \item serpent.c
    \item safer.c
 \end{enumerate}
 
-``aes.c'' and ``serpent.c'' were written by Brian Gladman (gladman@seven77.demon.co.uk).  They are copyrighted works
-but were both granted unrestricted usage in any project (commercial or otherwise).  ``mpi.c'' was originally written 
+`mpi.c'' was originally written 
 by Michael Fromberger (sting@linguist.dartmouth.edu) but has since been replaced with my LibTomMath library.  
 ``rc2.c'' is based on publicly available code that is not attributed to a person from the given source.  ``safer.c'' 
 was written by Richard De Moliner (demoliner@isi.ee.ethz.ch) and is public domain.
@@ -512,7 +508,6 @@ As of this release the current cipher\_descriptors elements are
      \hline Safer SK64  & safer\_sk64\_desc & 8 & 8 & 6 .. 13 \\
      \hline Safer K128  & safer\_k128\_desc & 8 & 16 & 6 .. 13 \\
      \hline Safer SK128 & safer\_sk128\_desc & 8 & 16 & 6 .. 13 \\
-     \hline Serpent & serpent\_desc & 16 & 16 .. 32 & 32 \\
      \hline AES & aes\_desc & 16 & 16, 24, 32 & 10, 12, 14 \\
      \hline Twofish & twofish\_desc & 16 & 16, 24, 32 & 16 \\
      \hline DES & des\_desc & 8 & 7 & 16 \\
diff --git a/demos/test.c b/demos/test.c
index dcdceaf..cc66c10 100644
--- a/demos/test.c
+++ b/demos/test.c
@@ -506,7 +506,6 @@ pad_test (void)
     }
   printf ("passed.\n");
 }
-
 void
 rsa_test (void)
 {
@@ -658,9 +657,6 @@ rsa_test (void)
       rsa_free (&key);
     }
   }
-
-
-
 }
 #else
 void
@@ -1296,6 +1292,9 @@ test_prime (void)
 void
 register_all_algs (void)
 {
+#ifdef RIJNDAEL
+  register_cipher (&aes_desc);
+#endif
 #ifdef BLOWFISH
   register_cipher (&blowfish_desc);
 #endif
@@ -1311,12 +1310,6 @@ register_all_algs (void)
 #ifdef SAFERP
   register_cipher (&saferp_desc);
 #endif
-#ifdef SERPENT
-  register_cipher (&serpent_desc);
-#endif
-#ifdef RIJNDAEL
-  register_cipher (&aes_desc);
-#endif
 #ifdef TWOFISH
   register_cipher (&twofish_desc);
 #endif
@@ -1375,6 +1368,7 @@ register_all_algs (void)
 #endif
 }
 
+#ifdef KR
 void
 kr_display (pk_key * kr)
 {
@@ -1664,8 +1658,8 @@ kr_test (void)
   }
 
   kr_clear (&kr);
-
 }
+#endif
 
 void
 test_errs (void)
@@ -1715,7 +1709,7 @@ main (void)
 #endif
 
   register_all_algs ();
-
+  
   if ((errnum = yarrow_start (&prng)) != CRYPT_OK) {
     printf ("yarrow_start: %s\n", error_to_string (errnum));
   }
@@ -1746,7 +1740,9 @@ main (void)
   rng_tests ();
   //test_prime();
 
+#ifdef KR
   kr_test ();
+#endif  
   rsa_test ();
   pad_test ();
   ecc_tests ();
diff --git a/legal.txt b/legal.txt
index 0603d4f..6ef478e 100644
--- a/legal.txt
+++ b/legal.txt
@@ -4,19 +4,6 @@ Tom St Denis
 The bulk of the code was written or donated under the TDCAL "Tom Doesn't Care About License" license.  It entitles the developer to free-reign on
 the use and distribution of derived works, commercial or otherwise.  Certain files are taken from public domain packages.
 
-AES.C
------
-Author: Dr Brian Gladman
-Email : gladman@seven77.demon.co.uk
-Disclaimer (verbatim)
-----
-/* Copyright in this implementation is held by Dr B R Gladman but I     */
-/* hereby give permission for its free direct or derivative use subject */
-/* to acknowledgment of its origin and compliance with any conditions   */
-/* that the originators of the algorithm place on its exploitation.     */
-----
-Status:  Public Domain, modified [not original]
-
 DES.C
 -----
 Author:  Unknown,  Submitted by Dobes Vandermeer
@@ -63,18 +50,4 @@ Author: [copied verbatim]
 ----
 Email: demoliner@isi.ee.ethz.ch
 Disclaimer:  Appears to be Public Domain [not quite sure]
-Status: Public Domain, modified [not original]
-
-SERPENT.C
----------
-Author:  Dr. Brian Gladman
-Email : gladman@seven77.demon.co.uk
-Disclaimer (verbatim)
-----
-/* Copyright in this implementation is held by Dr B R Gladman but I     */
-/* hereby give permission for its free direct or derivative use subject */
-/* to acknowledgment of its origin and compliance with any conditions   */
-/* that the originators of the algorithm place on its exploitation.     */
-----
-Status:  Public Domain, modified [not original]
-
+Status: Public Domain, modified [not original]
\ No newline at end of file
diff --git a/makefile b/makefile
index f2255a9..136a43d 100644
--- a/makefile
+++ b/makefile
@@ -9,7 +9,7 @@
 # a build. This is easy to remedy though, for those that have problems.
 
 # The version
-VERSION=0.84
+VERSION=0.85
 
 #ch1-01-1
 # Compiler and Linker Names
@@ -55,9 +55,9 @@ DATAPATH=/usr/share/doc/libtomcrypt/pdf
 #List of objects to compile.
 OBJECTS=keyring.o gf.o mem.o sprng.o ecc.o base64.o dh.o rsa.o \
 bits.o yarrow.o cfb.o ofb.o ecb.o ctr.o cbc.o hash.o tiger.o sha1.o \
-md5.o md4.o md2.o sha256.o sha512.o xtea.o aes.o serpent.o des.o \
+md5.o md4.o md2.o sha256.o sha512.o xtea.o aes.o des.o \
 safer_tab.o safer.o safer+.o rc4.o rc2.o rc6.o rc5.o cast5.o noekeon.o blowfish.o crypt.o \
-mpi.o prime.o twofish.o packet.o hmac.o strings.o
+mpi.o prime.o twofish.o packet.o hmac.o strings.o 
 
 TESTOBJECTS=demos/test.o
 HASHOBJECTS=demos/hashsum.o
diff --git a/makefile.msvc b/makefile.msvc
index 381ebb8..34f0e63 100644
--- a/makefile.msvc
+++ b/makefile.msvc
@@ -2,16 +2,17 @@
 #
 #Tom St Denis
 
-CFLAGS = /I. /Ogiyb1s /Gs /DWIN32 /W3
+# note optimizations are turned off because it causes a bug in aes.c that cannot be rectified [right away]
+CFLAGS = /I. /Od /G3 /DWIN32 /W3
 
 default: library
 
 #List of objects to compile.
 OBJECTS=keyring.obj gf.obj mem.obj sprng.obj ecc.obj base64.obj dh.obj rsa.obj \
 bits.obj yarrow.obj cfb.obj ofb.obj ecb.obj ctr.obj cbc.obj hash.obj tiger.obj sha1.obj \
-md5.obj md4.obj md2.obj sha256.obj sha512.obj xtea.obj aes.obj serpent.obj des.obj \
+md5.obj md4.obj md2.obj sha256.obj sha512.obj xtea.obj aes.obj des.obj \
 safer_tab.obj safer.obj safer+.obj rc4.obj rc2.obj rc6.obj rc5.obj cast5.obj noekeon.obj \
-blowfish.obj crypt.obj mpi.obj prime.obj twofish.obj packet.obj hmac.obj strings.obj
+blowfish.obj crypt.obj mpi.obj prime.obj twofish.obj packet.obj hmac.obj strings.obj 
 
 library: $(OBJECTS)
 	lib /out:tomcrypt.lib $(OBJECTS)
diff --git a/makefile.out b/makefile.out
index bb217ef..fd73e82 100644
--- a/makefile.out
+++ b/makefile.out
@@ -5,11 +5,11 @@
 CC = gcc 
 AR = ar 
 LD = ld 
-CFLAGS += -O3 -Wall -Wsign-compare -W -Wno-unused -Werror -I./  
+CFLAGS += -Os -Wall -Wsign-compare -W -Wno-unused -Werror -I./  
 
 default: library
 
-OBJECTS = keyring.o gf.o mem.o sprng.o ecc.o base64.o dh.o rsa.o bits.o yarrow.o cfb.o ofb.o ecb.o ctr.o cbc.o hash.o tiger.o sha1.o md5.o md4.o md2.o sha256.o sha512.o xtea.o aes.o serpent.o des.o safer_tab.o safer.o safer+.o rc4.o rc2.o rc6.o rc5.o cast5.o noekeon.o blowfish.o crypt.o ampi.o mpi.o prime.o twofish.o packet.o hmac.o strings.o
+OBJECTS = keyring.o gf.o mem.o sprng.o ecc.o base64.o dh.o rsa.o bits.o yarrow.o cfb.o ofb.o ecb.o ctr.o cbc.o hash.o tiger.o sha1.o md5.o md4.o md2.o sha256.o sha512.o xtea.o aes.o des.o safer_tab.o safer.o safer+.o rc4.o rc2.o rc6.o rc5.o cast5.o noekeon.o blowfish.o crypt.o mpi.o prime.o twofish.o packet.o hmac.o strings.o 
 
 rsa.o: rsa_sys.c
 dh.o: dh_sys.c
diff --git a/md4.c b/md4.c
index 6c6003f..d7b8d5e 100644
--- a/md4.c
+++ b/md4.c
@@ -29,7 +29,7 @@ const struct _hash_descriptor md4_desc =
 #define S34 15
 
 /* F, G and H are basic MD4 functions. */
-#define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
+#define F(x, y, z) (z ^ (x & (y ^ z)))
 #define G(x, y, z) ((x & y) | (z & (x | y)))
 #define H(x, y, z) ((x) ^ (y) ^ (z))
 
diff --git a/md5.c b/md5.c
index ef999c2..9b78fa3 100644
--- a/md5.c
+++ b/md5.c
@@ -14,7 +14,7 @@ const struct _hash_descriptor md5_desc =
     &md5_test
 };
 
-#define F(x,y,z)  ((x&y)|((~x)&z))
+#define F(x,y,z)  (z ^ (x & (y ^ z)))
 #define G(x,y,z)  ((x&z)|(y&(~z)))
 #define H(x,y,z)  (x^y^z)
 #define I(x,y,z)  (y^(x|(~z)))
diff --git a/mpi.c b/mpi.c
index 383d198..4f97286 100644
--- a/mpi.c
+++ b/mpi.c
@@ -1,1610 +1,1610 @@
-/* Start: bn_fast_mp_invmod.c */
-#line 0 "bn_fast_mp_invmod.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
+/* Start: bn_fast_mp_invmod.c */
+#line 0 "bn_fast_mp_invmod.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
 #include "mycrypt.h"
-#include <tommath.h>
-
-/* computes the modular inverse via binary extended euclidean algorithm, 
- * that is c = 1/a mod b 
- *
- * Based on mp_invmod except this is optimized for the case where b is 
- * odd as per HAC Note 14.64 on pp. 610
- */
-int
-fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
-{
-  mp_int  x, y, u, v, B, D;
-  int     res, neg;
-
-  /* init all our temps */
-  if ((res = mp_init_multi(&x, &y, &u, &v, &B, &D, NULL)) != MP_OKAY) {
-     return res;
-  }
-
-  /* x == modulus, y == value to invert */
-  if ((res = mp_copy (b, &x)) != MP_OKAY) {
-    goto __ERR;
-  }
-
-  /* we need y = |a| */
-  if ((res = mp_abs (a, &y)) != MP_OKAY) {
-    goto __ERR;
-  }
-
-  /* 2. [modified] if x,y are both even then return an error! 
-   * 
-   * That is if gcd(x,y) = 2 * k then obviously there is no inverse.
-   */
-  if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
-    res = MP_VAL;
-    goto __ERR;
-  }
-
-  /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
-  if ((res = mp_copy (&x, &u)) != MP_OKAY) {
-    goto __ERR;
-  }
-  if ((res = mp_copy (&y, &v)) != MP_OKAY) {
-    goto __ERR;
-  }
-  mp_set (&D, 1);
-
-top:
-  /* 4.  while u is even do */
-  while (mp_iseven (&u) == 1) {
-    /* 4.1 u = u/2 */
-    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
-      goto __ERR;
-    }
-    /* 4.2 if A or B is odd then */
-    if (mp_iseven (&B) == 0) {
-      if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
-        goto __ERR;
-      }
-    }
-    /* B = B/2 */
-    if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
-      goto __ERR;
-    }
-  }
-
-  /* 5.  while v is even do */
-  while (mp_iseven (&v) == 1) {
-    /* 5.1 v = v/2 */
-    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
-      goto __ERR;
-    }
-    /* 5.2 if C,D are even then */
-    if (mp_iseven (&D) == 0) {
-      /* D = (D-x)/2 */
-      if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
-        goto __ERR;
-      }
-    }
-    /* D = D/2 */
-    if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
-      goto __ERR;
-    }
-  }
-
-  /* 6.  if u >= v then */
-  if (mp_cmp (&u, &v) != MP_LT) {
-    /* u = u - v, B = B - D */
-    if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
-      goto __ERR;
-    }
-
-    if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
-      goto __ERR;
-    }
-  } else {
-    /* v - v - u, D = D - B */
-    if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
-      goto __ERR;
-    }
-
-    if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
-      goto __ERR;
-    }
-  }
-
-  /* if not zero goto step 4 */
-  if (mp_iszero (&u) == 0) {
-    goto top;
-  }
-
-  /* now a = C, b = D, gcd == g*v */
-
-  /* if v != 1 then there is no inverse */
-  if (mp_cmp_d (&v, 1) != MP_EQ) {
-    res = MP_VAL;
-    goto __ERR;
-  }
-
-  /* b is now the inverse */
-  neg = a->sign;
-  while (D.sign == MP_NEG) {
-    if ((res = mp_add (&D, b, &D)) != MP_OKAY) {
-      goto __ERR;
-    }
-  }
-  mp_exch (&D, c);
-  c->sign = neg;
-  res = MP_OKAY;
-
-__ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
-  return res;
-}
-
-/* End: bn_fast_mp_invmod.c */
-
-/* Start: bn_fast_mp_montgomery_reduce.c */
-#line 0 "bn_fast_mp_montgomery_reduce.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* computes xR**-1 == x (mod N) via Montgomery Reduction 
- * 
- * This is an optimized implementation of mp_montgomery_reduce 
- * which uses the comba method to quickly calculate the columns of the
- * reduction.  
- *
- * Based on Algorithm 14.32 on pp.601 of HAC.
-*/
-int
-fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
-{
-  int     ix, res, olduse;
-  mp_word W[MP_WARRAY];
-
-  /* get old used count */
-  olduse = x->used;
-
-  /* grow a as required */
-  if (x->alloc < n->used + 1) {
-    if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) {
-      return res;
-    }
-  }
-
-  {
-    register mp_word *_W;
-    register mp_digit *tmpx;
-
-    _W = W;
-    tmpx = x->dp;
-
-    /* copy the digits of a into W[0..a->used-1] */
-    for (ix = 0; ix < x->used; ix++) {
-      *_W++ = *tmpx++;
-    }
-
-    /* zero the high words of W[a->used..m->used*2] */
-    for (; ix < n->used * 2 + 1; ix++) {
-      *_W++ = 0;
-    }
-  }
-
-  for (ix = 0; ix < n->used; ix++) {
-    /* mu = ai * m' mod b
-     *
-     * We avoid a double precision multiplication (which isn't required)
-     * by casting the value down to a mp_digit.  Note this requires 
-     * that W[ix-1] have  the carry cleared (see after the inner loop)
-     */
-    register mp_digit mu;
-    mu = (((mp_digit) (W[ix] & MP_MASK)) * rho) & MP_MASK;
-
-    /* a = a + mu * m * b**i
-     *
-     * This is computed in place and on the fly.  The multiplication
-     * by b**i is handled by offseting which columns the results
-     * are added to.
-     *
-     * Note the comba method normally doesn't handle carries in the 
-     * inner loop In this case we fix the carry from the previous 
-     * column since the Montgomery reduction requires digits of the 
-     * result (so far) [see above] to work.  This is
-     * handled by fixing up one carry after the inner loop.  The 
-     * carry fixups are done in order so after these loops the 
-     * first m->used words of W[] have the carries fixed
-     */
-    {
-      register int iy;
-      register mp_digit *tmpn;
-      register mp_word *_W;
-
-      /* alias for the digits of the modulus */
-      tmpn = n->dp;
-
-      /* Alias for the columns set by an offset of ix */
-      _W = W + ix;
-
-      /* inner loop */
-      for (iy = 0; iy < n->used; iy++) {
-          *_W++ += ((mp_word) mu) * ((mp_word) * tmpn++);
-      }
-    }
-
-    /* now fix carry for next digit, W[ix+1] */
-    W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
-  }
-
-
-  {
-    register mp_digit *tmpx;
-    register mp_word *_W, *_W1;
-
-    /* nox fix rest of carries */
-    _W1 = W + ix;
-    _W = W + ++ix;
-
-    for (; ix <= n->used * 2 + 1; ix++) {
-      *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
-    }
-
-    /* copy out, A = A/b**n
-     *
-     * The result is A/b**n but instead of converting from an 
-     * array of mp_word to mp_digit than calling mp_rshd 
-     * we just copy them in the right order
-     */
-    tmpx = x->dp;
-    _W = W + n->used;
-
-    for (ix = 0; ix < n->used + 1; ix++) {
-      *tmpx++ = *_W++ & ((mp_word) MP_MASK);
-    }
-
-    /* zero oldused digits, if the input a was larger than
-     * m->used+1 we'll have to clear the digits */
-    for (; ix < olduse; ix++) {
-      *tmpx++ = 0;
-    }
-  }
-
-  /* set the max used and clamp */
-  x->used = n->used + 1;
-  mp_clamp (x);
-
-  /* if A >= m then A = A - m */
-  if (mp_cmp_mag (x, n) != MP_LT) {
-    return s_mp_sub (x, n, x);
-  }
-  return MP_OKAY;
-}
-
-/* End: bn_fast_mp_montgomery_reduce.c */
-
-/* Start: bn_fast_s_mp_mul_digs.c */
-#line 0 "bn_fast_s_mp_mul_digs.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* Fast (comba) multiplier
- *
- * This is the fast column-array [comba] multiplier.  It is 
- * designed to compute the columns of the product first 
- * then handle the carries afterwards.  This has the effect 
- * of making the nested loops that compute the columns very
- * simple and schedulable on super-scalar processors.
- *
- * This has been modified to produce a variable number of 
- * digits of output so if say only a half-product is required 
- * you don't have to compute the upper half (a feature 
- * required for fast Barrett reduction).
- *
- * Based on Algorithm 14.12 on pp.595 of HAC.
- *
- */
-int
-fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
-{
-  int     olduse, res, pa, ix;
-  mp_word W[MP_WARRAY];
-
-  /* grow the destination as required */
-  if (c->alloc < digs) {
-    if ((res = mp_grow (c, digs)) != MP_OKAY) {
-      return res;
-    }
-  }
-
-  /* clear temp buf (the columns) */
-  memset (W, 0, sizeof (mp_word) * digs);
-
-  /* calculate the columns */
-  pa = a->used;
-  for (ix = 0; ix < pa; ix++) {
-    /* this multiplier has been modified to allow you to 
-     * control how many digits of output are produced.  
-     * So at most we want to make upto "digs" digits of output.
-     *
-     * this adds products to distinct columns (at ix+iy) of W
-     * note that each step through the loop is not dependent on
-     * the previous which means the compiler can easily unroll
-     * the loop without scheduling problems
-     */
-    {
-      register mp_digit tmpx, *tmpy;
-      register mp_word *_W;
-      register int iy, pb;
-
-      /* alias for the the word on the left e.g. A[ix] * A[iy] */
-      tmpx = a->dp[ix];
-
-      /* alias for the right side */
-      tmpy = b->dp;
-
-      /* alias for the columns, each step through the loop adds a new
-         term to each column
-       */
-      _W = W + ix;
-
-      /* the number of digits is limited by their placement.  E.g.
-         we avoid multiplying digits that will end up above the # of
-         digits of precision requested
-       */
-      pb = MIN (b->used, digs - ix);
-
-      for (iy = 0; iy < pb; iy++) {
-        *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
-      }
-    }
-
-  }
-
-  /* setup dest */
-  olduse = c->used;
-  c->used = digs;
-
-  {
-    register mp_digit *tmpc;
-
-    /* At this point W[] contains the sums of each column.  To get the
-     * correct result we must take the extra bits from each column and
-     * carry them down
-     *
-     * Note that while this adds extra code to the multiplier it 
-     * saves time since the carry propagation is removed from the 
-     * above nested loop.This has the effect of reducing the work 
-     * from N*(N+N*c)==N**2 + c*N**2 to N**2 + N*c where c is the 
-     * cost of the shifting.  On very small numbers this is slower 
-     * but on most cryptographic size numbers it is faster.
-     */
-    tmpc = c->dp;
-    for (ix = 1; ix < digs; ix++) {
-      W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
-      *tmpc++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
-    }
-    *tmpc++ = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK));
-
-    /* clear unused */
-    for (; ix < olduse; ix++) {
-      *tmpc++ = 0;
-    }
-  }
-
-  mp_clamp (c);
-  return MP_OKAY;
-}
-
-/* End: bn_fast_s_mp_mul_digs.c */
-
-/* Start: bn_fast_s_mp_mul_high_digs.c */
-#line 0 "bn_fast_s_mp_mul_high_digs.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* this is a modified version of fast_s_mp_mul_digs that only produces
- * output digits *above* digs.  See the comments for fast_s_mp_mul_digs
- * to see how it works.
- *
- * This is used in the Barrett reduction since for one of the multiplications
- * only the higher digits were needed.  This essentially halves the work.
- *
- * Based on Algorithm 14.12 on pp.595 of HAC.
- */
-int
-fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
-{
-  int     oldused, newused, res, pa, pb, ix;
-  mp_word W[MP_WARRAY];
-
-  /* calculate size of product and allocate more space if required */
-  newused = a->used + b->used + 1;
-  if (c->alloc < newused) {
-    if ((res = mp_grow (c, newused)) != MP_OKAY) {
-      return res;
-    }
-  }
-
-  /* like the other comba method we compute the columns first */
-  pa = a->used;
-  pb = b->used;
-  memset (W + digs, 0, (pa + pb + 1 - digs) * sizeof (mp_word));
-  for (ix = 0; ix < pa; ix++) {
-    {
-      register mp_digit tmpx, *tmpy;
-      register int iy;
-      register mp_word *_W;
-
-      /* work todo, that is we only calculate digits that are at "digs" or above  */
-      iy = digs - ix;
-
-      /* copy of word on the left of A[ix] * B[iy] */
-      tmpx = a->dp[ix];
-
-      /* alias for right side */
-      tmpy = b->dp + iy;
-     
-      /* alias for the columns of output.  Offset to be equal to or above the 
-       * smallest digit place requested 
-       */
-      _W = W + digs;     
-      
-      /* skip cases below zero where ix > digs */
-      if (iy < 0) {
-         iy    = abs(iy);
-         tmpy += iy;
-         _W   += iy;
-         iy    = 0;
-      }
-
-      /* compute column products for digits above the minimum */
-      for (; iy < pb; iy++) {
-    *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
-      }
-    }
-  }
-
-  /* setup dest */
-  oldused = c->used;
-  c->used = newused;
-
-  /* now convert the array W downto what we need */
-  for (ix = digs + 1; ix < newused; ix++) {
-    W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
-    c->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
-  }
-  c->dp[(pa + pb + 1) - 1] = (mp_digit) (W[(pa + pb + 1) - 1] & ((mp_word) MP_MASK));
-
-  for (; ix < oldused; ix++) {
-    c->dp[ix] = 0;
-  }
-  mp_clamp (c);
-  return MP_OKAY;
-}
-
-/* End: bn_fast_s_mp_mul_high_digs.c */
-
-/* Start: bn_fast_s_mp_sqr.c */
-#line 0 "bn_fast_s_mp_sqr.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* fast squaring
- *
- * This is the comba method where the columns of the product 
- * are computed first then the carries are computed.  This 
- * has the effect of making a very simple inner loop that 
- * is executed the most
- *
- * W2 represents the outer products and W the inner.
- *
- * A further optimizations is made because the inner 
- * products are of the form "A * B * 2".  The *2 part does 
- * not need to be computed until the end which is good 
- * because 64-bit shifts are slow!
- *
- * Based on Algorithm 14.16 on pp.597 of HAC.
- *
- */
-int
-fast_s_mp_sqr (mp_int * a, mp_int * b)
-{
-  int     olduse, newused, res, ix, pa;
-  mp_word W2[MP_WARRAY], W[MP_WARRAY];
-
-  /* calculate size of product and allocate as required */
-  pa = a->used;
-  newused = pa + pa + 1;
-  if (b->alloc < newused) {
-    if ((res = mp_grow (b, newused)) != MP_OKAY) {
-      return res;
-    }
-  }
-
-  /* zero temp buffer (columns)
-   * Note that there are two buffers.  Since squaring requires
-   * a outter and inner product and the inner product requires
-   * computing a product and doubling it (a relatively expensive
-   * op to perform n**2 times if you don't have to) the inner and
-   * outer products are computed in different buffers.  This way
-   * the inner product can be doubled using n doublings instead of
-   * n**2
-   */
-  memset (W, 0, newused * sizeof (mp_word));
-  memset (W2, 0, newused * sizeof (mp_word));
-
-  /* This computes the inner product.  To simplify the inner N**2 loop
-   * the multiplication by two is done afterwards in the N loop.
-   */
-  for (ix = 0; ix < pa; ix++) {
-    /* compute the outer product
-     *
-     * Note that every outer product is computed
-     * for a particular column only once which means that
-     * there is no need todo a double precision addition
-     */
-    W2[ix + ix] = ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
-
-    {
-      register mp_digit tmpx, *tmpy;
-      register mp_word *_W;
-      register int iy;
-
-      /* copy of left side */
-      tmpx = a->dp[ix];
-
-      /* alias for right side */
-      tmpy = a->dp + (ix + 1);
-
-      /* the column to store the result in */
-      _W = W + (ix + ix + 1);
-
-      /* inner products */
-      for (iy = ix + 1; iy < pa; iy++) {
-          *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
-      }
-    }
-  }
-
-  /* setup dest */
-  olduse  = b->used;
-  b->used = newused;
-
-  /* now compute digits */
-  {
-    register mp_digit *tmpb;
-
-    /* double first value, since the inner products are 
-     * half of what they should be 
-     */
-    W[0] += W[0] + W2[0];
-
-    tmpb = b->dp;
-    for (ix = 1; ix < newused; ix++) {
-      /* double/add next digit */
-      W[ix] += W[ix] + W2[ix];
-
-      W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT));
-      *tmpb++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
-    }
-    /* set the last value.  Note even if the carry is zero 
-     * this is required since the next step will not zero 
-     * it if b originally had a value at b->dp[2*a.used]
-     */
-    *tmpb++ = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK));
-
-    /* clear high digits */
-    for (; ix < olduse; ix++) {
-      *tmpb++ = 0;
-    }
-  }
-
-  mp_clamp (b);
-  return MP_OKAY;
-}
-
-/* End: bn_fast_s_mp_sqr.c */
-
-/* Start: bn_mp_2expt.c */
-#line 0 "bn_mp_2expt.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* computes a = 2**b 
- *
- * Simple algorithm which zeroes the int, grows it then just sets one bit
- * as required.
- */
-int
-mp_2expt (mp_int * a, int b)
-{
-  int     res;
-
-  mp_zero (a);
-  if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) {
-    return res;
-  }
-  a->used = b / DIGIT_BIT + 1;
-  a->dp[b / DIGIT_BIT] = 1 << (b % DIGIT_BIT);
-
-  return MP_OKAY;
-}
-
-/* End: bn_mp_2expt.c */
-
-/* Start: bn_mp_abs.c */
-#line 0 "bn_mp_abs.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* b = |a| 
- *
- * Simple function copies the input and fixes the sign to positive
- */
-int
-mp_abs (mp_int * a, mp_int * b)
-{
-  int     res;
-  if ((res = mp_copy (a, b)) != MP_OKAY) {
-    return res;
-  }
-  b->sign = MP_ZPOS;
-  return MP_OKAY;
-}
-
-/* End: bn_mp_abs.c */
-
-/* Start: bn_mp_add.c */
-#line 0 "bn_mp_add.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* high level addition (handles signs) */
-int
-mp_add (mp_int * a, mp_int * b, mp_int * c)
-{
-  int     sa, sb, res;
-
-  /* get sign of both inputs */
-  sa = a->sign;
-  sb = b->sign;
-
-  /* handle two cases, not four */
-  if (sa == sb) {
-    /* both positive or both negative */
-    /* add their magnitudes, copy the sign */
-    c->sign = sa;
-    res = s_mp_add (a, b, c);
-  } else {
-    /* one positive, the other negative */
-    /* subtract the one with the greater magnitude from */
-    /* the one of the lesser magnitude.  The result gets */
-    /* the sign of the one with the greater magnitude. */
-    if (mp_cmp_mag (a, b) == MP_LT) {
-      c->sign = sb;
-      res = s_mp_sub (b, a, c);
-    } else {
-      c->sign = sa;
-      res = s_mp_sub (a, b, c);
-    }
-  }
-  return res;
-}
-
-
-/* End: bn_mp_add.c */
-
-/* Start: bn_mp_add_d.c */
-#line 0 "bn_mp_add_d.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* single digit addition */
-int
-mp_add_d (mp_int * a, mp_digit b, mp_int * c)
-{
-  mp_int  t;
-  int     res;
-
-  if ((res = mp_init_size(&t, 1)) != MP_OKAY) {
-    return res;
-  }
-  mp_set (&t, b);
-  res = mp_add (a, &t, c);
-
-  mp_clear (&t);
-  return res;
-}
-
-/* End: bn_mp_add_d.c */
-
-/* Start: bn_mp_addmod.c */
-#line 0 "bn_mp_addmod.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* d = a + b (mod c) */
-int
-mp_addmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
-{
-  int     res;
-  mp_int  t;
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_add (a, b, &t)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
-  }
-  res = mp_mod (&t, c, d);
-  mp_clear (&t);
-  return res;
-}
-
-/* End: bn_mp_addmod.c */
-
-/* Start: bn_mp_and.c */
-#line 0 "bn_mp_and.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* AND two ints together */
-int
-mp_and (mp_int * a, mp_int * b, mp_int * c)
-{
-  int     res, ix, px;
-  mp_int  t, *x;
-
-  if (a->used > b->used) {
-    if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-      return res;
-    }
-    px = b->used;
-    x = b;
-  } else {
-    if ((res = mp_init_copy (&t, b)) != MP_OKAY) {
-      return res;
-    }
-    px = a->used;
-    x = a;
-  }
-
-  for (ix = 0; ix < px; ix++) {
-    t.dp[ix] &= x->dp[ix];
-  }
-
-  /* zero digits above the last from the smallest mp_int */
-  for (; ix < t.used; ix++) {
-    t.dp[ix] = 0;
-  }
-
-  mp_clamp (&t);
-  mp_exch (c, &t);
-  mp_clear (&t);
-  return MP_OKAY;
-}
-
-/* End: bn_mp_and.c */
-
-/* Start: bn_mp_clamp.c */
-#line 0 "bn_mp_clamp.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* trim unused digits 
- *
- * This is used to ensure that leading zero digits are
- * trimed and the leading "used" digit will be non-zero
- * Typically very fast.  Also fixes the sign if there
- * are no more leading digits
- */
-void
-mp_clamp (mp_int * a)
-{
-  while (a->used > 0 && a->dp[a->used - 1] == 0) {
-    --(a->used);
-  }
-  if (a->used == 0) {
-    a->sign = MP_ZPOS;
-  }
-}
-
-/* End: bn_mp_clamp.c */
-
-/* Start: bn_mp_clear.c */
-#line 0 "bn_mp_clear.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with 
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* clear one (frees)  */
-void
-mp_clear (mp_int * a)
-{
-  if (a->dp != NULL) {
-
-    /* first zero the digits */
-    memset (a->dp, 0, sizeof (mp_digit) * a->used);
-
-    /* free ram */
-    free (a->dp);
-
-    /* reset members to make debugging easier */
-    a->dp = NULL;
-    a->alloc = a->used = 0;
-  }
-}
-
-/* End: bn_mp_clear.c */
-
-/* Start: bn_mp_cmp.c */
-#line 0 "bn_mp_cmp.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* compare two ints (signed)*/
-int
-mp_cmp (mp_int * a, mp_int * b)
-{
-  /* compare based on sign */
-  if (a->sign == MP_NEG && b->sign == MP_ZPOS) {
-    return MP_LT;
-  } 
-  
-  if (a->sign == MP_ZPOS && b->sign == MP_NEG) {
-    return MP_GT;
-  }
-  
-  /* compare digits */
-  if (a->sign == MP_NEG) {
-     /* if negative compare opposite direction */
-     return mp_cmp_mag(b, a);
-  } else {
-     return mp_cmp_mag(a, b);
-  }
-}
-
-/* End: bn_mp_cmp.c */
-
-/* Start: bn_mp_cmp_d.c */
-#line 0 "bn_mp_cmp_d.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* compare a digit */
-int
-mp_cmp_d (mp_int * a, mp_digit b)
-{
-
-  if (a->sign == MP_NEG) {
-    return MP_LT;
-  }
-
-  if (a->used > 1) {
-    return MP_GT;
-  }
-
-  if (a->dp[0] > b) {
-    return MP_GT;
-  } else if (a->dp[0] < b) {
-    return MP_LT;
-  } else {
-    return MP_EQ;
-  }
-}
-
-/* End: bn_mp_cmp_d.c */
-
-/* Start: bn_mp_cmp_mag.c */
-#line 0 "bn_mp_cmp_mag.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* compare maginitude of two ints (unsigned) */
-int
-mp_cmp_mag (mp_int * a, mp_int * b)
-{
-  int     n;
-
-  /* compare based on # of non-zero digits */
-  if (a->used > b->used) {
-    return MP_GT;
-  } 
-  
-  if (a->used < b->used) {
-    return MP_LT;
-  }
-
-  /* compare based on digits  */
-  for (n = a->used - 1; n >= 0; n--) {
-    if (a->dp[n] > b->dp[n]) {
-      return MP_GT;
-    } 
-    
-    if (a->dp[n] < b->dp[n]) {
-      return MP_LT;
-    }
-  }
-  return MP_EQ;
-}
-
-/* End: bn_mp_cmp_mag.c */
-
-/* Start: bn_mp_copy.c */
-#line 0 "bn_mp_copy.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* copy, b = a */
-int
-mp_copy (mp_int * a, mp_int * b)
-{
-  int     res, n;
-
-  /* if dst == src do nothing */
-  if (a == b) {
-    return MP_OKAY;
-  }
-
-  /* grow dest */
-  if ((res = mp_grow (b, a->used)) != MP_OKAY) {
-    return res;
-  }
-
-  /* zero b and copy the parameters over */
-  {
-    register mp_digit *tmpa, *tmpb;
-
-    /* pointer aliases */
-    tmpa = a->dp;
-    tmpb = b->dp;
-
-    /* copy all the digits */
-    for (n = 0; n < a->used; n++) {
-      *tmpb++ = *tmpa++;
-    }
-
-    /* clear high digits */
-    for (; n < b->used; n++) {
-      *tmpb++ = 0;
-    }
-  }
-  b->used = a->used;
-  b->sign = a->sign;
-  return MP_OKAY;
-}
-
-/* End: bn_mp_copy.c */
-
-/* Start: bn_mp_count_bits.c */
-#line 0 "bn_mp_count_bits.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* returns the number of bits in an int */
-int
-mp_count_bits (mp_int * a)
-{
-  int     r;
-  mp_digit q;
-
-  /* shortcut */
-  if (a->used == 0) {
-    return 0;
-  }
-
-  /* get number of digits and add that */
-  r = (a->used - 1) * DIGIT_BIT;
-  
-  /* take the last digit and count the bits in it */
-  q = a->dp[a->used - 1];
-  while (q > ((mp_digit) 0)) {
-    ++r;
-    q >>= ((mp_digit) 1);
-  }
-  return r;
-}
-
-/* End: bn_mp_count_bits.c */
-
-/* Start: bn_mp_div.c */
-#line 0 "bn_mp_div.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* integer signed division. c*b + d == a [e.g. a/b, c=quotient, d=remainder]
- * HAC pp.598 Algorithm 14.20
- *
- * Note that the description in HAC is horribly incomplete.  For example,
- * it doesn't consider the case where digits are removed from 'x' in the inner
- * loop.  It also doesn't consider the case that y has fewer than three digits, etc..
- *
- * The overall algorithm is as described as 14.20 from HAC but fixed to treat these cases.
-*/
-int
-mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
-{
-  mp_int  q, x, y, t1, t2;
-  int     res, n, t, i, norm, neg;
-
-
-  /* is divisor zero ? */
-  if (mp_iszero (b) == 1) {
-    return MP_VAL;
-  }
-
-  /* if a < b then q=0, r = a */
-  if (mp_cmp_mag (a, b) == MP_LT) {
-    if (d != NULL) {
-      res = mp_copy (a, d);
-    } else {
-      res = MP_OKAY;
-    }
-    if (c != NULL) {
-      mp_zero (c);
-    }
-    return res;
-  }
-
-  if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) {
-    return res;
-  }
-  q.used = a->used + 2;
-
-  if ((res = mp_init (&t1)) != MP_OKAY) {
-    goto __Q;
-  }
-
-  if ((res = mp_init (&t2)) != MP_OKAY) {
-    goto __T1;
-  }
-
-  if ((res = mp_init_copy (&x, a)) != MP_OKAY) {
-    goto __T2;
-  }
-
-  if ((res = mp_init_copy (&y, b)) != MP_OKAY) {
-    goto __X;
-  }
-
-  /* fix the sign */
-  neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
-  x.sign = y.sign = MP_ZPOS;
-
-  /* normalize both x and y, ensure that y >= b/2, [b == 2^DIGIT_BIT] */
-  norm = mp_count_bits(&y) % DIGIT_BIT;
-  if (norm < (int)(DIGIT_BIT-1)) {
-     norm = (DIGIT_BIT-1) - norm;
-     if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
-       goto __Y;
-     }
-     if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
-       goto __Y;
-     }
-  } else {
-     norm = 0;
-  }
-
-  /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
-  n = x.used - 1;
-  t = y.used - 1;
-
-  /* step 2. while (x >= y*b^n-t) do { q[n-t] += 1; x -= y*b^{n-t} } */
-  if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b^{n-t} */
-    goto __Y;
-  }
-
-  while (mp_cmp (&x, &y) != MP_LT) {
-    ++(q.dp[n - t]);
-    if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) {
-      goto __Y;
-    }
-  }
-
-  /* reset y by shifting it back down */
-  mp_rshd (&y, n - t);
-
-  /* step 3. for i from n down to (t + 1) */
-  for (i = n; i >= (t + 1); i--) {
-    if (i > x.used)
-      continue;
-
-    /* step 3.1 if xi == yt then set q{i-t-1} to b-1, otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
-    if (x.dp[i] == y.dp[t]) {
-      q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
-    } else {
-      mp_word tmp;
-      tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
-      tmp |= ((mp_word) x.dp[i - 1]);
-      tmp /= ((mp_word) y.dp[t]);
-      if (tmp > (mp_word) MP_MASK)
-        tmp = MP_MASK;
-      q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
-    }
-
-    /* step 3.2 while (q{i-t-1} * (yt * b + y{t-1})) > xi * b^2 + xi-1 * b + xi-2 do q{i-t-1} -= 1; */
-    q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
-    do {
-      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;
-
-      /* find left hand */
-      mp_zero (&t1);
-      t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
-      t1.dp[1] = y.dp[t];
-      t1.used = 2;
-      if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
-        goto __Y;
-      }
-
-      /* find right hand */
-      t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
-      t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
-      t2.dp[2] = x.dp[i];
-      t2.used = 3;
-    } while (mp_cmp_mag(&t1, &t2) == MP_GT);
-
-    /* step 3.3 x = x - q{i-t-1} * y * b^{i-t-1} */
-    if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
-      goto __Y;
-    }
-
-    if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
-      goto __Y;
-    }
-
-    if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) {
-      goto __Y;
-    }
-
-    /* step 3.4 if x < 0 then { x = x + y*b^{i-t-1}; q{i-t-1} -= 1; } */
-    if (x.sign == MP_NEG) {
-      if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
-        goto __Y;
-      }
-      if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
-        goto __Y;
-      }
-      if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
-        goto __Y;
-      }
-
-      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
-    }
-  }
-
-  /* now q is the quotient and x is the remainder [which we have to normalize] */
-  /* get sign before writing to c */
-  x.sign = a->sign;
-
-  if (c != NULL) {
-    mp_clamp (&q);
-    mp_exch (&q, c);
-    c->sign = neg;
-  }
-
-  if (d != NULL) {
-    mp_div_2d (&x, norm, &x, NULL);
-    mp_exch (&x, d);
-  }
-
-  res = MP_OKAY;
-
-__Y:mp_clear (&y);
-__X:mp_clear (&x);
-__T2:mp_clear (&t2);
-__T1:mp_clear (&t1);
-__Q:mp_clear (&q);
-  return res;
-}
-
-/* End: bn_mp_div.c */
-
-/* Start: bn_mp_div_2.c */
-#line 0 "bn_mp_div_2.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* b = a/2 */
-int
-mp_div_2 (mp_int * a, mp_int * b)
-{
-  int     x, res, oldused;
-
-  /* copy */
-  if (b->alloc < a->used) {
-    if ((res = mp_grow (b, a->used)) != MP_OKAY) {
-      return res;
-    }
-  }
-
-  oldused = b->used;
-  b->used = a->used;
-  {
-    register mp_digit r, rr, *tmpa, *tmpb;
-
-    /* source alias */
-    tmpa = a->dp + b->used - 1;
-
-    /* dest alias */
-    tmpb = b->dp + b->used - 1;
-
-    /* carry */
-    r = 0;
-    for (x = b->used - 1; x >= 0; x--) {
-      /* get the carry for the next iteration */
-      rr = *tmpa & 1;
-
-      /* shift the current digit, add in carry and store */
-      *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
-
-      /* forward carry to next iteration */
-      r = rr;
-    }
-
-    /* zero excess digits */
-    tmpb = b->dp + b->used;
-    for (x = b->used; x < oldused; x++) {
-      *tmpb++ = 0;
-    }
-  }
-  b->sign = a->sign;
-  mp_clamp (b);
-  return MP_OKAY;
-}
-
-/* End: bn_mp_div_2.c */
-
-/* Start: bn_mp_div_2d.c */
-#line 0 "bn_mp_div_2d.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* shift right by a certain bit count (store quotient in c, optional remainder in d) */
-int
-mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
-{
-  mp_digit D, r, rr;
-  int     x, res;
-  mp_int  t;
-
-
-  /* if the shift count is <= 0 then we do no work */
-  if (b <= 0) {
-    res = mp_copy (a, c);
-    if (d != NULL) {
-      mp_zero (d);
-    }
-    return res;
-  }
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    return res;
-  }
-
-  /* get the remainder */
-  if (d != NULL) {
-    if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) {
-      mp_clear (&t);
-      return res;
-    }
-  }
-
-  /* copy */
-  if ((res = mp_copy (a, c)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
-  }
-
-  /* shift by as many digits in the bit count */
-  if (b >= (int)DIGIT_BIT) {
-    mp_rshd (c, b / DIGIT_BIT);
-  }
-
-  /* shift any bit count < DIGIT_BIT */
-  D = (mp_digit) (b % DIGIT_BIT);
-  if (D != 0) {
-    register mp_digit *tmpc, mask;
-
-    /* mask */
-    mask = (((mp_digit)1) << D) - 1;
-
-    /* alias */
-    tmpc = c->dp + (c->used - 1);
-
-    /* carry */
-    r = 0;
-    for (x = c->used - 1; x >= 0; x--) {
-      /* get the lower  bits of this word in a temp */
-      rr = *tmpc & mask;
-
-      /* shift the current word and mix in the carry bits from the previous word */
-      *tmpc = (*tmpc >> D) | (r << (DIGIT_BIT - D));
-      --tmpc;
-
-      /* set the carry to the carry bits of the current word found above */
-      r = rr;
-    }
-  }
-  mp_clamp (c);
-  if (d != NULL) {
-    mp_exch (&t, d);
-  }
-  mp_clear (&t);
-  return MP_OKAY;
-}
-
-/* End: bn_mp_div_2d.c */
-
-/* Start: bn_mp_div_3.c */
-#line 0 "bn_mp_div_3.c"
+#include <tommath.h>
+
+/* computes the modular inverse via binary extended euclidean algorithm, 
+ * that is c = 1/a mod b 
+ *
+ * Based on mp_invmod except this is optimized for the case where b is 
+ * odd as per HAC Note 14.64 on pp. 610
+ */
+int
+fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int  x, y, u, v, B, D;
+  int     res, neg;
+
+  /* init all our temps */
+  if ((res = mp_init_multi(&x, &y, &u, &v, &B, &D, NULL)) != MP_OKAY) {
+     return res;
+  }
+
+  /* x == modulus, y == value to invert */
+  if ((res = mp_copy (b, &x)) != MP_OKAY) {
+    goto __ERR;
+  }
+
+  /* we need y = |a| */
+  if ((res = mp_abs (a, &y)) != MP_OKAY) {
+    goto __ERR;
+  }
+
+  /* 2. [modified] if x,y are both even then return an error! 
+   * 
+   * That is if gcd(x,y) = 2 * k then obviously there is no inverse.
+   */
+  if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
+    res = MP_VAL;
+    goto __ERR;
+  }
+
+  /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
+  if ((res = mp_copy (&x, &u)) != MP_OKAY) {
+    goto __ERR;
+  }
+  if ((res = mp_copy (&y, &v)) != MP_OKAY) {
+    goto __ERR;
+  }
+  mp_set (&D, 1);
+
+top:
+  /* 4.  while u is even do */
+  while (mp_iseven (&u) == 1) {
+    /* 4.1 u = u/2 */
+    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
+      goto __ERR;
+    }
+    /* 4.2 if A or B is odd then */
+    if (mp_iseven (&B) == 0) {
+      if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
+        goto __ERR;
+      }
+    }
+    /* B = B/2 */
+    if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
+      goto __ERR;
+    }
+  }
+
+  /* 5.  while v is even do */
+  while (mp_iseven (&v) == 1) {
+    /* 5.1 v = v/2 */
+    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
+      goto __ERR;
+    }
+    /* 5.2 if C,D are even then */
+    if (mp_iseven (&D) == 0) {
+      /* D = (D-x)/2 */
+      if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
+        goto __ERR;
+      }
+    }
+    /* D = D/2 */
+    if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
+      goto __ERR;
+    }
+  }
+
+  /* 6.  if u >= v then */
+  if (mp_cmp (&u, &v) != MP_LT) {
+    /* u = u - v, B = B - D */
+    if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
+      goto __ERR;
+    }
+
+    if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
+      goto __ERR;
+    }
+  } else {
+    /* v - v - u, D = D - B */
+    if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
+      goto __ERR;
+    }
+
+    if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
+      goto __ERR;
+    }
+  }
+
+  /* if not zero goto step 4 */
+  if (mp_iszero (&u) == 0) {
+    goto top;
+  }
+
+  /* now a = C, b = D, gcd == g*v */
+
+  /* if v != 1 then there is no inverse */
+  if (mp_cmp_d (&v, 1) != MP_EQ) {
+    res = MP_VAL;
+    goto __ERR;
+  }
+
+  /* b is now the inverse */
+  neg = a->sign;
+  while (D.sign == MP_NEG) {
+    if ((res = mp_add (&D, b, &D)) != MP_OKAY) {
+      goto __ERR;
+    }
+  }
+  mp_exch (&D, c);
+  c->sign = neg;
+  res = MP_OKAY;
+
+__ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
+  return res;
+}
+
+/* End: bn_fast_mp_invmod.c */
+
+/* Start: bn_fast_mp_montgomery_reduce.c */
+#line 0 "bn_fast_mp_montgomery_reduce.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* computes xR**-1 == x (mod N) via Montgomery Reduction 
+ * 
+ * This is an optimized implementation of mp_montgomery_reduce 
+ * which uses the comba method to quickly calculate the columns of the
+ * reduction.  
+ *
+ * Based on Algorithm 14.32 on pp.601 of HAC.
+*/
+int
+fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
+{
+  int     ix, res, olduse;
+  mp_word W[MP_WARRAY];
+
+  /* get old used count */
+  olduse = x->used;
+
+  /* grow a as required */
+  if (x->alloc < n->used + 1) {
+    if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  {
+    register mp_word *_W;
+    register mp_digit *tmpx;
+
+    _W = W;
+    tmpx = x->dp;
+
+    /* copy the digits of a into W[0..a->used-1] */
+    for (ix = 0; ix < x->used; ix++) {
+      *_W++ = *tmpx++;
+    }
+
+    /* zero the high words of W[a->used..m->used*2] */
+    for (; ix < n->used * 2 + 1; ix++) {
+      *_W++ = 0;
+    }
+  }
+
+  for (ix = 0; ix < n->used; ix++) {
+    /* mu = ai * m' mod b
+     *
+     * We avoid a double precision multiplication (which isn't required)
+     * by casting the value down to a mp_digit.  Note this requires 
+     * that W[ix-1] have  the carry cleared (see after the inner loop)
+     */
+    register mp_digit mu;
+    mu = (((mp_digit) (W[ix] & MP_MASK)) * rho) & MP_MASK;
+
+    /* a = a + mu * m * b**i
+     *
+     * This is computed in place and on the fly.  The multiplication
+     * by b**i is handled by offseting which columns the results
+     * are added to.
+     *
+     * Note the comba method normally doesn't handle carries in the 
+     * inner loop In this case we fix the carry from the previous 
+     * column since the Montgomery reduction requires digits of the 
+     * result (so far) [see above] to work.  This is
+     * handled by fixing up one carry after the inner loop.  The 
+     * carry fixups are done in order so after these loops the 
+     * first m->used words of W[] have the carries fixed
+     */
+    {
+      register int iy;
+      register mp_digit *tmpn;
+      register mp_word *_W;
+
+      /* alias for the digits of the modulus */
+      tmpn = n->dp;
+
+      /* Alias for the columns set by an offset of ix */
+      _W = W + ix;
+
+      /* inner loop */
+      for (iy = 0; iy < n->used; iy++) {
+          *_W++ += ((mp_word) mu) * ((mp_word) * tmpn++);
+      }
+    }
+
+    /* now fix carry for next digit, W[ix+1] */
+    W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
+  }
+
+
+  {
+    register mp_digit *tmpx;
+    register mp_word *_W, *_W1;
+
+    /* nox fix rest of carries */
+    _W1 = W + ix;
+    _W = W + ++ix;
+
+    for (; ix <= n->used * 2 + 1; ix++) {
+      *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
+    }
+
+    /* copy out, A = A/b**n
+     *
+     * The result is A/b**n but instead of converting from an 
+     * array of mp_word to mp_digit than calling mp_rshd 
+     * we just copy them in the right order
+     */
+    tmpx = x->dp;
+    _W = W + n->used;
+
+    for (ix = 0; ix < n->used + 1; ix++) {
+      *tmpx++ = *_W++ & ((mp_word) MP_MASK);
+    }
+
+    /* zero oldused digits, if the input a was larger than
+     * m->used+1 we'll have to clear the digits */
+    for (; ix < olduse; ix++) {
+      *tmpx++ = 0;
+    }
+  }
+
+  /* set the max used and clamp */
+  x->used = n->used + 1;
+  mp_clamp (x);
+
+  /* if A >= m then A = A - m */
+  if (mp_cmp_mag (x, n) != MP_LT) {
+    return s_mp_sub (x, n, x);
+  }
+  return MP_OKAY;
+}
+
+/* End: bn_fast_mp_montgomery_reduce.c */
+
+/* Start: bn_fast_s_mp_mul_digs.c */
+#line 0 "bn_fast_s_mp_mul_digs.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Fast (comba) multiplier
+ *
+ * This is the fast column-array [comba] multiplier.  It is 
+ * designed to compute the columns of the product first 
+ * then handle the carries afterwards.  This has the effect 
+ * of making the nested loops that compute the columns very
+ * simple and schedulable on super-scalar processors.
+ *
+ * This has been modified to produce a variable number of 
+ * digits of output so if say only a half-product is required 
+ * you don't have to compute the upper half (a feature 
+ * required for fast Barrett reduction).
+ *
+ * Based on Algorithm 14.12 on pp.595 of HAC.
+ *
+ */
+int
+fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+{
+  int     olduse, res, pa, ix;
+  mp_word W[MP_WARRAY];
+
+  /* grow the destination as required */
+  if (c->alloc < digs) {
+    if ((res = mp_grow (c, digs)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* clear temp buf (the columns) */
+  memset (W, 0, sizeof (mp_word) * digs);
+
+  /* calculate the columns */
+  pa = a->used;
+  for (ix = 0; ix < pa; ix++) {
+    /* this multiplier has been modified to allow you to 
+     * control how many digits of output are produced.  
+     * So at most we want to make upto "digs" digits of output.
+     *
+     * this adds products to distinct columns (at ix+iy) of W
+     * note that each step through the loop is not dependent on
+     * the previous which means the compiler can easily unroll
+     * the loop without scheduling problems
+     */
+    {
+      register mp_digit tmpx, *tmpy;
+      register mp_word *_W;
+      register int iy, pb;
+
+      /* alias for the the word on the left e.g. A[ix] * A[iy] */
+      tmpx = a->dp[ix];
+
+      /* alias for the right side */
+      tmpy = b->dp;
+
+      /* alias for the columns, each step through the loop adds a new
+         term to each column
+       */
+      _W = W + ix;
+
+      /* the number of digits is limited by their placement.  E.g.
+         we avoid multiplying digits that will end up above the # of
+         digits of precision requested
+       */
+      pb = MIN (b->used, digs - ix);
+
+      for (iy = 0; iy < pb; iy++) {
+        *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
+      }
+    }
+
+  }
+
+  /* setup dest */
+  olduse = c->used;
+  c->used = digs;
+
+  {
+    register mp_digit *tmpc;
+
+    /* At this point W[] contains the sums of each column.  To get the
+     * correct result we must take the extra bits from each column and
+     * carry them down
+     *
+     * Note that while this adds extra code to the multiplier it 
+     * saves time since the carry propagation is removed from the 
+     * above nested loop.This has the effect of reducing the work 
+     * from N*(N+N*c)==N**2 + c*N**2 to N**2 + N*c where c is the 
+     * cost of the shifting.  On very small numbers this is slower 
+     * but on most cryptographic size numbers it is faster.
+     */
+    tmpc = c->dp;
+    for (ix = 1; ix < digs; ix++) {
+      W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
+      *tmpc++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
+    }
+    *tmpc++ = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK));
+
+    /* clear unused */
+    for (; ix < olduse; ix++) {
+      *tmpc++ = 0;
+    }
+  }
+
+  mp_clamp (c);
+  return MP_OKAY;
+}
+
+/* End: bn_fast_s_mp_mul_digs.c */
+
+/* Start: bn_fast_s_mp_mul_high_digs.c */
+#line 0 "bn_fast_s_mp_mul_high_digs.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* this is a modified version of fast_s_mp_mul_digs that only produces
+ * output digits *above* digs.  See the comments for fast_s_mp_mul_digs
+ * to see how it works.
+ *
+ * This is used in the Barrett reduction since for one of the multiplications
+ * only the higher digits were needed.  This essentially halves the work.
+ *
+ * Based on Algorithm 14.12 on pp.595 of HAC.
+ */
+int
+fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+{
+  int     oldused, newused, res, pa, pb, ix;
+  mp_word W[MP_WARRAY];
+
+  /* calculate size of product and allocate more space if required */
+  newused = a->used + b->used + 1;
+  if (c->alloc < newused) {
+    if ((res = mp_grow (c, newused)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* like the other comba method we compute the columns first */
+  pa = a->used;
+  pb = b->used;
+  memset (W + digs, 0, (pa + pb + 1 - digs) * sizeof (mp_word));
+  for (ix = 0; ix < pa; ix++) {
+    {
+      register mp_digit tmpx, *tmpy;
+      register int iy;
+      register mp_word *_W;
+
+      /* work todo, that is we only calculate digits that are at "digs" or above  */
+      iy = digs - ix;
+
+      /* copy of word on the left of A[ix] * B[iy] */
+      tmpx = a->dp[ix];
+
+      /* alias for right side */
+      tmpy = b->dp + iy;
+     
+      /* alias for the columns of output.  Offset to be equal to or above the 
+       * smallest digit place requested 
+       */
+      _W = W + digs;     
+      
+      /* skip cases below zero where ix > digs */
+      if (iy < 0) {
+         iy    = abs(iy);
+         tmpy += iy;
+         _W   += iy;
+         iy    = 0;
+      }
+
+      /* compute column products for digits above the minimum */
+      for (; iy < pb; iy++) {
+    *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
+      }
+    }
+  }
+
+  /* setup dest */
+  oldused = c->used;
+  c->used = newused;
+
+  /* now convert the array W downto what we need */
+  for (ix = digs + 1; ix < newused; ix++) {
+    W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
+    c->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
+  }
+  c->dp[(pa + pb + 1) - 1] = (mp_digit) (W[(pa + pb + 1) - 1] & ((mp_word) MP_MASK));
+
+  for (; ix < oldused; ix++) {
+    c->dp[ix] = 0;
+  }
+  mp_clamp (c);
+  return MP_OKAY;
+}
+
+/* End: bn_fast_s_mp_mul_high_digs.c */
+
+/* Start: bn_fast_s_mp_sqr.c */
+#line 0 "bn_fast_s_mp_sqr.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* fast squaring
+ *
+ * This is the comba method where the columns of the product 
+ * are computed first then the carries are computed.  This 
+ * has the effect of making a very simple inner loop that 
+ * is executed the most
+ *
+ * W2 represents the outer products and W the inner.
+ *
+ * A further optimizations is made because the inner 
+ * products are of the form "A * B * 2".  The *2 part does 
+ * not need to be computed until the end which is good 
+ * because 64-bit shifts are slow!
+ *
+ * Based on Algorithm 14.16 on pp.597 of HAC.
+ *
+ */
+int
+fast_s_mp_sqr (mp_int * a, mp_int * b)
+{
+  int     olduse, newused, res, ix, pa;
+  mp_word W2[MP_WARRAY], W[MP_WARRAY];
+
+  /* calculate size of product and allocate as required */
+  pa = a->used;
+  newused = pa + pa + 1;
+  if (b->alloc < newused) {
+    if ((res = mp_grow (b, newused)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* zero temp buffer (columns)
+   * Note that there are two buffers.  Since squaring requires
+   * a outter and inner product and the inner product requires
+   * computing a product and doubling it (a relatively expensive
+   * op to perform n**2 times if you don't have to) the inner and
+   * outer products are computed in different buffers.  This way
+   * the inner product can be doubled using n doublings instead of
+   * n**2
+   */
+  memset (W, 0, newused * sizeof (mp_word));
+  memset (W2, 0, newused * sizeof (mp_word));
+
+  /* This computes the inner product.  To simplify the inner N**2 loop
+   * the multiplication by two is done afterwards in the N loop.
+   */
+  for (ix = 0; ix < pa; ix++) {
+    /* compute the outer product
+     *
+     * Note that every outer product is computed
+     * for a particular column only once which means that
+     * there is no need todo a double precision addition
+     */
+    W2[ix + ix] = ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
+
+    {
+      register mp_digit tmpx, *tmpy;
+      register mp_word *_W;
+      register int iy;
+
+      /* copy of left side */
+      tmpx = a->dp[ix];
+
+      /* alias for right side */
+      tmpy = a->dp + (ix + 1);
+
+      /* the column to store the result in */
+      _W = W + (ix + ix + 1);
+
+      /* inner products */
+      for (iy = ix + 1; iy < pa; iy++) {
+          *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
+      }
+    }
+  }
+
+  /* setup dest */
+  olduse  = b->used;
+  b->used = newused;
+
+  /* now compute digits */
+  {
+    register mp_digit *tmpb;
+
+    /* double first value, since the inner products are 
+     * half of what they should be 
+     */
+    W[0] += W[0] + W2[0];
+
+    tmpb = b->dp;
+    for (ix = 1; ix < newused; ix++) {
+      /* double/add next digit */
+      W[ix] += W[ix] + W2[ix];
+
+      W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT));
+      *tmpb++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
+    }
+    /* set the last value.  Note even if the carry is zero 
+     * this is required since the next step will not zero 
+     * it if b originally had a value at b->dp[2*a.used]
+     */
+    *tmpb++ = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK));
+
+    /* clear high digits */
+    for (; ix < olduse; ix++) {
+      *tmpb++ = 0;
+    }
+  }
+
+  mp_clamp (b);
+  return MP_OKAY;
+}
+
+/* End: bn_fast_s_mp_sqr.c */
+
+/* Start: bn_mp_2expt.c */
+#line 0 "bn_mp_2expt.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* computes a = 2**b 
+ *
+ * Simple algorithm which zeroes the int, grows it then just sets one bit
+ * as required.
+ */
+int
+mp_2expt (mp_int * a, int b)
+{
+  int     res;
+
+  mp_zero (a);
+  if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) {
+    return res;
+  }
+  a->used = b / DIGIT_BIT + 1;
+  a->dp[b / DIGIT_BIT] = 1 << (b % DIGIT_BIT);
+
+  return MP_OKAY;
+}
+
+/* End: bn_mp_2expt.c */
+
+/* Start: bn_mp_abs.c */
+#line 0 "bn_mp_abs.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* b = |a| 
+ *
+ * Simple function copies the input and fixes the sign to positive
+ */
+int
+mp_abs (mp_int * a, mp_int * b)
+{
+  int     res;
+  if ((res = mp_copy (a, b)) != MP_OKAY) {
+    return res;
+  }
+  b->sign = MP_ZPOS;
+  return MP_OKAY;
+}
+
+/* End: bn_mp_abs.c */
+
+/* Start: bn_mp_add.c */
+#line 0 "bn_mp_add.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* high level addition (handles signs) */
+int
+mp_add (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     sa, sb, res;
+
+  /* get sign of both inputs */
+  sa = a->sign;
+  sb = b->sign;
+
+  /* handle two cases, not four */
+  if (sa == sb) {
+    /* both positive or both negative */
+    /* add their magnitudes, copy the sign */
+    c->sign = sa;
+    res = s_mp_add (a, b, c);
+  } else {
+    /* one positive, the other negative */
+    /* subtract the one with the greater magnitude from */
+    /* the one of the lesser magnitude.  The result gets */
+    /* the sign of the one with the greater magnitude. */
+    if (mp_cmp_mag (a, b) == MP_LT) {
+      c->sign = sb;
+      res = s_mp_sub (b, a, c);
+    } else {
+      c->sign = sa;
+      res = s_mp_sub (a, b, c);
+    }
+  }
+  return res;
+}
+
+
+/* End: bn_mp_add.c */
+
+/* Start: bn_mp_add_d.c */
+#line 0 "bn_mp_add_d.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* single digit addition */
+int
+mp_add_d (mp_int * a, mp_digit b, mp_int * c)
+{
+  mp_int  t;
+  int     res;
+
+  if ((res = mp_init_size(&t, 1)) != MP_OKAY) {
+    return res;
+  }
+  mp_set (&t, b);
+  res = mp_add (a, &t, c);
+
+  mp_clear (&t);
+  return res;
+}
+
+/* End: bn_mp_add_d.c */
+
+/* Start: bn_mp_addmod.c */
+#line 0 "bn_mp_addmod.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* d = a + b (mod c) */
+int
+mp_addmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+{
+  int     res;
+  mp_int  t;
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_add (a, b, &t)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+  res = mp_mod (&t, c, d);
+  mp_clear (&t);
+  return res;
+}
+
+/* End: bn_mp_addmod.c */
+
+/* Start: bn_mp_and.c */
+#line 0 "bn_mp_and.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* AND two ints together */
+int
+mp_and (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     res, ix, px;
+  mp_int  t, *x;
+
+  if (a->used > b->used) {
+    if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+      return res;
+    }
+    px = b->used;
+    x = b;
+  } else {
+    if ((res = mp_init_copy (&t, b)) != MP_OKAY) {
+      return res;
+    }
+    px = a->used;
+    x = a;
+  }
+
+  for (ix = 0; ix < px; ix++) {
+    t.dp[ix] &= x->dp[ix];
+  }
+
+  /* zero digits above the last from the smallest mp_int */
+  for (; ix < t.used; ix++) {
+    t.dp[ix] = 0;
+  }
+
+  mp_clamp (&t);
+  mp_exch (c, &t);
+  mp_clear (&t);
+  return MP_OKAY;
+}
+
+/* End: bn_mp_and.c */
+
+/* Start: bn_mp_clamp.c */
+#line 0 "bn_mp_clamp.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* trim unused digits 
+ *
+ * This is used to ensure that leading zero digits are
+ * trimed and the leading "used" digit will be non-zero
+ * Typically very fast.  Also fixes the sign if there
+ * are no more leading digits
+ */
+void
+mp_clamp (mp_int * a)
+{
+  while (a->used > 0 && a->dp[a->used - 1] == 0) {
+    --(a->used);
+  }
+  if (a->used == 0) {
+    a->sign = MP_ZPOS;
+  }
+}
+
+/* End: bn_mp_clamp.c */
+
+/* Start: bn_mp_clear.c */
+#line 0 "bn_mp_clear.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with 
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* clear one (frees)  */
+void
+mp_clear (mp_int * a)
+{
+  if (a->dp != NULL) {
+
+    /* first zero the digits */
+    memset (a->dp, 0, sizeof (mp_digit) * a->used);
+
+    /* free ram */
+    free (a->dp);
+
+    /* reset members to make debugging easier */
+    a->dp = NULL;
+    a->alloc = a->used = 0;
+  }
+}
+
+/* End: bn_mp_clear.c */
+
+/* Start: bn_mp_cmp.c */
+#line 0 "bn_mp_cmp.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* compare two ints (signed)*/
+int
+mp_cmp (mp_int * a, mp_int * b)
+{
+  /* compare based on sign */
+  if (a->sign == MP_NEG && b->sign == MP_ZPOS) {
+    return MP_LT;
+  } 
+  
+  if (a->sign == MP_ZPOS && b->sign == MP_NEG) {
+    return MP_GT;
+  }
+  
+  /* compare digits */
+  if (a->sign == MP_NEG) {
+     /* if negative compare opposite direction */
+     return mp_cmp_mag(b, a);
+  } else {
+     return mp_cmp_mag(a, b);
+  }
+}
+
+/* End: bn_mp_cmp.c */
+
+/* Start: bn_mp_cmp_d.c */
+#line 0 "bn_mp_cmp_d.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* compare a digit */
+int
+mp_cmp_d (mp_int * a, mp_digit b)
+{
+
+  if (a->sign == MP_NEG) {
+    return MP_LT;
+  }
+
+  if (a->used > 1) {
+    return MP_GT;
+  }
+
+  if (a->dp[0] > b) {
+    return MP_GT;
+  } else if (a->dp[0] < b) {
+    return MP_LT;
+  } else {
+    return MP_EQ;
+  }
+}
+
+/* End: bn_mp_cmp_d.c */
+
+/* Start: bn_mp_cmp_mag.c */
+#line 0 "bn_mp_cmp_mag.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* compare maginitude of two ints (unsigned) */
+int
+mp_cmp_mag (mp_int * a, mp_int * b)
+{
+  int     n;
+
+  /* compare based on # of non-zero digits */
+  if (a->used > b->used) {
+    return MP_GT;
+  } 
+  
+  if (a->used < b->used) {
+    return MP_LT;
+  }
+
+  /* compare based on digits  */
+  for (n = a->used - 1; n >= 0; n--) {
+    if (a->dp[n] > b->dp[n]) {
+      return MP_GT;
+    } 
+    
+    if (a->dp[n] < b->dp[n]) {
+      return MP_LT;
+    }
+  }
+  return MP_EQ;
+}
+
+/* End: bn_mp_cmp_mag.c */
+
+/* Start: bn_mp_copy.c */
+#line 0 "bn_mp_copy.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* copy, b = a */
+int
+mp_copy (mp_int * a, mp_int * b)
+{
+  int     res, n;
+
+  /* if dst == src do nothing */
+  if (a == b) {
+    return MP_OKAY;
+  }
+
+  /* grow dest */
+  if ((res = mp_grow (b, a->used)) != MP_OKAY) {
+    return res;
+  }
+
+  /* zero b and copy the parameters over */
+  {
+    register mp_digit *tmpa, *tmpb;
+
+    /* pointer aliases */
+    tmpa = a->dp;
+    tmpb = b->dp;
+
+    /* copy all the digits */
+    for (n = 0; n < a->used; n++) {
+      *tmpb++ = *tmpa++;
+    }
+
+    /* clear high digits */
+    for (; n < b->used; n++) {
+      *tmpb++ = 0;
+    }
+  }
+  b->used = a->used;
+  b->sign = a->sign;
+  return MP_OKAY;
+}
+
+/* End: bn_mp_copy.c */
+
+/* Start: bn_mp_count_bits.c */
+#line 0 "bn_mp_count_bits.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* returns the number of bits in an int */
+int
+mp_count_bits (mp_int * a)
+{
+  int     r;
+  mp_digit q;
+
+  /* shortcut */
+  if (a->used == 0) {
+    return 0;
+  }
+
+  /* get number of digits and add that */
+  r = (a->used - 1) * DIGIT_BIT;
+  
+  /* take the last digit and count the bits in it */
+  q = a->dp[a->used - 1];
+  while (q > ((mp_digit) 0)) {
+    ++r;
+    q >>= ((mp_digit) 1);
+  }
+  return r;
+}
+
+/* End: bn_mp_count_bits.c */
+
+/* Start: bn_mp_div.c */
+#line 0 "bn_mp_div.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* integer signed division. c*b + d == a [e.g. a/b, c=quotient, d=remainder]
+ * HAC pp.598 Algorithm 14.20
+ *
+ * Note that the description in HAC is horribly incomplete.  For example,
+ * it doesn't consider the case where digits are removed from 'x' in the inner
+ * loop.  It also doesn't consider the case that y has fewer than three digits, etc..
+ *
+ * The overall algorithm is as described as 14.20 from HAC but fixed to treat these cases.
+*/
+int
+mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+{
+  mp_int  q, x, y, t1, t2;
+  int     res, n, t, i, norm, neg;
+
+
+  /* is divisor zero ? */
+  if (mp_iszero (b) == 1) {
+    return MP_VAL;
+  }
+
+  /* if a < b then q=0, r = a */
+  if (mp_cmp_mag (a, b) == MP_LT) {
+    if (d != NULL) {
+      res = mp_copy (a, d);
+    } else {
+      res = MP_OKAY;
+    }
+    if (c != NULL) {
+      mp_zero (c);
+    }
+    return res;
+  }
+
+  if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) {
+    return res;
+  }
+  q.used = a->used + 2;
+
+  if ((res = mp_init (&t1)) != MP_OKAY) {
+    goto __Q;
+  }
+
+  if ((res = mp_init (&t2)) != MP_OKAY) {
+    goto __T1;
+  }
+
+  if ((res = mp_init_copy (&x, a)) != MP_OKAY) {
+    goto __T2;
+  }
+
+  if ((res = mp_init_copy (&y, b)) != MP_OKAY) {
+    goto __X;
+  }
+
+  /* fix the sign */
+  neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
+  x.sign = y.sign = MP_ZPOS;
+
+  /* normalize both x and y, ensure that y >= b/2, [b == 2^DIGIT_BIT] */
+  norm = mp_count_bits(&y) % DIGIT_BIT;
+  if (norm < (int)(DIGIT_BIT-1)) {
+     norm = (DIGIT_BIT-1) - norm;
+     if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
+       goto __Y;
+     }
+     if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
+       goto __Y;
+     }
+  } else {
+     norm = 0;
+  }
+
+  /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
+  n = x.used - 1;
+  t = y.used - 1;
+
+  /* step 2. while (x >= y*b^n-t) do { q[n-t] += 1; x -= y*b^{n-t} } */
+  if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b^{n-t} */
+    goto __Y;
+  }
+
+  while (mp_cmp (&x, &y) != MP_LT) {
+    ++(q.dp[n - t]);
+    if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) {
+      goto __Y;
+    }
+  }
+
+  /* reset y by shifting it back down */
+  mp_rshd (&y, n - t);
+
+  /* step 3. for i from n down to (t + 1) */
+  for (i = n; i >= (t + 1); i--) {
+    if (i > x.used)
+      continue;
+
+    /* step 3.1 if xi == yt then set q{i-t-1} to b-1, otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
+    if (x.dp[i] == y.dp[t]) {
+      q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
+    } else {
+      mp_word tmp;
+      tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
+      tmp |= ((mp_word) x.dp[i - 1]);
+      tmp /= ((mp_word) y.dp[t]);
+      if (tmp > (mp_word) MP_MASK)
+        tmp = MP_MASK;
+      q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
+    }
+
+    /* step 3.2 while (q{i-t-1} * (yt * b + y{t-1})) > xi * b^2 + xi-1 * b + xi-2 do q{i-t-1} -= 1; */
+    q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
+    do {
+      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;
+
+      /* find left hand */
+      mp_zero (&t1);
+      t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
+      t1.dp[1] = y.dp[t];
+      t1.used = 2;
+      if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
+        goto __Y;
+      }
+
+      /* find right hand */
+      t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
+      t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
+      t2.dp[2] = x.dp[i];
+      t2.used = 3;
+    } while (mp_cmp_mag(&t1, &t2) == MP_GT);
+
+    /* step 3.3 x = x - q{i-t-1} * y * b^{i-t-1} */
+    if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
+      goto __Y;
+    }
+
+    if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
+      goto __Y;
+    }
+
+    if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) {
+      goto __Y;
+    }
+
+    /* step 3.4 if x < 0 then { x = x + y*b^{i-t-1}; q{i-t-1} -= 1; } */
+    if (x.sign == MP_NEG) {
+      if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
+        goto __Y;
+      }
+      if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
+        goto __Y;
+      }
+      if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
+        goto __Y;
+      }
+
+      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
+    }
+  }
+
+  /* now q is the quotient and x is the remainder [which we have to normalize] */
+  /* get sign before writing to c */
+  x.sign = a->sign;
+
+  if (c != NULL) {
+    mp_clamp (&q);
+    mp_exch (&q, c);
+    c->sign = neg;
+  }
+
+  if (d != NULL) {
+    mp_div_2d (&x, norm, &x, NULL);
+    mp_exch (&x, d);
+  }
+
+  res = MP_OKAY;
+
+__Y:mp_clear (&y);
+__X:mp_clear (&x);
+__T2:mp_clear (&t2);
+__T1:mp_clear (&t1);
+__Q:mp_clear (&q);
+  return res;
+}
+
+/* End: bn_mp_div.c */
+
+/* Start: bn_mp_div_2.c */
+#line 0 "bn_mp_div_2.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* b = a/2 */
+int
+mp_div_2 (mp_int * a, mp_int * b)
+{
+  int     x, res, oldused;
+
+  /* copy */
+  if (b->alloc < a->used) {
+    if ((res = mp_grow (b, a->used)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  oldused = b->used;
+  b->used = a->used;
+  {
+    register mp_digit r, rr, *tmpa, *tmpb;
+
+    /* source alias */
+    tmpa = a->dp + b->used - 1;
+
+    /* dest alias */
+    tmpb = b->dp + b->used - 1;
+
+    /* carry */
+    r = 0;
+    for (x = b->used - 1; x >= 0; x--) {
+      /* get the carry for the next iteration */
+      rr = *tmpa & 1;
+
+      /* shift the current digit, add in carry and store */
+      *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
+
+      /* forward carry to next iteration */
+      r = rr;
+    }
+
+    /* zero excess digits */
+    tmpb = b->dp + b->used;
+    for (x = b->used; x < oldused; x++) {
+      *tmpb++ = 0;
+    }
+  }
+  b->sign = a->sign;
+  mp_clamp (b);
+  return MP_OKAY;
+}
+
+/* End: bn_mp_div_2.c */
+
+/* Start: bn_mp_div_2d.c */
+#line 0 "bn_mp_div_2d.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* shift right by a certain bit count (store quotient in c, optional remainder in d) */
+int
+mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
+{
+  mp_digit D, r, rr;
+  int     x, res;
+  mp_int  t;
+
+
+  /* if the shift count is <= 0 then we do no work */
+  if (b <= 0) {
+    res = mp_copy (a, c);
+    if (d != NULL) {
+      mp_zero (d);
+    }
+    return res;
+  }
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  /* get the remainder */
+  if (d != NULL) {
+    if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) {
+      mp_clear (&t);
+      return res;
+    }
+  }
+
+  /* copy */
+  if ((res = mp_copy (a, c)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+
+  /* shift by as many digits in the bit count */
+  if (b >= (int)DIGIT_BIT) {
+    mp_rshd (c, b / DIGIT_BIT);
+  }
+
+  /* shift any bit count < DIGIT_BIT */
+  D = (mp_digit) (b % DIGIT_BIT);
+  if (D != 0) {
+    register mp_digit *tmpc, mask;
+
+    /* mask */
+    mask = (((mp_digit)1) << D) - 1;
+
+    /* alias */
+    tmpc = c->dp + (c->used - 1);
+
+    /* carry */
+    r = 0;
+    for (x = c->used - 1; x >= 0; x--) {
+      /* get the lower  bits of this word in a temp */
+      rr = *tmpc & mask;
+
+      /* shift the current word and mix in the carry bits from the previous word */
+      *tmpc = (*tmpc >> D) | (r << (DIGIT_BIT - D));
+      --tmpc;
+
+      /* set the carry to the carry bits of the current word found above */
+      r = rr;
+    }
+  }
+  mp_clamp (c);
+  if (d != NULL) {
+    mp_exch (&t, d);
+  }
+  mp_clear (&t);
+  return MP_OKAY;
+}
+
+/* End: bn_mp_div_2d.c */
+
+/* Start: bn_mp_div_3.c */
+#line 0 "bn_mp_div_3.c"
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is library that provides for multiple-precision
@@ -1669,80 +1669,80 @@ mp_div_3 (mp_int * a, mp_int *c, mp_digit * d)
   return res;
 }
 
-
-/* End: bn_mp_div_3.c */
-
-/* Start: bn_mp_div_d.c */
-#line 0 "bn_mp_div_d.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* single digit division (based on routine from MPI) */
-int
-mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
-{
-  mp_int  q;
-  mp_word w, t;
-  int     res, ix;
-  
-  if (b == 0) {
-     return MP_VAL;
-  }
-  
-  if (b == 3) {
-     return mp_div_3(a, c, d);
-  }
-  
-  if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
-     return res;
-  }
-  
-  q.used = a->used;
-  q.sign = a->sign;
-  w = 0;
-  for (ix = a->used - 1; ix >= 0; ix--) {
-     w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
-     
-     if (w >= b) {
-        t = w / b;
-        w = w % b;
-      } else {
-        t = 0;
-      }
-      q.dp[ix] = t;
-  }
-  
-  if (d != NULL) {
-     *d = w;
-  }
-  
-  if (c != NULL) {
-     mp_clamp(&q);
-     mp_exch(&q, c);
-  }
-  mp_clear(&q);
-  
-  return res;
-}
-
-
-/* End: bn_mp_div_d.c */
-
-/* Start: bn_mp_dr_is_modulus.c */
-#line 0 "bn_mp_dr_is_modulus.c"
+
+/* End: bn_mp_div_3.c */
+
+/* Start: bn_mp_div_d.c */
+#line 0 "bn_mp_div_d.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* single digit division (based on routine from MPI) */
+int
+mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
+{
+  mp_int  q;
+  mp_word w, t;
+  int     res, ix;
+  
+  if (b == 0) {
+     return MP_VAL;
+  }
+  
+  if (b == 3) {
+     return mp_div_3(a, c, d);
+  }
+  
+  if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
+     return res;
+  }
+  
+  q.used = a->used;
+  q.sign = a->sign;
+  w = 0;
+  for (ix = a->used - 1; ix >= 0; ix--) {
+     w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
+     
+     if (w >= b) {
+        t = w / b;
+        w = w % b;
+      } else {
+        t = 0;
+      }
+      q.dp[ix] = t;
+  }
+  
+  if (d != NULL) {
+     *d = w;
+  }
+  
+  if (c != NULL) {
+     mp_clamp(&q);
+     mp_exch(&q, c);
+  }
+  mp_clear(&q);
+  
+  return res;
+}
+
+
+/* End: bn_mp_div_d.c */
+
+/* Start: bn_mp_dr_is_modulus.c */
+#line 0 "bn_mp_dr_is_modulus.c"
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is library that provides for multiple-precision
@@ -1777,102 +1777,102 @@ int mp_dr_is_modulus(mp_int *a)
    return 1;
 }
 
-
-/* End: bn_mp_dr_is_modulus.c */
-
-/* Start: bn_mp_dr_reduce.c */
-#line 0 "bn_mp_dr_reduce.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* reduce "x" in place modulo "n" using the Diminished Radix algorithm.
- *
- * Based on algorithm from the paper
- *
- * "Generating Efficient Primes for Discrete Log Cryptosystems"
- *                 Chae Hoon Lim, Pil Loong Lee,
- *          POSTECH Information Research Laboratories
- *
- * The modulus must be of a special format [see manual]
- *
- * Has been modified to use algorithm 7.10 from the LTM book instead
- */
-int
-mp_dr_reduce (mp_int * x, mp_int * n, mp_digit k)
-{
-  int      err, i, m;
-  mp_word  r;
-  mp_digit mu, *tmpx1, *tmpx2;
-  
-  /* m = digits in modulus */
-  m = n->used;
-  
-  /* ensure that "x" has at least 2m digits */
-  if (x->alloc < m + m) {
-    if ((err = mp_grow (x, m + m)) != MP_OKAY) {
-      return err;
-    }
-  }
-
-/* top of loop, this is where the code resumes if 
- * another reduction pass is required.
- */
-top:
-  /* aliases for digits */
-  /* alias for lower half of x */
-  tmpx1 = x->dp;
-  
-  /* alias for upper half of x, or x/B**m */
-  tmpx2 = x->dp + m;
-  
-  /* set carry to zero */
-  mu = 0;
-  
-  /* compute (x mod B**m) + mp * [x/B**m] inline and inplace */
-  for (i = 0; i < m; i++) {
-      r         = ((mp_word)*tmpx2++) * ((mp_word)k) + *tmpx1 + mu;
-      *tmpx1++  = r & MP_MASK;
-      mu        = r >> ((mp_word)DIGIT_BIT);
-  }
-  
-  /* set final carry */
-  *tmpx1++ = mu;
-  
-  /* zero words above m */
-  for (i = m + 1; i < x->used; i++) {
-      *tmpx1++ = 0;
-  }
-
-  /* clamp, sub and return */
-  mp_clamp (x);
-
-  /* if x >= n then subtract and reduce again 
-   * Each successive "recursion" makes the input smaller and smaller.
-   */
-  if (mp_cmp_mag (x, n) != MP_LT) {
-    s_mp_sub(x, n, x);
-    goto top;
-  }
-  return MP_OKAY;
-}
-
-/* End: bn_mp_dr_reduce.c */
-
-/* Start: bn_mp_dr_setup.c */
-#line 0 "bn_mp_dr_setup.c"
+
+/* End: bn_mp_dr_is_modulus.c */
+
+/* Start: bn_mp_dr_reduce.c */
+#line 0 "bn_mp_dr_reduce.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* reduce "x" in place modulo "n" using the Diminished Radix algorithm.
+ *
+ * Based on algorithm from the paper
+ *
+ * "Generating Efficient Primes for Discrete Log Cryptosystems"
+ *                 Chae Hoon Lim, Pil Loong Lee,
+ *          POSTECH Information Research Laboratories
+ *
+ * The modulus must be of a special format [see manual]
+ *
+ * Has been modified to use algorithm 7.10 from the LTM book instead
+ */
+int
+mp_dr_reduce (mp_int * x, mp_int * n, mp_digit k)
+{
+  int      err, i, m;
+  mp_word  r;
+  mp_digit mu, *tmpx1, *tmpx2;
+  
+  /* m = digits in modulus */
+  m = n->used;
+  
+  /* ensure that "x" has at least 2m digits */
+  if (x->alloc < m + m) {
+    if ((err = mp_grow (x, m + m)) != MP_OKAY) {
+      return err;
+    }
+  }
+
+/* top of loop, this is where the code resumes if 
+ * another reduction pass is required.
+ */
+top:
+  /* aliases for digits */
+  /* alias for lower half of x */
+  tmpx1 = x->dp;
+  
+  /* alias for upper half of x, or x/B**m */
+  tmpx2 = x->dp + m;
+  
+  /* set carry to zero */
+  mu = 0;
+  
+  /* compute (x mod B**m) + mp * [x/B**m] inline and inplace */
+  for (i = 0; i < m; i++) {
+      r         = ((mp_word)*tmpx2++) * ((mp_word)k) + *tmpx1 + mu;
+      *tmpx1++  = r & MP_MASK;
+      mu        = r >> ((mp_word)DIGIT_BIT);
+  }
+  
+  /* set final carry */
+  *tmpx1++ = mu;
+  
+  /* zero words above m */
+  for (i = m + 1; i < x->used; i++) {
+      *tmpx1++ = 0;
+  }
+
+  /* clamp, sub and return */
+  mp_clamp (x);
+
+  /* if x >= n then subtract and reduce again 
+   * Each successive "recursion" makes the input smaller and smaller.
+   */
+  if (mp_cmp_mag (x, n) != MP_LT) {
+    s_mp_sub(x, n, x);
+    goto top;
+  }
+  return MP_OKAY;
+}
+
+/* End: bn_mp_dr_reduce.c */
+
+/* Start: bn_mp_dr_setup.c */
+#line 0 "bn_mp_dr_setup.c"
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is library that provides for multiple-precision
@@ -1899,2940 +1899,2944 @@ void mp_dr_setup(mp_int *a, mp_digit *d)
         ((mp_word)a->dp[0]));
 }
 
-
-/* End: bn_mp_dr_setup.c */
-
-/* Start: bn_mp_exch.c */
-#line 0 "bn_mp_exch.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* swap the elements of two integers, for cases where you can't simply swap the 
- * mp_int pointers around 
- */
-void
-mp_exch (mp_int * a, mp_int * b)
-{
-  mp_int  t;
-
-  t = *a;
-  *a = *b;
-  *b = t;
-}
-
-/* End: bn_mp_exch.c */
-
-/* Start: bn_mp_expt_d.c */
-#line 0 "bn_mp_expt_d.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* calculate c = a**b  using a square-multiply algorithm */
-int
-mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
-{
-  int     res, x;
-  mp_int  g;
-
-  if ((res = mp_init_copy (&g, a)) != MP_OKAY) {
-    return res;
-  }
-
-  /* set initial result */
-  mp_set (c, 1);
-
-  for (x = 0; x < (int) DIGIT_BIT; x++) {
-    /* square */
-    if ((res = mp_sqr (c, c)) != MP_OKAY) {
-      mp_clear (&g);
-      return res;
-    }
-
-    /* if the bit is set multiply */
-    if ((b & (mp_digit) (((mp_digit)1) << (DIGIT_BIT - 1))) != 0) {
-      if ((res = mp_mul (c, &g, c)) != MP_OKAY) {
-         mp_clear (&g);
-         return res;
-      }
-    }
-
-    /* shift to next bit */
-    b <<= 1;
-  }
-
-  mp_clear (&g);
-  return MP_OKAY;
-}
-
-/* End: bn_mp_expt_d.c */
-
-/* Start: bn_mp_exptmod.c */
-#line 0 "bn_mp_exptmod.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-
-/* this is a shell function that calls either the normal or Montgomery
- * exptmod functions.  Originally the call to the montgomery code was
- * embedded in the normal function but that wasted alot of stack space
- * for nothing (since 99% of the time the Montgomery code would be called)
- */
-int
-mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
-{
-  int dr;
-
-  /* modulus P must be positive */
-  if (P->sign == MP_NEG) {
-     return MP_VAL;
-  }
-
-  /* if exponent X is negative we have to recurse */
-  if (X->sign == MP_NEG) {
-     mp_int tmpG, tmpX;
-     int err;
-
-     /* first compute 1/G mod P */
-     if ((err = mp_init(&tmpG)) != MP_OKAY) {
-        return err;
-     }
-     if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) {
-        mp_clear(&tmpG);
-        return err;
-     }
-
-     /* now get |X| */
-     if ((err = mp_init(&tmpX)) != MP_OKAY) {
-        mp_clear(&tmpG);
-        return err;
-     }
-     if ((err = mp_abs(X, &tmpX)) != MP_OKAY) {
-        mp_clear_multi(&tmpG, &tmpX, NULL);
-        return err;
-     }
-
-     /* and now compute (1/G)**|X| instead of G**X [X < 0] */
-     err = mp_exptmod(&tmpG, &tmpX, P, Y);
-     mp_clear_multi(&tmpG, &tmpX, NULL);
-     return err;
-  }
-
-  dr = mp_dr_is_modulus(P);
-  if (dr == 0) {
-     dr = mp_reduce_is_2k(P) << 1;
-  }
-  
-  /* if the modulus is odd use the fast method */
-  if ((mp_isodd (P) == 1 || dr !=  0) && P->used > 4) {
-    return mp_exptmod_fast (G, X, P, Y, dr);
-  } else {
-    return s_mp_exptmod (G, X, P, Y);
-  }
-}
-
-
-/* End: bn_mp_exptmod.c */
-
-/* Start: bn_mp_exptmod_fast.c */
-#line 0 "bn_mp_exptmod_fast.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* computes Y == G^X mod P, HAC pp.616, Algorithm 14.85
- *
- * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
- * The value of k changes based on the size of the exponent.
- *
- * Uses Montgomery or Diminished Radix reduction [whichever appropriate]
- */
-int
-mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
-{
-  mp_int  M[256], res;
-  mp_digit buf, mp;
-  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
-  
-  /* use a pointer to the reduction algorithm.  This allows us to use
-   * one of many reduction algorithms without modding the guts of
-   * the code with if statements everywhere.  
-   */
-  int     (*redux)(mp_int*,mp_int*,mp_digit);
-
-  /* find window size */
-  x = mp_count_bits (X);
-  if (x <= 7) {
-    winsize = 2;
-  } else if (x <= 36) {
-    winsize = 3;
-  } else if (x <= 140) {
-    winsize = 4;
-  } else if (x <= 450) {
-    winsize = 5;
-  } else if (x <= 1303) {
-    winsize = 6;
-  } else if (x <= 3529) {
-    winsize = 7;
-  } else {
-    winsize = 8;
-  }
-
-#ifdef MP_LOW_MEM
-  if (winsize > 5) {
-     winsize = 5;
-  }
-#endif
-
-
-  /* init G array */
-  for (x = 0; x < (1 << winsize); x++) {
-    if ((err = mp_init (&M[x])) != MP_OKAY) {
-      for (y = 0; y < x; y++) {
-        mp_clear (&M[y]);
-      }
-      return err;
-    }
-  }
-
-  /* determine and setup reduction code */
-  if (redmode == 0) {
-     /* now setup montgomery  */
-     if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
-        goto __M;
-     }
-     
-     /* automatically pick the comba one if available (saves quite a few calls/ifs) */
-     if (((P->used * 2 + 1) < MP_WARRAY) &&
-          P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
-        redux = fast_mp_montgomery_reduce;
-     } else {
-        /* use slower baselien method */
-        redux = mp_montgomery_reduce;
-     }
-  } else if (redmode == 1) {
-     /* setup DR reduction */
-     mp_dr_setup(P, &mp);
-     redux = mp_dr_reduce;
-  } else {
-     /* setup 2k reduction */
-     if ((err = mp_reduce_2k_setup(P, &mp)) != MP_OKAY) {
-        goto __M;
-     }
-     redux = mp_reduce_2k;
-  }
-
-  /* setup result */
-  if ((err = mp_init (&res)) != MP_OKAY) {
-    goto __RES;
-  }
-
-  /* create M table
-   *
-   * The M table contains powers of the input base, e.g. M[x] = G^x mod P
-   *
-   * The first half of the table is not computed though accept for M[0] and M[1]
-   */
-
-  if (redmode == 0) {
-     /* now we need R mod m */
-     if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
-       goto __RES;
-     }
-
-     /* now set M[1] to G * R mod m */
-     if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
-       goto __RES;
-     }
-  } else {
-     mp_set(&res, 1);
-     if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
-        goto __RES;
-     }
-  }
-
-  /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
-  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
-    goto __RES;
-  }
-
-  for (x = 0; x < (winsize - 1); x++) {
-    if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
-      goto __RES;
-    }
-    if ((err = redux (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
-      goto __RES;
-    }
-  }
-
-  /* create upper table */
-  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
-    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
-      goto __RES;
-    }
-    if ((err = redux (&M[x], P, mp)) != MP_OKAY) {
-      goto __RES;
-    }
-  }
-
-  /* set initial mode and bit cnt */
-  mode   = 0;
-  bitcnt = 1;
-  buf    = 0;
-  digidx = X->used - 1;
-  bitcpy = 0;
-  bitbuf = 0;
-
-  for (;;) {
-    /* grab next digit as required */
-    if (--bitcnt == 0) {
-      if (digidx == -1) {
-        break;
-      }
-      buf = X->dp[digidx--];
-      bitcnt = (int) DIGIT_BIT;
-    }
-
-    /* grab the next msb from the exponent */
-    y = (mp_digit)(buf >> (DIGIT_BIT - 1)) & 1;
-    buf <<= (mp_digit)1;
-
-    /* if the bit is zero and mode == 0 then we ignore it
-     * These represent the leading zero bits before the first 1 bit
-     * in the exponent.  Technically this opt is not required but it
-     * does lower the # of trivial squaring/reductions used
-     */
-    if (mode == 0 && y == 0) {
-      continue;
-    }
-
-    /* if the bit is zero and mode == 1 then we square */
-    if (mode == 1 && y == 0) {
-      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto __RES;
-      }
-      if ((err = redux (&res, P, mp)) != MP_OKAY) {
-        goto __RES;
-      }
-      continue;
-    }
-
-    /* else we add it to the window */
-    bitbuf |= (y << (winsize - ++bitcpy));
-    mode = 2;
-
-    if (bitcpy == winsize) {
-      /* ok window is filled so square as required and multiply  */
-      /* square first */
-      for (x = 0; x < winsize; x++) {
-        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-          goto __RES;
-        }
-        if ((err = redux (&res, P, mp)) != MP_OKAY) {
-          goto __RES;
-        }
-      }
-
-      /* then multiply */
-      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
-        goto __RES;
-      }
-      if ((err = redux (&res, P, mp)) != MP_OKAY) {
-        goto __RES;
-      }
-
-      /* empty window and reset */
-      bitcpy = 0;
-      bitbuf = 0;
-      mode = 1;
-    }
-  }
-
-  /* if bits remain then square/multiply */
-  if (mode == 2 && bitcpy > 0) {
-    /* square then multiply if the bit is set */
-    for (x = 0; x < bitcpy; x++) {
-      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto __RES;
-      }
-      if ((err = redux (&res, P, mp)) != MP_OKAY) {
-        goto __RES;
-      }
-
-      bitbuf <<= 1;
-      if ((bitbuf & (1 << winsize)) != 0) {
-        /* then multiply */
-        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
-          goto __RES;
-        }
-        if ((err = redux (&res, P, mp)) != MP_OKAY) {
-          goto __RES;
-        }
-      }
-    }
-  }
-
-  if (redmode == 0) {
-     /* fixup result if Montgomery reduction is used */
-     if ((err = mp_montgomery_reduce (&res, P, mp)) != MP_OKAY) {
-       goto __RES;
-     }
-  }
-
-  mp_exch (&res, Y);
-  err = MP_OKAY;
-__RES:mp_clear (&res);
-__M:
-  for (x = 0; x < (1 << winsize); x++) {
-    mp_clear (&M[x]);
-  }
-  return err;
-}
-
-/* End: bn_mp_exptmod_fast.c */
-
-/* Start: bn_mp_gcd.c */
-#line 0 "bn_mp_gcd.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* Greatest Common Divisor using the binary method [Algorithm B, page 338, vol2 of TAOCP]
- */
-int
-mp_gcd (mp_int * a, mp_int * b, mp_int * c)
-{
-  mp_int  u, v, t;
-  int     k, res, neg;
-
-  /* either zero than gcd is the largest */
-  if (mp_iszero (a) == 1 && mp_iszero (b) == 0) {
-    return mp_copy (b, c);
-  }
-  if (mp_iszero (a) == 0 && mp_iszero (b) == 1) {
-    return mp_copy (a, c);
-  }
-  if (mp_iszero (a) == 1 && mp_iszero (b) == 1) {
-    mp_set (c, 1);
-    return MP_OKAY;
-  }
-
-  /* if both are negative they share (-1) as a common divisor */
-  neg = (a->sign == b->sign) ? a->sign : MP_ZPOS;
-
-  if ((res = mp_init_copy (&u, a)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_init_copy (&v, b)) != MP_OKAY) {
-    goto __U;
-  }
-
-  /* must be positive for the remainder of the algorithm */
-  u.sign = v.sign = MP_ZPOS;
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    goto __V;
-  }
-
-  /* B1.  Find power of two */
-  k = 0;
-  while (mp_iseven(&u) == 1 && mp_iseven(&v) == 1) {
-    ++k;
-    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
-      goto __T;
-    }
-    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
-      goto __T;
-    }
-  }
-
-  /* B2.  Initialize */
-  if (mp_isodd(&u) == 1) {
-    /* t = -v */
-    if ((res = mp_copy (&v, &t)) != MP_OKAY) {
-      goto __T;
-    }
-    t.sign = MP_NEG;
-  } else {
-    /* t = u */
-    if ((res = mp_copy (&u, &t)) != MP_OKAY) {
-      goto __T;
-    }
-  }
-
-  do {
-    /* B3 (and B4).  Halve t, if even */
-    while (t.used != 0 && mp_iseven(&t) == 1) {
-      if ((res = mp_div_2 (&t, &t)) != MP_OKAY) {
-        goto __T;
-      }
-    }
-
-    /* B5.  if t>0 then u=t otherwise v=-t */
-    if (t.used != 0 && t.sign != MP_NEG) {
-      if ((res = mp_copy (&t, &u)) != MP_OKAY) {
-        goto __T;
-      }
-    } else {
-      if ((res = mp_copy (&t, &v)) != MP_OKAY) {
-        goto __T;
-      }
-      v.sign = (v.sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
-    }
-
-    /* B6.  t = u - v, if t != 0 loop otherwise terminate */
-    if ((res = mp_sub (&u, &v, &t)) != MP_OKAY) {
-      goto __T;
-    }
-  } while (mp_iszero(&t) == 0);
-
-  /* multiply by 2^k which we divided out at the beginning */ 
-  if ((res = mp_mul_2d (&u, k, &u)) != MP_OKAY) {
-    goto __T;
-  }
-
-  mp_exch (&u, c);
-  c->sign = neg;
-  res = MP_OKAY;
-__T:mp_clear (&t);
-__V:mp_clear (&u);
-__U:mp_clear (&v);
-  return res;
-}
-
-/* End: bn_mp_gcd.c */
-
-/* Start: bn_mp_grow.c */
-#line 0 "bn_mp_grow.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* grow as required */
-int
-mp_grow (mp_int * a, int size)
-{
-  int     i;
-
-  /* if the alloc size is smaller alloc more ram */
-  if (a->alloc < size) {
-    /* ensure there are always at least MP_PREC digits extra on top */
-    size += (MP_PREC * 2) - (size & (MP_PREC - 1));     
-
-    a->dp = OPT_CAST realloc (a->dp, sizeof (mp_digit) * size);
-    if (a->dp == NULL) {
-      return MP_MEM;
-    }
-
-    /* zero excess digits */
-    i        = a->alloc;
-    a->alloc = size;
-    for (; i < a->alloc; i++) {
-      a->dp[i] = 0;
-    }
-  }
-  return MP_OKAY;
-}
-
-/* End: bn_mp_grow.c */
-
-/* Start: bn_mp_init.c */
-#line 0 "bn_mp_init.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with 
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* init a new bigint */
-int
-mp_init (mp_int * a)
-{
-  /* allocate ram required and clear it */
-  a->dp = OPT_CAST calloc (sizeof (mp_digit), MP_PREC);
-  if (a->dp == NULL) {
-    return MP_MEM;
-  }
-
-  /* set the used to zero, allocated digits to the default precision
-   * and sign to positive */
-  a->used  = 0;
-  a->alloc = MP_PREC;
-  a->sign  = MP_ZPOS;
-
-  return MP_OKAY;
-}
-
-/* End: bn_mp_init.c */
-
-/* Start: bn_mp_init_copy.c */
-#line 0 "bn_mp_init_copy.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* creates "a" then copies b into it */
-int
-mp_init_copy (mp_int * a, mp_int * b)
-{
-  int     res;
-
-  if ((res = mp_init (a)) != MP_OKAY) {
-    return res;
-  }
-  return mp_copy (b, a);
-}
-
-/* End: bn_mp_init_copy.c */
-
-/* Start: bn_mp_init_size.c */
-#line 0 "bn_mp_init_size.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* init a mp_init and grow it to a given size */
-int
-mp_init_size (mp_int * a, int size)
-{
-
-  /* pad size so there are always extra digits */
-  size += (MP_PREC * 2) - (size & (MP_PREC - 1));   
-  
-  /* alloc mem */
-  a->dp = OPT_CAST calloc (sizeof (mp_digit), size);
-  if (a->dp == NULL) {
-    return MP_MEM;
-  }
-  a->used = 0;
-  a->alloc = size;
-  a->sign = MP_ZPOS;
-
-  return MP_OKAY;
-}
-
-/* End: bn_mp_init_size.c */
-
-/* Start: bn_mp_invmod.c */
-#line 0 "bn_mp_invmod.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-int
-mp_invmod (mp_int * a, mp_int * b, mp_int * c)
-{
-  mp_int  x, y, u, v, A, B, C, D;
-  int     res;
-
-  /* b cannot be negative */
-  if (b->sign == MP_NEG) {
-    return MP_VAL;
-  }
-
-  /* if the modulus is odd we can use a faster routine instead */
-  if (mp_iseven (b) == 0) {
-    return fast_mp_invmod (a, b, c);
-  }
-  
-  /* init temps */
-  if ((res = mp_init_multi(&x, &y, &u, &v, &A, &B, &C, &D, NULL)) != MP_OKAY) {
-     return res;
-  }
-
-  /* x = a, y = b */
-  if ((res = mp_copy (a, &x)) != MP_OKAY) {
-    goto __ERR;
-  }
-  if ((res = mp_copy (b, &y)) != MP_OKAY) {
-    goto __ERR;
-  }
-
-  if ((res = mp_abs (&x, &x)) != MP_OKAY) {
-    goto __ERR;
-  }
-
-  /* 2. [modified] if x,y are both even then return an error! */
-  if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
-    res = MP_VAL;
-    goto __ERR;
-  }
-
-  /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
-  if ((res = mp_copy (&x, &u)) != MP_OKAY) {
-    goto __ERR;
-  }
-  if ((res = mp_copy (&y, &v)) != MP_OKAY) {
-    goto __ERR;
-  }
-  mp_set (&A, 1);
-  mp_set (&D, 1);
-
-
-top:
-  /* 4.  while u is even do */
-  while (mp_iseven (&u) == 1) {
-    /* 4.1 u = u/2 */
-    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
-      goto __ERR;
-    }
-    /* 4.2 if A or B is odd then */
-    if (mp_iseven (&A) == 0 || mp_iseven (&B) == 0) {
-      /* A = (A+y)/2, B = (B-x)/2 */
-      if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
-    goto __ERR;
-      }
-      if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
-    goto __ERR;
-      }
-    }
-    /* A = A/2, B = B/2 */
-    if ((res = mp_div_2 (&A, &A)) != MP_OKAY) {
-      goto __ERR;
-    }
-    if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
-      goto __ERR;
-    }
-  }
-
-
-  /* 5.  while v is even do */
-  while (mp_iseven (&v) == 1) {
-    /* 5.1 v = v/2 */
-    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
-      goto __ERR;
-    }
-    /* 5.2 if C,D are even then */
-    if (mp_iseven (&C) == 0 || mp_iseven (&D) == 0) {
-      /* C = (C+y)/2, D = (D-x)/2 */
-      if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
-    goto __ERR;
-      }
-      if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
-    goto __ERR;
-      }
-    }
-    /* C = C/2, D = D/2 */
-    if ((res = mp_div_2 (&C, &C)) != MP_OKAY) {
-      goto __ERR;
-    }
-    if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
-      goto __ERR;
-    }
-  }
-
-  /* 6.  if u >= v then */
-  if (mp_cmp (&u, &v) != MP_LT) {
-    /* u = u - v, A = A - C, B = B - D */
-    if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
-      goto __ERR;
-    }
-
-    if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) {
-      goto __ERR;
-    }
-
-    if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
-      goto __ERR;
-    }
-  } else {
-    /* v - v - u, C = C - A, D = D - B */
-    if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
-      goto __ERR;
-    }
-
-    if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) {
-      goto __ERR;
-    }
-
-    if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
-      goto __ERR;
-    }
-  }
-
-  /* if not zero goto step 4 */
-  if (mp_iszero (&u) == 0)
-    goto top;
-
-  /* now a = C, b = D, gcd == g*v */
-
-  /* if v != 1 then there is no inverse */
-  if (mp_cmp_d (&v, 1) != MP_EQ) {
-    res = MP_VAL;
-    goto __ERR;
-  }
-
-  /* a is now the inverse */
-  mp_exch (&C, c);
-  res = MP_OKAY;
-
-__ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
-  return res;
-}
-
-/* End: bn_mp_invmod.c */
-
-/* Start: bn_mp_jacobi.c */
-#line 0 "bn_mp_jacobi.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* computes the jacobi c = (a | n) (or Legendre if n is prime)
- * HAC pp. 73 Algorithm 2.149
- */
-int
-mp_jacobi (mp_int * a, mp_int * n, int *c)
-{
-  mp_int  a1, n1, e;
-  int     s, r, res;
-  mp_digit residue;
-
-  /* step 1.  if a == 0, return 0 */
-  if (mp_iszero (a) == 1) {
-    *c = 0;
-    return MP_OKAY;
-  }
-
-  /* step 2.  if a == 1, return 1 */
-  if (mp_cmp_d (a, 1) == MP_EQ) {
-    *c = 1;
-    return MP_OKAY;
-  }
-
-  /* default */
-  s = 0;
-
-  /* step 3.  write a = a1 * 2^e  */
-  if ((res = mp_init_copy (&a1, a)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_init (&n1)) != MP_OKAY) {
-    goto __A1;
-  }
-
-  if ((res = mp_init (&e)) != MP_OKAY) {
-    goto __N1;
-  }
-
-  while (mp_iseven (&a1) == 1) {
-    if ((res = mp_add_d (&e, 1, &e)) != MP_OKAY) {
-      goto __E;
-    }
-
-    if ((res = mp_div_2 (&a1, &a1)) != MP_OKAY) {
-      goto __E;
-    }
-  }
-
-  /* step 4.  if e is even set s=1 */
-  if (mp_iseven (&e) == 1) {
-    s = 1;
-  } else {
-    /* else set s=1 if n = 1/7 (mod 8) or s=-1 if n = 3/5 (mod 8) */
-    if ((res = mp_mod_d (n, 8, &residue)) != MP_OKAY) {
-      goto __E;
-    }
-
-    if (residue == 1 || residue == 7) {
-      s = 1;
-    } else if (residue == 3 || residue == 5) {
-      s = -1;
-    }
-  }
-
-  /* step 5.  if n == 3 (mod 4) *and* a1 == 3 (mod 4) then s = -s */
-  if ((res = mp_mod_d (n, 4, &residue)) != MP_OKAY) {
-    goto __E;
-  }
-  if (residue == 3) {
-    if ((res = mp_mod_d (&a1, 4, &residue)) != MP_OKAY) {
-      goto __E;
-    }
-    if (residue == 3) {
-      s = -s;
-    }
-  }
-
-  /* if a1 == 1 we're done */
-  if (mp_cmp_d (&a1, 1) == MP_EQ) {
-    *c = s;
-  } else {
-    /* n1 = n mod a1 */
-    if ((res = mp_mod (n, &a1, &n1)) != MP_OKAY) {
-      goto __E;
-    }
-    if ((res = mp_jacobi (&n1, &a1, &r)) != MP_OKAY) {
-      goto __E;
-    }
-    *c = s * r;
-  }
-
-  /* done */
-  res = MP_OKAY;
-__E:mp_clear (&e);
-__N1:mp_clear (&n1);
-__A1:mp_clear (&a1);
-  return res;
-}
-
-/* End: bn_mp_jacobi.c */
-
-/* Start: bn_mp_karatsuba_mul.c */
-#line 0 "bn_mp_karatsuba_mul.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* c = |a| * |b| using Karatsuba Multiplication using 
- * three half size multiplications
- *
- * Let B represent the radix [e.g. 2**DIGIT_BIT] and 
- * let n represent half of the number of digits in 
- * the min(a,b)
- *
- * a = a1 * B**n + a0
- * b = b1 * B**n + b0
- *
- * Then, a * b => 
-   a1b1 * B**2n + ((a1 - a0)(b1 - b0) + a0b0 + a1b1) * B + a0b0
- *
- * Note that a1b1 and a0b0 are used twice and only need to be 
- * computed once.  So in total three half size (half # of 
- * digit) multiplications are performed, a0b0, a1b1 and 
- * (a1-b1)(a0-b0)
- *
- * Note that a multiplication of half the digits requires
- * 1/4th the number of single precision multiplications so in 
- * total after one call 25% of the single precision multiplications 
- * are saved.  Note also that the call to mp_mul can end up back 
- * in this function if the a0, a1, b0, or b1 are above the threshold.  
- * This is known as divide-and-conquer and leads to the famous 
- * O(N**lg(3)) or O(N**1.584) work which is asymptopically lower than 
- * the standard O(N**2) that the baseline/comba methods use.  
- * Generally though the overhead of this method doesn't pay off 
- * until a certain size (N ~ 80) is reached.
- */
-int
-mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
-{
-  mp_int  x0, x1, y0, y1, t1, x0y0, x1y1;
-  int     B, err;
-
-  err = MP_MEM;
-
-  /* min # of digits */
-  B = MIN (a->used, b->used);
-
-  /* now divide in two */
-  B = B / 2;
-
-  /* init copy all the temps */
-  if (mp_init_size (&x0, B) != MP_OKAY)
-    goto ERR;
-  if (mp_init_size (&x1, a->used - B) != MP_OKAY)
-    goto X0;
-  if (mp_init_size (&y0, B) != MP_OKAY)
-    goto X1;
-  if (mp_init_size (&y1, b->used - B) != MP_OKAY)
-    goto Y0;
-
-  /* init temps */
-  if (mp_init_size (&t1, B * 2) != MP_OKAY)
-    goto Y1;
-  if (mp_init_size (&x0y0, B * 2) != MP_OKAY)
-    goto T1;
-  if (mp_init_size (&x1y1, B * 2) != MP_OKAY)
-    goto X0Y0;
-
-  /* now shift the digits */
-  x0.sign = x1.sign = a->sign;
-  y0.sign = y1.sign = b->sign;
-
-  x0.used = y0.used = B;
-  x1.used = a->used - B;
-  y1.used = b->used - B;
-
-  {
-    register int x;
-    register mp_digit *tmpa, *tmpb, *tmpx, *tmpy;
-
-    /* we copy the digits directly instead of using higher level functions
-     * since we also need to shift the digits
-     */
-    tmpa = a->dp;
-    tmpb = b->dp;
-
-    tmpx = x0.dp;
-    tmpy = y0.dp;
-    for (x = 0; x < B; x++) {
-      *tmpx++ = *tmpa++;
-      *tmpy++ = *tmpb++;
-    }
-
-    tmpx = x1.dp;
-    for (x = B; x < a->used; x++) {
-      *tmpx++ = *tmpa++;
-    }
-
-    tmpy = y1.dp;
-    for (x = B; x < b->used; x++) {
-      *tmpy++ = *tmpb++;
-    }
-  }
-
-  /* only need to clamp the lower words since by definition the 
-   * upper words x1/y1 must have a known number of digits
-   */
-  mp_clamp (&x0);
-  mp_clamp (&y0);
-
-  /* now calc the products x0y0 and x1y1 */
-  /* after this x0 is no longer required, free temp [x0==t2]! */
-  if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)  
-    goto X1Y1;          /* x0y0 = x0*y0 */
-  if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
-    goto X1Y1;          /* x1y1 = x1*y1 */
-
-  /* now calc x1-x0 and y1-y0 */
-  if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
-    goto X1Y1;          /* t1 = x1 - x0 */
-  if (mp_sub (&y1, &y0, &x0) != MP_OKAY)
-    goto X1Y1;          /* t2 = y1 - y0 */
-  if (mp_mul (&t1, &x0, &t1) != MP_OKAY)
-    goto X1Y1;          /* t1 = (x1 - x0) * (y1 - y0) */
-
-  /* add x0y0 */
-  if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY)
-    goto X1Y1;          /* t2 = x0y0 + x1y1 */
-  if (mp_sub (&x0, &t1, &t1) != MP_OKAY)
-    goto X1Y1;          /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
-
-  /* shift by B */
-  if (mp_lshd (&t1, B) != MP_OKAY)
-    goto X1Y1;          /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
-  if (mp_lshd (&x1y1, B * 2) != MP_OKAY)
-    goto X1Y1;          /* x1y1 = x1y1 << 2*B */
-
-  if (mp_add (&x0y0, &t1, &t1) != MP_OKAY)
-    goto X1Y1;          /* t1 = x0y0 + t1 */
-  if (mp_add (&t1, &x1y1, c) != MP_OKAY)
-    goto X1Y1;          /* t1 = x0y0 + t1 + x1y1 */
-
-  err = MP_OKAY;
-
-X1Y1:mp_clear (&x1y1);
-X0Y0:mp_clear (&x0y0);
-T1:mp_clear (&t1);
-Y1:mp_clear (&y1);
-Y0:mp_clear (&y0);
-X1:mp_clear (&x1);
-X0:mp_clear (&x0);
-ERR:
-  return err;
-}
-
-/* End: bn_mp_karatsuba_mul.c */
-
-/* Start: bn_mp_karatsuba_sqr.c */
-#line 0 "bn_mp_karatsuba_sqr.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* Karatsuba squaring, computes b = a*a using three 
- * half size squarings
- *
- * See comments of mp_karatsuba_mul for details.  It 
- * is essentially the same algorithm but merely 
- * tuned to perform recursive squarings.
- */
-int
-mp_karatsuba_sqr (mp_int * a, mp_int * b)
-{
-  mp_int  x0, x1, t1, t2, x0x0, x1x1;
-  int     B, err;
-
-  err = MP_MEM;
-
-  /* min # of digits */
-  B = a->used;
-
-  /* now divide in two */
-  B = B / 2;
-
-  /* init copy all the temps */
-  if (mp_init_size (&x0, B) != MP_OKAY)
-    goto ERR;
-  if (mp_init_size (&x1, a->used - B) != MP_OKAY)
-    goto X0;
-
-  /* init temps */
-  if (mp_init_size (&t1, a->used * 2) != MP_OKAY)
-    goto X1;
-  if (mp_init_size (&t2, a->used * 2) != MP_OKAY)
-    goto T1;
-  if (mp_init_size (&x0x0, B * 2) != MP_OKAY)
-    goto T2;
-  if (mp_init_size (&x1x1, (a->used - B) * 2) != MP_OKAY)
-    goto X0X0;
-
-  {
-    register int x;
-    register mp_digit *dst, *src;
-
-    src = a->dp;
-
-    /* now shift the digits */
-    dst = x0.dp;
-    for (x = 0; x < B; x++) {
-      *dst++ = *src++;
-    }
-
-    dst = x1.dp;
-    for (x = B; x < a->used; x++) {
-      *dst++ = *src++;
-    }
-  }
-
-  x0.used = B;
-  x1.used = a->used - B;
-
-  mp_clamp (&x0);
-
-  /* now calc the products x0*x0 and x1*x1 */
-  if (mp_sqr (&x0, &x0x0) != MP_OKAY)
-    goto X1X1;           /* x0x0 = x0*x0 */
-  if (mp_sqr (&x1, &x1x1) != MP_OKAY)
-    goto X1X1;           /* x1x1 = x1*x1 */
-
-  /* now calc (x1-x0)**2 */
-  if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
-    goto X1X1;           /* t1 = x1 - x0 */
-  if (mp_sqr (&t1, &t1) != MP_OKAY)
-    goto X1X1;           /* t1 = (x1 - x0) * (x1 - x0) */
-
-  /* add x0y0 */
-  if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
-    goto X1X1;           /* t2 = x0x0 + x1x1 */
-  if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
-    goto X1X1;           /* t1 = x0x0 + x1x1 - (x1-x0)*(x1-x0) */
-
-  /* shift by B */
-  if (mp_lshd (&t1, B) != MP_OKAY)
-    goto X1X1;           /* t1 = (x0x0 + x1x1 - (x1-x0)*(x1-x0))<<B */
-  if (mp_lshd (&x1x1, B * 2) != MP_OKAY)
-    goto X1X1;           /* x1x1 = x1x1 << 2*B */
-
-  if (mp_add (&x0x0, &t1, &t1) != MP_OKAY)
-    goto X1X1;           /* t1 = x0x0 + t1 */
-  if (mp_add (&t1, &x1x1, b) != MP_OKAY)
-    goto X1X1;           /* t1 = x0x0 + t1 + x1x1 */
-
-  err = MP_OKAY;
-
-X1X1:mp_clear (&x1x1);
-X0X0:mp_clear (&x0x0);
-T2:mp_clear (&t2);
-T1:mp_clear (&t1);
-X1:mp_clear (&x1);
-X0:mp_clear (&x0);
-ERR:
-  return err;
-}
-
-/* End: bn_mp_karatsuba_sqr.c */
-
-/* Start: bn_mp_lcm.c */
-#line 0 "bn_mp_lcm.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* computes least common multiple as a*b/(a, b) */
-int
-mp_lcm (mp_int * a, mp_int * b, mp_int * c)
-{
-  int     res;
-  mp_int  t;
-
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_mul (a, b, &t)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
-  }
-
-  if ((res = mp_gcd (a, b, c)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
-  }
-
-  res = mp_div (&t, c, c, NULL);
-  mp_clear (&t);
-  return res;
-}
-
-/* End: bn_mp_lcm.c */
-
-/* Start: bn_mp_lshd.c */
-#line 0 "bn_mp_lshd.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* shift left a certain amount of digits */
-int
-mp_lshd (mp_int * a, int b)
-{
-  int     x, res;
-
-  /* if its less than zero return */
-  if (b <= 0) {
-    return MP_OKAY;
-  }
-
-  /* grow to fit the new digits */
-  if (a->alloc < a->used + b) {
-     if ((res = mp_grow (a, a->used + b)) != MP_OKAY) {
-       return res;
-     }
-  }
-
-  {
-    register mp_digit *top, *bottom;
-
-    /* increment the used by the shift amount then copy upwards */
-    a->used += b;
-
-    /* top */
-    top = a->dp + a->used - 1;
-
-    /* base */
-    bottom = a->dp + a->used - 1 - b;
-
-    /* much like mp_rshd this is implemented using a sliding window
-     * except the window goes the otherway around.  Copying from
-     * the bottom to the top.  see bn_mp_rshd.c for more info.
-     */
-    for (x = a->used - 1; x >= b; x--) {
-      *top-- = *bottom--;
-    }
-
-    /* zero the lower digits */
-    top = a->dp;
-    for (x = 0; x < b; x++) {
-      *top++ = 0;
-    }
-  }
-  return MP_OKAY;
-}
-
-/* End: bn_mp_lshd.c */
-
-/* Start: bn_mp_mod.c */
-#line 0 "bn_mp_mod.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* c = a mod b, 0 <= c < b */
-int
-mp_mod (mp_int * a, mp_int * b, mp_int * c)
-{
-  mp_int  t;
-  int     res;
-
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_div (a, b, NULL, &t)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
-  }
-
-  if (t.sign == MP_NEG) {
-    res = mp_add (b, &t, c);
-  } else {
-    res = MP_OKAY;
-    mp_exch (&t, c);
-  }
-
-  mp_clear (&t);
-  return res;
-}
-
-/* End: bn_mp_mod.c */
-
-/* Start: bn_mp_mod_2d.c */
-#line 0 "bn_mp_mod_2d.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* calc a value mod 2^b */
-int
-mp_mod_2d (mp_int * a, int b, mp_int * c)
-{
-  int     x, res;
-
-
-  /* if b is <= 0 then zero the int */
-  if (b <= 0) {
-    mp_zero (c);
-    return MP_OKAY;
-  }
-
-  /* if the modulus is larger than the value than return */
-  if (b > (int) (a->used * DIGIT_BIT)) {
-    res = mp_copy (a, c);
-    return res;
-  }
-
-  /* copy */
-  if ((res = mp_copy (a, c)) != MP_OKAY) {
-    return res;
-  }
-
-  /* zero digits above the last digit of the modulus */
-  for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
-    c->dp[x] = 0;
-  }
-  /* clear the digit that is not completely outside/inside the modulus */
-  c->dp[b / DIGIT_BIT] &=
-    (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digit) 1));
-  mp_clamp (c);
-  return MP_OKAY;
-}
-
-/* End: bn_mp_mod_2d.c */
-
-/* Start: bn_mp_mod_d.c */
-#line 0 "bn_mp_mod_d.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-int
-mp_mod_d (mp_int * a, mp_digit b, mp_digit * c)
-{
-  return mp_div_d(a, b, NULL, c);
-}
-
-/* End: bn_mp_mod_d.c */
-
-/* Start: bn_mp_montgomery_calc_normalization.c */
-#line 0 "bn_mp_montgomery_calc_normalization.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* calculates a = B^n mod b for Montgomery reduction
- * Where B is the base [e.g. 2^DIGIT_BIT].
- * B^n mod b is computed by first computing
- * A = B^(n-1) which doesn't require a reduction but a simple OR.
- * then C = A * B = B^n is computed by performing upto DIGIT_BIT
- * shifts with subtractions when the result is greater than b.
- *
- * The method is slightly modified to shift B unconditionally upto just under
- * the leading bit of b.  This saves alot of multiple precision shifting.
- */
-int
-mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
-{
-  int     x, bits, res;
-
-  /* how many bits of last digit does b use */
-  bits = mp_count_bits (b) % DIGIT_BIT;
-
-  /* compute A = B^(n-1) * 2^(bits-1) */
-  if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
-    return res;
-  }
-
-  /* now compute C = A * B mod b */
-  for (x = bits - 1; x < (int)DIGIT_BIT; x++) {
-    if ((res = mp_mul_2 (a, a)) != MP_OKAY) {
-      return res;
-    }
-    if (mp_cmp_mag (a, b) != MP_LT) {
-      if ((res = s_mp_sub (a, b, a)) != MP_OKAY) {
-        return res;
-      }
-    }
-  }
-
-  return MP_OKAY;
-}
-
-/* End: bn_mp_montgomery_calc_normalization.c */
-
-/* Start: bn_mp_montgomery_reduce.c */
-#line 0 "bn_mp_montgomery_reduce.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* computes xR**-1 == x (mod N) via Montgomery Reduction */
-int
-mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
-{
-  int     ix, res, digs;
-  mp_digit mu;
-
-  /* can the fast reduction [comba] method be used?
-   *
-   * Note that unlike in mp_mul you're safely allowed *less*
-   * than the available columns [255 per default] since carries
-   * are fixed up in the inner loop.
-   */
-  digs = n->used * 2 + 1;
-  if ((digs < MP_WARRAY) && 
-      n->used < 
-      (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
-    return fast_mp_montgomery_reduce (x, n, rho);
-  }
-
-  /* grow the input as required */
-  if (x->alloc < digs) {
-    if ((res = mp_grow (x, digs)) != MP_OKAY) {
-      return res;
-    }
-  }
-  x->used = digs;
-
-  for (ix = 0; ix < n->used; ix++) {
-    /* mu = ai * m' mod b */
-    mu = (x->dp[ix] * rho) & MP_MASK;
-
-    /* a = a + mu * m * b**i */
-    {
-      register int iy;
-      register mp_digit *tmpn, *tmpx, u;
-      register mp_word r;
-
-      /* aliases */
-      tmpn = n->dp;
-      tmpx = x->dp + ix;
-
-      /* set the carry to zero */
-      u = 0;
-      
-      /* Multiply and add in place */
-      for (iy = 0; iy < n->used; iy++) {
-        r = ((mp_word) mu) * ((mp_word) * tmpn++) + 
-            ((mp_word) u) + ((mp_word) * tmpx);
-        u = (r >> ((mp_word) DIGIT_BIT));
-        *tmpx++ = (r & ((mp_word) MP_MASK));
-      }
-      /* propagate carries */
-      while (u) {
-        *tmpx += u;
-        u = *tmpx >> DIGIT_BIT;
-        *tmpx++ &= MP_MASK;
-      }
-    }
-  }
-
-  /* x = x/b**n.used */
-  mp_rshd (x, n->used);
-
-  /* if A >= m then A = A - m */
-  if (mp_cmp_mag (x, n) != MP_LT) {
-    return s_mp_sub (x, n, x);
-  }
-
-  return MP_OKAY;
-}
-
-/* End: bn_mp_montgomery_reduce.c */
-
-/* Start: bn_mp_montgomery_setup.c */
-#line 0 "bn_mp_montgomery_setup.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* setups the montgomery reduction stuff */
-int
-mp_montgomery_setup (mp_int * n, mp_digit * rho)
-{
-  mp_digit x, b;
-
-/* fast inversion mod 2**k
- *
- * Based on the fact that
- *
- * XA = 1 (mod 2**n)  =>  (X(2-XA)) A = 1 (mod 2**2n)
- *                    =>  2*X*A - X*X*A*A = 1
- *                    =>  2*(1) - (1)     = 1
- */
-  b = n->dp[0];
-
-  if ((b & 1) == 0) {
-    return MP_VAL;
-  }
-
-  x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
-  x *= 2 - b * x;               /* here x*a==1 mod 2**8 */
-#if !defined(MP_8BIT)
-  x *= 2 - b * x;               /* here x*a==1 mod 2**16 */
-#endif
-#if defined(MP_64BIT) || !(defined(MP_8BIT) || defined(MP_16BIT))
-  x *= 2 - b * x;               /* here x*a==1 mod 2**32 */
-#endif
-#ifdef MP_64BIT
-  x *= 2 - b * x;               /* here x*a==1 mod 2**64 */
-#endif
-
-  /* rho = -1/m mod b */
-  *rho = (((mp_digit) 1 << ((mp_digit) DIGIT_BIT)) - x) & MP_MASK;
-
-  return MP_OKAY;
-}
-
-/* End: bn_mp_montgomery_setup.c */
-
-/* Start: bn_mp_mul.c */
-#line 0 "bn_mp_mul.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* high level multiplication (handles sign) */
-int
-mp_mul (mp_int * a, mp_int * b, mp_int * c)
-{
-  int     res, neg;
-  neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
-  
-  if (MIN (a->used, b->used) >= TOOM_MUL_CUTOFF) {
-    res = mp_toom_mul(a, b, c);
-  } else if (MIN (a->used, b->used) >= KARATSUBA_MUL_CUTOFF) {
-    res = mp_karatsuba_mul (a, b, c);
-  } else {
-
-    /* can we use the fast multiplier?
-     *
-     * The fast multiplier can be used if the output will 
-     * have less than MP_WARRAY digits and the number of 
-     * digits won't affect carry propagation
-     */
-    int     digs = a->used + b->used + 1;
-
-    if ((digs < MP_WARRAY) &&
-        MIN(a->used, b->used) <= 
-        (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
-      res = fast_s_mp_mul_digs (a, b, c, digs);
-    } else {
-      res = s_mp_mul (a, b, c);
-    }
-
-  }
-  c->sign = neg;
-  return res;
-}
-
-/* End: bn_mp_mul.c */
-
-/* Start: bn_mp_mul_2.c */
-#line 0 "bn_mp_mul_2.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* b = a*2 */
-int
-mp_mul_2 (mp_int * a, mp_int * b)
-{
-  int     x, res, oldused;
-
-  /* grow to accomodate result */
-  if (b->alloc < a->used + 1) {
-    if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) {
-      return res;
-    }
-  }
-
-  oldused = b->used;
-  b->used = a->used;
-
-  {
-    register mp_digit r, rr, *tmpa, *tmpb;
-
-    /* alias for source */
-    tmpa = a->dp;
-    
-    /* alias for dest */
-    tmpb = b->dp;
-
-    /* carry */
-    r = 0;
-    for (x = 0; x < a->used; x++) {
-    
-      /* get what will be the *next* carry bit from the 
-       * MSB of the current digit 
-       */
-      rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1));
-      
-      /* now shift up this digit, add in the carry [from the previous] */
-      *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK;
-      
-      /* copy the carry that would be from the source 
-       * digit into the next iteration 
-       */
-      r = rr;
-    }
-
-    /* new leading digit? */
-    if (r != 0) {
-      /* add a MSB which is always 1 at this point */
-      *tmpb = 1;
-      ++b->used;
-    }
-
-    /* now zero any excess digits on the destination 
-     * that we didn't write to 
-     */
-    tmpb = b->dp + b->used;
-    for (x = b->used; x < oldused; x++) {
-      *tmpb++ = 0;
-    }
-  }
-  b->sign = a->sign;
-  return MP_OKAY;
-}
-
-/* End: bn_mp_mul_2.c */
-
-/* Start: bn_mp_mul_2d.c */
-#line 0 "bn_mp_mul_2d.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* NOTE:  This routine requires updating.  For instance the c->used = c->alloc bit
-   is wrong.  We should just shift c->used digits then set the carry as c->dp[c->used] = carry
- 
-   To be fixed for LTM 0.18
- */
-
-/* shift left by a certain bit count */
-int
-mp_mul_2d (mp_int * a, int b, mp_int * c)
-{
-  mp_digit d;
-  int      res;
-
-  /* copy */
-  if (a != c) {
-     if ((res = mp_copy (a, c)) != MP_OKAY) {
-       return res;
-     }
-  }
-
-  if (c->alloc < (int)(c->used + b/DIGIT_BIT + 2)) {
-     if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 2)) != MP_OKAY) {
-       return res;
-     }
-  }
-
-  /* shift by as many digits in the bit count */
-  if (b >= (int)DIGIT_BIT) {
-    if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) {
-      return res;
-    }
-  }
-  c->used = c->alloc;
-
-  /* shift any bit count < DIGIT_BIT */
-  d = (mp_digit) (b % DIGIT_BIT);
-  if (d != 0) {
-    register mp_digit *tmpc, mask, r, rr;
-    register int x;
-
-    /* bitmask for carries */
-    mask = (((mp_digit)1) << d) - 1;
-
-    /* alias */
-    tmpc = c->dp;
-
-    /* carry */
-    r    = 0;
-    for (x = 0; x < c->used; x++) {
-      /* get the higher bits of the current word */
-      rr = (*tmpc >> (DIGIT_BIT - d)) & mask;
-
-      /* shift the current word and OR in the carry */
-      *tmpc = ((*tmpc << d) | r) & MP_MASK;
-      ++tmpc;
-
-      /* set the carry to the carry bits of the current word */
-      r = rr;
-    }
-  }
-  mp_clamp (c);
-  return MP_OKAY;
-}
-
-/* End: bn_mp_mul_2d.c */
-
-/* Start: bn_mp_mul_d.c */
-#line 0 "bn_mp_mul_d.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* multiply by a digit */
-int
-mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
-{
-  int     res, pa, olduse;
-
-  /* make sure c is big enough to hold a*b */
-  pa = a->used;
-  if (c->alloc < pa + 1) {
-    if ((res = mp_grow (c, pa + 1)) != MP_OKAY) {
-      return res;
-    }
-  }
-
-  /* get the original destinations used count */
-  olduse = c->used;
-
-  /* set the new temporary used count */
-  c->used = pa + 1;
-
-  {
-    register mp_digit u, *tmpa, *tmpc;
-    register mp_word r;
-    register int ix;
-
-    /* alias for a->dp [source] */
-    tmpa = a->dp;
-
-    /* alias for c->dp [dest] */
-    tmpc = c->dp;
-
-    /* zero carry */
-    u = 0;
-    for (ix = 0; ix < pa; ix++) {
-      /* compute product and carry sum for this term */
-      r = ((mp_word) u) + ((mp_word) * tmpa++) * ((mp_word) b);
-
-      /* mask off higher bits to get a single digit */
-      *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
-
-      /* send carry into next iteration */
-      u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
-    }
-    /* store final carry [if any] */
-    *tmpc++ = u;
-
-    /* now zero digits above the top */
-    for (; pa < olduse; pa++) {
-       *tmpc++ = 0;
-    }
-  }
-
-  mp_clamp (c);
-  return MP_OKAY;
-}
-
-/* End: bn_mp_mul_d.c */
-
-/* Start: bn_mp_mulmod.c */
-#line 0 "bn_mp_mulmod.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* d = a * b (mod c) */
-int
-mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
-{
-  int     res;
-  mp_int  t;
-
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_mul (a, b, &t)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
-  }
-  res = mp_mod (&t, c, d);
-  mp_clear (&t);
-  return res;
-}
-
-/* End: bn_mp_mulmod.c */
-
-/* Start: bn_mp_multi.c */
-#line 0 "bn_mp_multi.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-#include <stdarg.h>
-
-int mp_init_multi(mp_int *mp, ...) 
-{
-    mp_err res = MP_OKAY;      /* Assume ok until proven otherwise */
-    int n = 0;                 /* Number of ok inits */
-    mp_int* cur_arg = mp;
-    va_list args;
-
-    va_start(args, mp);        /* init args to next argument from caller */
-    while (cur_arg != NULL) {
-        if (mp_init(cur_arg) != MP_OKAY) {
-            /* Oops - error! Back-track and mp_clear what we already
-               succeeded in init-ing, then return error.
-            */
-            va_list clean_args;
-            
-            /* end the current list */
-            va_end(args);
-            
-            /* now start cleaning up */            
-            cur_arg = mp;
-            va_start(clean_args, mp);
-            while (n--) {
-                mp_clear(cur_arg);
-                cur_arg = va_arg(clean_args, mp_int*);
-            }
-            va_end(clean_args);
-            res = MP_MEM;
-            break;
-        }
-        n++;
-        cur_arg = va_arg(args, mp_int*);
-    }
-    va_end(args);
-    return res;                /* Assumed ok, if error flagged above. */
-}
-
-void mp_clear_multi(mp_int *mp, ...) 
-{
-    mp_int* next_mp = mp;
-    va_list args;
-    va_start(args, mp);
-    while (next_mp != NULL) {
-        mp_clear(next_mp);
-        next_mp = va_arg(args, mp_int*);
-    }
-    va_end(args);
-}
-
-/* End: bn_mp_multi.c */
-
-/* Start: bn_mp_n_root.c */
-#line 0 "bn_mp_n_root.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* find the n'th root of an integer 
- *
- * Result found such that (c)^b <= a and (c+1)^b > a 
- *
- * This algorithm uses Newton's approximation x[i+1] = x[i] - f(x[i])/f'(x[i]) 
- * which will find the root in log(N) time where each step involves a fair bit.  This
- * is not meant to find huge roots [square and cube at most].
- */
-int
-mp_n_root (mp_int * a, mp_digit b, mp_int * c)
-{
-  mp_int  t1, t2, t3;
-  int     res, neg;
-
-  /* input must be positive if b is even */
-  if ((b & 1) == 0 && a->sign == MP_NEG) {
-    return MP_VAL;
-  }
-
-  if ((res = mp_init (&t1)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_init (&t2)) != MP_OKAY) {
-    goto __T1;
-  }
-
-  if ((res = mp_init (&t3)) != MP_OKAY) {
-    goto __T2;
-  }
-
-  /* if a is negative fudge the sign but keep track */
-  neg = a->sign;
-  a->sign = MP_ZPOS;
-
-  /* t2 = 2 */
-  mp_set (&t2, 2);
-
-  do {
-    /* t1 = t2 */
-    if ((res = mp_copy (&t2, &t1)) != MP_OKAY) {
-      goto __T3;
-    }
-
-    /* t2 = t1 - ((t1^b - a) / (b * t1^(b-1))) */
-    if ((res = mp_expt_d (&t1, b - 1, &t3)) != MP_OKAY) {   /* t3 = t1^(b-1) */
-      goto __T3;
-    }
-
-    /* numerator */
-    if ((res = mp_mul (&t3, &t1, &t2)) != MP_OKAY) {    /* t2 = t1^b */
-      goto __T3;
-    }
-
-    if ((res = mp_sub (&t2, a, &t2)) != MP_OKAY) {  /* t2 = t1^b - a */
-      goto __T3;
-    }
-
-    if ((res = mp_mul_d (&t3, b, &t3)) != MP_OKAY) {    /* t3 = t1^(b-1) * b  */
-      goto __T3;
-    }
-
-    if ((res = mp_div (&t2, &t3, &t3, NULL)) != MP_OKAY) {  /* t3 = (t1^b - a)/(b * t1^(b-1)) */
-      goto __T3;
-    }
-
-    if ((res = mp_sub (&t1, &t3, &t2)) != MP_OKAY) {
-      goto __T3;
-    }
-  }
-  while (mp_cmp (&t1, &t2) != MP_EQ);
-
-  /* result can be off by a few so check */
-  for (;;) {
-    if ((res = mp_expt_d (&t1, b, &t2)) != MP_OKAY) {
-      goto __T3;
-    }
-
-    if (mp_cmp (&t2, a) == MP_GT) {
-      if ((res = mp_sub_d (&t1, 1, &t1)) != MP_OKAY) {
-    goto __T3;
-      }
-    } else {
-      break;
-    }
-  }
-
-  /* reset the sign of a first */
-  a->sign = neg;
-
-  /* set the result */
-  mp_exch (&t1, c);
-
-  /* set the sign of the result */
-  c->sign = neg;
-
-  res = MP_OKAY;
-
-__T3:mp_clear (&t3);
-__T2:mp_clear (&t2);
-__T1:mp_clear (&t1);
-  return res;
-}
-
-/* End: bn_mp_n_root.c */
-
-/* Start: bn_mp_neg.c */
-#line 0 "bn_mp_neg.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* b = -a */
-int
-mp_neg (mp_int * a, mp_int * b)
-{
-  int     res;
-  if ((res = mp_copy (a, b)) != MP_OKAY) {
-    return res;
-  }
-  b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
-  return MP_OKAY;
-}
-
-/* End: bn_mp_neg.c */
-
-/* Start: bn_mp_or.c */
-#line 0 "bn_mp_or.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* OR two ints together */
-int
-mp_or (mp_int * a, mp_int * b, mp_int * c)
-{
-  int     res, ix, px;
-  mp_int  t, *x;
-
-  if (a->used > b->used) {
-    if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-      return res;
-    }
-    px = b->used;
-    x = b;
-  } else {
-    if ((res = mp_init_copy (&t, b)) != MP_OKAY) {
-      return res;
-    }
-    px = a->used;
-    x = a;
-  }
-
-  for (ix = 0; ix < px; ix++) {
-    t.dp[ix] |= x->dp[ix];
-  }
-  mp_clamp (&t);
-  mp_exch (c, &t);
-  mp_clear (&t);
-  return MP_OKAY;
-}
-
-/* End: bn_mp_or.c */
-
-/* Start: bn_mp_prime_fermat.c */
-#line 0 "bn_mp_prime_fermat.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* performs one Fermat test.
- * 
- * If "a" were prime then b^a == b (mod a) since the order of
- * the multiplicative sub-group would be phi(a) = a-1.  That means
- * it would be the same as b^(a mod (a-1)) == b^1 == b (mod a).
- *
- * Sets result to 1 if the congruence holds, or zero otherwise.
- */
-int
-mp_prime_fermat (mp_int * a, mp_int * b, int *result)
-{
-  mp_int  t;
-  int     err;
-
-  /* default to fail */
-  *result = 0;
-
-  /* init t */
-  if ((err = mp_init (&t)) != MP_OKAY) {
-    return err;
-  }
-
-  /* compute t = b^a mod a */
-  if ((err = mp_exptmod (b, a, a, &t)) != MP_OKAY) {
-    goto __T;
-  }
-
-  /* is it equal to b? */
-  if (mp_cmp (&t, b) == MP_EQ) {
-    *result = 1;
-  }
-
-  err = MP_OKAY;
-__T:mp_clear (&t);
-  return err;
-}
-
-/* End: bn_mp_prime_fermat.c */
-
-/* Start: bn_mp_prime_is_divisible.c */
-#line 0 "bn_mp_prime_is_divisible.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* determines if an integers is divisible by one of the first 256 primes or not
- *
- * sets result to 0 if not, 1 if yes
- */
-int
-mp_prime_is_divisible (mp_int * a, int *result)
-{
-  int     err, ix;
-  mp_digit res;
-
-  /* default to not */
-  *result = 0;
-
-  for (ix = 0; ix < PRIME_SIZE; ix++) {
-    /* is it equal to the prime? */
-    if (mp_cmp_d (a, __prime_tab[ix]) == MP_EQ) {
-      *result = 1;
-      return MP_OKAY;
-    }
-
-    /* what is a mod __prime_tab[ix] */
-    if ((err = mp_mod_d (a, __prime_tab[ix], &res)) != MP_OKAY) {
-      return err;
-    }
-
-    /* is the residue zero? */
-    if (res == 0) {
-      *result = 1;
-      return MP_OKAY;
-    }
-  }
-
-  return MP_OKAY;
-}
-
-/* End: bn_mp_prime_is_divisible.c */
-
-/* Start: bn_mp_prime_is_prime.c */
-#line 0 "bn_mp_prime_is_prime.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* performs a variable number of rounds of Miller-Rabin
- *
- * Probability of error after t rounds is no more than
- * (1/4)^t when 1 <= t <= 256
- *
- * Sets result to 1 if probably prime, 0 otherwise
- */
-int
-mp_prime_is_prime (mp_int * a, int t, int *result)
-{
-  mp_int  b;
-  int     ix, err, res;
-
-  /* default to no */
-  *result = 0;
-
-  /* valid value of t? */
-  if (t < 1 || t > PRIME_SIZE) {
-    return MP_VAL;
-  }
-
-  /* is the input equal to one of the primes in the table? */
-  for (ix = 0; ix < PRIME_SIZE; ix++) {
-      if (mp_cmp_d(a, __prime_tab[ix]) == MP_EQ) {
-         *result = 1;
-         return MP_OKAY;
-      }
-  }
-
-  /* first perform trial division */
-  if ((err = mp_prime_is_divisible (a, &res)) != MP_OKAY) {
-    return err;
-  }
-  if (res == 1) {
-    return MP_OKAY;
-  }
-
-  /* now perform the miller-rabin rounds */
-  if ((err = mp_init (&b)) != MP_OKAY) {
-    return err;
-  }
-
-  for (ix = 0; ix < t; ix++) {
-    /* set the prime */
-    mp_set (&b, __prime_tab[ix]);
-
-    if ((err = mp_prime_miller_rabin (a, &b, &res)) != MP_OKAY) {
-      goto __B;
-    }
-
-    if (res == 0) {
-      goto __B;
-    }
-  }
-
-  /* passed the test */
-  *result = 1;
-__B:mp_clear (&b);
-  return err;
-}
-
-/* End: bn_mp_prime_is_prime.c */
-
-/* Start: bn_mp_prime_miller_rabin.c */
-#line 0 "bn_mp_prime_miller_rabin.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* Miller-Rabin test of "a" to the base of "b" as described in 
- * HAC pp. 139 Algorithm 4.24
- *
- * Sets result to 0 if definitely composite or 1 if probably prime.
- * Randomly the chance of error is no more than 1/4 and often 
- * very much lower.
- */
-int
-mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result)
-{
-  mp_int  n1, y, r;
-  int     s, j, err;
-
-  /* default */
-  *result = 0;
-
-  /* get n1 = a - 1 */
-  if ((err = mp_init_copy (&n1, a)) != MP_OKAY) {
-    return err;
-  }
-  if ((err = mp_sub_d (&n1, 1, &n1)) != MP_OKAY) {
-    goto __N1;
-  }
-
-  /* set 2^s * r = n1 */
-  if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) {
-    goto __N1;
-  }
-  s = 0;
-  while (mp_iseven (&r) == 1) {
-    ++s;
-    if ((err = mp_div_2 (&r, &r)) != MP_OKAY) {
-      goto __R;
-    }
-  }
-
-  /* compute y = b^r mod a */
-  if ((err = mp_init (&y)) != MP_OKAY) {
-    goto __R;
-  }
-  if ((err = mp_exptmod (b, &r, a, &y)) != MP_OKAY) {
-    goto __Y;
-  }
-
-  /* if y != 1 and y != n1 do */
-  if (mp_cmp_d (&y, 1) != MP_EQ && mp_cmp (&y, &n1) != MP_EQ) {
-    j = 1;
-    /* while j <= s-1 and y != n1 */
-    while ((j <= (s - 1)) && mp_cmp (&y, &n1) != MP_EQ) {
-      if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) {
-    goto __Y;
-      }
-
-      /* if y == 1 then composite */
-      if (mp_cmp_d (&y, 1) == MP_EQ) {
-    goto __Y;
-      }
-
-      ++j;
-    }
-
-    /* if y != n1 then composite */
-    if (mp_cmp (&y, &n1) != MP_EQ) {
-      goto __Y;
-    }
-  }
-
-  /* probably prime now */
-  *result = 1;
-__Y:mp_clear (&y);
-__R:mp_clear (&r);
-__N1:mp_clear (&n1);
-  return err;
-}
-
-/* End: bn_mp_prime_miller_rabin.c */
-
-/* Start: bn_mp_prime_next_prime.c */
-#line 0 "bn_mp_prime_next_prime.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* finds the next prime after the number "a" using "t" trials
- * of Miller-Rabin.
- */
-int mp_prime_next_prime(mp_int *a, int t)
-{
-   int err, res;
-
-   if (mp_iseven(a) == 1) {
-      /* force odd */
-      if ((err = mp_add_d(a, 1, a)) != MP_OKAY) {
-         return err;
-      }
-   } else {
-      /* force to next odd number */
-      if ((err = mp_add_d(a, 2, a)) != MP_OKAY) {
-         return err;
-      }
-   }
-
-   for (;;) {
-      /* is this prime? */
-      if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY) {
-         return err;
-      }
-
-      if (res == 1) {
-         break;
-      }
-
-      /* add two, next candidate */
-      if ((err = mp_add_d(a, 2, a)) != MP_OKAY) {
-         return err;
-      }
-   }
-
-   return MP_OKAY;
-}
-
-
-/* End: bn_mp_prime_next_prime.c */
-
-/* Start: bn_mp_rand.c */
-#line 0 "bn_mp_rand.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* makes a pseudo-random int of a given size */
-int
-mp_rand (mp_int * a, int digits)
-{
-  int     res;
-  mp_digit d;
-
-  mp_zero (a);
-  if (digits <= 0) {
-    return MP_OKAY;
-  }
-
-  /* first place a random non-zero digit */
-  do {
-    d = ((mp_digit) abs (rand ()));
-  } while (d == 0);
-
-  if ((res = mp_add_d (a, d, a)) != MP_OKAY) {
-    return res;
-  }
-
-  while (digits-- > 0) {
-    if ((res = mp_lshd (a, 1)) != MP_OKAY) {
-      return res;
-    }
-
-    if ((res = mp_add_d (a, ((mp_digit) abs (rand ())), a)) != MP_OKAY) {
-      return res;
-    }
-  }
-
-  return MP_OKAY;
-}
-
-/* End: bn_mp_rand.c */
-
-/* Start: bn_mp_read_signed_bin.c */
-#line 0 "bn_mp_read_signed_bin.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* read signed bin, big endian, first byte is 0==positive or 1==negative */
-int
-mp_read_signed_bin (mp_int * a, unsigned char *b, int c)
-{
-  int     res;
-
-  if ((res = mp_read_unsigned_bin (a, b + 1, c - 1)) != MP_OKAY) {
-    return res;
-  }
-  a->sign = ((b[0] == (unsigned char) 0) ? MP_ZPOS : MP_NEG);
-  return MP_OKAY;
-}
-
-/* End: bn_mp_read_signed_bin.c */
-
-/* Start: bn_mp_read_unsigned_bin.c */
-#line 0 "bn_mp_read_unsigned_bin.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* reads a unsigned char array, assumes the msb is stored first [big endian] */
-int
-mp_read_unsigned_bin (mp_int * a, unsigned char *b, int c)
-{
-  int     res;
-  mp_zero (a);
-  while (c-- > 0) {
-    if ((res = mp_mul_2d (a, 8, a)) != MP_OKAY) {
-      return res;
-    }
-
-    if (DIGIT_BIT != 7) {
-      a->dp[0] |= *b++;
-      a->used += 1;
-    } else {
-      a->dp[0] = (*b & MP_MASK);
-      a->dp[1] |= ((*b++ >> 7U) & 1);
-      a->used += 2;
-    }
-  }
-  mp_clamp (a);
-  return MP_OKAY;
-}
-
-/* End: bn_mp_read_unsigned_bin.c */
-
-/* Start: bn_mp_reduce.c */
-#line 0 "bn_mp_reduce.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* reduces x mod m, assumes 0 < x < m**2, mu is 
- * precomputed via mp_reduce_setup.
- * From HAC pp.604 Algorithm 14.42
- */
-int
-mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
-{
-  mp_int  q;
-  int     res, um = m->used;
-
-  /* q = x */
-  if ((res = mp_init_copy (&q, x)) != MP_OKAY) {
-    return res;
-  }
-
-  /* q1 = x / b**(k-1)  */
-  mp_rshd (&q, um - 1);         
-
-  /* according to HAC this is optimization is ok */
-  if (((unsigned long) m->used) > (((mp_digit)1) << (DIGIT_BIT - 1))) {
-    if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) {
-      goto CLEANUP;
-    }
-  } else {
-    if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
-      goto CLEANUP;
-    }
-  }
-
-  /* q3 = q2 / b**(k+1) */
-  mp_rshd (&q, um + 1);         
-
-  /* x = x mod b**(k+1), quick (no division) */
-  if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) {
-    goto CLEANUP;
-  }
-
-  /* q = q * m mod b**(k+1), quick (no division) */
-  if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) {
-    goto CLEANUP;
-  }
-
-  /* x = x - q */
-  if ((res = mp_sub (x, &q, x)) != MP_OKAY) {
-    goto CLEANUP;
-  }
-
-  /* If x < 0, add b**(k+1) to it */
-  if (mp_cmp_d (x, 0) == MP_LT) {
-    mp_set (&q, 1);
-    if ((res = mp_lshd (&q, um + 1)) != MP_OKAY)
-      goto CLEANUP;
-    if ((res = mp_add (x, &q, x)) != MP_OKAY)
-      goto CLEANUP;
-  }
-
-  /* Back off if it's too big */
-  while (mp_cmp (x, m) != MP_LT) {
-    if ((res = s_mp_sub (x, m, x)) != MP_OKAY) {
-      break;
-    }
-  }
-  
-CLEANUP:
-  mp_clear (&q);
-
-  return res;
-}
-
-/* End: bn_mp_reduce.c */
-
-/* Start: bn_mp_reduce_2k.c */
-#line 0 "bn_mp_reduce_2k.c"
+
+/* End: bn_mp_dr_setup.c */
+
+/* Start: bn_mp_exch.c */
+#line 0 "bn_mp_exch.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* swap the elements of two integers, for cases where you can't simply swap the 
+ * mp_int pointers around 
+ */
+void
+mp_exch (mp_int * a, mp_int * b)
+{
+  mp_int  t;
+
+  t = *a;
+  *a = *b;
+  *b = t;
+}
+
+/* End: bn_mp_exch.c */
+
+/* Start: bn_mp_expt_d.c */
+#line 0 "bn_mp_expt_d.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* calculate c = a**b  using a square-multiply algorithm */
+int
+mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
+{
+  int     res, x;
+  mp_int  g;
+
+  if ((res = mp_init_copy (&g, a)) != MP_OKAY) {
+    return res;
+  }
+
+  /* set initial result */
+  mp_set (c, 1);
+
+  for (x = 0; x < (int) DIGIT_BIT; x++) {
+    /* square */
+    if ((res = mp_sqr (c, c)) != MP_OKAY) {
+      mp_clear (&g);
+      return res;
+    }
+
+    /* if the bit is set multiply */
+    if ((b & (mp_digit) (((mp_digit)1) << (DIGIT_BIT - 1))) != 0) {
+      if ((res = mp_mul (c, &g, c)) != MP_OKAY) {
+         mp_clear (&g);
+         return res;
+      }
+    }
+
+    /* shift to next bit */
+    b <<= 1;
+  }
+
+  mp_clear (&g);
+  return MP_OKAY;
+}
+
+/* End: bn_mp_expt_d.c */
+
+/* Start: bn_mp_exptmod.c */
+#line 0 "bn_mp_exptmod.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+
+/* this is a shell function that calls either the normal or Montgomery
+ * exptmod functions.  Originally the call to the montgomery code was
+ * embedded in the normal function but that wasted alot of stack space
+ * for nothing (since 99% of the time the Montgomery code would be called)
+ */
+int
+mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
+{
+  int dr;
+
+  /* modulus P must be positive */
+  if (P->sign == MP_NEG) {
+     return MP_VAL;
+  }
+
+  /* if exponent X is negative we have to recurse */
+  if (X->sign == MP_NEG) {
+     mp_int tmpG, tmpX;
+     int err;
+
+     /* first compute 1/G mod P */
+     if ((err = mp_init(&tmpG)) != MP_OKAY) {
+        return err;
+     }
+     if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) {
+        mp_clear(&tmpG);
+        return err;
+     }
+
+     /* now get |X| */
+     if ((err = mp_init(&tmpX)) != MP_OKAY) {
+        mp_clear(&tmpG);
+        return err;
+     }
+     if ((err = mp_abs(X, &tmpX)) != MP_OKAY) {
+        mp_clear_multi(&tmpG, &tmpX, NULL);
+        return err;
+     }
+
+     /* and now compute (1/G)**|X| instead of G**X [X < 0] */
+     err = mp_exptmod(&tmpG, &tmpX, P, Y);
+     mp_clear_multi(&tmpG, &tmpX, NULL);
+     return err;
+  }
+
+  dr = mp_dr_is_modulus(P);
+  if (dr == 0) {
+     dr = mp_reduce_is_2k(P) << 1;
+  }
+    
+  /* if the modulus is odd or dr != 0 use the fast method */
+  if (mp_isodd (P) == 1 || dr !=  0) {
+    return mp_exptmod_fast (G, X, P, Y, dr);
+  } else {
+    return s_mp_exptmod (G, X, P, Y);
+  }
+}
+
+
+/* End: bn_mp_exptmod.c */
+
+/* Start: bn_mp_exptmod_fast.c */
+#line 0 "bn_mp_exptmod_fast.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* computes Y == G^X mod P, HAC pp.616, Algorithm 14.85
+ *
+ * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
+ * The value of k changes based on the size of the exponent.
+ *
+ * Uses Montgomery or Diminished Radix reduction [whichever appropriate]
+ */
+int
+mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
+{
+  mp_int  M[256], res;
+  mp_digit buf, mp;
+  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+  
+  /* use a pointer to the reduction algorithm.  This allows us to use
+   * one of many reduction algorithms without modding the guts of
+   * the code with if statements everywhere.  
+   */
+  int     (*redux)(mp_int*,mp_int*,mp_digit);
+
+  /* find window size */
+  x = mp_count_bits (X);
+  if (x <= 7) {
+    winsize = 2;
+  } else if (x <= 36) {
+    winsize = 3;
+  } else if (x <= 140) {
+    winsize = 4;
+  } else if (x <= 450) {
+    winsize = 5;
+  } else if (x <= 1303) {
+    winsize = 6;
+  } else if (x <= 3529) {
+    winsize = 7;
+  } else {
+    winsize = 8;
+  }
+
+#ifdef MP_LOW_MEM
+  if (winsize > 5) {
+     winsize = 5;
+  }
+#endif
+
+
+  /* init G array */
+  for (x = 0; x < (1 << winsize); x++) {
+    if ((err = mp_init (&M[x])) != MP_OKAY) {
+      for (y = 0; y < x; y++) {
+        mp_clear (&M[y]);
+      }
+      return err;
+    }
+  }
+
+  /* determine and setup reduction code */
+  if (redmode == 0) {
+     /* now setup montgomery  */
+     if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
+        goto __M;
+     }
+     
+     /* automatically pick the comba one if available (saves quite a few calls/ifs) */
+     if (((P->used * 2 + 1) < MP_WARRAY) &&
+          P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+        redux = fast_mp_montgomery_reduce;
+
+     } else {
+        /* use slower baselien method */
+        redux = mp_montgomery_reduce;
+     }
+  } else if (redmode == 1) {
+     /* setup DR reduction */
+     mp_dr_setup(P, &mp);
+     redux = mp_dr_reduce;
+  } else {
+     /* setup 2k reduction */
+     if ((err = mp_reduce_2k_setup(P, &mp)) != MP_OKAY) {
+        goto __M;
+     }
+     redux = mp_reduce_2k;
+  }
+
+  /* setup result */
+  if ((err = mp_init (&res)) != MP_OKAY) {
+    goto __RES;
+  }
+
+  /* create M table
+   *
+   * The M table contains powers of the input base, e.g. M[x] = G^x mod P
+   *
+   * The first half of the table is not computed though accept for M[0] and M[1]
+   */
+
+  if (redmode == 0) {
+     /* now we need R mod m */
+     if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
+       goto __RES;
+     }
+
+     /* now set M[1] to G * R mod m */
+     if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
+       goto __RES;
+     }
+  } else {
+     mp_set(&res, 1);
+     if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
+        goto __RES;
+     }
+  }
+
+  /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
+  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
+    goto __RES;
+  }
+
+  for (x = 0; x < (winsize - 1); x++) {
+    if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
+      goto __RES;
+    }
+    if ((err = redux (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
+      goto __RES;
+    }
+  }
+
+  /* create upper table */
+  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
+    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
+      goto __RES;
+    }
+    if ((err = redux (&M[x], P, mp)) != MP_OKAY) {
+      goto __RES;
+    }
+  }
+
+  /* set initial mode and bit cnt */
+  mode   = 0;
+  bitcnt = 1;
+  buf    = 0;
+  digidx = X->used - 1;
+  bitcpy = 0;
+  bitbuf = 0;
+
+  for (;;) {
+    /* grab next digit as required */
+    if (--bitcnt == 0) {
+      if (digidx == -1) {
+        break;
+      }
+      buf = X->dp[digidx--];
+      bitcnt = (int) DIGIT_BIT;
+    }
+
+    /* grab the next msb from the exponent */
+    y = (mp_digit)(buf >> (DIGIT_BIT - 1)) & 1;
+    buf <<= (mp_digit)1;
+
+    /* if the bit is zero and mode == 0 then we ignore it
+     * These represent the leading zero bits before the first 1 bit
+     * in the exponent.  Technically this opt is not required but it
+     * does lower the # of trivial squaring/reductions used
+     */
+    if (mode == 0 && y == 0) {
+      continue;
+    }
+
+    /* if the bit is zero and mode == 1 then we square */
+    if (mode == 1 && y == 0) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+        goto __RES;
+      }
+      if ((err = redux (&res, P, mp)) != MP_OKAY) {
+        goto __RES;
+      }
+      continue;
+    }
+
+    /* else we add it to the window */
+    bitbuf |= (y << (winsize - ++bitcpy));
+    mode = 2;
+
+    if (bitcpy == winsize) {
+      /* ok window is filled so square as required and multiply  */
+      /* square first */
+      for (x = 0; x < winsize; x++) {
+        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+          goto __RES;
+        }
+        if ((err = redux (&res, P, mp)) != MP_OKAY) {
+          goto __RES;
+        }
+      }
+
+      /* then multiply */
+      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
+        goto __RES;
+      }
+      if ((err = redux (&res, P, mp)) != MP_OKAY) {
+        goto __RES;
+      }
+
+      /* empty window and reset */
+      bitcpy = 0;
+      bitbuf = 0;
+      mode = 1;
+    }
+  }
+
+  /* if bits remain then square/multiply */
+  if (mode == 2 && bitcpy > 0) {
+    /* square then multiply if the bit is set */
+    for (x = 0; x < bitcpy; x++) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+        goto __RES;
+      }
+      if ((err = redux (&res, P, mp)) != MP_OKAY) {
+        goto __RES;
+      }
+
+      bitbuf <<= 1;
+      if ((bitbuf & (1 << winsize)) != 0) {
+        /* then multiply */
+        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
+          goto __RES;
+        }
+        if ((err = redux (&res, P, mp)) != MP_OKAY) {
+          goto __RES;
+        }
+      }
+    }
+  }
+
+  if (redmode == 0) {
+     /* fixup result if Montgomery reduction is used */
+     if ((err = mp_montgomery_reduce (&res, P, mp)) != MP_OKAY) {
+       goto __RES;
+     }
+  }
+
+  mp_exch (&res, Y);
+  err = MP_OKAY;
+__RES:mp_clear (&res);
+__M:
+  for (x = 0; x < (1 << winsize); x++) {
+    mp_clear (&M[x]);
+  }
+  return err;
+}
+
+/* End: bn_mp_exptmod_fast.c */
+
+/* Start: bn_mp_gcd.c */
+#line 0 "bn_mp_gcd.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Greatest Common Divisor using the binary method [Algorithm B, page 338, vol2 of TAOCP]
+ */
+int
+mp_gcd (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int  u, v, t;
+  int     k, res, neg;
+
+  /* either zero than gcd is the largest */
+  if (mp_iszero (a) == 1 && mp_iszero (b) == 0) {
+    return mp_copy (b, c);
+  }
+  if (mp_iszero (a) == 0 && mp_iszero (b) == 1) {
+    return mp_copy (a, c);
+  }
+  if (mp_iszero (a) == 1 && mp_iszero (b) == 1) {
+    mp_set (c, 1);
+    return MP_OKAY;
+  }
+
+  /* if both are negative they share (-1) as a common divisor */
+  neg = (a->sign == b->sign) ? a->sign : MP_ZPOS;
+
+  if ((res = mp_init_copy (&u, a)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_init_copy (&v, b)) != MP_OKAY) {
+    goto __U;
+  }
+
+  /* must be positive for the remainder of the algorithm */
+  u.sign = v.sign = MP_ZPOS;
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    goto __V;
+  }
+
+  /* B1.  Find power of two */
+  k = 0;
+  while (mp_iseven(&u) == 1 && mp_iseven(&v) == 1) {
+    ++k;
+    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
+      goto __T;
+    }
+    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
+      goto __T;
+    }
+  }
+
+  /* B2.  Initialize */
+  if (mp_isodd(&u) == 1) {
+    /* t = -v */
+    if ((res = mp_copy (&v, &t)) != MP_OKAY) {
+      goto __T;
+    }
+    t.sign = MP_NEG;
+  } else {
+    /* t = u */
+    if ((res = mp_copy (&u, &t)) != MP_OKAY) {
+      goto __T;
+    }
+  }
+
+  do {
+    /* B3 (and B4).  Halve t, if even */
+    while (t.used != 0 && mp_iseven(&t) == 1) {
+      if ((res = mp_div_2 (&t, &t)) != MP_OKAY) {
+        goto __T;
+      }
+    }
+
+    /* B5.  if t>0 then u=t otherwise v=-t */
+    if (t.used != 0 && t.sign != MP_NEG) {
+      if ((res = mp_copy (&t, &u)) != MP_OKAY) {
+        goto __T;
+      }
+    } else {
+      if ((res = mp_copy (&t, &v)) != MP_OKAY) {
+        goto __T;
+      }
+      v.sign = (v.sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+    }
+
+    /* B6.  t = u - v, if t != 0 loop otherwise terminate */
+    if ((res = mp_sub (&u, &v, &t)) != MP_OKAY) {
+      goto __T;
+    }
+  } while (mp_iszero(&t) == 0);
+
+  /* multiply by 2^k which we divided out at the beginning */ 
+  if ((res = mp_mul_2d (&u, k, &u)) != MP_OKAY) {
+    goto __T;
+  }
+
+  mp_exch (&u, c);
+  c->sign = neg;
+  res = MP_OKAY;
+__T:mp_clear (&t);
+__V:mp_clear (&u);
+__U:mp_clear (&v);
+  return res;
+}
+
+/* End: bn_mp_gcd.c */
+
+/* Start: bn_mp_grow.c */
+#line 0 "bn_mp_grow.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* grow as required */
+int
+mp_grow (mp_int * a, int size)
+{
+  int     i;
+
+  /* if the alloc size is smaller alloc more ram */
+  if (a->alloc < size) {
+    /* ensure there are always at least MP_PREC digits extra on top */
+    size += (MP_PREC * 2) - (size & (MP_PREC - 1));     
+
+    a->dp = OPT_CAST realloc (a->dp, sizeof (mp_digit) * size);
+    if (a->dp == NULL) {
+      return MP_MEM;
+    }
+
+    /* zero excess digits */
+    i        = a->alloc;
+    a->alloc = size;
+    for (; i < a->alloc; i++) {
+      a->dp[i] = 0;
+    }
+  }
+  return MP_OKAY;
+}
+
+/* End: bn_mp_grow.c */
+
+/* Start: bn_mp_init.c */
+#line 0 "bn_mp_init.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with 
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* init a new bigint */
+int
+mp_init (mp_int * a)
+{
+  /* allocate ram required and clear it */
+  a->dp = OPT_CAST calloc (sizeof (mp_digit), MP_PREC);
+  if (a->dp == NULL) {
+    return MP_MEM;
+  }
+
+  /* set the used to zero, allocated digits to the default precision
+   * and sign to positive */
+  a->used  = 0;
+  a->alloc = MP_PREC;
+  a->sign  = MP_ZPOS;
+
+  return MP_OKAY;
+}
+
+/* End: bn_mp_init.c */
+
+/* Start: bn_mp_init_copy.c */
+#line 0 "bn_mp_init_copy.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* creates "a" then copies b into it */
+int
+mp_init_copy (mp_int * a, mp_int * b)
+{
+  int     res;
+
+  if ((res = mp_init (a)) != MP_OKAY) {
+    return res;
+  }
+  return mp_copy (b, a);
+}
+
+/* End: bn_mp_init_copy.c */
+
+/* Start: bn_mp_init_size.c */
+#line 0 "bn_mp_init_size.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* init a mp_init and grow it to a given size */
+int
+mp_init_size (mp_int * a, int size)
+{
+
+  /* pad size so there are always extra digits */
+  size += (MP_PREC * 2) - (size & (MP_PREC - 1));   
+  
+  /* alloc mem */
+  a->dp = OPT_CAST calloc (sizeof (mp_digit), size);
+  if (a->dp == NULL) {
+    return MP_MEM;
+  }
+  a->used = 0;
+  a->alloc = size;
+  a->sign = MP_ZPOS;
+
+  return MP_OKAY;
+}
+
+/* End: bn_mp_init_size.c */
+
+/* Start: bn_mp_invmod.c */
+#line 0 "bn_mp_invmod.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+int
+mp_invmod (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int  x, y, u, v, A, B, C, D;
+  int     res;
+
+  /* b cannot be negative */
+  if (b->sign == MP_NEG) {
+    return MP_VAL;
+  }
+
+  /* if the modulus is odd we can use a faster routine instead */
+  if (mp_iseven (b) == 0) {
+    return fast_mp_invmod (a, b, c);
+  }
+  
+  /* init temps */
+  if ((res = mp_init_multi(&x, &y, &u, &v, &A, &B, &C, &D, NULL)) != MP_OKAY) {
+     return res;
+  }
+
+  /* x = a, y = b */
+  if ((res = mp_copy (a, &x)) != MP_OKAY) {
+    goto __ERR;
+  }
+  if ((res = mp_copy (b, &y)) != MP_OKAY) {
+    goto __ERR;
+  }
+
+  if ((res = mp_abs (&x, &x)) != MP_OKAY) {
+    goto __ERR;
+  }
+
+  /* 2. [modified] if x,y are both even then return an error! */
+  if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
+    res = MP_VAL;
+    goto __ERR;
+  }
+
+  /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
+  if ((res = mp_copy (&x, &u)) != MP_OKAY) {
+    goto __ERR;
+  }
+  if ((res = mp_copy (&y, &v)) != MP_OKAY) {
+    goto __ERR;
+  }
+  mp_set (&A, 1);
+  mp_set (&D, 1);
+
+
+top:
+  /* 4.  while u is even do */
+  while (mp_iseven (&u) == 1) {
+    /* 4.1 u = u/2 */
+    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
+      goto __ERR;
+    }
+    /* 4.2 if A or B is odd then */
+    if (mp_iseven (&A) == 0 || mp_iseven (&B) == 0) {
+      /* A = (A+y)/2, B = (B-x)/2 */
+      if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
+    goto __ERR;
+      }
+      if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
+    goto __ERR;
+      }
+    }
+    /* A = A/2, B = B/2 */
+    if ((res = mp_div_2 (&A, &A)) != MP_OKAY) {
+      goto __ERR;
+    }
+    if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
+      goto __ERR;
+    }
+  }
+
+
+  /* 5.  while v is even do */
+  while (mp_iseven (&v) == 1) {
+    /* 5.1 v = v/2 */
+    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
+      goto __ERR;
+    }
+    /* 5.2 if C,D are even then */
+    if (mp_iseven (&C) == 0 || mp_iseven (&D) == 0) {
+      /* C = (C+y)/2, D = (D-x)/2 */
+      if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
+    goto __ERR;
+      }
+      if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
+    goto __ERR;
+      }
+    }
+    /* C = C/2, D = D/2 */
+    if ((res = mp_div_2 (&C, &C)) != MP_OKAY) {
+      goto __ERR;
+    }
+    if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
+      goto __ERR;
+    }
+  }
+
+  /* 6.  if u >= v then */
+  if (mp_cmp (&u, &v) != MP_LT) {
+    /* u = u - v, A = A - C, B = B - D */
+    if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
+      goto __ERR;
+    }
+
+    if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) {
+      goto __ERR;
+    }
+
+    if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
+      goto __ERR;
+    }
+  } else {
+    /* v - v - u, C = C - A, D = D - B */
+    if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
+      goto __ERR;
+    }
+
+    if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) {
+      goto __ERR;
+    }
+
+    if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
+      goto __ERR;
+    }
+  }
+
+  /* if not zero goto step 4 */
+  if (mp_iszero (&u) == 0)
+    goto top;
+
+  /* now a = C, b = D, gcd == g*v */
+
+  /* if v != 1 then there is no inverse */
+  if (mp_cmp_d (&v, 1) != MP_EQ) {
+    res = MP_VAL;
+    goto __ERR;
+  }
+
+  /* a is now the inverse */
+  mp_exch (&C, c);
+  res = MP_OKAY;
+
+__ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
+  return res;
+}
+
+/* End: bn_mp_invmod.c */
+
+/* Start: bn_mp_jacobi.c */
+#line 0 "bn_mp_jacobi.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* computes the jacobi c = (a | n) (or Legendre if n is prime)
+ * HAC pp. 73 Algorithm 2.149
+ */
+int
+mp_jacobi (mp_int * a, mp_int * n, int *c)
+{
+  mp_int  a1, n1, e;
+  int     s, r, res;
+  mp_digit residue;
+
+  /* step 1.  if a == 0, return 0 */
+  if (mp_iszero (a) == 1) {
+    *c = 0;
+    return MP_OKAY;
+  }
+
+  /* step 2.  if a == 1, return 1 */
+  if (mp_cmp_d (a, 1) == MP_EQ) {
+    *c = 1;
+    return MP_OKAY;
+  }
+
+  /* default */
+  s = 0;
+
+  /* step 3.  write a = a1 * 2^e  */
+  if ((res = mp_init_copy (&a1, a)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_init (&n1)) != MP_OKAY) {
+    goto __A1;
+  }
+
+  if ((res = mp_init (&e)) != MP_OKAY) {
+    goto __N1;
+  }
+
+  while (mp_iseven (&a1) == 1) {
+    if ((res = mp_add_d (&e, 1, &e)) != MP_OKAY) {
+      goto __E;
+    }
+
+    if ((res = mp_div_2 (&a1, &a1)) != MP_OKAY) {
+      goto __E;
+    }
+  }
+
+  /* step 4.  if e is even set s=1 */
+  if (mp_iseven (&e) == 1) {
+    s = 1;
+  } else {
+    /* else set s=1 if n = 1/7 (mod 8) or s=-1 if n = 3/5 (mod 8) */
+    if ((res = mp_mod_d (n, 8, &residue)) != MP_OKAY) {
+      goto __E;
+    }
+
+    if (residue == 1 || residue == 7) {
+      s = 1;
+    } else if (residue == 3 || residue == 5) {
+      s = -1;
+    }
+  }
+
+  /* step 5.  if n == 3 (mod 4) *and* a1 == 3 (mod 4) then s = -s */
+  if ((res = mp_mod_d (n, 4, &residue)) != MP_OKAY) {
+    goto __E;
+  }
+  if (residue == 3) {
+    if ((res = mp_mod_d (&a1, 4, &residue)) != MP_OKAY) {
+      goto __E;
+    }
+    if (residue == 3) {
+      s = -s;
+    }
+  }
+
+  /* if a1 == 1 we're done */
+  if (mp_cmp_d (&a1, 1) == MP_EQ) {
+    *c = s;
+  } else {
+    /* n1 = n mod a1 */
+    if ((res = mp_mod (n, &a1, &n1)) != MP_OKAY) {
+      goto __E;
+    }
+    if ((res = mp_jacobi (&n1, &a1, &r)) != MP_OKAY) {
+      goto __E;
+    }
+    *c = s * r;
+  }
+
+  /* done */
+  res = MP_OKAY;
+__E:mp_clear (&e);
+__N1:mp_clear (&n1);
+__A1:mp_clear (&a1);
+  return res;
+}
+
+/* End: bn_mp_jacobi.c */
+
+/* Start: bn_mp_karatsuba_mul.c */
+#line 0 "bn_mp_karatsuba_mul.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* c = |a| * |b| using Karatsuba Multiplication using 
+ * three half size multiplications
+ *
+ * Let B represent the radix [e.g. 2**DIGIT_BIT] and 
+ * let n represent half of the number of digits in 
+ * the min(a,b)
+ *
+ * a = a1 * B**n + a0
+ * b = b1 * B**n + b0
+ *
+ * Then, a * b => 
+   a1b1 * B**2n + ((a1 - a0)(b1 - b0) + a0b0 + a1b1) * B + a0b0
+ *
+ * Note that a1b1 and a0b0 are used twice and only need to be 
+ * computed once.  So in total three half size (half # of 
+ * digit) multiplications are performed, a0b0, a1b1 and 
+ * (a1-b1)(a0-b0)
+ *
+ * Note that a multiplication of half the digits requires
+ * 1/4th the number of single precision multiplications so in 
+ * total after one call 25% of the single precision multiplications 
+ * are saved.  Note also that the call to mp_mul can end up back 
+ * in this function if the a0, a1, b0, or b1 are above the threshold.  
+ * This is known as divide-and-conquer and leads to the famous 
+ * O(N**lg(3)) or O(N**1.584) work which is asymptopically lower than 
+ * the standard O(N**2) that the baseline/comba methods use.  
+ * Generally though the overhead of this method doesn't pay off 
+ * until a certain size (N ~ 80) is reached.
+ */
+int
+mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int  x0, x1, y0, y1, t1, x0y0, x1y1;
+  int     B, err;
+
+  /* default the return code to an error */
+  err = MP_MEM;
+
+  /* min # of digits */
+  B = MIN (a->used, b->used);
+
+  /* now divide in two */
+  B = B / 2;
+
+  /* init copy all the temps */
+  if (mp_init_size (&x0, B) != MP_OKAY)
+    goto ERR;
+  if (mp_init_size (&x1, a->used - B) != MP_OKAY)
+    goto X0;
+  if (mp_init_size (&y0, B) != MP_OKAY)
+    goto X1;
+  if (mp_init_size (&y1, b->used - B) != MP_OKAY)
+    goto Y0;
+
+  /* init temps */
+  if (mp_init_size (&t1, B * 2) != MP_OKAY)
+    goto Y1;
+  if (mp_init_size (&x0y0, B * 2) != MP_OKAY)
+    goto T1;
+  if (mp_init_size (&x1y1, B * 2) != MP_OKAY)
+    goto X0Y0;
+
+  /* now shift the digits */
+  x0.sign = x1.sign = a->sign;
+  y0.sign = y1.sign = b->sign;
+
+  x0.used = y0.used = B;
+  x1.used = a->used - B;
+  y1.used = b->used - B;
+
+  {
+    register int x;
+    register mp_digit *tmpa, *tmpb, *tmpx, *tmpy;
+
+    /* we copy the digits directly instead of using higher level functions
+     * since we also need to shift the digits
+     */
+    tmpa = a->dp;
+    tmpb = b->dp;
+
+    tmpx = x0.dp;
+    tmpy = y0.dp;
+    for (x = 0; x < B; x++) {
+      *tmpx++ = *tmpa++;
+      *tmpy++ = *tmpb++;
+    }
+
+    tmpx = x1.dp;
+    for (x = B; x < a->used; x++) {
+      *tmpx++ = *tmpa++;
+    }
+
+    tmpy = y1.dp;
+    for (x = B; x < b->used; x++) {
+      *tmpy++ = *tmpb++;
+    }
+  }
+
+  /* only need to clamp the lower words since by definition the 
+   * upper words x1/y1 must have a known number of digits
+   */
+  mp_clamp (&x0);
+  mp_clamp (&y0);
+
+  /* now calc the products x0y0 and x1y1 */
+  /* after this x0 is no longer required, free temp [x0==t2]! */
+  if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)  
+    goto X1Y1;          /* x0y0 = x0*y0 */
+  if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
+    goto X1Y1;          /* x1y1 = x1*y1 */
+
+  /* now calc x1-x0 and y1-y0 */
+  if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
+    goto X1Y1;          /* t1 = x1 - x0 */
+  if (mp_sub (&y1, &y0, &x0) != MP_OKAY)
+    goto X1Y1;          /* t2 = y1 - y0 */
+  if (mp_mul (&t1, &x0, &t1) != MP_OKAY)
+    goto X1Y1;          /* t1 = (x1 - x0) * (y1 - y0) */
+
+  /* add x0y0 */
+  if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY)
+    goto X1Y1;          /* t2 = x0y0 + x1y1 */
+  if (mp_sub (&x0, &t1, &t1) != MP_OKAY)
+    goto X1Y1;          /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
+
+  /* shift by B */
+  if (mp_lshd (&t1, B) != MP_OKAY)
+    goto X1Y1;          /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
+  if (mp_lshd (&x1y1, B * 2) != MP_OKAY)
+    goto X1Y1;          /* x1y1 = x1y1 << 2*B */
+
+  if (mp_add (&x0y0, &t1, &t1) != MP_OKAY)
+    goto X1Y1;          /* t1 = x0y0 + t1 */
+  if (mp_add (&t1, &x1y1, c) != MP_OKAY)
+    goto X1Y1;          /* t1 = x0y0 + t1 + x1y1 */
+
+  /* Algorithm succeeded set the return code to MP_OKAY */
+  err = MP_OKAY;
+
+X1Y1:mp_clear (&x1y1);
+X0Y0:mp_clear (&x0y0);
+T1:mp_clear (&t1);
+Y1:mp_clear (&y1);
+Y0:mp_clear (&y0);
+X1:mp_clear (&x1);
+X0:mp_clear (&x0);
+ERR:
+  return err;
+}
+
+/* End: bn_mp_karatsuba_mul.c */
+
+/* Start: bn_mp_karatsuba_sqr.c */
+#line 0 "bn_mp_karatsuba_sqr.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Karatsuba squaring, computes b = a*a using three 
+ * half size squarings
+ *
+ * See comments of mp_karatsuba_mul for details.  It 
+ * is essentially the same algorithm but merely 
+ * tuned to perform recursive squarings.
+ */
+int
+mp_karatsuba_sqr (mp_int * a, mp_int * b)
+{
+  mp_int  x0, x1, t1, t2, x0x0, x1x1;
+  int     B, err;
+
+  err = MP_MEM;
+
+  /* min # of digits */
+  B = a->used;
+
+  /* now divide in two */
+  B = B / 2;
+
+  /* init copy all the temps */
+  if (mp_init_size (&x0, B) != MP_OKAY)
+    goto ERR;
+  if (mp_init_size (&x1, a->used - B) != MP_OKAY)
+    goto X0;
+
+  /* init temps */
+  if (mp_init_size (&t1, a->used * 2) != MP_OKAY)
+    goto X1;
+  if (mp_init_size (&t2, a->used * 2) != MP_OKAY)
+    goto T1;
+  if (mp_init_size (&x0x0, B * 2) != MP_OKAY)
+    goto T2;
+  if (mp_init_size (&x1x1, (a->used - B) * 2) != MP_OKAY)
+    goto X0X0;
+
+  {
+    register int x;
+    register mp_digit *dst, *src;
+
+    src = a->dp;
+
+    /* now shift the digits */
+    dst = x0.dp;
+    for (x = 0; x < B; x++) {
+      *dst++ = *src++;
+    }
+
+    dst = x1.dp;
+    for (x = B; x < a->used; x++) {
+      *dst++ = *src++;
+    }
+  }
+
+  x0.used = B;
+  x1.used = a->used - B;
+
+  mp_clamp (&x0);
+
+  /* now calc the products x0*x0 and x1*x1 */
+  if (mp_sqr (&x0, &x0x0) != MP_OKAY)
+    goto X1X1;           /* x0x0 = x0*x0 */
+  if (mp_sqr (&x1, &x1x1) != MP_OKAY)
+    goto X1X1;           /* x1x1 = x1*x1 */
+
+  /* now calc (x1-x0)**2 */
+  if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
+    goto X1X1;           /* t1 = x1 - x0 */
+  if (mp_sqr (&t1, &t1) != MP_OKAY)
+    goto X1X1;           /* t1 = (x1 - x0) * (x1 - x0) */
+
+  /* add x0y0 */
+  if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
+    goto X1X1;           /* t2 = x0x0 + x1x1 */
+  if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
+    goto X1X1;           /* t1 = x0x0 + x1x1 - (x1-x0)*(x1-x0) */
+
+  /* shift by B */
+  if (mp_lshd (&t1, B) != MP_OKAY)
+    goto X1X1;           /* t1 = (x0x0 + x1x1 - (x1-x0)*(x1-x0))<<B */
+  if (mp_lshd (&x1x1, B * 2) != MP_OKAY)
+    goto X1X1;           /* x1x1 = x1x1 << 2*B */
+
+  if (mp_add (&x0x0, &t1, &t1) != MP_OKAY)
+    goto X1X1;           /* t1 = x0x0 + t1 */
+  if (mp_add (&t1, &x1x1, b) != MP_OKAY)
+    goto X1X1;           /* t1 = x0x0 + t1 + x1x1 */
+
+  err = MP_OKAY;
+
+X1X1:mp_clear (&x1x1);
+X0X0:mp_clear (&x0x0);
+T2:mp_clear (&t2);
+T1:mp_clear (&t1);
+X1:mp_clear (&x1);
+X0:mp_clear (&x0);
+ERR:
+  return err;
+}
+
+/* End: bn_mp_karatsuba_sqr.c */
+
+/* Start: bn_mp_lcm.c */
+#line 0 "bn_mp_lcm.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* computes least common multiple as a*b/(a, b) */
+int
+mp_lcm (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     res;
+  mp_int  t;
+
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_mul (a, b, &t)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+
+  if ((res = mp_gcd (a, b, c)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+
+  res = mp_div (&t, c, c, NULL);
+  mp_clear (&t);
+  return res;
+}
+
+/* End: bn_mp_lcm.c */
+
+/* Start: bn_mp_lshd.c */
+#line 0 "bn_mp_lshd.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* shift left a certain amount of digits */
+int
+mp_lshd (mp_int * a, int b)
+{
+  int     x, res;
+
+  /* if its less than zero return */
+  if (b <= 0) {
+    return MP_OKAY;
+  }
+
+  /* grow to fit the new digits */
+  if (a->alloc < a->used + b) {
+     if ((res = mp_grow (a, a->used + b)) != MP_OKAY) {
+       return res;
+     }
+  }
+
+  {
+    register mp_digit *top, *bottom;
+
+    /* increment the used by the shift amount then copy upwards */
+    a->used += b;
+
+    /* top */
+    top = a->dp + a->used - 1;
+
+    /* base */
+    bottom = a->dp + a->used - 1 - b;
+
+    /* much like mp_rshd this is implemented using a sliding window
+     * except the window goes the otherway around.  Copying from
+     * the bottom to the top.  see bn_mp_rshd.c for more info.
+     */
+    for (x = a->used - 1; x >= b; x--) {
+      *top-- = *bottom--;
+    }
+
+    /* zero the lower digits */
+    top = a->dp;
+    for (x = 0; x < b; x++) {
+      *top++ = 0;
+    }
+  }
+  return MP_OKAY;
+}
+
+/* End: bn_mp_lshd.c */
+
+/* Start: bn_mp_mod.c */
+#line 0 "bn_mp_mod.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* c = a mod b, 0 <= c < b */
+int
+mp_mod (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int  t;
+  int     res;
+
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_div (a, b, NULL, &t)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+
+  if (t.sign == MP_NEG) {
+    res = mp_add (b, &t, c);
+  } else {
+    res = MP_OKAY;
+    mp_exch (&t, c);
+  }
+
+  mp_clear (&t);
+  return res;
+}
+
+/* End: bn_mp_mod.c */
+
+/* Start: bn_mp_mod_2d.c */
+#line 0 "bn_mp_mod_2d.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* calc a value mod 2^b */
+int
+mp_mod_2d (mp_int * a, int b, mp_int * c)
+{
+  int     x, res;
+
+
+  /* if b is <= 0 then zero the int */
+  if (b <= 0) {
+    mp_zero (c);
+    return MP_OKAY;
+  }
+
+  /* if the modulus is larger than the value than return */
+  if (b > (int) (a->used * DIGIT_BIT)) {
+    res = mp_copy (a, c);
+    return res;
+  }
+
+  /* copy */
+  if ((res = mp_copy (a, c)) != MP_OKAY) {
+    return res;
+  }
+
+  /* zero digits above the last digit of the modulus */
+  for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
+    c->dp[x] = 0;
+  }
+  /* clear the digit that is not completely outside/inside the modulus */
+  c->dp[b / DIGIT_BIT] &=
+    (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digit) 1));
+  mp_clamp (c);
+  return MP_OKAY;
+}
+
+/* End: bn_mp_mod_2d.c */
+
+/* Start: bn_mp_mod_d.c */
+#line 0 "bn_mp_mod_d.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+int
+mp_mod_d (mp_int * a, mp_digit b, mp_digit * c)
+{
+  return mp_div_d(a, b, NULL, c);
+}
+
+/* End: bn_mp_mod_d.c */
+
+/* Start: bn_mp_montgomery_calc_normalization.c */
+#line 0 "bn_mp_montgomery_calc_normalization.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* calculates a = B^n mod b for Montgomery reduction
+ * Where B is the base [e.g. 2^DIGIT_BIT].
+ * B^n mod b is computed by first computing
+ * A = B^(n-1) which doesn't require a reduction but a simple OR.
+ * then C = A * B = B^n is computed by performing upto DIGIT_BIT
+ * shifts with subtractions when the result is greater than b.
+ *
+ * The method is slightly modified to shift B unconditionally upto just under
+ * the leading bit of b.  This saves alot of multiple precision shifting.
+ */
+int
+mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
+{
+  int     x, bits, res;
+
+  /* how many bits of last digit does b use */
+  bits = mp_count_bits (b) % DIGIT_BIT;
+
+  /* compute A = B^(n-1) * 2^(bits-1) */
+  if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
+    return res;
+  }
+
+  /* now compute C = A * B mod b */
+  for (x = bits - 1; x < (int)DIGIT_BIT; x++) {
+    if ((res = mp_mul_2 (a, a)) != MP_OKAY) {
+      return res;
+    }
+    if (mp_cmp_mag (a, b) != MP_LT) {
+      if ((res = s_mp_sub (a, b, a)) != MP_OKAY) {
+        return res;
+      }
+    }
+  }
+
+  return MP_OKAY;
+}
+
+/* End: bn_mp_montgomery_calc_normalization.c */
+
+/* Start: bn_mp_montgomery_reduce.c */
+#line 0 "bn_mp_montgomery_reduce.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* computes xR**-1 == x (mod N) via Montgomery Reduction */
+int
+mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
+{
+  int     ix, res, digs;
+  mp_digit mu;
+
+  /* can the fast reduction [comba] method be used?
+   *
+   * Note that unlike in mp_mul you're safely allowed *less*
+   * than the available columns [255 per default] since carries
+   * are fixed up in the inner loop.
+   */
+  digs = n->used * 2 + 1;
+  if ((digs < MP_WARRAY) && 
+      n->used < 
+      (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+    return fast_mp_montgomery_reduce (x, n, rho);
+  }
+
+  /* grow the input as required */
+  if (x->alloc < digs) {
+    if ((res = mp_grow (x, digs)) != MP_OKAY) {
+      return res;
+    }
+  }
+  x->used = digs;
+
+  for (ix = 0; ix < n->used; ix++) {
+    /* mu = ai * m' mod b */
+    mu = (x->dp[ix] * rho) & MP_MASK;
+
+    /* a = a + mu * m * b**i */
+    {
+      register int iy;
+      register mp_digit *tmpn, *tmpx, u;
+      register mp_word r;
+
+      /* aliases */
+      tmpn = n->dp;
+      tmpx = x->dp + ix;
+
+      /* set the carry to zero */
+      u = 0;
+      
+      /* Multiply and add in place */
+      for (iy = 0; iy < n->used; iy++) {
+        r = ((mp_word) mu) * ((mp_word) * tmpn++) + 
+            ((mp_word) u) + ((mp_word) * tmpx);
+        u = (r >> ((mp_word) DIGIT_BIT));
+        *tmpx++ = (r & ((mp_word) MP_MASK));
+      }
+      /* propagate carries */
+      while (u) {
+        *tmpx   += u;
+        u        = *tmpx >> DIGIT_BIT;
+        *tmpx++ &= MP_MASK;
+      }
+    }
+  }
+
+  /* x = x/b**n.used */
+  mp_clamp(x);
+  mp_rshd (x, n->used);
+
+  /* if A >= m then A = A - m */
+  if (mp_cmp_mag (x, n) != MP_LT) {
+    return s_mp_sub (x, n, x);
+  }
+
+  return MP_OKAY;
+}
+
+/* End: bn_mp_montgomery_reduce.c */
+
+/* Start: bn_mp_montgomery_setup.c */
+#line 0 "bn_mp_montgomery_setup.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* setups the montgomery reduction stuff */
+int
+mp_montgomery_setup (mp_int * n, mp_digit * rho)
+{
+  mp_digit x, b;
+
+/* fast inversion mod 2**k
+ *
+ * Based on the fact that
+ *
+ * XA = 1 (mod 2**n)  =>  (X(2-XA)) A = 1 (mod 2**2n)
+ *                    =>  2*X*A - X*X*A*A = 1
+ *                    =>  2*(1) - (1)     = 1
+ */
+  b = n->dp[0];
+
+  if ((b & 1) == 0) {
+    return MP_VAL;
+  }
+
+  x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
+  x *= 2 - b * x;               /* here x*a==1 mod 2**8 */
+#if !defined(MP_8BIT)
+  x *= 2 - b * x;               /* here x*a==1 mod 2**16 */
+#endif
+#if defined(MP_64BIT) || !(defined(MP_8BIT) || defined(MP_16BIT))
+  x *= 2 - b * x;               /* here x*a==1 mod 2**32 */
+#endif
+#ifdef MP_64BIT
+  x *= 2 - b * x;               /* here x*a==1 mod 2**64 */
+#endif
+
+  /* rho = -1/m mod b */
+  *rho = (((mp_digit) 1 << ((mp_digit) DIGIT_BIT)) - x) & MP_MASK;
+
+  return MP_OKAY;
+}
+
+/* End: bn_mp_montgomery_setup.c */
+
+/* Start: bn_mp_mul.c */
+#line 0 "bn_mp_mul.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* high level multiplication (handles sign) */
+int
+mp_mul (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     res, neg;
+  neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
+  
+  if (MIN (a->used, b->used) >= TOOM_MUL_CUTOFF) {
+    res = mp_toom_mul(a, b, c);
+  } else if (MIN (a->used, b->used) >= KARATSUBA_MUL_CUTOFF) {
+    res = mp_karatsuba_mul (a, b, c);
+  } else {
+
+    /* can we use the fast multiplier?
+     *
+     * The fast multiplier can be used if the output will 
+     * have less than MP_WARRAY digits and the number of 
+     * digits won't affect carry propagation
+     */
+    int     digs = a->used + b->used + 1;
+
+    if ((digs < MP_WARRAY) &&
+        MIN(a->used, b->used) <= 
+        (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+      res = fast_s_mp_mul_digs (a, b, c, digs);
+    } else {
+      res = s_mp_mul (a, b, c);
+    }
+
+  }
+  c->sign = neg;
+  return res;
+}
+
+/* End: bn_mp_mul.c */
+
+/* Start: bn_mp_mul_2.c */
+#line 0 "bn_mp_mul_2.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* b = a*2 */
+int
+mp_mul_2 (mp_int * a, mp_int * b)
+{
+  int     x, res, oldused;
+
+  /* grow to accomodate result */
+  if (b->alloc < a->used + 1) {
+    if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  oldused = b->used;
+  b->used = a->used;
+
+  {
+    register mp_digit r, rr, *tmpa, *tmpb;
+
+    /* alias for source */
+    tmpa = a->dp;
+    
+    /* alias for dest */
+    tmpb = b->dp;
+
+    /* carry */
+    r = 0;
+    for (x = 0; x < a->used; x++) {
+    
+      /* get what will be the *next* carry bit from the 
+       * MSB of the current digit 
+       */
+      rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1));
+      
+      /* now shift up this digit, add in the carry [from the previous] */
+      *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK;
+      
+      /* copy the carry that would be from the source 
+       * digit into the next iteration 
+       */
+      r = rr;
+    }
+
+    /* new leading digit? */
+    if (r != 0) {
+      /* add a MSB which is always 1 at this point */
+      *tmpb = 1;
+      ++b->used;
+    }
+
+    /* now zero any excess digits on the destination 
+     * that we didn't write to 
+     */
+    tmpb = b->dp + b->used;
+    for (x = b->used; x < oldused; x++) {
+      *tmpb++ = 0;
+    }
+  }
+  b->sign = a->sign;
+  return MP_OKAY;
+}
+
+/* End: bn_mp_mul_2.c */
+
+/* Start: bn_mp_mul_2d.c */
+#line 0 "bn_mp_mul_2d.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* NOTE:  This routine requires updating.  For instance the c->used = c->alloc bit
+   is wrong.  We should just shift c->used digits then set the carry as c->dp[c->used] = carry
+ 
+   To be fixed for LTM 0.18
+ */
+
+/* shift left by a certain bit count */
+int
+mp_mul_2d (mp_int * a, int b, mp_int * c)
+{
+  mp_digit d;
+  int      res;
+
+  /* copy */
+  if (a != c) {
+     if ((res = mp_copy (a, c)) != MP_OKAY) {
+       return res;
+     }
+  }
+
+  if (c->alloc < (int)(c->used + b/DIGIT_BIT + 2)) {
+     if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 2)) != MP_OKAY) {
+       return res;
+     }
+  }
+
+  /* shift by as many digits in the bit count */
+  if (b >= (int)DIGIT_BIT) {
+    if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) {
+      return res;
+    }
+  }
+  c->used = c->alloc;
+
+  /* shift any bit count < DIGIT_BIT */
+  d = (mp_digit) (b % DIGIT_BIT);
+  if (d != 0) {
+    register mp_digit *tmpc, mask, r, rr;
+    register int x;
+
+    /* bitmask for carries */
+    mask = (((mp_digit)1) << d) - 1;
+
+    /* alias */
+    tmpc = c->dp;
+
+    /* carry */
+    r    = 0;
+    for (x = 0; x < c->used; x++) {
+      /* get the higher bits of the current word */
+      rr = (*tmpc >> (DIGIT_BIT - d)) & mask;
+
+      /* shift the current word and OR in the carry */
+      *tmpc = ((*tmpc << d) | r) & MP_MASK;
+      ++tmpc;
+
+      /* set the carry to the carry bits of the current word */
+      r = rr;
+    }
+  }
+  mp_clamp (c);
+  return MP_OKAY;
+}
+
+/* End: bn_mp_mul_2d.c */
+
+/* Start: bn_mp_mul_d.c */
+#line 0 "bn_mp_mul_d.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* multiply by a digit */
+int
+mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
+{
+  int     res, pa, olduse;
+
+  /* make sure c is big enough to hold a*b */
+  pa = a->used;
+  if (c->alloc < pa + 1) {
+    if ((res = mp_grow (c, pa + 1)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* get the original destinations used count */
+  olduse = c->used;
+
+  /* set the new temporary used count */
+  c->used = pa + 1;
+
+  {
+    register mp_digit u, *tmpa, *tmpc;
+    register mp_word r;
+    register int ix;
+
+    /* alias for a->dp [source] */
+    tmpa = a->dp;
+
+    /* alias for c->dp [dest] */
+    tmpc = c->dp;
+
+    /* zero carry */
+    u = 0;
+    for (ix = 0; ix < pa; ix++) {
+      /* compute product and carry sum for this term */
+      r = ((mp_word) u) + ((mp_word) * tmpa++) * ((mp_word) b);
+
+      /* mask off higher bits to get a single digit */
+      *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
+
+      /* send carry into next iteration */
+      u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+    }
+    /* store final carry [if any] */
+    *tmpc++ = u;
+
+    /* now zero digits above the top */
+    for (; pa < olduse; pa++) {
+       *tmpc++ = 0;
+    }
+  }
+
+  mp_clamp (c);
+  return MP_OKAY;
+}
+
+/* End: bn_mp_mul_d.c */
+
+/* Start: bn_mp_mulmod.c */
+#line 0 "bn_mp_mulmod.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* d = a * b (mod c) */
+int
+mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+{
+  int     res;
+  mp_int  t;
+
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_mul (a, b, &t)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+  res = mp_mod (&t, c, d);
+  mp_clear (&t);
+  return res;
+}
+
+/* End: bn_mp_mulmod.c */
+
+/* Start: bn_mp_multi.c */
+#line 0 "bn_mp_multi.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+#include <stdarg.h>
+
+int mp_init_multi(mp_int *mp, ...) 
+{
+    mp_err res = MP_OKAY;      /* Assume ok until proven otherwise */
+    int n = 0;                 /* Number of ok inits */
+    mp_int* cur_arg = mp;
+    va_list args;
+
+    va_start(args, mp);        /* init args to next argument from caller */
+    while (cur_arg != NULL) {
+        if (mp_init(cur_arg) != MP_OKAY) {
+            /* Oops - error! Back-track and mp_clear what we already
+               succeeded in init-ing, then return error.
+            */
+            va_list clean_args;
+            
+            /* end the current list */
+            va_end(args);
+            
+            /* now start cleaning up */            
+            cur_arg = mp;
+            va_start(clean_args, mp);
+            while (n--) {
+                mp_clear(cur_arg);
+                cur_arg = va_arg(clean_args, mp_int*);
+            }
+            va_end(clean_args);
+            res = MP_MEM;
+            break;
+        }
+        n++;
+        cur_arg = va_arg(args, mp_int*);
+    }
+    va_end(args);
+    return res;                /* Assumed ok, if error flagged above. */
+}
+
+void mp_clear_multi(mp_int *mp, ...) 
+{
+    mp_int* next_mp = mp;
+    va_list args;
+    va_start(args, mp);
+    while (next_mp != NULL) {
+        mp_clear(next_mp);
+        next_mp = va_arg(args, mp_int*);
+    }
+    va_end(args);
+}
+
+/* End: bn_mp_multi.c */
+
+/* Start: bn_mp_n_root.c */
+#line 0 "bn_mp_n_root.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* find the n'th root of an integer 
+ *
+ * Result found such that (c)^b <= a and (c+1)^b > a 
+ *
+ * This algorithm uses Newton's approximation x[i+1] = x[i] - f(x[i])/f'(x[i]) 
+ * which will find the root in log(N) time where each step involves a fair bit.  This
+ * is not meant to find huge roots [square and cube at most].
+ */
+int
+mp_n_root (mp_int * a, mp_digit b, mp_int * c)
+{
+  mp_int  t1, t2, t3;
+  int     res, neg;
+
+  /* input must be positive if b is even */
+  if ((b & 1) == 0 && a->sign == MP_NEG) {
+    return MP_VAL;
+  }
+
+  if ((res = mp_init (&t1)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_init (&t2)) != MP_OKAY) {
+    goto __T1;
+  }
+
+  if ((res = mp_init (&t3)) != MP_OKAY) {
+    goto __T2;
+  }
+
+  /* if a is negative fudge the sign but keep track */
+  neg = a->sign;
+  a->sign = MP_ZPOS;
+
+  /* t2 = 2 */
+  mp_set (&t2, 2);
+
+  do {
+    /* t1 = t2 */
+    if ((res = mp_copy (&t2, &t1)) != MP_OKAY) {
+      goto __T3;
+    }
+
+    /* t2 = t1 - ((t1^b - a) / (b * t1^(b-1))) */
+    if ((res = mp_expt_d (&t1, b - 1, &t3)) != MP_OKAY) {   /* t3 = t1^(b-1) */
+      goto __T3;
+    }
+
+    /* numerator */
+    if ((res = mp_mul (&t3, &t1, &t2)) != MP_OKAY) {    /* t2 = t1^b */
+      goto __T3;
+    }
+
+    if ((res = mp_sub (&t2, a, &t2)) != MP_OKAY) {  /* t2 = t1^b - a */
+      goto __T3;
+    }
+
+    if ((res = mp_mul_d (&t3, b, &t3)) != MP_OKAY) {    /* t3 = t1^(b-1) * b  */
+      goto __T3;
+    }
+
+    if ((res = mp_div (&t2, &t3, &t3, NULL)) != MP_OKAY) {  /* t3 = (t1^b - a)/(b * t1^(b-1)) */
+      goto __T3;
+    }
+
+    if ((res = mp_sub (&t1, &t3, &t2)) != MP_OKAY) {
+      goto __T3;
+    }
+  }
+  while (mp_cmp (&t1, &t2) != MP_EQ);
+
+  /* result can be off by a few so check */
+  for (;;) {
+    if ((res = mp_expt_d (&t1, b, &t2)) != MP_OKAY) {
+      goto __T3;
+    }
+
+    if (mp_cmp (&t2, a) == MP_GT) {
+      if ((res = mp_sub_d (&t1, 1, &t1)) != MP_OKAY) {
+    goto __T3;
+      }
+    } else {
+      break;
+    }
+  }
+
+  /* reset the sign of a first */
+  a->sign = neg;
+
+  /* set the result */
+  mp_exch (&t1, c);
+
+  /* set the sign of the result */
+  c->sign = neg;
+
+  res = MP_OKAY;
+
+__T3:mp_clear (&t3);
+__T2:mp_clear (&t2);
+__T1:mp_clear (&t1);
+  return res;
+}
+
+/* End: bn_mp_n_root.c */
+
+/* Start: bn_mp_neg.c */
+#line 0 "bn_mp_neg.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* b = -a */
+int
+mp_neg (mp_int * a, mp_int * b)
+{
+  int     res;
+  if ((res = mp_copy (a, b)) != MP_OKAY) {
+    return res;
+  }
+  b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+  return MP_OKAY;
+}
+
+/* End: bn_mp_neg.c */
+
+/* Start: bn_mp_or.c */
+#line 0 "bn_mp_or.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* OR two ints together */
+int
+mp_or (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     res, ix, px;
+  mp_int  t, *x;
+
+  if (a->used > b->used) {
+    if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+      return res;
+    }
+    px = b->used;
+    x = b;
+  } else {
+    if ((res = mp_init_copy (&t, b)) != MP_OKAY) {
+      return res;
+    }
+    px = a->used;
+    x = a;
+  }
+
+  for (ix = 0; ix < px; ix++) {
+    t.dp[ix] |= x->dp[ix];
+  }
+  mp_clamp (&t);
+  mp_exch (c, &t);
+  mp_clear (&t);
+  return MP_OKAY;
+}
+
+/* End: bn_mp_or.c */
+
+/* Start: bn_mp_prime_fermat.c */
+#line 0 "bn_mp_prime_fermat.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* performs one Fermat test.
+ * 
+ * If "a" were prime then b^a == b (mod a) since the order of
+ * the multiplicative sub-group would be phi(a) = a-1.  That means
+ * it would be the same as b^(a mod (a-1)) == b^1 == b (mod a).
+ *
+ * Sets result to 1 if the congruence holds, or zero otherwise.
+ */
+int
+mp_prime_fermat (mp_int * a, mp_int * b, int *result)
+{
+  mp_int  t;
+  int     err;
+
+  /* default to fail */
+  *result = 0;
+
+  /* init t */
+  if ((err = mp_init (&t)) != MP_OKAY) {
+    return err;
+  }
+
+  /* compute t = b^a mod a */
+  if ((err = mp_exptmod (b, a, a, &t)) != MP_OKAY) {
+    goto __T;
+  }
+
+  /* is it equal to b? */
+  if (mp_cmp (&t, b) == MP_EQ) {
+    *result = 1;
+  }
+
+  err = MP_OKAY;
+__T:mp_clear (&t);
+  return err;
+}
+
+/* End: bn_mp_prime_fermat.c */
+
+/* Start: bn_mp_prime_is_divisible.c */
+#line 0 "bn_mp_prime_is_divisible.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* determines if an integers is divisible by one of the first 256 primes or not
+ *
+ * sets result to 0 if not, 1 if yes
+ */
+int
+mp_prime_is_divisible (mp_int * a, int *result)
+{
+  int     err, ix;
+  mp_digit res;
+
+  /* default to not */
+  *result = 0;
+
+  for (ix = 0; ix < PRIME_SIZE; ix++) {
+    /* is it equal to the prime? */
+    if (mp_cmp_d (a, __prime_tab[ix]) == MP_EQ) {
+      *result = 1;
+      return MP_OKAY;
+    }
+
+    /* what is a mod __prime_tab[ix] */
+    if ((err = mp_mod_d (a, __prime_tab[ix], &res)) != MP_OKAY) {
+      return err;
+    }
+
+    /* is the residue zero? */
+    if (res == 0) {
+      *result = 1;
+      return MP_OKAY;
+    }
+  }
+
+  return MP_OKAY;
+}
+
+/* End: bn_mp_prime_is_divisible.c */
+
+/* Start: bn_mp_prime_is_prime.c */
+#line 0 "bn_mp_prime_is_prime.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* performs a variable number of rounds of Miller-Rabin
+ *
+ * Probability of error after t rounds is no more than
+ * (1/4)^t when 1 <= t <= 256
+ *
+ * Sets result to 1 if probably prime, 0 otherwise
+ */
+int
+mp_prime_is_prime (mp_int * a, int t, int *result)
+{
+  mp_int  b;
+  int     ix, err, res;
+
+  /* default to no */
+  *result = 0;
+
+  /* valid value of t? */
+  if (t < 1 || t > PRIME_SIZE) {
+    return MP_VAL;
+  }
+
+  /* is the input equal to one of the primes in the table? */
+  for (ix = 0; ix < PRIME_SIZE; ix++) {
+      if (mp_cmp_d(a, __prime_tab[ix]) == MP_EQ) {
+         *result = 1;
+         return MP_OKAY;
+      }
+  }
+
+  /* first perform trial division */
+  if ((err = mp_prime_is_divisible (a, &res)) != MP_OKAY) {
+    return err;
+  }
+  if (res == 1) {
+    return MP_OKAY;
+  }
+
+  /* now perform the miller-rabin rounds */
+  if ((err = mp_init (&b)) != MP_OKAY) {
+    return err;
+  }
+
+  for (ix = 0; ix < t; ix++) {
+    /* set the prime */
+    mp_set (&b, __prime_tab[ix]);
+
+    if ((err = mp_prime_miller_rabin (a, &b, &res)) != MP_OKAY) {
+      goto __B;
+    }
+
+    if (res == 0) {
+      goto __B;
+    }
+  }
+
+  /* passed the test */
+  *result = 1;
+__B:mp_clear (&b);
+  return err;
+}
+
+/* End: bn_mp_prime_is_prime.c */
+
+/* Start: bn_mp_prime_miller_rabin.c */
+#line 0 "bn_mp_prime_miller_rabin.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Miller-Rabin test of "a" to the base of "b" as described in 
+ * HAC pp. 139 Algorithm 4.24
+ *
+ * Sets result to 0 if definitely composite or 1 if probably prime.
+ * Randomly the chance of error is no more than 1/4 and often 
+ * very much lower.
+ */
+int
+mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result)
+{
+  mp_int  n1, y, r;
+  int     s, j, err;
+
+  /* default */
+  *result = 0;
+
+  /* get n1 = a - 1 */
+  if ((err = mp_init_copy (&n1, a)) != MP_OKAY) {
+    return err;
+  }
+  if ((err = mp_sub_d (&n1, 1, &n1)) != MP_OKAY) {
+    goto __N1;
+  }
+
+  /* set 2^s * r = n1 */
+  if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) {
+    goto __N1;
+  }
+  s = 0;
+  while (mp_iseven (&r) == 1) {
+    ++s;
+    if ((err = mp_div_2 (&r, &r)) != MP_OKAY) {
+      goto __R;
+    }
+  }
+
+  /* compute y = b^r mod a */
+  if ((err = mp_init (&y)) != MP_OKAY) {
+    goto __R;
+  }
+  if ((err = mp_exptmod (b, &r, a, &y)) != MP_OKAY) {
+    goto __Y;
+  }
+
+  /* if y != 1 and y != n1 do */
+  if (mp_cmp_d (&y, 1) != MP_EQ && mp_cmp (&y, &n1) != MP_EQ) {
+    j = 1;
+    /* while j <= s-1 and y != n1 */
+    while ((j <= (s - 1)) && mp_cmp (&y, &n1) != MP_EQ) {
+      if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) {
+    goto __Y;
+      }
+
+      /* if y == 1 then composite */
+      if (mp_cmp_d (&y, 1) == MP_EQ) {
+    goto __Y;
+      }
+
+      ++j;
+    }
+
+    /* if y != n1 then composite */
+    if (mp_cmp (&y, &n1) != MP_EQ) {
+      goto __Y;
+    }
+  }
+
+  /* probably prime now */
+  *result = 1;
+__Y:mp_clear (&y);
+__R:mp_clear (&r);
+__N1:mp_clear (&n1);
+  return err;
+}
+
+/* End: bn_mp_prime_miller_rabin.c */
+
+/* Start: bn_mp_prime_next_prime.c */
+#line 0 "bn_mp_prime_next_prime.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* finds the next prime after the number "a" using "t" trials
+ * of Miller-Rabin.
+ */
+int mp_prime_next_prime(mp_int *a, int t)
+{
+   int err, res;
+
+   if (mp_iseven(a) == 1) {
+      /* force odd */
+      if ((err = mp_add_d(a, 1, a)) != MP_OKAY) {
+         return err;
+      }
+   } else {
+      /* force to next odd number */
+      if ((err = mp_add_d(a, 2, a)) != MP_OKAY) {
+         return err;
+      }
+   }
+
+   for (;;) {
+      /* is this prime? */
+      if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY) {
+         return err;
+      }
+
+      if (res == 1) {
+         break;
+      }
+
+      /* add two, next candidate */
+      if ((err = mp_add_d(a, 2, a)) != MP_OKAY) {
+         return err;
+      }
+   }
+
+   return MP_OKAY;
+}
+
+
+/* End: bn_mp_prime_next_prime.c */
+
+/* Start: bn_mp_rand.c */
+#line 0 "bn_mp_rand.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* makes a pseudo-random int of a given size */
+int
+mp_rand (mp_int * a, int digits)
+{
+  int     res;
+  mp_digit d;
+
+  mp_zero (a);
+  if (digits <= 0) {
+    return MP_OKAY;
+  }
+
+  /* first place a random non-zero digit */
+  do {
+    d = ((mp_digit) abs (rand ()));
+  } while (d == 0);
+
+  if ((res = mp_add_d (a, d, a)) != MP_OKAY) {
+    return res;
+  }
+
+  while (digits-- > 0) {
+    if ((res = mp_lshd (a, 1)) != MP_OKAY) {
+      return res;
+    }
+
+    if ((res = mp_add_d (a, ((mp_digit) abs (rand ())), a)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  return MP_OKAY;
+}
+
+/* End: bn_mp_rand.c */
+
+/* Start: bn_mp_read_signed_bin.c */
+#line 0 "bn_mp_read_signed_bin.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* read signed bin, big endian, first byte is 0==positive or 1==negative */
+int
+mp_read_signed_bin (mp_int * a, unsigned char *b, int c)
+{
+  int     res;
+
+  if ((res = mp_read_unsigned_bin (a, b + 1, c - 1)) != MP_OKAY) {
+    return res;
+  }
+  a->sign = ((b[0] == (unsigned char) 0) ? MP_ZPOS : MP_NEG);
+  return MP_OKAY;
+}
+
+/* End: bn_mp_read_signed_bin.c */
+
+/* Start: bn_mp_read_unsigned_bin.c */
+#line 0 "bn_mp_read_unsigned_bin.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* reads a unsigned char array, assumes the msb is stored first [big endian] */
+int
+mp_read_unsigned_bin (mp_int * a, unsigned char *b, int c)
+{
+  int     res;
+  mp_zero (a);
+  while (c-- > 0) {
+    if ((res = mp_mul_2d (a, 8, a)) != MP_OKAY) {
+      return res;
+    }
+
+    if (DIGIT_BIT != 7) {
+      a->dp[0] |= *b++;
+      a->used += 1;
+    } else {
+      a->dp[0] = (*b & MP_MASK);
+      a->dp[1] |= ((*b++ >> 7U) & 1);
+      a->used += 2;
+    }
+  }
+  mp_clamp (a);
+  return MP_OKAY;
+}
+
+/* End: bn_mp_read_unsigned_bin.c */
+
+/* Start: bn_mp_reduce.c */
+#line 0 "bn_mp_reduce.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* reduces x mod m, assumes 0 < x < m**2, mu is 
+ * precomputed via mp_reduce_setup.
+ * From HAC pp.604 Algorithm 14.42
+ */
+int
+mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
+{
+  mp_int  q;
+  int     res, um = m->used;
+
+  /* q = x */
+  if ((res = mp_init_copy (&q, x)) != MP_OKAY) {
+    return res;
+  }
+
+  /* q1 = x / b**(k-1)  */
+  mp_rshd (&q, um - 1);         
+
+  /* according to HAC this is optimization is ok */
+  if (((unsigned long) m->used) > (((mp_digit)1) << (DIGIT_BIT - 1))) {
+    if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) {
+      goto CLEANUP;
+    }
+  } else {
+    if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
+      goto CLEANUP;
+    }
+  }
+
+  /* q3 = q2 / b**(k+1) */
+  mp_rshd (&q, um + 1);         
+
+  /* x = x mod b**(k+1), quick (no division) */
+  if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) {
+    goto CLEANUP;
+  }
+
+  /* q = q * m mod b**(k+1), quick (no division) */
+  if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) {
+    goto CLEANUP;
+  }
+
+  /* x = x - q */
+  if ((res = mp_sub (x, &q, x)) != MP_OKAY) {
+    goto CLEANUP;
+  }
+
+  /* If x < 0, add b**(k+1) to it */
+  if (mp_cmp_d (x, 0) == MP_LT) {
+    mp_set (&q, 1);
+    if ((res = mp_lshd (&q, um + 1)) != MP_OKAY)
+      goto CLEANUP;
+    if ((res = mp_add (x, &q, x)) != MP_OKAY)
+      goto CLEANUP;
+  }
+
+  /* Back off if it's too big */
+  while (mp_cmp (x, m) != MP_LT) {
+    if ((res = s_mp_sub (x, m, x)) != MP_OKAY) {
+      break;
+    }
+  }
+  
+CLEANUP:
+  mp_clear (&q);
+
+  return res;
+}
+
+/* End: bn_mp_reduce.c */
+
+/* Start: bn_mp_reduce_2k.c */
+#line 0 "bn_mp_reduce_2k.c"
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is library that provides for multiple-precision
@@ -4889,11 +4893,11 @@ ERR:
    return res;
 }
 
-
-/* End: bn_mp_reduce_2k.c */
-
-/* Start: bn_mp_reduce_2k_setup.c */
-#line 0 "bn_mp_reduce_2k_setup.c"
+
+/* End: bn_mp_reduce_2k.c */
+
+/* Start: bn_mp_reduce_2k_setup.c */
+#line 0 "bn_mp_reduce_2k_setup.c"
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is library that provides for multiple-precision
@@ -4936,11 +4940,11 @@ mp_reduce_2k_setup(mp_int *a, mp_digit *d)
    mp_clear(&tmp);
    return MP_OKAY;
 }
-
-/* End: bn_mp_reduce_2k_setup.c */
-
-/* Start: bn_mp_reduce_is_2k.c */
-#line 0 "bn_mp_reduce_is_2k.c"
+
+/* End: bn_mp_reduce_2k_setup.c */
+
+/* Start: bn_mp_reduce_is_2k.c */
+#line 0 "bn_mp_reduce_is_2k.c"
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is library that provides for multiple-precision
@@ -4970,7 +4974,8 @@ mp_reduce_is_2k(mp_int *a)
    } else if (a->used > 1) {
       iy = mp_count_bits(a);
       for (ix = DIGIT_BIT; ix < iy; ix++) {
-          if ((a->dp[ix/DIGIT_BIT] & ((mp_digit)1 << (mp_digit)(ix % DIGIT_BIT))) == 0) {
+          if ((a->dp[ix/DIGIT_BIT] & 
+              ((mp_digit)1 << (mp_digit)(ix % DIGIT_BIT))) == 0) {
              return 0;
           }
       }
@@ -4978,11 +4983,11 @@ mp_reduce_is_2k(mp_int *a)
    return 1;
 }
 
-
-/* End: bn_mp_reduce_is_2k.c */
-
-/* Start: bn_mp_reduce_setup.c */
-#line 0 "bn_mp_reduce_setup.c"
+
+/* End: bn_mp_reduce_is_2k.c */
+
+/* Start: bn_mp_reduce_setup.c */
+#line 0 "bn_mp_reduce_setup.c"
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is library that provides for multiple-precision
@@ -5012,11 +5017,522 @@ mp_reduce_setup (mp_int * a, mp_int * b)
   }
   return mp_div (a, b, a, NULL);
 }
-
-/* End: bn_mp_reduce_setup.c */
-
-/* Start: bn_mp_rshd.c */
-#line 0 "bn_mp_rshd.c"
+
+/* End: bn_mp_reduce_setup.c */
+
+/* Start: bn_mp_rshd.c */
+#line 0 "bn_mp_rshd.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* shift right a certain amount of digits */
+void
+mp_rshd (mp_int * a, int b)
+{
+  int     x;
+
+  /* if b <= 0 then ignore it */
+  if (b <= 0) {
+    return;
+  }
+
+  /* if b > used then simply zero it and return */
+  if (a->used <= b) {
+    mp_zero (a);
+    return;
+  }
+
+  {
+    register mp_digit *bottom, *top;
+
+    /* shift the digits down */
+
+    /* bottom */
+    bottom = a->dp;
+
+    /* top [offset into digits] */
+    top = a->dp + b;
+
+    /* this is implemented as a sliding window where 
+     * the window is b-digits long and digits from 
+     * the top of the window are copied to the bottom
+     *
+     * e.g.
+
+     b-2 | b-1 | b0 | b1 | b2 | ... | bb |   ---->
+                 /\                   |      ---->
+                  \-------------------/      ---->
+     */
+    for (x = 0; x < (a->used - b); x++) {
+      *bottom++ = *top++;
+    }
+
+    /* zero the top digits */
+    for (; x < a->used; x++) {
+      *bottom++ = 0;
+    }
+  }
+  
+  /* remove excess digits */
+  a->used -= b;
+}
+
+/* End: bn_mp_rshd.c */
+
+/* Start: bn_mp_set.c */
+#line 0 "bn_mp_set.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* set to a digit */
+void
+mp_set (mp_int * a, mp_digit b)
+{
+  mp_zero (a);
+  a->dp[0] = b & MP_MASK;
+  a->used = (a->dp[0] != 0) ? 1 : 0;
+}
+
+/* End: bn_mp_set.c */
+
+/* Start: bn_mp_set_int.c */
+#line 0 "bn_mp_set_int.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* set a 32-bit const */
+int
+mp_set_int (mp_int * a, unsigned int b)
+{
+  int     x, res;
+
+  mp_zero (a);
+  /* set four bits at a time */
+  for (x = 0; x < 8; x++) {
+    /* shift the number up four bits */
+    if ((res = mp_mul_2d (a, 4, a)) != MP_OKAY) {
+      return res;
+    }
+
+    /* OR in the top four bits of the source */
+    a->dp[0] |= (b >> 28) & 15;
+
+    /* shift the source up to the next four bits */
+    b <<= 4;
+
+    /* ensure that digits are not clamped off */
+    a->used += 1;
+  }
+  mp_clamp (a);
+  return MP_OKAY;
+}
+
+/* End: bn_mp_set_int.c */
+
+/* Start: bn_mp_shrink.c */
+#line 0 "bn_mp_shrink.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* shrink a bignum */
+int
+mp_shrink (mp_int * a)
+{
+  if (a->alloc != a->used) {
+    if ((a->dp = OPT_CAST realloc (a->dp, sizeof (mp_digit) * a->used)) == NULL) {
+      return MP_MEM;
+    }
+    a->alloc = a->used;
+  }
+  return MP_OKAY;
+}
+
+/* End: bn_mp_shrink.c */
+
+/* Start: bn_mp_signed_bin_size.c */
+#line 0 "bn_mp_signed_bin_size.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* get the size for an signed equivalent */
+int
+mp_signed_bin_size (mp_int * a)
+{
+  return 1 + mp_unsigned_bin_size (a);
+}
+
+/* End: bn_mp_signed_bin_size.c */
+
+/* Start: bn_mp_sqr.c */
+#line 0 "bn_mp_sqr.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* computes b = a*a */
+int
+mp_sqr (mp_int * a, mp_int * b)
+{
+  int     res;
+  if (a->used >= TOOM_SQR_CUTOFF) {
+    res = mp_toom_sqr(a, b);
+  } else if (a->used >= KARATSUBA_SQR_CUTOFF) {
+    res = mp_karatsuba_sqr (a, b);
+  } else {
+
+    /* can we use the fast multiplier? */
+    if ((a->used * 2 + 1) < MP_WARRAY && 
+         a->used < 
+         (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) {
+      res = fast_s_mp_sqr (a, b);
+    } else {
+      res = s_mp_sqr (a, b);
+    }
+  }
+  b->sign = MP_ZPOS;
+  return res;
+}
+
+/* End: bn_mp_sqr.c */
+
+/* Start: bn_mp_sqrmod.c */
+#line 0 "bn_mp_sqrmod.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* c = a * a (mod b) */
+int
+mp_sqrmod (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     res;
+  mp_int  t;
+
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_sqr (a, &t)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+  res = mp_mod (&t, b, c);
+  mp_clear (&t);
+  return res;
+}
+
+/* End: bn_mp_sqrmod.c */
+
+/* Start: bn_mp_sub.c */
+#line 0 "bn_mp_sub.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* high level subtraction (handles signs) */
+int
+mp_sub (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     sa, sb, res;
+
+  sa = a->sign;
+  sb = b->sign;
+
+  if (sa != sb) {
+    /* subtract a negative from a positive, OR */
+    /* subtract a positive from a negative. */
+    /* In either case, ADD their magnitudes, */
+    /* and use the sign of the first number. */
+    c->sign = sa;
+    res = s_mp_add (a, b, c);
+  } else {
+    /* subtract a positive from a positive, OR */
+    /* subtract a negative from a negative. */
+    /* First, take the difference between their */
+    /* magnitudes, then... */
+    if (mp_cmp_mag (a, b) != MP_LT) {
+      /* Copy the sign from the first */
+      c->sign = sa;
+      /* The first has a larger or equal magnitude */
+      res = s_mp_sub (a, b, c);
+    } else {
+      /* The result has the *opposite* sign from */
+      /* the first number. */
+      c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+      /* The second has a larger magnitude */
+      res = s_mp_sub (b, a, c);
+    }
+  }
+  return res;
+}
+
+
+/* End: bn_mp_sub.c */
+
+/* Start: bn_mp_sub_d.c */
+#line 0 "bn_mp_sub_d.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* single digit subtraction */
+int
+mp_sub_d (mp_int * a, mp_digit b, mp_int * c)
+{
+  mp_int  t;
+  int     res;
+
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+  mp_set (&t, b);
+  res = mp_sub (a, &t, c);
+
+  mp_clear (&t);
+  return res;
+}
+
+/* End: bn_mp_sub_d.c */
+
+/* Start: bn_mp_submod.c */
+#line 0 "bn_mp_submod.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* d = a - b (mod c) */
+int
+mp_submod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+{
+  int     res;
+  mp_int  t;
+
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_sub (a, b, &t)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+  res = mp_mod (&t, c, d);
+  mp_clear (&t);
+  return res;
+}
+
+/* End: bn_mp_submod.c */
+
+/* Start: bn_mp_to_signed_bin.c */
+#line 0 "bn_mp_to_signed_bin.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* store in signed [big endian] format */
+int
+mp_to_signed_bin (mp_int * a, unsigned char *b)
+{
+  int     res;
+
+  if ((res = mp_to_unsigned_bin (a, b + 1)) != MP_OKAY) {
+    return res;
+  }
+  b[0] = (unsigned char) ((a->sign == MP_ZPOS) ? 0 : 1);
+  return MP_OKAY;
+}
+
+/* End: bn_mp_to_signed_bin.c */
+
+/* Start: bn_mp_to_unsigned_bin.c */
+#line 0 "bn_mp_to_unsigned_bin.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* store in unsigned [big endian] format */
+int
+mp_to_unsigned_bin (mp_int * a, unsigned char *b)
+{
+  int     x, res;
+  mp_int  t;
+
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return res;
+  }
+
+  x = 0;
+  while (mp_iszero (&t) == 0) {
+    if (DIGIT_BIT != 7) {
+      b[x++] = (unsigned char) (t.dp[0] & 255);
+    } else {
+      b[x++] = (unsigned char) (t.dp[0] | ((t.dp[1] & 0x01) << 7));
+    }
+    if ((res = mp_div_2d (&t, 8, &t, NULL)) != MP_OKAY) {
+      mp_clear (&t);
+      return res;
+    }
+  }
+  bn_reverse (b, x);
+  mp_clear (&t);
+  return MP_OKAY;
+}
+
+/* End: bn_mp_to_unsigned_bin.c */
+
+/* Start: bn_mp_toom_mul.c */
+#line 0 "bn_mp_toom_mul.c"
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is library that provides for multiple-precision
@@ -5033,518 +5549,7 @@ mp_reduce_setup (mp_int * a, mp_int * b)
  */
 #include <tommath.h>
 
-/* shift right a certain amount of digits */
-void
-mp_rshd (mp_int * a, int b)
-{
-  int     x;
-
-  /* if b <= 0 then ignore it */
-  if (b <= 0) {
-    return;
-  }
-
-  /* if b > used then simply zero it and return */
-  if (a->used <= b) {
-    mp_zero (a);
-    return;
-  }
-
-  {
-    register mp_digit *bottom, *top;
-
-    /* shift the digits down */
-
-    /* bottom */
-    bottom = a->dp;
-
-    /* top [offset into digits] */
-    top = a->dp + b;
-
-    /* this is implemented as a sliding window where 
-     * the window is b-digits long and digits from 
-     * the top of the window are copied to the bottom
-     *
-     * e.g.
-
-     b-2 | b-1 | b0 | b1 | b2 | ... | bb |   ---->
-                 /\                   |      ---->
-                  \-------------------/      ---->
-     */
-    for (x = 0; x < (a->used - b); x++) {
-      *bottom++ = *top++;
-    }
-
-    /* zero the top digits */
-    for (; x < a->used; x++) {
-      *bottom++ = 0;
-    }
-  }
-  
-  /* remove excess digits */
-  a->used -= b;
-}
-
-/* End: bn_mp_rshd.c */
-
-/* Start: bn_mp_set.c */
-#line 0 "bn_mp_set.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* set to a digit */
-void
-mp_set (mp_int * a, mp_digit b)
-{
-  mp_zero (a);
-  a->dp[0] = b & MP_MASK;
-  a->used = (a->dp[0] != 0) ? 1 : 0;
-}
-
-/* End: bn_mp_set.c */
-
-/* Start: bn_mp_set_int.c */
-#line 0 "bn_mp_set_int.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* set a 32-bit const */
-int
-mp_set_int (mp_int * a, unsigned int b)
-{
-  int     x, res;
-
-  mp_zero (a);
-  /* set four bits at a time */
-  for (x = 0; x < 8; x++) {
-    /* shift the number up four bits */
-    if ((res = mp_mul_2d (a, 4, a)) != MP_OKAY) {
-      return res;
-    }
-
-    /* OR in the top four bits of the source */
-    a->dp[0] |= (b >> 28) & 15;
-
-    /* shift the source up to the next four bits */
-    b <<= 4;
-
-    /* ensure that digits are not clamped off */
-    a->used += 1;
-  }
-  mp_clamp (a);
-  return MP_OKAY;
-}
-
-/* End: bn_mp_set_int.c */
-
-/* Start: bn_mp_shrink.c */
-#line 0 "bn_mp_shrink.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* shrink a bignum */
-int
-mp_shrink (mp_int * a)
-{
-  if (a->alloc != a->used) {
-    if ((a->dp = OPT_CAST realloc (a->dp, sizeof (mp_digit) * a->used)) == NULL) {
-      return MP_MEM;
-    }
-    a->alloc = a->used;
-  }
-  return MP_OKAY;
-}
-
-/* End: bn_mp_shrink.c */
-
-/* Start: bn_mp_signed_bin_size.c */
-#line 0 "bn_mp_signed_bin_size.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* get the size for an signed equivalent */
-int
-mp_signed_bin_size (mp_int * a)
-{
-  return 1 + mp_unsigned_bin_size (a);
-}
-
-/* End: bn_mp_signed_bin_size.c */
-
-/* Start: bn_mp_sqr.c */
-#line 0 "bn_mp_sqr.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* computes b = a*a */
-int
-mp_sqr (mp_int * a, mp_int * b)
-{
-  int     res;
-  if (a->used >= TOOM_SQR_CUTOFF) {
-    res = mp_toom_sqr(a, b);
-  } else if (a->used >= KARATSUBA_SQR_CUTOFF) {
-    res = mp_karatsuba_sqr (a, b);
-  } else {
-
-    /* can we use the fast multiplier? */
-    if ((a->used * 2 + 1) < MP_WARRAY && 
-         a->used < 
-         (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) {
-      res = fast_s_mp_sqr (a, b);
-    } else {
-      res = s_mp_sqr (a, b);
-    }
-  }
-  b->sign = MP_ZPOS;
-  return res;
-}
-
-/* End: bn_mp_sqr.c */
-
-/* Start: bn_mp_sqrmod.c */
-#line 0 "bn_mp_sqrmod.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* c = a * a (mod b) */
-int
-mp_sqrmod (mp_int * a, mp_int * b, mp_int * c)
-{
-  int     res;
-  mp_int  t;
-
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_sqr (a, &t)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
-  }
-  res = mp_mod (&t, b, c);
-  mp_clear (&t);
-  return res;
-}
-
-/* End: bn_mp_sqrmod.c */
-
-/* Start: bn_mp_sub.c */
-#line 0 "bn_mp_sub.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* high level subtraction (handles signs) */
-int
-mp_sub (mp_int * a, mp_int * b, mp_int * c)
-{
-  int     sa, sb, res;
-
-  sa = a->sign;
-  sb = b->sign;
-
-  if (sa != sb) {
-    /* subtract a negative from a positive, OR */
-    /* subtract a positive from a negative. */
-    /* In either case, ADD their magnitudes, */
-    /* and use the sign of the first number. */
-    c->sign = sa;
-    res = s_mp_add (a, b, c);
-  } else {
-    /* subtract a positive from a positive, OR */
-    /* subtract a negative from a negative. */
-    /* First, take the difference between their */
-    /* magnitudes, then... */
-    if (mp_cmp_mag (a, b) != MP_LT) {
-      /* Copy the sign from the first */
-      c->sign = sa;
-      /* The first has a larger or equal magnitude */
-      res = s_mp_sub (a, b, c);
-    } else {
-      /* The result has the *opposite* sign from */
-      /* the first number. */
-      c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS;
-      /* The second has a larger magnitude */
-      res = s_mp_sub (b, a, c);
-    }
-  }
-  return res;
-}
-
-
-/* End: bn_mp_sub.c */
-
-/* Start: bn_mp_sub_d.c */
-#line 0 "bn_mp_sub_d.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* single digit subtraction */
-int
-mp_sub_d (mp_int * a, mp_digit b, mp_int * c)
-{
-  mp_int  t;
-  int     res;
-
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    return res;
-  }
-  mp_set (&t, b);
-  res = mp_sub (a, &t, c);
-
-  mp_clear (&t);
-  return res;
-}
-
-/* End: bn_mp_sub_d.c */
-
-/* Start: bn_mp_submod.c */
-#line 0 "bn_mp_submod.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* d = a - b (mod c) */
-int
-mp_submod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
-{
-  int     res;
-  mp_int  t;
-
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_sub (a, b, &t)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
-  }
-  res = mp_mod (&t, c, d);
-  mp_clear (&t);
-  return res;
-}
-
-/* End: bn_mp_submod.c */
-
-/* Start: bn_mp_to_signed_bin.c */
-#line 0 "bn_mp_to_signed_bin.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* store in signed [big endian] format */
-int
-mp_to_signed_bin (mp_int * a, unsigned char *b)
-{
-  int     res;
-
-  if ((res = mp_to_unsigned_bin (a, b + 1)) != MP_OKAY) {
-    return res;
-  }
-  b[0] = (unsigned char) ((a->sign == MP_ZPOS) ? 0 : 1);
-  return MP_OKAY;
-}
-
-/* End: bn_mp_to_signed_bin.c */
-
-/* Start: bn_mp_to_unsigned_bin.c */
-#line 0 "bn_mp_to_unsigned_bin.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* store in unsigned [big endian] format */
-int
-mp_to_unsigned_bin (mp_int * a, unsigned char *b)
-{
-  int     x, res;
-  mp_int  t;
-
-  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-    return res;
-  }
-
-  x = 0;
-  while (mp_iszero (&t) == 0) {
-    if (DIGIT_BIT != 7) {
-      b[x++] = (unsigned char) (t.dp[0] & 255);
-    } else {
-      b[x++] = (unsigned char) (t.dp[0] | ((t.dp[1] & 0x01) << 7));
-    }
-    if ((res = mp_div_2d (&t, 8, &t, NULL)) != MP_OKAY) {
-      mp_clear (&t);
-      return res;
-    }
-  }
-  bn_reverse (b, x);
-  mp_clear (&t);
-  return MP_OKAY;
-}
-
-/* End: bn_mp_to_unsigned_bin.c */
-
-/* Start: bn_mp_toom_mul.c */
-#line 0 "bn_mp_toom_mul.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* multiplication using Toom-Cook 3-way algorithm */
+/* multiplication using the Toom-Cook 3-way algorithm */
 int 
 mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
 {
@@ -5552,14 +5557,16 @@ mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
     int res, B;
         
     /* init temps */
-    if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4, &a0, &a1, &a2, &b0, &b1, &b2, &tmp1, &tmp2, NULL)) != MP_OKAY) {
+    if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4, 
+                             &a0, &a1, &a2, &b0, &b1, 
+                             &b2, &tmp1, &tmp2, NULL)) != MP_OKAY) {
        return res;
     }
     
     /* B */
     B = MIN(a->used, b->used) / 3;
     
-    /* a = a2 * B^2 + a1 * B + a0 */
+    /* a = a2 * B**2 + a1 * B + a0 */
     if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) {
        goto ERR;
     }
@@ -5575,7 +5582,7 @@ mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
     }
     mp_rshd(&a2, B*2);
     
-    /* b = b2 * B^2 + b1 * B + b0 */
+    /* b = b2 * B**2 + b1 * B + b0 */
     if ((res = mp_mod_2d(b, DIGIT_BIT * B, &b0)) != MP_OKAY) {
        goto ERR;
     }
@@ -5689,7 +5696,8 @@ mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
        16 8  4  2  1
        1  0  0  0  0
        
-       using 12 subtractions, 4 shifts, 2 small divisions and 1 small multiplication 
+       using 12 subtractions, 4 shifts, 
+              2 small divisions and 1 small multiplication 
      */
      
      /* r1 - r4 */
@@ -5792,15 +5800,17 @@ mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
      }     
      
 ERR:
-     mp_clear_multi(&w0, &w1, &w2, &w3, &w4, &a0, &a1, &a2, &b0, &b1, &b2, &tmp1, &tmp2, NULL);
+     mp_clear_multi(&w0, &w1, &w2, &w3, &w4, 
+                    &a0, &a1, &a2, &b0, &b1, 
+                    &b2, &tmp1, &tmp2, NULL);
      return res;
 }     
      
-
-/* End: bn_mp_toom_mul.c */
-
-/* Start: bn_mp_toom_sqr.c */
-#line 0 "bn_mp_toom_sqr.c"
+
+/* End: bn_mp_toom_mul.c */
+
+/* Start: bn_mp_toom_sqr.c */
+#line 0 "bn_mp_toom_sqr.c"
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is library that provides for multiple-precision
@@ -6021,543 +6031,551 @@ ERR:
      return res;
 }     
      
-
-/* End: bn_mp_toom_sqr.c */
-
-/* Start: bn_mp_unsigned_bin_size.c */
-#line 0 "bn_mp_unsigned_bin_size.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* get the size for an unsigned equivalent */
-int
-mp_unsigned_bin_size (mp_int * a)
-{
-  int     size = mp_count_bits (a);
-  return (size / 8 + ((size & 7) != 0 ? 1 : 0));
-}
-
-/* End: bn_mp_unsigned_bin_size.c */
-
-/* Start: bn_mp_xor.c */
-#line 0 "bn_mp_xor.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* XOR two ints together */
-int
-mp_xor (mp_int * a, mp_int * b, mp_int * c)
-{
-  int     res, ix, px;
-  mp_int  t, *x;
-
-  if (a->used > b->used) {
-    if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-      return res;
-    }
-    px = b->used;
-    x = b;
-  } else {
-    if ((res = mp_init_copy (&t, b)) != MP_OKAY) {
-      return res;
-    }
-    px = a->used;
-    x = a;
-  }
-
-  for (ix = 0; ix < px; ix++) {
-    t.dp[ix] ^= x->dp[ix];
-  }
-  mp_clamp (&t);
-  mp_exch (c, &t);
-  mp_clear (&t);
-  return MP_OKAY;
-}
-
-/* End: bn_mp_xor.c */
-
-/* Start: bn_mp_zero.c */
-#line 0 "bn_mp_zero.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* set to zero */
-void
-mp_zero (mp_int * a)
-{
-  a->sign = MP_ZPOS;
-  a->used = 0;
-  memset (a->dp, 0, sizeof (mp_digit) * a->alloc);
-}
-
-/* End: bn_mp_zero.c */
-
-/* Start: bn_prime_tab.c */
-#line 0 "bn_prime_tab.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-const mp_digit __prime_tab[] = {
-  0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
-  0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
-  0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
-  0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F,
-#ifndef MP_8BIT
-  0x0083,
-  0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
-  0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
-  0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
-  0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
-
-  0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
-  0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
-  0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
-  0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
-  0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
-  0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
-  0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
-  0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
-
-  0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
-  0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
-  0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
-  0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
-  0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
-  0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
-  0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
-  0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
-
-  0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
-  0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
-  0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
-  0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
-  0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
-  0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
-  0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
-  0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
-#endif
-};
-
-/* End: bn_prime_tab.c */
-
-/* Start: bn_radix.c */
-#line 0 "bn_radix.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* chars used in radix conversions */
-static const char *s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
-
-/* read a string [ASCII] in a given radix */
-int
-mp_read_radix (mp_int * a, char *str, int radix)
-{
-  int     y, res, neg;
-  char    ch;
-
-  if (radix < 2 || radix > 64) {
-    return MP_VAL;
-  }
-
-  if (*str == '-') {
-    ++str;
-    neg = MP_NEG;
-  } else {
-    neg = MP_ZPOS;
-  }
-
-  mp_zero (a);
-  while (*str) {
-    ch = (char) ((radix < 36) ? toupper (*str) : *str);
-    for (y = 0; y < 64; y++) {
-      if (ch == s_rmap[y]) {
-    break;
-      }
-    }
-
-    if (y < radix) {
-      if ((res = mp_mul_d (a, (mp_digit) radix, a)) != MP_OKAY) {
-    return res;
-      }
-      if ((res = mp_add_d (a, (mp_digit) y, a)) != MP_OKAY) {
-    return res;
-      }
-    } else {
-      break;
-    }
-    ++str;
-  }
-  a->sign = neg;
-  return MP_OKAY;
-}
-
-/* stores a bignum as a ASCII string in a given radix (2..64) */
-int
-mp_toradix (mp_int * a, char *str, int radix)
-{
-  int     res, digs;
-  mp_int  t;
-  mp_digit d;
-  char   *_s = str;
-
-  if (radix < 2 || radix > 64) {
-    return MP_VAL;
-  }
-
-  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-    return res;
-  }
-
-  if (t.sign == MP_NEG) {
-    ++_s;
-    *str++ = '-';
-    t.sign = MP_ZPOS;
-  }
-
-  digs = 0;
-  while (mp_iszero (&t) == 0) {
-    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
-      mp_clear (&t);
-      return res;
-    }
-    *str++ = s_rmap[d];
-    ++digs;
-  }
-  bn_reverse ((unsigned char *)_s, digs);
-  *str++ = '\0';
-  mp_clear (&t);
-  return MP_OKAY;
-}
-
-/* returns size of ASCII reprensentation */
-int
-mp_radix_size (mp_int * a, int radix)
-{
-  int     res, digs;
-  mp_int  t;
-  mp_digit d;
-
-  /* special case for binary */
-  if (radix == 2) {
-    return mp_count_bits (a) + (a->sign == MP_NEG ? 1 : 0) + 1;
-  }
-
-  if (radix < 2 || radix > 64) {
-    return 0;
-  }
-
-  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-    return 0;
-  }
-
-  digs = 0;
-  if (t.sign == MP_NEG) {
-    ++digs;
-    t.sign = MP_ZPOS;
-  }
-
-  while (mp_iszero (&t) == 0) {
-    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
-      mp_clear (&t);
-      return 0;
-    }
-    ++digs;
-  }
-  mp_clear (&t);
-  return digs + 1;
-}
-
-/* read a bigint from a file stream in ASCII */
-int mp_fread(mp_int *a, int radix, FILE *stream)
-{
-   int err, ch, neg, y;
-   
-   /* clear a */
-   mp_zero(a);
-   
-   /* if first digit is - then set negative */
-   ch = fgetc(stream);
-   if (ch == '-') {
-      neg = MP_NEG;
-      ch = fgetc(stream);
-   } else {
-      neg = MP_ZPOS;
-   }
-   
-   for (;;) {
-      /* find y in the radix map */
-      for (y = 0; y < radix; y++) {
-          if (s_rmap[y] == ch) {
-             break;
-          }
-      }
-      if (y == radix) {
-         break;
-      }
-      
-      /* shift up and add */
-      if ((err = mp_mul_d(a, radix, a)) != MP_OKAY) {
-         return err;
-      }
-      if ((err = mp_add_d(a, y, a)) != MP_OKAY) {
-         return err;
-      }
-      
-      ch = fgetc(stream);
-   }
-   if (mp_cmp_d(a, 0) != MP_EQ) {
-      a->sign = neg;
-   }
-   
-   return MP_OKAY;
-}
-
-int mp_fwrite(mp_int *a, int radix, FILE *stream)
-{
-   char *buf;
-   int err, len, x;
-   
-   len = mp_radix_size(a, radix);
-   if (len == 0) {
-      return MP_VAL;
-   }
-   
-   buf = malloc(len);
-   if (buf == NULL) {
-      return MP_MEM;
-   }
-   
-   if ((err = mp_toradix(a, buf, radix)) != MP_OKAY) {
-      free(buf);
-      return err;
-   }
-   
-   for (x = 0; x < len; x++) {
-       if (fputc(buf[x], stream) == EOF) {
-          free(buf);
-          return MP_VAL;
-       }
-   }
-   
-   free(buf);
-   return MP_OKAY;
-}
-
-
-/* End: bn_radix.c */
-
-/* Start: bn_reverse.c */
-#line 0 "bn_reverse.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* reverse an array, used for radix code */
-void
-bn_reverse (unsigned char *s, int len)
-{
-  int     ix, iy;
-  unsigned char t;
-
-  ix = 0;
-  iy = len - 1;
-  while (ix < iy) {
-    t     = s[ix];
-    s[ix] = s[iy];
-    s[iy] = t;
-    ++ix;
-    --iy;
-  }
-}
-
-/* End: bn_reverse.c */
-
-/* Start: bn_s_mp_add.c */
-#line 0 "bn_s_mp_add.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* low level addition, based on HAC pp.594, Algorithm 14.7 */
-int
-s_mp_add (mp_int * a, mp_int * b, mp_int * c)
-{
-  mp_int *x;
-  int     olduse, res, min, max;
-
-  /* find sizes, we let |a| <= |b| which means we have to sort
-   * them.  "x" will point to the input with the most digits
-   */
-  if (a->used > b->used) {
-    min = b->used;
-    max = a->used;
-    x = a;
-  } else {
-    min = a->used;
-    max = b->used;
-    x = b;
-  }
-
-  /* init result */
-  if (c->alloc < max + 1) {
-    if ((res = mp_grow (c, max + 1)) != MP_OKAY) {
-      return res;
-    }
-  }
-
-  /* get old used digit count and set new one */
-  olduse = c->used;
-  c->used = max + 1;
-
-  {
-    register mp_digit u, *tmpa, *tmpb, *tmpc;
-    register int i;
-
-    /* alias for digit pointers */
-
-    /* first input */
-    tmpa = a->dp;
-
-    /* second input */
-    tmpb = b->dp;
-
-    /* destination */
-    tmpc = c->dp;
-
-    /* zero the carry */
-    u = 0;
-    for (i = 0; i < min; i++) {
-      /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
-      *tmpc = *tmpa++ + *tmpb++ + u;
-
-      /* U = carry bit of T[i] */
-      u = *tmpc >> ((mp_digit)DIGIT_BIT);
-
-      /* take away carry bit from T[i] */
-      *tmpc++ &= MP_MASK;
-    }
-
-    /* now copy higher words if any, that is in A+B 
-     * if A or B has more digits add those in 
-     */
-    if (min != max) {
-      for (; i < max; i++) {
-        /* T[i] = X[i] + U */
-        *tmpc = x->dp[i] + u;
-
-        /* U = carry bit of T[i] */
-        u = *tmpc >> ((mp_digit)DIGIT_BIT);
-
-        /* take away carry bit from T[i] */
-        *tmpc++ &= MP_MASK;
-      }
-    }
-
-    /* add carry */
-    *tmpc++ = u;
-
-    /* clear digits above oldused */
-    for (i = c->used; i < olduse; i++) {
-      *tmpc++ = 0;
-    }
-  }
-
-  mp_clamp (c);
-  return MP_OKAY;
-}
-
-/* End: bn_s_mp_add.c */
-
-/* Start: bn_s_mp_exptmod.c */
-#line 0 "bn_s_mp_exptmod.c"
+
+/* End: bn_mp_toom_sqr.c */
+
+/* Start: bn_mp_unsigned_bin_size.c */
+#line 0 "bn_mp_unsigned_bin_size.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* get the size for an unsigned equivalent */
+int
+mp_unsigned_bin_size (mp_int * a)
+{
+  int     size = mp_count_bits (a);
+  return (size / 8 + ((size & 7) != 0 ? 1 : 0));
+}
+
+/* End: bn_mp_unsigned_bin_size.c */
+
+/* Start: bn_mp_xor.c */
+#line 0 "bn_mp_xor.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* XOR two ints together */
+int
+mp_xor (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     res, ix, px;
+  mp_int  t, *x;
+
+  if (a->used > b->used) {
+    if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+      return res;
+    }
+    px = b->used;
+    x = b;
+  } else {
+    if ((res = mp_init_copy (&t, b)) != MP_OKAY) {
+      return res;
+    }
+    px = a->used;
+    x = a;
+  }
+
+  for (ix = 0; ix < px; ix++) {
+    t.dp[ix] ^= x->dp[ix];
+  }
+  mp_clamp (&t);
+  mp_exch (c, &t);
+  mp_clear (&t);
+  return MP_OKAY;
+}
+
+/* End: bn_mp_xor.c */
+
+/* Start: bn_mp_zero.c */
+#line 0 "bn_mp_zero.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* set to zero */
+void
+mp_zero (mp_int * a)
+{
+  a->sign = MP_ZPOS;
+  a->used = 0;
+  memset (a->dp, 0, sizeof (mp_digit) * a->alloc);
+}
+
+/* End: bn_mp_zero.c */
+
+/* Start: bn_prime_tab.c */
+#line 0 "bn_prime_tab.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+const mp_digit __prime_tab[] = {
+  0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
+  0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
+  0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
+  0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F,
+#ifndef MP_8BIT
+  0x0083,
+  0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
+  0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
+  0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
+  0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
+
+  0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
+  0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
+  0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
+  0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
+  0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
+  0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
+  0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
+  0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
+
+  0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
+  0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
+  0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
+  0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
+  0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
+  0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
+  0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
+  0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
+
+  0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
+  0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
+  0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
+  0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
+  0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
+  0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
+  0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
+  0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
+#endif
+};
+
+/* End: bn_prime_tab.c */
+
+/* Start: bn_radix.c */
+#line 0 "bn_radix.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* chars used in radix conversions */
+static const char *s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
+
+/* read a string [ASCII] in a given radix */
+int
+mp_read_radix (mp_int * a, char *str, int radix)
+{
+  int     y, res, neg;
+  char    ch;
+
+  if (radix < 2 || radix > 64) {
+    return MP_VAL;
+  }
+
+  if (*str == '-') {
+    ++str;
+    neg = MP_NEG;
+  } else {
+    neg = MP_ZPOS;
+  }
+
+  mp_zero (a);
+  while (*str) {
+    ch = (char) ((radix < 36) ? toupper (*str) : *str);
+    for (y = 0; y < 64; y++) {
+      if (ch == s_rmap[y]) {
+    break;
+      }
+    }
+
+    if (y < radix) {
+      if ((res = mp_mul_d (a, (mp_digit) radix, a)) != MP_OKAY) {
+    return res;
+      }
+      if ((res = mp_add_d (a, (mp_digit) y, a)) != MP_OKAY) {
+    return res;
+      }
+    } else {
+      break;
+    }
+    ++str;
+  }
+  a->sign = neg;
+  return MP_OKAY;
+}
+
+/* stores a bignum as a ASCII string in a given radix (2..64) */
+int
+mp_toradix (mp_int * a, char *str, int radix)
+{
+  int     res, digs;
+  mp_int  t;
+  mp_digit d;
+  char   *_s = str;
+
+  if (radix < 2 || radix > 64) {
+    return MP_VAL;
+  }
+  
+  /* quick out if its zero */
+  if (mp_iszero(a) == 1) {
+     *str++ = '0';
+     *str = '\0';
+     return MP_OKAY;
+  }
+  
+
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return res;
+  }
+
+  if (t.sign == MP_NEG) {
+    ++_s;
+    *str++ = '-';
+    t.sign = MP_ZPOS;
+  }
+
+  digs = 0;
+  while (mp_iszero (&t) == 0) {
+    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
+      mp_clear (&t);
+      return res;
+    }
+    *str++ = s_rmap[d];
+    ++digs;
+  }
+  bn_reverse ((unsigned char *)_s, digs);
+  *str++ = '\0';
+  mp_clear (&t);
+  return MP_OKAY;
+}
+
+/* returns size of ASCII reprensentation */
+int
+mp_radix_size (mp_int * a, int radix)
+{
+  int     res, digs;
+  mp_int  t;
+  mp_digit d;
+
+  /* special case for binary */
+  if (radix == 2) {
+    return mp_count_bits (a) + (a->sign == MP_NEG ? 1 : 0) + 1;
+  }
+
+  if (radix < 2 || radix > 64) {
+    return 0;
+  }
+
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return 0;
+  }
+
+  digs = 0;
+  if (t.sign == MP_NEG) {
+    ++digs;
+    t.sign = MP_ZPOS;
+  }
+
+  while (mp_iszero (&t) == 0) {
+    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
+      mp_clear (&t);
+      return 0;
+    }
+    ++digs;
+  }
+  mp_clear (&t);
+  return digs + 1;
+}
+
+/* read a bigint from a file stream in ASCII */
+int mp_fread(mp_int *a, int radix, FILE *stream)
+{
+   int err, ch, neg, y;
+   
+   /* clear a */
+   mp_zero(a);
+   
+   /* if first digit is - then set negative */
+   ch = fgetc(stream);
+   if (ch == '-') {
+      neg = MP_NEG;
+      ch = fgetc(stream);
+   } else {
+      neg = MP_ZPOS;
+   }
+   
+   for (;;) {
+      /* find y in the radix map */
+      for (y = 0; y < radix; y++) {
+          if (s_rmap[y] == ch) {
+             break;
+          }
+      }
+      if (y == radix) {
+         break;
+      }
+      
+      /* shift up and add */
+      if ((err = mp_mul_d(a, radix, a)) != MP_OKAY) {
+         return err;
+      }
+      if ((err = mp_add_d(a, y, a)) != MP_OKAY) {
+         return err;
+      }
+      
+      ch = fgetc(stream);
+   }
+   if (mp_cmp_d(a, 0) != MP_EQ) {
+      a->sign = neg;
+   }
+   
+   return MP_OKAY;
+}
+
+int mp_fwrite(mp_int *a, int radix, FILE *stream)
+{
+   char *buf;
+   int err, len, x;
+   
+   len = mp_radix_size(a, radix);
+   if (len == 0) {
+      return MP_VAL;
+   }
+   
+   buf = malloc(len);
+   if (buf == NULL) {
+      return MP_MEM;
+   }
+   
+   if ((err = mp_toradix(a, buf, radix)) != MP_OKAY) {
+      free(buf);
+      return err;
+   }
+   
+   for (x = 0; x < len; x++) {
+       if (fputc(buf[x], stream) == EOF) {
+          free(buf);
+          return MP_VAL;
+       }
+   }
+   
+   free(buf);
+   return MP_OKAY;
+}
+
+
+/* End: bn_radix.c */
+
+/* Start: bn_reverse.c */
+#line 0 "bn_reverse.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* reverse an array, used for radix code */
+void
+bn_reverse (unsigned char *s, int len)
+{
+  int     ix, iy;
+  unsigned char t;
+
+  ix = 0;
+  iy = len - 1;
+  while (ix < iy) {
+    t     = s[ix];
+    s[ix] = s[iy];
+    s[iy] = t;
+    ++ix;
+    --iy;
+  }
+}
+
+/* End: bn_reverse.c */
+
+/* Start: bn_s_mp_add.c */
+#line 0 "bn_s_mp_add.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* low level addition, based on HAC pp.594, Algorithm 14.7 */
+int
+s_mp_add (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int *x;
+  int     olduse, res, min, max;
+
+  /* find sizes, we let |a| <= |b| which means we have to sort
+   * them.  "x" will point to the input with the most digits
+   */
+  if (a->used > b->used) {
+    min = b->used;
+    max = a->used;
+    x = a;
+  } else {
+    min = a->used;
+    max = b->used;
+    x = b;
+  }
+
+  /* init result */
+  if (c->alloc < max + 1) {
+    if ((res = mp_grow (c, max + 1)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* get old used digit count and set new one */
+  olduse = c->used;
+  c->used = max + 1;
+
+  {
+    register mp_digit u, *tmpa, *tmpb, *tmpc;
+    register int i;
+
+    /* alias for digit pointers */
+
+    /* first input */
+    tmpa = a->dp;
+
+    /* second input */
+    tmpb = b->dp;
+
+    /* destination */
+    tmpc = c->dp;
+
+    /* zero the carry */
+    u = 0;
+    for (i = 0; i < min; i++) {
+      /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
+      *tmpc = *tmpa++ + *tmpb++ + u;
+
+      /* U = carry bit of T[i] */
+      u = *tmpc >> ((mp_digit)DIGIT_BIT);
+
+      /* take away carry bit from T[i] */
+      *tmpc++ &= MP_MASK;
+    }
+
+    /* now copy higher words if any, that is in A+B 
+     * if A or B has more digits add those in 
+     */
+    if (min != max) {
+      for (; i < max; i++) {
+        /* T[i] = X[i] + U */
+        *tmpc = x->dp[i] + u;
+
+        /* U = carry bit of T[i] */
+        u = *tmpc >> ((mp_digit)DIGIT_BIT);
+
+        /* take away carry bit from T[i] */
+        *tmpc++ &= MP_MASK;
+      }
+    }
+
+    /* add carry */
+    *tmpc++ = u;
+
+    /* clear digits above oldused */
+    for (i = c->used; i < olduse; i++) {
+      *tmpc++ = 0;
+    }
+  }
+
+  mp_clamp (c);
+  return MP_OKAY;
+}
+
+/* End: bn_s_mp_add.c */
+
+/* Start: bn_s_mp_exptmod.c */
+#line 0 "bn_s_mp_exptmod.c"
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is library that provides for multiple-precision
@@ -6625,21 +6643,26 @@ s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
 
   /* create M table
    *
-   * The M table contains powers of the input base, e.g. M[x] = G**x mod P
+   * The M table contains powers of the base, 
+   * e.g. M[x] = G**x mod P
    *
-   * The first half of the table is not computed though accept for M[0] and M[1]
+   * The first half of the table is not 
+   * computed though accept for M[0] and M[1]
    */
   if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
     goto __MU;
   }
 
-  /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
+  /* compute the value at M[1<<(winsize-1)] by squaring 
+   * M[1] (winsize-1) times 
+   */
   if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
     goto __MU;
   }
 
   for (x = 0; x < (winsize - 1); x++) {
-    if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
+    if ((err = mp_sqr (&M[1 << (winsize - 1)], 
+                       &M[1 << (winsize - 1)])) != MP_OKAY) {
       goto __MU;
     }
     if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
@@ -6769,381 +6792,381 @@ __M:
   }
   return err;
 }
-
-/* End: bn_s_mp_exptmod.c */
-
-/* Start: bn_s_mp_mul_digs.c */
-#line 0 "bn_s_mp_mul_digs.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* multiplies |a| * |b| and only computes upto digs digits of result
- * HAC pp. 595, Algorithm 14.12  Modified so you can control how 
- * many digits of output are created.
- */
-int
-s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
-{
-  mp_int  t;
-  int     res, pa, pb, ix, iy;
-  mp_digit u;
-  mp_word r;
-  mp_digit tmpx, *tmpt, *tmpy;
-
-  /* can we use the fast multiplier? */
-  if (((digs) < MP_WARRAY) &&
-      MIN (a->used, b->used) < 
-          (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
-    return fast_s_mp_mul_digs (a, b, c, digs);
-  }
-
-  if ((res = mp_init_size (&t, digs)) != MP_OKAY) {
-    return res;
-  }
-  t.used = digs;
-
-  /* compute the digits of the product directly */
-  pa = a->used;
-  for (ix = 0; ix < pa; ix++) {
-    /* set the carry to zero */
-    u = 0;
-
-    /* limit ourselves to making digs digits of output */
-    pb = MIN (b->used, digs - ix);
-
-    /* setup some aliases */
-    /* copy of the digit from a used within the nested loop */
-    tmpx = a->dp[ix];
-    
-    /* an alias for the destination shifted ix places */
-    tmpt = t.dp + ix;
-    
-    /* an alias for the digits of b */
-    tmpy = b->dp;
-
-    /* compute the columns of the output and propagate the carry */
-    for (iy = 0; iy < pb; iy++) {
-      /* compute the column as a mp_word */
-      r = ((mp_word) *tmpt) + 
-          ((mp_word) tmpx) * ((mp_word) * tmpy++) + 
-          ((mp_word) u);
-
-      /* the new column is the lower part of the result */
-      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
-
-      /* get the carry word from the result */
-      u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
-    }
-    /* set carry if it is placed below digs */
-    if (ix + iy < digs) {
-      *tmpt = u;
-    }
-  }
-
-  mp_clamp (&t);
-  mp_exch (&t, c);
-
-  mp_clear (&t);
-  return MP_OKAY;
-}
-
-/* End: bn_s_mp_mul_digs.c */
-
-/* Start: bn_s_mp_mul_high_digs.c */
-#line 0 "bn_s_mp_mul_high_digs.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* multiplies |a| * |b| and does not compute the lower digs digits
- * [meant to get the higher part of the product]
- */
-int
-s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
-{
-  mp_int  t;
-  int     res, pa, pb, ix, iy;
-  mp_digit u;
-  mp_word r;
-  mp_digit tmpx, *tmpt, *tmpy;
-
-
-  /* can we use the fast multiplier? */
-  if (((a->used + b->used + 1) < MP_WARRAY)
-      && MIN (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
-    return fast_s_mp_mul_high_digs (a, b, c, digs);
-  }
-
-  if ((res = mp_init_size (&t, a->used + b->used + 1)) != MP_OKAY) {
-    return res;
-  }
-  t.used = a->used + b->used + 1;
-
-  pa = a->used;
-  pb = b->used;
-  for (ix = 0; ix < pa; ix++) {
-    /* clear the carry */
-    u = 0;
-
-    /* left hand side of A[ix] * B[iy] */
-    tmpx = a->dp[ix];
-
-    /* alias to the address of where the digits will be stored */
-    tmpt = &(t.dp[digs]);
-
-    /* alias for where to read the right hand side from */
-    tmpy = b->dp + (digs - ix);
-
-    for (iy = digs - ix; iy < pb; iy++) {
-      /* calculate the double precision result */
-      r = ((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) + ((mp_word) u);
-
-      /* get the lower part */
-      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
-
-      /* carry the carry */
-      u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
-    }
-    *tmpt = u;
-  }
-  mp_clamp (&t);
-  mp_exch (&t, c);
-  mp_clear (&t);
-  return MP_OKAY;
-}
-
-/* End: bn_s_mp_mul_high_digs.c */
-
-/* Start: bn_s_mp_sqr.c */
-#line 0 "bn_s_mp_sqr.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
-int
-s_mp_sqr (mp_int * a, mp_int * b)
-{
-  mp_int  t;
-  int     res, ix, iy, pa;
-  mp_word r;
-  mp_digit u, tmpx, *tmpt;
-
-  pa = a->used;
-  if ((res = mp_init_size (&t, pa + pa + 1)) != MP_OKAY) {
-    return res;
-  }
-  t.used = pa + pa + 1;
-
-  for (ix = 0; ix < pa; ix++) {
-    /* first calculate the digit at 2*ix */
-    /* calculate double precision result */
-    r = ((mp_word) t.dp[ix + ix]) + 
-        ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
-
-    /* store lower part in result */
-    t.dp[ix + ix] = (mp_digit) (r & ((mp_word) MP_MASK));
-
-    /* get the carry */
-    u = (r >> ((mp_word) DIGIT_BIT));
-
-    /* left hand side of A[ix] * A[iy] */
-    tmpx = a->dp[ix];
-
-    /* alias for where to store the results */
-    tmpt = t.dp + (ix + ix + 1);
-    
-    for (iy = ix + 1; iy < pa; iy++) {
-      /* first calculate the product */
-      r = ((mp_word) tmpx) * ((mp_word) a->dp[iy]);
-
-      /* now calculate the double precision result, note we use
-       * addition instead of *2 since its easier to optimize
-       */
-      r = ((mp_word) * tmpt) + r + r + ((mp_word) u);
-
-      /* store lower part */
-      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
-
-      /* get carry */
-      u = (r >> ((mp_word) DIGIT_BIT));
-    }
-    /* propagate upwards */
-    while (u != ((mp_digit) 0)) {
-      r = ((mp_word) * tmpt) + ((mp_word) u);
-      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
-      u = (r >> ((mp_word) DIGIT_BIT));
-    }
-  }
-
-  mp_clamp (&t);
-  mp_exch (&t, b);
-  mp_clear (&t);
-  return MP_OKAY;
-}
-
-/* End: bn_s_mp_sqr.c */
-
-/* Start: bn_s_mp_sub.c */
-#line 0 "bn_s_mp_sub.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */
-int
-s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
-{
-  int     olduse, res, min, max;
-
-  /* find sizes */
-  min = b->used;
-  max = a->used;
-
-  /* init result */
-  if (c->alloc < max) {
-    if ((res = mp_grow (c, max)) != MP_OKAY) {
-      return res;
-    }
-  }
-  olduse = c->used;
-  c->used = max;
-
-  {
-    register mp_digit u, *tmpa, *tmpb, *tmpc;
-    register int i;
-
-    /* alias for digit pointers */
-    tmpa = a->dp;
-    tmpb = b->dp;
-    tmpc = c->dp;
-
-    /* set carry to zero */
-    u = 0;
-    for (i = 0; i < min; i++) {
-      /* T[i] = A[i] - B[i] - U */
-      *tmpc = *tmpa++ - *tmpb++ - u;
-
-      /* U = carry bit of T[i]
-       * Note this saves performing an AND operation since
-       * if a carry does occur it will propagate all the way to the
-       * MSB.  As a result a single shift is enough to get the carry
-       */
-      u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
-
-      /* Clear carry from T[i] */
-      *tmpc++ &= MP_MASK;
-    }
-
-    /* now copy higher words if any, e.g. if A has more digits than B  */
-    for (; i < max; i++) {
-      /* T[i] = A[i] - U */
-      *tmpc = *tmpa++ - u;
-
-      /* U = carry bit of T[i] */
-      u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
-
-      /* Clear carry from T[i] */
-      *tmpc++ &= MP_MASK;
-    }
-
-    /* clear digits above used (since we may not have grown result above) */
-    for (i = c->used; i < olduse; i++) {
-      *tmpc++ = 0;
-    }
-  }
-
-  mp_clamp (c);
-  return MP_OKAY;
-}
-
-
-/* End: bn_s_mp_sub.c */
-
-/* Start: bncore.c */
-#line 0 "bncore.c"
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* Known optimal configurations
-
- CPU                    /Compiler     /MUL CUTOFF/SQR CUTOFF
--------------------------------------------------------------
- Intel P4               /GCC v3.2     /        70/       108
- AMD Athlon XP          /GCC v3.2     /       109/       127
-
-*/
-
-/* configured for a AMD XP Thoroughbred core with etc/tune.c */
-int     KARATSUBA_MUL_CUTOFF = 109,      /* Min. number of digits before Karatsuba multiplication is used. */
-        KARATSUBA_SQR_CUTOFF = 127,      /* Min. number of digits before Karatsuba squaring is used. */
-        
-        TOOM_MUL_CUTOFF      = 350,      /* no optimal values of these are known yet so set em high */
-        TOOM_SQR_CUTOFF      = 400; 
-
-/* End: bncore.c */
-
-
-/* EOF */
+
+/* End: bn_s_mp_exptmod.c */
+
+/* Start: bn_s_mp_mul_digs.c */
+#line 0 "bn_s_mp_mul_digs.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* multiplies |a| * |b| and only computes upto digs digits of result
+ * HAC pp. 595, Algorithm 14.12  Modified so you can control how 
+ * many digits of output are created.
+ */
+int
+s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+{
+  mp_int  t;
+  int     res, pa, pb, ix, iy;
+  mp_digit u;
+  mp_word r;
+  mp_digit tmpx, *tmpt, *tmpy;
+
+  /* can we use the fast multiplier? */
+  if (((digs) < MP_WARRAY) &&
+      MIN (a->used, b->used) < 
+          (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+    return fast_s_mp_mul_digs (a, b, c, digs);
+  }
+
+  if ((res = mp_init_size (&t, digs)) != MP_OKAY) {
+    return res;
+  }
+  t.used = digs;
+
+  /* compute the digits of the product directly */
+  pa = a->used;
+  for (ix = 0; ix < pa; ix++) {
+    /* set the carry to zero */
+    u = 0;
+
+    /* limit ourselves to making digs digits of output */
+    pb = MIN (b->used, digs - ix);
+
+    /* setup some aliases */
+    /* copy of the digit from a used within the nested loop */
+    tmpx = a->dp[ix];
+    
+    /* an alias for the destination shifted ix places */
+    tmpt = t.dp + ix;
+    
+    /* an alias for the digits of b */
+    tmpy = b->dp;
+
+    /* compute the columns of the output and propagate the carry */
+    for (iy = 0; iy < pb; iy++) {
+      /* compute the column as a mp_word */
+      r = ((mp_word) *tmpt) + 
+          ((mp_word) tmpx) * ((mp_word) * tmpy++) + 
+          ((mp_word) u);
+
+      /* the new column is the lower part of the result */
+      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+
+      /* get the carry word from the result */
+      u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+    }
+    /* set carry if it is placed below digs */
+    if (ix + iy < digs) {
+      *tmpt = u;
+    }
+  }
+
+  mp_clamp (&t);
+  mp_exch (&t, c);
+
+  mp_clear (&t);
+  return MP_OKAY;
+}
+
+/* End: bn_s_mp_mul_digs.c */
+
+/* Start: bn_s_mp_mul_high_digs.c */
+#line 0 "bn_s_mp_mul_high_digs.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* multiplies |a| * |b| and does not compute the lower digs digits
+ * [meant to get the higher part of the product]
+ */
+int
+s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+{
+  mp_int  t;
+  int     res, pa, pb, ix, iy;
+  mp_digit u;
+  mp_word r;
+  mp_digit tmpx, *tmpt, *tmpy;
+
+
+  /* can we use the fast multiplier? */
+  if (((a->used + b->used + 1) < MP_WARRAY)
+      && MIN (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+    return fast_s_mp_mul_high_digs (a, b, c, digs);
+  }
+
+  if ((res = mp_init_size (&t, a->used + b->used + 1)) != MP_OKAY) {
+    return res;
+  }
+  t.used = a->used + b->used + 1;
+
+  pa = a->used;
+  pb = b->used;
+  for (ix = 0; ix < pa; ix++) {
+    /* clear the carry */
+    u = 0;
+
+    /* left hand side of A[ix] * B[iy] */
+    tmpx = a->dp[ix];
+
+    /* alias to the address of where the digits will be stored */
+    tmpt = &(t.dp[digs]);
+
+    /* alias for where to read the right hand side from */
+    tmpy = b->dp + (digs - ix);
+
+    for (iy = digs - ix; iy < pb; iy++) {
+      /* calculate the double precision result */
+      r = ((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) + ((mp_word) u);
+
+      /* get the lower part */
+      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+
+      /* carry the carry */
+      u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+    }
+    *tmpt = u;
+  }
+  mp_clamp (&t);
+  mp_exch (&t, c);
+  mp_clear (&t);
+  return MP_OKAY;
+}
+
+/* End: bn_s_mp_mul_high_digs.c */
+
+/* Start: bn_s_mp_sqr.c */
+#line 0 "bn_s_mp_sqr.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
+int
+s_mp_sqr (mp_int * a, mp_int * b)
+{
+  mp_int  t;
+  int     res, ix, iy, pa;
+  mp_word r;
+  mp_digit u, tmpx, *tmpt;
+
+  pa = a->used;
+  if ((res = mp_init_size (&t, 2*pa + 1)) != MP_OKAY) {
+    return res;
+  }
+  t.used = 2*pa + 1;
+
+  for (ix = 0; ix < pa; ix++) {
+    /* first calculate the digit at 2*ix */
+    /* calculate double precision result */
+    r = ((mp_word) t.dp[2*ix]) + 
+        ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
+
+    /* store lower part in result */
+    t.dp[2*ix] = (mp_digit) (r & ((mp_word) MP_MASK));
+
+    /* get the carry */
+    u = (r >> ((mp_word) DIGIT_BIT));
+
+    /* left hand side of A[ix] * A[iy] */
+    tmpx = a->dp[ix];
+
+    /* alias for where to store the results */
+    tmpt = t.dp + (2*ix + 1);
+    
+    for (iy = ix + 1; iy < pa; iy++) {
+      /* first calculate the product */
+      r = ((mp_word) tmpx) * ((mp_word) a->dp[iy]);
+
+      /* now calculate the double precision result, note we use
+       * addition instead of *2 since it's easier to optimize
+       */
+      r = ((mp_word) * tmpt) + r + r + ((mp_word) u);
+
+      /* store lower part */
+      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+
+      /* get carry */
+      u = (r >> ((mp_word) DIGIT_BIT));
+    }
+    /* propagate upwards */
+    while (u != ((mp_digit) 0)) {
+      r = ((mp_word) * tmpt) + ((mp_word) u);
+      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+      u = (r >> ((mp_word) DIGIT_BIT));
+    }
+  }
+
+  mp_clamp (&t);
+  mp_exch (&t, b);
+  mp_clear (&t);
+  return MP_OKAY;
+}
+
+/* End: bn_s_mp_sqr.c */
+
+/* Start: bn_s_mp_sub.c */
+#line 0 "bn_s_mp_sub.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */
+int
+s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     olduse, res, min, max;
+
+  /* find sizes */
+  min = b->used;
+  max = a->used;
+
+  /* init result */
+  if (c->alloc < max) {
+    if ((res = mp_grow (c, max)) != MP_OKAY) {
+      return res;
+    }
+  }
+  olduse = c->used;
+  c->used = max;
+
+  {
+    register mp_digit u, *tmpa, *tmpb, *tmpc;
+    register int i;
+
+    /* alias for digit pointers */
+    tmpa = a->dp;
+    tmpb = b->dp;
+    tmpc = c->dp;
+
+    /* set carry to zero */
+    u = 0;
+    for (i = 0; i < min; i++) {
+      /* T[i] = A[i] - B[i] - U */
+      *tmpc = *tmpa++ - *tmpb++ - u;
+
+      /* U = carry bit of T[i]
+       * Note this saves performing an AND operation since
+       * if a carry does occur it will propagate all the way to the
+       * MSB.  As a result a single shift is enough to get the carry
+       */
+      u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
+
+      /* Clear carry from T[i] */
+      *tmpc++ &= MP_MASK;
+    }
+
+    /* now copy higher words if any, e.g. if A has more digits than B  */
+    for (; i < max; i++) {
+      /* T[i] = A[i] - U */
+      *tmpc = *tmpa++ - u;
+
+      /* U = carry bit of T[i] */
+      u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
+
+      /* Clear carry from T[i] */
+      *tmpc++ &= MP_MASK;
+    }
+
+    /* clear digits above used (since we may not have grown result above) */
+    for (i = c->used; i < olduse; i++) {
+      *tmpc++ = 0;
+    }
+  }
+
+  mp_clamp (c);
+  return MP_OKAY;
+}
+
+
+/* End: bn_s_mp_sub.c */
+
+/* Start: bncore.c */
+#line 0 "bncore.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Known optimal configurations
+
+ CPU                    /Compiler     /MUL CUTOFF/SQR CUTOFF
+-------------------------------------------------------------
+ Intel P4               /GCC v3.2     /        70/       108
+ AMD Athlon XP          /GCC v3.2     /       109/       127
+
+*/
+
+/* configured for a AMD XP Thoroughbred core with etc/tune.c */
+int     KARATSUBA_MUL_CUTOFF = 109,      /* Min. number of digits before Karatsuba multiplication is used. */
+        KARATSUBA_SQR_CUTOFF = 127,      /* Min. number of digits before Karatsuba squaring is used. */
+        
+        TOOM_MUL_CUTOFF      = 350,      /* no optimal values of these are known yet so set em high */
+        TOOM_SQR_CUTOFF      = 400; 
+
+/* End: bncore.c */
+
+
+/* EOF */
diff --git a/mycrypt.h b/mycrypt.h
index 3a09a73..68f7144 100644
--- a/mycrypt.h
+++ b/mycrypt.h
@@ -16,8 +16,8 @@ extern "C" {
 #endif
 
 /* version */
-#define CRYPT   0x0084
-#define SCRYPT  "0.84"
+#define CRYPT   0x0085
+#define SCRYPT  "0.85"
 
 /* max size of either a cipher/hash block or symmetric key [largest of the two] */
 #define MAXBLOCKSIZE           128
diff --git a/mycrypt_cipher.h b/mycrypt_cipher.h
index 800b702..f7a3419 100644
--- a/mycrypt_cipher.h
+++ b/mycrypt_cipher.h
@@ -30,15 +30,10 @@ struct saferp_key {
 };
 #endif
 
-#ifdef SERPENT
-struct serpent_key {
-   unsigned long K[132];
-};
-#endif
-
 #ifdef RIJNDAEL
 struct rijndael_key {
-   unsigned long eK[64], dK[64], k_len;
+   unsigned long eK[64], dK[64];
+   int Nr;
 };
 #endif
 
@@ -96,7 +91,7 @@ struct cast5_key {
 
 #ifdef NOEKEON
 struct noekeon_key {
-	unsigned long K[4], dK[4];
+    unsigned long K[4], dK[4];
 };
 #endif
 
@@ -126,9 +121,6 @@ typedef union Symmetric_key {
 #ifdef SAFERP
    struct saferp_key   saferp;
 #endif
-#ifdef SERPENT
-   struct serpent_key  serpent;
-#endif
 #ifdef RIJNDAEL
    struct rijndael_key rijndael;
 #endif
@@ -251,23 +243,14 @@ extern int safer_128_keysize(int *desired_keysize);
 extern const struct _cipher_descriptor safer_k64_desc, safer_k128_desc, safer_sk64_desc, safer_sk128_desc;
 #endif
 
-#ifdef SERPENT
-extern int serpent_setup(const unsigned char *key, int keylen, int num_rounds, symmetric_key *skey);
-extern void serpent_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_key *key);
-extern void serpent_ecb_decrypt(const unsigned char *ct, unsigned char *pt, symmetric_key *key);
-extern int serpent_test(void);
-extern int serpent_keysize(int *desired_keysize);
-extern const struct _cipher_descriptor serpent_desc;
-#endif
-
 #ifdef RIJNDAEL
 
 /* make aes an alias */
-#define aes_setup			rijndael_setup
-#define aes_ecb_encrypt		rijndael_ecb_encrypt
-#define aes_ecb_decrypt		rijndael_ecb_decrypt
-#define aes_test			rijndael_test
-#define aes_keysize			rijndael_keysize
+#define aes_setup           rijndael_setup
+#define aes_ecb_encrypt     rijndael_ecb_encrypt
+#define aes_ecb_decrypt     rijndael_ecb_decrypt
+#define aes_test            rijndael_test
+#define aes_keysize         rijndael_keysize
 
 extern int rijndael_setup(const unsigned char *key, int keylen, int num_rounds, symmetric_key *skey);
 extern void rijndael_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_key *key);
@@ -363,7 +346,7 @@ extern int ctr_start(int cipher, const unsigned char *IV, const unsigned char *k
 extern int ctr_encrypt(const unsigned char *pt, unsigned char *ct, unsigned long len, symmetric_CTR *ctr);
 extern int ctr_decrypt(const unsigned char *ct, unsigned char *pt, unsigned long len, symmetric_CTR *ctr);
 #endif
-	
+    
 extern int find_cipher(const char *name);
 extern int find_cipher_any(const char *name, int blocklen, int keylen);
 extern int find_cipher_id(unsigned char ID);
diff --git a/mycrypt_custom.h b/mycrypt_custom.h
index ad6081f..6e59e92 100644
--- a/mycrypt_custom.h
+++ b/mycrypt_custom.h
@@ -1,16 +1,14 @@
 /* This header is meant to be included before mycrypt.h in projects where
- * you don't want to throw all the defines in a makefile.
+ * you don't want to throw all the defines in a makefile. 
  */
 
 #ifndef MYCRYPT_CUSTOM_H_
 #define MYCRYPT_CUSTOM_H_
 
 #ifdef CRYPT
-        #error mycrypt_custom.h should be included before mycrypt.h
+	#error mycrypt_custom.h should be included before mycrypt.h
 #endif
 
-#define LTC_TEST
-
 #define XMALLOC malloc
 #define XREALLOC realloc
 #define XCALLOC calloc
@@ -18,11 +16,11 @@
 #define XCLOCK clock
 #define XCLOCKS_PER_SEC CLOCKS_PER_SEC
 #define SMALL_CODE
+#define LTC_TEST
 #define BLOWFISH
 #define RC2
 #define RC5
 #define RC6
-#define SERPENT
 #define SAFERP
 #define SAFER
 #define RIJNDAEL
diff --git a/mycrypt_hash.h b/mycrypt_hash.h
index 2b277a7..854b7e9 100644
--- a/mycrypt_hash.h
+++ b/mycrypt_hash.h
@@ -183,4 +183,3 @@ extern int hmac_file(int hash, const char *fname, const unsigned char *key,
                      unsigned long keylen, 
                      unsigned char *dst, unsigned long *dstlen);
 #endif
-
diff --git a/prime.c b/prime.c
index b50baf2..20571d6 100644
--- a/prime.c
+++ b/prime.c
@@ -45,7 +45,7 @@ loop:
            dist += step; x = -1;
         }
     }
-
+    
     /* recalc the total distance from where we started */
     total_dist += dist;
     
diff --git a/serpent.c b/serpent.c
deleted file mode 100644
index 8ea4a9c..0000000
--- a/serpent.c
+++ /dev/null
@@ -1,702 +0,0 @@
-#include "mycrypt.h"
-
-#ifdef SERPENT
-
-const struct _cipher_descriptor serpent_desc =
-{
-    "serpent",
-    5,
-    16, 32, 16, 32,
-    &serpent_setup,
-    &serpent_ecb_encrypt,
-    &serpent_ecb_decrypt,
-    &serpent_test,
-    &serpent_keysize
-};
-
-/* These defines are derived from Brian Gladman's work.  Contact him at gladman@seven77.demon.co.uk 
- *
- * Available on the web at http://fp.gladman.plus.com/cryptography_technology/aes/index.htm
- */
-#define sb0(a,b,c,d,e,f,g,h)    \
-    t1 = a ^ d;     \
-    t2 = a & d;     \
-    t3 = c ^ t1;    \
-    t6 = b & t1;    \
-    t4 = b ^ t3;    \
-    t10 = ~t3;      \
-    h = t2 ^ t4;    \
-    t7 = a ^ t6;    \
-    t14 = ~t7;      \
-    t8 = c | t7;    \
-    t11 = t3 ^ t7;  \
-    g = t4 ^ t8;    \
-    t12 = h & t11;  \
-    f = t10 ^ t12;  \
-    e = t12 ^ t14
-
-/* 15 terms */
-
-#define ib0(a,b,c,d,e,f,g,h)    \
-    t1 = ~a;        \
-    t2 = a ^ b;     \
-    t3 = t1 | t2;   \
-    t4 = d ^ t3;    \
-    t7 = d & t2;    \
-    t5 = c ^ t4;    \
-    t8 = t1 ^ t7;   \
-    g = t2 ^ t5;    \
-    t11 = a & t4;   \
-    t9 = g & t8;    \
-    t14 = t5 ^ t8;  \
-    f = t4 ^ t9;    \
-    t12 = t5 | f;   \
-    h = t11 ^ t12;  \
-    e = h ^ t14
-
-/* 14 terms!  */
-
-#define sb1(a,b,c,d,e,f,g,h)    \
-    t1 = ~a;        \
-    t2 = b ^ t1;    \
-    t3 = a | t2;    \
-    t4 = d | t2;    \
-    t5 = c ^ t3;    \
-    g = d ^ t5;     \
-    t7 = b ^ t4;    \
-    t8 = t2 ^ g;    \
-    t9 = t5 & t7;   \
-    h = t8 ^ t9;    \
-    t11 = t5 ^ t7;  \
-    f = h ^ t11;    \
-    t13 = t8 & t11; \
-    e = t5 ^ t13
-
-/* 17 terms */
-
-#define ib1(a,b,c,d,e,f,g,h)    \
-    t1 = a ^ d;     \
-    t2 = a & b;     \
-    t3 = b ^ c;     \
-    t4 = a ^ t3;    \
-    t5 = b | d;     \
-    t7 = c | t1;    \
-    h = t4 ^ t5;    \
-    t8 = b ^ t7;    \
-    t11 = ~t2;      \
-    t9 = t4 & t8;   \
-    f = t1 ^ t9;    \
-    t13 = t9 ^ t11; \
-    t12 = h & f;    \
-    g = t12 ^ t13;  \
-    t15 = a & d;    \
-    t16 = c ^ t13;  \
-    e = t15 ^ t16
-
-/* 16 terms */
-
-#define sb2(a,b,c,d,e,f,g,h)    \
-    t1 = ~a;        \
-    t2 = b ^ d;     \
-    t3 = c & t1;    \
-    t13 = d | t1;   \
-    e = t2 ^ t3;    \
-    t5 = c ^ t1;    \
-    t6 = c ^ e;     \
-    t7 = b & t6;    \
-    t10 = e | t5;   \
-    h = t5 ^ t7;    \
-    t9 = d | t7;    \
-    t11 = t9 & t10; \
-    t14 = t2 ^ h;   \
-    g = a ^ t11;    \
-    t15 = g ^ t13;  \
-    f = t14 ^ t15
-
-/* 16 terms */
-
-#define ib2(a,b,c,d,e,f,g,h)    \
-    t1 = b ^ d;     \
-    t2 = ~t1;       \
-    t3 = a ^ c;     \
-    t4 = c ^ t1;    \
-    t7 = a | t2;    \
-    t5 = b & t4;    \
-    t8 = d ^ t7;    \
-    t11 = ~t4;      \
-    e = t3 ^ t5;    \
-    t9 = t3 | t8;   \
-    t14 = d & t11;  \
-    h = t1 ^ t9;    \
-    t12 = e | h;    \
-    f = t11 ^ t12;  \
-    t15 = t3 ^ t12; \
-    g = t14 ^ t15
-
-/* 17 terms */
-
-#define sb3(a,b,c,d,e,f,g,h)    \
-    t1 = a ^ c;     \
-    t2 = d ^ t1;    \
-    t3 = a & t2;    \
-    t4 = d ^ t3;    \
-    t5 = b & t4;    \
-    g = t2 ^ t5;    \
-    t7 = a | g;     \
-    t8 = b | d;     \
-    t11 = a | d;    \
-    t9 = t4 & t7;   \
-    f = t8 ^ t9;    \
-    t12 = b ^ t11;  \
-    t13 = g ^ t9;   \
-    t15 = t3 ^ t8;  \
-    h = t12 ^ t13;  \
-    t16 = c & t15;  \
-    e = t12 ^ t16
-
-/* 16 term solution that performs less well than 17 term one
-   in my environment (PPro/PII)                                  
-
-#define sb3(a,b,c,d,e,f,g,h)    \
-    t1 = a ^ b;     \
-    t2 = a & c;     \
-    t3 = a | d;     \
-    t4 = c ^ d;     \
-    t5 = t1 & t3;   \
-    t6 = t2 | t5;   \
-    g = t4 ^ t6;    \
-    t8 = b ^ t3;    \
-    t9 = t6 ^ t8;   \
-    t10 = t4 & t9;  \
-    e = t1 ^ t10;   \
-    t12 = g & e;    \
-    f = t9 ^ t12;   \
-    t14 = b | d;    \
-    t15 = t4 ^ t12; \
-    h = t14 ^ t15
-*/
-
-/* 17 terms */
-
-#define ib3(a,b,c,d,e,f,g,h)    \
-    t1 = b ^ c;     \
-    t2 = b | c;     \
-    t3 = a ^ c;     \
-    t7 = a ^ d;     \
-    t4 = t2 ^ t3;   \
-    t5 = d | t4;    \
-    t9 = t2 ^ t7;   \
-    e = t1 ^ t5;    \
-    t8 = t1 | t5;   \
-    t11 = a & t4;   \
-    g = t8 ^ t9;    \
-    t12 = e | t9;   \
-    f = t11 ^ t12;  \
-    t14 = a & g;    \
-    t15 = t2 ^ t14; \
-    t16 = e & t15;  \
-    h = t4 ^ t16
-
-/* 15 terms */
-
-#define sb4(a,b,c,d,e,f,g,h)    \
-    t1 = a ^ d;     \
-    t2 = d & t1;    \
-    t3 = c ^ t2;    \
-    t4 = b | t3;    \
-    h = t1 ^ t4;    \
-    t6 = ~b;        \
-    t7 = t1 | t6;   \
-    e = t3 ^ t7;    \
-    t9 = a & e;     \
-    t10 = t1 ^ t6;  \
-    t11 = t4 & t10; \
-    g = t9 ^ t11;   \
-    t13 = a ^ t3;   \
-    t14 = t10 & g;  \
-    f = t13 ^ t14
-
-/* 17 terms */
-
-#define ib4(a,b,c,d,e,f,g,h)    \
-    t1 = c ^ d;     \
-    t2 = c | d;     \
-    t3 = b ^ t2;    \
-    t4 = a & t3;    \
-    f = t1 ^ t4;    \
-    t6 = a ^ d;     \
-    t7 = b | d;     \
-    t8 = t6 & t7;   \
-    h = t3 ^ t8;    \
-    t10 = ~a;       \
-    t11 = c ^ h;    \
-    t12 = t10 | t11;\
-    e = t3 ^ t12;   \
-    t14 = c | t4;   \
-    t15 = t7 ^ t14; \
-    t16 = h | t10;  \
-    g = t15 ^ t16
-
-/* 16 terms */
-
-#define sb5(a,b,c,d,e,f,g,h)    \
-    t1 = ~a;        \
-    t2 = a ^ b;     \
-    t3 = a ^ d;     \
-    t4 = c ^ t1;    \
-    t5 = t2 | t3;   \
-    e = t4 ^ t5;    \
-    t7 = d & e;     \
-    t8 = t2 ^ e;    \
-    t10 = t1 | e;   \
-    f = t7 ^ t8;    \
-    t11 = t2 | t7;  \
-    t12 = t3 ^ t10; \
-    t14 = b ^ t7;   \
-    g = t11 ^ t12;  \
-    t15 = f & t12;  \
-    h = t14 ^ t15
-
-/* 16 terms */
-
-#define ib5(a,b,c,d,e,f,g,h)    \
-    t1 = ~c;        \
-    t2 = b & t1;    \
-    t3 = d ^ t2;    \
-    t4 = a & t3;    \
-    t5 = b ^ t1;    \
-    h = t4 ^ t5;    \
-    t7 = b | h;     \
-    t8 = a & t7;    \
-    f = t3 ^ t8;    \
-    t10 = a | d;    \
-    t11 = t1 ^ t7;  \
-    e = t10 ^ t11;  \
-    t13 = a ^ c;    \
-    t14 = b & t10;  \
-    t15 = t4 | t13; \
-    g = t14 ^ t15
-
-/* 15 terms */
-
-#define sb6(a,b,c,d,e,f,g,h)    \
-    t1 = ~a;        \
-    t2 = a ^ d;     \
-    t3 = b ^ t2;    \
-    t4 = t1 | t2;   \
-    t5 = c ^ t4;    \
-    f = b ^ t5;     \
-    t13 = ~t5;      \
-    t7 = t2 | f;    \
-    t8 = d ^ t7;    \
-    t9 = t5 & t8;   \
-    g = t3 ^ t9;    \
-    t11 = t5 ^ t8;  \
-    e = g ^ t11;    \
-    t14 = t3 & t11; \
-    h = t13 ^ t14
-
-/* 15 terms */
-
-#define ib6(a,b,c,d,e,f,g,h)    \
-    t1 = ~a;        \
-    t2 = a ^ b;     \
-    t3 = c ^ t2;    \
-    t4 = c | t1;    \
-    t5 = d ^ t4;    \
-    t13 = d & t1;   \
-    f = t3 ^ t5;    \
-    t7 = t3 & t5;   \
-    t8 = t2 ^ t7;   \
-    t9 = b | t8;    \
-    h = t5 ^ t9;    \
-    t11 = b | h;    \
-    e = t8 ^ t11;   \
-    t14 = t3 ^ t11; \
-    g = t13 ^ t14
-
-/* 17 terms */
-
-#define sb7(a,b,c,d,e,f,g,h)    \
-    t1 = ~c;        \
-    t2 = b ^ c;     \
-    t3 = b | t1;    \
-    t4 = d ^ t3;    \
-    t5 = a & t4;    \
-    t7 = a ^ d;     \
-    h = t2 ^ t5;    \
-    t8 = b ^ t5;    \
-    t9 = t2 | t8;   \
-    t11 = d & t3;   \
-    f = t7 ^ t9;    \
-    t12 = t5 ^ f;   \
-    t15 = t1 | t4;  \
-    t13 = h & t12;  \
-    g = t11 ^ t13;  \
-    t16 = t12 ^ g;  \
-    e = t15 ^ t16
-
-/* 17 terms */
-
-#define ib7(a,b,c,d,e,f,g,h)    \
-    t1 = a & b;     \
-    t2 = a | b;     \
-    t3 = c | t1;    \
-    t4 = d & t2;    \
-    h = t3 ^ t4;    \
-    t6 = ~d;        \
-    t7 = b ^ t4;    \
-    t8 = h ^ t6;    \
-    t11 = c ^ t7;   \
-    t9 = t7 | t8;   \
-    f = a ^ t9;     \
-    t12 = d | f;    \
-    e = t11 ^ t12;  \
-    t14 = a & h;    \
-    t15 = t3 ^ f;   \
-    t16 = e ^ t14;  \
-    g = t15 ^ t16
-
-#define k_xor(r,a,b,c,d)             \
-    a ^= skey->serpent.K[4 * (r) + 0]; \
-    b ^= skey->serpent.K[4 * (r) + 1]; \
-    c ^= skey->serpent.K[4 * (r) + 2]; \
-    d ^= skey->serpent.K[4 * (r) + 3]
-
-#define k_set(r,a,b,c,d)   \
-    a = lkey[4 * (r) +  8];  \
-    b = lkey[4 * (r) +  9];  \
-    c = lkey[4 * (r) + 10];  \
-    d = lkey[4 * (r) + 11]
-
-#define k_get(r,a,b,c,d)            \
-    skey->serpent.K[4 * (r) + 0] = a; \
-    skey->serpent.K[4 * (r) + 1] = b; \
-    skey->serpent.K[4 * (r) + 2] = c; \
-    skey->serpent.K[4 * (r) + 3] = d
-
-/* the linear transformation and its inverse    */
-
-#define rot(a,b,c,d)    \
-    a = ROL(a, 13);    \
-    c = ROL(c, 3);     \
-    d ^= c ^ (a << 3);  \
-    b ^= a ^ c;         \
-    d = ROL(d, 7);     \
-    b = ROL(b, 1);     \
-    a ^= b ^ d;         \
-    c ^= d ^ (b << 7);  \
-    a = ROL(a, 5);     \
-    c = ROL(c, 22)
-
-#define irot(a,b,c,d)   \
-    c = ROR(c, 22);    \
-    a = ROR(a, 5);     \
-    c ^= d ^ (b << 7);  \
-    a ^= b ^ d;         \
-    d = ROR(d, 7);     \
-    b = ROR(b, 1);     \
-    d ^= c ^ (a << 3);  \
-    b ^= a ^ c;         \
-    c = ROR(c, 3);     \
-    a = ROR(a, 13)
-    
-#ifdef CLEAN_STACK
-static int _serpent_setup(const unsigned char *key, int keylen, int num_rounds, symmetric_key *skey)
-#else
-int serpent_setup(const unsigned char *key, int keylen, int num_rounds, symmetric_key *skey)
-#endif
-{
-    unsigned long lkey[140], t, a, b, c, d, e, f, g, h, x;
-    unsigned long t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14,t15,t16;
-    unsigned char buf[32];
-
-    _ARGCHK(key != NULL);
-    _ARGCHK(skey != NULL);
-
-    /* check rounds */
-    if (num_rounds != 0 && num_rounds != 32) {
-       return CRYPT_INVALID_ROUNDS;
-    }
-
-    /* check keylen */
-    if (keylen < 16 || keylen > 32) {
-       return CRYPT_INVALID_KEYSIZE;
-    }
-
-    /* copy key and expand to 32bytes as required */
-    for (x = 0; x < (unsigned long)keylen; x++) {
-        buf[x] = key[x];
-    }
-
-    if (x < 32) {
-       buf[x++] = (unsigned char)0x01;
-       while (x < 32) {
-           buf[x++] = (unsigned char)0;
-       }
-    }
-
-    /* copy key into 32-bit words */
-    for (x = 0; x < 8; x++) {
-        LOAD32L(lkey[x], &buf[x*4]);
-    }
-
-    /* expand using the LFSR to 140 words */
-    for (x = 0; x < 132; x++) {
-        t = lkey[x] ^ lkey[x+3] ^ lkey[x+5] ^ lkey[x+7] ^ x ^ 0x9E3779B9UL;
-        lkey[x + 8] = ROL(t, 11);
-    }
-
-    /* perform the substituions */
-    for (x = 0; x < 32; ) {
-       k_set( x,a,b,c,d);sb3(a,b,c,d,e,f,g,h);k_get( x,e,f,g,h); ++x;
-       k_set( x,a,b,c,d);sb2(a,b,c,d,e,f,g,h);k_get( x,e,f,g,h); ++x;
-       k_set( x,a,b,c,d);sb1(a,b,c,d,e,f,g,h);k_get( x,e,f,g,h); ++x;
-       k_set( x,a,b,c,d);sb0(a,b,c,d,e,f,g,h);k_get( x,e,f,g,h); ++x;
-       k_set( x,a,b,c,d);sb7(a,b,c,d,e,f,g,h);k_get( x,e,f,g,h); ++x;
-       k_set( x,a,b,c,d);sb6(a,b,c,d,e,f,g,h);k_get( x,e,f,g,h); ++x;
-       k_set( x,a,b,c,d);sb5(a,b,c,d,e,f,g,h);k_get( x,e,f,g,h); ++x;
-       k_set( x,a,b,c,d);sb4(a,b,c,d,e,f,g,h);k_get( x,e,f,g,h); ++x;
-    }
-    k_set(32,a,b,c,d);sb3(a,b,c,d,e,f,g,h);k_get(32,e,f,g,h);
-    return CRYPT_OK;
-}
-
-#ifdef CLEAN_STACK
-int serpent_setup(const unsigned char *key, int keylen, int num_rounds, symmetric_key *skey)
-{
-   int x;
-   x = _serpent_setup(key, keylen, num_rounds, skey);
-   burn_stack(sizeof(unsigned long)*166 + sizeof(unsigned char)*32);
-   return x;
-}
-#endif
-
-#ifdef CLEAN_STACK
-static void _serpent_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_key *skey)
-#else
-void serpent_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_key *skey)
-#endif
-{
-    unsigned long a,b,c,d,e,f,g,h;
-    unsigned long t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14,t15,t16;
-
-    _ARGCHK(pt != NULL);
-    _ARGCHK(ct != NULL);
-    _ARGCHK(skey != NULL);
-
-    LOAD32L(a, &pt[0]);LOAD32L(b, &pt[4]);LOAD32L(c, &pt[8]);LOAD32L(d, &pt[12]);
-    k_xor( 0,a,b,c,d); sb0(a,b,c,d,e,f,g,h); rot(e,f,g,h);
-    k_xor( 1,e,f,g,h); sb1(e,f,g,h,a,b,c,d); rot(a,b,c,d);
-    k_xor( 2,a,b,c,d); sb2(a,b,c,d,e,f,g,h); rot(e,f,g,h);
-    k_xor( 3,e,f,g,h); sb3(e,f,g,h,a,b,c,d); rot(a,b,c,d);
-    k_xor( 4,a,b,c,d); sb4(a,b,c,d,e,f,g,h); rot(e,f,g,h);
-    k_xor( 5,e,f,g,h); sb5(e,f,g,h,a,b,c,d); rot(a,b,c,d);
-    k_xor( 6,a,b,c,d); sb6(a,b,c,d,e,f,g,h); rot(e,f,g,h);
-    k_xor( 7,e,f,g,h); sb7(e,f,g,h,a,b,c,d); rot(a,b,c,d);
-    k_xor( 8,a,b,c,d); sb0(a,b,c,d,e,f,g,h); rot(e,f,g,h);
-    k_xor( 9,e,f,g,h); sb1(e,f,g,h,a,b,c,d); rot(a,b,c,d); 
-    k_xor(10,a,b,c,d); sb2(a,b,c,d,e,f,g,h); rot(e,f,g,h);
-    k_xor(11,e,f,g,h); sb3(e,f,g,h,a,b,c,d); rot(a,b,c,d);
-    k_xor(12,a,b,c,d); sb4(a,b,c,d,e,f,g,h); rot(e,f,g,h);
-    k_xor(13,e,f,g,h); sb5(e,f,g,h,a,b,c,d); rot(a,b,c,d);
-    k_xor(14,a,b,c,d); sb6(a,b,c,d,e,f,g,h); rot(e,f,g,h);
-    k_xor(15,e,f,g,h); sb7(e,f,g,h,a,b,c,d); rot(a,b,c,d);
-    k_xor(16,a,b,c,d); sb0(a,b,c,d,e,f,g,h); rot(e,f,g,h);
-    k_xor(17,e,f,g,h); sb1(e,f,g,h,a,b,c,d); rot(a,b,c,d);
-    k_xor(18,a,b,c,d); sb2(a,b,c,d,e,f,g,h); rot(e,f,g,h);
-    k_xor(19,e,f,g,h); sb3(e,f,g,h,a,b,c,d); rot(a,b,c,d);
-    k_xor(20,a,b,c,d); sb4(a,b,c,d,e,f,g,h); rot(e,f,g,h);
-    k_xor(21,e,f,g,h); sb5(e,f,g,h,a,b,c,d); rot(a,b,c,d);
-    k_xor(22,a,b,c,d); sb6(a,b,c,d,e,f,g,h); rot(e,f,g,h);
-    k_xor(23,e,f,g,h); sb7(e,f,g,h,a,b,c,d); rot(a,b,c,d);
-    k_xor(24,a,b,c,d); sb0(a,b,c,d,e,f,g,h); rot(e,f,g,h);
-    k_xor(25,e,f,g,h); sb1(e,f,g,h,a,b,c,d); rot(a,b,c,d);
-    k_xor(26,a,b,c,d); sb2(a,b,c,d,e,f,g,h); rot(e,f,g,h);
-    k_xor(27,e,f,g,h); sb3(e,f,g,h,a,b,c,d); rot(a,b,c,d);
-    k_xor(28,a,b,c,d); sb4(a,b,c,d,e,f,g,h); rot(e,f,g,h);
-    k_xor(29,e,f,g,h); sb5(e,f,g,h,a,b,c,d); rot(a,b,c,d);
-    k_xor(30,a,b,c,d); sb6(a,b,c,d,e,f,g,h); rot(e,f,g,h);
-    k_xor(31,e,f,g,h); sb7(e,f,g,h,a,b,c,d); k_xor(32,a,b,c,d);
-    STORE32L(a, &ct[0]);STORE32L(b, &ct[4]);STORE32L(c, &ct[8]);STORE32L(d, &ct[12]);
-}
-
-#ifdef CLEAN_STACK
-void serpent_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_key *skey)
-{
-   _serpent_ecb_encrypt(pt, ct, skey);
-   burn_stack(sizeof(unsigned long)*24);
-}
-#endif
-
-#ifdef CLEAN_STACK
-static void _serpent_ecb_decrypt(const unsigned char *ct, unsigned char *pt, symmetric_key *skey)
-#else
-void serpent_ecb_decrypt(const unsigned char *ct, unsigned char *pt, symmetric_key *skey)
-#endif
-{
-    unsigned long a,b,c,d,e,f,g,h;
-    unsigned long t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14,t15,t16;
-
-    _ARGCHK(pt != NULL);
-    _ARGCHK(ct != NULL);
-    _ARGCHK(skey != NULL);
-
-    LOAD32L(a, &ct[0]);LOAD32L(b, &ct[4]);LOAD32L(c, &ct[8]);LOAD32L(d, &ct[12]);
-    k_xor(32,a,b,c,d); ib7(a,b,c,d,e,f,g,h); k_xor(31,e,f,g,h);
-    irot(e,f,g,h); ib6(e,f,g,h,a,b,c,d); k_xor(30,a,b,c,d);
-    irot(a,b,c,d); ib5(a,b,c,d,e,f,g,h); k_xor(29,e,f,g,h);
-    irot(e,f,g,h); ib4(e,f,g,h,a,b,c,d); k_xor(28,a,b,c,d);
-    irot(a,b,c,d); ib3(a,b,c,d,e,f,g,h); k_xor(27,e,f,g,h);
-    irot(e,f,g,h); ib2(e,f,g,h,a,b,c,d); k_xor(26,a,b,c,d);
-    irot(a,b,c,d); ib1(a,b,c,d,e,f,g,h); k_xor(25,e,f,g,h);
-    irot(e,f,g,h); ib0(e,f,g,h,a,b,c,d); k_xor(24,a,b,c,d);
-    irot(a,b,c,d); ib7(a,b,c,d,e,f,g,h); k_xor(23,e,f,g,h);
-    irot(e,f,g,h); ib6(e,f,g,h,a,b,c,d); k_xor(22,a,b,c,d);
-    irot(a,b,c,d); ib5(a,b,c,d,e,f,g,h); k_xor(21,e,f,g,h);
-    irot(e,f,g,h); ib4(e,f,g,h,a,b,c,d); k_xor(20,a,b,c,d);
-    irot(a,b,c,d); ib3(a,b,c,d,e,f,g,h); k_xor(19,e,f,g,h);
-    irot(e,f,g,h); ib2(e,f,g,h,a,b,c,d); k_xor(18,a,b,c,d);
-    irot(a,b,c,d); ib1(a,b,c,d,e,f,g,h); k_xor(17,e,f,g,h);
-    irot(e,f,g,h); ib0(e,f,g,h,a,b,c,d); k_xor(16,a,b,c,d);
-    irot(a,b,c,d); ib7(a,b,c,d,e,f,g,h); k_xor(15,e,f,g,h);
-    irot(e,f,g,h); ib6(e,f,g,h,a,b,c,d); k_xor(14,a,b,c,d);
-    irot(a,b,c,d); ib5(a,b,c,d,e,f,g,h); k_xor(13,e,f,g,h);
-    irot(e,f,g,h); ib4(e,f,g,h,a,b,c,d); k_xor(12,a,b,c,d);
-    irot(a,b,c,d); ib3(a,b,c,d,e,f,g,h); k_xor(11,e,f,g,h);
-    irot(e,f,g,h); ib2(e,f,g,h,a,b,c,d); k_xor(10,a,b,c,d);
-    irot(a,b,c,d); ib1(a,b,c,d,e,f,g,h); k_xor( 9,e,f,g,h);
-    irot(e,f,g,h); ib0(e,f,g,h,a,b,c,d); k_xor( 8,a,b,c,d);
-    irot(a,b,c,d); ib7(a,b,c,d,e,f,g,h); k_xor( 7,e,f,g,h);
-    irot(e,f,g,h); ib6(e,f,g,h,a,b,c,d); k_xor( 6,a,b,c,d);
-    irot(a,b,c,d); ib5(a,b,c,d,e,f,g,h); k_xor( 5,e,f,g,h);
-    irot(e,f,g,h); ib4(e,f,g,h,a,b,c,d); k_xor( 4,a,b,c,d);
-    irot(a,b,c,d); ib3(a,b,c,d,e,f,g,h); k_xor( 3,e,f,g,h);
-    irot(e,f,g,h); ib2(e,f,g,h,a,b,c,d); k_xor( 2,a,b,c,d);
-    irot(a,b,c,d); ib1(a,b,c,d,e,f,g,h); k_xor( 1,e,f,g,h);
-    irot(e,f,g,h); ib0(e,f,g,h,a,b,c,d); k_xor( 0,a,b,c,d);
-    STORE32L(a, &pt[0]);STORE32L(b, &pt[4]);STORE32L(c, &pt[8]);STORE32L(d, &pt[12]);
-}
-
-#ifdef CLEAN_STACK
-void serpent_ecb_decrypt(const unsigned char *ct, unsigned char *pt, symmetric_key *skey)
-{
-   _serpent_ecb_decrypt(ct, pt, skey);
-   burn_stack(sizeof(unsigned long)*24);
-}
-#endif
-
-int serpent_test(void)
-{
- #ifndef LTC_TEST
-    return CRYPT_NOP;
- #else    
-   static const struct {
-       int keylen;
-       unsigned char key[32], pt[16], ct[16];
-   } tests[] = {
-   {
-      16,
-      { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
-      { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
-      { 0xdd, 0xd2, 0x6b, 0x98, 0xa5, 0xff, 0xd8, 0x2c,
-        0x05, 0x34, 0x5a, 0x9d, 0xad, 0xbf, 0xaf, 0x49 }
-   },
-   {
-      16,
-      { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
-      { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 },
-      { 0x4a, 0xe9, 0xa2, 0x0b, 0x2b, 0x14, 0xa1, 0x02,
-        0x90, 0xcb, 0xb8, 0x20, 0xb7, 0xff, 0xb5, 0x10 }
-   },
-   {
-      24,
-      { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
-      { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08 },
-      { 0xe1, 0x1b, 0x01, 0x52, 0x4e, 0xa1, 0xf4, 0x65, 
-        0xa2, 0xa2, 0x00, 0x43, 0xeb, 0x9f, 0x7e, 0x8a }
-   },
-   {
-      32,
-      { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
-      { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
-      { 0xe0, 0x88, 0x5d, 0x44, 0x60, 0x37, 0x34, 0x69,
-        0xd1, 0xfa, 0x6c, 0x36, 0xa6, 0xe1, 0xc5, 0x2f }
-   },
-   {
-      32,
-      { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
-      { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
-      { 0x17, 0xc6, 0x25, 0x8e, 0x60, 0x09, 0xe2, 0x82,
-        0x66, 0x18, 0x69, 0xd5, 0x25, 0xf7, 0xd2, 0x04 }
-   },
-   {
-      32,
-      { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
-      { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
-      { 0x9f, 0xe1, 0x43, 0x25, 0x0d, 0x00, 0xe2, 0x56, 
-        0x96, 0xb0, 0x1e, 0x0a, 0x2e, 0xd0, 0x5d, 0xb3 }
-   }
-   };
-
-   unsigned char buf[2][16];
-   int x, err;
-   symmetric_key key;
-
-   for (x = 0; x < (int)(sizeof(tests) / sizeof(tests[0])); x++) {
-      /* setup key */
-      if ((err = serpent_setup(tests[x].key, tests[x].keylen, 0, &key))!= CRYPT_OK) {
-         return err;
-      }
-
-      /* encrypt and decrypt */
-      serpent_ecb_encrypt(tests[x].pt, buf[0], &key);
-      serpent_ecb_decrypt(buf[0], buf[1], &key);
-
-      /* compare */
-      if (memcmp(buf[0], tests[x].ct, 16) != 0 || memcmp(buf[1], tests[x].pt, 16) != 0) {
-         return CRYPT_FAIL_TESTVECTOR;
-      }
-   }
-   return CRYPT_OK;
-  #endif
-}
-
-int serpent_keysize(int *desired_keysize)
-{
-   _ARGCHK(desired_keysize != NULL);
-
-   if (*desired_keysize < 16)
-      return CRYPT_INVALID_KEYSIZE;
-   if (*desired_keysize > 32)
-      *desired_keysize = 32;
-   return CRYPT_OK;
-}
-
-#endif
-
-
diff --git a/sha1.c b/sha1.c
index b273165..f0e5818 100644
--- a/sha1.c
+++ b/sha1.c
@@ -14,7 +14,7 @@ const struct _hash_descriptor sha1_desc =
     &sha1_test
 };
 
-#define F0(x,y,z)  ( (x&y) | ((~x)&z) )
+#define F0(x,y,z)  (z ^ (x & (y ^ z)))
 #define F1(x,y,z)  (x ^ y ^ z)
 #define F2(x,y,z)  ((x & y) | (z & (x | y)))
 #define F3(x,y,z)  (x ^ y ^ z)
diff --git a/sha256.c b/sha256.c
index 33fd96f..f14e72f 100644
--- a/sha256.c
+++ b/sha256.c
@@ -32,7 +32,7 @@ static const unsigned long K[64] = {
 };
 
 /* Various logical functions */
-#define Ch(x,y,z)       ((x & y) | (~x & z))
+#define Ch(x,y,z)       (z ^ (x & (y ^ z)))
 #define Maj(x,y,z)      (((x | y) & z) | (x & y)) 
 #define S(x, n)         ROR((x),(n))
 #define R(x, n)         (((x)&0xFFFFFFFFUL)>>(n))
diff --git a/sha512.c b/sha512.c
index 88317e4..53b33ac 100644
--- a/sha512.c
+++ b/sha512.c
@@ -59,7 +59,7 @@ CONST64(0x5fcb6fab3ad6faec), CONST64(0x6c44198c4a475817)
 };
 
 /* Various logical functions */
-#define Ch(x,y,z)       ((x & y) | (~x & z))
+#define Ch(x,y,z)       (z ^ (x & (y ^ z)))
 #define Maj(x,y,z)      (((x | y) & z) | (x & y)) 
 #define S(x, n)         ROR64((x),(n))
 #define R(x, n)         (((x)&CONST64(0xFFFFFFFFFFFFFFFF))>>((ulong64)n))
diff --git a/tdcal.pdf b/tdcal.pdf
new file mode 100644
index 0000000..e4f4b35
Binary files /dev/null and b/tdcal.pdf differ
diff --git a/twofish.c b/twofish.c
index 2d09589..bfc77fc 100644
--- a/twofish.c
+++ b/twofish.c
@@ -397,9 +397,9 @@ static unsigned long g_func(unsigned long x, symmetric_key *key)
     burn_stack(sizeof(unsigned char) * 4 + sizeof(unsigned long));
     return y;
 }
-#endif
+#endif /* CLEAN_STACK */
 
-#endif
+#endif /* TWOFISH_SMALL */
 
 #ifdef CLEAN_STACK
 static int _twofish_setup(const unsigned char *key, int keylen, int num_rounds, symmetric_key *skey)
diff --git a/yarrow.c b/yarrow.c
index 6261edc..ee53a5c 100644
--- a/yarrow.c
+++ b/yarrow.c
@@ -28,8 +28,6 @@ int yarrow_start(prng_state *prng)
    prng->yarrow.cipher = register_cipher(&twofish_desc);
 #elif defined(CAST5)
    prng->yarrow.cipher = register_cipher(&cast5_desc);
-#elif defined(SERPENT)
-   prng->yarrow.cipher = register_cipher(&serpent_desc);
 #elif defined(SAFER)
    prng->yarrow.cipher = register_cipher(&saferp_desc);
 #elif defined(RC5)