Main Page | Namespace List | Class Hierarchy | Alphabetical List | Compound List | File List | Namespace Members | Compound Members | File Members

integer.cpp

00001 // integer.cpp - written and placed in the public domain by Wei Dai
00002 // contains public domain code contributed by Alister Lee and Leonard Janke
00003 
00004 #include "pch.h"
00005 
00006 #ifndef CRYPTOPP_IMPORTS
00007 
00008 #include "integer.h"
00009 #include "modarith.h"
00010 #include "nbtheory.h"
00011 #include "asn.h"
00012 #include "oids.h"
00013 #include "words.h"
00014 #include "algparam.h"
00015 #include "pubkey.h"             // for P1363_KDF2
00016 #include "sha.h"
00017 
00018 #include <iostream>
00019 
00020 #ifdef SSE2_INTRINSICS_AVAILABLE
00021 #include <emmintrin.h>
00022 #endif
00023 
00024 NAMESPACE_BEGIN(CryptoPP)
00025 
00026 #ifdef SSE2_INTRINSICS_AVAILABLE
00027 template <class T>
00028 CPP_TYPENAME AllocatorBase<T>::pointer AlignedAllocator<T>::allocate(size_type n, const void *)
00029 {
00030         if (n < 4)
00031                 return new T[n];
00032         else
00033                 return (T *)_mm_malloc(sizeof(T)*n, 16);
00034 
00035 }
00036 
00037 template <class T>
00038 void AlignedAllocator<T>::deallocate(void *p, size_type n)
00039 {
00040         memset(p, 0, n*sizeof(T));
00041         if (n < 4)
00042                 delete [] p;
00043         else
00044                 _mm_free(p);
00045 }
00046 #endif
00047 
00048 #define MAKE_DWORD(lowWord, highWord) ((dword(highWord)<<WORD_BITS) | (lowWord))
00049 
00050 static int Compare(const word *A, const word *B, unsigned int N)
00051 {
00052         while (N--)
00053                 if (A[N] > B[N])
00054                         return 1;
00055                 else if (A[N] < B[N])
00056                         return -1;
00057 
00058         return 0;
00059 }
00060 
00061 static word Increment(word *A, unsigned int N, word B=1)
00062 {
00063         assert(N);
00064         word t = A[0];
00065         A[0] = t+B;
00066         if (A[0] >= t)
00067                 return 0;
00068         for (unsigned i=1; i<N; i++)
00069                 if (++A[i])
00070                         return 0;
00071         return 1;
00072 }
00073 
00074 static word Decrement(word *A, unsigned int N, word B=1)
00075 {
00076         assert(N);
00077         word t = A[0];
00078         A[0] = t-B;
00079         if (A[0] <= t)
00080                 return 0;
00081         for (unsigned i=1; i<N; i++)
00082                 if (A[i]--)
00083                         return 0;
00084         return 1;
00085 }
00086 
00087 static void TwosComplement(word *A, unsigned int N)
00088 {
00089         Decrement(A, N);
00090         for (unsigned i=0; i<N; i++)
00091                 A[i] = ~A[i];
00092 }
00093 
00094 static word LinearMultiply(word *C, const word *A, word B, unsigned int N)
00095 {
00096         word carry=0;
00097         for(unsigned i=0; i<N; i++)
00098         {
00099                 dword p = (dword)A[i] * B + carry;
00100                 C[i] = LOW_WORD(p);
00101                 carry = HIGH_WORD(p);
00102         }
00103         return carry;
00104 }
00105 
00106 static void AtomicInverseModPower2(word *C, word A0, word A1)
00107 {
00108         assert(A0%2==1);
00109 
00110         dword A=MAKE_DWORD(A0, A1), R=A0%8;
00111 
00112         for (unsigned i=3; i<2*WORD_BITS; i*=2)
00113                 R = R*(2-R*A);
00114 
00115         assert(R*A==1);
00116 
00117         C[0] = LOW_WORD(R);
00118         C[1] = HIGH_WORD(R);
00119 }
00120 
00121 // ********************************************************
00122 
00123 class Portable
00124 {
00125 public:
00126         static word Add(word *C, const word *A, const word *B, unsigned int N);
00127         static word Subtract(word *C, const word *A, const word *B, unsigned int N);
00128 
00129         static inline void Multiply2(word *C, const word *A, const word *B);
00130         static inline word Multiply2Add(word *C, const word *A, const word *B);
00131         static void Multiply4(word *C, const word *A, const word *B);
00132         static void Multiply8(word *C, const word *A, const word *B);
00133         static inline unsigned int MultiplyRecursionLimit() {return 8;}
00134 
00135         static inline void Multiply2Bottom(word *C, const word *A, const word *B);
00136         static void Multiply4Bottom(word *C, const word *A, const word *B);
00137         static void Multiply8Bottom(word *C, const word *A, const word *B);
00138         static inline unsigned int MultiplyBottomRecursionLimit() {return 8;}
00139 
00140         static void Square2(word *R, const word *A);
00141         static void Square4(word *R, const word *A);
00142         static void Square8(word *R, const word *A) {assert(false);}
00143         static inline unsigned int SquareRecursionLimit() {return 4;}
00144 };
00145 
00146 word Portable::Add(word *C, const word *A, const word *B, unsigned int N)
00147 {
00148         assert (N%2 == 0);
00149 
00150 #ifdef IS_LITTLE_ENDIAN
00151         if (sizeof(dword) == sizeof(size_t))    // dword is only register size
00152         {
00153                 dword carry = 0;
00154                 N >>= 1;
00155                 for (unsigned int i = 0; i < N; i++)
00156                 {
00157                         dword a = ((const dword *)A)[i] + carry;
00158                         dword c = a + ((const dword *)B)[i];
00159                         ((dword *)C)[i] = c;
00160                         carry = (a < carry) | (c < a);
00161                 }
00162                 return (word)carry;
00163         }
00164         else
00165 #endif
00166         {
00167                 word carry = 0;
00168                 for (unsigned int i = 0; i < N; i+=2)
00169                 {
00170                         dword u = (dword) carry + A[i] + B[i];
00171                         C[i] = LOW_WORD(u);
00172                         u = (dword) HIGH_WORD(u) + A[i+1] + B[i+1];
00173                         C[i+1] = LOW_WORD(u);
00174                         carry = HIGH_WORD(u);
00175                 }
00176                 return carry;
00177         }
00178 }
00179 
00180 word Portable::Subtract(word *C, const word *A, const word *B, unsigned int N)
00181 {
00182         assert (N%2 == 0);
00183 
00184 #ifdef IS_LITTLE_ENDIAN
00185         if (sizeof(dword) == sizeof(size_t))    // dword is only register size
00186         {
00187                 dword borrow = 0;
00188                 N >>= 1;
00189                 for (unsigned int i = 0; i < N; i++)
00190                 {
00191                         dword a = ((const dword *)A)[i];
00192                         dword b = a - borrow;
00193                         dword c = b - ((const dword *)B)[i];
00194                         ((dword *)C)[i] = c;
00195                         borrow = (b > a) | (c > b);
00196                 }
00197                 return (word)borrow;
00198         }
00199         else
00200 #endif
00201         {
00202                 word borrow=0;
00203                 for (unsigned i = 0; i < N; i+=2)
00204                 {
00205                         dword u = (dword) A[i] - B[i] - borrow;
00206                         C[i] = LOW_WORD(u);
00207                         u = (dword) A[i+1] - B[i+1] - (word)(0-HIGH_WORD(u));
00208                         C[i+1] = LOW_WORD(u);
00209                         borrow = 0-HIGH_WORD(u);
00210                 }
00211                 return borrow;
00212         }
00213 }
00214 
00215 void Portable::Multiply2(word *C, const word *A, const word *B)
00216 {
00217 /*
00218         word s;
00219         dword d;
00220 
00221         if (A1 >= A0)
00222                 if (B0 >= B1)
00223                 {
00224                         s = 0;
00225                         d = (dword)(A1-A0)*(B0-B1);
00226                 }
00227                 else
00228                 {
00229                         s = (A1-A0);
00230                         d = (dword)s*(word)(B0-B1);
00231                 }
00232         else
00233                 if (B0 > B1)
00234                 {
00235                         s = (B0-B1);
00236                         d = (word)(A1-A0)*(dword)s;
00237                 }
00238                 else
00239                 {
00240                         s = 0;
00241                         d = (dword)(A0-A1)*(B1-B0);
00242                 }
00243 */
00244         // this segment is the branchless equivalent of above
00245         word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
00246         unsigned int ai = A[1] < A[0];
00247         unsigned int bi = B[0] < B[1];
00248         unsigned int di = ai & bi;
00249         dword d = (dword)D[di]*D[di+2];
00250         D[1] = D[3] = 0;
00251         unsigned int si = ai + !bi;
00252         word s = D[si];
00253 
00254         dword A0B0 = (dword)A[0]*B[0];
00255         C[0] = LOW_WORD(A0B0);
00256 
00257         dword A1B1 = (dword)A[1]*B[1];
00258         dword t = (dword) HIGH_WORD(A0B0) + LOW_WORD(A0B0) + LOW_WORD(d) + LOW_WORD(A1B1);
00259         C[1] = LOW_WORD(t);
00260 
00261         t = A1B1 + HIGH_WORD(t) + HIGH_WORD(A0B0) + HIGH_WORD(d) + HIGH_WORD(A1B1) - s;
00262         C[2] = LOW_WORD(t);
00263         C[3] = HIGH_WORD(t);
00264 }
00265 
00266 inline void Portable::Multiply2Bottom(word *C, const word *A, const word *B)
00267 {
00268 #ifdef IS_LITTLE_ENDIAN
00269         if (sizeof(dword) == sizeof(size_t))
00270         {
00271                 dword a = *(const dword *)A, b = *(const dword *)B;
00272                 ((dword *)C)[0] = a*b;
00273         }
00274         else
00275 #endif
00276         {
00277                 dword t = (dword)A[0]*B[0];
00278                 C[0] = LOW_WORD(t);
00279                 C[1] = HIGH_WORD(t) + A[0]*B[1] + A[1]*B[0];
00280         }
00281 }
00282 
00283 word Portable::Multiply2Add(word *C, const word *A, const word *B)
00284 {
00285         word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
00286         unsigned int ai = A[1] < A[0];
00287         unsigned int bi = B[0] < B[1];
00288         unsigned int di = ai & bi;
00289         dword d = (dword)D[di]*D[di+2];
00290         D[1] = D[3] = 0;
00291         unsigned int si = ai + !bi;
00292         word s = D[si];
00293 
00294         dword A0B0 = (dword)A[0]*B[0];
00295         dword t = A0B0 + C[0];
00296         C[0] = LOW_WORD(t);
00297 
00298         dword A1B1 = (dword)A[1]*B[1];
00299         t = (dword) HIGH_WORD(t) + LOW_WORD(A0B0) + LOW_WORD(d) + LOW_WORD(A1B1) + C[1];
00300         C[1] = LOW_WORD(t);
00301 
00302         t = (dword) HIGH_WORD(t) + LOW_WORD(A1B1) + HIGH_WORD(A0B0) + HIGH_WORD(d) + HIGH_WORD(A1B1) - s + C[2];
00303         C[2] = LOW_WORD(t);
00304 
00305         t = (dword) HIGH_WORD(t) + HIGH_WORD(A1B1) + C[3];
00306         C[3] = LOW_WORD(t);
00307         return HIGH_WORD(t);
00308 }
00309 
00310 #define MulAcc(x, y)                                                            \
00311         p = (dword)A[x] * B[y] + c;                                     \
00312         c = LOW_WORD(p);                                                                \
00313         p = (dword)d + HIGH_WORD(p);                                    \
00314         d = LOW_WORD(p);                                                                \
00315         e += HIGH_WORD(p);
00316 
00317 #define SaveMulAcc(s, x, y)                                             \
00318         R[s] = c;                                                                               \
00319         p = (dword)A[x] * B[y] + d;                                     \
00320         c = LOW_WORD(p);                                                                \
00321         p = (dword)e + HIGH_WORD(p);                                    \
00322         d = LOW_WORD(p);                                                                \
00323         e = HIGH_WORD(p);
00324 
00325 #define SquAcc(x, y)                                                            \
00326         q = (dword)A[x] * A[y]; \
00327         p = q + c;                                      \
00328         c = LOW_WORD(p);                                                                \
00329         p = (dword)d + HIGH_WORD(p);                                    \
00330         d = LOW_WORD(p);                                                                \
00331         e += HIGH_WORD(p);                      \
00332         p = q + c;                                      \
00333         c = LOW_WORD(p);                                                                \
00334         p = (dword)d + HIGH_WORD(p);                                    \
00335         d = LOW_WORD(p);                                                                \
00336         e += HIGH_WORD(p);
00337 
00338 #define SaveSquAcc(s, x, y)                                             \
00339         R[s] = c;                                                                               \
00340         q = (dword)A[x] * A[y]; \
00341         p = q + d;                                      \
00342         c = LOW_WORD(p);                                                                \
00343         p = (dword)e + HIGH_WORD(p);                                    \
00344         d = LOW_WORD(p);                                                                \
00345         e = HIGH_WORD(p);                       \
00346         p = q + c;                                      \
00347         c = LOW_WORD(p);                                                                \
00348         p = (dword)d + HIGH_WORD(p);                                    \
00349         d = LOW_WORD(p);                                                                \
00350         e += HIGH_WORD(p);
00351 
00352 void Portable::Multiply4(word *R, const word *A, const word *B)
00353 {
00354         dword p;
00355         word c, d, e;
00356 
00357         p = (dword)A[0] * B[0];
00358         R[0] = LOW_WORD(p);
00359         c = HIGH_WORD(p);
00360         d = e = 0;
00361 
00362         MulAcc(0, 1);
00363         MulAcc(1, 0);
00364 
00365         SaveMulAcc(1, 2, 0);
00366         MulAcc(1, 1);
00367         MulAcc(0, 2);
00368 
00369         SaveMulAcc(2, 0, 3);
00370         MulAcc(1, 2);
00371         MulAcc(2, 1);
00372         MulAcc(3, 0);
00373 
00374         SaveMulAcc(3, 3, 1);
00375         MulAcc(2, 2);
00376         MulAcc(1, 3);
00377 
00378         SaveMulAcc(4, 2, 3);
00379         MulAcc(3, 2);
00380 
00381         R[5] = c;
00382         p = (dword)A[3] * B[3] + d;
00383         R[6] = LOW_WORD(p);
00384         R[7] = e + HIGH_WORD(p);
00385 }
00386 
00387 void Portable::Square2(word *R, const word *A)
00388 {
00389         dword p, q;
00390         word c, d, e;
00391 
00392         p = (dword)A[0] * A[0];
00393         R[0] = LOW_WORD(p);
00394         c = HIGH_WORD(p);
00395         d = e = 0;
00396 
00397         SquAcc(0, 1);
00398 
00399         R[1] = c;
00400         p = (dword)A[1] * A[1] + d;
00401         R[2] = LOW_WORD(p);
00402         R[3] = e + HIGH_WORD(p);
00403 }
00404 
00405 void Portable::Square4(word *R, const word *A)
00406 {
00407         const word *B = A;
00408         dword p, q;
00409         word c, d, e;
00410 
00411         p = (dword)A[0] * A[0];
00412         R[0] = LOW_WORD(p);
00413         c = HIGH_WORD(p);
00414         d = e = 0;
00415 
00416         SquAcc(0, 1);
00417 
00418         SaveSquAcc(1, 2, 0);
00419         MulAcc(1, 1);
00420 
00421         SaveSquAcc(2, 0, 3);
00422         SquAcc(1, 2);
00423 
00424         SaveSquAcc(3, 3, 1);
00425         MulAcc(2, 2);
00426 
00427         SaveSquAcc(4, 2, 3);
00428 
00429         R[5] = c;
00430         p = (dword)A[3] * A[3] + d;
00431         R[6] = LOW_WORD(p);
00432         R[7] = e + HIGH_WORD(p);
00433 }
00434 
00435 void Portable::Multiply8(word *R, const word *A, const word *B)
00436 {
00437         dword p;
00438         word c, d, e;
00439 
00440         p = (dword)A[0] * B[0];
00441         R[0] = LOW_WORD(p);
00442         c = HIGH_WORD(p);
00443         d = e = 0;
00444 
00445         MulAcc(0, 1);
00446         MulAcc(1, 0);
00447 
00448         SaveMulAcc(1, 2, 0);
00449         MulAcc(1, 1);
00450         MulAcc(0, 2);
00451 
00452         SaveMulAcc(2, 0, 3);
00453         MulAcc(1, 2);
00454         MulAcc(2, 1);
00455         MulAcc(3, 0);
00456 
00457         SaveMulAcc(3, 0, 4);
00458         MulAcc(1, 3);
00459         MulAcc(2, 2);
00460         MulAcc(3, 1);
00461         MulAcc(4, 0);
00462 
00463         SaveMulAcc(4, 0, 5);
00464         MulAcc(1, 4);
00465         MulAcc(2, 3);
00466         MulAcc(3, 2);
00467         MulAcc(4, 1);
00468         MulAcc(5, 0);
00469 
00470         SaveMulAcc(5, 0, 6);
00471         MulAcc(1, 5);
00472         MulAcc(2, 4);
00473         MulAcc(3, 3);
00474         MulAcc(4, 2);
00475         MulAcc(5, 1);
00476         MulAcc(6, 0);
00477 
00478         SaveMulAcc(6, 0, 7);
00479         MulAcc(1, 6);
00480         MulAcc(2, 5);
00481         MulAcc(3, 4);
00482         MulAcc(4, 3);
00483         MulAcc(5, 2);
00484         MulAcc(6, 1);
00485         MulAcc(7, 0);
00486 
00487         SaveMulAcc(7, 1, 7);
00488         MulAcc(2, 6);
00489         MulAcc(3, 5);
00490         MulAcc(4, 4);
00491         MulAcc(5, 3);
00492         MulAcc(6, 2);
00493         MulAcc(7, 1);
00494 
00495         SaveMulAcc(8, 2, 7);
00496         MulAcc(3, 6);
00497         MulAcc(4, 5);
00498         MulAcc(5, 4);
00499         MulAcc(6, 3);
00500         MulAcc(7, 2);
00501 
00502         SaveMulAcc(9, 3, 7);
00503         MulAcc(4, 6);
00504         MulAcc(5, 5);
00505         MulAcc(6, 4);
00506         MulAcc(7, 3);
00507 
00508         SaveMulAcc(10, 4, 7);
00509         MulAcc(5, 6);
00510         MulAcc(6, 5);
00511         MulAcc(7, 4);
00512 
00513         SaveMulAcc(11, 5, 7);
00514         MulAcc(6, 6);
00515         MulAcc(7, 5);
00516 
00517         SaveMulAcc(12, 6, 7);
00518         MulAcc(7, 6);
00519 
00520         R[13] = c;
00521         p = (dword)A[7] * B[7] + d;
00522         R[14] = LOW_WORD(p);
00523         R[15] = e + HIGH_WORD(p);
00524 }
00525 
00526 void Portable::Multiply4Bottom(word *R, const word *A, const word *B)
00527 {
00528         dword p;
00529         word c, d, e;
00530 
00531         p = (dword)A[0] * B[0];
00532         R[0] = LOW_WORD(p);
00533         c = HIGH_WORD(p);
00534         d = e = 0;
00535 
00536         MulAcc(0, 1);
00537         MulAcc(1, 0);
00538 
00539         SaveMulAcc(1, 2, 0);
00540         MulAcc(1, 1);
00541         MulAcc(0, 2);
00542 
00543         R[2] = c;
00544         R[3] = d + A[0] * B[3] + A[1] * B[2] + A[2] * B[1] + A[3] * B[0];
00545 }
00546 
00547 void Portable::Multiply8Bottom(word *R, const word *A, const word *B)
00548 {
00549         dword p;
00550         word c, d, e;
00551 
00552         p = (dword)A[0] * B[0];
00553         R[0] = LOW_WORD(p);
00554         c = HIGH_WORD(p);
00555         d = e = 0;
00556 
00557         MulAcc(0, 1);
00558         MulAcc(1, 0);
00559 
00560         SaveMulAcc(1, 2, 0);
00561         MulAcc(1, 1);
00562         MulAcc(0, 2);
00563 
00564         SaveMulAcc(2, 0, 3);
00565         MulAcc(1, 2);
00566         MulAcc(2, 1);
00567         MulAcc(3, 0);
00568 
00569         SaveMulAcc(3, 0, 4);
00570         MulAcc(1, 3);
00571         MulAcc(2, 2);
00572         MulAcc(3, 1);
00573         MulAcc(4, 0);
00574 
00575         SaveMulAcc(4, 0, 5);
00576         MulAcc(1, 4);
00577         MulAcc(2, 3);
00578         MulAcc(3, 2);
00579         MulAcc(4, 1);
00580         MulAcc(5, 0);
00581 
00582         SaveMulAcc(5, 0, 6);
00583         MulAcc(1, 5);
00584         MulAcc(2, 4);
00585         MulAcc(3, 3);
00586         MulAcc(4, 2);
00587         MulAcc(5, 1);
00588         MulAcc(6, 0);
00589 
00590         R[6] = c;
00591         R[7] = d + A[0] * B[7] + A[1] * B[6] + A[2] * B[5] + A[3] * B[4] +
00592                                 A[4] * B[3] + A[5] * B[2] + A[6] * B[1] + A[7] * B[0];
00593 }
00594 
00595 #undef MulAcc
00596 #undef SaveMulAcc
00597 #undef SquAcc
00598 #undef SaveSquAcc
00599 
00600 // CodeWarrior defines _MSC_VER
00601 #if defined(_MSC_VER) && !defined(__MWERKS__) && defined(_M_IX86) && (_M_IX86<=700)
00602 
00603 class PentiumOptimized : public Portable
00604 {
00605 public:
00606         static word __fastcall Add(word *C, const word *A, const word *B, unsigned int N);
00607         static word __fastcall Subtract(word *C, const word *A, const word *B, unsigned int N);
00608         static inline void Square4(word *R, const word *A)
00609         {
00610                 // VC60 workaround: MSVC 6.0 has an optimization bug that makes
00611                 // (dword)A*B where either A or B has been cast to a dword before
00612                 // very expensive. Revisit this function when this
00613                 // bug is fixed.
00614                 Multiply4(R, A, A);
00615         }
00616 };
00617 
00618 typedef PentiumOptimized LowLevel;
00619 
00620 __declspec(naked) word __fastcall PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N)
00621 {
00622         __asm
00623         {
00624                 push ebp
00625                 push ebx
00626                 push esi
00627                 push edi
00628 
00629                 mov esi, [esp+24]       ; N
00630                 mov ebx, [esp+20]       ; B
00631 
00632                 // now: ebx = B, ecx = C, edx = A, esi = N
00633 
00634                 sub ecx, edx    // hold the distance between C & A so we can add this to A to get C
00635                 xor eax, eax    // clear eax
00636 
00637                 sub eax, esi    // eax is a negative index from end of B
00638                 lea ebx, [ebx+4*esi]    // ebx is end of B
00639 
00640                 sar eax, 1              // unit of eax is now dwords; this also clears the carry flag
00641                 jz      loopend         // if no dwords then nothing to do
00642 
00643 loopstart:
00644                 mov    esi,[edx]                        // load lower word of A
00645                 mov    ebp,[edx+4]                      // load higher word of A
00646 
00647                 mov    edi,[ebx+8*eax]          // load lower word of B
00648                 lea    edx,[edx+8]                      // advance A and C
00649 
00650                 adc    esi,edi                          // add lower words
00651                 mov    edi,[ebx+8*eax+4]        // load higher word of B
00652 
00653                 adc    ebp,edi                          // add higher words
00654                 inc    eax                                      // advance B
00655 
00656                 mov    [edx+ecx-8],esi          // store lower word result
00657                 mov    [edx+ecx-4],ebp          // store higher word result
00658 
00659                 jnz    loopstart                        // loop until eax overflows and becomes zero
00660 
00661 loopend:
00662                 adc eax, 0              // store carry into eax (return result register)
00663                 pop edi
00664                 pop esi
00665                 pop ebx
00666                 pop ebp
00667                 ret 8
00668         }
00669 }
00670 
00671 __declspec(naked) word __fastcall PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
00672 {
00673         __asm
00674         {
00675                 push ebp
00676                 push ebx
00677                 push esi
00678                 push edi
00679 
00680                 mov esi, [esp+24]       ; N
00681                 mov ebx, [esp+20]       ; B
00682 
00683                 sub ecx, edx
00684                 xor eax, eax
00685 
00686                 sub eax, esi
00687                 lea ebx, [ebx+4*esi]
00688 
00689                 sar eax, 1
00690                 jz      loopend
00691 
00692 loopstart:
00693                 mov    esi,[edx]
00694                 mov    ebp,[edx+4]
00695 
00696                 mov    edi,[ebx+8*eax]
00697                 lea    edx,[edx+8]
00698 
00699                 sbb    esi,edi
00700                 mov    edi,[ebx+8*eax+4]
00701 
00702                 sbb    ebp,edi
00703                 inc    eax
00704 
00705                 mov    [edx+ecx-8],esi
00706                 mov    [edx+ecx-4],ebp
00707 
00708                 jnz    loopstart
00709 
00710 loopend:
00711                 adc eax, 0
00712                 pop edi
00713                 pop esi
00714                 pop ebx
00715                 pop ebp
00716                 ret 8
00717         }
00718 }
00719 
00720 #ifdef SSE2_INTRINSICS_AVAILABLE
00721 
00722 static bool GetSSE2Capability()
00723 {
00724         word32 b;
00725 
00726         __asm
00727         {
00728                 mov             eax, 1
00729                 cpuid
00730                 mov             b, edx
00731         }
00732 
00733         return (b & (1 << 26)) != 0;
00734 }
00735 
00736 bool g_sse2DetectionDone = false, g_sse2Detected, g_sse2Enabled = true;
00737 
00738 void DisableSSE2()
00739 {
00740         g_sse2Enabled = false;
00741 }
00742 
00743 static inline bool HasSSE2()
00744 {
00745         if (g_sse2Enabled && !g_sse2DetectionDone)
00746         {
00747                 g_sse2Detected = GetSSE2Capability();
00748                 g_sse2DetectionDone = true;
00749         }
00750         return g_sse2Enabled && g_sse2Detected;
00751 }
00752 
00753 class P4Optimized : public PentiumOptimized
00754 {
00755 public:
00756         static word __fastcall Add(word *C, const word *A, const word *B, unsigned int N);
00757         static word __fastcall Subtract(word *C, const word *A, const word *B, unsigned int N);
00758         static void Multiply4(word *C, const word *A, const word *B);
00759         static void Multiply8(word *C, const word *A, const word *B);
00760         static inline void Square4(word *R, const word *A)
00761         {
00762                 Multiply4(R, A, A);
00763         }
00764         static void Multiply8Bottom(word *C, const word *A, const word *B);
00765 };
00766 
00767 static void __fastcall P4_Mul(__m128i *C, const __m128i *A, const __m128i *B)
00768 {
00769         __m128i a3210 = _mm_load_si128(A);
00770         __m128i b3210 = _mm_load_si128(B);
00771 
00772         __m128i sum;
00773 
00774         __m128i z = _mm_setzero_si128();
00775         __m128i a2b2_a0b0 = _mm_mul_epu32(a3210, b3210);
00776         C[0] = a2b2_a0b0;
00777 
00778         __m128i a3120 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(3, 1, 2, 0));
00779         __m128i b3021 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 2, 1));
00780         __m128i a1b0_a0b1 = _mm_mul_epu32(a3120, b3021);
00781         __m128i a1b0 = _mm_unpackhi_epi32(a1b0_a0b1, z);
00782         __m128i a0b1 = _mm_unpacklo_epi32(a1b0_a0b1, z);
00783         C[1] = _mm_add_epi64(a1b0, a0b1);
00784 
00785         __m128i a31 = _mm_srli_epi64(a3210, 32);
00786         __m128i b31 = _mm_srli_epi64(b3210, 32);
00787         __m128i a3b3_a1b1 = _mm_mul_epu32(a31, b31);
00788         C[6] = a3b3_a1b1;
00789 
00790         __m128i a1b1 = _mm_unpacklo_epi32(a3b3_a1b1, z);
00791         __m128i b3012 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 1, 2));
00792         __m128i a2b0_a0b2 = _mm_mul_epu32(a3210, b3012);
00793         __m128i a0b2 = _mm_unpacklo_epi32(a2b0_a0b2, z);
00794         __m128i a2b0 = _mm_unpackhi_epi32(a2b0_a0b2, z);
00795         sum = _mm_add_epi64(a1b1, a0b2);
00796         C[2] = _mm_add_epi64(sum, a2b0);
00797 
00798         __m128i a2301 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(2, 3, 0, 1));
00799         __m128i b2103 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(2, 1, 0, 3));
00800         __m128i a3b0_a1b2 = _mm_mul_epu32(a2301, b3012);
00801         __m128i a2b1_a0b3 = _mm_mul_epu32(a3210, b2103);
00802         __m128i a3b0 = _mm_unpackhi_epi32(a3b0_a1b2, z);
00803         __m128i a1b2 = _mm_unpacklo_epi32(a3b0_a1b2, z);
00804         __m128i a2b1 = _mm_unpackhi_epi32(a2b1_a0b3, z);
00805         __m128i a0b3 = _mm_unpacklo_epi32(a2b1_a0b3, z);
00806         __m128i sum1 = _mm_add_epi64(a3b0, a1b2);
00807         sum = _mm_add_epi64(a2b1, a0b3);
00808         C[3] = _mm_add_epi64(sum, sum1);
00809 
00810         __m128i a3b1_a1b3 = _mm_mul_epu32(a2301, b2103);
00811         __m128i a2b2 = _mm_unpackhi_epi32(a2b2_a0b0, z);
00812         __m128i a3b1 = _mm_unpackhi_epi32(a3b1_a1b3, z);
00813         __m128i a1b3 = _mm_unpacklo_epi32(a3b1_a1b3, z);
00814         sum = _mm_add_epi64(a2b2, a3b1);
00815         C[4] = _mm_add_epi64(sum, a1b3);
00816 
00817         __m128i a1302 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(1, 3, 0, 2));
00818         __m128i b1203 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(1, 2, 0, 3));
00819         __m128i a3b2_a2b3 = _mm_mul_epu32(a1302, b1203);
00820         __m128i a3b2 = _mm_unpackhi_epi32(a3b2_a2b3, z);
00821         __m128i a2b3 = _mm_unpacklo_epi32(a3b2_a2b3, z);
00822         C[5] = _mm_add_epi64(a3b2, a2b3);
00823 }
00824 
00825 void P4Optimized::Multiply4(word *C, const word *A, const word *B)
00826 {
00827         __m128i temp[7];
00828         const word *w = (word *)temp;
00829         const __m64 *mw = (__m64 *)w;
00830 
00831         P4_Mul(temp, (__m128i *)A, (__m128i *)B);
00832 
00833         C[0] = w[0];
00834 
00835         __m64 s1, s2;
00836 
00837         __m64 w1 = _m_from_int(w[1]);
00838         __m64 w4 = mw[2];
00839         __m64 w6 = mw[3];
00840         __m64 w8 = mw[4];
00841         __m64 w10 = mw[5];
00842         __m64 w12 = mw[6];
00843         __m64 w14 = mw[7];
00844         __m64 w16 = mw[8];
00845         __m64 w18 = mw[9];
00846         __m64 w20 = mw[10];
00847         __m64 w22 = mw[11];
00848         __m64 w26 = _m_from_int(w[26]);
00849 
00850         s1 = _mm_add_si64(w1, w4);
00851         C[1] = _m_to_int(s1);
00852         s1 = _m_psrlqi(s1, 32);
00853 
00854         s2 = _mm_add_si64(w6, w8);
00855         s1 = _mm_add_si64(s1, s2);
00856         C[2] = _m_to_int(s1);
00857         s1 = _m_psrlqi(s1, 32);
00858 
00859         s2 = _mm_add_si64(w10, w12);
00860         s1 = _mm_add_si64(s1, s2);
00861         C[3] = _m_to_int(s1);
00862         s1 = _m_psrlqi(s1, 32);
00863 
00864         s2 = _mm_add_si64(w14, w16);
00865         s1 = _mm_add_si64(s1, s2);
00866         C[4] = _m_to_int(s1);
00867         s1 = _m_psrlqi(s1, 32);
00868 
00869         s2 = _mm_add_si64(w18, w20);
00870         s1 = _mm_add_si64(s1, s2);
00871         C[5] = _m_to_int(s1);
00872         s1 = _m_psrlqi(s1, 32);
00873 
00874         s2 = _mm_add_si64(w22, w26);
00875         s1 = _mm_add_si64(s1, s2);
00876         C[6] = _m_to_int(s1);
00877         s1 = _m_psrlqi(s1, 32);
00878 
00879         C[7] = _m_to_int(s1) + w[27];
00880         _mm_empty();
00881 }
00882 
00883 void P4Optimized::Multiply8(word *C, const word *A, const word *B)
00884 {
00885         __m128i temp[28];
00886         const word *w = (word *)temp;
00887         const __m64 *mw = (__m64 *)w;
00888         const word *x = (word *)temp+7*4;
00889         const __m64 *mx = (__m64 *)x;
00890         const word *y = (word *)temp+7*4*2;
00891         const __m64 *my = (__m64 *)y;
00892         const word *z = (word *)temp+7*4*3;
00893         const __m64 *mz = (__m64 *)z;
00894 
00895         P4_Mul(temp, (__m128i *)A, (__m128i *)B);
00896 
00897         P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
00898 
00899         P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
00900 
00901         P4_Mul(temp+21, (__m128i *)A+1, (__m128i *)B+1);
00902 
00903         C[0] = w[0];
00904 
00905         __m64 s1, s2, s3, s4;
00906 
00907         __m64 w1 = _m_from_int(w[1]);
00908         __m64 w4 = mw[2];
00909         __m64 w6 = mw[3];
00910         __m64 w8 = mw[4];
00911         __m64 w10 = mw[5];
00912         __m64 w12 = mw[6];
00913         __m64 w14 = mw[7];
00914         __m64 w16 = mw[8];
00915         __m64 w18 = mw[9];
00916         __m64 w20 = mw[10];
00917         __m64 w22 = mw[11];
00918         __m64 w26 = _m_from_int(w[26]);
00919         __m64 w27 = _m_from_int(w[27]);
00920 
00921         __m64 x0 = _m_from_int(x[0]);
00922         __m64 x1 = _m_from_int(x[1]);
00923         __m64 x4 = mx[2];
00924         __m64 x6 = mx[3];
00925         __m64 x8 = mx[4];
00926         __m64 x10 = mx[5];
00927         __m64 x12 = mx[6];
00928         __m64 x14 = mx[7];
00929         __m64 x16 = mx[8];
00930         __m64 x18 = mx[9];
00931         __m64 x20 = mx[10];
00932         __m64 x22 = mx[11];
00933         __m64 x26 = _m_from_int(x[26]);
00934         __m64 x27 = _m_from_int(x[27]);
00935 
00936         __m64 y0 = _m_from_int(y[0]);
00937         __m64 y1 = _m_from_int(y[1]);
00938         __m64 y4 = my[2];
00939         __m64 y6 = my[3];
00940         __m64 y8 = my[4];
00941         __m64 y10 = my[5];
00942         __m64 y12 = my[6];
00943         __m64 y14 = my[7];
00944         __m64 y16 = my[8];
00945         __m64 y18 = my[9];
00946         __m64 y20 = my[10];
00947         __m64 y22 = my[11];
00948         __m64 y26 = _m_from_int(y[26]);
00949         __m64 y27 = _m_from_int(y[27]);
00950 
00951         __m64 z0 = _m_from_int(z[0]);
00952         __m64 z1 = _m_from_int(z[1]);
00953         __m64 z4 = mz[2];
00954         __m64 z6 = mz[3];
00955         __m64 z8 = mz[4];
00956         __m64 z10 = mz[5];
00957         __m64 z12 = mz[6];
00958         __m64 z14 = mz[7];
00959         __m64 z16 = mz[8];
00960         __m64 z18 = mz[9];
00961         __m64 z20 = mz[10];
00962         __m64 z22 = mz[11];
00963         __m64 z26 = _m_from_int(z[26]);
00964 
00965         s1 = _mm_add_si64(w1, w4);
00966         C[1] = _m_to_int(s1);
00967         s1 = _m_psrlqi(s1, 32);
00968 
00969         s2 = _mm_add_si64(w6, w8);
00970         s1 = _mm_add_si64(s1, s2);
00971         C[2] = _m_to_int(s1);
00972         s1 = _m_psrlqi(s1, 32);
00973 
00974         s2 = _mm_add_si64(w10, w12);
00975         s1 = _mm_add_si64(s1, s2);
00976         C[3] = _m_to_int(s1);
00977         s1 = _m_psrlqi(s1, 32);
00978 
00979         s3 = _mm_add_si64(x0, y0);
00980         s2 = _mm_add_si64(w14, w16);
00981         s1 = _mm_add_si64(s1, s3);
00982         s1 = _mm_add_si64(s1, s2);
00983         C[4] = _m_to_int(s1);
00984         s1 = _m_psrlqi(s1, 32);
00985 
00986         s3 = _mm_add_si64(x1, y1);
00987         s4 = _mm_add_si64(x4, y4);
00988         s1 = _mm_add_si64(s1, w18);
00989         s3 = _mm_add_si64(s3, s4);
00990         s1 = _mm_add_si64(s1, w20);
00991         s1 = _mm_add_si64(s1, s3);
00992         C[5] = _m_to_int(s1);
00993         s1 = _m_psrlqi(s1, 32);
00994 
00995         s3 = _mm_add_si64(x6, y6);
00996         s4 = _mm_add_si64(x8, y8);
00997         s1 = _mm_add_si64(s1, w22);
00998         s3 = _mm_add_si64(s3, s4);
00999         s1 = _mm_add_si64(s1, w26);
01000         s1 = _mm_add_si64(s1, s3);
01001         C[6] = _m_to_int(s1);
01002         s1 = _m_psrlqi(s1, 32);
01003 
01004         s3 = _mm_add_si64(x10, y10);
01005         s4 = _mm_add_si64(x12, y12);
01006         s1 = _mm_add_si64(s1, w27);
01007         s3 = _mm_add_si64(s3, s4);
01008         s1 = _mm_add_si64(s1, s3);
01009         C[7] = _m_to_int(s1);
01010         s1 = _m_psrlqi(s1, 32);
01011 
01012         s3 = _mm_add_si64(x14, y14);
01013         s4 = _mm_add_si64(x16, y16);
01014         s1 = _mm_add_si64(s1, z0);
01015         s3 = _mm_add_si64(s3, s4);
01016         s1 = _mm_add_si64(s1, s3);
01017         C[8] = _m_to_int(s1);
01018         s1 = _m_psrlqi(s1, 32);
01019 
01020         s3 = _mm_add_si64(x18, y18);
01021         s4 = _mm_add_si64(x20, y20);
01022         s1 = _mm_add_si64(s1, z1);
01023         s3 = _mm_add_si64(s3, s4);
01024         s1 = _mm_add_si64(s1, z4);
01025         s1 = _mm_add_si64(s1, s3);
01026         C[9] = _m_to_int(s1);
01027         s1 = _m_psrlqi(s1, 32);
01028 
01029         s3 = _mm_add_si64(x22, y22);
01030         s4 = _mm_add_si64(x26, y26);
01031         s1 = _mm_add_si64(s1, z6);
01032         s3 = _mm_add_si64(s3, s4);
01033         s1 = _mm_add_si64(s1, z8);
01034         s1 = _mm_add_si64(s1, s3);
01035         C[10] = _m_to_int(s1);
01036         s1 = _m_psrlqi(s1, 32);
01037 
01038         s3 = _mm_add_si64(x27, y27);
01039         s1 = _mm_add_si64(s1, z10);
01040         s1 = _mm_add_si64(s1, z12);
01041         s1 = _mm_add_si64(s1, s3);
01042         C[11] = _m_to_int(s1);
01043         s1 = _m_psrlqi(s1, 32);
01044 
01045         s3 = _mm_add_si64(z14, z16);
01046         s1 = _mm_add_si64(s1, s3);
01047         C[12] = _m_to_int(s1);
01048         s1 = _m_psrlqi(s1, 32);
01049 
01050         s3 = _mm_add_si64(z18, z20);
01051         s1 = _mm_add_si64(s1, s3);
01052         C[13] = _m_to_int(s1);
01053         s1 = _m_psrlqi(s1, 32);
01054 
01055         s3 = _mm_add_si64(z22, z26);
01056         s1 = _mm_add_si64(s1, s3);
01057         C[14] = _m_to_int(s1);
01058         s1 = _m_psrlqi(s1, 32);
01059 
01060         C[15] = z[27] + _m_to_int(s1);
01061         _mm_empty();
01062 }
01063 
01064 void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B)
01065 {
01066         __m128i temp[21];
01067         const word *w = (word *)temp;
01068         const __m64 *mw = (__m64 *)w;
01069         const word *x = (word *)temp+7*4;
01070         const __m64 *mx = (__m64 *)x;
01071         const word *y = (word *)temp+7*4*2;
01072         const __m64 *my = (__m64 *)y;
01073 
01074         P4_Mul(temp, (__m128i *)A, (__m128i *)B);
01075 
01076         P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
01077 
01078         P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
01079 
01080         C[0] = w[0];
01081 
01082         __m64 s1, s2, s3, s4;
01083 
01084         __m64 w1 = _m_from_int(w[1]);
01085         __m64 w4 = mw[2];
01086         __m64 w6 = mw[3];
01087         __m64 w8 = mw[4];
01088         __m64 w10 = mw[5];
01089         __m64 w12 = mw[6];
01090         __m64 w14 = mw[7];
01091         __m64 w16 = mw[8];
01092         __m64 w18 = mw[9];
01093         __m64 w20 = mw[10];
01094         __m64 w22 = mw[11];
01095         __m64 w26 = _m_from_int(w[26]);
01096 
01097         __m64 x0 = _m_from_int(x[0]);
01098         __m64 x1 = _m_from_int(x[1]);
01099         __m64 x4 = mx[2];
01100         __m64 x6 = mx[3];
01101         __m64 x8 = mx[4];
01102 
01103         __m64 y0 = _m_from_int(y[0]);
01104         __m64 y1 = _m_from_int(y[1]);
01105         __m64 y4 = my[2];
01106         __m64 y6 = my[3];
01107         __m64 y8 = my[4];
01108 
01109         s1 = _mm_add_si64(w1, w4);
01110         C[1] = _m_to_int(s1);
01111         s1 = _m_psrlqi(s1, 32);
01112 
01113         s2 = _mm_add_si64(w6, w8);
01114         s1 = _mm_add_si64(s1, s2);
01115         C[2] = _m_to_int(s1);
01116         s1 = _m_psrlqi(s1, 32);
01117 
01118         s2 = _mm_add_si64(w10, w12);
01119         s1 = _mm_add_si64(s1, s2);
01120         C[3] = _m_to_int(s1);
01121         s1 = _m_psrlqi(s1, 32);
01122 
01123         s3 = _mm_add_si64(x0, y0);
01124         s2 = _mm_add_si64(w14, w16);
01125         s1 = _mm_add_si64(s1, s3);
01126         s1 = _mm_add_si64(s1, s2);
01127         C[4] = _m_to_int(s1);
01128         s1 = _m_psrlqi(s1, 32);
01129 
01130         s3 = _mm_add_si64(x1, y1);
01131         s4 = _mm_add_si64(x4, y4);
01132         s1 = _mm_add_si64(s1, w18);
01133         s3 = _mm_add_si64(s3, s4);
01134         s1 = _mm_add_si64(s1, w20);
01135         s1 = _mm_add_si64(s1, s3);
01136         C[5] = _m_to_int(s1);
01137         s1 = _m_psrlqi(s1, 32);
01138 
01139         s3 = _mm_add_si64(x6, y6);
01140         s4 = _mm_add_si64(x8, y8);
01141         s1 = _mm_add_si64(s1, w22);
01142         s3 = _mm_add_si64(s3, s4);
01143         s1 = _mm_add_si64(s1, w26);
01144         s1 = _mm_add_si64(s1, s3);
01145         C[6] = _m_to_int(s1);
01146         s1 = _m_psrlqi(s1, 32);
01147 
01148         C[7] = _m_to_int(s1) + w[27] + x[10] + y[10] + x[12] + y[12];
01149         _mm_empty();
01150 }
01151 
01152 __declspec(naked) word __fastcall P4Optimized::Add(word *C, const word *A, const word *B, unsigned int N)
01153 {
01154         __asm
01155         {
01156                 sub             esp, 16
01157                 xor             eax, eax
01158                 mov             [esp], edi
01159                 mov             [esp+4], esi
01160                 mov             [esp+8], ebx
01161                 mov             [esp+12], ebp
01162 
01163                 mov             ebx, [esp+20]   // B
01164                 mov             esi, [esp+24]   // N
01165 
01166                 // now: ebx = B, ecx = C, edx = A, esi = N
01167 
01168                 neg             esi
01169                 jz              loopend         // if no dwords then nothing to do
01170 
01171                 mov             edi, [edx]
01172                 mov             ebp, [ebx]
01173 
01174 loopstart:
01175                 add             edi, eax
01176                 jc              carry1
01177 
01178                 xor             eax, eax
01179 
01180 carry1continue:
01181                 add             edi, ebp
01182                 mov             ebp, 1
01183                 mov             [ecx], edi
01184                 mov             edi, [edx+4]
01185                 cmovc   eax, ebp
01186                 mov             ebp, [ebx+4]
01187                 lea             ebx, [ebx+8]
01188                 add             edi, eax
01189                 jc              carry2
01190 
01191                 xor             eax, eax
01192 
01193 carry2continue:
01194                 add             edi, ebp
01195                 mov             ebp, 1
01196                 cmovc   eax, ebp
01197                 mov             [ecx+4], edi
01198                 add             ecx, 8
01199                 mov             edi, [edx+8]
01200                 add             edx, 8
01201                 add             esi, 2
01202                 mov             ebp, [ebx]
01203                 jnz             loopstart
01204 
01205 loopend:
01206                 mov             edi, [esp]
01207                 mov             esi, [esp+4]
01208                 mov             ebx, [esp+8]
01209                 mov             ebp, [esp+12]
01210                 add             esp, 16
01211                 ret             8
01212 
01213 carry1:
01214                 mov             eax, 1
01215                 jmp             carry1continue
01216 
01217 carry2:
01218                 mov             eax, 1
01219                 jmp             carry2continue
01220         }
01221 }
01222 
01223 __declspec(naked) word __fastcall P4Optimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
01224 {
01225         __asm
01226         {
01227                 sub             esp, 16
01228                 xor             eax, eax
01229                 mov             [esp], edi
01230                 mov             [esp+4], esi
01231                 mov             [esp+8], ebx
01232                 mov             [esp+12], ebp
01233 
01234                 mov             ebx, [esp+20]   // B
01235                 mov             esi, [esp+24]   // N
01236 
01237                 // now: ebx = B, ecx = C, edx = A, esi = N
01238 
01239                 neg             esi
01240                 jz              loopend         // if no dwords then nothing to do
01241 
01242                 mov             edi, [edx]
01243                 mov             ebp, [ebx]
01244 
01245 loopstart:
01246                 sub             edi, eax
01247                 jc              carry1
01248 
01249                 xor             eax, eax
01250 
01251 carry1continue:
01252                 sub             edi, ebp
01253                 mov             ebp, 1
01254                 mov             [ecx], edi
01255                 mov             edi, [edx+4]
01256                 cmovc   eax, ebp
01257                 mov             ebp, [ebx+4]
01258                 lea             ebx, [ebx+8]
01259                 sub             edi, eax
01260                 jc              carry2
01261 
01262                 xor             eax, eax
01263 
01264 carry2continue:
01265                 sub             edi, ebp
01266                 mov             ebp, 1
01267                 cmovc   eax, ebp
01268                 mov             [ecx+4], edi
01269                 add             ecx, 8
01270                 mov             edi, [edx+8]
01271                 add             edx, 8
01272                 add             esi, 2
01273                 mov             ebp, [ebx]
01274                 jnz             loopstart
01275 
01276 loopend:
01277                 mov             edi, [esp]
01278                 mov             esi, [esp+4]
01279                 mov             ebx, [esp+8]
01280                 mov             ebp, [esp+12]
01281                 add             esp, 16
01282                 ret             8
01283 
01284 carry1:
01285                 mov             eax, 1
01286                 jmp             carry1continue
01287 
01288 carry2:
01289                 mov             eax, 1
01290                 jmp             carry2continue
01291         }
01292 }
01293 
01294 #endif  // #ifdef SSE2_INTRINSICS_AVAILABLE
01295 
01296 #elif defined(__GNUC__) && defined(__i386__)
01297 
01298 class PentiumOptimized : public Portable
01299 {
01300 public:
01301         static word Add(word *C, const word *A, const word *B, unsigned int N);
01302         static word Subtract(word *C, const word *A, const word *B, unsigned int N);
01303         static void Square4(word *R, const word *A);
01304         static void Multiply4(word *C, const word *A, const word *B);
01305         static void Multiply8(word *C, const word *A, const word *B);
01306 };
01307 
01308 typedef PentiumOptimized LowLevel;
01309 
01310 // Add and Subtract assembly code originally contributed by Alister Lee
01311 
01312 __attribute__((regparm(3))) word PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N)
01313 {
01314         assert (N%2 == 0);
01315 
01316         register word carry, temp;
01317 
01318         __asm__ __volatile__(
01319                         "push %%ebp;"
01320                         "sub %3, %2;"
01321                         "xor %0, %0;"
01322                         "sub %4, %0;"
01323                         "lea (%1,%4,4), %1;"
01324                         "sar $1, %0;"
01325                         "jz 1f;"
01326 
01327                 "0:;"
01328                         "mov 0(%3), %4;"
01329                         "mov 4(%3), %%ebp;"
01330                         "mov (%1,%0,8), %5;"
01331                         "lea 8(%3), %3;"
01332                         "adc %5, %4;"
01333                         "mov 4(%1,%0,8), %5;"
01334                         "adc %5, %%ebp;"
01335                         "inc %0;"
01336                         "mov %4, -8(%3, %2);"
01337                         "mov %%ebp, -4(%3, %2);"
01338                         "jnz 0b;"
01339 
01340                 "1:;"
01341                         "adc $0, %0;"
01342                         "pop %%ebp;"
01343 
01344                 : "=aSD" (carry), "+r" (B), "+r" (C), "+r" (A), "+r" (N), "=r" (temp)
01345                 : : "cc", "memory");
01346 
01347         return carry;
01348 }
01349 
01350 __attribute__((regparm(3))) word PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
01351 {
01352         assert (N%2 == 0);
01353 
01354         register word carry, temp;
01355 
01356         __asm__ __volatile__(
01357                         "push %%ebp;"
01358                         "sub %3, %2;"
01359                         "xor %0, %0;"
01360                         "sub %4, %0;"
01361                         "lea (%1,%4,4), %1;"
01362                         "sar $1, %0;"
01363                         "jz 1f;"
01364 
01365                 "0:;"
01366                         "mov 0(%3), %4;"
01367                         "mov 4(%3), %%ebp;"
01368                         "mov (%1,%0,8), %5;"
01369                         "lea 8(%3), %3;"
01370                         "sbb %5, %4;"
01371                         "mov 4(%1,%0,8), %5;"
01372                         "sbb %5, %%ebp;"
01373                         "inc %0;"
01374                         "mov %4, -8(%3, %2);"
01375                         "mov %%ebp, -4(%3, %2);"
01376                         "jnz 0b;"
01377 
01378                 "1:;"
01379                         "adc $0, %0;"
01380                         "pop %%ebp;"
01381 
01382                 : "=aSD" (carry), "+r" (B), "+r" (C), "+r" (A), "+r" (N), "=r" (temp)
01383                 : : "cc", "memory");
01384 
01385         return carry;
01386 }
01387 
01388 // Comba square and multiply assembly code originally contributed by Leonard Janke
01389 
01390 #define SqrStartup \
01391   "push %%ebp\n\t" \
01392   "push %%esi\n\t" \
01393   "push %%ebx\n\t" \
01394   "xor %%ebp, %%ebp\n\t" \
01395   "xor %%ebx, %%ebx\n\t" \
01396   "xor %%ecx, %%ecx\n\t" 
01397 
01398 #define SqrShiftCarry \
01399   "mov %%ebx, %%ebp\n\t" \
01400   "mov %%ecx, %%ebx\n\t" \
01401   "xor %%ecx, %%ecx\n\t"
01402 
01403 #define SqrAccumulate(i,j) \
01404   "mov 4*"#j"(%%esi), %%eax\n\t" \
01405   "mull 4*"#i"(%%esi)\n\t" \
01406   "add %%eax, %%ebp\n\t" \
01407   "adc %%edx, %%ebx\n\t" \
01408   "adc %%ch, %%cl\n\t" \
01409   "add %%eax, %%ebp\n\t" \
01410   "adc %%edx, %%ebx\n\t" \
01411   "adc %%ch, %%cl\n\t"
01412 
01413 #define SqrAccumulateCentre(i) \
01414   "mov 4*"#i"(%%esi), %%eax\n\t" \
01415   "mull 4*"#i"(%%esi)\n\t" \
01416   "add %%eax, %%ebp\n\t" \
01417   "adc %%edx, %%ebx\n\t" \
01418   "adc %%ch, %%cl\n\t" 
01419 
01420 #define SqrStoreDigit(X)  \
01421   "mov %%ebp, 4*"#X"(%%edi)\n\t" \
01422 
01423 #define SqrLastDiagonal(digits) \
01424   "mov 4*("#digits"-1)(%%esi), %%eax\n\t" \
01425   "mull 4*("#digits"-1)(%%esi)\n\t" \
01426   "add %%eax, %%ebp\n\t" \
01427   "adc %%edx, %%ebx\n\t" \
01428   "mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \
01429   "mov %%ebx, 4*(2*"#digits"-1)(%%edi)\n\t" 
01430 
01431 #define SqrCleanup \
01432   "pop %%ebx\n\t" \
01433   "pop %%esi\n\t" \
01434   "pop %%ebp\n\t" 
01435 
01436 void PentiumOptimized::Square4(word* Y, const word* X)
01437 {
01438         __asm__ __volatile__(
01439                 SqrStartup
01440 
01441                 SqrAccumulateCentre(0)
01442                 SqrStoreDigit(0)
01443                 SqrShiftCarry
01444 
01445                 SqrAccumulate(1,0)
01446                 SqrStoreDigit(1)
01447                 SqrShiftCarry
01448 
01449                 SqrAccumulate(2,0)
01450                 SqrAccumulateCentre(1)
01451                 SqrStoreDigit(2)
01452                 SqrShiftCarry
01453 
01454                 SqrAccumulate(3,0)
01455                 SqrAccumulate(2,1)
01456                 SqrStoreDigit(3)
01457                 SqrShiftCarry
01458 
01459                 SqrAccumulate(3,1)
01460                 SqrAccumulateCentre(2)
01461                 SqrStoreDigit(4)
01462                 SqrShiftCarry
01463 
01464                 SqrAccumulate(3,2)
01465                 SqrStoreDigit(5)
01466                 SqrShiftCarry
01467 
01468                 SqrLastDiagonal(4)
01469 
01470                 SqrCleanup
01471 
01472                 :
01473                 : "D" (Y), "S" (X)
01474                 : "eax",  "ecx", "edx", "ebp",   "memory"
01475         );
01476 }
01477 
01478 #define MulStartup \
01479   "push %%ebp\n\t" \
01480   "push %%esi\n\t" \
01481   "push %%ebx\n\t" \
01482   "push %%edi\n\t" \
01483   "mov %%eax, %%ebx \n\t" \
01484   "xor %%ebp, %%ebp\n\t" \
01485   "xor %%edi, %%edi\n\t" \
01486   "xor %%ecx, %%ecx\n\t" 
01487 
01488 #define MulShiftCarry \
01489   "mov %%edx, %%ebp\n\t" \
01490   "mov %%ecx, %%edi\n\t" \
01491   "xor %%ecx, %%ecx\n\t"
01492 
01493 #define MulAccumulate(i,j) \
01494   "mov 4*"#j"(%%ebx), %%eax\n\t" \
01495   "mull 4*"#i"(%%esi)\n\t" \
01496   "add %%eax, %%ebp\n\t" \
01497   "adc %%edx, %%edi\n\t" \
01498   "adc %%ch, %%cl\n\t"
01499 
01500 #define MulStoreDigit(X)  \
01501   "mov %%edi, %%edx \n\t" \
01502   "mov (%%esp), %%edi \n\t" \
01503   "mov %%ebp, 4*"#X"(%%edi)\n\t" \
01504   "mov %%edi, (%%esp)\n\t" 
01505 
01506 #define MulLastDiagonal(digits) \
01507   "mov 4*("#digits"-1)(%%ebx), %%eax\n\t" \
01508   "mull 4*("#digits"-1)(%%esi)\n\t" \
01509   "add %%eax, %%ebp\n\t" \
01510   "adc %%edi, %%edx\n\t" \
01511   "mov (%%esp), %%edi\n\t" \
01512   "mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \
01513   "mov %%edx, 4*(2*"#digits"-1)(%%edi)\n\t" 
01514 
01515 #define MulCleanup \
01516   "pop %%edi\n\t" \
01517   "pop %%ebx\n\t" \
01518   "pop %%esi\n\t" \
01519   "pop %%ebp\n\t" 
01520 
01521 void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y)
01522 {
01523         __asm__ __volatile__(
01524                 MulStartup
01525                 MulAccumulate(0,0)
01526                 MulStoreDigit(0)
01527                 MulShiftCarry
01528 
01529                 MulAccumulate(1,0)
01530                 MulAccumulate(0,1)
01531                 MulStoreDigit(1)
01532                 MulShiftCarry
01533 
01534                 MulAccumulate(2,0)
01535                 MulAccumulate(1,1)
01536                 MulAccumulate(0,2)
01537                 MulStoreDigit(2)
01538                 MulShiftCarry
01539 
01540                 MulAccumulate(3,0)
01541                 MulAccumulate(2,1)
01542                 MulAccumulate(1,2)
01543                 MulAccumulate(0,3)
01544                 MulStoreDigit(3)
01545                 MulShiftCarry
01546 
01547                 MulAccumulate(3,1)
01548                 MulAccumulate(2,2)
01549                 MulAccumulate(1,3)
01550                 MulStoreDigit(4)
01551                 MulShiftCarry
01552 
01553                 MulAccumulate(3,2)
01554                 MulAccumulate(2,3)
01555                 MulStoreDigit(5)
01556                 MulShiftCarry
01557 
01558                 MulLastDiagonal(4)
01559 
01560                 MulCleanup
01561 
01562                 : 
01563                 : "D" (Z), "S" (X), "a" (Y)
01564                 : "%ecx", "%edx",  "memory"
01565         );
01566 }
01567 
01568 void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y)
01569 {
01570         __asm__ __volatile__(
01571                 MulStartup
01572                 MulAccumulate(0,0)
01573                 MulStoreDigit(0)
01574                 MulShiftCarry
01575 
01576                 MulAccumulate(1,0)
01577                 MulAccumulate(0,1)
01578                 MulStoreDigit(1)
01579                 MulShiftCarry
01580 
01581                 MulAccumulate(2,0)
01582                 MulAccumulate(1,1)
01583                 MulAccumulate(0,2)
01584                 MulStoreDigit(2)
01585                 MulShiftCarry
01586 
01587                 MulAccumulate(3,0)
01588                 MulAccumulate(2,1)
01589                 MulAccumulate(1,2)
01590                 MulAccumulate(0,3)
01591                 MulStoreDigit(3)
01592                 MulShiftCarry
01593 
01594                 MulAccumulate(4,0)
01595                 MulAccumulate(3,1)
01596                 MulAccumulate(2,2)
01597                 MulAccumulate(1,3)
01598                 MulAccumulate(0,4)
01599                 MulStoreDigit(4)
01600                 MulShiftCarry
01601 
01602                 MulAccumulate(5,0)
01603                 MulAccumulate(4,1)
01604                 MulAccumulate(3,2)
01605                 MulAccumulate(2,3)
01606                 MulAccumulate(1,4)
01607                 MulAccumulate(0,5)
01608                 MulStoreDigit(5)
01609                 MulShiftCarry
01610 
01611                 MulAccumulate(6,0)
01612                 MulAccumulate(5,1)
01613                 MulAccumulate(4,2)
01614                 MulAccumulate(3,3)
01615                 MulAccumulate(2,4)
01616                 MulAccumulate(1,5)
01617                 MulAccumulate(0,6)
01618                 MulStoreDigit(6)
01619                 MulShiftCarry
01620 
01621                 MulAccumulate(7,0)
01622                 MulAccumulate(6,1)
01623                 MulAccumulate(5,2)
01624                 MulAccumulate(4,3)
01625                 MulAccumulate(3,4)
01626                 MulAccumulate(2,5)
01627                 MulAccumulate(1,6)
01628                 MulAccumulate(0,7)
01629                 MulStoreDigit(7)
01630                 MulShiftCarry
01631 
01632                 MulAccumulate(7,1)
01633                 MulAccumulate(6,2)
01634                 MulAccumulate(5,3)
01635                 MulAccumulate(4,4)
01636                 MulAccumulate(3,5)
01637                 MulAccumulate(2,6)
01638                 MulAccumulate(1,7)
01639                 MulStoreDigit(8)
01640                 MulShiftCarry
01641 
01642                 MulAccumulate(7,2)
01643                 MulAccumulate(6,3)
01644                 MulAccumulate(5,4)
01645                 MulAccumulate(4,5)
01646                 MulAccumulate(3,6)
01647                 MulAccumulate(2,7)
01648                 MulStoreDigit(9)
01649                 MulShiftCarry
01650 
01651                 MulAccumulate(7,3)
01652                 MulAccumulate(6,4)
01653                 MulAccumulate(5,5)
01654                 MulAccumulate(4,6)
01655                 MulAccumulate(3,7)
01656                 MulStoreDigit(10)
01657                 MulShiftCarry
01658 
01659                 MulAccumulate(7,4)
01660                 MulAccumulate(6,5)
01661                 MulAccumulate(5,6)
01662                 MulAccumulate(4,7)
01663                 MulStoreDigit(11)
01664                 MulShiftCarry
01665 
01666                 MulAccumulate(7,5)
01667                 MulAccumulate(6,6)
01668                 MulAccumulate(5,7)
01669                 MulStoreDigit(12)
01670                 MulShiftCarry
01671 
01672                 MulAccumulate(7,6)
01673                 MulAccumulate(6,7)
01674                 MulStoreDigit(13)
01675                 MulShiftCarry
01676 
01677                 MulLastDiagonal(8)
01678 
01679                 MulCleanup
01680 
01681                 : 
01682                 : "D" (Z), "S" (X), "a" (Y)
01683                 : "%ecx", "%edx",  "memory"
01684         );
01685 }
01686 
01687 #elif defined(__GNUC__) && defined(__alpha__)
01688 
01689 class AlphaOptimized : public Portable
01690 {
01691 public:
01692         static inline void Multiply2(word *C, const word *A, const word *B);
01693         static inline word Multiply2Add(word *C, const word *A, const word *B);
01694         static inline void Multiply4(word *C, const word *A, const word *B);
01695         static inline unsigned int MultiplyRecursionLimit() {return 4;}
01696 
01697         static inline void Multiply4Bottom(word *C, const word *A, const word *B);
01698         static inline unsigned int MultiplyBottomRecursionLimit() {return 4;}
01699 
01700         static inline void Square4(word *R, const word *A)
01701         {
01702                 Multiply4(R, A, A);
01703         }
01704 };
01705 
01706 typedef AlphaOptimized LowLevel;
01707 
01708 inline void AlphaOptimized::Multiply2(word *C, const word *A, const word *B)
01709 {
01710         register dword c, a = *(const dword *)A, b = *(const dword *)B;
01711         ((dword *)C)[0] = a*b;
01712         __asm__("umulh %1,%2,%0" : "=r" (c) : "r" (a), "r" (b));
01713         ((dword *)C)[1] = c;
01714 }
01715 
01716 inline word AlphaOptimized::Multiply2Add(word *C, const word *A, const word *B)
01717 {
01718         register dword c, d, e, a = *(const dword *)A, b = *(const dword *)B;
01719         c = ((dword *)C)[0];
01720         d = a*b + c;
01721         __asm__("umulh %1,%2,%0" : "=r" (e) : "r" (a), "r" (b));
01722         ((dword *)C)[0] = d;
01723         d = (d < c);
01724         c = ((dword *)C)[1] + d;
01725         d = (c < d);
01726         c += e;
01727         ((dword *)C)[1] = c;
01728         d |= (c < e);
01729         return d;
01730 }
01731 
01732 inline void AlphaOptimized::Multiply4(word *R, const word *A, const word *B)
01733 {
01734         Multiply2(R, A, B);
01735         Multiply2(R+4, A+2, B+2);
01736         word carry = Multiply2Add(R+2, A+0, B+2);
01737         carry += Multiply2Add(R+2, A+2, B+0);
01738         Increment(R+6, 2, carry);
01739 }
01740 
01741 static inline void Multiply2BottomAdd(word *C, const word *A, const word *B)
01742 {
01743         register dword a = *(const dword *)A, b = *(const dword *)B;
01744         ((dword *)C)[0] = a*b + ((dword *)C)[0];
01745 }
01746 
01747 inline void AlphaOptimized::Multiply4Bottom(word *R, const word *A, const word *B)
01748 {
01749         Multiply2(R, A, B);
01750         Multiply2BottomAdd(R+2, A+0, B+2);
01751         Multiply2BottomAdd(R+2, A+2, B+0);
01752 }
01753 
01754 #else   // no processor specific code available
01755 
01756 typedef Portable LowLevel;
01757 
01758 #endif
01759 
01760 // ********************************************************
01761 
01762 #define A0              A
01763 #define A1              (A+N2)
01764 #define B0              B
01765 #define B1              (B+N2)
01766 
01767 #define T0              T
01768 #define T1              (T+N2)
01769 #define T2              (T+N)
01770 #define T3              (T+N+N2)
01771 
01772 #define R0              R
01773 #define R1              (R+N2)
01774 #define R2              (R+N)
01775 #define R3              (R+N+N2)
01776 
01777 //VC60 workaround: compiler bug triggered without the extra dummy parameters
01778 
01779 // R[2*N] - result = A*B
01780 // T[2*N] - temporary work space
01781 // A[N] --- multiplier
01782 // B[N] --- multiplicant
01783 
01784 template <class P>
01785 void DoRecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy=NULL);
01786 
01787 template <class P>
01788 inline void RecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy=NULL)
01789 {
01790         assert(N>=2 && N%2==0);
01791 
01792         if (P::MultiplyRecursionLimit() >= 8 && N==8)
01793                 P::Multiply8(R, A, B);
01794         else if (P::MultiplyRecursionLimit() >= 4 && N==4)
01795                 P::Multiply4(R, A, B);
01796         else if (N==2)
01797                 P::Multiply2(R, A, B);
01798         else
01799                 DoRecursiveMultiply<P>(R, T, A, B, N, NULL);    // VC60 workaround: needs this NULL
01800 }
01801 
01802 template <class P>
01803 void DoRecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy)
01804 {
01805         const unsigned int N2 = N/2;
01806         int carry;
01807 
01808         int aComp = Compare(A0, A1, N2);
01809         int bComp = Compare(B0, B1, N2);
01810 
01811         switch (2*aComp + aComp + bComp)
01812         {
01813         case -4:
01814                 P::Subtract(R0, A1, A0, N2);
01815                 P::Subtract(R1, B0, B1, N2);
01816                 RecursiveMultiply<P>(T0, T2, R0, R1, N2);
01817                 P::Subtract(T1, T1, R0, N2);
01818                 carry = -1;
01819                 break;
01820         case -2:
01821                 P::Subtract(R0, A1, A0, N2);
01822                 P::Subtract(R1, B0, B1, N2);
01823                 RecursiveMultiply<P>(T0, T2, R0, R1, N2);
01824                 carry = 0;
01825                 break;
01826         case 2:
01827                 P::Subtract(R0, A0, A1, N2);
01828                 P::Subtract(R1, B1, B0, N2);
01829                 RecursiveMultiply<P>(T0, T2, R0, R1, N2);
01830                 carry = 0;
01831                 break;
01832         case 4:
01833                 P::Subtract(R0, A1, A0, N2);
01834                 P::Subtract(R1, B0, B1, N2);
01835                 RecursiveMultiply<P>(T0, T2, R0, R1, N2);
01836                 P::Subtract(T1, T1, R1, N2);
01837                 carry = -1;
01838                 break;
01839         default:
01840                 SetWords(T0, 0, N);
01841                 carry = 0;
01842         }
01843 
01844         RecursiveMultiply<P>(R0, T2, A0, B0, N2);
01845         RecursiveMultiply<P>(R2, T2, A1, B1, N2);
01846 
01847         // now T[01] holds (A1-A0)*(B0-B1), R[01] holds A0*B0, R[23] holds A1*B1
01848 
01849         carry += P::Add(T0, T0, R0, N);
01850         carry += P::Add(T0, T0, R2, N);
01851         carry += P::Add(R1, R1, T0, N);
01852 
01853         assert (carry >= 0 && carry <= 2);
01854         Increment(R3, N2, carry);
01855 }
01856 
01857 // R[2*N] - result = A*A
01858 // T[2*N] - temporary work space
01859 // A[N] --- number to be squared
01860 
01861 template <class P>
01862 void DoRecursiveSquare(word *R, word *T, const word *A, unsigned int N, const P *dummy=NULL);
01863 
01864 template <class P>
01865 inline void RecursiveSquare(word *R, word *T, const word *A, unsigned int N, const P *dummy=NULL)
01866 {
01867         assert(N && N%2==0);
01868         if (P::SquareRecursionLimit() >= 8 && N==8)
01869                 P::Square8(R, A);
01870         if (P::SquareRecursionLimit() >= 4 && N==4)
01871                 P::Square4(R, A);
01872         else if (N==2)
01873                 P::Square2(R, A);
01874         else
01875                 DoRecursiveSquare<P>(R, T, A, N, NULL); // VC60 workaround: needs this NULL
01876 }
01877 
01878 template <class P>
01879 void DoRecursiveSquare(word *R, word *T, const word *A, unsigned int N, const P *dummy)
01880 {
01881         const unsigned int N2 = N/2;
01882 
01883         RecursiveSquare<P>(R0, T2, A0, N2);
01884         RecursiveSquare<P>(R2, T2, A1, N2);
01885         RecursiveMultiply<P>(T0, T2, A0, A1, N2);
01886 
01887         word carry = P::Add(R1, R1, T0, N);
01888         carry += P::Add(R1, R1, T0, N);
01889         Increment(R3, N2, carry);
01890 }
01891 
01892 // R[N] - bottom half of A*B
01893 // T[N] - temporary work space
01894 // A[N] - multiplier
01895 // B[N] - multiplicant
01896 
01897 template <class P>
01898 void DoRecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy=NULL);
01899 
01900 template <class P>
01901 inline void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy=NULL)
01902 {
01903         assert(N>=2 && N%2==0);
01904         if (P::MultiplyBottomRecursionLimit() >= 8 && N==8)
01905                 P::Multiply8Bottom(R, A, B);
01906         else if (P::MultiplyBottomRecursionLimit() >= 4 && N==4)
01907                 P::Multiply4Bottom(R, A, B);
01908         else if (N==2)
01909                 P::Multiply2Bottom(R, A, B);
01910         else
01911                 DoRecursiveMultiplyBottom<P>(R, T, A, B, N, NULL);
01912 }
01913 
01914 template <class P>
01915 void DoRecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy)
01916 {
01917         const unsigned int N2 = N/2;
01918 
01919         RecursiveMultiply<P>(R, T, A0, B0, N2);
01920         RecursiveMultiplyBottom<P>(T0, T1, A1, B0, N2);
01921         P::Add(R1, R1, T0, N2);
01922         RecursiveMultiplyBottom<P>(T0, T1, A0, B1, N2);
01923         P::Add(R1, R1, T0, N2);
01924 }
01925 
01926 // R[N] --- upper half of A*B
01927 // T[2*N] - temporary work space
01928 // L[N] --- lower half of A*B
01929 // A[N] --- multiplier
01930 // B[N] --- multiplicant
01931 
01932 template <class P>
01933 void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, unsigned int N, const P *dummy=NULL)
01934 {
01935         assert(N>=2 && N%2==0);
01936 
01937         if (N==4)
01938         {
01939                 P::Multiply4(T, A, B);
01940                 ((dword *)R)[0] = ((dword *)T)[2];
01941                 ((dword *)R)[1] = ((dword *)T)[3];
01942         }
01943         else if (N==2)
01944         {
01945                 P::Multiply2(T, A, B);
01946                 ((dword *)R)[0] = ((dword *)T)[1];
01947         }
01948         else
01949         {
01950                 const unsigned int N2 = N/2;
01951                 int carry;
01952 
01953                 int aComp = Compare(A0, A1, N2);
01954                 int bComp = Compare(B0, B1, N2);
01955 
01956                 switch (2*aComp + aComp + bComp)
01957                 {
01958                 case -4:
01959                         P::Subtract(R0, A1, A0, N2);
01960                         P::Subtract(R1, B0, B1, N2);
01961                         RecursiveMultiply<P>(T0, T2, R0, R1, N2);
01962                         P::Subtract(T1, T1, R0, N2);
01963                         carry = -1;
01964                         break;
01965                 case -2:
01966                         P::Subtract(R0, A1, A0, N2);
01967                         P::Subtract(R1, B0, B1, N2);
01968                         RecursiveMultiply<P>(T0, T2, R0, R1, N2);
01969                         carry = 0;
01970                         break;
01971                 case 2:
01972                         P::Subtract(R0, A0, A1, N2);
01973                         P::Subtract(R1, B1, B0, N2);
01974                         RecursiveMultiply<P>(T0, T2, R0, R1, N2);
01975                         carry = 0;
01976                         break;
01977                 case 4:
01978                         P::Subtract(R0, A1, A0, N2);
01979                         P::Subtract(R1, B0, B1, N2);
01980                         RecursiveMultiply<P>(T0, T2, R0, R1, N2);
01981                         P::Subtract(T1, T1, R1, N2);
01982                         carry = -1;
01983                         break;
01984                 default:
01985                         SetWords(T0, 0, N);
01986                         carry = 0;
01987                 }
01988 
01989                 RecursiveMultiply<P>(T2, R0, A1, B1, N2);
01990 
01991                 // now T[01] holds (A1-A0)*(B0-B1), T[23] holds A1*B1
01992 
01993                 word c2 = P::Subtract(R0, L+N2, L, N2);
01994                 c2 += P::Subtract(R0, R0, T0, N2);
01995                 word t = (Compare(R0, T2, N2) == -1);
01996 
01997                 carry += t;
01998                 carry += Increment(R0, N2, c2+t);
01999                 carry += P::Add(R0, R0, T1, N2);
02000                 carry += P::Add(R0, R0, T3, N2);
02001                 assert (carry >= 0 && carry <= 2);
02002 
02003                 CopyWords(R1, T3, N2);
02004                 Increment(R1, N2, carry);
02005         }
02006 }
02007 
02008 inline word Add(word *C, const word *A, const word *B, unsigned int N)
02009 {
02010         return LowLevel::Add(C, A, B, N);
02011 }
02012 
02013 inline word Subtract(word *C, const word *A, const word *B, unsigned int N)
02014 {
02015         return LowLevel::Subtract(C, A, B, N);
02016 }
02017 
02018 inline void Multiply(word *R, word *T, const word *A, const word *B, unsigned int N)
02019 {
02020 #ifdef SSE2_INTRINSICS_AVAILABLE
02021         if (HasSSE2())
02022                 RecursiveMultiply<P4Optimized>(R, T, A, B, N);
02023         else
02024 #endif
02025                 RecursiveMultiply<LowLevel>(R, T, A, B, N);
02026 }
02027 
02028 inline void Square(word *R, word *T, const word *A, unsigned int N)
02029 {
02030 #ifdef SSE2_INTRINSICS_AVAILABLE
02031         if (HasSSE2())
02032                 RecursiveSquare<P4Optimized>(R, T, A, N);
02033         else
02034 #endif
02035                 RecursiveSquare<LowLevel>(R, T, A, N);
02036 }
02037 
02038 inline void MultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N)
02039 {
02040 #ifdef SSE2_INTRINSICS_AVAILABLE
02041         if (HasSSE2())
02042                 RecursiveMultiplyBottom<P4Optimized>(R, T, A, B, N);
02043         else
02044 #endif
02045                 RecursiveMultiplyBottom<LowLevel>(R, T, A, B, N);
02046 }
02047 
02048 inline void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, unsigned int N)
02049 {
02050 #ifdef SSE2_INTRINSICS_AVAILABLE
02051         if (HasSSE2())
02052                 RecursiveMultiplyTop<P4Optimized>(R, T, L, A, B, N);
02053         else
02054 #endif
02055                 RecursiveMultiplyTop<LowLevel>(R, T, L, A, B, N);
02056 }
02057 
02058 // R[NA+NB] - result = A*B
02059 // T[NA+NB] - temporary work space
02060 // A[NA] ---- multiplier
02061 // B[NB] ---- multiplicant
02062 
02063 void AsymmetricMultiply(word *R, word *T, const word *A, unsigned int NA, const word *B, unsigned int NB)
02064 {
02065         if (NA == NB)
02066         {
02067                 if (A == B)
02068                         Square(R, T, A, NA);
02069                 else
02070                         Multiply(R, T, A, B, NA);
02071 
02072                 return;
02073         }
02074 
02075         if (NA > NB)
02076         {
02077                 std::swap(A, B);
02078                 std::swap(NA, NB);
02079         }
02080 
02081         assert(NB % NA == 0);
02082         assert((NB/NA)%2 == 0);         // NB is an even multiple of NA
02083 
02084         if (NA==2 && !A[1])
02085         {
02086                 switch (A[0])
02087                 {
02088                 case 0:
02089                         SetWords(R, 0, NB+2);
02090                         return;
02091                 case 1:
02092                         CopyWords(R, B, NB);
02093                         R[NB] = R[NB+1] = 0;
02094                         return;
02095                 default:
02096                         R[NB] = LinearMultiply(R, B, A[0], NB);
02097                         R[NB+1] = 0;
02098                         return;
02099                 }
02100         }
02101 
02102         Multiply(R, T, A, B, NA);
02103         CopyWords(T+2*NA, R+NA, NA);
02104 
02105         unsigned i;
02106 
02107         for (i=2*NA; i<NB; i+=2*NA)
02108                 Multiply(T+NA+i, T, A, B+i, NA);
02109         for (i=NA; i<NB; i+=2*NA)
02110                 Multiply(R+i, T, A, B+i, NA);
02111 
02112         if (Add(R+NA, R+NA, T+2*NA, NB-NA))
02113                 Increment(R+NB, NA);
02114 }
02115 
02116 // R[N] ----- result = A inverse mod 2**(WORD_BITS*N)
02117 // T[3*N/2] - temporary work space
02118 // A[N] ----- an odd number as input
02119 
02120 void RecursiveInverseModPower2(word *R, word *T, const word *A, unsigned int N)
02121 {
02122         if (N==2)
02123                 AtomicInverseModPower2(R, A[0], A[1]);
02124         else
02125         {
02126                 const unsigned int N2 = N/2;
02127                 RecursiveInverseModPower2(R0, T0, A0, N2);
02128                 T0[0] = 1;
02129                 SetWords(T0+1, 0, N2-1);
02130                 MultiplyTop(R1, T1, T0, R0, A0, N2);
02131                 MultiplyBottom(T0, T1, R0, A1, N2);
02132                 Add(T0, R1, T0, N2);
02133                 TwosComplement(T0, N2);
02134                 MultiplyBottom(R1, T1, R0, T0, N2);
02135         }
02136 }
02137 
02138 // R[N] --- result = X/(2**(WORD_BITS*N)) mod M
02139 // T[3*N] - temporary work space
02140 // X[2*N] - number to be reduced
02141 // M[N] --- modulus
02142 // U[N] --- multiplicative inverse of M mod 2**(WORD_BITS*N)
02143 
02144 void MontgomeryReduce(word *R, word *T, const word *X, const word *M, const word *U, unsigned int N)
02145 {
02146         MultiplyBottom(R, T, X, U, N);
02147         MultiplyTop(T, T+N, X, R, M, N);
02148         if (Subtract(R, X+N, T, N))
02149         {
02150                 word carry = Add(R, R, M, N);
02151                 assert(carry);
02152         }
02153 }
02154 
02155 // R[N] --- result = X/(2**(WORD_BITS*N/2)) mod M
02156 // T[2*N] - temporary work space
02157 // X[2*N] - number to be reduced
02158 // M[N] --- modulus
02159 // U[N/2] - multiplicative inverse of M mod 2**(WORD_BITS*N/2)
02160 // V[N] --- 2**(WORD_BITS*3*N/2) mod M
02161 
02162 void HalfMontgomeryReduce(word *R, word *T, const word *X, const word *M, const word *U, const word *V, unsigned int N)
02163 {
02164         assert(N%2==0 && N>=4);
02165 
02166 #define M0              M
02167 #define M1              (M+N2)
02168 #define V0              V
02169 #define V1              (V+N2)
02170 
02171 #define X0              X
02172 #define X1              (X+N2)
02173 #define X2              (X+N)
02174 #define X3              (X+N+N2)
02175 
02176         const unsigned int N2 = N/2;
02177         Multiply(T0, T2, V0, X3, N2);
02178         int c2 = Add(T0, T0, X0, N);
02179         MultiplyBottom(T3, T2, T0, U, N2);
02180         MultiplyTop(T2, R, T0, T3, M0, N2);
02181         c2 -= Subtract(T2, T1, T2, N2);
02182         Multiply(T0, R, T3, M1, N2);
02183         c2 -= Subtract(T0, T2, T0, N2);
02184         int c3 = -(int)Subtract(T1, X2, T1, N2);
02185         Multiply(R0, T2, V1, X3, N2);
02186         c3 += Add(R, R, T, N);
02187 
02188         if (c2>0)
02189                 c3 += Increment(R1, N2);
02190         else if (c2<0)
02191                 c3 -= Decrement(R1, N2, -c2);
02192 
02193         assert(c3>=-1 && c3<=1);
02194         if (c3>0)
02195                 Subtract(R, R, M, N);
02196         else if (c3<0)
02197                 Add(R, R, M, N);
02198 
02199 #undef M0
02200 #undef M1
02201 #undef V0
02202 #undef V1
02203 
02204 #undef X0
02205 #undef X1
02206 #undef X2
02207 #undef X3
02208 }
02209 
02210 #undef A0
02211 #undef A1
02212 #undef B0
02213 #undef B1
02214 
02215 #undef T0
02216 #undef T1
02217 #undef T2
02218 #undef T3
02219 
02220 #undef R0
02221 #undef R1
02222 #undef R2
02223 #undef R3
02224 
02225 // do a 3 word by 2 word divide, returns quotient and leaves remainder in A
02226 static word SubatomicDivide(word *A, word B0, word B1)
02227 {
02228         // assert {A[2],A[1]} < {B1,B0}, so quotient can fit in a word
02229         assert(A[2] < B1 || (A[2]==B1 && A[1] < B0));
02230 
02231         dword p, u;
02232         word Q;
02233 
02234         // estimate the quotient: do a 2 word by 1 word divide
02235         if (B1+1 == 0)
02236                 Q = A[2];
02237         else
02238                 Q = word(MAKE_DWORD(A[1], A[2]) / (B1+1));
02239 
02240         // now subtract Q*B from A
02241         p = (dword) B0*Q;
02242         u = (dword) A[0] - LOW_WORD(p);
02243         A[0] = LOW_WORD(u);
02244         u = (dword) A[1] - HIGH_WORD(p) - (word)(0-HIGH_WORD(u)) - (dword)B1*Q;
02245         A[1] = LOW_WORD(u);
02246         A[2] += HIGH_WORD(u);
02247 
02248         // Q <= actual quotient, so fix it
02249         while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
02250         {
02251                 u = (dword) A[0] - B0;
02252                 A[0] = LOW_WORD(u);
02253                 u = (dword) A[1] - B1 - (word)(0-HIGH_WORD(u));
02254                 A[1] = LOW_WORD(u);
02255                 A[2] += HIGH_WORD(u);
02256                 Q++;
02257                 assert(Q);      // shouldn't overflow
02258         }
02259 
02260         return Q;
02261 }
02262 
02263 // do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1
02264 static inline void AtomicDivide(word *Q, const word *A, const word *B)
02265 {
02266         if (!B[0] && !B[1]) // if divisor is 0, we assume divisor==2**(2*WORD_BITS)
02267         {
02268                 Q[0] = A[2];
02269                 Q[1] = A[3];
02270         }
02271         else
02272         {
02273                 word T[4];
02274                 T[0] = A[0]; T[1] = A[1]; T[2] = A[2]; T[3] = A[3];
02275                 Q[1] = SubatomicDivide(T+1, B[0], B[1]);
02276                 Q[0] = SubatomicDivide(T, B[0], B[1]);
02277 
02278 #ifndef NDEBUG
02279                 // multiply quotient and divisor and add remainder, make sure it equals dividend
02280                 assert(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0])));
02281                 word P[4];
02282                 LowLevel::Multiply2(P, Q, B);
02283                 Add(P, P, T, 4);
02284                 assert(memcmp(P, A, 4*WORD_SIZE)==0);
02285 #endif
02286         }
02287 }
02288 
02289 // for use by Divide(), corrects the underestimated quotient {Q1,Q0}
02290 static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B, unsigned int N)
02291 {
02292         assert(N && N%2==0);
02293 
02294         if (Q[1])
02295         {
02296                 T[N] = T[N+1] = 0;
02297                 unsigned i;
02298                 for (i=0; i<N; i+=4)
02299                         LowLevel::Multiply2(T+i, Q, B+i);
02300                 for (i=2; i<N; i+=4)
02301                         if (LowLevel::Multiply2Add(T+i, Q, B+i))
02302                                 T[i+5] += (++T[i+4]==0);
02303         }
02304         else
02305         {
02306                 T[N] = LinearMultiply(T, B, Q[0], N);
02307                 T[N+1] = 0;
02308         }
02309 
02310         word borrow = Subtract(R, R, T, N+2);
02311         assert(!borrow && !R[N+1]);
02312 
02313         while (R[N] || Compare(R, B, N) >= 0)
02314         {
02315                 R[N] -= Subtract(R, R, B, N);
02316                 Q[1] += (++Q[0]==0);
02317                 assert(Q[0] || Q[1]); // no overflow
02318         }
02319 }
02320 
02321 // R[NB] -------- remainder = A%B
02322 // Q[NA-NB+2] --- quotient      = A/B
02323 // T[NA+2*NB+4] - temp work space
02324 // A[NA] -------- dividend
02325 // B[NB] -------- divisor
02326 
02327 void Divide(word *R, word *Q, word *T, const word *A, unsigned int NA, const word *B, unsigned int NB)
02328 {
02329         assert(NA && NB && NA%2==0 && NB%2==0);
02330         assert(B[NB-1] || B[NB-2]);
02331         assert(NB <= NA);
02332 
02333         // set up temporary work space
02334         word *const TA=T;
02335         word *const TB=T+NA+2;
02336         word *const TP=T+NA+2+NB;
02337 
02338         // copy B into TB and normalize it so that TB has highest bit set to 1
02339         unsigned shiftWords = (B[NB-1]==0);
02340         TB[0] = TB[NB-1] = 0;
02341         CopyWords(TB+shiftWords, B, NB-shiftWords);
02342         unsigned shiftBits = WORD_BITS - BitPrecision(TB[NB-1]);
02343         assert(shiftBits < WORD_BITS);
02344         ShiftWordsLeftByBits(TB, NB, shiftBits);
02345 
02346         // copy A into TA and normalize it
02347         TA[0] = TA[NA] = TA[NA+1] = 0;
02348         CopyWords(TA+shiftWords, A, NA);
02349         ShiftWordsLeftByBits(TA, NA+2, shiftBits);
02350 
02351         if (TA[NA+1]==0 && TA[NA] <= 1)
02352         {
02353                 Q[NA-NB+1] = Q[NA-NB] = 0;
02354                 while (TA[NA] || Compare(TA+NA-NB, TB, NB) >= 0)
02355                 {
02356                         TA[NA] -= Subtract(TA+NA-NB, TA+NA-NB, TB, NB);
02357                         ++Q[NA-NB];
02358                 }
02359         }
02360         else
02361         {
02362                 NA+=2;
02363                 assert(Compare(TA+NA-NB, TB, NB) < 0);
02364         }
02365 
02366         word BT[2];
02367         BT[0] = TB[NB-2] + 1;
02368         BT[1] = TB[NB-1] + (BT[0]==0);
02369 
02370         // start reducing TA mod TB, 2 words at a time
02371         for (unsigned i=NA-2; i>=NB; i-=2)
02372         {
02373                 AtomicDivide(Q+i-NB, TA+i-2, BT);
02374                 CorrectQuotientEstimate(TA+i-NB, TP, Q+i-NB, TB, NB);
02375         }
02376 
02377         // copy TA into R, and denormalize it
02378         CopyWords(R, TA+shiftWords, NB);
02379         ShiftWordsRightByBits(R, NB, shiftBits);
02380 }
02381 
02382 static inline unsigned int EvenWordCount(const word *X, unsigned int N)
02383 {
02384         while (N && X[N-2]==0 && X[N-1]==0)
02385                 N-=2;
02386         return N;
02387 }
02388 
02389 // return k
02390 // R[N] --- result = A^(-1) * 2^k mod M
02391 // T[4*N] - temporary work space
02392 // A[NA] -- number to take inverse of
02393 // M[N] --- modulus
02394 
02395 unsigned int AlmostInverse(word *R, word *T, const word *A, unsigned int NA, const word *M, unsigned int N)
02396 {
02397         assert(NA<=N && N && N%2==0);
02398 
02399         word *b = T;
02400         word *c = T+N;
02401         word *f = T+2*N;
02402         word *g = T+3*N;
02403         unsigned int bcLen=2, fgLen=EvenWordCount(M, N);
02404         unsigned int k=0, s=0;
02405 
02406         SetWords(T, 0, 3*N);
02407         b[0]=1;
02408         CopyWords(f, A, NA);
02409         CopyWords(g, M, N);
02410 
02411         while (1)
02412         {
02413                 word t=f[0];
02414                 while (!t)
02415                 {
02416                         if (EvenWordCount(f, fgLen)==0)
02417                         {
02418                                 SetWords(R, 0, N);
02419                                 return 0;
02420                         }
02421 
02422                         ShiftWordsRightByWords(f, fgLen, 1);
02423                         if (c[bcLen-1]) bcLen+=2;
02424                         assert(bcLen <= N);
02425                         ShiftWordsLeftByWords(c, bcLen, 1);
02426                         k+=WORD_BITS;
02427                         t=f[0];
02428                 }
02429 
02430                 unsigned int i=0;
02431                 while (t%2 == 0)
02432                 {
02433                         t>>=1;
02434                         i++;
02435                 }
02436                 k+=i;
02437 
02438                 if (t==1 && f[1]==0 && EvenWordCount(f, fgLen)==2)
02439                 {
02440                         if (s%2==0)
02441                                 CopyWords(R, b, N);
02442                         else
02443                                 Subtract(R, M, b, N);
02444                         return k;
02445                 }
02446 
02447                 ShiftWordsRightByBits(f, fgLen, i);
02448                 t=ShiftWordsLeftByBits(c, bcLen, i);
02449                 if (t)
02450                 {
02451                         c[bcLen] = t;
02452                         bcLen+=2;
02453                         assert(bcLen <= N);
02454                 }
02455 
02456                 if (f[fgLen-2]==0 && g[fgLen-2]==0 && f[fgLen-1]==0 && g[fgLen-1]==0)
02457                         fgLen-=2;
02458 
02459                 if (Compare(f, g, fgLen)==-1)
02460                 {
02461                         std::swap(f, g);
02462                         std::swap(b, c);
02463                         s++;
02464                 }
02465 
02466                 Subtract(f, f, g, fgLen);
02467 
02468                 if (Add(b, b, c, bcLen))
02469                 {
02470                         b[bcLen] = 1;
02471                         bcLen+=2;
02472                         assert(bcLen <= N);
02473                 }
02474         }
02475 }
02476 
02477 // R[N] - result = A/(2^k) mod M
02478 // A[N] - input
02479 // M[N] - modulus
02480 
02481 void DivideByPower2Mod(word *R, const word *A, unsigned int k, const word *M, unsigned int N)
02482 {
02483         CopyWords(R, A, N);
02484 
02485         while (k--)
02486         {
02487                 if (R[0]%2==0)
02488                         ShiftWordsRightByBits(R, N, 1);
02489                 else
02490                 {
02491                         word carry = Add(R, R, M, N);
02492                         ShiftWordsRightByBits(R, N, 1);
02493                         R[N-1] += carry<<(WORD_BITS-1);
02494                 }
02495         }
02496 }
02497 
02498 // R[N] - result = A*(2^k) mod M
02499 // A[N] - input
02500 // M[N] - modulus
02501 
02502 void MultiplyByPower2Mod(word *R, const word *A, unsigned int k, const word *M, unsigned int N)
02503 {
02504         CopyWords(R, A, N);
02505 
02506         while (k--)
02507                 if (ShiftWordsLeftByBits(R, N, 1) || Compare(R, M, N)>=0)
02508                         Subtract(R, R, M, N);
02509 }
02510 
02511 // ******************************************************************
02512 
02513 static const unsigned int RoundupSizeTable[] = {2, 2, 2, 4, 4, 8, 8, 8, 8};
02514 
02515 static inline unsigned int RoundupSize(unsigned int n)
02516 {
02517         if (n<=8)
02518                 return RoundupSizeTable[n];
02519         else if (n<=16)
02520                 return 16;
02521         else if (n<=32)
02522                 return 32;
02523         else if (n<=64)
02524                 return 64;
02525         else return 1U << BitPrecision(n-1);
02526 }
02527 
02528 Integer::Integer()
02529         : reg(2), sign(POSITIVE)
02530 {
02531         reg[0] = reg[1] = 0;
02532 }
02533 
02534 Integer::Integer(const Integer& t)
02535         : reg(RoundupSize(t.WordCount())), sign(t.sign)
02536 {
02537         CopyWords(reg, t.reg, reg.size());
02538 }
02539 
02540 Integer::Integer(signed long value)
02541         : reg(2)
02542 {
02543         if (value >= 0)
02544                 sign = POSITIVE;
02545         else
02546         {
02547                 sign = NEGATIVE;
02548                 value = -value;
02549         }
02550         reg[0] = word(value);
02551         reg[1] = word(SafeRightShift<WORD_BITS, unsigned long>(value));
02552 }
02553 
02554 bool Integer::IsConvertableToLong() const
02555 {
02556         if (ByteCount() > sizeof(long))
02557                 return false;
02558 
02559         unsigned long value = reg[0];
02560         value += SafeLeftShift<WORD_BITS, unsigned long>(reg[1]);
02561 
02562         if (sign==POSITIVE)
02563                 return (signed long)value >= 0;
02564         else
02565                 return -(signed long)value < 0;
02566 }
02567 
02568 signed long Integer::ConvertToLong() const
02569 {
02570         assert(IsConvertableToLong());
02571 
02572         unsigned long value = reg[0];
02573         value += SafeLeftShift<WORD_BITS, unsigned long>(reg[1]);
02574         return sign==POSITIVE ? value : -(signed long)value;
02575 }
02576 
02577 Integer::Integer(BufferedTransformation &encodedInteger, unsigned int byteCount, Signedness s)
02578 {
02579         Decode(encodedInteger, byteCount, s);
02580 }
02581 
02582 Integer::Integer(const byte *encodedInteger, unsigned int byteCount, Signedness s)
02583 {
02584         Decode(encodedInteger, byteCount, s);
02585 }
02586 
02587 Integer::Integer(BufferedTransformation &bt)
02588 {
02589         BERDecode(bt);
02590 }
02591 
02592 Integer::Integer(RandomNumberGenerator &rng, unsigned int bitcount)
02593 {
02594         Randomize(rng, bitcount);
02595 }
02596 
02597 Integer::Integer(RandomNumberGenerator &rng, const Integer &min, const Integer &max, RandomNumberType rnType, const Integer &equiv, const Integer &mod)
02598 {
02599         if (!Randomize(rng, min, max, rnType, equiv, mod))
02600                 throw Integer::RandomNumberNotFound();
02601 }
02602 
02603 Integer Integer::Power2(unsigned int e)
02604 {
02605         Integer r((word)0, BitsToWords(e+1));
02606         r.SetBit(e);
02607         return r;
02608 }
02609 
02610 const Integer &Integer::Zero()
02611 {
02612         static const Integer zero;
02613         return zero;
02614 }
02615 
02616 const Integer &Integer::One()
02617 {
02618         static const Integer one(1,2);
02619         return one;
02620 }
02621 
02622 const Integer &Integer::Two()
02623 {
02624         static const Integer two(2,2);
02625         return two;
02626 }
02627 
02628 bool Integer::operator!() const
02629 {
02630         return IsNegative() ? false : (reg[0]==0 && WordCount()==0);
02631 }
02632 
02633 Integer& Integer::operator=(const Integer& t)
02634 {
02635         if (this != &t)
02636         {
02637                 reg.New(RoundupSize(t.WordCount()));
02638                 CopyWords(reg, t.reg, reg.size());
02639                 sign = t.sign;
02640         }
02641         return *this;
02642 }
02643 
02644 bool Integer::GetBit(unsigned int n) const
02645 {
02646         if (n/WORD_BITS >= reg.size())
02647                 return 0;
02648         else
02649                 return bool((reg[n/WORD_BITS] >> (n % WORD_BITS)) & 1);
02650 }
02651 
02652 void Integer::SetBit(unsigned int n, bool value)
02653 {
02654         if (value)
02655         {
02656                 reg.CleanGrow(RoundupSize(BitsToWords(n+1)));
02657                 reg[n/WORD_BITS] |= (word(1) << (n%WORD_BITS));
02658         }
02659         else
02660         {
02661                 if (n/WORD_BITS < reg.size())
02662                         reg[n/WORD_BITS] &= ~(word(1) << (n%WORD_BITS));
02663         }
02664 }
02665 
02666 byte Integer::GetByte(unsigned int n) const
02667 {
02668         if (n/WORD_SIZE >= reg.size())
02669                 return 0;
02670         else
02671                 return byte(reg[n/WORD_SIZE] >> ((n%WORD_SIZE)*8));
02672 }
02673 
02674 void Integer::SetByte(unsigned int n, byte value)
02675 {
02676         reg.CleanGrow(RoundupSize(BytesToWords(n+1)));
02677         reg[n/WORD_SIZE] &= ~(word(0xff) << 8*(n%WORD_SIZE));
02678         reg[n/WORD_SIZE] |= (word(value) << 8*(n%WORD_SIZE));
02679 }
02680 
02681 unsigned long Integer::GetBits(unsigned int i, unsigned int n) const
02682 {
02683         assert(n <= sizeof(unsigned long)*8);
02684         unsigned long v = 0;
02685         for (unsigned int j=0; j<n; j++)
02686                 v |= GetBit(i+j) << j;
02687         return v;
02688 }
02689 
02690 Integer Integer::operator-() const
02691 {
02692         Integer result(*this);
02693         result.Negate();
02694         return result;
02695 }
02696 
02697 Integer Integer::AbsoluteValue() const
02698 {
02699         Integer result(*this);
02700         result.sign = POSITIVE;
02701         return result;
02702 }
02703 
02704 void Integer::swap(Integer &a)
02705 {
02706         reg.swap(a.reg);
02707         std::swap(sign, a.sign);
02708 }
02709 
02710 Integer::Integer(word value, unsigned int length)
02711         : reg(RoundupSize(length)), sign(POSITIVE)
02712 {
02713         reg[0] = value;
02714         SetWords(reg+1, 0, reg.size()-1);
02715 }
02716 
02717 template <class T>
02718 static Integer StringToInteger(const T *str)
02719 {
02720         word radix;
02721 #if (defined(__GNUC__) && __GNUC__ <= 3)                // GCC workaround
02722         // std::char_traits doesn't exist in GCC 2.x
02723         // std::char_traits<wchar_t>::length() not defined in GCC 3.2
02724         unsigned int length;
02725         for (length = 0; str[length] != 0; length++) {}
02726 #else
02727         unsigned int length = std::char_traits<T>::length(str);
02728 #endif
02729 
02730         Integer v;
02731 
02732         if (length == 0)
02733                 return v;
02734 
02735         switch (str[length-1])
02736         {
02737         case 'h':
02738         case 'H':
02739                 radix=16;
02740                 break;
02741         case 'o':
02742         case 'O':
02743                 radix=8;
02744                 break;
02745         case 'b':
02746         case 'B':
02747                 radix=2;
02748                 break;
02749         default:
02750                 radix=10;
02751         }
02752 
02753         if (length > 2 && str[0] == '0' && str[1] == 'x')
02754                 radix = 16;
02755 
02756         for (unsigned i=0; i<length; i++)
02757         {
02758                 word digit;
02759 
02760                 if (str[i] >= '0' && str[i] <= '9')
02761                         digit = str[i] - '0';
02762                 else if (str[i] >= 'A' && str[i] <= 'F')
02763                         digit = str[i] - 'A' + 10;
02764                 else if (str[i] >= 'a' && str[i] <= 'f')
02765                         digit = str[i] - 'a' + 10;
02766                 else
02767                         digit = radix;
02768 
02769                 if (digit < radix)
02770                 {
02771                         v *= radix;
02772                         v += digit;
02773                 }
02774         }
02775 
02776         if (str[0] == '-')
02777                 v.Negate();
02778 
02779         return v;
02780 }
02781 
02782 Integer::Integer(const char *str)
02783         : reg(2), sign(POSITIVE)
02784 {
02785         *this = StringToInteger(str);
02786 }
02787 
02788 Integer::Integer(const wchar_t *str)
02789         : reg(2), sign(POSITIVE)
02790 {
02791         *this = StringToInteger(str);
02792 }
02793 
02794 unsigned int Integer::WordCount() const
02795 {
02796         return CountWords(reg, reg.size());
02797 }
02798 
02799 unsigned int Integer::ByteCount() const
02800 {
02801         unsigned wordCount = WordCount();
02802         if (wordCount)
02803                 return (wordCount-1)*WORD_SIZE + BytePrecision(reg[wordCount-1]);
02804         else
02805                 return 0;
02806 }
02807 
02808 unsigned int Integer::BitCount() const
02809 {
02810         unsigned wordCount = WordCount();
02811         if (wordCount)
02812                 return (wordCount-1)*WORD_BITS + BitPrecision(reg[wordCount-1]);
02813         else
02814                 return 0;
02815 }
02816 
02817 void Integer::Decode(const byte *input, unsigned int inputLen, Signedness s)
02818 {
02819         StringStore store(input, inputLen);
02820         Decode(store, inputLen, s);
02821 }
02822 
02823 void Integer::Decode(BufferedTransformation &bt, unsigned int inputLen, Signedness s)
02824 {
02825         assert(bt.MaxRetrievable() >= inputLen);
02826 
02827         byte b;
02828         bt.Peek(b);
02829         sign = ((s==SIGNED) && (b & 0x80)) ? NEGATIVE : POSITIVE;
02830 
02831         while (inputLen>0 && (sign==POSITIVE ? b==0 : b==0xff))
02832         {
02833                 bt.Skip(1);
02834                 inputLen--;
02835                 bt.Peek(b);
02836         }
02837 
02838         reg.CleanNew(RoundupSize(BytesToWords(inputLen)));
02839 
02840         for (unsigned int i=inputLen; i > 0; i--)
02841         {
02842                 bt.Get(b);
02843                 reg[(i-1)/WORD_SIZE] |= b << ((i-1)%WORD_SIZE)*8;
02844         }
02845 
02846         if (sign == NEGATIVE)
02847         {
02848                 for (unsigned i=inputLen; i<reg.size()*WORD_SIZE; i++)
02849                         reg[i/WORD_SIZE] |= 0xff << (i%WORD_SIZE)*8;
02850                 TwosComplement(reg, reg.size());
02851         }
02852 }
02853 
02854 unsigned int Integer::MinEncodedSize(Signedness signedness) const
02855 {
02856         unsigned int outputLen = STDMAX(1U, ByteCount());
02857         if (signedness == UNSIGNED)
02858                 return outputLen;
02859         if (NotNegative() && (GetByte(outputLen-1) & 0x80))
02860                 outputLen++;
02861         if (IsNegative() && *this < -Power2(outputLen*8-1))
02862                 outputLen++;
02863         return outputLen;
02864 }
02865 
02866 unsigned int Integer::Encode(byte *output, unsigned int outputLen, Signedness signedness) const
02867 {
02868         ArraySink sink(output, outputLen);
02869         return Encode(sink, outputLen, signedness);
02870 }
02871 
02872 unsigned int Integer::Encode(BufferedTransformation &bt, unsigned int outputLen, Signedness signedness) const
02873 {
02874         if (signedness == UNSIGNED || NotNegative())
02875         {
02876                 for (unsigned int i=outputLen; i > 0; i--)
02877                         bt.Put(GetByte(i-1));
02878         }
02879         else
02880         {
02881                 // take two's complement of *this
02882                 Integer temp = Integer::Power2(8*STDMAX(ByteCount(), outputLen)) + *this;
02883                 for (unsigned i=0; i<outputLen; i++)
02884                         bt.Put(temp.GetByte(outputLen-i-1));
02885         }
02886         return outputLen;
02887 }
02888 
02889 void Integer::DEREncode(BufferedTransformation &bt) const
02890 {
02891         DERGeneralEncoder enc(bt, INTEGER);
02892         Encode(enc, MinEncodedSize(SIGNED), SIGNED);
02893         enc.MessageEnd();
02894 }
02895 
02896 void Integer::BERDecode(const byte *input, unsigned int len)
02897 {
02898         StringStore store(input, len);
02899         BERDecode(store);
02900 }
02901 
02902 void Integer::BERDecode(BufferedTransformation &bt)
02903 {
02904         BERGeneralDecoder dec(bt, INTEGER);
02905         if (!dec.IsDefiniteLength() || dec.MaxRetrievable() < dec.RemainingLength())
02906                 BERDecodeError();
02907         Decode(dec, dec.RemainingLength(), SIGNED);
02908         dec.MessageEnd();
02909 }
02910 
02911 void Integer::DEREncodeAsOctetString(BufferedTransformation &bt, unsigned int length) const
02912 {
02913         DERGeneralEncoder enc(bt, OCTET_STRING);
02914         Encode(enc, length);
02915         enc.MessageEnd();
02916 }
02917 
02918 void Integer::BERDecodeAsOctetString(BufferedTransformation &bt, unsigned int length)
02919 {
02920         BERGeneralDecoder dec(bt, OCTET_STRING);
02921         if (!dec.IsDefiniteLength() || dec.RemainingLength() != length)
02922                 BERDecodeError();
02923         Decode(dec, length);
02924         dec.MessageEnd();
02925 }
02926 
02927 unsigned int Integer::OpenPGPEncode(byte *output, unsigned int len) const
02928 {
02929         ArraySink sink(output, len);
02930         return OpenPGPEncode(sink);
02931 }
02932 
02933 unsigned int Integer::OpenPGPEncode(BufferedTransformation &bt) const
02934 {
02935         word16 bitCount = BitCount();
02936         bt.PutWord16(bitCount);
02937         return 2 + Encode(bt, BitsToBytes(bitCount));
02938 }
02939 
02940 void Integer::OpenPGPDecode(const byte *input, unsigned int len)
02941 {
02942         StringStore store(input, len);
02943         OpenPGPDecode(store);
02944 }
02945 
02946 void Integer::OpenPGPDecode(BufferedTransformation &bt)
02947 {
02948         word16 bitCount;
02949         if (bt.GetWord16(bitCount) != 2 || bt.MaxRetrievable() < BitsToBytes(bitCount))
02950                 throw OpenPGPDecodeErr();
02951         Decode(bt, BitsToBytes(bitCount));
02952 }
02953 
02954 void Integer::Randomize(RandomNumberGenerator &rng, unsigned int nbits)
02955 {
02956         const unsigned int nbytes = nbits/8 + 1;
02957         SecByteBlock buf(nbytes);
02958         rng.GenerateBlock(buf, nbytes);
02959         if (nbytes)
02960                 buf[0] = (byte)Crop(buf[0], nbits % 8);
02961         Decode(buf, nbytes, UNSIGNED);
02962 }
02963 
02964 void Integer::Randomize(RandomNumberGenerator &rng, const Integer &min, const Integer &max)
02965 {
02966         if (min > max)
02967                 throw InvalidArgument("Integer: Min must be no greater than Max");
02968 
02969         Integer range = max - min;
02970         const unsigned int nbits = range.BitCount();
02971 
02972         do
02973         {
02974                 Randomize(rng, nbits);
02975         }
02976         while (*this > range);
02977 
02978         *this += min;
02979 }
02980 
02981 bool Integer::Randomize(RandomNumberGenerator &rng, const Integer &min, const Integer &max, RandomNumberType rnType, const Integer &equiv, const Integer &mod)
02982 {
02983         return GenerateRandomNoThrow(rng, MakeParameters("Min", min)("Max", max)("RandomNumberType", rnType)("EquivalentTo", equiv)("Mod", mod));
02984 }
02985 
02986 class KDF2_RNG : public RandomNumberGenerator
02987 {
02988 public:
02989         KDF2_RNG(const byte *seed, unsigned int seedSize)
02990                 : m_counter(0), m_counterAndSeed(seedSize + 4)
02991         {
02992                 memcpy(m_counterAndSeed + 4, seed, seedSize);
02993         }
02994 
02995         byte GenerateByte()
02996         {
02997                 byte b;
02998                 GenerateBlock(&b, 1);
02999                 return b;
03000         }
03001 
03002         void GenerateBlock(byte *output, unsigned int size)
03003         {
03004                 UnalignedPutWord(BIG_ENDIAN_ORDER, m_counterAndSeed, m_counter);
03005                 ++m_counter;
03006                 P1363_KDF2<SHA1>::DeriveKey(output, size, m_counterAndSeed, m_counterAndSeed.size());
03007         }
03008 
03009 private:
03010         word32 m_counter;
03011         SecByteBlock m_counterAndSeed;
03012 };
03013 
03014 bool Integer::GenerateRandomNoThrow(RandomNumberGenerator &i_rng, const NameValuePairs &params)
03015 {
03016         Integer min = params.GetValueWithDefault("Min", Integer::Zero());
03017         Integer max;
03018         if (!params.GetValue("Max", max))
03019         {
03020                 int bitLength;
03021                 if (params.GetIntValue("BitLength", bitLength))
03022                         max = Integer::Power2(bitLength);
03023                 else
03024                         throw InvalidArgument("Integer: missing Max argument");
03025         }
03026         if (min > max)
03027                 throw InvalidArgument("Integer: Min must be no greater than Max");
03028 
03029         Integer equiv = params.GetValueWithDefault("EquivalentTo", Integer::Zero());
03030         Integer mod = params.GetValueWithDefault("Mod", Integer::One());
03031 
03032         if (equiv.IsNegative() || equiv >= mod)
03033                 throw InvalidArgument("Integer: invalid EquivalentTo and/or Mod argument");
03034 
03035         Integer::RandomNumberType rnType = params.GetValueWithDefault("RandomNumberType", Integer::ANY);
03036 
03037         member_ptr<KDF2_RNG> kdf2Rng;
03038         ConstByteArrayParameter seed;
03039         if (params.GetValue("Seed", seed))
03040         {
03041                 ByteQueue bq;
03042                 DERSequenceEncoder seq(bq);
03043                 min.DEREncode(seq);
03044                 max.DEREncode(seq);
03045                 equiv.DEREncode(seq);
03046                 mod.DEREncode(seq);
03047                 DEREncodeUnsigned(seq, rnType);
03048                 DEREncodeOctetString(seq, seed.begin(), seed.size());
03049                 seq.MessageEnd();
03050 
03051                 SecByteBlock finalSeed(bq.MaxRetrievable());
03052                 bq.Get(finalSeed, finalSeed.size());
03053                 kdf2Rng.reset(new KDF2_RNG(finalSeed.begin(), finalSeed.size()));
03054         }
03055         RandomNumberGenerator &rng = kdf2Rng.get() ? (RandomNumberGenerator &)*kdf2Rng : i_rng;
03056 
03057         switch (rnType)
03058         {
03059                 case ANY:
03060                         if (mod == One())
03061                                 Randomize(rng, min, max);
03062                         else
03063                         {
03064                                 Integer min1 = min + (equiv-min)%mod;
03065                                 if (max < min1)
03066                                         return false;
03067                                 Randomize(rng, Zero(), (max - min1) / mod);
03068                                 *this *= mod;
03069                                 *this += min1;
03070                         }
03071                         return true;
03072 
03073                 case PRIME:
03074                 {
03075                         const PrimeSelector *pSelector = params.GetValueWithDefault("PointerToPrimeSelector", (const PrimeSelector *)NULL);
03076 
03077                         int i;
03078                         i = 0;
03079                         while (1)
03080                         {
03081                                 if (++i==16)
03082                                 {
03083                                         // check if there are any suitable primes in [min, max]
03084                                         Integer first = min;
03085                                         if (FirstPrime(first, max, equiv, mod, pSelector))
03086                                         {
03087                                                 // if there is only one suitable prime, we're done
03088                                                 *this = first;
03089                                                 if (!FirstPrime(first, max, equiv, mod, pSelector))
03090                                                         return true;
03091                                         }
03092                                         else
03093                                                 return false;
03094                                 }
03095 
03096                                 Randomize(rng, min, max);
03097                                 if (FirstPrime(*this, STDMIN(*this+mod*PrimeSearchInterval(max), max), equiv, mod, pSelector))
03098                                         return true;
03099                         }
03100                 }
03101 
03102                 default:
03103                         throw InvalidArgument("Integer: invalid RandomNumberType argument");
03104         }
03105 }
03106 
03107 std::istream& operator>>(std::istream& in, Integer &a)
03108 {
03109         char c;
03110         unsigned int length = 0;
03111         SecBlock<char> str(length + 16);
03112 
03113         std::ws(in);
03114 
03115         do
03116         {
03117                 in.read(&c, 1);
03118                 str[length++] = c;
03119                 if (length >= str.size())
03120                         str.Grow(length + 16);
03121         }
03122         while (in && (c=='-' || c=='x' || (c>='0' && c<='9') || (c>='a' && c<='f') || (c>='A' && c<='F') || c=='h' || c=='H' || c=='o' || c=='O' || c==',' || c=='.'));
03123 
03124         if (in.gcount())
03125                 in.putback(c);
03126         str[length-1] = '\0';
03127         a = Integer(str);
03128 
03129         return in;
03130 }
03131 
03132 std::ostream& operator<<(std::ostream& out, const Integer &a)
03133 {
03134         // Get relevant conversion specifications from ostream.
03135         long f = out.flags() & std::ios::basefield; // Get base digits.
03136         int base, block;
03137         char suffix;
03138         switch(f)
03139         {
03140         case std::ios::oct :
03141                 base = 8;
03142                 block = 8;
03143                 suffix = 'o';
03144                 break;
03145         case std::ios::hex :
03146                 base = 16;
03147                 block = 4;
03148                 suffix = 'h';
03149                 break;
03150         default :
03151                 base = 10;
03152                 block = 3;
03153                 suffix = '.';
03154         }
03155 
03156         SecBlock<char> s(a.BitCount() / (BitPrecision(base)-1) + 1);
03157         Integer temp1=a, temp2;
03158         unsigned i=0;
03159         const char vec[]="0123456789ABCDEF";
03160 
03161         if (a.IsNegative())
03162         {
03163                 out << '-';
03164                 temp1.Negate();
03165         }
03166 
03167         if (!a)
03168                 out << '0';
03169 
03170         while (!!temp1)
03171         {
03172                 word digit;
03173                 Integer::Divide(digit, temp2, temp1, base);
03174                 s[i++]=vec[digit];
03175                 temp1=temp2;
03176         }
03177 
03178         while (i--)
03179         {
03180                 out << s[i];
03181 //              if (i && !(i%block))
03182 //                      out << ",";
03183         }
03184         return out << suffix;
03185 }
03186 
03187 Integer& Integer::operator++()
03188 {
03189         if (NotNegative())
03190         {
03191                 if (Increment(reg, reg.size()))
03192                 {
03193                         reg.CleanGrow(2*reg.size());
03194                         reg[reg.size()/2]=1;
03195                 }
03196         }
03197         else
03198         {
03199                 word borrow = Decrement(reg, reg.size());
03200                 assert(!borrow);
03201                 if (WordCount()==0)
03202                         *this = Zero();
03203         }
03204         return *this;
03205 }
03206 
03207 Integer& Integer::operator--()
03208 {
03209         if (IsNegative())
03210         {
03211                 if (Increment(reg, reg.size()))
03212                 {
03213                         reg.CleanGrow(2*reg.size());
03214                         reg[reg.size()/2]=1;
03215                 }
03216         }
03217         else
03218         {
03219                 if (Decrement(reg, reg.size()))
03220                         *this = -One();
03221         }
03222         return *this;
03223 }
03224 
03225 void PositiveAdd(Integer &sum, const Integer &a, const Integer& b)
03226 {
03227         word carry;
03228         if (a.reg.size() == b.reg.size())
03229                 carry = Add(sum.reg, a.reg, b.reg, a.reg.size());
03230         else if (a.reg.size() > b.reg.size())
03231         {
03232                 carry = Add(sum.reg, a.reg, b.reg, b.reg.size());
03233                 CopyWords(sum.reg+b.reg.size(), a.reg+b.reg.size(), a.reg.size()-b.reg.size());
03234                 carry = Increment(sum.reg+b.reg.size(), a.reg.size()-b.reg.size(), carry);
03235         }
03236         else
03237         {
03238                 carry = Add(sum.reg, a.reg, b.reg, a.reg.size());
03239                 CopyWords(sum.reg+a.reg.size(), b.reg+a.reg.size(), b.reg.size()-a.reg.size());
03240                 carry = Increment(sum.reg+a.reg.size(), b.reg.size()-a.reg.size(), carry);
03241         }
03242 
03243         if (carry)
03244         {
03245                 sum.reg.CleanGrow(2*sum.reg.size());
03246                 sum.reg[sum.reg.size()/2] = 1;
03247         }
03248         sum.sign = Integer::POSITIVE;
03249 }
03250 
03251 void PositiveSubtract(Integer &diff, const Integer &a, const Integer& b)
03252 {
03253         unsigned aSize = a.WordCount();
03254         aSize += aSize%2;
03255         unsigned bSize = b.WordCount();
03256         bSize += bSize%2;
03257 
03258         if (aSize == bSize)
03259         {
03260                 if (Compare(a.reg, b.reg, aSize) >= 0)
03261                 {
03262                         Subtract(diff.reg, a.reg, b.reg, aSize);
03263                         diff.sign = Integer::POSITIVE;
03264                 }
03265                 else
03266                 {
03267                         Subtract(diff.reg, b.reg, a.reg, aSize);
03268                         diff.sign = Integer::NEGATIVE;
03269                 }
03270         }
03271         else if (aSize > bSize)
03272         {
03273                 word borrow = Subtract(diff.reg, a.reg, b.reg, bSize);
03274                 CopyWords(diff.reg+bSize, a.reg+bSize, aSize-bSize);
03275                 borrow = Decrement(diff.reg+bSize, aSize-bSize, borrow);
03276                 assert(!borrow);
03277                 diff.sign = Integer::POSITIVE;
03278         }
03279         else
03280         {
03281                 word borrow = Subtract(diff.reg, b.reg, a.reg, aSize);
03282                 CopyWords(diff.reg+aSize, b.reg+aSize, bSize-aSize);
03283                 borrow = Decrement(diff.reg+aSize, bSize-aSize, borrow);
03284                 assert(!borrow);
03285                 diff.sign = Integer::NEGATIVE;
03286         }
03287 }
03288 
03289 Integer Integer::Plus(const Integer& b) const
03290 {
03291         Integer sum((word)0, STDMAX(reg.size(), b.reg.size()));
03292         if (NotNegative())
03293         {
03294                 if (b.NotNegative())
03295                         PositiveAdd(sum, *this, b);
03296                 else
03297                         PositiveSubtract(sum, *this, b);
03298         }
03299         else
03300         {
03301                 if (b.NotNegative())
03302                         PositiveSubtract(sum, b, *this);
03303                 else
03304                 {
03305                         PositiveAdd(sum, *this, b);
03306                         sum.sign = Integer::NEGATIVE;
03307                 }
03308         }
03309         return sum;
03310 }
03311 
03312 Integer& Integer::operator+=(const Integer& t)
03313 {
03314         reg.CleanGrow(t.reg.size());
03315         if (NotNegative())
03316         {
03317                 if (t.NotNegative())
03318                         PositiveAdd(*this, *this, t);
03319                 else
03320                         PositiveSubtract(*this, *this, t);
03321         }
03322         else
03323         {
03324                 if (t.NotNegative())
03325                         PositiveSubtract(*this, t, *this);
03326                 else
03327                 {
03328                         PositiveAdd(*this, *this, t);
03329                         sign = Integer::NEGATIVE;
03330                 }
03331         }
03332         return *this;
03333 }
03334 
03335 Integer Integer::Minus(const Integer& b) const
03336 {
03337         Integer diff((word)0, STDMAX(reg.size(), b.reg.size()));
03338         if (NotNegative())
03339         {
03340                 if (b.NotNegative())
03341                         PositiveSubtract(diff, *this, b);
03342                 else
03343                         PositiveAdd(diff, *this, b);
03344         }
03345         else
03346         {
03347                 if (b.NotNegative())
03348                 {
03349                         PositiveAdd(diff, *this, b);
03350                         diff.sign = Integer::NEGATIVE;
03351                 }
03352                 else
03353                         PositiveSubtract(diff, b, *this);
03354         }
03355         return diff;
03356 }
03357 
03358 Integer& Integer::operator-=(const Integer& t)
03359 {
03360         reg.CleanGrow(t.reg.size());
03361         if (NotNegative())
03362         {
03363                 if (t.NotNegative())
03364                         PositiveSubtract(*this, *this, t);
03365                 else
03366                         PositiveAdd(*this, *this, t);
03367         }
03368         else
03369         {
03370                 if (t.NotNegative())
03371                 {
03372                         PositiveAdd(*this, *this, t);
03373                         sign = Integer::NEGATIVE;
03374                 }
03375                 else
03376                         PositiveSubtract(*this, t, *this);
03377         }
03378         return *this;
03379 }
03380 
03381 Integer& Integer::operator<<=(unsigned int n)
03382 {
03383         const unsigned int wordCount = WordCount();
03384         const unsigned int shiftWords = n / WORD_BITS;
03385         const unsigned int shiftBits = n % WORD_BITS;
03386 
03387         reg.CleanGrow(RoundupSize(wordCount+BitsToWords(n)));
03388         ShiftWordsLeftByWords(reg, wordCount + shiftWords, shiftWords);
03389         ShiftWordsLeftByBits(reg+shiftWords, wordCount+BitsToWords(shiftBits), shiftBits);
03390         return *this;
03391 }
03392 
03393 Integer& Integer::operator>>=(unsigned int n)
03394 {
03395         const unsigned int wordCount = WordCount();
03396         const unsigned int shiftWords = n / WORD_BITS;
03397         const unsigned int shiftBits = n % WORD_BITS;
03398 
03399         ShiftWordsRightByWords(reg, wordCount, shiftWords);
03400         if (wordCount > shiftWords)
03401                 ShiftWordsRightByBits(reg, wordCount-shiftWords, shiftBits);
03402         if (IsNegative() && WordCount()==0)   // avoid -0
03403                 *this = Zero();
03404         return *this;
03405 }
03406 
03407 void PositiveMultiply(Integer &product, const Integer &a, const Integer &b)
03408 {
03409         unsigned aSize = RoundupSize(a.WordCount());
03410         unsigned bSize = RoundupSize(b.WordCount());
03411 
03412         product.reg.CleanNew(RoundupSize(aSize+bSize));
03413         product.sign = Integer::POSITIVE;
03414 
03415         SecAlignedWordBlock workspace(aSize + bSize);
03416         AsymmetricMultiply(product.reg, workspace, a.reg, aSize, b.reg, bSize);
03417 }
03418 
03419 void Multiply(Integer &product, const Integer &a, const Integer &b)
03420 {
03421         PositiveMultiply(product, a, b);
03422 
03423         if (a.NotNegative() != b.NotNegative())
03424                 product.Negate();
03425 }
03426 
03427 Integer Integer::Times(const Integer &b) const
03428 {
03429         Integer product;
03430         Multiply(product, *this, b);
03431         return product;
03432 }
03433 
03434 /*
03435 void PositiveDivide(Integer &remainder, Integer &quotient,
03436                                    const Integer &dividend, const Integer &divisor)
03437 {
03438         remainder.reg.CleanNew(divisor.reg.size());
03439         remainder.sign = Integer::POSITIVE;
03440         quotient.reg.New(0);
03441         quotient.sign = Integer::POSITIVE;
03442         unsigned i=dividend.BitCount();
03443         while (i--)
03444         {
03445                 word overflow = ShiftWordsLeftByBits(remainder.reg, remainder.reg.size(), 1);
03446                 remainder.reg[0] |= dividend[i];
03447                 if (overflow || remainder >= divisor)
03448                 {
03449                         Subtract(remainder.reg, remainder.reg, divisor.reg, remainder.reg.size());
03450                         quotient.SetBit(i);
03451                 }
03452         }
03453 }
03454 */
03455 
03456 void PositiveDivide(Integer &remainder, Integer &quotient,
03457                                    const Integer &a, const Integer &b)
03458 {
03459         unsigned aSize = a.WordCount();
03460         unsigned bSize = b.WordCount();
03461 
03462         if (!bSize)
03463                 throw Integer::DivideByZero();
03464 
03465         if (a.PositiveCompare(b) == -1)
03466         {
03467                 remainder = a;
03468                 remainder.sign = Integer::POSITIVE;
03469                 quotient = Integer::Zero();
03470                 return;
03471         }
03472 
03473         aSize += aSize%2;       // round up to next even number
03474         bSize += bSize%2;
03475 
03476         remainder.reg.CleanNew(RoundupSize(bSize));
03477         remainder.sign = Integer::POSITIVE;
03478         quotient.reg.CleanNew(RoundupSize(aSize-bSize+2));
03479         quotient.sign = Integer::POSITIVE;
03480 
03481         SecAlignedWordBlock T(aSize+2*bSize+4);
03482         Divide(remainder.reg, quotient.reg, T, a.reg, aSize, b.reg, bSize);
03483 }
03484 
03485 void Integer::Divide(Integer &remainder, Integer &quotient, const Integer &dividend, const Integer &divisor)
03486 {
03487         PositiveDivide(remainder, quotient, dividend, divisor);
03488 
03489         if (dividend.IsNegative())
03490         {
03491                 quotient.Negate();
03492                 if (remainder.NotZero())
03493                 {
03494                         --quotient;
03495                         remainder = divisor.AbsoluteValue() - remainder;
03496                 }
03497         }
03498 
03499         if (divisor.IsNegative())
03500                 quotient.Negate();
03501 }
03502 
03503 void Integer::DivideByPowerOf2(Integer &r, Integer &q, const Integer &a, unsigned int n)
03504 {
03505         q = a;
03506         q >>= n;
03507 
03508         const unsigned int wordCount = BitsToWords(n);
03509         if (wordCount <= a.WordCount())
03510         {
03511                 r.reg.resize(RoundupSize(wordCount));
03512                 CopyWords(r.reg, a.reg, wordCount);
03513                 SetWords(r.reg+wordCount, 0, r.reg.size()-wordCount);
03514                 if (n % WORD_BITS != 0)
03515                         r.reg[wordCount-1] %= (1 << (n % WORD_BITS));
03516         }
03517         else
03518         {
03519                 r.reg.resize(RoundupSize(a.WordCount()));
03520                 CopyWords(r.reg, a.reg, r.reg.size());
03521         }
03522         r.sign = POSITIVE;
03523 
03524         if (a.IsNegative() && r.NotZero())
03525         {
03526                 --q;
03527                 r = Power2(n) - r;
03528         }
03529 }
03530 
03531 Integer Integer::DividedBy(const Integer &b) const
03532 {
03533         Integer remainder, quotient;
03534         Integer::Divide(remainder, quotient, *this, b);
03535         return quotient;
03536 }
03537 
03538 Integer Integer::Modulo(const Integer &b) const
03539 {
03540         Integer remainder, quotient;
03541         Integer::Divide(remainder, quotient, *this, b);
03542         return remainder;
03543 }
03544 
03545 void Integer::Divide(word &remainder, Integer &quotient, const Integer &dividend, word divisor)
03546 {
03547         if (!divisor)
03548                 throw Integer::DivideByZero();
03549 
03550         assert(divisor);
03551 
03552         if ((divisor & (divisor-1)) == 0)       // divisor is a power of 2
03553         {
03554                 quotient = dividend >> (BitPrecision(divisor)-1);
03555                 remainder = dividend.reg[0] & (divisor-1);
03556                 return;
03557         }
03558 
03559         unsigned int i = dividend.WordCount();
03560         quotient.reg.CleanNew(RoundupSize(i));
03561         remainder = 0;
03562         while (i--)
03563         {
03564                 quotient.reg[i] = word(MAKE_DWORD(dividend.reg[i], remainder) / divisor);
03565                 remainder = word(MAKE_DWORD(dividend.reg[i], remainder) % divisor);
03566         }
03567 
03568         if (dividend.NotNegative())
03569                 quotient.sign = POSITIVE;
03570         else
03571         {
03572                 quotient.sign = NEGATIVE;
03573                 if (remainder)
03574                 {
03575                         --quotient;
03576                         remainder = divisor - remainder;
03577                 }
03578         }
03579 }
03580 
03581 Integer Integer::DividedBy(word b) const
03582 {
03583         word remainder;
03584         Integer quotient;
03585         Integer::Divide(remainder, quotient, *this, b);
03586         return quotient;
03587 }
03588 
03589 word Integer::Modulo(word divisor) const
03590 {
03591         if (!divisor)
03592                 throw Integer::DivideByZero();
03593 
03594         assert(divisor);
03595 
03596         word remainder;
03597 
03598         if ((divisor & (divisor-1)) == 0)       // divisor is a power of 2
03599                 remainder = reg[0] & (divisor-1);
03600         else
03601         {
03602                 unsigned int i = WordCount();
03603 
03604                 if (divisor <= 5)
03605                 {
03606                         dword sum=0;
03607                         while (i--)
03608                                 sum += reg[i];
03609                         remainder = word(sum%divisor);
03610                 }
03611                 else
03612                 {
03613                         remainder = 0;
03614                         while (i--)
03615                                 remainder = word(MAKE_DWORD(reg[i], remainder) % divisor);
03616                 }
03617         }
03618 
03619         if (IsNegative() && remainder)
03620                 remainder = divisor - remainder;
03621 
03622         return remainder;
03623 }
03624 
03625 void Integer::Negate()
03626 {
03627         if (!!(*this))  // don't flip sign if *this==0
03628                 sign = Sign(1-sign);
03629 }
03630 
03631 int Integer::PositiveCompare(const Integer& t) const
03632 {
03633         unsigned size = WordCount(), tSize = t.WordCount();
03634 
03635         if (size == tSize)
03636                 return CryptoPP::Compare(reg, t.reg, size);
03637         else
03638                 return size > tSize ? 1 : -1;
03639 }
03640 
03641 int Integer::Compare(const Integer& t) const
03642 {
03643         if (NotNegative())
03644         {
03645                 if (t.NotNegative())
03646                         return PositiveCompare(t);
03647                 else
03648                         return 1;
03649         }
03650         else
03651         {
03652                 if (t.NotNegative())
03653                         return -1;
03654                 else
03655                         return -PositiveCompare(t);
03656         }
03657 }
03658 
03659 Integer Integer::SquareRoot() const
03660 {
03661         if (!IsPositive())
03662                 return Zero();
03663 
03664         // overestimate square root
03665         Integer x, y = Power2((BitCount()+1)/2);
03666         assert(y*y >= *this);
03667 
03668         do
03669         {
03670                 x = y;
03671                 y = (x + *this/x) >> 1;
03672         } while (y<x);
03673 
03674         return x;
03675 }
03676 
03677 bool Integer::IsSquare() const
03678 {
03679         Integer r = SquareRoot();
03680         return *this == r.Squared();
03681 }
03682 
03683 bool Integer::IsUnit() const
03684 {
03685         return (WordCount() == 1) && (reg[0] == 1);
03686 }
03687 
03688 Integer Integer::MultiplicativeInverse() const
03689 {
03690         return IsUnit() ? *this : Zero();
03691 }
03692 
03693 Integer a_times_b_mod_c(const Integer &x, const Integer& y, const Integer& m)
03694 {
03695         return x*y%m;
03696 }
03697 
03698 Integer a_exp_b_mod_c(const Integer &x, const Integer& e, const Integer& m)
03699 {
03700         ModularArithmetic mr(m);
03701         return mr.Exponentiate(x, e);
03702 }
03703 
03704 Integer Integer::Gcd(const Integer &a, const Integer &b)
03705 {
03706         return EuclideanDomainOf<Integer>().Gcd(a, b);
03707 }
03708 
03709 Integer Integer::InverseMod(const Integer &m) const
03710 {
03711         assert(m.NotNegative());
03712 
03713         if (IsNegative() || *this>=m)
03714                 return (*this%m).InverseMod(m);
03715 
03716         if (m.IsEven())
03717         {
03718                 if (!m || IsEven())
03719                         return Zero();  // no inverse
03720                 if (*this == One())
03721                         return One();
03722 
03723                 Integer u = m.InverseMod(*this);
03724                 return !u ? Zero() : (m*(*this-u)+1)/(*this);
03725         }
03726 
03727         SecBlock<word> T(m.reg.size() * 4);
03728         Integer r((word)0, m.reg.size());
03729         unsigned k = AlmostInverse(r.reg, T, reg, reg.size(), m.reg, m.reg.size());
03730         DivideByPower2Mod(r.reg, r.reg, k, m.reg, m.reg.size());
03731         return r;
03732 }
03733 
03734 word Integer::InverseMod(const word mod) const
03735 {
03736         word g0 = mod, g1 = *this % mod;
03737         word v0 = 0, v1 = 1;
03738         word y;
03739 
03740         while (g1)
03741         {
03742                 if (g1 == 1)
03743                         return v1;
03744                 y = g0 / g1;
03745                 g0 = g0 % g1;
03746                 v0 += y * v1;
03747 
03748                 if (!g0)
03749                         break;
03750                 if (g0 == 1)
03751                         return mod-v0;
03752                 y = g1 / g0;
03753                 g1 = g1 % g0;
03754                 v1 += y * v0;
03755         }
03756         return 0;
03757 }
03758 
03759 // ********************************************************
03760 
03761 ModularArithmetic::ModularArithmetic(BufferedTransformation &bt)
03762 {
03763         BERSequenceDecoder seq(bt);
03764         OID oid(seq);
03765         if (oid != ASN1::prime_field())
03766                 BERDecodeError();
03767         modulus.BERDecode(seq);
03768         seq.MessageEnd();
03769         result.reg.resize(modulus.reg.size());
03770 }
03771 
03772 void ModularArithmetic::DEREncode(BufferedTransformation &bt) const
03773 {
03774         DERSequenceEncoder seq(bt);
03775         ASN1::prime_field().DEREncode(seq);
03776         modulus.DEREncode(seq);
03777         seq.MessageEnd();
03778 }
03779 
03780 void ModularArithmetic::DEREncodeElement(BufferedTransformation &out, const Element &a) const
03781 {
03782         a.DEREncodeAsOctetString(out, MaxElementByteLength());
03783 }
03784 
03785 void ModularArithmetic::BERDecodeElement(BufferedTransformation &in, Element &a) const
03786 {
03787         a.BERDecodeAsOctetString(in, MaxElementByteLength());
03788 }
03789 
03790 const Integer& ModularArithmetic::Half(const Integer &a) const
03791 {
03792         if (a.reg.size()==modulus.reg.size())
03793         {
03794                 CryptoPP::DivideByPower2Mod(result.reg.begin(), a.reg, 1, modulus.reg, a.reg.size());
03795                 return result;
03796         }
03797         else
03798                 return result1 = (a.IsEven() ? (a >> 1) : ((a+modulus) >> 1));
03799 }
03800 
03801 const Integer& ModularArithmetic::Add(const Integer &a, const Integer &b) const
03802 {
03803         if (a.reg.size()==modulus.reg.size() && b.reg.size()==modulus.reg.size())
03804         {
03805                 if (CryptoPP::Add(result.reg.begin(), a.reg, b.reg, a.reg.size())
03806                         || Compare(result.reg, modulus.reg, a.reg.size()) >= 0)
03807                 {
03808                         CryptoPP::Subtract(result.reg.begin(), result.reg, modulus.reg, a.reg.size());
03809                 }
03810                 return result;
03811         }
03812         else
03813         {
03814                 result1 = a+b;
03815                 if (result1 >= modulus)
03816                         result1 -= modulus;
03817                 return result1;
03818         }
03819 }
03820 
03821 Integer& ModularArithmetic::Accumulate(Integer &a, const Integer &b) const
03822 {
03823         if (a.reg.size()==modulus.reg.size() && b.reg.size()==modulus.reg.size())
03824         {
03825                 if (CryptoPP::Add(a.reg, a.reg, b.reg, a.reg.size())
03826                         || Compare(a.reg, modulus.reg, a.reg.size()) >= 0)
03827                 {
03828                         CryptoPP::Subtract(a.reg, a.reg, modulus.reg, a.reg.size());
03829                 }
03830         }
03831         else
03832         {
03833                 a+=b;
03834                 if (a>=modulus)
03835                         a-=modulus;
03836         }
03837 
03838         return a;
03839 }
03840 
03841 const Integer& ModularArithmetic::Subtract(const Integer &a, const Integer &b) const
03842 {
03843         if (a.reg.size()==modulus.reg.size() && b.reg.size()==modulus.reg.size())
03844         {
03845                 if (CryptoPP::Subtract(result.reg.begin(), a.reg, b.reg, a.reg.size()))
03846                         CryptoPP::Add(result.reg.begin(), result.reg, modulus.reg, a.reg.size());
03847                 return result;
03848         }
03849         else
03850         {
03851                 result1 = a-b;
03852                 if (result1.IsNegative())
03853                         result1 += modulus;
03854                 return result1;
03855         }
03856 }
03857 
03858 Integer& ModularArithmetic::Reduce(Integer &a, const Integer &b) const
03859 {
03860         if (a.reg.size()==modulus.reg.size() && b.reg.size()==modulus.reg.size())
03861         {
03862                 if (CryptoPP::Subtract(a.reg, a.reg, b.reg, a.reg.size()))
03863                         CryptoPP::Add(a.reg, a.reg, modulus.reg, a.reg.size());
03864         }
03865         else
03866         {
03867                 a-=b;
03868                 if (a.IsNegative())
03869                         a+=modulus;
03870         }
03871 
03872         return a;
03873 }
03874 
03875 const Integer& ModularArithmetic::Inverse(const Integer &a) const
03876 {
03877         if (!a)
03878                 return a;
03879 
03880         CopyWords(result.reg.begin(), modulus.reg, modulus.reg.size());
03881         if (CryptoPP::Subtract(result.reg.begin(), result.reg, a.reg, a.reg.size()))
03882                 Decrement(result.reg.begin()+a.reg.size(), 1, modulus.reg.size()-a.reg.size());
03883 
03884         return result;
03885 }
03886 
03887 Integer ModularArithmetic::CascadeExponentiate(const Integer &x, const Integer &e1, const Integer &y, const Integer &e2) const
03888 {
03889         if (modulus.IsOdd())
03890         {
03891                 MontgomeryRepresentation dr(modulus);
03892                 return dr.ConvertOut(dr.CascadeExponentiate(dr.ConvertIn(x), e1, dr.ConvertIn(y), e2));
03893         }
03894         else
03895                 return AbstractRing<Integer>::CascadeExponentiate(x, e1, y, e2);
03896 }
03897 
03898 void ModularArithmetic::SimultaneousExponentiate(Integer *results, const Integer &base, const Integer *exponents, unsigned int exponentsCount) const
03899 {
03900         if (modulus.IsOdd())
03901         {
03902                 MontgomeryRepresentation dr(modulus);
03903                 dr.SimultaneousExponentiate(results, dr.ConvertIn(base), exponents, exponentsCount);
03904                 for (unsigned int i=0; i<exponentsCount; i++)
03905                         results[i] = dr.ConvertOut(results[i]);
03906         }
03907         else
03908                 AbstractRing<Integer>::SimultaneousExponentiate(results, base, exponents, exponentsCount);
03909 }
03910 
03911 MontgomeryRepresentation::MontgomeryRepresentation(const Integer &m)    // modulus must be odd
03912         : ModularArithmetic(m),
03913           u((word)0, modulus.reg.size()),
03914           workspace(5*modulus.reg.size())
03915 {
03916         if (!modulus.IsOdd())
03917                 throw InvalidArgument("MontgomeryRepresentation: Montgomery representation requires an odd modulus");
03918 
03919         RecursiveInverseModPower2(u.reg, workspace, modulus.reg, modulus.reg.size());
03920 }
03921 
03922 const Integer& MontgomeryRepresentation::Multiply(const Integer &a, const Integer &b) const
03923 {
03924         word *const T = workspace.begin();
03925         word *const R = result.reg.begin();
03926         const unsigned int N = modulus.reg.size();
03927         assert(a.reg.size()<=N && b.reg.size()<=N);
03928 
03929         AsymmetricMultiply(T, T+2*N, a.reg, a.reg.size(), b.reg, b.reg.size());
03930         SetWords(T+a.reg.size()+b.reg.size(), 0, 2*N-a.reg.size()-b.reg.size());
03931         MontgomeryReduce(R, T+2*N, T, modulus.reg, u.reg, N);
03932         return result;
03933 }
03934 
03935 const Integer& MontgomeryRepresentation::Square(const Integer &a) const
03936 {
03937         word *const T = workspace.begin();
03938         word *const R = result.reg.begin();
03939         const unsigned int N = modulus.reg.size();
03940         assert(a.reg.size()<=N);
03941 
03942         CryptoPP::Square(T, T+2*N, a.reg, a.reg.size());
03943         SetWords(T+2*a.reg.size(), 0, 2*N-2*a.reg.size());
03944         MontgomeryReduce(R, T+2*N, T, modulus.reg, u.reg, N);
03945         return result;
03946 }
03947 
03948 Integer MontgomeryRepresentation::ConvertOut(const Integer &a) const
03949 {
03950         word *const T = workspace.begin();
03951         word *const R = result.reg.begin();
03952         const unsigned int N = modulus.reg.size();
03953         assert(a.reg.size()<=N);
03954 
03955         CopyWords(T, a.reg, a.reg.size());
03956         SetWords(T+a.reg.size(), 0, 2*N-a.reg.size());
03957         MontgomeryReduce(R, T+2*N, T, modulus.reg, u.reg, N);
03958         return result;
03959 }
03960 
03961 const Integer& MontgomeryRepresentation::MultiplicativeInverse(const Integer &a) const
03962 {
03963 //        return (EuclideanMultiplicativeInverse(a, modulus)<<(2*WORD_BITS*modulus.reg.size()))%modulus;
03964         word *const T = workspace.begin();
03965         word *const R = result.reg.begin();
03966         const unsigned int N = modulus.reg.size();
03967         assert(a.reg.size()<=N);
03968 
03969         CopyWords(T, a.reg, a.reg.size());
03970         SetWords(T+a.reg.size(), 0, 2*N-a.reg.size());
03971         MontgomeryReduce(R, T+2*N, T, modulus.reg, u.reg, N);
03972         unsigned k = AlmostInverse(R, T, R, N, modulus.reg, N);
03973 
03974 //      cout << "k=" << k << " N*32=" << 32*N << endl;
03975 
03976         if (k>N*WORD_BITS)
03977                 DivideByPower2Mod(R, R, k-N*WORD_BITS, modulus.reg, N);
03978         else
03979                 MultiplyByPower2Mod(R, R, N*WORD_BITS-k, modulus.reg, N);
03980 
03981         return result;
03982 }
03983 
03984 NAMESPACE_END
03985 
03986 #endif

Generated on Tue Jul 8 23:34:18 2003 for Crypto++ by doxygen 1.3.2