/* MODIFIED 8/9/98 by Steven G. Johnson (stevenj@alum.mit.edu) to
   extract only the FFT code for use in benchFFT (see also
   http://theory.lcs.mit.edu/~benchfft).

   Also changed to use the benchmark floating-point & integer types and 
   to allow externally-specified storage for trig tables.

   Original code was downloaded from: http://fractal.mta.ca/prime/freeware/
   New URL is: http://www2.netdoor.com/~acurry/mersenne/freeware.html
   Or really: http://users.delta.com/mersenne/lucas.c
*/

#include <fftw.h>

/*  lucas.c - Discrete Weighted Transform, irrational base method for
              Lucas-Lehmer Mersenne test.

    References:

    Crandall R E and Fagin B 1994; "Discrete Weighted Transforms
            and Large-Integer Arithmetic," Math. Comp. 62, 205, 305-324
    Crandall R E 1995; "Topics in Advanced Scientific Computation,"
            TELOS/Springer-Verlag

    Usage:

    % lucas q N [n] [err]

    where q = Mersenne exponent, N = fft run-length, n = Number of
    Lucas iterations (or 0 for full test), err = 1 to report maximum
    convolution errors.

    N can be any power-of-two for which two conditions are met:
    q/N < 32, and the maximum convolution error is < 0.5 for every
    Lucas-Lehmer iteration.  An example test is:

    % lucas 521 32 0

    which will eventually output a line:
    521 0
    proving that 2^521-1 is prime.  One should use the minimum N
    allowed under the constraints, to achieve maximum performance.

    c. 1995 R. E. Crandall, All Rights Reserved

 */

#include <stdlib.h>
#include <stdio.h>
#include <math.h>

/* Numeric constants used by the routines below.  Each one is cast to
   FFTW_REAL so that arithmetic with it is carried out in that type. */
/* 2*pi: crandall_init_fft() divides this by n to get the table angle step. */
#define TWOPI (FFTW_REAL)(2*3.1415926535897932384626433)
/* sqrt(1/2): multiplier in the forward transform's n8-offset butterfly. */
#define SQRTHALF (FFTW_REAL)(0.707106781186547524400844362104)
/* sqrt(2): multiplier in the inverse transform's n8-offset butterfly. */
#define SQRT2 (FFTW_REAL)(1.414213562373095048801688724209)

/* Tables assigned by crandall_init_fft(); they point into caller-supplied
   storage rather than memory owned by this file.
     cn[j], sn[j] - cos and sin of j*TWOPI/n, read by both transforms.
     scrambled    - points at storage + 2*n; assigned, but not otherwise
                    referenced in the code visible here. */
static FFTW_REAL *cn, *sn, *scrambled;
/* permute is the index table filled in by crandall_init_scramble_real().
   NOTE(review): b and c are not referenced anywhere in the visible code;
   presumably leftovers from the original lucas.c - confirm before removing. */
static int b, c, *permute;

/* Fill the file-level permute[] table with the bit-reversal permutation
   of the indices 0..n-1.

   n  number of entries to write.  The result is a bit-reversal table only
      when n is a power of two, which is the only kind of length the
      transforms in this file are written for.

   permute must already point at storage for at least n ints;
   crandall_init_fft() sets it before calling this routine. */
void crandall_init_scramble_real(int n)
{
        int i, j, k;
        int halfn = n >> 1;
        int tmp;

        /* Start from the identity so the swaps below produce the table. */
        for (i = 0; i < n; ++i)
                permute[i] = i;

        /* i counts normally while j counts in bit-reversed order; each
           pair (i, j) is swapped once, when i < j. */
        for (i = 0, j = 0; i < n - 1; i++) {
                if (i < j) {
                        tmp = permute[i];
                        permute[i] = permute[j];
                        permute[j] = tmp;
                }

                /* Bit-reversed increment of j: clear set bits from the top
                   down, then set the first clear one.  The k > 0 test never
                   fires for a power-of-two n; it is there so that any other
                   n terminates instead of spinning forever once k has been
                   shifted down to zero. */
                k = halfn;
                while (k > 0 && k <= j) {
                        j -= k;
                        k >>= 1;
                }
                j += k;
        }
}

/* Set up the lookup tables used by the transforms in this file.

   n         transform length.
   storage   caller-owned array of at least 3*n FFTW_REAL elements:
             [0, n) becomes the cosine table cn[], [n, 2*n) the sine table
             sn[], and scrambled is pointed at [2*n, 3*n).
   istorage  caller-owned array of at least n ints; becomes permute[].

   SGJ: the tables live in this externally supplied storage instead of
   being malloc'ed here as in the original lucas.c.  Only the pointers are
   kept (in file-level statics), so both arrays must stay valid for as long
   as the transforms are used, and a later call replaces the tables of an
   earlier one. */
void crandall_init_fft(int n, FFTW_REAL *storage, int *istorage)
{
        int j;
        FFTW_REAL e = TWOPI/n;  /* angle step between table entries */

        cn = storage;
        sn = storage + n;
        for (j = 0; j < n; j++) {
                cn[j] = cos(e*j);
                sn[j] = sin(e*j);
        }

        permute = istorage;
        scrambled = storage + 2*n;
        crandall_init_scramble_real(n);
}

/* In-place transform of the n real values in z.

   Output is {Re(z^[0]),...,Re(z^[n/2]),Im(z^[n/2-1]),...,Im(z^[1])}.
   This is a decimation-in-time, split-radix algorithm.

   z  array of n values; overwritten with the result.
   n  transform length.  The index arithmetic (masking with n-1, strides
      that are repeatedly shifted) is written for a power of two.

   The twiddle factors are read from the cn[]/sn[] tables, so
   crandall_init_fft() must have been called for this n beforehand. */
void crandall_fft_real_to_hermitian(FFTW_REAL *z, int n)
{
        int n4;
        FFTW_REAL *x;
        FFTW_REAL cc1, ss1, cc3, ss3;
        int i1, i2, i3, i4, i5, i6, i7, i8,
            a, a3, dil;
        FFTW_REAL t1, t2, t3, t4, t5, t6;
        FFTW_REAL e;
        int nn = n>>1, nminus = n-1, is, id;
        int n2, n8, i, j;

        /* A transform of length 1 leaves its input unchanged.  Without
           this guard the first pass below would pair z[0] with z[1],
           which lies outside a one-element array. */
        if (n < 2) return;

        /* 1-based indexing (FORTRAN compatibility): x[k] is z[k-1].
           NOTE(review): x itself points one element before z, which ISO C
           does not strictly allow even though it is never dereferenced
           there. */
        x = z-1;

        /* First pass: replace selected adjacent pairs by their sum and
           difference. */
        is = 1;
        id = 4;
        do{
           for(i2=is;i2<=n;i2+=id) {
                i1 = i2+1;
                e = x[i2];
                x[i2] = e + x[i1];
                x[i1] = e - x[i1];
           }
           is = (id<<1)-1;
           id <<= 2;
        } while(is<n);

        /* Remaining passes: n2 takes the values 4, 8, ..., n. */
        n2 = 2;
        while(nn>>=1) {
                n2 <<= 1;
                n4 = n2>>2;
                n8 = n2>>3;
                is = 0;
                id = n2<<1;
                /* Butterflies that need no table lookup: offset 1, and
                   (when n4 > 1) offset 1+n8, which only uses SQRTHALF. */
                do {
                        for(i=is;i<n;i+=id) {
                                i1 = i+1;
                                i2 = i1 + n4;
                                i3 = i2 + n4;
                                i4 = i3 + n4;
                                t1 = x[i4]+x[i3];
                                x[i4] -= x[i3];
                                x[i3] = x[i1] - t1;
                                x[i1] += t1;
                                if(n4==1) continue;
                                i1 += n8;
                                i2 += n8;
                                i3 += n8;
                                i4 += n8;
                                t1 = (x[i3]+x[i4])*SQRTHALF;
                                t2 = (x[i3]-x[i4])*SQRTHALF;
                                x[i4] = x[i2] - t1;
                                x[i3] = -x[i2] - t1;
                                x[i2] = x[i1] - t2;
                                x[i1] += t2;
                        }
                        is = (id<<1) - n2;
                        id <<= 2;
                } while(is<n);
                /* General butterflies for offsets j = 2..n8.  a steps
                   through the tables in units of dil; a3 is 3*a reduced
                   modulo n by the mask (n a power of two). */
                dil = n/n2;
                a = dil;
                for(j=2;j<=n8;j++) {
                        a3 = (a+(a<<1))&(nminus);
                        cc1 = cn[a];
                        ss1 = sn[a];
                        cc3 = cn[a3];
                        ss3 = sn[a3];
                        a = (a+dil)&(nminus);
                        is = 0;
                        id = n2<<1;
                        do {
                                for(i=is;i<n;i+=id) {
                                        i1 = i+j;
                                        i2 = i1 + n4;
                                        i3 = i2 + n4;
                                        i4 = i3 + n4;
                                        i5 = i + n4 - j + 2;
                                        i6 = i5 + n4;
                                        i7 = i6 + n4;
                                        i8 = i7 + n4;
                                        t1 = x[i3]*cc1 + x[i7]*ss1;
                                        t2 = x[i7]*cc1 - x[i3]*ss1;
                                        t3 = x[i4]*cc3 + x[i8]*ss3;
                                        t4 = x[i8]*cc3 - x[i4]*ss3;
                                        t5 = t1 + t3;
                                        t6 = t2 + t4;
                                        t3 = t1 - t3;
                                        t4 = t2 - t4;
                                        t2 = x[i6] + t6;
                                        x[i3] = t6 - x[i6];
                                        x[i8] = t2;
                                        t2 = x[i2] - t3;
                                        x[i7] = -x[i2] - t3;
                                        x[i4] = t2;
                                        t1 = x[i1] + t5;
                                        x[i6] = x[i1] - t5;
                                        x[i1] = t1;
                                        t1 = x[i5] + t4;
                                        x[i5] -= t4;
                                        x[i2] = t1;
                                }
                                is = (id<<1) - n2;
                                id <<= 2;
                        } while(is<n);
                }
        }
}

/* In-place inverse of the transform above, including the 1/n scaling.

   Input is {Re(z^[0]),...,Re(z^[n/2]),Im(z^[n/2-1]),...,Im(z^[1])}.
   This is a decimation-in-frequency, split-radix algorithm.

   z  array of n values; overwritten with the result, which is multiplied
      by 1/n in the final loop.
   n  transform length.  The index arithmetic (masking with n-1, strides
      that are repeatedly shifted) is written for a power of two.

   The twiddle factors are read from the cn[]/sn[] tables, so
   crandall_init_fft() must have been called for this n beforehand. */
void crandall_fftinv_hermitian_to_real(FFTW_REAL *z, int n)
{
        int n4;
        FFTW_REAL cc1, ss1, cc3, ss3;
        FFTW_REAL t1, t2, t3, t4, t5;
        FFTW_REAL *x;
        int n8, i1, i2, i3, i4, i5, i6, i7, i8,
            a, a3, dil;
        FFTW_REAL e;
        int nn = n>>1, nminus = n-1, is, id;
        int n2, i, j;

        /* A transform of length 1 leaves its input unchanged (the scale
           factor is 1).  Without this guard the last pass below would pair
           z[0] with z[1], which lies outside a one-element array. */
        if (n < 2) return;

        /* 1-based indexing: x[k] is z[k-1].
           NOTE(review): x itself points one element before z, which ISO C
           does not strictly allow even though it is never dereferenced
           there. */
        x = z-1;

        /* Main passes: n2 takes the values n, n/2, ..., 4. */
        n2 = n<<1;
        while(nn >>= 1) {
                is = 0;
                id = n2;
                n2 >>= 1;
                n4 = n2>>2;
                n8 = n4>>1;
                /* Butterflies that need no table lookup: offset 1, and
                   (when n4 > 1) offset 1+n8, which only uses SQRT2. */
                do {
                        for(i=is;i<n;i+=id) {
                                i1 = i+1;
                                i2 = i1 + n4;
                                i3 = i2 + n4;
                                i4 = i3 + n4;
                                t1 = x[i1] - x[i3];
                                x[i1] += x[i3];
                                x[i2] += x[i2];
                                x[i3] = t1 - x[i4] - x[i4];
                                x[i4] = t1 + x[i4] + x[i4];
                                if(n4==1) continue;
                                i1 += n8;
                                i2 += n8;
                                i3 += n8;
                                i4 += n8;
                                t1 = x[i2]-x[i1];
                                t2 = x[i4]+x[i3];
                                x[i1] += x[i2];
                                x[i2] = x[i4]-x[i3];
                                x[i3] = -SQRT2*(t2+t1);
                                x[i4] = SQRT2*(t1-t2);
                        }
                        is = (id<<1) - n2;
                        id <<= 2;
                } while(is<nminus);
                /* General butterflies for offsets j = 2..n8.  a steps
                   through the tables in units of dil; a3 is 3*a reduced
                   modulo n by the mask (n a power of two). */
                dil = n/n2;
                a = dil;
                for(j=2;j<=n8;j++) {
                        a3 = (a+(a<<1))&(nminus);
                        cc1 = cn[a];
                        ss1 = sn[a];
                        cc3 = cn[a3];
                        ss3 = sn[a3];
                        a = (a+dil)&(nminus);
                        is = 0;
                        id = n2<<1;
                        do {
                           for(i=is;i<n;i+=id) {
                                i1 = i+j;
                                i2 = i1+n4;
                                i3 = i2+n4;
                                i4 = i3+n4;
                                i5 = i+n4-j+2;
                                i6 = i5+n4;
                                i7 = i6+n4;
                                i8 = i7+n4;
                                t1 = x[i1] - x[i6];
                                x[i1] += x[i6];
                                t2 = x[i5] - x[i2];
                                x[i5] += x[i2];
                                t3 = x[i8] + x[i3];
                                x[i6] = x[i8] - x[i3];
                                t4 = x[i4] + x[i7];
                                x[i2] = x[i4] - x[i7];
                                t5 = t1 - t4;
                                t1 += t4;
                                t4 = t2 - t3;
                                t2 += t3;
                                x[i3] = t5*cc1 + t4*ss1;
                                x[i7] = -t4*cc1 + t5*ss1;
                                x[i4] = t1*cc3 - t2*ss3;
                                x[i8] = t2*cc3 + t1*ss3;
                           }
                           is = (id<<1) - n2;
                           id <<= 2;
                        } while(is<nminus);
                }
        }

        /* Last pass: replace selected adjacent pairs by their sum and
           difference. */
        is = 1;
        id = 4;
        do {
          for(i2=is;i2<=n;i2+=id){
                i1 = i2+1;
                e = x[i2];
                x[i2] = e + x[i1];
                x[i1] = e - x[i1];
          }
          is = (id<<1) - 1;
          id <<= 2;
        } while(is<n);

        /* Normalise by the transform length. */
        e = 1/(FFTW_REAL)n;
        for(i=0;i<n;i++) z[i] *= e;
}

