I wanted to use NTT for fast squaring (see Fast bignum square computation), but the result is slow even for really big numbers .. more than 12000 bits.
So my question is:
- Is there a way to optimize my NTT transform? I did not mean to speed it by parallelism (threads); this is low-level layer only.
- Is there a way to speed up my modular arithmetics?
This is my (already optimized) source code in C++ for NTT (it's complete and 100% working in C++ whitout any need for third-party libs and should also be thread-safe. Beware the source array is used as a temporary!!!, Also it cannot transform the array to itself).
//---------------------------------------------------------------------------
class fourier_NTT // Number theoretic transform
{
public:
DWORD r,L,p,N;
DWORD W,iW,rN;
fourier_NTT(){ r=0; L=0; p=0; W=0; iW=0; rN=0; }
// main interface
void NTT(DWORD *dst,DWORD *src,DWORD n=0); // DWORD dst[n] = fast NTT(DWORD src[n])
void INTT(DWORD *dst,DWORD *src,DWORD n=0); // DWORD dst[n] = fast INTT(DWORD src[n])
// Helper functions
bool init(DWORD n); // init r,L,p,W,iW,rN
void NTT_fast(DWORD *dst,DWORD *src,DWORD n,DWORD w); // DWORD dst[n] = fast NTT(DWORD src[n])
// Only for testing
void NTT_slow(DWORD *dst,DWORD *src,DWORD n,DWORD w); // DWORD dst[n] = slow NTT(DWORD src[n])
void INTT_slow(DWORD *dst,DWORD *src,DWORD n,DWORD w); // DWORD dst[n] = slow INTT(DWORD src[n])
// DWORD arithmetics
DWORD shl(DWORD a);
DWORD shr(DWORD a);
// Modular arithmetics
DWORD mod(DWORD a);
DWORD modadd(DWORD a,DWORD b);
DWORD modsub(DWORD a,DWORD b);
DWORD modmul(DWORD a,DWORD b);
DWORD modpow(DWORD a,DWORD b);
};
//---------------------------------------------------------------------------
void fourier_NTT:: NTT(DWORD *dst,DWORD *src,DWORD n)
{
if (n>0) init(n);
NTT_fast(dst,src,N,W);
// NTT_slow(dst,src,N,W);
}
//---------------------------------------------------------------------------
void fourier_NTT::INTT(DWORD *dst,DWORD *src,DWORD n)
{
if (n>0) init(n);
NTT_fast(dst,src,N,iW);
for (DWORD i=0;i<N;i++) dst[i]=modmul(dst[i],rN);
// INTT_slow(dst,src,N,W);
}
//---------------------------------------------------------------------------
bool fourier_NTT::init(DWORD n)
{
// (max(src[])^2)*n < p else NTT overflow can ocur !!!
r=2; p=0xC0000001; if ((n<2)||(n>0x10000000)) { r=0; L=0; p=0; W=0; iW=0; rN=0; N=0; return false; } L=0x30000000/n; // 32:30 bit best for unsigned 32 bit
// r=2; p=0x78000001; if ((n<2)||(n>0x04000000)) { r=0; L=0; p=0; W=0; iW=0; rN=0; N=0; return false; } L=0x3c000000/n; // 31:27 bit best for signed 32 bit
// r=2; p=0x00010001; if ((n<2)||(n>0x00000020)) { r=0; L=0; p=0; W=0; iW=0; rN=0; N=0; return false; } L=0x00000020/n; // 17:16 bit best for 16 bit
// r=2; p=0x0a000001; if ((n<2)||(n>0x01000000)) { r=0; L=0; p=0; W=0; iW=0; rN=0; N=0; return false; } L=0x01000000/n; // 28:25 bit
N=n; // size of vectors [DWORDs]
W=modpow(r, L); // Wn for NTT
iW=modpow(r,p-1-L); // Wn for INTT
rN=modpow(n,p-2 ); // scale for INTT
return true;
}
//---------------------------------------------------------------------------
void fourier_NTT:: NTT_fast(DWORD *dst,DWORD *src,DWORD n,DWORD w)
{
if (n<=1) { if (n==1) dst[0]=src[0]; return; }
DWORD i,j,a0,a1,n2=n>>1,w2=modmul(w,w);
// reorder even,odd
for (i=0,j=0;i<n2;i++,j+=2) dst[i]=src[j];
for ( j=1;i<n ;i++,j+=2) dst[i]=src[j];
// recursion
NTT_fast(src ,dst ,n2,w2); // even
NTT_fast(src+n2,dst+n2,n2,w2); // odd
// restore results
for (w2=1,i=0,j=n2;i<n2;i++,j++,w2=modmul(w2,w))
{
a0=src[i];
a1=modmul(src[j],w2);
dst[i]=modadd(a0,a1);
dst[j]=modsub(a0,a1);
}
}
//---------------------------------------------------------------------------
void fourier_NTT:: NTT_slow(DWORD *dst,DWORD *src,DWORD n,DWORD w)
{
DWORD i,j,wj,wi,a,n2=n>>1;
for (wj=1,j=0;j<n;j++)
{
a=0;
for (wi=1,i=0;i<n;i++)
{
a=modadd(a,modmul(wi,src[i]));
wi=modmul(wi,wj);
}
dst[j]=a;
wj=modmul(wj,w);
}
}
//---------------------------------------------------------------------------
void fourier_NTT::INTT_slow(DWORD *dst,DWORD *src,DWORD n,DWORD w)
{
DWORD i,j,wi=1,wj=1,a,n2=n>>1;
for (wj=1,j=0;j<n;j++)
{
a=0;
for (wi=1,i=0;i<n;i++)
{
a=modadd(a,modmul(wi,src[i]));
wi=modmul(wi,wj);
}
dst[j]=modmul(a,rN);
wj=modmul(wj,iW);
}
}
//---------------------------------------------------------------------------
DWORD fourier_NTT::shl(DWORD a) { return (a<<1)&0xFFFFFFFE; }
DWORD fourier_NTT::shr(DWORD a) { return (a>>1)&0x7FFFFFFF; }
//---------------------------------------------------------------------------
DWORD fourier_NTT::mod(DWORD a)
{
DWORD bb;
for (bb=p;(DWORD(a)>DWORD(bb))&&(!DWORD(bb&0x80000000));bb=shl(bb));
for (;;)
{
if (DWORD(a)>=DWORD(bb)) a-=bb;
if (bb==p) break;
bb =shr(bb);
}
return a;
}
//---------------------------------------------------------------------------
DWORD fourier_NTT::modadd(DWORD a,DWORD b)
{
DWORD d,cy;
a=mod(a);
b=mod(b);
d=a+b;
cy=(shr(a)+shr(b)+shr((a&1)+(b&1)))&0x80000000;
if (cy) d-=p;
if (DWORD(d)>=DWORD(p)) d-=p;
return d;
}
//---------------------------------------------------------------------------
DWORD fourier_NTT::modsub(DWORD a,DWORD b)
{
DWORD d;
a=mod(a);
b=mod(b);
d=a-b; if (DWORD(a)<DWORD(b)) d+=p;
if (DWORD(d)>=DWORD(p)) d-=p;
return d;
}
//---------------------------------------------------------------------------
DWORD fourier_NTT::modmul(DWORD a,DWORD b)
{ // b bez orezania !
int i;
DWORD d;
a=mod(a);
for (d=0,i=0;i<32;i++)
{
if (DWORD(a&1)) d=modadd(d,b);
a=shr(a);
b=modadd(b,b);
}
return d;
}
//---------------------------------------------------------------------------
DWORD fourier_NTT::modpow(DWORD a,DWORD b)
{ // a,b bez orezania !
int i;
DWORD d=1;
for (i=0;i<32;i++)
{
d=modmul(d,d);
if (DWORD(b&0x80000000)) d=modmul(d,a);
b=shl(b);
}
return d;
}
//---------------------------------------------------------------------------
Example of usage of my NTT class:
fourier_NTT ntt;
const DWORD n=32
DWORD x[N]={0,1,2,3,....31},y[N]={32,33,34,35,...63},z[N];
ntt.NTT(z,x,N); // z[N]=NTT(x[N]), also init constants for N
ntt.NTT(x,y); // x[N]=NTT(y[N]), no recompute of constants, use last N
// modular convolution y[]=z[].x[]
for (i=0;i<n;i++) y[i]=ntt.modmul(z[i],x[i]);
ntt.INTT(x,y); // x[N]=INTT(y[N]), no recompute of constants, use last N
// x[]=convolution of original x[].y[]
Some measurements before optimizations (non Class NTT):
a = 0.98765588997654321000 | 389*32 bits
looped 1x times
sqr1[ 3.177 ms ] fast sqr
sqr2[ 720.419 ms ] NTT sqr
mul1[ 5.588 ms ] simpe mul
mul2[ 3.172 ms ] karatsuba mul
mul3[ 1053.382 ms ] NTT mul
Some measurements after my optimizations (current code, lower recursion parameter size/count, and better modular arithmetics):
a = 0.98765588997654321000 | 389*32 bits
looped 1x times
sqr1[ 3.214 ms ] fast sqr
sqr2[ 208.298 ms ] NTT sqr
mul1[ 5.564 ms ] simpe mul
mul2[ 3.113 ms ] karatsuba mul
mul3[ 302.740 ms ] NTT mul
Check the NTT mul and NTT sqr times (my optimizations speed it up little over 3x times). It's only 1x times loop so it's not very precise (error ~ 10%), but the speedup is noticeable even now (normally I loop it 1000x and more, but my NTT is too slow for that).
You can use my code freely... Just keep my nick and/or link to this page somewhere (rem in code, readme.txt, about or whatever). I hope it helps... (I did not see C++ source for fast NTTs anywhere so I had to write it by myself). Roots of unity were tested for all accepted N, see the fourier_NTT::init(DWORD n)
function.
P.S.: For more information about NTT, see Translation from Complex-FFT to Finite-Field-FFT. This code is based on my posts inside that link.
[edit1:] Further changes in the code
I managed to further optimize my modular arithmetics, by exploiting that modulo prime is allways 0xC0000001 and eliminating unnecessary calls. The resulting speedup is stunning (more than 40x times) now and NTT multiplication is faster than karatsuba after about the 1500 * 32 bits threshold. BTW, the speed of my NTT is now the same as my optimized DFFT on 64-bit doubles.
Some measurements:
a = 0.98765588997654321000 | 1553*32bits
looped 10x times
mul2[ 28.585 ms ] karatsuba mul
mul3[ 26.311 ms ] NTT mul
New source code for modular arithmetics:
//---------------------------------------------------------------------------
DWORD fourier_NTT::mod(DWORD a)
{
if (a>p) a-=p;
return a;
}
//---------------------------------------------------------------------------
DWORD fourier_NTT::modadd(DWORD a,DWORD b)
{
DWORD d,cy;
if (a>p) a-=p;
if (b>p) b-=p;
d=a+b;
cy=((a>>1)+(b>>1)+(((a&1)+(b&1))>>1))&0x80000000;
if (cy ) d-=p;
if (d>p) d-=p;
return d;
}
//---------------------------------------------------------------------------
DWORD fourier_NTT::modsub(DWORD a,DWORD b)
{
DWORD d;
if (a>p) a-=p;
if (b>p) b-=p;
d=a-b;
if (a<b) d+=p;
if (d>p) d-=p;
return d;
}
//---------------------------------------------------------------------------
DWORD fourier_NTT::modmul(DWORD a,DWORD b)
{
DWORD _a,_b,_p;
_a=a;
_b=b;
_p=p;
asm {
mov eax,_a
mov ebx,_b
mul ebx // H(edx),L(eax) = eax * ebx
mov ebx,_p
div ebx // eax = H(edx),L(eax) / ebx
mov _a,edx // edx = H(edx),L(eax) % ebx
}
return _a;
}
//---------------------------------------------------------------------------
DWORD fourier_NTT::modpow(DWORD a,DWORD b)
{ // b bez orezania!
int i;
DWORD d=1;
if (a>p) a-=p;
for (i=0;i<32;i++)
{
d=modmul(d,d);
if (DWORD(b&0x80000000)) d=modmul(d,a);
b<<=1;
}
return d;
}
//---------------------------------------------------------------------------
As you can see, functions shl
and shr
are no more used. I think that modpow can be further optimized, but it's not a critical function because it is called only very few times. The most critical function is modmul, and that seems to be in the best shape possible.
Further questions:
- Is there any other option to speedup NTT?
- Are my optimizations of modular arithmetics safe? (Results seem to be the same, but I could miss something.)
[edit2] New optimizations
a = 0.99991970486 | 2000*32 bits
looped 10x
sqr1[ 13.908 ms ] fast sqr
sqr2[ 13.649 ms ] NTT sqr
mul1[ 19.726 ms ] simpe mul
mul2[ 31.808 ms ] karatsuba mul
mul3[ 19.373 ms ] NTT mul
I implemented all the usable stuff from all of your comments (thanks for the insight).
Speedups:
- +2.5% by removing unnecessary safety mods (Mandalf The Beige)
- +34.9% by use of precomputed W,iW powers (Mysticial)
- +35% total
Actual full source code:
//---------------------------------------------------------------------------
//--- Number theoretic transforms: 2.03 -------------------------------------
//---------------------------------------------------------------------------
#ifndef _fourier_NTT_h
#define _fourier_NTT_h
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
class fourier_NTT // Number theoretic transform
{
public:
DWORD r,L,p,N;
DWORD W,iW,rN; // W=(r^L) mod p, iW=inverse W, rN = inverse N
DWORD *WW,*iWW,NN; // Precomputed (W,iW)^(0,..,NN-1) powers
// Internals
fourier_NTT(){ r=0; L=0; p=0; W=0; iW=0; rN=0; WW=NULL; iWW=NULL; NN=0; }
~fourier_NTT(){ _free(); }
void _free(); // Free precomputed W,iW powers tables
void _alloc(DWORD n); // Allocate and precompute W,iW powers tables
// Main interface
void NTT(DWORD *dst,DWORD *src,DWORD n=0); // DWORD dst[n] = fast NTT(DWORD src[n])
void iNTT(DWORD *dst,DWORD *src,DWORD n=0); // DWORD dst[n] = fast INTT(DWORD src[n])
// Helper functions
bool init(DWORD n); // init r,L,p,W,iW,rN
void NTT_fast(DWORD *dst,DWORD *src,DWORD n,DWORD w); // DWORD dst[n] = fast NTT(DWORD src[n])
void NTT_fast(DWORD *dst,DWORD *src,DWORD n,DWORD *w2,DWORD i2);
// Only for testing
void NTT_slow(DWORD *dst,DWORD *src,DWORD n,DWORD w); // DWORD dst[n] = slow NTT(DWORD src[n])
void iNTT_slow(DWORD *dst,DWORD *src,DWORD n,DWORD w); // DWORD dst[n] = slow INTT(DWORD src[n])
// Modular arithmetics (optimized, but it works only for p >= 0x80000000!!!)
DWORD mod(DWORD a);
DWORD modadd(DWORD a,DWORD b);
DWORD modsub(DWORD a,DWORD b);
DWORD modmul(DWORD a,DWORD b);
DWORD modpow(DWORD a,DWORD b);
};
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
void fourier_NTT::_free()
{
NN=0;
if ( WW) delete[] WW; WW=NULL;
if (iWW) delete[] iWW; iWW=NULL;
}
//---------------------------------------------------------------------------
void fourier_NTT::_alloc(DWORD n)
{
if (n<=NN) return;
DWORD *tmp,i,w;
tmp=new DWORD[n]; if ((NN)&&( WW)) for (i=0;i<NN;i++) tmp[i]= WW[i]; if ( WW) delete[] WW; WW=tmp; WW[0]=1; for (i=NN?NN:1,w= WW[i-1];i<n;i++){ w=modmul(w, W); WW[i]=w; }
tmp=new DWORD[n]; if ((NN)&&(iWW)) for (i=0;i<NN;i++) tmp[i]=iWW[i]; if (iWW) delete[] iWW; iWW=tmp; iWW[0]=1; for (i=NN?NN:1,w=iWW[i-1];i<n;i++){ w=modmul(w,iW); iWW[i]=w; }
NN=n;
}
//---------------------------------------------------------------------------
void fourier_NTT:: NTT(DWORD *dst,DWORD *src,DWORD n)
{
if (n>0) init(n);
NTT_fast(dst,src,N,WW,1);
// NTT_fast(dst,src,N,W);
// NTT_slow(dst,src,N,W);
}
//---------------------------------------------------------------------------
void fourier_NTT::iNTT(DWORD *dst,DWORD *src,DWORD n)
{
if (n>0) init(n);
NTT_fast(dst,src,N,iWW,1);
// NTT_fast(dst,src,N,iW);
for (DWORD i=0;i<N;i++) dst[i]=modmul(dst[i],rN);
// iNTT_slow(dst,src,N,W);
}
//---------------------------------------------------------------------------
bool fourier_NTT::init(DWORD n)
{
// (max(src[])^2)*n < p else NTT overflow can ocur!!!
r=2; p=0xC0000001; if ((n<2)||(n>0x10000000)) { r=0; L=0; p=0; W=0; iW=0; rN=0; N=0; return false; } L=0x30000000/n; // 32:30 bit best for unsigned 32 bit
// r=2; p=0x78000001; if ((n<2)||(n>0x04000000)) { r=0; L=0; p=0; W=0; iW=0; rN=0; N=0; return false; } L=0x3c000000/n; // 31:27 bit best for signed 32 bit
// r=2; p=0x00010001; if ((n<2)||(n>0x00000020)) { r=0; L=0; p=0; W=0; iW=0; rN=0; N=0; return false; } L=0x00000020/n; // 17:16 bit best for 16 bit
// r=2; p=0x0a000001; if ((n<2)||(n>0x01000000)) { r=0; L=0; p=0; W=0; iW=0; rN=0; N=0; return false; } L=0x01000000/n; // 28:25 bit
N=n; // Size of vectors [DWORDs]
W=modpow(r, L); // Wn for NTT
iW=modpow(r,p-1-L); // Wn for INTT
rN=modpow(n,p-2 ); // Scale for INTT
_alloc(n>>1); // Precompute W,iW powers
return true;
}
//---------------------------------------------------------------------------
void fourier_NTT:: NTT_fast(DWORD *dst,DWORD *src,DWORD n,DWORD w)
{
if (n<=1) { if (n==1) dst[0]=src[0]; return; }
DWORD i,j,a0,a1,n2=n>>1,w2=modmul(w,w);
// Reorder even,odd
for (i=0,j=0;i<n2;i++,j+=2) dst[i]=src[j];
for ( j=1;i<n ;i++,j+=2) dst[i]=src[j];
// Recursion
NTT_fast(src ,dst ,n2,w2); // Even
NTT_fast(src+n2,dst+n2,n2,w2); // Odd
// Restore results
for (w2=1,i=0,j=n2;i<n2;i++,j++,w2=modmul(w2,w))
{
a0=src[i];
a1=modmul(src[j],w2);
dst[i]=modadd(a0,a1);
dst[j]=modsub(a0,a1);
}
}
//---------------------------------------------------------------------------
void fourier_NTT:: NTT_fast(DWORD *dst,DWORD *src,DWORD n,DWORD *w2,DWORD i2)
{
if (n<=1) { if (n==1) dst[0]=src[0]; return; }
DWORD i,j,a0,a1,n2=n>>1;
// Reorder even,odd
for (i=0,j=0;i<n2;i++,j+=2) dst[i]=src[j];
for ( j=1;i<n ;i++,j+=2) dst[i]=src[j];
// Recursion
i=i2<<1;
NTT_fast(src ,dst ,n2,w2,i); // Even
NTT_fast(src+n2,dst+n2,n2,w2,i); // Odd
// Restore results
for (i=0,j=n2;i<n2;i++,j++,w2+=i2)
{
a0=src[i];
a1=modmul(src[j],*w2);
dst[i]=modadd(a0,a1);
dst[j]=modsub(a0,a1);
}
}
//---------------------------------------------------------------------------
void fourier_NTT:: NTT_slow(DWORD *dst,DWORD *src,DWORD n,DWORD w)
{
DWORD i,j,wj,wi,a;
for (wj=1,j=0;j<n;j++)
{
a=0;
for (wi=1,i=0;i<n;i++)
{
a=modadd(a,modmul(wi,src[i]));
wi=modmul(wi,wj);
}
dst[j]=a;
wj=modmul(wj,w);
}
}
//---------------------------------------------------------------------------
void fourier_NTT::iNTT_slow(DWORD *dst,DWORD *src,DWORD n,DWORD w)
{
DWORD i,j,wi=1,wj=1,a;
for (wj=1,j=0;j<n;j++)
{
a=0;
for (wi=1,i=0;i<n;i++)
{
a=modadd(a,modmul(wi,src[i]));
wi=modmul(wi,wj);
}
dst[j]=modmul(a,rN);
wj=modmul(wj,iW);
}
}
//---------------------------------------------------------------------------
DWORD fourier_NTT::mod(DWORD a)
{
if (a>p) a-=p;
return a;
}
//---------------------------------------------------------------------------
DWORD fourier_NTT::modadd(DWORD a,DWORD b)
{
DWORD d,cy;
//if (a>p) a-=p;
//if (b>p) b-=p;
d=a+b;
cy=((a>>1)+(b>>1)+(((a&1)+(b&1))>>1))&0x80000000;
if (cy ) d-=p;
if (d>p) d-=p;
return d;
}
//---------------------------------------------------------------------------
DWORD fourier_NTT::modsub(DWORD a,DWORD b)
{
DWORD d;
//if (a>p) a-=p;
//if (b>p) b-=p;
d=a-b;
if (a<b) d+=p;
if (d>p) d-=p;
return d;
}
//---------------------------------------------------------------------------
DWORD fourier_NTT::modmul(DWORD a,DWORD b)
{
DWORD _a,_b,_p;
_a=a;
_b=b;
_p=p;
asm {
mov eax,_a
mov ebx,_b
mul ebx // H(edx),L(eax) = eax * ebx
mov ebx,_p
div ebx // eax = H(edx),L(eax) / ebx
mov _a,edx // edx = H(edx),L(eax) % ebx
}
return _a;
}
//---------------------------------------------------------------------------
DWORD fourier_NTT::modpow(DWORD a,DWORD b)
{ // b is not mod(p)!
int i;
DWORD d=1;
//if (a>p) a-=p;
for (i=0;i<32;i++)
{
d=modmul(d,d);
if (DWORD(b&0x80000000)) d=modmul(d,a);
b<<=1;
}
return d;
}
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
#endif
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
There is still the possibility to use less heap trashing by separating NTT_fast
to two functions. One with WW[]
and the other with iWW[]
which leads to one parameter less in recursion calls. But I do not expect much from it (32-bit pointer only) and rather have one function for better code management in the future. Many functions are dormant now (for testing) Like slow variants, mod
and the older fast function (with w
parameter instead of *w2,i2
).
To avoid overflows for big datasets, limit input numbers to p/4
bits. Where p
is number of bits per NTT element so for this 32 bit version use max (32 bit/4 -> 8 bit)
input values.
[edit3] Simple string bigint
multiplication for testing
//---------------------------------------------------------------------------
char* mul_NTT(const char *sx,const char *sy)
{
char *s;
int i,j,k,n;
// n = min power of 2 <= 2 max length(x,y)
for (i=0;sx[i];i++); for (n=1;n<i;n<<=1); i--;
for (j=0;sx[j];j++); for (n=1;n<j;n<<=1); n<<=1; j--;
DWORD *x,*y,*xx,*yy,a;
x=new DWORD[n]; xx=new DWORD[n];
y=new DWORD[n]; yy=new DWORD[n];
// Zero padding
for (k=0;i>=0;i--,k++) x[k]=sx[i]-'0'; for (;k<n;k++) x[k]=0;
for (k=0;j>=0;j--,k++) y[k]=sy[j]-'0'; for (;k<n;k++) y[k]=0;
//NTT
fourier_NTT ntt;
ntt.NTT(xx,x,n);
ntt.NTT(yy,y);
// Convolution
for (i=0;i<n;i++) xx[i]=ntt.modmul(xx[i],yy[i]);
//INTT
ntt.iNTT(yy,xx);
//suma
a=0; s=new char[n+1]; for (i=0;i<n;i++) { a+=yy[i]; s[n-i-1]=(a%10)+'0'; a/=10; } s[n]=0;
delete[] x; delete[] xx;
delete[] y; delete[] yy;
return s;
}
//---------------------------------------------------------------------------
I use AnsiString
's, so I port it to char*
hopefully, I did not do some mistake. It looks like it works properly (in comparison to the AnsiString
version).
sx,sy
are decadic integer numbers- Returns allocated string
(char*)=sx*sy
This is only ~4 bit per 32 bit data word so there is no risk of overflow, but it is slower of course. In my bignum
lib I use a binary representation and use 8 bit
chunks per 32-bit WORD for NTT. More than that is risky if N
is big ...
Have fun with this
First off, thank you very much for posting and making it free to use. I really appreciate that.
I was able to use some bit tricks to eliminate some branching, rearranged the main loop, and modified the assembly, and was able to get a 1.35x speedup.
Also, I added a preprocessor condition for 64 bit, seeing as Visual Studio doesn't allow inline assembly in 64 bit mode (thank you Microsoft; feel free to go screw yourself).
Something strange happened when I was optimizing the modsub() function. I rewrote it using bit hacks like I did modadd (which was faster). But for some reason, the bit wise version of modsub was slower. Not sure why. Might just be my computer.
New modmul
[EDIT] Spektre, based on your feedback I changed the modadd & modsub back to their original. I also realized I made some changes to the recursive NTT function I shouldn't have.
[EDIT2] Removed unneeded if statements and bitwise functions.
[EDIT3] Added new modmul inline assembly.