faster code for exact Euler numbers

This commit is contained in:
fredrik 2021-11-21 20:51:27 +01:00
parent 74fe9b32be
commit 7db71cec11
5 changed files with 777 additions and 17 deletions

2
arb.h
View file

@ -607,6 +607,8 @@ void arb_bell_ui(arb_t res, ulong n, slong prec);
void arb_euler_number_fmpz(arb_t res, const fmpz_t n, slong prec);
void arb_euler_number_ui(arb_t res, ulong n, slong prec);
void arb_fmpz_euler_number_ui_multi_mod(fmpz_t res, ulong n, double alpha);
void arb_fmpz_euler_number_ui(fmpz_t res, ulong n);
void arb_partitions_fmpz(arb_t res, const fmpz_t n, slong prec);
void arb_partitions_ui(arb_t res, ulong n, slong prec);

View file

@ -1,5 +1,5 @@
/*
Copyright (C) 2016 Fredrik Johansson
Copyright (C) 2016, 2021 Fredrik Johansson
This file is part of Arb.
@ -37,7 +37,7 @@ arb_euler_number_mag(double n)
return x;
}
static void
void
arb_euler_number_ui_beta(arb_t res, ulong n, slong prec)
{
slong pi_prec;
@ -65,12 +65,621 @@ arb_euler_number_ui_beta(arb_t res, ulong n, slong prec)
arb_clear(t);
}
#define LOW_MASK ((UWORD(1) << (FLINT_BITS / 2)) - 1)
typedef struct
{
ulong n;
ulong ninv;
ulong F;
}
nmod_redc_t;
static void
nmod_redc_init(nmod_redc_t * rmod, nmod_t mod)
{
ulong n, ninv2;
int bits;
n = mod.n;
rmod->n = n;
rmod->F = (UWORD(1) << (FLINT_BITS / 2));
NMOD_RED(rmod->F, rmod->F, mod);
/* Newton's method for 2-adic inversion */
ninv2 = -n; /* already correct mod 8 */
for (bits = 3; bits < FLINT_BITS / 2; bits *= 2)
ninv2 = 2*ninv2 + n * ninv2 * ninv2;
rmod->ninv = ninv2 & LOW_MASK;
}
static __inline__ ulong
nmod_redc_fast(ulong x, ulong n, ulong ninv2)
{
ulong y = (x * ninv2) & LOW_MASK;
ulong z = x + (n * y);
return z >> (FLINT_BITS / 2);
}
static __inline__ ulong
nmod_redc(ulong x, ulong n, ulong ninv2)
{
ulong y = nmod_redc_fast(x, n, ninv2);
if (y >= n)
y -= n;
return y;
}
static ulong
nmod_redc_mul_fast(ulong a, ulong b, nmod_redc_t rmod)
{
return nmod_redc_fast(a * b, rmod.n, rmod.ninv);
}
static ulong
nmod_redc_mul(ulong a, ulong b, nmod_redc_t rmod)
{
return nmod_redc(a * b, rmod.n, rmod.ninv);
}
static ulong
nmod_to_redc(ulong x, nmod_t mod, nmod_redc_t rmod)
{
return nmod_mul(x, rmod.F, mod);
}
static ulong
nmod_from_redc(ulong x, nmod_redc_t rmod)
{
return nmod_redc(x, rmod.n, rmod.ninv);
}
ulong
nmod_redc_pow_ui(ulong a, ulong exp, nmod_redc_t rmod)
{
ulong x;
while ((exp & 1) == 0)
{
a = nmod_redc_mul(a, a, rmod);
exp >>= 1;
}
x = a;
while (exp >>= 1)
{
a = nmod_redc_mul(a, a, rmod);
if (exp & 1)
x = nmod_redc_mul(x, a, rmod);
}
return x;
}
ulong
euler_mod_p_powsum_1(ulong n, ulong p)
{
slong j;
ulong s, t;
nmod_t mod;
if (n % 2 == 1)
return 0;
n = n % (p - 1);
if (n == 0)
return UWORD_MAX;
nmod_init(&mod, p);
s = 1;
for (j = 3; j <= p - 2; j += 2)
{
t = nmod_pow_ui(j, n, mod);
s = nmod_sub(t, s, mod);
}
if (p % 4 == 1)
s = nmod_neg(s, mod);
s = nmod_add(s, s, mod);
return s;
}
ulong
euler_mod_p_powsum_noredc(ulong n, ulong p, const unsigned int * divtab)
{
unsigned int * pows;
slong i, N, horner_point;
ulong s, t, z;
ulong v2n, power_of_two;
nmod_t mod;
TMP_INIT;
if (n % 2 == 1)
return 0;
n = n % (p - 1);
if (n == 0)
return UWORD_MAX;
N = p / 4;
nmod_init(&mod, p);
TMP_START;
pows = TMP_ALLOC(sizeof(unsigned int) * (N / 3 + 1));
s = z = 0;
/* Evaluate as a polynomial in 2^n */
power_of_two = 1;
while (power_of_two * 2 <= N)
power_of_two *= 2;
horner_point = 1;
v2n = nmod_pow_ui(2, n, mod);
for (i = 1; i <= N / 3; i += 2)
{
if (divtab[i] == 1)
t = nmod_pow_ui(i, n, mod);
else
t = nmod_mul(pows[divtab[i]], pows[divtab[i + 1]], mod);
pows[i] = t;
s = nmod_add(s, t, mod);
if (i == horner_point)
{
while (i == horner_point && power_of_two != 1)
{
z = nmod_add(s, nmod_mul(v2n, z, mod), mod);
power_of_two /= 2;
horner_point = N / power_of_two;
if (horner_point % 2 == 0)
horner_point--;
}
}
}
/* Same as above, but here we don't need to write the powers. */
for ( ; i <= N; i += 2)
{
if (divtab[i] == 1)
t = nmod_pow_ui(i, n, mod);
else
t = nmod_mul(pows[divtab[i]], pows[divtab[i + 1]], mod);
s = nmod_add(s, t, mod);
if (i == horner_point)
{
while (i == horner_point && power_of_two != 1)
{
z = nmod_add(s, nmod_mul(v2n, z, mod), mod);
power_of_two /= 2;
horner_point = N / power_of_two;
if (horner_point % 2 == 0)
horner_point--;
}
}
}
s = nmod_add(s, nmod_mul(v2n, z, mod), mod);
if (p % 4 == 3)
s = nmod_neg(s, mod);
t = nmod_inv(nmod_pow_ui(4, p - n - 2, mod), mod);
s = nmod_mul(s, t, mod);
TMP_END;
return s;
}
ulong
euler_mod_p_powsum_redc(ulong n, ulong p, const unsigned int * divtab)
{
unsigned int * pows;
slong i, N, horner_point;
ulong s, t, z;
ulong v2n, power_of_two;
nmod_t mod;
nmod_redc_t rmod;
TMP_INIT;
if (n % 2 == 1)
return 0;
n = n % (p - 1);
if (n == 0)
return UWORD_MAX;
N = p / 4;
nmod_init(&mod, p);
nmod_redc_init(&rmod, mod);
TMP_START;
pows = TMP_ALLOC(sizeof(unsigned int) * (N / 3 + 1));
s = z = 0;
/* Evaluate as a polynomial in 2^n */
power_of_two = 1;
while (power_of_two * 2 <= N)
power_of_two *= 2;
horner_point = 1;
v2n = nmod_redc_pow_ui(nmod_to_redc(2, mod, rmod), n, rmod);
for (i = 1; i <= N / 3; i += 2)
{
if (divtab[i] == 1)
t = nmod_redc_pow_ui(nmod_to_redc(i, mod, rmod), n, rmod);
else
t = nmod_redc_mul(pows[divtab[i]], pows[divtab[i + 1]], rmod);
pows[i] = t;
s += t;
if (i == horner_point)
{
while (i == horner_point && power_of_two != 1)
{
NMOD_RED(s, s, mod);
z = nmod_add(s, nmod_redc_mul(v2n, z, rmod), mod);
power_of_two /= 2;
horner_point = N / power_of_two;
if (horner_point % 2 == 0)
horner_point--;
}
}
}
/* Same as above, but here we don't need to write the powers. */
for ( ; i <= N; i += 2)
{
if (divtab[i] == 1)
t = nmod_redc_pow_ui(nmod_to_redc(i, mod, rmod), n, rmod);
else
t = nmod_redc_mul_fast(pows[divtab[i]], pows[divtab[i + 1]], rmod);
s += t;
if (i == horner_point)
{
while (i == horner_point && power_of_two != 1)
{
NMOD_RED(s, s, mod);
z = nmod_add(s, nmod_redc_mul(v2n, z, rmod), mod);
power_of_two /= 2;
horner_point = N / power_of_two;
if (horner_point % 2 == 0)
horner_point--;
}
}
}
NMOD_RED(s, s, mod);
s = nmod_add(s, nmod_redc_mul(v2n, z, rmod), mod);
s = nmod_from_redc(s, rmod);
if (p % 4 == 3)
s = nmod_neg(s, mod);
t = nmod_inv(nmod_pow_ui(4, p - n - 2, mod), mod);
s = nmod_mul(s, t, mod);
TMP_END;
return s;
}
ulong
euler_mod_p_powsum(ulong n, ulong p, const unsigned int * divtab)
{
if (p < (UWORD(1) << (FLINT_BITS / 2 - 1)))
return euler_mod_p_powsum_redc(n, p, divtab);
else
return euler_mod_p_powsum_noredc(n, p, divtab);
}
static void
divisor_table_odd(unsigned int * tab, slong len)
{
slong i, j;
tab[0] = 0;
for (i = 1; i < len; i += 2)
{
tab[i] = 1;
tab[i + 1] = i;
}
for (i = 3; i < len; i += 2)
{
for (j = 3; j <= i && i * j < len; j += 2)
{
tab[i * j] = j;
tab[i * j + 1] = i;
}
}
}
#define TIMING 0
#define DEBUG 0
static void
_fmpz_crt_combine(fmpz_t r1r2, fmpz_t m1m2, const fmpz_t r1, const fmpz_t m1, const fmpz_t r2, const fmpz_t m2)
{
fmpz_invmod(m1m2, m1, m2);
fmpz_mul(m1m2, m1m2, m1);
fmpz_sub(r1r2, r2, r1);
fmpz_mul(r1r2, r1r2, m1m2);
fmpz_add(r1r2, r1r2, r1);
fmpz_mul(m1m2, m1, m2);
fmpz_mod(r1r2, r1r2, m1m2);
}
static void
tree_crt(fmpz_t r, fmpz_t m, mp_srcptr residues, mp_srcptr primes, slong len)
{
if (len == 0)
{
fmpz_zero(r);
fmpz_one(m);
}
else if (len == 1)
{
fmpz_set_ui(r, residues[0]);
fmpz_set_ui(m, primes[0]);
}
else
{
fmpz_t r1, m1, r2, m2;
fmpz_init(r1);
fmpz_init(m1);
fmpz_init(r2);
fmpz_init(m2);
tree_crt(r1, m1, residues, primes, len / 2);
tree_crt(r2, m2, residues + len / 2, primes + len / 2, len - len / 2);
_fmpz_crt_combine(r, m, r1, m1, r2, m2);
fmpz_clear(r1);
fmpz_clear(m1);
fmpz_clear(r2);
fmpz_clear(m2);
}
}
void
arb_fmpz_euler_number_ui_multi_mod(fmpz_t num, ulong n, double alpha)
{
slong i, bits, mod_bits, zeta_bits, num_primes;
ulong p;
mp_ptr primes, residues;
mag_t primes_product;
unsigned int * divtab_odd;
fmpz_t M;
#if TIMING
double t1, t2;
#endif
if (n % 2 != 0)
{
fmpz_zero(num);
return;
}
if (n < ARB_EULER_NUMBER_TAB_SIZE)
{
if (n % 4 == 0)
fmpz_set_ui(num, arb_euler_number_tab[n / 2]);
else
fmpz_neg_ui(num, arb_euler_number_tab[n / 2]);
return;
}
if (alpha < 0)
{
if (n < 2000)
alpha = 0.0;
else if (n < 6000)
alpha = 0.002 + 1.0e-5 * (n - 2000);
else if (n < 90000)
alpha = -0.132 + 0.02 * log(n);
else
alpha = FLINT_MIN(0.0085 * log(n), 0.11);
}
#if TIMING
t1 = clock();
#endif
bits = arb_euler_number_mag(n) + 2;
mod_bits = bits * alpha;
zeta_bits = bits - mod_bits;
num_primes = 0;
mag_init(primes_product);
mag_one(primes_product);
for (p = 5; mag_cmp_2exp_si(primes_product, mod_bits) < 0; p = n_nextprime(p, 1))
{
if (n % (p - 1) != 0)
{
mag_mul_ui_lower(primes_product, primes_product, p);
num_primes++;
}
}
#if DEBUG
printf("\nn = %lu, bits = %lu, num_primes = %ld\n", n, bits, num_primes);
#endif
primes = flint_malloc(sizeof(mp_limb_t) * num_primes);
residues = flint_malloc(sizeof(mp_limb_t) * num_primes);
for (p = 5, i = 0; i < num_primes; p = n_nextprime(p, 1))
{
if (n % (p - 1) != 0)
{
primes[i] = p;
i++;
}
}
if (num_primes == 0)
{
divtab_odd = NULL;
}
else
{
p = primes[num_primes - 1];
divtab_odd = flint_malloc(sizeof(unsigned int) * (p / 4 + 2));
divisor_table_odd(divtab_odd, p / 4 + 1);
}
#if TIMING
t2 = clock();
printf("init time = %f\n", (t2 - t1) / (double) CLOCKS_PER_SEC);
printf("num_primes = %ld\n", num_primes);
#endif
for (i = 0; i < num_primes; i++)
{
#if TIMING
if (i % 10000 == 0)
printf("%ld / %ld\n", i, num_primes);
#endif
residues[i] = euler_mod_p_powsum(n, primes[i], divtab_odd);
}
#if TIMING
t2 = clock();
printf("mod time = %f\n", (t2 - t1) / (double) CLOCKS_PER_SEC);
printf("start CRT\n");
t1 = clock();
#endif
fmpz_init(M);
tree_crt(num, M, residues, primes, num_primes);
fmpz_mod(num, num, M);
if (n % 4 == 2)
{
fmpz_sub(num, M, num);
fmpz_neg(num, num);
}
#if TIMING
printf("end CRT\n");
t2 = clock();
printf("CRT time = %f\n", (t2 - t1) / (double) CLOCKS_PER_SEC);
t1 = clock();
#endif
if (zeta_bits > 0)
{
slong prec;
arb_t b;
fmpz_t t;
arb_init(b);
fmpz_init(t);
for (prec = zeta_bits + 10; ; prec += 32)
{
arb_euler_number_ui_beta(b, n, prec);
arb_sub_fmpz(b, b, num, prec);
arb_div_fmpz(b, b, M, prec);
if (arb_get_unique_fmpz(t, b))
{
fmpz_addmul(num, t, M);
break;
}
flint_printf("euler: n = %wu, bits = %wd, mod = %wd, zeta = %wd: get_unique_fmpz failed!\n", n, bits, mod_bits, zeta_bits);
}
arb_clear(b);
fmpz_clear(t);
}
#if TIMING
printf("end zeta\n");
t2 = clock();
printf("zeta time = %f\n", (t2 - t1) / (double) CLOCKS_PER_SEC);
#endif
flint_free(primes);
flint_free(residues);
flint_free(divtab_odd);
fmpz_clear(M);
mag_clear(primes_product);
}
void
arb_fmpz_euler_number_ui(fmpz_t res, ulong n)
{
arb_t x;
double mag;
mag = arb_euler_number_mag(n);
if (n % 2 != 0)
{
fmpz_zero(res);
return;
}
if (n < ARB_EULER_NUMBER_TAB_SIZE)
{
if (n % 4 == 0)
fmpz_set_ui(res, arb_euler_number_tab[n / 2]);
else
fmpz_neg_ui(res, arb_euler_number_tab[n / 2]);
return;
}
if (n < 2000)
{
arb_init(x);
arb_euler_number_ui_beta(x, n, mag + 5);
if (!arb_get_unique_fmpz(res, x))
{
flint_printf("arb_fmpz_euler_number_ui: unexpected inaccuracy\n");
flint_abort();
}
arb_clear(x);
}
else
{
arb_fmpz_euler_number_ui_multi_mod(res, n, -1.0);
}
}
void
arb_euler_number_ui(arb_t res, ulong n, slong prec)
{
double mag;
if (n % 2)
if (n % 2 != 0)
{
arb_zero(res);
return;
@ -91,9 +700,8 @@ arb_euler_number_ui(arb_t res, ulong n, slong prec)
{
fmpz_t t;
fmpz_init(t);
arb_euler_number_ui_beta(res, n, mag + 5);
if (arb_get_unique_fmpz(t, res))
arb_set_round_fmpz(res, t, prec);
arb_fmpz_euler_number_ui(t, n);
arb_set_round_fmpz(res, t, prec);
fmpz_clear(t);
}
else
@ -102,4 +710,3 @@ arb_euler_number_ui(arb_t res, ulong n, slong prec)
arb_set_round(res, res, prec);
}
}

View file

@ -70,7 +70,7 @@ int main()
arb_euler_number_fmpz(b2, n, prec2);
}
if (fmpz_cmp_ui(n, 100) < 0)
if (fmpz_cmp_ui(n, 300) < 0)
{
arith_euler_number(bv, fmpz_get_ui(n));
}
@ -110,4 +110,3 @@ int main()
flint_printf("PASS\n");
return EXIT_SUCCESS;
}

View file

@ -0,0 +1,148 @@
/*
Copyright (C) 2021 Fredrik Johansson
This file is part of Arb.
Arb is free software: you can redistribute it and/or modify it under
the terms of the GNU Lesser General Public License (LGPL) as published
by the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version. See <http://www.gnu.org/licenses/>.
*/
#include "flint/arith.h"
#include "arb.h"
ulong euler_mod_p_powsum_noredc(ulong n, ulong p, const unsigned int * divtab);
ulong euler_mod_p_powsum(ulong n, ulong p, const unsigned int * divtab);
ulong euler_mod_p_powsum_1(ulong n, ulong p);
static void
divisor_table_odd(unsigned int * tab, slong len)
{
slong i, j;
tab[0] = 0;
for (i = 1; i < len; i += 2)
{
tab[i] = 1;
tab[i + 1] = i;
}
for (i = 3; i < len; i += 2)
{
for (j = 3; j <= i && i * j < len; j += 2)
{
tab[i * j] = j;
tab[i * j + 1] = i;
}
}
}
int main()
{
slong iter;
flint_rand_t state;
flint_printf("euler_number_ui....");
fflush(stdout);
flint_randinit(state);
{
slong nmax;
unsigned int * divtab_odd;
fmpz * En;
nmax = 1000 * FLINT_MIN(1.0, arb_test_multiplier());
En = _fmpz_vec_init(nmax);
arith_euler_number_vec(En, nmax);
{
fmpz_t E;
slong n;
double alpha;
fmpz_init(E);
for (n = 0; n < nmax; n++)
{
if (n_randint(state, 2))
alpha = -1.0;
else
alpha = n_randint(state, 11) / (double) 10;
arb_fmpz_euler_number_ui_multi_mod(E, n, alpha);
if (!fmpz_equal(En + n, E))
{
flint_printf("FAIL: n = %wd\n", n);
flint_printf("vec: "); fmpz_print(En + n); flint_printf("\n");
flint_printf("single: "); fmpz_print(E); flint_printf("\n");
flint_abort();
}
arb_fmpz_euler_number_ui(E, n);
if (!fmpz_equal(En + n, E))
{
flint_printf("FAIL (2): n = %wd\n", n);
flint_printf("vec: "); fmpz_print(En + n); flint_printf("\n");
flint_printf("single: "); fmpz_print(E); flint_printf("\n");
flint_abort();
}
}
fmpz_clear(E);
}
/* test the mod p code */
for (iter = 0; iter < 10000 * arb_test_multiplier(); iter++)
{
ulong p, n, m, m1, m5;
n = n_randint(state, nmax);
p = 4 + n_randint(state, 10000);
p = n_nextprime(p, 1);
divtab_odd = flint_malloc(sizeof(unsigned int) * (p / 4 + 2));
divisor_table_odd(divtab_odd, p / 4 + 1);
m = fmpz_fdiv_ui(En + n, p);
if (n_randint(state, 2))
m5 = euler_mod_p_powsum(n, p, divtab_odd);
else
m5 = euler_mod_p_powsum_noredc(n, p, divtab_odd);
if (n_randint(state, 30) == 0)
{
m1 = euler_mod_p_powsum_1(n, p);
if (m1 != UWORD_MAX && m != m1)
{
flint_printf("FAIL\n\n");
flint_printf("n = %wu, p = %wu, m = %wu, m1 = %wu\n\n", n, p, m, m1);
flint_abort();
}
}
if (m5 != UWORD_MAX && m != m5)
{
flint_printf("FAIL\n\n");
flint_printf("n = %wu, p = %wu, m = %wu, m5 = %wu\n\n", n, p, m, m5);
flint_abort();
}
flint_free(divtab_odd);
}
_fmpz_vec_clear(En, nmax);
}
flint_randclear(state);
flint_cleanup();
flint_printf("PASS\n");
return EXIT_SUCCESS;
}

View file

@ -1580,17 +1580,21 @@ Other special functions
is described in detail in [Joh2015]_.
.. function:: void arb_euler_number_fmpz(arb_t res, const fmpz_t n, slong prec)
void arb_euler_number_ui(arb_t res, ulong n, slong prec)
.. function:: void arb_euler_number_ui(arb_t res, ulong n, slong prec)
Sets *res* to the Euler number `E_n`, which is defined by having
Sets *res* to the Euler number `E_n`, which is defined by
the exponential generating function `1 / \cosh(x)`.
The result will be exact if `E_n` is exactly representable
at the requested precision.
The Euler product for the Dirichlet beta function
(:func:`_acb_dirichlet_euler_product_real_ui`) is used to compute
a numerical approximation. If *prec* is more than enough to represent
the result exactly, the exact value is automatically determined
from a lower-precision approximation.
.. function:: void arb_fmpz_euler_number_ui_multi_mod(fmpz_t res, ulong n, double alpha)
void arb_fmpz_euler_number_ui(fmpz_t res, ulong n)
Computes the Euler number `E_n` as an exact integer. The default
algorithm uses a table lookup, the Dirichlet beta function or a
hybrid modular algorithm depending on the size of *n*.
The *multi_mod* algorithm accepts a tuning parameter *alpha* which
can be set to a negative value to use defaults.
.. function:: void arb_partitions_fmpz(arb_t res, const fmpz_t n, slong prec)