diff --git a/arb.h b/arb.h index 210e39f0..ed8a3031 100644 --- a/arb.h +++ b/arb.h @@ -607,6 +607,8 @@ void arb_bell_ui(arb_t res, ulong n, slong prec); void arb_euler_number_fmpz(arb_t res, const fmpz_t n, slong prec); void arb_euler_number_ui(arb_t res, ulong n, slong prec); +void arb_fmpz_euler_number_ui_multi_mod(fmpz_t res, ulong n, double alpha); +void arb_fmpz_euler_number_ui(fmpz_t res, ulong n); void arb_partitions_fmpz(arb_t res, const fmpz_t n, slong prec); void arb_partitions_ui(arb_t res, ulong n, slong prec); diff --git a/arb/euler_number_ui.c b/arb/euler_number_ui.c index caad2551..ec08dc9f 100644 --- a/arb/euler_number_ui.c +++ b/arb/euler_number_ui.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2016 Fredrik Johansson + Copyright (C) 2016, 2021 Fredrik Johansson This file is part of Arb. @@ -37,7 +37,7 @@ arb_euler_number_mag(double n) return x; } -static void +void arb_euler_number_ui_beta(arb_t res, ulong n, slong prec) { slong pi_prec; @@ -65,12 +65,621 @@ arb_euler_number_ui_beta(arb_t res, ulong n, slong prec) arb_clear(t); } +#define LOW_MASK ((UWORD(1) << (FLINT_BITS / 2)) - 1) + +typedef struct +{ + ulong n; + ulong ninv; + ulong F; +} +nmod_redc_t; + +static void +nmod_redc_init(nmod_redc_t * rmod, nmod_t mod) +{ + ulong n, ninv2; + int bits; + + n = mod.n; + rmod->n = n; + rmod->F = (UWORD(1) << (FLINT_BITS / 2)); + NMOD_RED(rmod->F, rmod->F, mod); + + /* Newton's method for 2-adic inversion */ + ninv2 = -n; /* already correct mod 8 */ + for (bits = 3; bits < FLINT_BITS / 2; bits *= 2) + ninv2 = 2*ninv2 + n * ninv2 * ninv2; + + rmod->ninv = ninv2 & LOW_MASK; +} + +static __inline__ ulong +nmod_redc_fast(ulong x, ulong n, ulong ninv2) +{ + ulong y = (x * ninv2) & LOW_MASK; + ulong z = x + (n * y); + return z >> (FLINT_BITS / 2); +} + +static __inline__ ulong +nmod_redc(ulong x, ulong n, ulong ninv2) +{ + ulong y = nmod_redc_fast(x, n, ninv2); + if (y >= n) + y -= n; + return y; +} + +static ulong +nmod_redc_mul_fast(ulong a, ulong b, nmod_redc_t rmod) +{ + return nmod_redc_fast(a * b, rmod.n, rmod.ninv); +} + +static ulong +nmod_redc_mul(ulong a, ulong b, nmod_redc_t rmod) +{ + return nmod_redc(a * b, rmod.n, rmod.ninv); +} + + +static ulong +nmod_to_redc(ulong x, nmod_t mod, nmod_redc_t rmod) +{ + return nmod_mul(x, rmod.F, mod); +} + +static ulong +nmod_from_redc(ulong x, nmod_redc_t rmod) +{ + return nmod_redc(x, rmod.n, rmod.ninv); +} + +ulong +nmod_redc_pow_ui(ulong a, ulong exp, nmod_redc_t rmod) +{ + ulong x; + + while ((exp & 1) == 0) + { + a = nmod_redc_mul(a, a, rmod); + exp >>= 1; + } + + x = a; + + while (exp >>= 1) + { + a = nmod_redc_mul(a, a, rmod); + + if (exp & 1) + x = nmod_redc_mul(x, a, rmod); + } + + return x; +} + +ulong +euler_mod_p_powsum_1(ulong n, ulong p) +{ + slong j; + ulong s, t; + nmod_t mod; + + if (n % 2 == 1) + return 0; + + n = n % (p - 1); + + if (n == 0) + return UWORD_MAX; + + nmod_init(&mod, p); + s = 1; + + for (j = 3; j <= p - 2; j += 2) + { + t = nmod_pow_ui(j, n, mod); + s = nmod_sub(t, s, mod); + } + + if (p % 4 == 1) + s = nmod_neg(s, mod); + + s = nmod_add(s, s, mod); + return s; +} + +ulong +euler_mod_p_powsum_noredc(ulong n, ulong p, const unsigned int * divtab) +{ + unsigned int * pows; + slong i, N, horner_point; + ulong s, t, z; + ulong v2n, power_of_two; + nmod_t mod; + TMP_INIT; + + if (n % 2 == 1) + return 0; + + n = n % (p - 1); + + if (n == 0) + return UWORD_MAX; + + N = p / 4; + + nmod_init(&mod, p); + + TMP_START; + pows = TMP_ALLOC(sizeof(unsigned int) * (N / 3 + 1)); + + s = z = 0; + + /* Evaluate as a polynomial in 2^n */ + power_of_two = 1; + while (power_of_two * 2 <= N) + power_of_two *= 2; + + horner_point = 1; + v2n = nmod_pow_ui(2, n, mod); + + for (i = 1; i <= N / 3; i += 2) + { + if (divtab[i] == 1) + t = nmod_pow_ui(i, n, mod); + else + t = nmod_mul(pows[divtab[i]], pows[divtab[i + 1]], mod); + + pows[i] = t; + s = nmod_add(s, t, mod); + + if (i == horner_point) + { + while (i == horner_point && power_of_two != 1) + { + z = nmod_add(s, nmod_mul(v2n, z, mod), mod); + power_of_two /= 2; + horner_point = N / power_of_two; + if (horner_point % 2 == 0) + horner_point--; + } + } + } + + /* Same as above, but here we don't need to write the powers. */ + for ( ; i <= N; i += 2) + { + if (divtab[i] == 1) + t = nmod_pow_ui(i, n, mod); + else + t = nmod_mul(pows[divtab[i]], pows[divtab[i + 1]], mod); + + s = nmod_add(s, t, mod); + + if (i == horner_point) + { + while (i == horner_point && power_of_two != 1) + { + z = nmod_add(s, nmod_mul(v2n, z, mod), mod); + power_of_two /= 2; + horner_point = N / power_of_two; + if (horner_point % 2 == 0) + horner_point--; + } + } + } + + s = nmod_add(s, nmod_mul(v2n, z, mod), mod); + + if (p % 4 == 3) + s = nmod_neg(s, mod); + + t = nmod_inv(nmod_pow_ui(4, p - n - 2, mod), mod); + s = nmod_mul(s, t, mod); + + TMP_END; + + return s; +} + +ulong +euler_mod_p_powsum_redc(ulong n, ulong p, const unsigned int * divtab) +{ + unsigned int * pows; + slong i, N, horner_point; + ulong s, t, z; + ulong v2n, power_of_two; + nmod_t mod; + nmod_redc_t rmod; + TMP_INIT; + + if (n % 2 == 1) + return 0; + + n = n % (p - 1); + + if (n == 0) + return UWORD_MAX; + + N = p / 4; + + nmod_init(&mod, p); + nmod_redc_init(&rmod, mod); + + TMP_START; + pows = TMP_ALLOC(sizeof(unsigned int) * (N / 3 + 1)); + + s = z = 0; + + /* Evaluate as a polynomial in 2^n */ + power_of_two = 1; + while (power_of_two * 2 <= N) + power_of_two *= 2; + + horner_point = 1; + v2n = nmod_redc_pow_ui(nmod_to_redc(2, mod, rmod), n, rmod); + + for (i = 1; i <= N / 3; i += 2) + { + if (divtab[i] == 1) + t = nmod_redc_pow_ui(nmod_to_redc(i, mod, rmod), n, rmod); + else + t = nmod_redc_mul(pows[divtab[i]], pows[divtab[i + 1]], rmod); + + pows[i] = t; + s += t; + + if (i == horner_point) + { + while (i == horner_point && power_of_two != 1) + { + NMOD_RED(s, s, mod); + z = nmod_add(s, nmod_redc_mul(v2n, z, rmod), mod); + power_of_two /= 2; + horner_point = N / power_of_two; + if (horner_point % 2 == 0) + horner_point--; + } + } + } + + /* Same as above, but here we don't need to write the powers. */ + for ( ; i <= N; i += 2) + { + if (divtab[i] == 1) + t = nmod_redc_pow_ui(nmod_to_redc(i, mod, rmod), n, rmod); + else + t = nmod_redc_mul_fast(pows[divtab[i]], pows[divtab[i + 1]], rmod); + + s += t; + + if (i == horner_point) + { + while (i == horner_point && power_of_two != 1) + { + NMOD_RED(s, s, mod); + z = nmod_add(s, nmod_redc_mul(v2n, z, rmod), mod); + power_of_two /= 2; + horner_point = N / power_of_two; + if (horner_point % 2 == 0) + horner_point--; + } + } + } + + NMOD_RED(s, s, mod); + s = nmod_add(s, nmod_redc_mul(v2n, z, rmod), mod); + s = nmod_from_redc(s, rmod); + + if (p % 4 == 3) + s = nmod_neg(s, mod); + + t = nmod_inv(nmod_pow_ui(4, p - n - 2, mod), mod); + s = nmod_mul(s, t, mod); + + TMP_END; + + return s; +} + +ulong +euler_mod_p_powsum(ulong n, ulong p, const unsigned int * divtab) +{ + if (p < (UWORD(1) << (FLINT_BITS / 2 - 1))) + return euler_mod_p_powsum_redc(n, p, divtab); + else + return euler_mod_p_powsum_noredc(n, p, divtab); +} + +static void +divisor_table_odd(unsigned int * tab, slong len) +{ + slong i, j; + + tab[0] = 0; + + for (i = 1; i < len; i += 2) + { + tab[i] = 1; + tab[i + 1] = i; + } + + for (i = 3; i < len; i += 2) + { + for (j = 3; j <= i && i * j < len; j += 2) + { + tab[i * j] = j; + tab[i * j + 1] = i; + } + } +} + +#define TIMING 0 +#define DEBUG 0 + +static void +_fmpz_crt_combine(fmpz_t r1r2, fmpz_t m1m2, const fmpz_t r1, const fmpz_t m1, const fmpz_t r2, const fmpz_t m2) +{ + fmpz_invmod(m1m2, m1, m2); + fmpz_mul(m1m2, m1m2, m1); + fmpz_sub(r1r2, r2, r1); + fmpz_mul(r1r2, r1r2, m1m2); + fmpz_add(r1r2, r1r2, r1); + fmpz_mul(m1m2, m1, m2); + fmpz_mod(r1r2, r1r2, m1m2); +} + +static void +tree_crt(fmpz_t r, fmpz_t m, mp_srcptr residues, mp_srcptr primes, slong len) +{ + if (len == 0) + { + fmpz_zero(r); + fmpz_one(m); + } + else if (len == 1) + { + fmpz_set_ui(r, residues[0]); + fmpz_set_ui(m, primes[0]); + } + else + { + fmpz_t r1, m1, r2, m2; + + fmpz_init(r1); + fmpz_init(m1); + fmpz_init(r2); + fmpz_init(m2); + + tree_crt(r1, m1, residues, primes, len / 2); + tree_crt(r2, m2, residues + len / 2, primes + len / 2, len - len / 2); + _fmpz_crt_combine(r, m, r1, m1, r2, m2); + + fmpz_clear(r1); + fmpz_clear(m1); + fmpz_clear(r2); + fmpz_clear(m2); + } +} + +void +arb_fmpz_euler_number_ui_multi_mod(fmpz_t num, ulong n, double alpha) +{ + slong i, bits, mod_bits, zeta_bits, num_primes; + ulong p; + mp_ptr primes, residues; + mag_t primes_product; + unsigned int * divtab_odd; + fmpz_t M; +#if TIMING + double t1, t2; +#endif + + if (n % 2 != 0) + { + fmpz_zero(num); + return; + } + + if (n < ARB_EULER_NUMBER_TAB_SIZE) + { + if (n % 4 == 0) + fmpz_set_ui(num, arb_euler_number_tab[n / 2]); + else + fmpz_neg_ui(num, arb_euler_number_tab[n / 2]); + return; + } + + if (alpha < 0) + { + if (n < 2000) + alpha = 0.0; + else if (n < 6000) + alpha = 0.002 + 1.0e-5 * (n - 2000); + else if (n < 90000) + alpha = -0.132 + 0.02 * log(n); + else + alpha = FLINT_MIN(0.0085 * log(n), 0.11); + } + +#if TIMING + t1 = clock(); +#endif + + bits = arb_euler_number_mag(n) + 2; + mod_bits = bits * alpha; + zeta_bits = bits - mod_bits; + + num_primes = 0; + mag_init(primes_product); + mag_one(primes_product); + + for (p = 5; mag_cmp_2exp_si(primes_product, mod_bits) < 0; p = n_nextprime(p, 1)) + { + if (n % (p - 1) != 0) + { + mag_mul_ui_lower(primes_product, primes_product, p); + num_primes++; + } + } + +#if DEBUG + printf("\nn = %lu, bits = %lu, num_primes = %ld\n", n, bits, num_primes); +#endif + + primes = flint_malloc(sizeof(mp_limb_t) * num_primes); + residues = flint_malloc(sizeof(mp_limb_t) * num_primes); + + for (p = 5, i = 0; i < num_primes; p = n_nextprime(p, 1)) + { + if (n % (p - 1) != 0) + { + primes[i] = p; + i++; + } + } + + if (num_primes == 0) + { + divtab_odd = NULL; + } + else + { + p = primes[num_primes - 1]; + divtab_odd = flint_malloc(sizeof(unsigned int) * (p / 4 + 2)); + divisor_table_odd(divtab_odd, p / 4 + 1); + } + +#if TIMING + t2 = clock(); + printf("init time = %f\n", (t2 - t1) / (double) CLOCKS_PER_SEC); + printf("num_primes = %ld\n", num_primes); +#endif + + for (i = 0; i < num_primes; i++) + { +#if TIMING + if (i % 10000 == 0) + printf("%ld / %ld\n", i, num_primes); +#endif + + residues[i] = euler_mod_p_powsum(n, primes[i], divtab_odd); + } + +#if TIMING + t2 = clock(); + printf("mod time = %f\n", (t2 - t1) / (double) CLOCKS_PER_SEC); + printf("start CRT\n"); + t1 = clock(); +#endif + + fmpz_init(M); + tree_crt(num, M, residues, primes, num_primes); + fmpz_mod(num, num, M); + + if (n % 4 == 2) + { + fmpz_sub(num, M, num); + fmpz_neg(num, num); + } + +#if TIMING + printf("end CRT\n"); + t2 = clock(); + printf("CRT time = %f\n", (t2 - t1) / (double) CLOCKS_PER_SEC); + t1 = clock(); +#endif + + if (zeta_bits > 0) + { + slong prec; + arb_t b; + fmpz_t t; + + arb_init(b); + fmpz_init(t); + + for (prec = zeta_bits + 10; ; prec += 32) + { + arb_euler_number_ui_beta(b, n, prec); + arb_sub_fmpz(b, b, num, prec); + arb_div_fmpz(b, b, M, prec); + + if (arb_get_unique_fmpz(t, b)) + { + fmpz_addmul(num, t, M); + break; + } + + flint_printf("euler: n = %wu, bits = %wd, mod = %wd, zeta = %wd: get_unique_fmpz failed!\n", n, bits, mod_bits, zeta_bits); + } + + arb_clear(b); + fmpz_clear(t); + } + +#if TIMING + printf("end zeta\n"); + t2 = clock(); + printf("zeta time = %f\n", (t2 - t1) / (double) CLOCKS_PER_SEC); +#endif + + flint_free(primes); + flint_free(residues); + flint_free(divtab_odd); + fmpz_clear(M); + mag_clear(primes_product); +} + +void +arb_fmpz_euler_number_ui(fmpz_t res, ulong n) +{ + arb_t x; + double mag; + + mag = arb_euler_number_mag(n); + + if (n % 2 != 0) + { + fmpz_zero(res); + return; + } + + if (n < ARB_EULER_NUMBER_TAB_SIZE) + { + if (n % 4 == 0) + fmpz_set_ui(res, arb_euler_number_tab[n / 2]); + else + fmpz_neg_ui(res, arb_euler_number_tab[n / 2]); + return; + } + + if (n < 2000) + { + arb_init(x); + arb_euler_number_ui_beta(x, n, mag + 5); + if (!arb_get_unique_fmpz(res, x)) + { + flint_printf("arb_fmpz_euler_number_ui: unexpected inaccuracy\n"); + flint_abort(); + } + arb_clear(x); + } + else + { + arb_fmpz_euler_number_ui_multi_mod(res, n, -1.0); + } +} + void arb_euler_number_ui(arb_t res, ulong n, slong prec) { double mag; - if (n % 2) + if (n % 2 != 0) { arb_zero(res); return; @@ -91,9 +700,8 @@ arb_euler_number_ui(arb_t res, ulong n, slong prec) { fmpz_t t; fmpz_init(t); - arb_euler_number_ui_beta(res, n, mag + 5); - if (arb_get_unique_fmpz(t, res)) - arb_set_round_fmpz(res, t, prec); + arb_fmpz_euler_number_ui(t, n); + arb_set_round_fmpz(res, t, prec); fmpz_clear(t); } else @@ -102,4 +710,3 @@ arb_euler_number_ui(arb_t res, ulong n, slong prec) arb_set_round(res, res, prec); } } - diff --git a/arb/test/t-euler_number_fmpz.c b/arb/test/t-euler_number_fmpz.c index 0e8b6ee7..1e8abfe6 100644 --- a/arb/test/t-euler_number_fmpz.c +++ b/arb/test/t-euler_number_fmpz.c @@ -70,7 +70,7 @@ int main() arb_euler_number_fmpz(b2, n, prec2); } - if (fmpz_cmp_ui(n, 100) < 0) + if (fmpz_cmp_ui(n, 300) < 0) { arith_euler_number(bv, fmpz_get_ui(n)); } @@ -110,4 +110,3 @@ int main() flint_printf("PASS\n"); return EXIT_SUCCESS; } - diff --git a/arb/test/t-euler_number_ui.c b/arb/test/t-euler_number_ui.c new file mode 100644 index 00000000..d8cd4460 --- /dev/null +++ b/arb/test/t-euler_number_ui.c @@ -0,0 +1,148 @@ +/* + Copyright (C) 2021 Fredrik Johansson + + This file is part of Arb. + + Arb is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. See . +*/ + +#include "flint/arith.h" +#include "arb.h" + +ulong euler_mod_p_powsum_noredc(ulong n, ulong p, const unsigned int * divtab); +ulong euler_mod_p_powsum(ulong n, ulong p, const unsigned int * divtab); +ulong euler_mod_p_powsum_1(ulong n, ulong p); + +static void +divisor_table_odd(unsigned int * tab, slong len) +{ + slong i, j; + + tab[0] = 0; + + for (i = 1; i < len; i += 2) + { + tab[i] = 1; + tab[i + 1] = i; + } + + for (i = 3; i < len; i += 2) + { + for (j = 3; j <= i && i * j < len; j += 2) + { + tab[i * j] = j; + tab[i * j + 1] = i; + } + } +} + +int main() +{ + slong iter; + flint_rand_t state; + + flint_printf("euler_number_ui...."); + fflush(stdout); + + flint_randinit(state); + + { + slong nmax; + unsigned int * divtab_odd; + fmpz * En; + + nmax = 1000 * FLINT_MIN(1.0, arb_test_multiplier()); + + En = _fmpz_vec_init(nmax); + arith_euler_number_vec(En, nmax); + + { + fmpz_t E; + slong n; + double alpha; + + fmpz_init(E); + + for (n = 0; n < nmax; n++) + { + if (n_randint(state, 2)) + alpha = -1.0; + else + alpha = n_randint(state, 11) / (double) 10; + + arb_fmpz_euler_number_ui_multi_mod(E, n, alpha); + + if (!fmpz_equal(En + n, E)) + { + flint_printf("FAIL: n = %wd\n", n); + flint_printf("vec: "); fmpz_print(En + n); flint_printf("\n"); + flint_printf("single: "); fmpz_print(E); flint_printf("\n"); + flint_abort(); + } + + arb_fmpz_euler_number_ui(E, n); + + if (!fmpz_equal(En + n, E)) + { + flint_printf("FAIL (2): n = %wd\n", n); + flint_printf("vec: "); fmpz_print(En + n); flint_printf("\n"); + flint_printf("single: "); fmpz_print(E); flint_printf("\n"); + flint_abort(); + } + } + + fmpz_clear(E); + } + + /* test the mod p code */ + for (iter = 0; iter < 10000 * arb_test_multiplier(); iter++) + { + ulong p, n, m, m1, m5; + + n = n_randint(state, nmax); + p = 4 + n_randint(state, 10000); + p = n_nextprime(p, 1); + + divtab_odd = flint_malloc(sizeof(unsigned int) * (p / 4 + 2)); + divisor_table_odd(divtab_odd, p / 4 + 1); + + m = fmpz_fdiv_ui(En + n, p); + + if (n_randint(state, 2)) + m5 = euler_mod_p_powsum(n, p, divtab_odd); + else + m5 = euler_mod_p_powsum_noredc(n, p, divtab_odd); + + if (n_randint(state, 30) == 0) + { + m1 = euler_mod_p_powsum_1(n, p); + + if (m1 != UWORD_MAX && m != m1) + { + flint_printf("FAIL\n\n"); + flint_printf("n = %wu, p = %wu, m = %wu, m1 = %wu\n\n", n, p, m, m1); + flint_abort(); + } + } + + if (m5 != UWORD_MAX && m != m5) + { + flint_printf("FAIL\n\n"); + flint_printf("n = %wu, p = %wu, m = %wu, m5 = %wu\n\n", n, p, m, m5); + flint_abort(); + } + + flint_free(divtab_odd); + } + + _fmpz_vec_clear(En, nmax); + } + + flint_randclear(state); + flint_cleanup(); + flint_printf("PASS\n"); + return EXIT_SUCCESS; +} diff --git a/doc/source/arb.rst b/doc/source/arb.rst index 9094ef83..1e930f20 100644 --- a/doc/source/arb.rst +++ b/doc/source/arb.rst @@ -1580,17 +1580,21 @@ Other special functions is described in detail in [Joh2015]_. .. function:: void arb_euler_number_fmpz(arb_t res, const fmpz_t n, slong prec) + void arb_euler_number_ui(arb_t res, ulong n, slong prec) -.. function:: void arb_euler_number_ui(arb_t res, ulong n, slong prec) - - Sets *res* to the Euler number `E_n`, which is defined by having + Sets *res* to the Euler number `E_n`, which is defined by the exponential generating function `1 / \cosh(x)`. + The result will be exact if `E_n` is exactly representable + at the requested precision. - The Euler product for the Dirichlet beta function - (:func:`_acb_dirichlet_euler_product_real_ui`) is used to compute - a numerical approximation. If *prec* is more than enough to represent - the result exactly, the exact value is automatically determined - from a lower-precision approximation. +.. function:: void arb_fmpz_euler_number_ui_multi_mod(fmpz_t res, ulong n, double alpha) + void arb_fmpz_euler_number_ui(fmpz_t res, ulong n) + + Computes the Euler number `E_n` as an exact integer. The default + algorithm uses a table lookup, the Dirichlet beta function or a + hybrid modular algorithm depending on the size of *n*. + The *multi_mod* algorithm accepts a tuning parameter *alpha* which + can be set to a negative value to use defaults. .. function:: void arb_partitions_fmpz(arb_t res, const fmpz_t n, slong prec)