From 2fb86a0383be765e44e3c14d0f71a5ddf5da2fa8 Mon Sep 17 00:00:00 2001 From: Fredrik Johansson Date: Thu, 17 Aug 2017 23:51:03 +0200 Subject: [PATCH] speed up acb_elliptic_rf for high precision by using a variable length Taylor expansion --- acb_elliptic/rf.c | 208 +++++++++++++++++++++++++++--------- doc/source/acb_elliptic.rst | 3 +- 2 files changed, 158 insertions(+), 53 deletions(-) diff --git a/acb_elliptic/rf.c b/acb_elliptic/rf.c index c3058e11..fab79ea9 100644 --- a/acb_elliptic/rf.c +++ b/acb_elliptic/rf.c @@ -11,8 +11,111 @@ #include "acb_elliptic.h" -/* todo: fast handling of y == z (RC function) and other special cases */ -/* todo: allow computing cauchy PV */ +void +acb_elliptic_rf_taylor_sum(acb_t res, const acb_t E2, const acb_t E3, slong nterms, slong prec) +{ + fmpz_t den, c, d, e; + acb_ptr E2pow; + arb_ptr E2powr; + acb_t s; + slong x, y, XMAX, YMAX, NMAX, N; + int real; + + NMAX = nterms - 1; + YMAX = NMAX / 3; + XMAX = NMAX / 2; + real = acb_is_real(E2) && acb_is_real(E3); + + fmpz_init(den); + fmpz_init(c); + fmpz_init(d); + fmpz_init(e); + acb_init(s); + if (real) + { + E2powr = _arb_vec_init(XMAX + 1); + E2pow = NULL; + _arb_vec_set_powers(E2powr, acb_realref(E2), XMAX + 1, prec); + } + else + { + E2pow = _acb_vec_init(XMAX + 1); + E2powr = NULL; + _acb_vec_set_powers(E2pow, E2, XMAX + 1, prec); + } + + /* Compute universal denominator (could tighten this?). */ + fmpz_one(den); + for (x = 3; x <= 2 * NMAX + 1; x += 2) + fmpz_mul_ui(den, den, x); + fmpz_mul_2exp(den, den, NMAX); + + /* Compute initial coefficient rf(1/2,y) / y! */ + fmpz_set(c, den); + for (y = 0; y < YMAX; y++) + { + fmpz_mul_ui(c, c, 2 * y + 1); + fmpz_divexact_ui(c, c, 2 * y + 2); + } + + acb_zero(res); + + for (y = YMAX; y >= 0; y--) + { + acb_zero(s); + + if (y != YMAX) + { + fmpz_mul_ui(c, c, 2 * y + 2); + fmpz_divexact_ui(c, c, 2 * y + 1); + } + + fmpz_set(d, c); + + /* Use powers with respect to E2 */ + for (x = 0; x <= XMAX; x++) + { + N = 2 * x + 3 * y; + + if (N <= NMAX) + { + fmpz_divexact_ui(e, d, 2 * N + 1); + + if (x % 2 == 1) + fmpz_neg(e, e); + + if (x != 0 || y != 0) + { + if (real) + arb_addmul_fmpz(acb_realref(s), E2powr + x, e, prec); + else + acb_addmul_fmpz(s, E2pow + x, e, prec); + } + + fmpz_mul_ui(d, d, 2 * x + 2 * y + 1); + fmpz_divexact_ui(d, d, 2 * x + 2); + } + } + + /* Horner with respect to E3 */ + acb_mul(res, res, E3, prec); + acb_add(res, res, s, prec); + } + + acb_div_fmpz(res, res, den, prec); + acb_add_ui(res, res, 1, prec); + + fmpz_clear(den); + fmpz_clear(c); + fmpz_clear(d); + fmpz_clear(e); + acb_clear(s); + if (real) + _arb_vec_clear(E2powr, XMAX + 1); + else + _acb_vec_clear(E2pow, XMAX + 1); +} + void acb_elliptic_rf(acb_t res, const acb_t x, const acb_t y, const acb_t z, int flags, slong prec) @@ -20,7 +123,7 @@ acb_elliptic_rf(acb_t res, const acb_t x, const acb_t y, const acb_t z, acb_t xx, yy, zz, sx, sy, sz, t; acb_t X, Y, Z, E2, E3; mag_t err, err2, prev_err; - slong k, wp, accx, accy, accz; + slong k, wp, accx, accy, accz, order; if (!acb_is_finite(x) || !acb_is_finite(y) || !acb_is_finite(z)) { @@ -42,25 +145,26 @@ acb_elliptic_rf(acb_t res, const acb_t x, const acb_t y, const acb_t z, mag_init(err2); mag_init(prev_err); + order = 5; /* will be set later */ + acb_set(xx, x); acb_set(yy, y); acb_set(zz, z); - wp = prec + 20; + /* First guess precision based on the inputs. */ + /* This does not account for mixing. */ + accx = acb_rel_accuracy_bits(xx); + accy = acb_rel_accuracy_bits(yy); + accz = acb_rel_accuracy_bits(zz); + accx = FLINT_MAX(accx, accy); + accx = FLINT_MAX(accx, accz); + if (accx < prec - 20) + prec = FLINT_MAX(2, accx + 20); + wp = prec + 10 + FLINT_BIT_COUNT(prec); - /* must do at least one iteration */ + /* Must do at least one iteration. */ for (k = 0; k < prec; k++) { - accx = acb_rel_accuracy_bits(xx); - accy = acb_rel_accuracy_bits(yy); - accz = acb_rel_accuracy_bits(zz); - - wp = FLINT_MAX(accx, accy); - wp = FLINT_MAX(wp, accz); - wp = FLINT_MAX(wp, 0); - wp = FLINT_MIN(wp, prec); - wp += 20; - acb_sqrt(sx, xx, wp); acb_sqrt(sy, yy, wp); acb_sqrt(sz, zz, wp); @@ -77,9 +181,29 @@ acb_elliptic_rf(acb_t res, const acb_t x, const acb_t y, const acb_t z, acb_mul_2exp_si(yy, yy, -2); acb_mul_2exp_si(zz, zz, -2); + /* Improve precision estimate and set expansion order. */ + /* Should this done for other k also? */ + if (k == 0) + { + accx = acb_rel_accuracy_bits(xx); + accy = acb_rel_accuracy_bits(yy); + accz = acb_rel_accuracy_bits(zz); + accx = FLINT_MAX(accx, accy); + accx = FLINT_MAX(accx, accz); + if (accx < prec - 20) + prec = FLINT_MAX(2, accx + 20); + wp = prec + 10 + FLINT_BIT_COUNT(prec); + + if (acb_is_real(xx) && acb_is_real(yy) && acb_is_real(zz)) + order = 2.0 * pow(prec, 0.4); + else + order = 2.5 * pow(prec, 0.4); + order = FLINT_MIN(order, 500); + order = FLINT_MAX(order, 2); + } + /* Close enough? Quick estimate based on |x-y|/|x| and |x-z|/|x| */ /* We also terminate if there is no improvement. */ - acb_sub(t, xx, yy, wp); acb_get_mag(err, t); acb_sub(t, xx, zz, wp); @@ -87,8 +211,7 @@ acb_elliptic_rf(acb_t res, const acb_t x, const acb_t y, const acb_t z, mag_max(err, err, err2); acb_get_mag_lower(err2, xx); mag_div(err, err, err2); - mag_pow_ui(err, err, 8); - + mag_pow_ui(err, err, order); if (mag_cmp_2exp_si(err, -prec) < 0 || (k > 2 && mag_cmp(err, prev_err) > 0)) break; @@ -97,35 +220,29 @@ acb_elliptic_rf(acb_t res, const acb_t x, const acb_t y, const acb_t z, } /* X = 1-x/t, Y = 1-y/t, Z = -X-Y, t = (x+y+z)/3 */ - acb_add(t, xx, yy, prec); - acb_add(t, t, zz, prec); - acb_div_ui(t, t, 3, prec); + acb_add(t, xx, yy, wp); + acb_add(t, t, zz, wp); + acb_div_ui(t, t, 3, wp); - acb_div(X, xx, t, prec); - acb_sub_ui(X, X, 1, prec); + acb_div(X, xx, t, wp); + acb_sub_ui(X, X, 1, wp); acb_neg(X, X); - acb_div(Y, yy, t, prec); - acb_sub_ui(Y, Y, 1, prec); + acb_div(Y, yy, t, wp); + acb_sub_ui(Y, Y, 1, wp); acb_neg(Y, Y); - acb_add(Z, X, Y, prec); + acb_add(Z, X, Y, wp); acb_neg(Z, Z); /* E2 = XY-Z^2, E3 = XYZ */ - acb_mul(E2, X, Y, prec); - acb_mul(E3, E2, Z, prec); - acb_submul(E2, Z, Z, prec); + acb_mul(E2, X, Y, wp); + acb_mul(E3, E2, Z, wp); + acb_submul(E2, Z, Z, wp); /* Crude bound for the coefficient of X^n1 Y^n2 Z^n3 with n1+n2+n3=n: 2*(9/8)^n. - We truncate before n = 8 (note: for higher precision, we could gain by - computing more terms). The actual evaluation is done using - elementary symmetric polynomials, giving - - 1 + (-5775*E2**3 + 15015*E2**2*E3 + 10010*E2**2 - 16380*E2*E3 - 24024*E2 - + 6930*E3**2 + 17160*E3)/240240. */ /* Error bound. */ @@ -134,32 +251,21 @@ acb_elliptic_rf(acb_t res, const acb_t x, const acb_t y, const acb_t z, mag_max(err, err, err2); acb_get_mag(err2, Z); mag_max(err, err, err2); + mag_mul_ui(err, err, 9); mag_mul_2exp_si(err, err, -3); - mag_geom_series(err, err, 8); + + mag_geom_series(err, err, order); mag_mul_2exp_si(err, err, 1); - /* 1 + (E2*(E2*(-5775*E2 + 15015*E3 + 10010) - 16380*E3 - 24024) - + E3*(6930*E3 + 17160))/240240 */ - acb_set_ui(sx, 10010); - acb_addmul_ui(sx, E3, 15015, prec); - acb_submul_ui(sx, E2, 5775, prec); - acb_mul(sx, sx, E2, prec); - acb_submul_ui(sx, E3, 16380, prec); - acb_sub_ui(sx, sx, 24024, prec); - acb_mul(sx, sx, E2, prec); - acb_mul_ui(sy, E3, 6930, prec); - acb_add_ui(sy, sy, 17160, prec); - acb_addmul(sx, sy, E3, prec); - acb_div_ui(sx, sx, 240240, prec); - acb_add_ui(sx, sx, 1, prec); + acb_elliptic_rf_taylor_sum(sx, E2, E3, order, wp); if (acb_is_real(X) && acb_is_real(Y)) arb_add_error_mag(acb_realref(sx), err); else acb_add_error_mag(sx, err); - acb_rsqrt(t, t, prec); + acb_rsqrt(t, t, wp); acb_mul(res, sx, t, prec); acb_clear(xx); acb_clear(yy); acb_clear(zz); diff --git a/doc/source/acb_elliptic.rst b/doc/source/acb_elliptic.rst index a491a681..85a96da4 100644 --- a/doc/source/acb_elliptic.rst +++ b/doc/source/acb_elliptic.rst @@ -190,8 +190,7 @@ in [Car1995]_ and chapter 19 in [NIST2012]_. still defined by analytic continuation. In general, one or more duplication steps are applied until - `x,y,z` are close enough to use a multivariate Taylor polynomial - of total degree 7. + `x,y,z` are close enough to use a multivariate Taylor series. The special case `R_C(x, y) = R_F(x, y, y) = \frac{1}{2} \int_0^{\infty} (t+x)^{-1/2} (t+y)^{-1} dt` may be computed by