speed up arb_exp() between 4600 and 19000 bits

2025-03-05 09:21:38 -05:00 · 2017-09-06 16:45:56 +02:00 · 2017-09-06 16:45:56 +02:00 · d3e395f4f4
commit d3e395f4f4
parent fcd3d3bf12
6 changed files with 322 additions and 3 deletions
--- a/arb.h
+++ b/arb.h
@ -937,6 +937,7 @@ void _arb_exp_taylor_rs(mp_ptr y, mp_limb_t * error,
    mp_srcptr x, mp_size_t xn, ulong N);

 void arb_exp_arf_bb(arb_t z, const arf_t x, slong prec, int minus_one);
+void arb_exp_arf_rs_generic(arb_t res, const arf_t x, slong prec, int minus_one);

 int _arb_get_mpn_fixed_mod_log2(mp_ptr w, fmpz_t q, mp_limb_t * error,
                                                const arf_t x, mp_size_t wn);
--- a/arb/exp.c
+++ b/arb/exp.c
@ -77,8 +77,12 @@ arb_exp_arf_overflow(arb_t z, slong expbound, int negative, int minus_one, slong
 static void
 arb_exp_arf_fallback(arb_t z, const arf_t x, slong mag, slong prec, int minus_one)
 {
-    if (mag > 64)
+    /* reduce by log(2) if needed, but avoid computing log(2) unnecessarily at
+       extremely high precision */
+    if (mag > 64 || (mag > 8 && prec < 1000000))
        arb_exp_arf_huge(z, x, mag, prec, minus_one);
+    else if (prec < 19000)
+        arb_exp_arf_rs_generic(z, x, prec, minus_one);
    else
        arb_exp_arf_bb(z, x, prec, minus_one);
 }
--- a/arb/exp_arf_rs_generic.c
+++ b/arb/exp_arf_rs_generic.c
@ -0,0 +1,180 @@
+/*
+    Copyright (C) 2014 Fredrik Johansson
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "arb.h"
+
+void
+arb_exp_taylor_sum_rs_generic(arb_t res, const arb_t x, slong N, slong prec)
+{
+    arb_t s;
+    mag_t err;
+
+    arb_init(s);
+    mag_init(err);
+
+    arb_get_mag(err, x);
+    mag_exp_tail(err, err, N);
+
+    if (N <= 2)
+    {
+        if (N == 0)
+            arb_zero(res);
+        else if (N == 1)
+            arb_one(res);
+        else if (N == 2)
+            arb_add_ui(res, x, 1, prec);
+        arb_add_error_mag(res, err);
+    }
+    else
+    {
+        arb_ptr tpow;
+        slong j, k, m, M, tp, xmag;
+        mp_limb_t c, d, chi, clo;
+
+        xmag = arf_abs_bound_lt_2exp_si(arb_midref(x));
+
+        /* Convert to order as a series in x^2. */
+        M = N / 2;
+
+        m = n_sqrt(M);
+
+        /* not intended (and not 32-bit safe...) */
+        if (M > 30000)
+        {
+            flint_abort();
+        }
+
+        tpow = _arb_vec_init(m + 1);
+
+        arb_mul(s, x, x, prec);
+        _arb_vec_set_powers(tpow, s, m + 1, prec);
+        arb_zero(s);
+
+        c = 1;
+        j = (M - 1) % m;
+
+        for (k = M - 1; k >= 0; k--)
+        {
+            tp = prec - 2 * k * (-xmag) + 10;
+            tp = FLINT_MAX(tp, 2);
+            tp = FLINT_MIN(tp, prec);
+
+            d = (2 * k) * (2 * k + 1);
+
+            if (k != 0)
+            {
+                umul_ppmm(chi, clo, c, d);
+
+                if (chi != 0)
+                {
+                    arb_div_ui(s, s, c, tp);
+                    c = 1;
+                }
+            }
+
+            arb_addmul_ui(s, tpow + j, c, tp);
+
+            if (k != 0)
+            {
+                c *= d;
+
+                if (j == 0)
+                {
+                    arb_mul(s, s, tpow + m, tp);
+                    j = m - 1;
+                }
+                else
+                {
+                    j--;
+                }
+            }
+        }
+
+        arb_div_ui(s, s, c, prec);
+        arb_mul(s, s, x, prec);
+
+        arb_add_error_mag(s, err);
+
+        /* exp = sinh + sqrt(1 + sinh^2) */
+        arb_mul(res, s, s, prec);
+        arb_add_ui(res, res, 1, prec);
+        arb_sqrt(res, res, prec);
+        arb_add(res, res, s, prec);
+
+        _arb_vec_clear(tpow, m + 1);
+    }
+
+    mag_clear(err);
+    arb_clear(s);
+}
+
+void
+arb_exp_arf_rs_generic(arb_t res, const arf_t x, slong prec, int minus_one)
+{
+    slong q, xmag, wp, k, N;
+    arb_t t;
+
+    if (arf_is_zero(x))
+    {
+        if (minus_one)
+            arb_zero(res);
+        else
+            arb_one(res);
+        return;
+    }
+
+    xmag = arf_abs_bound_lt_2exp_si(x);
+
+    /* 1 + x + O(x^2) */
+    /* We don't really need to worry too much about degenerate input
+       because the main exp function already takes care of it. */
+    if (xmag < -prec - 4)
+    {
+        mag_t t;
+        mag_init(t);
+        arf_get_mag(t, x);
+        mag_exp_tail(t, t, 2);
+        arb_set_arf(res, x);
+        arb_add_ui(res, res, minus_one ? 0 : 1, prec);
+        arb_add_error_mag(res, t);
+        mag_clear(t);
+        return;
+    }
+
+    arb_init(t);
+
+    /* generic tuning value */
+    q = 4.5 * pow(prec, 0.2);
+    q = FLINT_MAX(q, 6);
+    /* adjust to magnitude */
+    q = FLINT_MAX(0, xmag + q);
+
+    wp = prec + 10 + 2 * q + 2 * FLINT_BIT_COUNT(prec);
+    if (minus_one && xmag < 0)
+        wp += (-xmag);
+
+    /* t = x/2^q */
+    arf_mul_2exp_si(arb_midref(t), x, -q);
+
+    N = _arb_exp_taylor_bound(xmag - q, wp);
+    arb_exp_taylor_sum_rs_generic(t, t, N, wp);
+
+    /* exp(x) = exp(x/2^q)^(2^q) */
+    for (k = 0; k < q; k++)
+        arb_mul(t, t, t, wp);
+
+    if (minus_one)
+        arb_sub_ui(t, t, 1, wp);
+
+    arb_set_round(res, t, prec);
+    arb_clear(t);
+}
+
--- a/arb/sin_cos_arf_generic.c
+++ b/arb/sin_cos_arf_generic.c
@ -11,8 +11,6 @@

 #include "arb.h"

-slong _arb_get_exp_pos(const slong * tab, slong step);
-
 static void
 arb_zero_pm_one(arb_t res)
 {
--- a/arb/test/t-exp_arf_rs_generic.c
+++ b/arb/test/t-exp_arf_rs_generic.c
@ -0,0 +1,131 @@
+/*
+    Copyright (C) 2017 Fredrik Johansson
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "arb.h"
+
+/* these functions are not exposed to the public for now,
+   but it still makes sense to test them explicitly */
+void arb_exp_taylor_sum_rs_generic(arb_t s, const arb_t x, slong N, slong prec);
+
+int main()
+{
+    slong iter;
+    flint_rand_t state;
+
+    flint_printf("exp_arf_rs_generic....");
+    fflush(stdout);
+
+    flint_randinit(state);
+
+    /* test the rs algorithm explicitly */
+    for (iter = 0; iter < 10000 * arb_test_multiplier(); iter++)
+    {
+        arb_t x, s1, s2;
+        slong prec;
+        int minus1;
+
+        arb_init(x);
+        arb_init(s1);
+        arb_init(s2);
+
+        prec = 2 + n_randint(state, 2000);
+        minus1 = n_randint(state, 2);
+        arb_randtest(x, state, 1 + n_randint(state, 2000), 3);
+        mag_zero(arb_radref(x));
+
+        if (n_randint(state, 2))
+            arb_mul_2exp_si(x, x, -n_randint(state, 2 * prec));
+
+        if (minus1)
+            arb_expm1(s1, x, prec);
+        else
+            arb_exp(s1, x, prec);
+
+        switch (n_randint(state, 2))
+        {
+            case 0:
+                arb_exp_arf_rs_generic(s2, arb_midref(x), prec, minus1);
+                break;
+            case 1:
+                arb_set(s2, x);
+                arb_exp_arf_rs_generic(s2, arb_midref(s2), prec, minus1);
+                break;
+        }
+
+        if (!arb_overlaps(s1, s2))
+        {
+            flint_printf("FAIL: overlap\n\n");
+            flint_printf("prec = %wd, minus1 = %d\n\n", prec, minus1);
+            flint_printf("x = "); arb_printn(x, 500, 0); flint_printf("\n\n");
+            flint_printf("s1 = "); arb_printn(s1, 500, 0); flint_printf("\n\n");
+            flint_printf("s2 = "); arb_printn(s2, 500, 0); flint_printf("\n\n");
+            flint_abort();
+        }
+
+        if (arb_rel_accuracy_bits(s2) < prec - 2)
+        {
+            flint_printf("FAIL: poor accuracy\n\n");
+            flint_printf("prec = %wd,  acc1 = %wd,  minus1 = %d\n\n",
+                prec, arb_rel_accuracy_bits(s2), minus1);
+            flint_printf("x = "); arb_printn(x, 500, 0); flint_printf("\n\n");
+            flint_printf("s1 = "); arb_printn(s1, 500, 0); flint_printf("\n\n");
+            flint_printf("s2 = "); arb_printn(s2, 500, 0); flint_printf("\n\n");
+            flint_abort();
+        }
+
+        arb_clear(x);
+        arb_clear(s1);
+        arb_clear(s2);
+    }
+
+    /* test the series evaluation code directly */
+    for (iter = 0; iter < 10000 * arb_test_multiplier(); iter++)
+    {
+        arb_t x, y, z;
+        slong prec;
+        slong N;
+
+        arb_init(x);
+        arb_init(y);
+        arb_init(z);
+
+        prec = 2 + n_randint(state, 2000);
+        N = n_randint(state, 100);
+        arb_randtest(x, state, 1 + n_randint(state, 2000), 1);
+        mag_zero(arb_radref(x));
+
+        if (n_randint(state, 2))
+            arb_mul_2exp_si(x, x, -n_randint(state, 2 * prec));
+
+        arb_exp(y, x, prec);
+        arb_exp_taylor_sum_rs_generic(z, x, N, prec);
+
+        if (!arb_overlaps(z, y))
+        {
+            flint_printf("FAIL: overlap\n\n");
+            flint_printf("prec = %wd, N = %wd\n\n", prec, N);
+            flint_printf("x = "); arb_printn(x, 500, 0); flint_printf("\n\n");
+            flint_printf("y = "); arb_printn(y, 500, 0); flint_printf("\n\n");
+            flint_printf("z = "); arb_printn(z, 500, 0); flint_printf("\n\n");
+            flint_abort();
+        }
+
+        arb_clear(x);
+        arb_clear(y);
+        arb_clear(z);
+    }
+
+    flint_randclear(state);
+    flint_cleanup();
+    flint_printf("PASS\n");
+    return EXIT_SUCCESS;
+}
+
--- a/doc/source/arb.rst
+++ b/doc/source/arb.rst
@ -1541,6 +1541,11 @@ Internals for computing elementary functions
    resulting in slightly higher memory usage but better speed. For best
    efficiency, *N* should have many trailing zero bits.

+.. function:: void arb_exp_arf_rs_generic(arb_t res, const arf_t x, slong prec, int minus_one)
+
+    Computes the exponential function using a generic version of the rectangular
+    splitting strategy, intended for intermediate precision.
+
 .. function:: void _arb_atan_sum_bs_simple(fmpz_t T, fmpz_t Q, mp_bitcnt_t * Qexp, const fmpz_t x, mp_bitcnt_t r, slong N)

 .. function:: void _arb_atan_sum_bs_powtab(fmpz_t T, fmpz_t Q, mp_bitcnt_t * Qexp, const fmpz_t x, mp_bitcnt_t r, slong N)