faster rising factorials again

2025-03-06 01:41:39 -05:00 · 2013-03-09 12:32:56 +01:00 · 2013-03-09 12:32:56 +01:00 · 72e4d68b63
commit 72e4d68b63
parent 79143c1c7e
10 changed files with 606 additions and 9 deletions
--- a/doc/source/credits.rst
+++ b/doc/source/credits.rst
@ -66,6 +66,8 @@ Bibliography
 .. [PS1973] M.S. Paterson and L. J. Stockmeyer, "On the number of nonscalar multiplications necessary to evaluate polynomials", SIAM J. Comput (1973)
 .. [Smi2001] D.M. Smith, "Algorithm: Fortran 90 Software for Floating-Point Multiple Precision Arithmetic, Gamma and Related Functions", Transactions on Mathematical Software 27 (2001) 377-387, http://myweb.lmu.edu/dmsmith/toms2001.pdf
 .. [Tak2000] D.Takahashi, "A fast algorithm for computing large Fibonacci numbers", Information Processing Letters 75 (2000) 243-246. http://www.ii.uni.wroc.pl/~lorys/IPL/article75-6-1.pdf
--- a/doc/source/gamma.rst
+++ b/doc/source/gamma.rst
@ -170,6 +170,43 @@ Rising factorials
    *gamma_rising_fmpcb_ui_bsplit* automatically choose
    an algorithm depending on the inputs.
 .. function :: void gamma_rising_fmprb_ui_delta(fmprb_t y, const fmprb_t x, ulong n, ulong m, long prec)
 .. function :: void gamma_rising_fmpcb_ui_delta(fmpcb_t y, const fmpcb_t x, ulong n, ulong m, long prec)
    Sets `y` to the rising factorial `x (x+1) (x+2) \cdots (x+n-1)`,
    computed as a product of partial products
    `(x+k)(x+k+1)\cdots(x+k+m-1)`. Each partial product is obtained
    from the previous by using a precomputed table of powers of `x` to
    evaluate the difference
    .. math ::
        \Delta_m(x,k) = (x+k+m)_{(m)} - (x+k)_{(m)}.
    The instance `m = 4` of this algorithm was used by Smith ([Smi2001]_),
    but we generalize it to a variable `m` which can be chosen nearly
    optimally depending on the precision and `n`.
    The polynomials `\Delta_m(x,k) \in \mathbb{Z}[k][x]` are generated dynamically.
    Expanding the rising factorials, applying the binomial theorem
    a couple of times, and doing several rearrangements of the sums, we
    find the closed form
    .. math ::
        \Delta_m(x,k) = \sum_{v=0}^{m-1} x^v \sum_{i=0}^{m-v-1} k^i C_m(v,i),
    where
    .. math ::
        C_m(v,i) = \sum_{j=i+1}^{m-v} m^{j-i} \left[{m \atop v+j}\right] {{v+j} \choose v} {j \choose i}
    in which the square bracket denotes an unsigned Stirling number
    of the first kind.
 .. function :: void gamma_rising_fmprb_ui_multipoint(fmprb_t f, const fmprb_t c, ulong n, long prec)
    Sets `y` to the rising factorial `x (x+1) (x+2) \cdots (x+n-1)`,
--- a/gamma.h
+++ b/gamma.h
@ -81,6 +81,9 @@ void gamma_rising_fmprb_ui_bsplit_eight(fmprb_t y, const fmprb_t x, ulong n, lon
 void gamma_rising_fmprb_ui_bsplit_rectangular(fmprb_t y, const fmprb_t x, ulong n, ulong step, long prec);
 void gamma_rising_fmprb_ui_bsplit(fmprb_t y, const fmprb_t x, ulong n, long prec);
 void gamma_rising_fmprb_ui_delta(fmprb_t y, const fmprb_t x, ulong n, ulong m, long prec);
 void gamma_rising_fmpcb_ui_delta(fmpcb_t y, const fmpcb_t x, ulong n, ulong m, long prec);
 void gamma_rising_fmpcb_ui_bsplit_simple(fmpcb_t y, const fmpcb_t x, ulong n, long prec);
 void gamma_rising_fmpcb_ui_bsplit_eight(fmpcb_t y, const fmpcb_t x, ulong n, long prec);
 void gamma_rising_fmpcb_ui_bsplit_rectangular(fmpcb_t y, const fmpcb_t x, ulong n, ulong step, long prec);
--- a/gamma/rising_fmpcb_ui_bsplit.c
+++ b/gamma/rising_fmpcb_ui_bsplit.c
@ -28,11 +28,9 @@
 void
 gamma_rising_fmpcb_ui_bsplit(fmpcb_t y, const fmpcb_t x, ulong n, long prec)
 {
-    if (n < 8 || fmpcb_bits(x) < prec / 8)
+    if (prec < 256 || n < 8 || fmpcb_bits(x) < prec / 8)
        gamma_rising_fmpcb_ui_bsplit_simple(y, x, n, prec);
    else if (prec < 250 || n < 250000 / prec)
        gamma_rising_fmpcb_ui_bsplit_eight(y, x, n, prec);
    else
-        gamma_rising_fmpcb_ui_bsplit_rectangular(y, x, n, 0, prec);
+        gamma_rising_fmpcb_ui_delta(y, x, n, 0, prec);
 }
--- a/gamma/rising_fmpcb_ui_delta.c
+++ b/gamma/rising_fmpcb_ui_delta.c
@ -0,0 +1,139 @@
 /*=============================================================================
    This file is part of ARB.
    ARB is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    ARB is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License
    along with ARB; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 =============================================================================*/
 /******************************************************************************
    Copyright (C) 2013 Fredrik Johansson
 ******************************************************************************/
 #include "gamma.h"
 #include "arith.h"
 void rising_difference_polynomial(fmpz * s, fmpz * c, ulong m);
 void
 gamma_rising_fmpcb_ui_delta(fmpcb_t y, const fmpcb_t x, ulong n, ulong m, long prec)
 {
    fmpcb_struct * xs;
    fmpcb_t t, u, v;
    ulong i, k, rem;
    fmpz_t c, h;
    fmpz *s, *d;
    long wp;
    if (n == 0)
    {
        fmpcb_one(y);
        return;
    }
    if (n == 1)
    {
        fmpcb_set_round(y, x, prec);
        return;
    }
    wp = FMPR_PREC_ADD(prec, FLINT_BIT_COUNT(n));
    fmpcb_init(t);
    fmpcb_init(u);
    fmpcb_init(v);
    fmpz_init(c);
    fmpz_init(h);
    if (m == 0)
    {
        ulong m1, m2;
        m1 = 0.04 * pow(2.0 * wp, 0.65);
        m2 = n_sqrt(n);
        m = FLINT_MIN(m1, m2);
    }
    m = FLINT_MIN(m, n);
    m = FLINT_MAX(m, 1);
    xs = _fmpcb_vec_init(m + 1);
    d = _fmpz_vec_init(m * m);
    s = _fmpz_vec_init(m + 1);
    for (i = 0; i <= m; i++)
    {
        if (i == 0)
            fmpcb_one(xs + i);
        else if (i == 1)
            fmpcb_set(xs + i, x);
        else if (i % 2 == 0)
            fmpcb_mul(xs + i, xs + i / 2, xs + i / 2, wp);
        else
            fmpcb_mul(xs + i, xs + i - 1, x, wp);
    }
    rising_difference_polynomial(s, d, m);
    /* tail */
    rem = m;
    while (rem + m <= n)
        rem += m;
    fmpcb_one(y);
    for (k = rem; k < n; k++)
    {
        fmpcb_add_ui(t, xs + 1, k, wp);
        fmpcb_mul(y, y, t, wp);
    }
    /* initial rising factorial */
    fmpcb_zero(t);
    for (i = 1; i <= m; i++)
        fmpcb_addmul_fmpz(t, xs + i, s + i, wp);
    fmpcb_mul(y, y, t, wp);
    /* the leading coefficient is always the same */
    fmpcb_mul_fmpz(xs + m - 1, xs + m - 1, d + m - 1 + 0, wp);
    for (k = 0; k + 2 * m <= n; k += m)
    {
        for (i = 0; i < m - 1; i++)
        {
            fmpz_set_ui(h, k);
            _fmpz_poly_evaluate_horner_fmpz(c, d + i * m, m - i, h);
            if (i == 0)
                fmpcb_add_fmpz(t, t, c, wp);
            else
                fmpcb_addmul_fmpz(t, xs + i, c, wp);
        }
        fmpcb_add(t, t, xs + m - 1, wp);
        fmpcb_mul(y, y, t, wp);
    }
    fmpcb_set_round(y, y, prec);
    fmpcb_clear(t);
    fmpcb_clear(u);
    fmpcb_clear(v);
    _fmpcb_vec_clear(xs, m + 1);
    _fmpz_vec_clear(d, m * m);
    _fmpz_vec_clear(s, m + 1);
    fmpz_clear(c);
    fmpz_clear(h);
 }
--- a/gamma/rising_fmprb_ui_bsplit.c
+++ b/gamma/rising_fmprb_ui_bsplit.c
@ -28,11 +28,9 @@
 void
 gamma_rising_fmprb_ui_bsplit(fmprb_t y, const fmprb_t x, ulong n, long prec)
 {
-    if (prec < 768 || n < 8 || fmprb_bits(x) < prec / 8)
+    if (prec < 512 || n < 8 || fmprb_bits(x) < prec / 8)
        gamma_rising_fmprb_ui_bsplit_simple(y, x, n, prec);
    else if (prec < 1500 || n < 500000 / prec)
        gamma_rising_fmprb_ui_bsplit_eight(y, x, n, prec);
    else
-        gamma_rising_fmprb_ui_bsplit_rectangular(y, x, n, 0, prec);
+        gamma_rising_fmprb_ui_delta(y, x, n, 0, prec);
 }
--- a/gamma/rising_fmprb_ui_delta.c
+++ b/gamma/rising_fmprb_ui_delta.c
@ -0,0 +1,174 @@
 /*=============================================================================
    This file is part of ARB.
    ARB is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    ARB is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License
    along with ARB; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 =============================================================================*/
 /******************************************************************************
    Copyright (C) 2013 Fredrik Johansson
 ******************************************************************************/
 #include "gamma.h"
 #include "arith.h"
 void
 rising_difference_polynomial(fmpz * s, fmpz * c, ulong m)
 {
    long i, j, k;
    fmpz_t t;
    fmpz_init(t);
    arith_stirling_number_1u_vec(s, m, m + 1);
    for (k = 0; k < m; k++)
    {
        for (i = 0; i <= m - k - 1; i++)
        {
            fmpz_zero(c + m * k + i);
            for (j = i + 1; j + k <= m; j++)
            {
                if (j == i + 1)
                {
                    fmpz_bin_uiui(t, 1+i+k, k);
                    fmpz_mul_ui(t, t, m * (i+1));
                }
                else
                {
                    fmpz_mul_ui(t, t, m * (k + j));
                    fmpz_divexact_ui(t, t, j - i);
                }
                fmpz_addmul(c + m * k + i, s + j + k, t);
            }
        }
    }
    fmpz_clear(t);
 }
 void
 gamma_rising_fmprb_ui_delta(fmprb_t y, const fmprb_t x, ulong n, ulong m, long prec)
 {
    fmprb_struct * xs;
    fmprb_t t, u, v;
    ulong i, k, rem;
    fmpz_t c, h;
    fmpz *s, *d;
    long wp;
    if (n == 0)
    {
        fmprb_one(y);
        return;
    }
    if (n == 1)
    {
        fmprb_set_round(y, x, prec);
        return;
    }
    wp = FMPR_PREC_ADD(prec, FLINT_BIT_COUNT(n));
    fmprb_init(t);
    fmprb_init(u);
    fmprb_init(v);
    fmpz_init(c);
    fmpz_init(h);
    if (m == 0)
    {
        ulong m1, m2;
        m1 = 0.04 * pow(wp, 0.65);
        m2 = n_sqrt(n);
        m = FLINT_MIN(m1, m2);
    }
    m = FLINT_MIN(m, n);
    m = FLINT_MAX(m, 1);
    xs = _fmprb_vec_init(m + 1);
    d = _fmpz_vec_init(m * m);
    s = _fmpz_vec_init(m + 1);
    for (i = 0; i <= m; i++)
    {
        if (i == 0)
            fmprb_one(xs + i);
        else if (i == 1)
            fmprb_set(xs + i, x);
        else if (i % 2 == 0)
            fmprb_mul(xs + i, xs + i / 2, xs + i / 2, wp);
        else
            fmprb_mul(xs + i, xs + i - 1, x, wp);
    }
    rising_difference_polynomial(s, d, m);
    /* tail */
    rem = m;
    while (rem + m <= n)
        rem += m;
    fmprb_one(y);
    for (k = rem; k < n; k++)
    {
        fmprb_add_ui(t, xs + 1, k, wp);
        fmprb_mul(y, y, t, wp);
    }
    /* initial rising factorial */
    fmprb_zero(t);
    for (i = 1; i <= m; i++)
        fmprb_addmul_fmpz(t, xs + i, s + i, wp);
    fmprb_mul(y, y, t, wp);
    /* the leading coefficient is always the same */
    fmprb_mul_fmpz(xs + m - 1, xs + m - 1, d + m - 1 + 0, wp);
    for (k = 0; k + 2 * m <= n; k += m)
    {
        for (i = 0; i < m - 1; i++)
        {
            fmpz_set_ui(h, k);
            _fmpz_poly_evaluate_horner_fmpz(c, d + i * m, m - i, h);
            if (i == 0)
                fmprb_add_fmpz(t, t, c, wp);
            else
                fmprb_addmul_fmpz(t, xs + i, c, wp);
        }
        fmprb_add(t, t, xs + m - 1, wp);
        fmprb_mul(y, y, t, wp);
    }
    fmprb_set_round(y, y, prec);
    fmprb_clear(t);
    fmprb_clear(u);
    fmprb_clear(v);
    _fmprb_vec_clear(xs, m + 1);
    _fmpz_vec_clear(d, m * m);
    _fmpz_vec_clear(s, m + 1);
    fmpz_clear(c);
    fmpz_clear(h);
 }
--- a/gamma/stirling_choose_param.c
+++ b/gamma/stirling_choose_param.c
@ -27,7 +27,7 @@
 #include "bernoulli.h"
 /* tuning factor */
-#define GAMMA_STIRLING_BETA 0.23
+#define GAMMA_STIRLING_BETA 0.27
 static __inline__ long
 _fmpr_mag(const fmpr_t x)
--- a/gamma/test/t-rising_fmpcb_ui_delta.c
+++ b/gamma/test/t-rising_fmpcb_ui_delta.c
@ -0,0 +1,120 @@
 /*=============================================================================
    This file is part of fmpcb.
    fmpcb is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    fmpcb is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License
    along with fmpcb; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 =============================================================================*/
 /******************************************************************************
    Copyright (C) 2012 Fredrik Johansson
 ******************************************************************************/
 #include "gamma.h"
 int main()
 {
    long iter;
    flint_rand_t state;
    printf("rising_fmpcb_ui_delta....");
    fflush(stdout);
    flint_randinit(state);
    /* check functional equation */
    for (iter = 0; iter < 1000; iter++)
    {
        fmpcb_t x, xn, y, z;
        ulong n, m, step1, step2, step3;
        fmpcb_init(x);
        fmpcb_init(xn);
        fmpcb_init(y);
        fmpcb_init(z);
        fmpcb_randtest(x, state, 1 + n_randint(state, 4000), 10);
        n = n_randint(state, 80);
        m = n_randint(state, 40);
        fmpcb_add_ui(xn, x, n, 1 + n_randint(state, 4000));
        step1 = n_randint(state, 20);
        step2 = n_randint(state, 20);
        step3 = n_randint(state, 20);
        gamma_rising_fmpcb_ui_delta(y, x, n, step1, 2 + n_randint(state, 4000));
        gamma_rising_fmpcb_ui_delta(z, xn, m, step2, 2 + n_randint(state, 4000));
        fmpcb_mul(y, y, z, 2 + n_randint(state, 4000));
        gamma_rising_fmpcb_ui_delta(z, x, n + m, step3, 2 + n_randint(state, 4000));
        if (!fmpcb_overlaps(y, z))
        {
            printf("FAIL: overlap\n\n");
            printf("n = %lu\n", n);
            printf("m = %lu\n", m);
            printf("x = "); fmpcb_print(x); printf("\n\n");
            printf("xn = "); fmpcb_print(xn); printf("\n\n");
            printf("y = "); fmpcb_print(y); printf("\n\n");
            printf("z = "); fmpcb_print(z); printf("\n\n");
            abort();
        }
        fmpcb_clear(x);
        fmpcb_clear(xn);
        fmpcb_clear(y);
        fmpcb_clear(z);
    }
    /* aliasing of y and x */
    for (iter = 0; iter < 1000; iter++)
    {
        fmpcb_t x, y;
        ulong n;
        long prec;
        ulong step;
        fmpcb_init(x);
        fmpcb_init(y);
        fmpcb_randtest(x, state, 1 + n_randint(state, 200), 10);
        fmpcb_randtest(y, state, 1 + n_randint(state, 200), 10);
        n = n_randint(state, 100);
        step = n_randint(state, 20);
        prec = 2 + n_randint(state, 4000);
        gamma_rising_fmpcb_ui_delta(y, x, n, step, prec);
        gamma_rising_fmpcb_ui_delta(x, x, n, step, prec);
        if (!fmpcb_equal(x, y))
        {
            printf("FAIL: aliasing\n\n");
            printf("x = "); fmpcb_print(x); printf("\n\n");
            printf("y = "); fmpcb_print(y); printf("\n\n");
            printf("n = %lu\n", n);
            abort();
        }
        fmpcb_clear(x);
        fmpcb_clear(y);
    }
    flint_randclear(state);
    _fmpz_cleanup();
    printf("PASS\n");
    return EXIT_SUCCESS;
 }
--- a/gamma/test/t-rising_fmprb_ui_delta.c
+++ b/gamma/test/t-rising_fmprb_ui_delta.c
@ -0,0 +1,126 @@
 /*=============================================================================
    This file is part of fmprb.
    fmprb is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    fmprb is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License
    along with fmprb; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 =============================================================================*/
 /******************************************************************************
    Copyright (C) 2012 Fredrik Johansson
 ******************************************************************************/
 #include "gamma.h"
 int main()
 {
    long iter;
    flint_rand_t state;
    printf("rising_fmprb_ui_delta....");
    fflush(stdout);
    flint_randinit(state);
    /* compare with fmpq */
    for (iter = 0; iter < 1000; iter++)
    {
        fmprb_t a, b;
        fmpq_t x, y, z;
        ulong n, step;
        long i;
        fmprb_init(a);
        fmprb_init(b);
        fmpq_init(x);
        fmpq_init(y);
        fmpq_init(z);
        fmprb_randtest(a, state, 1 + n_randint(state, 1000), 10);
        fmprb_randtest(b, state, 1 + n_randint(state, 1000), 10);
        n = n_randint(state, 80);
        fmprb_get_rand_fmpq(x, state, a, 1 + n_randint(state, 1000));
        step = n_randint(state, 20);
        gamma_rising_fmprb_ui_delta(b, a, n, step, 2 + n_randint(state, 1000));
        fmpq_one(y);
        for (i = 0; i < n; i++)
        {
            fmpq_set_si(z, i, 1);
            fmpq_add(z, x, z);
            fmpq_mul(y, y, z);
        }
        if (!fmprb_contains_fmpq(b, y))
        {
            printf("FAIL: containment\n\n");
            printf("n = %lu\n", n);
            printf("a = "); fmprb_print(a); printf("\n\n");
            printf("x = "); fmpq_print(x); printf("\n\n");
            printf("b = "); fmprb_print(b); printf("\n\n");
            printf("y = "); fmpq_print(y); printf("\n\n");
            abort();
        }
        fmprb_clear(a);
        fmprb_clear(b);
        fmpq_clear(x);
        fmpq_clear(y);
        fmpq_clear(z);
    }
    /* aliasing of y and x */
    for (iter = 0; iter < 1000; iter++)
    {
        fmprb_t x, y;
        ulong n, step;
        long prec;
        fmprb_init(x);
        fmprb_init(y);
        fmprb_randtest(x, state, 1 + n_randint(state, 200), 10);
        fmprb_randtest(y, state, 1 + n_randint(state, 200), 10);
        n = n_randint(state, 100);
        prec = 2 + n_randint(state, 1000);
        step = n_randint(state, 20);
        gamma_rising_fmprb_ui_delta(y, x, n, step, prec);
        gamma_rising_fmprb_ui_delta(x, x, n, step, prec);
        if (!fmprb_equal(x, y))
        {
            printf("FAIL: aliasing\n\n");
            printf("x = "); fmprb_print(x); printf("\n\n");
            printf("y = "); fmprb_print(y); printf("\n\n");
            printf("n = %lu\n", n);
            abort();
        }
        fmprb_clear(x);
        fmprb_clear(y);
    }
    flint_randclear(state);
    _fmpz_cleanup();
    printf("PASS\n");
    return EXIT_SUCCESS;
 }