add space-efficient powsum_smooth version

2025-03-04 17:01:40 -05:00 · 2016-10-22 00:26:10 +02:00 · 2016-10-22 00:26:10 +02:00 · 1b92a14213
commit 1b92a14213
parent c4af23b1c5
5 changed files with 297 additions and 2 deletions
--- a/acb_dirichlet.h
+++ b/acb_dirichlet.h
@ -31,6 +31,7 @@ void acb_dirichlet_powsum_term(acb_ptr res, arb_t log_prev, ulong * prev,
    const acb_t s, ulong k, int integer, int critical_line, slong len, slong prec);

 void acb_dirichlet_powsum_sieved(acb_ptr z, const acb_t s, ulong n, slong len, slong prec);
+void acb_dirichlet_powsum_smooth(acb_ptr z, const acb_t s, ulong n, slong len, slong prec);

 typedef struct
 {
--- a/acb_dirichlet/powsum_smooth.c
+++ b/acb_dirichlet/powsum_smooth.c
@ -0,0 +1,202 @@
+/*
+    Copyright (C) 2016 Fredrik Johansson
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "acb_dirichlet.h"
+#include "acb_poly.h"
+
+/* bound number of 5-smooth numbers up to n; see http://oeis.org/A188425 */
+/* this does not need to be tight */
+static slong
+smooth_bound(ulong n)
+{
+    if (n <= 256) return 52;
+    if (n <= 65536) return 284;
+    if (n <= 16777216) return 836;
+    return 13283;  /* ok up to 2^64 */
+}
+
+static ulong smul(ulong x, ulong y)
+{
+    ulong hi, lo;
+    umul_ppmm(hi, lo, x, y);
+    if (hi)
+        return UWORD_MAX;
+    else
+        return lo;
+}
+
+static slong index(const ulong * v, ulong c)
+{
+    slong i;
+    for (i = 0; ; i++)
+        if (v[i] == c)
+            return i;
+}
+
+void
+acb_dirichlet_powsum_smooth(acb_ptr res, const acb_t s, ulong N, slong d, slong prec)
+{
+    ulong * smooth;     /* numbers 2^i 3^j 5^k <= N */
+    slong num_smooth;   /* the number of such numbers */
+    acb_ptr sums;       /* partial sum for each smooth prefactor */
+    acb_ptr powers;     /* (3^j 5^k)^(-s) */
+    acb_ptr t;          /* temporary */
+    arb_t log_n;
+    ulong x2, x3, x5, m, n, nprev;
+    slong i2, i3, i5, i, j, iu;
+    int critical_line, integer;
+
+    if (N <= 1)
+    {
+        acb_set_ui(res, N);
+        _acb_vec_zero(res + 1, d - 1);
+        return;
+    }
+
+    if (N >= UWORD_MAX - 2)
+        abort();
+
+    critical_line = arb_is_exact(acb_realref(s)) &&
+        (arf_cmp_2exp_si(arb_midref(acb_realref(s)), -1) == 0);
+
+    integer = arb_is_zero(acb_imagref(s)) && arb_is_int(acb_realref(s));
+
+    /* generate the smooth numbers */
+    smooth = flint_malloc(smooth_bound(N) * sizeof(ulong));
+    smooth[0] = 1;
+    num_smooth = 1;
+    x2 = 2; x3 = 3; x5 = 5;
+    i2 = i3 = i5 = 0;
+
+    while ((m = FLINT_MIN(FLINT_MIN(x2, x3), x5)) <= N)
+    {
+        smooth[num_smooth++] = m;
+        if (m == x2) x2 = smul(2, smooth[++i2]);
+        if (m == x3) x3 = smul(3, smooth[++i3]);
+        if (m == x5) x5 = smul(5, smooth[++i5]);
+    }
+
+    sums = _acb_vec_init(num_smooth * d);
+    powers = _acb_vec_init(num_smooth * d);
+    t = _acb_vec_init(d);
+    arb_init(log_n);
+
+    arb_zero(log_n);
+    nprev = 1;
+
+    /* add 1^-s */
+    for (i = 0; i < num_smooth; i++)
+        acb_one(sums + i * d);
+
+    /* compute all the non-smooth index terms (bulk of the work) */
+    for (n = 7; n <= N; n += 2)
+    {
+        if ((n % 3 != 0) && (n % 5 != 0))
+        {
+            acb_dirichlet_powsum_term(t, log_n, &nprev, s, n, integer, critical_line, d, prec);
+            _acb_vec_add(sums, sums, t, d, prec);
+
+            for (i = 1; i < num_smooth && (smooth[i] <= (N / n)); i++)
+                _acb_vec_add(sums + i * d, sums + i * d, t, d, prec);
+        }
+    }
+
+    /* compute 2^(-s) and powers (3^j 3^k)^(-s) */
+    arb_zero(log_n);
+    nprev = 1;
+
+    for (i = 1; i < num_smooth; i++)
+    {
+        n = smooth[i];
+
+        if (n == 2)
+        {
+            acb_dirichlet_powsum_term(powers + i * d, log_n, &nprev, s,
+                n, integer, critical_line, d, prec);
+        }
+        else if (n % 2 != 0)
+        {
+            if (n <= 5)
+            {
+                acb_dirichlet_powsum_term(powers + i * d, log_n, &nprev, s,
+                    n, integer, critical_line, d, prec);
+            }
+            else if (n % 3 == 0)
+            {
+                i3 = index(smooth, 3);
+                iu = index(smooth, n / 3);
+                _acb_poly_mullow(powers + i * d,
+                    powers + i3 * d, d, powers + iu * d, d, d, prec);
+            }
+            else
+            {
+                i5 = index(smooth, 5);
+                iu = index(smooth, n / 5);
+                _acb_poly_mullow(powers + i * d,
+                    powers + i5 * d, d, powers + iu * d, d, d, prec);
+            }
+        }
+    }
+
+    /* merge the sums into the power-of-two sums */
+    for (i = 0; i < num_smooth; i++)
+    {
+        ulong u, v;
+
+        m = smooth[i];
+        u = m;
+        v = 0;
+
+        while ((u & 1) == 0)
+        {
+            u >>= 1;
+            v++;
+        }
+
+        if ((UWORD(1) << v) != m)
+        {
+            j = index(smooth, UWORD(1) << v);
+            iu = index(smooth, u);
+
+            if (u == 1)
+            {
+                _acb_vec_add(sums + j * d, sums + j * d, sums + i * d, d, prec);
+            }
+            else
+            {
+                _acb_poly_mullow(t, sums + i * d, d, powers + iu * d, d, d, prec);
+                _acb_vec_add(sums + j * d, sums + j * d, t, d, prec);
+            }
+        }
+    }
+
+    /* finally evaluate with respect to powers of 2 using horner */
+    _acb_vec_zero(res, d);
+    i2 = index(smooth, 2);
+
+    for (i = num_smooth - 1; i >= 0; i--)
+    {
+        n = smooth[i];
+
+        if ((n & (n - 1)) == 0)
+        {
+            _acb_poly_mullow(t, powers + i2 * d, d, res, d, d, prec);
+            _acb_vec_add(res, sums + i * d, t, d, prec);
+        }
+    }
+
+    _acb_vec_clear(sums, num_smooth * d);
+    _acb_vec_clear(powers, num_smooth * d);
+    _acb_vec_clear(t, d);
+    arb_clear(log_n);
+    flint_free(smooth);
+}
+
--- a/acb_dirichlet/test/t-powsum_smooth.c
+++ b/acb_dirichlet/test/t-powsum_smooth.c
@ -0,0 +1,77 @@
+/*
+    Copyright (C) 2013 Fredrik Johansson
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "acb_dirichlet.h"
+#include "acb_poly.h"
+
+int main()
+{
+    slong iter;
+    flint_rand_t state;
+
+    flint_printf("powsum_smooth....");
+    fflush(stdout);
+
+    flint_randinit(state);
+
+    for (iter = 0; iter < 2000 * arb_test_multiplier(); iter++)
+    {
+        acb_t s;
+        acb_ptr z1, z2;
+        slong i, n, len, prec;
+
+        acb_init(s);
+
+        if (n_randint(state, 2))
+        {
+            acb_randtest(s, state, 1 + n_randint(state, 200), 3);
+        }
+        else
+        {
+            arb_set_ui(acb_realref(s), 1);
+            arb_mul_2exp_si(acb_realref(s), acb_realref(s), -1);
+            arb_randtest(acb_imagref(s), state, 1 + n_randint(state, 200), 4);
+        }
+
+        prec = 2 + n_randint(state, 200);
+        n = n_randtest(state) % 500;
+        len = 1 + n_randint(state, 4);
+
+        z1 = _acb_vec_init(len);
+        z2 = _acb_vec_init(len);
+
+        acb_dirichlet_powsum_sieved(z1, s, n, len, prec);
+        acb_dirichlet_powsum_smooth(z2, s, n, len, prec);
+
+        for (i = 0; i < len; i++)
+        {
+            if (!acb_overlaps(z1 + i, z2 + i))
+            {
+                flint_printf("FAIL: overlap\n\n");
+                flint_printf("iter = %wd\n", iter);
+                flint_printf("n = %wd, prec = %wd, len = %wd, i = %wd\n\n", n, prec, len, i);
+                flint_printf("s = "); acb_printd(s, prec / 3.33); flint_printf("\n\n");
+                flint_printf("z1 = "); acb_printd(z1 + i, prec / 3.33); flint_printf("\n\n");
+                flint_printf("z2 = "); acb_printd(z2 + i, prec / 3.33); flint_printf("\n\n");
+                abort();
+            }
+        }
+
+        acb_clear(s);
+        _acb_vec_clear(z1, len);
+        _acb_vec_clear(z2, len);
+    }
+
+    flint_randclear(state);
+    flint_cleanup();
+    flint_printf("PASS\n");
+    return EXIT_SUCCESS;
+}
--- a/acb_poly/zeta_em_sum.c
+++ b/acb_poly/zeta_em_sum.c
@ -10,6 +10,7 @@
 */

 #include "acb_poly.h"
+#include "acb_dirichlet.h"

 /* res = src * (c + x) */
 void _acb_poly_mullow_cpx(acb_ptr res, acb_srcptr src, slong len, const acb_t c, slong trunc, slong prec)
@ -50,8 +51,10 @@ _acb_poly_zeta_em_sum(acb_ptr z, const acb_t s, const acb_t a, int deflate, ulon
    acb_one(one);

    /* sum 1/(k+a)^(s+x) */
-    if (acb_is_one(a) && d <= 3 && _acb_vec_estimate_allocated_bytes(d * N / 6, prec) < SIEVE_ALLOC_LIMIT)
-        _acb_poly_powsum_one_series_sieved(sum, s, N, d, prec);
+    if (acb_is_one(a) && d <= 2 && _acb_vec_estimate_allocated_bytes(d * N / 6, prec) < SIEVE_ALLOC_LIMIT)
+        acb_dirichlet_powsum_sieved(sum, s, N, d, prec);
+    else if (acb_is_one(a) && d <= 4) /* todo: also better for slightly larger d, if N and prec large enough */
+        acb_dirichlet_powsum_smooth(sum, s, N, d, prec);
    else if (N > 50 && flint_get_num_threads() > 1)
        _acb_poly_powsum_series_naive_threaded(sum, s, a, one, N, d, prec);
    else
--- a/doc/source/acb_dirichlet.rst
+++ b/doc/source/acb_dirichlet.rst
@ -51,6 +51,18 @@ Truncated L-series and power sums
    power series multiplications, it is only faster than the naive
    algorithm when *len* is small.

+.. function:: void acb_dirichlet_powsum_smooth(acb_ptr res, const acb_t s, ulong n, slong len, slong prec)
+
+    Sets *res* to `\sum_{k=1}^n k^{-(s+x)}`
+    as a power series in *x* truncated to length *len*.
+    This function performs partial sieving by adding multiples of 5-smooth *k*
+    into separate buckets. Asymptotically, this requires computing 4/15
+    of the powers, which is slower than *sieved*, but only requires
+    logarithmic extra space. It is also faster for large *len*, since most
+    power series multiplications are traded for additions.
+    A slightly bigger gain for larger *n* could be achieved by using more
+    small prime factors, at the expense of space.
+
 Hurwitz zeta function
 -------------------------------------------------------------------------------