implement acb_dot and use in the acb_poly module

2025-03-05 09:21:38 -05:00 · 2018-08-26 19:59:30 +02:00 · 2018-08-26 19:59:30 +02:00 · ffa60bf319
commit ffa60bf319
parent 7739f884de
19 changed files with 1528 additions and 116 deletions
--- a/acb.h
+++ b/acb.h
@ -644,6 +644,14 @@ acb_submul_arb(acb_t z, const acb_t x, const arb_t y, slong prec)
    arb_submul(acb_imagref(z), acb_imagref(x), y, prec);
 }

+void acb_dot_simple(acb_t res, const acb_t initial, int subtract,
+    acb_srcptr x, slong xstep, acb_srcptr y, slong ystep, slong len, slong prec);
+void acb_dot_precise(acb_t res, const acb_t initial, int subtract,
+    acb_srcptr x, slong xstep, acb_srcptr y, slong ystep, slong len, slong prec);
+void acb_dot(acb_t res, const acb_t initial, int subtract,
+    acb_srcptr x, slong xstep, acb_srcptr y, slong ystep, slong len, slong prec);
+
+
 void acb_inv(acb_t z, const acb_t x, slong prec);

 void acb_div(acb_t z, const acb_t x, const acb_t y, slong prec);
--- a/acb/dot.c
+++ b/acb/dot.c
--- a/acb/dot_precise.c
+++ b/acb/dot_precise.c
@ -0,0 +1,50 @@
+/*
+    Copyright (C) 2018 Fredrik Johansson
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "acb.h"
+
+void
+acb_dot_precise(acb_t res, const acb_t initial, int subtract, acb_srcptr x, slong xstep,
+    acb_srcptr y, slong ystep, slong len, slong prec)
+{
+    arb_ptr tmp;
+    slong i;
+
+    tmp = flint_malloc(sizeof(arb_struct) * (4 * len));
+
+    for (i = 0; i < len; i++)
+    {
+        tmp[0 * len + i] = *acb_realref(x + i * xstep);
+        tmp[1 * len + i] = *acb_imagref(x + i * xstep);
+        tmp[2 * len + i] = *acb_realref(y + i * ystep);
+        arb_init(tmp + 3 * len + i);
+        arb_neg(tmp + 3 * len + i, acb_imagref(y + i * ystep));
+    }
+
+    arb_dot_precise(acb_realref(res), initial == NULL ? NULL : acb_realref(initial), subtract,
+        tmp, 1, tmp + 2 * len, 1, 2 * len, prec);
+
+    for (i = 0; i < len; i++)
+        arb_clear(tmp + 3 * len + i);
+
+    for (i = 0; i < len; i++)
+    {
+        tmp[0 * len + i] = *acb_realref(x + i * xstep);
+        tmp[1 * len + i] = *acb_imagref(x + i * xstep);
+        tmp[2 * len + i] = *acb_imagref(y + i * ystep);
+        tmp[3 * len + i] = *acb_realref(y + i * ystep);
+    }
+
+    arb_dot_precise(acb_imagref(res), initial == NULL ? NULL : acb_imagref(initial), subtract,
+        tmp, 1, tmp + 2 * len, 1, 2 * len, prec);
+
+    flint_free(tmp);
+}
--- a/acb/dot_simple.c
+++ b/acb/dot_simple.c
@ -0,0 +1,47 @@
+/*
+    Copyright (C) 2018 Fredrik Johansson
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "acb.h"
+
+void
+acb_dot_simple(acb_t res, const acb_t initial, int subtract,
+    acb_srcptr x, slong xstep, acb_srcptr y, slong ystep, slong len, slong prec)
+{
+    slong i;
+
+    if (len <= 0)
+    {
+        if (initial == NULL)
+            acb_zero(res);
+        else
+            acb_set_round(res, initial, prec);
+        return;
+    }
+
+    if (initial == NULL)
+    {
+        acb_mul(res, x, y, prec);
+    }
+    else
+    {
+        if (subtract)
+            acb_neg(res, initial);
+        else
+            acb_set(res, initial);
+        acb_addmul(res, x, y, prec);
+    }
+
+    for (i = 1; i < len; i++)
+        acb_addmul(res, x + i * xstep, y + i * ystep, prec);
+
+    if (subtract)
+        acb_neg(res, res);
+}
--- a/acb/test/t-dot.c
+++ b/acb/test/t-dot.c
@ -0,0 +1,200 @@
+/*
+    Copyright (C) 2018 Fredrik Johansson
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "acb.h"
+
+ARB_DLL extern slong acb_dot_gauss_dot_cutoff;
+
+int main()
+{
+    slong iter;
+    flint_rand_t state;
+
+    flint_printf("dot....");
+    fflush(stdout);
+
+    flint_randinit(state);
+
+    for (iter = 0; iter < 1000000 * arb_test_multiplier(); iter++)
+    {
+        acb_ptr x, y;
+        acb_t s1, s2, z;
+        slong i, len, prec, xbits, ybits, ebits;
+        int ok, initial, subtract, revx, revy;
+
+        if (n_randint(state, 100) == 0)
+            len = n_randint(state, 50);
+        else if (n_randint(state, 10) == 0)
+            len = n_randint(state, 5);
+        else
+            len = n_randint(state, 3);
+
+        acb_dot_gauss_dot_cutoff = 3 + n_randint(state, 30);
+
+        if (n_randint(state, 10) != 0 || len > 10)
+        {
+            prec = 2 + n_randint(state, 500);
+            xbits = 2 + n_randint(state, 500);
+            ybits = 2 + n_randint(state, 500);
+        }
+        else
+        {
+            prec = 2 + n_randint(state, 5000);
+            xbits = 2 + n_randint(state, 5000);
+            ybits = 2 + n_randint(state, 5000);
+        }
+
+        if (n_randint(state, 100) == 0)
+            ebits = 2 + n_randint(state, 100);
+        else
+            ebits = 2 + n_randint(state, 10);
+
+        initial = n_randint(state, 2);
+        subtract = n_randint(state, 2);
+        revx = n_randint(state, 2);
+        revy = n_randint(state, 2);
+
+        x = _acb_vec_init(len);
+        y = _acb_vec_init(len);
+        acb_init(s1);
+        acb_init(s2);
+        acb_init(z);
+
+        switch (n_randint(state, 3))
+        {
+            case 0:
+                for (i = 0; i < len; i++)
+                {
+                    acb_randtest(x + i, state, xbits, ebits);
+                    acb_randtest(y + i, state, ybits, ebits);
+                }
+                break;
+
+            /* Test with cancellation */
+            case 1:
+                for (i = 0; i < len; i++)
+                {
+                    if (i <= len / 2)
+                    {
+                        acb_randtest(x + i, state, xbits, ebits);
+                        acb_randtest(y + i, state, ybits, ebits);
+                    }
+                    else
+                    {
+                        acb_neg(x + i, x + len - i - 1);
+                        acb_set(y + i, y + len - i - 1);
+                    }
+                }
+                break;
+
+            default:
+                for (i = 0; i < len; i++)
+                {
+                    if (i <= len / 2)
+                    {
+                        acb_randtest(x + i, state, xbits, ebits);
+                        acb_randtest(y + i, state, ybits, ebits);
+                    }
+                    else
+                    {
+                        acb_neg_round(x + i, x + len - i - 1, 2 + n_randint(state, 500));
+                        acb_set_round(y + i, y + len - i - 1, 2 + n_randint(state, 500));
+                    }
+                }
+                break;
+        }
+
+        acb_randtest(s1, state, 200, 100);
+        acb_randtest(s2, state, 200, 100);
+        acb_randtest(z, state, xbits, ebits);
+
+        acb_dot(s1, initial ? z : NULL, subtract,
+            revx ? (x + len - 1) : x, revx ? -1 : 1,
+            revy ? (y + len - 1) : y, revy ? -1 : 1,
+            len, prec);
+
+        acb_dot_precise(s2, initial ? z : NULL, subtract,
+            revx ? (x + len - 1) : x, revx ? -1 : 1,
+            revy ? (y + len - 1) : y, revy ? -1 : 1,
+            len, ebits <= 12 ? ARF_PREC_EXACT : 2 * prec + 100);
+
+        if (ebits <= 12)
+            ok = acb_contains(s1, s2);
+        else
+            ok = acb_overlaps(s1, s2);
+
+        if (!ok)
+        {
+            flint_printf("FAIL\n\n");
+            flint_printf("iter = %wd, len = %wd, prec = %wd, ebits = %wd, subtract = %d\n\n", iter, len, prec, ebits, subtract);
+
+            if (initial)
+            {
+                flint_printf("z = ", i); acb_printn(z, 100, ARB_STR_MORE); flint_printf(" (%wd)\n\n", acb_bits(z));
+            }
+
+            for (i = 0; i < len; i++)
+            {
+                flint_printf("x[%wd] = ", i); acb_printn(x + i, 100, ARB_STR_MORE); flint_printf(" (%wd)\n", acb_bits(x + i));
+                flint_printf("y[%wd] = ", i); acb_printn(y + i, 100, ARB_STR_MORE); flint_printf(" (%wd)\n", acb_bits(y + i));
+            }
+            flint_printf("\n\n");
+            flint_printf("s1 = "); acb_printn(s1, 100, ARB_STR_MORE); flint_printf("\n\n");
+            flint_printf("s2 = "); acb_printn(s2, 100, ARB_STR_MORE); flint_printf("\n\n");
+            flint_abort();
+        }
+
+        /* With the fast algorithm, we expect identical results when
+           reversing the vectors. */
+        if (ebits <= 12)
+        {
+            revx ^= 1;
+            revy ^= 1;
+
+            acb_dot(s2, initial ? z : NULL, subtract,
+                revx ? (x + len - 1) : x, revx ? -1 : 1,
+                revy ? (y + len - 1) : y, revy ? -1 : 1,
+                len, prec);
+
+            if (!acb_equal(s1, s2))
+            {
+                flint_printf("FAIL (reversal)\n\n");
+                flint_printf("iter = %wd, len = %wd, prec = %wd, ebits = %wd\n\n", iter, len, prec, ebits);
+
+                if (initial)
+                {
+                    flint_printf("z = ", i); acb_printn(z, 100, ARB_STR_MORE); flint_printf(" (%wd)\n\n", acb_bits(z));
+                }
+
+                for (i = 0; i < len; i++)
+                {
+                    flint_printf("x[%wd] = ", i); acb_printn(x + i, 100, ARB_STR_MORE); flint_printf(" (%wd)\n", acb_bits(x + i));
+                    flint_printf("y[%wd] = ", i); acb_printn(y + i, 100, ARB_STR_MORE); flint_printf(" (%wd)\n", acb_bits(y + i));
+                }
+                flint_printf("\n\n");
+                flint_printf("s1 = "); acb_printn(s1, 100, ARB_STR_MORE); flint_printf("\n\n");
+                flint_printf("s2 = "); acb_printn(s2, 100, ARB_STR_MORE); flint_printf("\n\n");
+                flint_abort();
+            }
+        }
+
+        acb_clear(s1);
+        acb_clear(s2);
+        acb_clear(z);
+        _acb_vec_clear(x, len);
+        _acb_vec_clear(y, len);
+    }
+
+    flint_randclear(state);
+    flint_cleanup();
+    flint_printf("PASS\n");
+    return EXIT_SUCCESS;
+}
--- a/acb_poly/div_series.c
+++ b/acb_poly/div_series.c
@ -44,7 +44,7 @@ _acb_poly_div_series(acb_ptr Q, acb_srcptr A, slong Alen,
    {
        /* The basecase algorithm is faster for much larger Blen and n than
           this, but unfortunately has worse numerical stability. */
-        slong i, j;
+        slong i;
        acb_t q;

        acb_init(q);
@ -54,16 +54,8 @@ _acb_poly_div_series(acb_ptr Q, acb_srcptr A, slong Alen,

        for (i = 1; i < n; i++)
        {
-            acb_mul(Q + i, B + 1, Q + i - 1, prec);
-
-            for (j = 2; j < FLINT_MIN(i + 1, Blen); j++)
-                acb_addmul(Q + i, B + j, Q + i - j, prec);
-
-            if (i < Alen)
-                acb_sub(Q + i, A + i, Q + i, prec);
-            else
-                acb_neg(Q + i, Q + i);
-
+            acb_dot(Q + i, (i < Alen) ? A + i : NULL, 1,
+                B + 1, 1, Q + i - 1, -1, FLINT_MIN(i, Blen - 1), prec);
            if (!acb_is_one(q))
                acb_mul(Q + i, Q + i, q, prec);
        }
--- a/acb_poly/exp_series.c
+++ b/acb_poly/exp_series.c
@ -12,7 +12,7 @@
 #include "acb_poly.h"

 /* allow changing this from the test code */
-ARB_DLL slong acb_poly_newton_exp_cutoff = 120;
+ARB_DLL slong acb_poly_newton_exp_cutoff = 0;

 /* with inverse=1 simultaneously computes g = exp(-x) to length n
 with inverse=0 uses g as scratch space, computing
@ -112,39 +112,51 @@ _acb_poly_exp_series(acb_ptr f, acb_srcptr h, slong hlen, slong n, slong prec)
        _acb_vec_zero(f + j - d + 1, n - (j - d + 1));
        acb_clear(t);
    }
-    else if (hlen <= acb_poly_newton_exp_cutoff)
-    {
-        _acb_poly_exp_series_basecase(f, h, hlen, n, prec);
-    }
    else
    {
-        acb_ptr g, t;
-        acb_t u;
-        int fix;
+        slong cutoff;

-        g = _acb_vec_init((n + 1) / 2);
-        fix = (hlen < n || h == f || !acb_is_zero(h));
+        if (acb_poly_newton_exp_cutoff != 0)
+            cutoff = acb_poly_newton_exp_cutoff;
+        else if (prec <= 256)
+            cutoff = 750;
+        else
+            cutoff = 1e5 / pow(log(prec), 3);

-        if (fix)
+        if (hlen <= cutoff)
        {
-            t = _acb_vec_init(n);
-            _acb_vec_set(t + 1, h + 1, hlen - 1);
+            _acb_poly_exp_series_basecase(f, h, hlen, n, prec);
        }
        else
-            t = (acb_ptr) h;
+        {
+            acb_ptr g, t;
+            acb_t u;
+            int fix;

-        acb_init(u);
-        acb_exp(u, h, prec);
+            g = _acb_vec_init((n + 1) / 2);
+            fix = (hlen < n || h == f || !acb_is_zero(h));

-        _acb_poly_exp_series_newton(f, g, t, n, prec, 0, acb_poly_newton_exp_cutoff);
+            if (fix)
+            {
+                t = _acb_vec_init(n);
+                _acb_vec_set(t + 1, h + 1, hlen - 1);
+            }
+            else
+                t = (acb_ptr) h;

-        if (!acb_is_one(u))
-            _acb_vec_scalar_mul(f, f, n, u, prec);
+            acb_init(u);
+            acb_exp(u, h, prec);

-        _acb_vec_clear(g, (n + 1) / 2);
-        if (fix)
-            _acb_vec_clear(t, n);
-        acb_clear(u);
+            _acb_poly_exp_series_newton(f, g, t, n, prec, 0, cutoff);
+
+            if (!acb_is_one(u))
+                _acb_vec_scalar_mul(f, f, n, u, prec);
+
+            _acb_vec_clear(g, (n + 1) / 2);
+            if (fix)
+                _acb_vec_clear(t, n);
+            acb_clear(u);
+        }
    }
 }

--- a/acb_poly/exp_series_basecase.c
+++ b/acb_poly/exp_series_basecase.c
@ -11,13 +11,11 @@

 #include "acb_poly.h"

-#define MUL_CUTOFF 24
-
 static void
 _acb_poly_exp_series_basecase_rec(acb_ptr f, acb_ptr a,
        acb_srcptr h, slong hlen, slong n, slong prec)
 {
-    slong j, k;
+    slong k;

    acb_t s;
    acb_init(s);
@ -29,10 +27,7 @@ _acb_poly_exp_series_basecase_rec(acb_ptr f, acb_ptr a,

    for (k = 1; k < n; k++)
    {
-        acb_zero(s);
-        for (j = 1; j < FLINT_MIN(k + 1, hlen); j++)
-            acb_addmul(s, a + j, f + k - j, prec);
-
+        acb_dot(s, NULL, 0, a + 1, 1, f + k - 1, -1, FLINT_MIN(k, hlen - 1), prec);
        acb_div_ui(f + k, s, k, prec);
    }

@ -45,7 +40,7 @@ _acb_poly_exp_series_basecase(acb_ptr f,
 {
    hlen = FLINT_MIN(n, hlen);

-    if (n < MUL_CUTOFF || hlen < 0.9 * n)
+    if (n < 20 || hlen < 0.9 * n || prec <= 2 * FLINT_BITS || n < 1000.0 / log(prec + 10) - 70)
    {
        acb_ptr t = _acb_vec_init(hlen);
        _acb_poly_exp_series_basecase_rec(f, t, h, hlen, n, prec);
--- a/acb_poly/inv_series.c
+++ b/acb_poly/inv_series.c
@ -37,7 +37,7 @@ _acb_poly_inv_series(acb_ptr Qinv,
    }
    else
    {
-        slong i, j, blen;
+        slong i, blen;

        /* The basecase algorithm is faster for much larger Qlen or len than
           this, but unfortunately also much less numerically stable. */
@ -48,15 +48,10 @@ _acb_poly_inv_series(acb_ptr Qinv,

        for (i = 1; i < blen; i++)
        {
-            acb_mul(Qinv + i, Q + 1, Qinv + i - 1, prec);
-
-            for (j = 2; j < FLINT_MIN(i + 1, Qlen); j++)
-                acb_addmul(Qinv + i, Q + j, Qinv + i - j, prec);
-
+            acb_dot(Qinv + i, NULL, 1,
+                Q + 1, 1, Qinv + i - 1, -1, FLINT_MIN(i, Qlen - 1), prec);
            if (!acb_is_one(Qinv))
                acb_mul(Qinv + i, Qinv + i, Qinv, prec);
-
-            acb_neg(Qinv + i, Qinv + i);
        }

        if (len > blen)
@ -116,4 +111,3 @@ acb_poly_inv_series(acb_poly_t Qinv, const acb_poly_t Q, slong n, slong prec)
    _acb_poly_set_length(Qinv, n);
    _acb_poly_normalise(Qinv);
 }
-
--- a/acb_poly/mullow.c
+++ b/acb_poly/mullow.c
@ -11,17 +11,46 @@

 #include "acb_poly.h"

-#define CUTOFF 4
-
 void
 _acb_poly_mullow(acb_ptr res,
    acb_srcptr poly1, slong len1,
    acb_srcptr poly2, slong len2, slong n, slong prec)
 {
-    if (n < CUTOFF || len1 < CUTOFF || len2 < CUTOFF)
+    if (n == 1)
+    {
+        acb_mul(res, poly1, poly2, prec);
+    }
+    else if (n <= 7 || len1 <= 7 || len2 <= 7)
+    {
        _acb_poly_mullow_classical(res, poly1, len1, poly2, len2, n, prec);
+    }
    else
-        _acb_poly_mullow_transpose(res, poly1, len1, poly2, len2, n, prec);
+    {
+        slong cutoff;
+        double p;
+
+        if (prec <= 2 * FLINT_BITS)
+        {
+            cutoff = 110;
+        }
+        else
+        {
+            p = log(prec);
+
+            cutoff = 10000.0 / (p * p * p);
+            cutoff = FLINT_MIN(cutoff, 60);
+            if (poly1 == poly2 && prec >= 256)
+                cutoff *= 1.25;
+            if (poly1 == poly2 && prec >= 4096)
+                cutoff *= 1.25;
+            cutoff = FLINT_MAX(cutoff, 8);
+        }
+
+        if (2 * FLINT_MIN(len1, len2) <= cutoff || n <= cutoff)
+            _acb_poly_mullow_classical(res, poly1, len1, poly2, len2, n, prec);
+        else
+            _acb_poly_mullow_transpose(res, poly1, len1, poly2, len2, n, prec);
+    }
 }

 void
--- a/acb_poly/mullow_classical.c
+++ b/acb_poly/mullow_classical.c
@ -26,35 +26,55 @@ _acb_poly_mullow_classical(acb_ptr res,
    }
    else if (poly1 == poly2 && len1 == len2)
    {
-        slong i;
+        slong i, start, stop;

-        _acb_vec_scalar_mul(res, poly1, FLINT_MIN(len1, n), poly1, prec);
-        _acb_vec_scalar_mul(res + len1, poly1 + 1, n - len1, poly1 + len1 - 1, prec);
+        acb_sqr(res, poly1, prec);
+        acb_mul(res + 1, poly1, poly1 + 1, prec);
+        acb_mul_2exp_si(res + 1, res + 1, 1);

-        for (i = 1; i < len1 - 1; i++)
-            _acb_vec_scalar_addmul(res + i + 1, poly1 + 1,
-                FLINT_MIN(i - 1, n - (i + 1)), poly1 + i, prec);
+        for (i = 2; i < FLINT_MIN(n, 2 * len1 - 3); i++)
+        {
+            start = FLINT_MAX(0, i - len1 + 1);
+            stop = FLINT_MIN(len1 - 1, (i + 1) / 2 - 1);

-        for (i = 1; i < FLINT_MIN(2 * len1 - 2, n); i++)
+            acb_dot(res + i, NULL, 0, poly1 + start, 1,
+                poly1 + i - start, -1, stop - start + 1, prec);
            acb_mul_2exp_si(res + i, res + i, 1);
+            if (i % 2 == 0 && i / 2 < len1)
+                acb_addmul(res + i, poly1 + i / 2, poly1 + i / 2, prec);
+        }

-        for (i = 1; i < FLINT_MIN(len1 - 1, (n + 1) / 2); i++)
-            acb_addmul(res + 2 * i, poly1 + i, poly1 + i, prec);
+        if (len1 > 2 && n >= 2 * len1 - 2)
+        {
+            acb_mul(res + 2 * len1 - 3, poly1 + len1 - 1, poly1 + len1 - 2, prec);
+            acb_mul_2exp_si(res + 2 * len1 - 3, res + 2 * len1 - 3, 1);
+        }
+
+        if (n >= 2 * len1 - 1)
+            acb_sqr(res + 2 * len1 - 2, poly1 + len1 - 1, prec);
+    }
+    else if (len1 == 1)
+    {
+        _acb_vec_scalar_mul(res, poly2, n, poly1, prec);
+    }
+    else if (len2 == 1)
+    {
+        _acb_vec_scalar_mul(res, poly1, n, poly2, prec);
    }
    else
    {
-        slong i;
+        slong i, top1, top2;

-        _acb_vec_scalar_mul(res, poly1, FLINT_MIN(len1, n), poly2, prec);
+        acb_mul(res, poly1, poly2, prec);

-        if (n > len1)
-            _acb_vec_scalar_mul(res + len1, poly2 + 1, n - len1,
-                                      poly1 + len1 - 1, prec);
+        for (i = 1; i < n; i++)
+        {
+            top1 = FLINT_MIN(len1 - 1, i);
+            top2 = FLINT_MIN(len2 - 1, i);

-        for (i = 0; i < FLINT_MIN(len1, n) - 1; i++)
-            _acb_vec_scalar_addmul(res + i + 1, poly2 + 1,
-                                         FLINT_MIN(len2, n - i) - 1,
-                                         poly1 + i, prec);
+            acb_dot(res + i, NULL, 0, poly1 + i - top2, 1,
+                poly2 + top2, -1, top1 + top2 - i + 1, prec);
+        }
    }
 }

@ -94,4 +114,3 @@ acb_poly_mullow_classical(acb_poly_t res, const acb_poly_t poly1,
    _acb_poly_set_length(res, n);
    _acb_poly_normalise(res);
 }
-
--- a/acb_poly/sin_cos_pi_series.c
+++ b/acb_poly/sin_cos_pi_series.c
@ -11,10 +11,8 @@

 #include "acb_poly.h"

-#define TANGENT_CUTOFF 80
-
 void
-_acb_poly_sin_cos_pi_series(acb_ptr s, acb_ptr c, const acb_srcptr h, slong hlen, slong n, slong prec)
+_acb_poly_sin_cos_pi_series(acb_ptr s, acb_ptr c, acb_srcptr h, slong hlen, slong n, slong prec)
 {
    hlen = FLINT_MIN(hlen, n);

@ -36,10 +34,25 @@ _acb_poly_sin_cos_pi_series(acb_ptr s, acb_ptr c, const acb_srcptr h, slong hlen
        acb_mul(c + 1, s, t, prec);
        acb_clear(t);
    }
-    else if (hlen < TANGENT_CUTOFF)
-        _acb_poly_sin_cos_series_basecase(s, c, h, hlen, n, prec, 1);
    else
-        _acb_poly_sin_cos_series_tangent(s, c, h, hlen, n, prec, 1);
+    {
+        slong cutoff;
+
+        if (prec <= 128)
+        {
+            cutoff = 1400;
+        }
+        else
+        {
+            cutoff = 100000 / pow(log(prec), 3);
+            cutoff = FLINT_MIN(cutoff, 700);
+        }
+
+        if (hlen < cutoff)
+            _acb_poly_sin_cos_series_basecase(s, c, h, hlen, n, prec, 1);
+        else
+            _acb_poly_sin_cos_series_tangent(s, c, h, hlen, n, prec, 1);
+    }
 }

 void
@ -73,4 +86,3 @@ acb_poly_sin_cos_pi_series(acb_poly_t s, acb_poly_t c,
    _acb_poly_set_length(c, n);
    _acb_poly_normalise(c);
 }
-
--- a/acb_poly/sin_cos_series.c
+++ b/acb_poly/sin_cos_series.c
@ -11,10 +11,8 @@

 #include "acb_poly.h"

-#define TANGENT_CUTOFF 80
-
 void
-_acb_poly_sin_cos_series(acb_ptr s, acb_ptr c, const acb_srcptr h, slong hlen, slong n, slong prec)
+_acb_poly_sin_cos_series(acb_ptr s, acb_ptr c, acb_srcptr h, slong hlen, slong n, slong prec)
 {
    hlen = FLINT_MIN(hlen, n);

@ -35,10 +33,25 @@ _acb_poly_sin_cos_series(acb_ptr s, acb_ptr c, const acb_srcptr h, slong hlen, s
        acb_mul(c + 1, s, t, prec);
        acb_clear(t);
    }
-    else if (hlen < TANGENT_CUTOFF)
-        _acb_poly_sin_cos_series_basecase(s, c, h, hlen, n, prec, 0);
    else
-        _acb_poly_sin_cos_series_tangent(s, c, h, hlen, n, prec, 0);
+    {
+        slong cutoff;
+
+        if (prec <= 128)
+        {
+            cutoff = 1400;
+        }
+        else
+        {
+            cutoff = 100000 / pow(log(prec), 3);
+            cutoff = FLINT_MIN(cutoff, 700);
+        }
+
+        if (hlen < cutoff)
+            _acb_poly_sin_cos_series_basecase(s, c, h, hlen, n, prec, 0);
+        else
+            _acb_poly_sin_cos_series_tangent(s, c, h, hlen, n, prec, 0);
+    }
 }

 void
--- a/acb_poly/sin_cos_series_basecase.c
+++ b/acb_poly/sin_cos_series_basecase.c
@ -15,7 +15,7 @@ void
 _acb_poly_sin_cos_series_basecase(acb_ptr s, acb_ptr c, acb_srcptr h, slong hlen,
        slong n, slong prec, int times_pi)
 {
-    slong j, k, alen = FLINT_MIN(n, hlen);
+    slong k, alen = FLINT_MIN(n, hlen);
    acb_ptr a;
    acb_t t, u;

@ -46,15 +46,8 @@ _acb_poly_sin_cos_series_basecase(acb_ptr s, acb_ptr c, acb_srcptr h, slong hlen

    for (k = 1; k < n; k++)
    {
-        acb_zero(t);
-        acb_zero(u);
-
-        for (j = 1; j < FLINT_MIN(k + 1, hlen); j++)
-        {
-            acb_submul(t, a + j, s + k - j, prec);
-            acb_addmul(u, a + j, c + k - j, prec);
-        }
-
+        acb_dot(t, NULL, 1, a + 1, 1, s + k - 1, -1, FLINT_MIN(k, hlen - 1), prec);
+        acb_dot(u, NULL, 0, a + 1, 1, c + k - 1, -1, FLINT_MIN(k, hlen - 1), prec);
        acb_div_ui(c + k, t, k, prec);
        acb_div_ui(s + k, u, k, prec);
    }
@ -92,4 +85,3 @@ acb_poly_sin_cos_series_basecase(acb_poly_t s, acb_poly_t c,
    _acb_poly_set_length(c, n);
    _acb_poly_normalise(c);
 }
-
--- a/acb_poly/sinh_cosh_series.c
+++ b/acb_poly/sinh_cosh_series.c
@ -12,7 +12,7 @@
 #include "acb_poly.h"

 void
-_acb_poly_sinh_cosh_series(acb_ptr s, acb_ptr c, const acb_srcptr h, slong hlen, slong n, slong prec)
+_acb_poly_sinh_cosh_series(acb_ptr s, acb_ptr c, acb_srcptr h, slong hlen, slong n, slong prec)
 {
    hlen = FLINT_MIN(hlen, n);

@ -32,10 +32,20 @@ _acb_poly_sinh_cosh_series(acb_ptr s, acb_ptr c, const acb_srcptr h, slong hlen,
        acb_mul(c + 1, s, t, prec);
        acb_clear(t);
    }
-    else if (hlen < 60 || n < 120)
-        _acb_poly_sinh_cosh_series_basecase(s, c, h, hlen, n, prec);
    else
-        _acb_poly_sinh_cosh_series_exponential(s, c, h, hlen, n, prec);
+    {
+        slong cutoff;
+
+        if (prec <= 128)
+            cutoff = 400;
+        else
+            cutoff = 30000 / pow(log(prec), 3);
+
+        if (hlen < cutoff)
+            _acb_poly_sinh_cosh_series_basecase(s, c, h, hlen, n, prec);
+        else
+            _acb_poly_sinh_cosh_series_exponential(s, c, h, hlen, n, prec);
+    }
 }

 void
@ -69,4 +79,3 @@ acb_poly_sinh_cosh_series(acb_poly_t s, acb_poly_t c,
    _acb_poly_set_length(c, n);
    _acb_poly_normalise(c);
 }
-
--- a/acb_poly/sinh_cosh_series_basecase.c
+++ b/acb_poly/sinh_cosh_series_basecase.c
@ -15,7 +15,7 @@ void
 _acb_poly_sinh_cosh_series_basecase(acb_ptr s, acb_ptr c, acb_srcptr h, slong hlen,
        slong n, slong prec)
 {
-    slong j, k, alen = FLINT_MIN(n, hlen);
+    slong k, alen = FLINT_MIN(n, hlen);
    acb_ptr a;
    acb_t t, u;

@ -37,15 +37,8 @@ _acb_poly_sinh_cosh_series_basecase(acb_ptr s, acb_ptr c, acb_srcptr h, slong hl

    for (k = 1; k < n; k++)
    {
-        acb_zero(t);
-        acb_zero(u);
-
-        for (j = 1; j < FLINT_MIN(k + 1, hlen); j++)
-        {
-            acb_addmul(t, a + j, s + k - j, prec);
-            acb_addmul(u, a + j, c + k - j, prec);
-        }
-
+        acb_dot(t, NULL, 0, a + 1, 1, s + k - 1, -1, FLINT_MIN(k, hlen - 1), prec);
+        acb_dot(u, NULL, 0, a + 1, 1, c + k - 1, -1, FLINT_MIN(k, hlen - 1), prec);
        acb_div_ui(c + k, t, k, prec);
        acb_div_ui(s + k, u, k, prec);
    }
@ -83,4 +76,3 @@ acb_poly_sinh_cosh_series_basecase(acb_poly_t s, acb_poly_t c,
    _acb_poly_set_length(c, n);
    _acb_poly_normalise(c);
 }
-
--- a/arb/test/t-dot.c
+++ b/arb/test/t-dot.c
@ -35,7 +35,7 @@ int main()
        else
            len = n_randint(state, 3);

-        if (n_randint(state, 100) == 0 || len > 10)
+        if (n_randint(state, 10) != 0 || len > 10)
        {
            prec = 2 + n_randint(state, 500);
            xbits = 2 + n_randint(state, 500);
--- a/doc/source/acb.rst
+++ b/doc/source/acb.rst
@ -499,6 +499,46 @@ Arithmetic

    Sets *z* to the quotient of *x* and *y*.

+Dot product
+-------------------------------------------------------------------------------
+
+.. function:: void acb_dot_precise(acb_t res, const acb_t s, int subtract, acb_srcptr x, slong xstep, acb_srcptr y, slong ystep, slong len, slong prec)
+
+.. function:: void acb_dot_simple(acb_t res, const acb_t s, int subtract, acb_srcptr x, slong xstep, acb_srcptr y, slong ystep, slong len, slong prec)
+
+.. function:: void acb_dot(acb_t res, const acb_t s, int subtract, acb_srcptr x, slong xstep, acb_srcptr y, slong ystep, slong len, slong prec)
+
+    Computes the dot product of the vectors *x* and *y*, setting
+    *res* to `s + (-1)^{subtract} \sum_{i=0}^{len-1} x_i y_i`.
+
+    The initial term *s* is optional and can be
+    omitted by passing *NULL* (equivalently, `s = 0`).
+    The parameter *subtract* must be 0 or 1.
+    The length *len* is allowed to be negative, which is equivalent
+    to a length of zero.
+    The parameters *xstep* or *ystep* specify a step length for
+    traversing subsequences of the vectors *x* and *y*; either can be
+    negative to step in the reverse direction starting from
+    the initial pointer.
+    Aliasing is allowed between *res* and *s* but not between
+    *res* and the entries of *x* and *y*.
+
+    The default version determines the optimal precision for each term
+    and performs all internal calculations using mpn arithmetic
+    with minimal overhead. This is the preferred way to compute a
+    dot product; it is generally much faster and more precise
+    than a simple loop.
+
+    The *simple* version performs fused multiply-add operations in
+    a simple loop. This can be used for
+    testing purposes and is also used as a fallback by the
+    default version when the exponents are out of range
+    for the optimized code.
+
+    The *precise* version computes the dot product exactly up to the
+    final rounding. This can be extremely slow and is only intended
+    for testing.
+
 Mathematical constants
 -------------------------------------------------------------------------------

--- a/doc/source/arb.rst
+++ b/doc/source/arb.rst
@ -782,7 +782,7 @@ Arithmetic

    Sets `z = x / (2^n - 1)`, rounded to *prec* bits.

-Sum and dot product
+Dot product
 -------------------------------------------------------------------------------

 .. function:: void arb_dot_precise(arb_t res, const arb_t s, int subtract, arb_srcptr x, slong xstep, arb_srcptr y, slong ystep, slong len, slong prec)