diff --git a/acb.h b/acb.h index 7ea59648..e30dd6fc 100644 --- a/acb.h +++ b/acb.h @@ -644,6 +644,14 @@ acb_submul_arb(acb_t z, const acb_t x, const arb_t y, slong prec) arb_submul(acb_imagref(z), acb_imagref(x), y, prec); } +void acb_dot_simple(acb_t res, const acb_t initial, int subtract, + acb_srcptr x, slong xstep, acb_srcptr y, slong ystep, slong len, slong prec); +void acb_dot_precise(acb_t res, const acb_t initial, int subtract, + acb_srcptr x, slong xstep, acb_srcptr y, slong ystep, slong len, slong prec); +void acb_dot(acb_t res, const acb_t initial, int subtract, + acb_srcptr x, slong xstep, acb_srcptr y, slong ystep, slong len, slong prec); + + void acb_inv(acb_t z, const acb_t x, slong prec); void acb_div(acb_t z, const acb_t x, const acb_t y, slong prec); diff --git a/acb/dot.c b/acb/dot.c new file mode 100644 index 00000000..27b0d971 --- /dev/null +++ b/acb/dot.c @@ -0,0 +1,1008 @@ +/* + Copyright (C) 2018 Fredrik Johansson + + This file is part of Arb. + + Arb is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. See . +*/ + +#include "acb.h" + +/* We need uint64_t instead of mp_limb_t on 32-bit systems for + safe summation of 30-bit error bounds. */ +#include + +/* The following macros are found in FLINT's longlong.h, but + the release version is out of date. */ + +/* x86 : 64 bit */ +#if (GMP_LIMB_BITS == 64 && defined (__amd64__)) + +#define add_sssaaaaaa2(sh, sm, sl, ah, am, al, bh, bm, bl) \ + __asm__ ("addq %8,%q2\n\tadcq %6,%q1\n\tadcq %4,%q0" \ + : "=r" (sh), "=&r" (sm), "=&r" (sl) \ + : "0" ((mp_limb_t)(ah)), "rme" ((mp_limb_t)(bh)), \ + "1" ((mp_limb_t)(am)), "rme" ((mp_limb_t)(bm)), \ + "2" ((mp_limb_t)(al)), "rme" ((mp_limb_t)(bl))) \ + +#define sub_dddmmmsss2(dh, dm, dl, mh, mm, ml, sh, sm, sl) \ + __asm__ ("subq %8,%q2\n\tsbbq %6,%q1\n\tsbbq %4,%q0" \ + : "=r" (dh), "=&r" (dm), "=&r" (dl) \ + : "0" ((mp_limb_t)(mh)), "rme" ((mp_limb_t)(sh)), \ + "1" ((mp_limb_t)(mm)), "rme" ((mp_limb_t)(sm)), \ +"2" ((mp_limb_t)(ml)), "rme" ((mp_limb_t)(sl))) \ + +#endif /* x86_64 */ + +/* x86 : 32 bit */ +#if (GMP_LIMB_BITS == 32 && (defined (__i386__) \ + || defined (__i486__) || defined(__amd64__))) + +#define add_sssaaaaaa2(sh, sm, sl, ah, am, al, bh, bm, bl) \ + __asm__ ("addl %8,%k2\n\tadcl %6,%k1\n\tadcl %4,%k0" \ + : "=r" (sh), "=r" (sm), "=&r" (sl) \ + : "0" ((mp_limb_t)(ah)), "g" ((mp_limb_t)(bh)), \ + "1" ((mp_limb_t)(am)), "g" ((mp_limb_t)(bm)), \ + "2" ((mp_limb_t)(al)), "g" ((mp_limb_t)(bl))) \ + +#define sub_dddmmmsss2(dh, dm, dl, mh, mm, ml, sh, sm, sl) \ + __asm__ ("subl %8,%k2\n\tsbbl %6,%k1\n\tsbbl %4,%k0" \ + : "=r" (dh), "=r" (dm), "=&r" (dl) \ + : "0" ((mp_limb_t)(mh)), "g" ((mp_limb_t)(sh)), \ + "1" ((mp_limb_t)(mm)), "g" ((mp_limb_t)(sm)), \ + "2" ((mp_limb_t)(ml)), "g" ((mp_limb_t)(sl))) \ + +#endif /* x86 */ + + +#if !defined(add_sssaaaaaa2) + +#define add_sssaaaaaa2(sh, sm, sl, ah, am, al, bh, bm, bl) \ + do { \ + mp_limb_t __t, __u; \ + add_ssaaaa(__t, sl, (mp_limb_t) 0, al, (mp_limb_t) 0, bl); \ + add_ssaaaa(__u, sm, (mp_limb_t) 0, am, (mp_limb_t) 0, bm); \ + add_ssaaaa(sh, sm, ah + bh, sm, __u, __t); \ +} while (0) + +#define sub_dddmmmsss2(dh, dm, dl, mh, mm, ml, sh, sm, sl) \ + do { \ + mp_limb_t __t, __u; \ + sub_ddmmss(__t, dl, (mp_limb_t) 0, ml, (mp_limb_t) 0, sl); \ + sub_ddmmss(__u, dm, (mp_limb_t) 0, mm, (mp_limb_t) 0, sm); \ + sub_ddmmss(dh, dm, mh 
- sh, dm, __u, __t); \ + } while (0) + +#endif + + +/* Add ((a * b) / 2^MAG_BITS) * 2^exp into srad*2^srad_exp. + Assumes that srad_exp >= exp and that overflow cannot occur. */ +#define RAD_ADDMUL(srad, srad_exp, a, b, exp) \ + do { \ + uint64_t __a, __b; \ + slong __shift; \ + __a = (a); \ + __b = (b); \ + __shift = (srad_exp) - (exp); \ + if (__shift < MAG_BITS) \ + (srad) += (((__a) * (__b)) >> (MAG_BITS + __shift)) + 1; \ + else \ + (srad) += 1; \ + } while (0) + +void mag_set_ui_2exp_small(mag_t z, ulong x, slong e); + +static void +add_errors(mag_t rad, uint64_t Aerr, slong Aexp, uint64_t Berr, slong Bexp, uint64_t Cerr, slong Cexp) +{ + slong shift; + + if (Aerr && Berr) + { + if (Aexp >= Bexp) + { + shift = Aexp - Bexp; + if (shift < 64) + Aerr = Aerr + (Berr >> shift) + 1; + else + Aerr = Aerr + 1; + } + else + { + shift = Bexp - Aexp; + if (shift < 64) + Aerr = Berr + (Aerr >> shift) + 1; + else + Aerr = Berr + 1; + Aexp = Bexp; + } + } + else if (Berr) + { + Aerr = Berr; + Aexp = Bexp; + } + + if (Aerr && Cerr) + { + if (Aexp >= Cexp) + { + shift = Aexp - Cexp; + if (shift < 64) + Aerr = Aerr + (Cerr >> shift) + 1; + else + Aerr = Aerr + 1; + } + else + { + shift = Cexp - Aexp; + if (shift < 64) + Aerr = Cerr + (Aerr >> shift) + 1; + else + Aerr = Cerr + 1; + Aexp = Cexp; + } + } + else if (Cerr) + { + Aerr = Cerr; + Aexp = Cexp; + } + +#if FLINT_BITS == 64 + mag_set_ui_2exp_small(rad, Aerr, Aexp - MAG_BITS); +#else + mag_set_d(rad, Aerr * (1.0 + 1e-14)); + mag_mul_2exp_si(rad, rad, Aexp - MAG_BITS); +#endif +} + +void +_arb_dot_addmul_generic(mp_ptr sum, mp_ptr serr, mp_ptr tmp, mp_size_t sn, + mp_srcptr xptr, mp_size_t xn, mp_srcptr yptr, mp_size_t yn, + int negative, mp_bitcnt_t shift); + +void +_arb_dot_add_generic(mp_ptr sum, mp_ptr serr, mp_ptr tmp, mp_size_t sn, + mp_srcptr xptr, mp_size_t xn, + int negative, mp_bitcnt_t shift); + +static void +_arb_dot_output(arb_t res, mp_ptr sum, mp_size_t sn, int negative, + uint64_t serr, slong sum_exp, uint64_t srad, slong srad_exp, slong prec) +{ + slong exp_fix; + int inexact; + + if (sum[sn - 1] >= LIMB_TOP) + { + mpn_neg(sum, sum, sn); + negative ^= 1; + } + + exp_fix = 0; + + if (sum[sn - 1] == 0) + { + slong sum_exp2; + mp_size_t sn2; + + sn2 = sn; + sum_exp2 = sum_exp; + + while (sn2 > 0 && sum[sn2 - 1] == 0) + { + sum_exp2 -= FLINT_BITS; + sn2--; + } + + if (sn2 == 0) + { + arf_zero(arb_midref(res)); + inexact = 0; + } + else + { + inexact = _arf_set_round_mpn(arb_midref(res), &exp_fix, sum, sn2, negative, prec, ARF_RND_DOWN); + _fmpz_set_si_small(ARF_EXPREF(arb_midref(res)), exp_fix + sum_exp2); + } + } + else + { + if (sn == 2) /* unnecessary? */ + inexact = _arf_set_round_uiui(arb_midref(res), &exp_fix, sum[1], sum[0], negative, prec, ARF_RND_DOWN); + else + inexact = _arf_set_round_mpn(arb_midref(res), &exp_fix, sum, sn, negative, prec, ARF_RND_DOWN); + + _fmpz_set_si_small(ARF_EXPREF(arb_midref(res)), exp_fix + sum_exp); + } + + add_errors(arb_radref(res), + inexact << MAG_BITS, + exp_fix + sum_exp - prec, + ((uint64_t) serr) << MAG_BITS, + sum_exp - sn * FLINT_BITS, + srad, + srad_exp); +} + +/* xxx: don't use surrounding variables */ +#define ARB_DOT_ADD(s_sum, s_serr, s_sn, s_sum_exp, s_subtract, xm) \ + if (!arf_is_special(xm)) \ + { \ + mp_srcptr xptr; \ + xexp = ARF_EXP(xm); \ + xn = ARF_SIZE(xm); \ + xnegative = ARF_SGNBIT(xm); \ + shift = s_sum_exp - xexp; \ + if (shift >= s_sn * FLINT_BITS) \ + { \ + s_serr++; \ + } \ + else \ + { \ + xptr = (xn <= ARF_NOPTR_LIMBS) ? 
ARF_NOPTR_D(xm) : ARF_PTR_D(xm); \ + _arb_dot_add_generic(s_sum, &s_serr, tmp, s_sn, xptr, xn, xnegative ^ s_subtract, shift); \ + } \ + } \ + +/* xxx: don't use surrounding variables */ +#define ARB_DOT_ADD_RAD(s_srad, s_srad_exp, xr) \ + if (!mag_is_special(xr)) \ + { \ + xrad = MAG_MAN(xr); \ + xrexp = MAG_EXP(xr); \ + shift = s_srad_exp - xrexp; \ + if (shift < 64) \ + s_srad += (xrad >> shift) + 1; \ + else \ + s_srad++; \ + } + +static void +_arf_complex_mul_gauss(arf_t e, arf_t f, const arf_t a, const arf_t b, + const arf_t c, const arf_t d) +{ + mp_srcptr ap, bp, cp, dp; + int asgn, bsgn, csgn, dsgn; + mp_size_t an, bn, cn, dn; + slong aexp, bexp, cexp, dexp; + fmpz texp, uexp; + + fmpz_t za, zb, zc, zd, t, u, v; + slong abot, bbot, cbot, dbot; + + ARF_GET_MPN_READONLY(ap, an, a); + asgn = ARF_SGNBIT(a); + aexp = ARF_EXP(a); + + ARF_GET_MPN_READONLY(bp, bn, b); + bsgn = ARF_SGNBIT(b); + bexp = ARF_EXP(b); + + ARF_GET_MPN_READONLY(cp, cn, c); + csgn = ARF_SGNBIT(c); + cexp = ARF_EXP(c); + + ARF_GET_MPN_READONLY(dp, dn, d); + dsgn = ARF_SGNBIT(d); + dexp = ARF_EXP(d); + + /* Gauss multiplication + e = ac - bd + f = (a+b)(c+d) - ac - bd */ + + abot = aexp - an * FLINT_BITS; + bbot = bexp - bn * FLINT_BITS; + cbot = cexp - cn * FLINT_BITS; + dbot = dexp - dn * FLINT_BITS; + + texp = FLINT_MIN(abot, bbot); + uexp = FLINT_MIN(cbot, dbot); + + fmpz_init(za); + fmpz_init(zb); + fmpz_init(zc); + fmpz_init(zd); + fmpz_init(t); + fmpz_init(u); + fmpz_init(v); + + fmpz_lshift_mpn(za, ap, an, asgn, abot - texp); + fmpz_lshift_mpn(zb, bp, bn, bsgn, bbot - texp); + fmpz_lshift_mpn(zc, cp, cn, csgn, cbot - uexp); + fmpz_lshift_mpn(zd, dp, dn, dsgn, dbot - uexp); + + fmpz_add(t, za, zb); + fmpz_add(v, zc, zd); + fmpz_mul(u, t, v); + fmpz_mul(t, za, zc); + fmpz_mul(v, zb, zd); + fmpz_sub(u, u, t); + fmpz_sub(u, u, v); + fmpz_sub(t, t, v); + + texp += uexp; + arf_set_fmpz_2exp(e, t, &texp); + arf_set_fmpz_2exp(f, u, &texp); + + fmpz_clear(za); + fmpz_clear(zb); + fmpz_clear(zc); + fmpz_clear(zd); + fmpz_clear(t); + fmpz_clear(u); + fmpz_clear(v); +} + +/* TODO: this could be much lower, but it's currently competing + against mulhigh in the Karatsuba range. */ +ARB_DLL slong acb_dot_gauss_dot_cutoff = 128; +#define GAUSS_CUTOFF acb_dot_gauss_dot_cutoff + +void +acb_dot(acb_t res, const acb_t initial, int subtract, acb_srcptr x, slong xstep, acb_srcptr y, slong ystep, slong len, slong prec) +{ + slong i, j, padding, extend; + slong xexp, yexp, exp; + slong re_nonzero, im_nonzero; + slong re_max_exp, re_min_exp, re_sum_exp; + slong im_max_exp, im_min_exp, im_sum_exp; + slong re_srad_exp, re_max_rad_exp; + slong im_srad_exp, im_max_rad_exp; + slong re_prec, im_prec; + slong xrexp, yrexp; + int xnegative, ynegative; + mp_size_t xn, yn, re_sn, im_sn, alloc; + mp_bitcnt_t shift; + arb_srcptr xi, yi; + arf_srcptr xm, ym; + mag_srcptr xr, yr; + mp_limb_t xtop, ytop; + mp_limb_t xrad, yrad; + mp_limb_t re_serr, im_serr; /* Sum over arithmetic errors */ + uint64_t re_srad, im_srad; /* Sum over propagated errors */ + mp_ptr tmp, re_sum, im_sum; /* Workspace */ + slong xoff, yoff; + char * use_gauss; + ARF_ADD_TMP_DECL; + + /* todo: fast fma and fmma (len=2) code */ + if (len <= 1) + { + if (initial == NULL) + { + if (len <= 0) + acb_zero(res); + else + { + acb_mul(res, x, y, prec); + if (subtract) + acb_neg(res, res); + } + return; + } + else if (len <= 0) + { + acb_set_round(res, initial, prec); + return; + } + } + + /* Number of nonzero midpoint terms in sum. 
*/ + re_nonzero = 0; + im_nonzero = 0; + + /* Terms are bounded by 2^max_exp (with WORD_MIN = -infty) */ + re_max_exp = WORD_MIN; + im_max_exp = WORD_MIN; + + /* Propagated error terms are bounded by 2^max_rad_exp */ + re_max_rad_exp = WORD_MIN; + im_max_rad_exp = WORD_MIN; + + /* Used to reduce the precision. */ + re_min_exp = WORD_MAX; + im_min_exp = WORD_MAX; + + /* Account for the initial term. */ + if (initial != NULL) + { + if (!ARB_IS_LAGOM(acb_realref(initial)) || !ARB_IS_LAGOM(acb_imagref(initial))) + { + acb_dot_simple(res, initial, subtract, x, xstep, y, ystep, len, prec); + return; + } + + xm = arb_midref(acb_realref(initial)); + xr = arb_radref(acb_realref(initial)); + + if (!arf_is_special(xm)) + { + re_max_exp = ARF_EXP(xm); + re_nonzero++; + + if (prec > 2 * FLINT_BITS) + re_min_exp = ARF_EXP(xm) - ARF_SIZE(xm) * FLINT_BITS; + } + + if (!mag_is_special(xr)) + re_max_rad_exp = MAG_EXP(xr); + + xm = arb_midref(acb_imagref(initial)); + xr = arb_radref(acb_imagref(initial)); + + if (!arf_is_special(xm)) + { + im_max_exp = ARF_EXP(xm); + im_nonzero++; + + if (prec > 2 * FLINT_BITS) + im_min_exp = ARF_EXP(xm) - ARF_SIZE(xm) * FLINT_BITS; + } + + if (!mag_is_special(xr)) + im_max_rad_exp = MAG_EXP(xr); + } + + for (xoff = 0; xoff < 2; xoff++) + { + for (yoff = 0; yoff < 2; yoff++) + { + slong nonzero, max_exp, min_exp, max_rad_exp; + + if (xoff == yoff) + { + nonzero = re_nonzero; + max_exp = re_max_exp; + min_exp = re_min_exp; + max_rad_exp = re_max_rad_exp; + } + else + { + nonzero = im_nonzero; + max_exp = im_max_exp; + min_exp = im_min_exp; + max_rad_exp = im_max_rad_exp; + } + + /* Determine maximum exponents for the main sum and the radius sum. */ + for (i = 0; i < len; i++) + { + xi = ((arb_srcptr) x) + 2 * i * xstep + xoff; + yi = ((arb_srcptr) y) + 2 * i * ystep + yoff; + + /* Fallback for huge exponents or non-finite values. 
*/
+                if (!ARB_IS_LAGOM(xi) || !ARB_IS_LAGOM(yi))
+                {
+                    acb_dot_simple(res, initial, subtract, x, xstep, y, ystep, len, prec);
+                    return;
+                }
+
+                xm = arb_midref(xi);
+                ym = arb_midref(yi);
+                xr = arb_radref(xi);
+                yr = arb_radref(yi);
+
+                /* (xm+xr)(ym+yr) = xm ym + [xr ym + xm yr + xr yr] */
+                if (!arf_is_special(xm))
+                {
+                    xexp = ARF_EXP(xm);
+
+                    if (!arf_is_special(ym))
+                    {
+                        yexp = ARF_EXP(ym);
+
+                        max_exp = FLINT_MAX(max_exp, xexp + yexp);
+                        nonzero++;
+
+                        if (prec > 2 * FLINT_BITS)
+                        {
+                            slong bot;
+                            bot = (xexp + yexp) - (ARF_SIZE(xm) + ARF_SIZE(ym)) * FLINT_BITS;
+                            min_exp = FLINT_MIN(min_exp, bot);
+                        }
+
+                        if (!mag_is_special(xr))
+                        {
+                            xrexp = MAG_EXP(xr);
+                            max_rad_exp = FLINT_MAX(max_rad_exp, yexp + xrexp);
+
+                            if (!mag_is_special(yr))
+                            {
+                                yrexp = MAG_EXP(yr);
+                                max_rad_exp = FLINT_MAX(max_rad_exp, xexp + yrexp);
+                                max_rad_exp = FLINT_MAX(max_rad_exp, xrexp + yrexp);
+                            }
+                        }
+                        else
+                        {
+                            if (!mag_is_special(yr))
+                            {
+                                yrexp = MAG_EXP(yr);
+                                max_rad_exp = FLINT_MAX(max_rad_exp, xexp + yrexp);
+                            }
+                        }
+                    }
+                    else /* if y = 0, something can happen only if yr != 0 */
+                    {
+                        if (!mag_is_special(yr))
+                        {
+                            yrexp = MAG_EXP(yr);
+                            max_rad_exp = FLINT_MAX(max_rad_exp, xexp + yrexp);
+
+                            if (!mag_is_special(xr))
+                            {
+                                xrexp = MAG_EXP(xr);
+                                max_rad_exp = FLINT_MAX(max_rad_exp, xrexp + yrexp);
+                            }
+                        }
+                    }
+                }
+                else /* if x = 0, something can happen only if xr != 0 */
+                {
+                    if (!mag_is_special(xr))
+                    {
+                        xrexp = MAG_EXP(xr);
+
+                        if (!arf_is_special(ym))
+                        {
+                            yexp = ARF_EXP(ym);
+                            max_rad_exp = FLINT_MAX(max_rad_exp, xrexp + yexp);
+                        }
+
+                        if (!mag_is_special(yr))
+                        {
+                            yrexp = MAG_EXP(yr);
+                            max_rad_exp = FLINT_MAX(max_rad_exp, xrexp + yrexp);
+                        }
+                    }
+                }
+            }
+
+            if (xoff == yoff)
+            {
+                re_nonzero = nonzero;
+                re_max_exp = max_exp;
+                re_min_exp = min_exp;
+                re_max_rad_exp = max_rad_exp;
+            }
+            else
+            {
+                im_nonzero = nonzero;
+                im_max_exp = max_exp;
+                im_min_exp = min_exp;
+                im_max_rad_exp = max_rad_exp;
+            }
+        }
+    }
+
+    re_prec = prec;
+    im_prec = prec;
+
+    if (re_max_exp == WORD_MIN && re_max_rad_exp == WORD_MIN &&
+        im_max_exp == WORD_MIN && im_max_rad_exp == WORD_MIN)
+    {
+        acb_zero(res);
+        return;
+    }
+
+    /* The midpoint sum is zero. */
+    if (re_max_exp == WORD_MIN)
+    {
+        re_prec = 2;
+    }
+    else
+    {
+        if (re_max_rad_exp != WORD_MIN)
+            re_prec = FLINT_MIN(re_prec, re_max_exp - re_max_rad_exp + MAG_BITS);
+        if (re_min_exp != WORD_MAX)
+            re_prec = FLINT_MIN(re_prec, re_max_exp - re_min_exp + MAG_BITS);
+        re_prec = FLINT_MAX(re_prec, 2);
+    }
+
+    if (im_max_exp == WORD_MIN)
+    {
+        im_prec = 2;
+    }
+    else
+    {
+        if (im_max_rad_exp != WORD_MIN)
+            im_prec = FLINT_MIN(im_prec, im_max_exp - im_max_rad_exp + MAG_BITS);
+        if (im_min_exp != WORD_MAX)
+            im_prec = FLINT_MIN(im_prec, im_max_exp - im_min_exp + MAG_BITS);
+        im_prec = FLINT_MAX(im_prec, 2);
+    }
+
+    extend = FLINT_BIT_COUNT(re_nonzero) + 1;
+    padding = 4 + FLINT_BIT_COUNT(len);
+    re_sn = (re_prec + extend + padding + FLINT_BITS - 1) / FLINT_BITS;
+    re_sn = FLINT_MAX(re_sn, 2);
+    re_sum_exp = re_max_exp + extend;
+
+    extend = FLINT_BIT_COUNT(im_nonzero) + 1;
+    padding = 4 + FLINT_BIT_COUNT(len);
+    im_sn = (im_prec + extend + padding + FLINT_BITS - 1) / FLINT_BITS;
+    im_sn = FLINT_MAX(im_sn, 2);
+    im_sum_exp = im_max_exp + extend;
+
+    /* We need sn + 1 limbs for the sum (sn limbs + 1 dummy limb
+       for carry or borrow that avoids an extra branch). We need
+       2 * (sn + 2) limbs to store the product of two numbers
+       with up to (sn + 2) limbs, plus 1 extra limb for shifting
+       the product. */
+    alloc = (re_sn + 1) + (im_sn + 1) + 2 * (FLINT_MAX(re_sn, im_sn) + 2) + 1;
+    ARF_ADD_TMP_ALLOC(re_sum, alloc)
+    im_sum = re_sum + (re_sn + 1);
+    tmp = im_sum + (im_sn + 1);
+
+    /* Sum of propagated errors. */
+    re_srad_exp = re_max_rad_exp;
+    re_srad = 0;
+    im_srad_exp = im_max_rad_exp;
+    im_srad = 0;
+
+    /* Set sum to 0 */
+    re_serr = 0;
+    for (j = 0; j < re_sn + 1; j++)
+        re_sum[j] = 0;
+    im_serr = 0;
+    for (j = 0; j < im_sn + 1; j++)
+        im_sum[j] = 0;
+
+    if (initial != NULL)
+    {
+        xm = arb_midref(acb_realref(initial));
+        xr = arb_radref(acb_realref(initial));
+
+        ARB_DOT_ADD(re_sum, re_serr, re_sn, re_sum_exp, subtract, xm);
+        ARB_DOT_ADD_RAD(re_srad, re_srad_exp, xr);
+
+        xm = arb_midref(acb_imagref(initial));
+        xr = arb_radref(acb_imagref(initial));
+
+        ARB_DOT_ADD(im_sum, im_serr, im_sn, im_sum_exp, subtract, xm);
+        ARB_DOT_ADD_RAD(im_srad, im_srad_exp, xr);
+    }
+
+    /*
+        Look for terms to process using the Gauss multiplication formula.
+        If any such terms are found, we mask the ith entry in use_gauss
+        so that they will be skipped in the main loop.
+        Important: the cutoffs must be such that the fast case
+        (xn <= 2, yn <= 2, sn <= 3) is not hit below and the mask
+        check is done.
+
+        The cutoffs below are not optimal in the generic case; also, it
+        would be nicer to have both mulhigh and Gauss here. A more elegant
+        solution would be to write a fallback version of acb_dot_simple
+        where acb_addmul does the right thing.
+    */
+    use_gauss = NULL;
+
+    if (re_prec >= GAUSS_CUTOFF * FLINT_BITS &&
+        im_prec >= GAUSS_CUTOFF * FLINT_BITS)
+    {
+        arf_t e, f;
+
+        for (i = 0; i < len; i++)
+        {
+            arb_srcptr ai, bi, ci, di;
+            mp_size_t an, bn, cn, dn;
+            slong aexp, bexp, cexp, dexp;
+
+            ai = ((arb_srcptr) x) + 2 * i * xstep;
+            bi = ((arb_srcptr) x) + 2 * i * xstep + 1;
+            ci = ((arb_srcptr) y) + 2 * i * ystep;
+            di = ((arb_srcptr) y) + 2 * i * ystep + 1;
+
+            an = ARF_SIZE(arb_midref(ai));
+            bn = ARF_SIZE(arb_midref(bi));
+            cn = ARF_SIZE(arb_midref(ci));
+            dn = ARF_SIZE(arb_midref(di));
+
+            aexp = ARF_EXP(arb_midref(ai));
+            bexp = ARF_EXP(arb_midref(bi));
+            cexp = ARF_EXP(arb_midref(ci));
+            dexp = ARF_EXP(arb_midref(di));
+
+            if (an >= GAUSS_CUTOFF && bn >= GAUSS_CUTOFF &&
+                cn >= GAUSS_CUTOFF && dn >= GAUSS_CUTOFF &&
+                FLINT_ABS(an - bn) <= 2 &&
+                FLINT_ABS(cn - dn) <= 2 &&
+                FLINT_ABS(aexp - bexp) <= 64 &&
+                FLINT_ABS(cexp - dexp) <= 64 &&
+                re_sum_exp - (aexp + cexp) < 0.1 * re_prec &&
+                im_sum_exp - (aexp + dexp) < 0.1 * im_prec &&
+                an + cn < 2.2 * re_sn && an + dn < 2.2 * im_sn)
+            {
+                if (use_gauss == NULL)
+                {
+                    use_gauss = flint_calloc(len, sizeof(char));
+                    arf_init(e);
+                    arf_init(f);
+                }
+
+                use_gauss[i] = 1;
+                _arf_complex_mul_gauss(e, f, arb_midref(ai), arb_midref(bi), arb_midref(ci), arb_midref(di));
+                ARB_DOT_ADD(re_sum, re_serr, re_sn, re_sum_exp, 0, e);
+                ARB_DOT_ADD(im_sum, im_serr, im_sn, im_sum_exp, 0, f);
+            }
+        }
+
+        if (use_gauss != NULL)
+        {
+            arf_clear(e);
+            arf_clear(f);
+        }
+    }
+
+    for (xoff = 0; xoff < 2; xoff++)
+    {
+        for (yoff = 0; yoff < 2; yoff++)
+        {
+            slong sum_exp, srad_exp;
+            mp_ptr sum;
+            mp_size_t sn;
+            mp_limb_t serr;
+            uint64_t srad;
+            int flipsign;
+
+            if (xoff == yoff)
+            {
+                sum_exp = re_sum_exp;
+                srad_exp = re_srad_exp;
+                sum = re_sum;
+                sn = re_sn;
+                if (re_max_exp == WORD_MIN && re_max_rad_exp == WORD_MIN)
+                    continue;
+            }
+            else
+            {
+                sum_exp = im_sum_exp;
+                srad_exp = im_srad_exp;
+                sum = im_sum;
+                sn = im_sn;
+                if (im_max_exp == WORD_MIN && im_max_rad_exp == WORD_MIN)
+                    continue;
+            }
+
+            serr = 0;
+            srad = 0;
+            flipsign = (xoff + yoff == 2);
+
+            for (i = 0; i <
len; i++) + { + xi = ((arb_srcptr) x) + 2 * i * xstep + xoff; + yi = ((arb_srcptr) y) + 2 * i * ystep + yoff; + + xm = arb_midref(xi); + ym = arb_midref(yi); + xr = arb_radref(xi); + yr = arb_radref(yi); + + /* The midpoints of x[i] and y[i] are both nonzero. */ + if (!arf_is_special(xm) && !arf_is_special(ym)) + { + xexp = ARF_EXP(xm); + xn = ARF_SIZE(xm); + xnegative = ARF_SGNBIT(xm); + + yexp = ARF_EXP(ym); + yn = ARF_SIZE(ym); + ynegative = ARF_SGNBIT(ym); + + exp = xexp + yexp; + shift = sum_exp - exp; + + if (shift >= sn * FLINT_BITS) + { + /* We may yet need the top limbs for bounds. */ + ARF_GET_TOP_LIMB(xtop, xm); + ARF_GET_TOP_LIMB(ytop, ym); + serr++; + } + else if (xn <= 2 && yn <= 2 && sn <= 3) + { + mp_limb_t x1, x0, y1, y0; + mp_limb_t u3, u2, u1, u0; + + if (xn == 1 && yn == 1) + { + xtop = ARF_NOPTR_D(xm)[0]; + ytop = ARF_NOPTR_D(ym)[0]; + umul_ppmm(u3, u2, xtop, ytop); + u1 = u0 = 0; + } + else if (xn == 2 && yn == 2) + { + x0 = ARF_NOPTR_D(xm)[0]; + x1 = ARF_NOPTR_D(xm)[1]; + y0 = ARF_NOPTR_D(ym)[0]; + y1 = ARF_NOPTR_D(ym)[1]; + xtop = x1; + ytop = y1; + nn_mul_2x2(u3, u2, u1, u0, x1, x0, y1, y0); + } + else if (xn == 1) + { + x0 = ARF_NOPTR_D(xm)[0]; + y0 = ARF_NOPTR_D(ym)[0]; + y1 = ARF_NOPTR_D(ym)[1]; + xtop = x0; + ytop = y1; + nn_mul_2x1(u3, u2, u1, y1, y0, x0); + u0 = 0; + } + else + { + x0 = ARF_NOPTR_D(xm)[0]; + x1 = ARF_NOPTR_D(xm)[1]; + y0 = ARF_NOPTR_D(ym)[0]; + xtop = x1; + ytop = y0; + nn_mul_2x1(u3, u2, u1, x1, x0, y0); + u0 = 0; + } + + if (sn == 2) + { + if (shift < FLINT_BITS) + { + serr += ((u2 << (FLINT_BITS - shift)) != 0) || (u1 != 0) || (u0 != 0); + u2 = (u2 >> shift) | (u3 << (FLINT_BITS - shift)); + u3 = (u3 >> shift); + } + else if (shift == FLINT_BITS) + { + serr += (u2 != 0) || (u1 != 0) || (u0 != 0); + u2 = u3; + u3 = 0; + } + else /* FLINT_BITS < shift < 2 * FLINT_BITS */ + { + serr += ((u3 << (2 * FLINT_BITS - shift)) != 0) || (u2 != 0) || (u1 != 0) || (u0 != 0); + u2 = (u3 >> (shift - FLINT_BITS)); + u3 = 0; + } + + if (xnegative ^ ynegative ^ flipsign) + sub_ddmmss(sum[1], sum[0], sum[1], sum[0], u3, u2); + else + add_ssaaaa(sum[1], sum[0], sum[1], sum[0], u3, u2); + } + else if (sn == 3) + { + if (shift < FLINT_BITS) + { + serr += ((u1 << (FLINT_BITS - shift)) != 0) || (u0 != 0); + u1 = (u1 >> shift) | (u2 << (FLINT_BITS - shift)); + u2 = (u2 >> shift) | (u3 << (FLINT_BITS - shift)); + u3 = (u3 >> shift); + } + else if (shift == FLINT_BITS) + { + serr += (u1 != 0) || (u0 != 0); + u1 = u2; + u2 = u3; + u3 = 0; + } + else if (shift < 2 * FLINT_BITS) + { + serr += ((u2 << (2 * FLINT_BITS - shift)) != 0) || (u1 != 0) || (u0 != 0); + u1 = (u3 << (2 * FLINT_BITS - shift)) | (u2 >> (shift - FLINT_BITS)); + u2 = (u3 >> (shift - FLINT_BITS)); + u3 = 0; + } + else if (shift == 2 * FLINT_BITS) + { + serr += (u2 != 0) || (u1 != 0) || (u0 != 0); + u1 = u3; + u2 = 0; + u3 = 0; + } + else /* 2 * FLINT_BITS < shift < 3 * FLINT_BITS */ + { + serr += ((u3 << (3 * FLINT_BITS - shift)) != 0) || (u2 != 0) || (u1 != 0) || (u0 != 0); + u1 = (u3 >> (shift - 2 * FLINT_BITS)); + u2 = 0; + u3 = 0; + } + + if (xnegative ^ ynegative ^ flipsign) + sub_dddmmmsss2(sum[2], sum[1], sum[0], sum[2], sum[1], sum[0], u3, u2, u1); + else + add_sssaaaaaa2(sum[2], sum[1], sum[0], sum[2], sum[1], sum[0], u3, u2, u1); + } + } + else + { + mp_srcptr xptr, yptr; + + xptr = (xn <= ARF_NOPTR_LIMBS) ? ARF_NOPTR_D(xm) : ARF_PTR_D(xm); + yptr = (yn <= ARF_NOPTR_LIMBS) ? 
ARF_NOPTR_D(ym) : ARF_PTR_D(ym); + + xtop = xptr[xn - 1]; + ytop = yptr[yn - 1]; + + if (use_gauss == NULL || use_gauss[i] == 0) + _arb_dot_addmul_generic(sum, &serr, tmp, sn, xptr, xn, yptr, yn, xnegative ^ ynegative ^ flipsign, shift); + } + + xrad = MAG_MAN(xr); + yrad = MAG_MAN(yr); + + if (xrad != 0 && yrad != 0) + { + xrexp = MAG_EXP(xr); + yrexp = MAG_EXP(yr); + + RAD_ADDMUL(srad, srad_exp, (xtop >> (FLINT_BITS - MAG_BITS)) + 1, yrad, xexp + yrexp); + RAD_ADDMUL(srad, srad_exp, (ytop >> (FLINT_BITS - MAG_BITS)) + 1, xrad, yexp + xrexp); + RAD_ADDMUL(srad, srad_exp, xrad, yrad, xrexp + yrexp); + } + else if (xrad != 0) + { + xrexp = MAG_EXP(xr); + RAD_ADDMUL(srad, srad_exp, (ytop >> (FLINT_BITS - MAG_BITS)) + 1, xrad, yexp + xrexp); + } + else if (yrad != 0) + { + yrexp = MAG_EXP(yr); + RAD_ADDMUL(srad, srad_exp, (xtop >> (FLINT_BITS - MAG_BITS)) + 1, yrad, xexp + yrexp); + } + } + else + { + xrad = MAG_MAN(xr); + yrad = MAG_MAN(yr); + + xexp = ARF_EXP(xm); + yexp = ARF_EXP(ym); + + xrexp = MAG_EXP(xr); + yrexp = MAG_EXP(yr); + + /* (xm+xr)(ym+yr) = xm ym + [xm yr + ym xr + xr yr] */ + if (yrad && !arf_is_special(xm)) + { + ARF_GET_TOP_LIMB(xtop, xm); + RAD_ADDMUL(srad, srad_exp, (xtop >> (FLINT_BITS - MAG_BITS)) + 1, yrad, xexp + yrexp); + } + + if (xrad && !arf_is_special(ym)) + { + ARF_GET_TOP_LIMB(ytop, ym); + RAD_ADDMUL(srad, srad_exp, (ytop >> (FLINT_BITS - MAG_BITS)) + 1, xrad, yexp + xrexp); + } + + if (xrad && yrad) + { + RAD_ADDMUL(srad, srad_exp, xrad, yrad, xrexp + yrexp); + } + } + } + + if (xoff == yoff) + { + re_serr += serr; + re_srad += srad; + } + else + { + im_serr += serr; + im_srad += srad; + } + } + } + + _arb_dot_output(acb_realref(res), re_sum, re_sn, subtract, re_serr, re_sum_exp, re_srad, re_srad_exp, re_prec); + _arb_dot_output(acb_imagref(res), im_sum, im_sn, subtract, im_serr, im_sum_exp, im_srad, im_srad_exp, im_prec); + + ARF_ADD_TMP_FREE(re_sum, alloc); + if (use_gauss != NULL) + flint_free(use_gauss); +} diff --git a/acb/dot_precise.c b/acb/dot_precise.c new file mode 100644 index 00000000..78eef5e4 --- /dev/null +++ b/acb/dot_precise.c @@ -0,0 +1,50 @@ +/* + Copyright (C) 2018 Fredrik Johansson + + This file is part of Arb. + + Arb is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. See . +*/ + +#include "acb.h" + +void +acb_dot_precise(acb_t res, const acb_t initial, int subtract, acb_srcptr x, slong xstep, + acb_srcptr y, slong ystep, slong len, slong prec) +{ + arb_ptr tmp; + slong i; + + tmp = flint_malloc(sizeof(arb_struct) * (4 * len)); + + for (i = 0; i < len; i++) + { + tmp[0 * len + i] = *acb_realref(x + i * xstep); + tmp[1 * len + i] = *acb_imagref(x + i * xstep); + tmp[2 * len + i] = *acb_realref(y + i * ystep); + arb_init(tmp + 3 * len + i); + arb_neg(tmp + 3 * len + i, acb_imagref(y + i * ystep)); + } + + arb_dot_precise(acb_realref(res), initial == NULL ? NULL : acb_realref(initial), subtract, + tmp, 1, tmp + 2 * len, 1, 2 * len, prec); + + for (i = 0; i < len; i++) + arb_clear(tmp + 3 * len + i); + + for (i = 0; i < len; i++) + { + tmp[0 * len + i] = *acb_realref(x + i * xstep); + tmp[1 * len + i] = *acb_imagref(x + i * xstep); + tmp[2 * len + i] = *acb_imagref(y + i * ystep); + tmp[3 * len + i] = *acb_realref(y + i * ystep); + } + + arb_dot_precise(acb_imagref(res), initial == NULL ? 
NULL : acb_imagref(initial), subtract, + tmp, 1, tmp + 2 * len, 1, 2 * len, prec); + + flint_free(tmp); +} diff --git a/acb/dot_simple.c b/acb/dot_simple.c new file mode 100644 index 00000000..215596e8 --- /dev/null +++ b/acb/dot_simple.c @@ -0,0 +1,47 @@ +/* + Copyright (C) 2018 Fredrik Johansson + + This file is part of Arb. + + Arb is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. See . +*/ + +#include "acb.h" + +void +acb_dot_simple(acb_t res, const acb_t initial, int subtract, + acb_srcptr x, slong xstep, acb_srcptr y, slong ystep, slong len, slong prec) +{ + slong i; + + if (len <= 0) + { + if (initial == NULL) + acb_zero(res); + else + acb_set_round(res, initial, prec); + return; + } + + if (initial == NULL) + { + acb_mul(res, x, y, prec); + } + else + { + if (subtract) + acb_neg(res, initial); + else + acb_set(res, initial); + acb_addmul(res, x, y, prec); + } + + for (i = 1; i < len; i++) + acb_addmul(res, x + i * xstep, y + i * ystep, prec); + + if (subtract) + acb_neg(res, res); +} diff --git a/acb/test/t-dot.c b/acb/test/t-dot.c new file mode 100644 index 00000000..0cd25848 --- /dev/null +++ b/acb/test/t-dot.c @@ -0,0 +1,200 @@ +/* + Copyright (C) 2018 Fredrik Johansson + + This file is part of Arb. + + Arb is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. See . +*/ + +#include "acb.h" + +ARB_DLL extern slong acb_dot_gauss_dot_cutoff; + +int main() +{ + slong iter; + flint_rand_t state; + + flint_printf("dot...."); + fflush(stdout); + + flint_randinit(state); + + for (iter = 0; iter < 1000000 * arb_test_multiplier(); iter++) + { + acb_ptr x, y; + acb_t s1, s2, z; + slong i, len, prec, xbits, ybits, ebits; + int ok, initial, subtract, revx, revy; + + if (n_randint(state, 100) == 0) + len = n_randint(state, 50); + else if (n_randint(state, 10) == 0) + len = n_randint(state, 5); + else + len = n_randint(state, 3); + + acb_dot_gauss_dot_cutoff = 3 + n_randint(state, 30); + + if (n_randint(state, 10) != 0 || len > 10) + { + prec = 2 + n_randint(state, 500); + xbits = 2 + n_randint(state, 500); + ybits = 2 + n_randint(state, 500); + } + else + { + prec = 2 + n_randint(state, 5000); + xbits = 2 + n_randint(state, 5000); + ybits = 2 + n_randint(state, 5000); + } + + if (n_randint(state, 100) == 0) + ebits = 2 + n_randint(state, 100); + else + ebits = 2 + n_randint(state, 10); + + initial = n_randint(state, 2); + subtract = n_randint(state, 2); + revx = n_randint(state, 2); + revy = n_randint(state, 2); + + x = _acb_vec_init(len); + y = _acb_vec_init(len); + acb_init(s1); + acb_init(s2); + acb_init(z); + + switch (n_randint(state, 3)) + { + case 0: + for (i = 0; i < len; i++) + { + acb_randtest(x + i, state, xbits, ebits); + acb_randtest(y + i, state, ybits, ebits); + } + break; + + /* Test with cancellation */ + case 1: + for (i = 0; i < len; i++) + { + if (i <= len / 2) + { + acb_randtest(x + i, state, xbits, ebits); + acb_randtest(y + i, state, ybits, ebits); + } + else + { + acb_neg(x + i, x + len - i - 1); + acb_set(y + i, y + len - i - 1); + } + } + break; + + default: + for (i = 0; i < len; i++) + { + if (i <= len / 2) + { + acb_randtest(x + i, state, xbits, ebits); + acb_randtest(y + i, 
state, ybits, ebits); + } + else + { + acb_neg_round(x + i, x + len - i - 1, 2 + n_randint(state, 500)); + acb_set_round(y + i, y + len - i - 1, 2 + n_randint(state, 500)); + } + } + break; + } + + acb_randtest(s1, state, 200, 100); + acb_randtest(s2, state, 200, 100); + acb_randtest(z, state, xbits, ebits); + + acb_dot(s1, initial ? z : NULL, subtract, + revx ? (x + len - 1) : x, revx ? -1 : 1, + revy ? (y + len - 1) : y, revy ? -1 : 1, + len, prec); + + acb_dot_precise(s2, initial ? z : NULL, subtract, + revx ? (x + len - 1) : x, revx ? -1 : 1, + revy ? (y + len - 1) : y, revy ? -1 : 1, + len, ebits <= 12 ? ARF_PREC_EXACT : 2 * prec + 100); + + if (ebits <= 12) + ok = acb_contains(s1, s2); + else + ok = acb_overlaps(s1, s2); + + if (!ok) + { + flint_printf("FAIL\n\n"); + flint_printf("iter = %wd, len = %wd, prec = %wd, ebits = %wd, subtract = %d\n\n", iter, len, prec, ebits, subtract); + + if (initial) + { + flint_printf("z = ", i); acb_printn(z, 100, ARB_STR_MORE); flint_printf(" (%wd)\n\n", acb_bits(z)); + } + + for (i = 0; i < len; i++) + { + flint_printf("x[%wd] = ", i); acb_printn(x + i, 100, ARB_STR_MORE); flint_printf(" (%wd)\n", acb_bits(x + i)); + flint_printf("y[%wd] = ", i); acb_printn(y + i, 100, ARB_STR_MORE); flint_printf(" (%wd)\n", acb_bits(y + i)); + } + flint_printf("\n\n"); + flint_printf("s1 = "); acb_printn(s1, 100, ARB_STR_MORE); flint_printf("\n\n"); + flint_printf("s2 = "); acb_printn(s2, 100, ARB_STR_MORE); flint_printf("\n\n"); + flint_abort(); + } + + /* With the fast algorithm, we expect identical results when + reversing the vectors. */ + if (ebits <= 12) + { + revx ^= 1; + revy ^= 1; + + acb_dot(s2, initial ? z : NULL, subtract, + revx ? (x + len - 1) : x, revx ? -1 : 1, + revy ? (y + len - 1) : y, revy ? -1 : 1, + len, prec); + + if (!acb_equal(s1, s2)) + { + flint_printf("FAIL (reversal)\n\n"); + flint_printf("iter = %wd, len = %wd, prec = %wd, ebits = %wd\n\n", iter, len, prec, ebits); + + if (initial) + { + flint_printf("z = ", i); acb_printn(z, 100, ARB_STR_MORE); flint_printf(" (%wd)\n\n", acb_bits(z)); + } + + for (i = 0; i < len; i++) + { + flint_printf("x[%wd] = ", i); acb_printn(x + i, 100, ARB_STR_MORE); flint_printf(" (%wd)\n", acb_bits(x + i)); + flint_printf("y[%wd] = ", i); acb_printn(y + i, 100, ARB_STR_MORE); flint_printf(" (%wd)\n", acb_bits(y + i)); + } + flint_printf("\n\n"); + flint_printf("s1 = "); acb_printn(s1, 100, ARB_STR_MORE); flint_printf("\n\n"); + flint_printf("s2 = "); acb_printn(s2, 100, ARB_STR_MORE); flint_printf("\n\n"); + flint_abort(); + } + } + + acb_clear(s1); + acb_clear(s2); + acb_clear(z); + _acb_vec_clear(x, len); + _acb_vec_clear(y, len); + } + + flint_randclear(state); + flint_cleanup(); + flint_printf("PASS\n"); + return EXIT_SUCCESS; +} diff --git a/acb_poly/div_series.c b/acb_poly/div_series.c index 81e6d58b..419cef9a 100644 --- a/acb_poly/div_series.c +++ b/acb_poly/div_series.c @@ -44,7 +44,7 @@ _acb_poly_div_series(acb_ptr Q, acb_srcptr A, slong Alen, { /* The basecase algorithm is faster for much larger Blen and n than this, but unfortunately has worse numerical stability. */ - slong i, j; + slong i; acb_t q; acb_init(q); @@ -54,16 +54,8 @@ _acb_poly_div_series(acb_ptr Q, acb_srcptr A, slong Alen, for (i = 1; i < n; i++) { - acb_mul(Q + i, B + 1, Q + i - 1, prec); - - for (j = 2; j < FLINT_MIN(i + 1, Blen); j++) - acb_addmul(Q + i, B + j, Q + i - j, prec); - - if (i < Alen) - acb_sub(Q + i, A + i, Q + i, prec); - else - acb_neg(Q + i, Q + i); - + acb_dot(Q + i, (i < Alen) ? 
A + i : NULL, 1, + B + 1, 1, Q + i - 1, -1, FLINT_MIN(i, Blen - 1), prec); if (!acb_is_one(q)) acb_mul(Q + i, Q + i, q, prec); } diff --git a/acb_poly/exp_series.c b/acb_poly/exp_series.c index c69d99f9..414f0a05 100644 --- a/acb_poly/exp_series.c +++ b/acb_poly/exp_series.c @@ -12,7 +12,7 @@ #include "acb_poly.h" /* allow changing this from the test code */ -ARB_DLL slong acb_poly_newton_exp_cutoff = 120; +ARB_DLL slong acb_poly_newton_exp_cutoff = 0; /* with inverse=1 simultaneously computes g = exp(-x) to length n with inverse=0 uses g as scratch space, computing @@ -112,39 +112,51 @@ _acb_poly_exp_series(acb_ptr f, acb_srcptr h, slong hlen, slong n, slong prec) _acb_vec_zero(f + j - d + 1, n - (j - d + 1)); acb_clear(t); } - else if (hlen <= acb_poly_newton_exp_cutoff) - { - _acb_poly_exp_series_basecase(f, h, hlen, n, prec); - } else { - acb_ptr g, t; - acb_t u; - int fix; + slong cutoff; - g = _acb_vec_init((n + 1) / 2); - fix = (hlen < n || h == f || !acb_is_zero(h)); + if (acb_poly_newton_exp_cutoff != 0) + cutoff = acb_poly_newton_exp_cutoff; + else if (prec <= 256) + cutoff = 750; + else + cutoff = 1e5 / pow(log(prec), 3); - if (fix) + if (hlen <= cutoff) { - t = _acb_vec_init(n); - _acb_vec_set(t + 1, h + 1, hlen - 1); + _acb_poly_exp_series_basecase(f, h, hlen, n, prec); } else - t = (acb_ptr) h; + { + acb_ptr g, t; + acb_t u; + int fix; - acb_init(u); - acb_exp(u, h, prec); + g = _acb_vec_init((n + 1) / 2); + fix = (hlen < n || h == f || !acb_is_zero(h)); - _acb_poly_exp_series_newton(f, g, t, n, prec, 0, acb_poly_newton_exp_cutoff); + if (fix) + { + t = _acb_vec_init(n); + _acb_vec_set(t + 1, h + 1, hlen - 1); + } + else + t = (acb_ptr) h; - if (!acb_is_one(u)) - _acb_vec_scalar_mul(f, f, n, u, prec); + acb_init(u); + acb_exp(u, h, prec); - _acb_vec_clear(g, (n + 1) / 2); - if (fix) - _acb_vec_clear(t, n); - acb_clear(u); + _acb_poly_exp_series_newton(f, g, t, n, prec, 0, cutoff); + + if (!acb_is_one(u)) + _acb_vec_scalar_mul(f, f, n, u, prec); + + _acb_vec_clear(g, (n + 1) / 2); + if (fix) + _acb_vec_clear(t, n); + acb_clear(u); + } } } diff --git a/acb_poly/exp_series_basecase.c b/acb_poly/exp_series_basecase.c index e6eefb53..1de27418 100644 --- a/acb_poly/exp_series_basecase.c +++ b/acb_poly/exp_series_basecase.c @@ -11,13 +11,11 @@ #include "acb_poly.h" -#define MUL_CUTOFF 24 - static void _acb_poly_exp_series_basecase_rec(acb_ptr f, acb_ptr a, acb_srcptr h, slong hlen, slong n, slong prec) { - slong j, k; + slong k; acb_t s; acb_init(s); @@ -29,10 +27,7 @@ _acb_poly_exp_series_basecase_rec(acb_ptr f, acb_ptr a, for (k = 1; k < n; k++) { - acb_zero(s); - for (j = 1; j < FLINT_MIN(k + 1, hlen); j++) - acb_addmul(s, a + j, f + k - j, prec); - + acb_dot(s, NULL, 0, a + 1, 1, f + k - 1, -1, FLINT_MIN(k, hlen - 1), prec); acb_div_ui(f + k, s, k, prec); } @@ -45,7 +40,7 @@ _acb_poly_exp_series_basecase(acb_ptr f, { hlen = FLINT_MIN(n, hlen); - if (n < MUL_CUTOFF || hlen < 0.9 * n) + if (n < 20 || hlen < 0.9 * n || prec <= 2 * FLINT_BITS || n < 1000.0 / log(prec + 10) - 70) { acb_ptr t = _acb_vec_init(hlen); _acb_poly_exp_series_basecase_rec(f, t, h, hlen, n, prec); diff --git a/acb_poly/inv_series.c b/acb_poly/inv_series.c index 9fbb4d92..1872e5a5 100644 --- a/acb_poly/inv_series.c +++ b/acb_poly/inv_series.c @@ -37,7 +37,7 @@ _acb_poly_inv_series(acb_ptr Qinv, } else { - slong i, j, blen; + slong i, blen; /* The basecase algorithm is faster for much larger Qlen or len than this, but unfortunately also much less numerically stable. 
*/ @@ -48,15 +48,10 @@ _acb_poly_inv_series(acb_ptr Qinv, for (i = 1; i < blen; i++) { - acb_mul(Qinv + i, Q + 1, Qinv + i - 1, prec); - - for (j = 2; j < FLINT_MIN(i + 1, Qlen); j++) - acb_addmul(Qinv + i, Q + j, Qinv + i - j, prec); - + acb_dot(Qinv + i, NULL, 1, + Q + 1, 1, Qinv + i - 1, -1, FLINT_MIN(i, Qlen - 1), prec); if (!acb_is_one(Qinv)) acb_mul(Qinv + i, Qinv + i, Qinv, prec); - - acb_neg(Qinv + i, Qinv + i); } if (len > blen) @@ -116,4 +111,3 @@ acb_poly_inv_series(acb_poly_t Qinv, const acb_poly_t Q, slong n, slong prec) _acb_poly_set_length(Qinv, n); _acb_poly_normalise(Qinv); } - diff --git a/acb_poly/mullow.c b/acb_poly/mullow.c index 75002552..ac034c0e 100644 --- a/acb_poly/mullow.c +++ b/acb_poly/mullow.c @@ -11,17 +11,46 @@ #include "acb_poly.h" -#define CUTOFF 4 - void _acb_poly_mullow(acb_ptr res, acb_srcptr poly1, slong len1, acb_srcptr poly2, slong len2, slong n, slong prec) { - if (n < CUTOFF || len1 < CUTOFF || len2 < CUTOFF) + if (n == 1) + { + acb_mul(res, poly1, poly2, prec); + } + else if (n <= 7 || len1 <= 7 || len2 <= 7) + { _acb_poly_mullow_classical(res, poly1, len1, poly2, len2, n, prec); + } else - _acb_poly_mullow_transpose(res, poly1, len1, poly2, len2, n, prec); + { + slong cutoff; + double p; + + if (prec <= 2 * FLINT_BITS) + { + cutoff = 110; + } + else + { + p = log(prec); + + cutoff = 10000.0 / (p * p * p); + cutoff = FLINT_MIN(cutoff, 60); + if (poly1 == poly2 && prec >= 256) + cutoff *= 1.25; + if (poly1 == poly2 && prec >= 4096) + cutoff *= 1.25; + cutoff = FLINT_MAX(cutoff, 8); + } + + if (2 * FLINT_MIN(len1, len2) <= cutoff || n <= cutoff) + _acb_poly_mullow_classical(res, poly1, len1, poly2, len2, n, prec); + else + _acb_poly_mullow_transpose(res, poly1, len1, poly2, len2, n, prec); + } } void diff --git a/acb_poly/mullow_classical.c b/acb_poly/mullow_classical.c index 316415bb..11465bf0 100644 --- a/acb_poly/mullow_classical.c +++ b/acb_poly/mullow_classical.c @@ -26,35 +26,55 @@ _acb_poly_mullow_classical(acb_ptr res, } else if (poly1 == poly2 && len1 == len2) { - slong i; + slong i, start, stop; - _acb_vec_scalar_mul(res, poly1, FLINT_MIN(len1, n), poly1, prec); - _acb_vec_scalar_mul(res + len1, poly1 + 1, n - len1, poly1 + len1 - 1, prec); + acb_sqr(res, poly1, prec); + acb_mul(res + 1, poly1, poly1 + 1, prec); + acb_mul_2exp_si(res + 1, res + 1, 1); - for (i = 1; i < len1 - 1; i++) - _acb_vec_scalar_addmul(res + i + 1, poly1 + 1, - FLINT_MIN(i - 1, n - (i + 1)), poly1 + i, prec); + for (i = 2; i < FLINT_MIN(n, 2 * len1 - 3); i++) + { + start = FLINT_MAX(0, i - len1 + 1); + stop = FLINT_MIN(len1 - 1, (i + 1) / 2 - 1); - for (i = 1; i < FLINT_MIN(2 * len1 - 2, n); i++) + acb_dot(res + i, NULL, 0, poly1 + start, 1, + poly1 + i - start, -1, stop - start + 1, prec); acb_mul_2exp_si(res + i, res + i, 1); + if (i % 2 == 0 && i / 2 < len1) + acb_addmul(res + i, poly1 + i / 2, poly1 + i / 2, prec); + } - for (i = 1; i < FLINT_MIN(len1 - 1, (n + 1) / 2); i++) - acb_addmul(res + 2 * i, poly1 + i, poly1 + i, prec); + if (len1 > 2 && n >= 2 * len1 - 2) + { + acb_mul(res + 2 * len1 - 3, poly1 + len1 - 1, poly1 + len1 - 2, prec); + acb_mul_2exp_si(res + 2 * len1 - 3, res + 2 * len1 - 3, 1); + } + + if (n >= 2 * len1 - 1) + acb_sqr(res + 2 * len1 - 2, poly1 + len1 - 1, prec); + } + else if (len1 == 1) + { + _acb_vec_scalar_mul(res, poly2, n, poly1, prec); + } + else if (len2 == 1) + { + _acb_vec_scalar_mul(res, poly1, n, poly2, prec); } else { - slong i; + slong i, top1, top2; - _acb_vec_scalar_mul(res, poly1, FLINT_MIN(len1, n), poly2, prec); + 
acb_mul(res, poly1, poly2, prec); - if (n > len1) - _acb_vec_scalar_mul(res + len1, poly2 + 1, n - len1, - poly1 + len1 - 1, prec); + for (i = 1; i < n; i++) + { + top1 = FLINT_MIN(len1 - 1, i); + top2 = FLINT_MIN(len2 - 1, i); - for (i = 0; i < FLINT_MIN(len1, n) - 1; i++) - _acb_vec_scalar_addmul(res + i + 1, poly2 + 1, - FLINT_MIN(len2, n - i) - 1, - poly1 + i, prec); + acb_dot(res + i, NULL, 0, poly1 + i - top2, 1, + poly2 + top2, -1, top1 + top2 - i + 1, prec); + } } } @@ -94,4 +114,3 @@ acb_poly_mullow_classical(acb_poly_t res, const acb_poly_t poly1, _acb_poly_set_length(res, n); _acb_poly_normalise(res); } - diff --git a/acb_poly/sin_cos_pi_series.c b/acb_poly/sin_cos_pi_series.c index 846cd263..58d48dce 100644 --- a/acb_poly/sin_cos_pi_series.c +++ b/acb_poly/sin_cos_pi_series.c @@ -11,10 +11,8 @@ #include "acb_poly.h" -#define TANGENT_CUTOFF 80 - void -_acb_poly_sin_cos_pi_series(acb_ptr s, acb_ptr c, const acb_srcptr h, slong hlen, slong n, slong prec) +_acb_poly_sin_cos_pi_series(acb_ptr s, acb_ptr c, acb_srcptr h, slong hlen, slong n, slong prec) { hlen = FLINT_MIN(hlen, n); @@ -36,10 +34,25 @@ _acb_poly_sin_cos_pi_series(acb_ptr s, acb_ptr c, const acb_srcptr h, slong hlen acb_mul(c + 1, s, t, prec); acb_clear(t); } - else if (hlen < TANGENT_CUTOFF) - _acb_poly_sin_cos_series_basecase(s, c, h, hlen, n, prec, 1); else - _acb_poly_sin_cos_series_tangent(s, c, h, hlen, n, prec, 1); + { + slong cutoff; + + if (prec <= 128) + { + cutoff = 1400; + } + else + { + cutoff = 100000 / pow(log(prec), 3); + cutoff = FLINT_MIN(cutoff, 700); + } + + if (hlen < cutoff) + _acb_poly_sin_cos_series_basecase(s, c, h, hlen, n, prec, 1); + else + _acb_poly_sin_cos_series_tangent(s, c, h, hlen, n, prec, 1); + } } void @@ -73,4 +86,3 @@ acb_poly_sin_cos_pi_series(acb_poly_t s, acb_poly_t c, _acb_poly_set_length(c, n); _acb_poly_normalise(c); } - diff --git a/acb_poly/sin_cos_series.c b/acb_poly/sin_cos_series.c index 42b5c733..6557ab74 100644 --- a/acb_poly/sin_cos_series.c +++ b/acb_poly/sin_cos_series.c @@ -11,10 +11,8 @@ #include "acb_poly.h" -#define TANGENT_CUTOFF 80 - void -_acb_poly_sin_cos_series(acb_ptr s, acb_ptr c, const acb_srcptr h, slong hlen, slong n, slong prec) +_acb_poly_sin_cos_series(acb_ptr s, acb_ptr c, acb_srcptr h, slong hlen, slong n, slong prec) { hlen = FLINT_MIN(hlen, n); @@ -35,10 +33,25 @@ _acb_poly_sin_cos_series(acb_ptr s, acb_ptr c, const acb_srcptr h, slong hlen, s acb_mul(c + 1, s, t, prec); acb_clear(t); } - else if (hlen < TANGENT_CUTOFF) - _acb_poly_sin_cos_series_basecase(s, c, h, hlen, n, prec, 0); else - _acb_poly_sin_cos_series_tangent(s, c, h, hlen, n, prec, 0); + { + slong cutoff; + + if (prec <= 128) + { + cutoff = 1400; + } + else + { + cutoff = 100000 / pow(log(prec), 3); + cutoff = FLINT_MIN(cutoff, 700); + } + + if (hlen < cutoff) + _acb_poly_sin_cos_series_basecase(s, c, h, hlen, n, prec, 0); + else + _acb_poly_sin_cos_series_tangent(s, c, h, hlen, n, prec, 0); + } } void diff --git a/acb_poly/sin_cos_series_basecase.c b/acb_poly/sin_cos_series_basecase.c index 9af63a9b..3b44649a 100644 --- a/acb_poly/sin_cos_series_basecase.c +++ b/acb_poly/sin_cos_series_basecase.c @@ -15,7 +15,7 @@ void _acb_poly_sin_cos_series_basecase(acb_ptr s, acb_ptr c, acb_srcptr h, slong hlen, slong n, slong prec, int times_pi) { - slong j, k, alen = FLINT_MIN(n, hlen); + slong k, alen = FLINT_MIN(n, hlen); acb_ptr a; acb_t t, u; @@ -46,15 +46,8 @@ _acb_poly_sin_cos_series_basecase(acb_ptr s, acb_ptr c, acb_srcptr h, slong hlen for (k = 1; k < n; k++) { - acb_zero(t); - 
acb_zero(u); - - for (j = 1; j < FLINT_MIN(k + 1, hlen); j++) - { - acb_submul(t, a + j, s + k - j, prec); - acb_addmul(u, a + j, c + k - j, prec); - } - + acb_dot(t, NULL, 1, a + 1, 1, s + k - 1, -1, FLINT_MIN(k, hlen - 1), prec); + acb_dot(u, NULL, 0, a + 1, 1, c + k - 1, -1, FLINT_MIN(k, hlen - 1), prec); acb_div_ui(c + k, t, k, prec); acb_div_ui(s + k, u, k, prec); } @@ -92,4 +85,3 @@ acb_poly_sin_cos_series_basecase(acb_poly_t s, acb_poly_t c, _acb_poly_set_length(c, n); _acb_poly_normalise(c); } - diff --git a/acb_poly/sinh_cosh_series.c b/acb_poly/sinh_cosh_series.c index ff3accce..d2d82233 100644 --- a/acb_poly/sinh_cosh_series.c +++ b/acb_poly/sinh_cosh_series.c @@ -12,7 +12,7 @@ #include "acb_poly.h" void -_acb_poly_sinh_cosh_series(acb_ptr s, acb_ptr c, const acb_srcptr h, slong hlen, slong n, slong prec) +_acb_poly_sinh_cosh_series(acb_ptr s, acb_ptr c, acb_srcptr h, slong hlen, slong n, slong prec) { hlen = FLINT_MIN(hlen, n); @@ -32,10 +32,20 @@ _acb_poly_sinh_cosh_series(acb_ptr s, acb_ptr c, const acb_srcptr h, slong hlen, acb_mul(c + 1, s, t, prec); acb_clear(t); } - else if (hlen < 60 || n < 120) - _acb_poly_sinh_cosh_series_basecase(s, c, h, hlen, n, prec); else - _acb_poly_sinh_cosh_series_exponential(s, c, h, hlen, n, prec); + { + slong cutoff; + + if (prec <= 128) + cutoff = 400; + else + cutoff = 30000 / pow(log(prec), 3); + + if (hlen < cutoff) + _acb_poly_sinh_cosh_series_basecase(s, c, h, hlen, n, prec); + else + _acb_poly_sinh_cosh_series_exponential(s, c, h, hlen, n, prec); + } } void @@ -69,4 +79,3 @@ acb_poly_sinh_cosh_series(acb_poly_t s, acb_poly_t c, _acb_poly_set_length(c, n); _acb_poly_normalise(c); } - diff --git a/acb_poly/sinh_cosh_series_basecase.c b/acb_poly/sinh_cosh_series_basecase.c index 0e3b393b..9e98f28a 100644 --- a/acb_poly/sinh_cosh_series_basecase.c +++ b/acb_poly/sinh_cosh_series_basecase.c @@ -15,7 +15,7 @@ void _acb_poly_sinh_cosh_series_basecase(acb_ptr s, acb_ptr c, acb_srcptr h, slong hlen, slong n, slong prec) { - slong j, k, alen = FLINT_MIN(n, hlen); + slong k, alen = FLINT_MIN(n, hlen); acb_ptr a; acb_t t, u; @@ -37,15 +37,8 @@ _acb_poly_sinh_cosh_series_basecase(acb_ptr s, acb_ptr c, acb_srcptr h, slong hl for (k = 1; k < n; k++) { - acb_zero(t); - acb_zero(u); - - for (j = 1; j < FLINT_MIN(k + 1, hlen); j++) - { - acb_addmul(t, a + j, s + k - j, prec); - acb_addmul(u, a + j, c + k - j, prec); - } - + acb_dot(t, NULL, 0, a + 1, 1, s + k - 1, -1, FLINT_MIN(k, hlen - 1), prec); + acb_dot(u, NULL, 0, a + 1, 1, c + k - 1, -1, FLINT_MIN(k, hlen - 1), prec); acb_div_ui(c + k, t, k, prec); acb_div_ui(s + k, u, k, prec); } @@ -83,4 +76,3 @@ acb_poly_sinh_cosh_series_basecase(acb_poly_t s, acb_poly_t c, _acb_poly_set_length(c, n); _acb_poly_normalise(c); } - diff --git a/arb/test/t-dot.c b/arb/test/t-dot.c index 9c5e2c67..58230145 100644 --- a/arb/test/t-dot.c +++ b/arb/test/t-dot.c @@ -35,7 +35,7 @@ int main() else len = n_randint(state, 3); - if (n_randint(state, 100) == 0 || len > 10) + if (n_randint(state, 10) != 0 || len > 10) { prec = 2 + n_randint(state, 500); xbits = 2 + n_randint(state, 500); diff --git a/doc/source/acb.rst b/doc/source/acb.rst index ecb8760e..f7f0136e 100644 --- a/doc/source/acb.rst +++ b/doc/source/acb.rst @@ -499,6 +499,46 @@ Arithmetic Sets *z* to the quotient of *x* and *y*. +Dot product +------------------------------------------------------------------------------- + +.. 
function:: void acb_dot_precise(acb_t res, const acb_t s, int subtract, acb_srcptr x, slong xstep, acb_srcptr y, slong ystep, slong len, slong prec)
+
+.. function:: void acb_dot_simple(acb_t res, const acb_t s, int subtract, acb_srcptr x, slong xstep, acb_srcptr y, slong ystep, slong len, slong prec)
+
+.. function:: void acb_dot(acb_t res, const acb_t s, int subtract, acb_srcptr x, slong xstep, acb_srcptr y, slong ystep, slong len, slong prec)
+
+    Computes the dot product of the vectors *x* and *y*, setting
+    *res* to `s + (-1)^{subtract} \sum_{i=0}^{len-1} x_i y_i`.
+
+    The initial term *s* is optional and can be
+    omitted by passing *NULL* (equivalently, `s = 0`).
+    The parameter *subtract* must be 0 or 1.
+    The length *len* is allowed to be negative, which is equivalent
+    to a length of zero.
+    The parameters *xstep* and *ystep* specify step lengths for
+    traversing subsequences of the vectors *x* and *y*; either can be
+    negative to step in the reverse direction starting from
+    the initial pointer.
+    Aliasing is allowed between *res* and *s* but not between
+    *res* and the entries of *x* and *y*.
+
+    The default version determines the optimal precision for each term
+    and performs all internal calculations using mpn arithmetic
+    with minimal overhead. This is the preferred way to compute a
+    dot product; it is generally much faster and more precise
+    than a simple loop.
+
+    The *simple* version performs fused multiply-add operations in
+    a simple loop. This can be used for
+    testing purposes and is also used as a fallback by the
+    default version when the exponents are out of range
+    for the optimized code.
+
+    The *precise* version computes the dot product exactly up to the
+    final rounding. This can be extremely slow and is only intended
+    for testing.
+
 Mathematical constants
 -------------------------------------------------------------------------------
diff --git a/doc/source/arb.rst b/doc/source/arb.rst
index 44064dd4..c3a87bb2 100644
--- a/doc/source/arb.rst
+++ b/doc/source/arb.rst
@@ -782,7 +782,7 @@ Arithmetic
     Sets `z = x / (2^n - 1)`, rounded to *prec* bits.
 
-Sum and dot product
+Dot product
 -------------------------------------------------------------------------------
 
 .. function:: void arb_dot_precise(arb_t res, const arb_t s, int subtract, arb_srcptr x, slong xstep, arb_srcptr y, slong ystep, slong len, slong prec)
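As a usage sketch of the new interface (an editor's illustration, not part of the patch: the helper name `poly_eval_by_dot` is hypothetical, while every function it calls is existing Arb API), the following evaluates `p(z) = \sum_{i=0}^{len-1} c_i z^i` by materializing the powers of *z* and collapsing the sum into a single `acb_dot` call, so that the whole sum is accumulated with one rounding/error pass instead of one per `acb_addmul`::

    #include "acb.h"

    /* Hypothetical helper: res = sum_{i=0}^{len-1} c[i] * z^i. */
    void
    poly_eval_by_dot(acb_t res, acb_srcptr c, slong len, const acb_t z, slong prec)
    {
        acb_ptr zpow;
        slong i;

        if (len <= 0)
        {
            acb_zero(res);
            return;
        }

        zpow = _acb_vec_init(len);

        acb_one(zpow);                                  /* z^0 */
        for (i = 1; i < len; i++)
            acb_mul(zpow + i, zpow + i - 1, z, prec);   /* z^i */

        /* initial = NULL (s = 0), subtract = 0, unit strides */
        acb_dot(res, NULL, 0, c, 1, zpow, 1, len, prec);

        _acb_vec_clear(zpow, len);
    }

The negative-stride form used throughout this patch follows the same pattern: a call such as `acb_dot(s, NULL, 0, a + 1, 1, f + k - 1, -1, FLINT_MIN(k, hlen - 1), prec)` (as in *exp_series_basecase.c* above) walks *a* forward and *f* backward, forming the convolution term `\sum_j a_{1+j} f_{k-1-j}` in a single call.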