approximate dot product and matrix multiplication

2025-03-05 09:21:38 -05:00 · 2018-09-07 18:04:30 +02:00 · 2018-09-07 18:04:30 +02:00 · 7c98883478
commit 7c98883478
parent fb9a514763
24 changed files with 2276 additions and 299 deletions
--- a/acb.h
+++ b/acb.h
@ -651,6 +651,8 @@ void acb_dot_precise(acb_t res, const acb_t initial, int subtract,
 void acb_dot(acb_t res, const acb_t initial, int subtract,
    acb_srcptr x, slong xstep, acb_srcptr y, slong ystep, slong len, slong prec);

+void acb_approx_dot(acb_t res, const acb_t initial, int subtract,
+    acb_srcptr x, slong xstep, acb_srcptr y, slong ystep, slong len, slong prec);

 void acb_inv(acb_t z, const acb_t x, slong prec);

--- a/acb/approx_dot.c
+++ b/acb/approx_dot.c
@ -0,0 +1,772 @@
+/*
+    Copyright (C) 2018 Fredrik Johansson
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "acb.h"
+
+/* We need uint64_t instead of mp_limb_t on 32-bit systems for
+   safe summation of 30-bit error bounds. */
+#include <stdint.h>
+
+/* The following macros are found in FLINT's longlong.h, but
+   the release version is out of date. */
+
+/* x86 : 64 bit */
+#if (GMP_LIMB_BITS == 64 && defined (__amd64__))
+
+#define add_sssaaaaaa2(sh, sm, sl, ah, am, al, bh, bm, bl)  \
+  __asm__ ("addq %8,%q2\n\tadcq %6,%q1\n\tadcq %4,%q0"     \
+       : "=r" (sh), "=&r" (sm), "=&r" (sl)                  \
+       : "0"  ((mp_limb_t)(ah)), "rme" ((mp_limb_t)(bh)),  \
+         "1"  ((mp_limb_t)(am)), "rme" ((mp_limb_t)(bm)),  \
+         "2"  ((mp_limb_t)(al)), "rme" ((mp_limb_t)(bl)))  \
+
+#define sub_dddmmmsss2(dh, dm, dl, mh, mm, ml, sh, sm, sl)  \
+  __asm__ ("subq %8,%q2\n\tsbbq %6,%q1\n\tsbbq %4,%q0"     \
+       : "=r" (dh), "=&r" (dm), "=&r" (dl)                  \
+       : "0"  ((mp_limb_t)(mh)), "rme" ((mp_limb_t)(sh)),  \
+         "1"  ((mp_limb_t)(mm)), "rme" ((mp_limb_t)(sm)),  \
+"2" ((mp_limb_t)(ml)), "rme" ((mp_limb_t)(sl))) \
+
+#endif /* x86_64 */
+
+/* x86 : 32 bit */
+#if (GMP_LIMB_BITS == 32 && (defined (__i386__) \
+   || defined (__i486__) || defined(__amd64__)))
+
+#define add_sssaaaaaa2(sh, sm, sl, ah, am, al, bh, bm, bl)  \
+  __asm__ ("addl %8,%k2\n\tadcl %6,%k1\n\tadcl %4,%k0"     \
+       : "=r" (sh), "=r" (sm), "=&r" (sl)                  \
+       : "0"  ((mp_limb_t)(ah)), "g" ((mp_limb_t)(bh)),    \
+         "1"  ((mp_limb_t)(am)), "g" ((mp_limb_t)(bm)),    \
+         "2"  ((mp_limb_t)(al)), "g" ((mp_limb_t)(bl)))    \
+
+#define sub_dddmmmsss2(dh, dm, dl, mh, mm, ml, sh, sm, sl)  \
+  __asm__ ("subl %8,%k2\n\tsbbl %6,%k1\n\tsbbl %4,%k0"     \
+       : "=r" (dh), "=r" (dm), "=&r" (dl)                  \
+       : "0"  ((mp_limb_t)(mh)), "g" ((mp_limb_t)(sh)),    \
+         "1"  ((mp_limb_t)(mm)), "g" ((mp_limb_t)(sm)),    \
+         "2"  ((mp_limb_t)(ml)), "g" ((mp_limb_t)(sl)))    \
+
+#endif /* x86 */
+
+
+#if !defined(add_sssaaaaaa2)
+
+#define add_sssaaaaaa2(sh, sm, sl, ah, am, al, bh, bm, bl)           \
+  do {                                                              \
+    mp_limb_t __t, __u;                                             \
+    add_ssaaaa(__t, sl, (mp_limb_t) 0, al, (mp_limb_t) 0, bl);      \
+    add_ssaaaa(__u, sm, (mp_limb_t) 0, am, (mp_limb_t) 0, bm);      \
+    add_ssaaaa(sh, sm, ah + bh, sm, __u, __t);                      \
+} while (0)
+
+#define sub_dddmmmsss2(dh, dm, dl, mh, mm, ml, sh, sm, sl)           \
+  do {                                                              \
+    mp_limb_t __t, __u;                                             \
+    sub_ddmmss(__t, dl, (mp_limb_t) 0, ml, (mp_limb_t) 0, sl);      \
+    sub_ddmmss(__u, dm, (mp_limb_t) 0, mm, (mp_limb_t) 0, sm);      \
+    sub_ddmmss(dh, dm, mh - sh, dm, __u, __t);                      \
+  } while (0)
+
+#endif
+
+void
+_arb_dot_addmul_generic(mp_ptr sum, mp_ptr serr, mp_ptr tmp, mp_size_t sn,
+    mp_srcptr xptr, mp_size_t xn, mp_srcptr yptr, mp_size_t yn,
+    int negative, mp_bitcnt_t shift);
+
+void
+_arb_dot_add_generic(mp_ptr sum, mp_ptr serr, mp_ptr tmp, mp_size_t sn,
+    mp_srcptr xptr, mp_size_t xn,
+    int negative, mp_bitcnt_t shift);
+
+static void
+_arb_dot_output(arb_t res, mp_ptr sum, mp_size_t sn, int negative,
+    slong sum_exp, slong prec)
+{
+    slong exp_fix;
+
+    if (sum[sn - 1] >= LIMB_TOP)
+    {
+        mpn_neg(sum, sum, sn);
+        negative ^= 1;
+    }
+
+    exp_fix = 0;
+
+    if (sum[sn - 1] == 0)
+    {
+        slong sum_exp2;
+        mp_size_t sn2;
+
+        sn2 = sn;
+        sum_exp2 = sum_exp; 
+
+        while (sn2 > 0 && sum[sn2 - 1] == 0)
+        {
+            sum_exp2 -= FLINT_BITS;
+            sn2--;
+        }
+
+        if (sn2 == 0)
+        {
+            arf_zero(arb_midref(res));
+        }
+        else
+        {
+            _arf_set_round_mpn(arb_midref(res), &exp_fix, sum, sn2, negative, prec, ARF_RND_DOWN);
+            _fmpz_set_si_small(ARF_EXPREF(arb_midref(res)), exp_fix + sum_exp2);
+        }
+    }
+    else
+    {
+        if (sn == 2)  /* unnecessary? */
+            _arf_set_round_uiui(arb_midref(res), &exp_fix, sum[1], sum[0], negative, prec, ARF_RND_DOWN);
+        else
+            _arf_set_round_mpn(arb_midref(res), &exp_fix, sum, sn, negative, prec, ARF_RND_DOWN);
+
+        _fmpz_set_si_small(ARF_EXPREF(arb_midref(res)), exp_fix + sum_exp);
+    }
+}
+
+/* xxx: don't use surrounding variables */
+#define ARB_DOT_ADD(s_sum, s_serr, s_sn, s_sum_exp, s_subtract, xm) \
+    if (!arf_is_special(xm)) \
+    { \
+        mp_srcptr xptr; \
+        xexp = ARF_EXP(xm); \
+        xn = ARF_SIZE(xm); \
+        xnegative = ARF_SGNBIT(xm); \
+        shift = s_sum_exp - xexp; \
+        if (shift >= s_sn * FLINT_BITS) \
+        { \
+        } \
+        else \
+        { \
+            xptr = (xn <= ARF_NOPTR_LIMBS) ? ARF_NOPTR_D(xm) : ARF_PTR_D(xm); \
+            _arb_dot_add_generic(s_sum, &s_serr, tmp, s_sn, xptr, xn, xnegative ^ s_subtract, shift); \
+        } \
+    } \
+
+static void
+_arf_complex_mul_gauss(arf_t e, arf_t f, const arf_t a, const arf_t b,
+                                         const arf_t c, const arf_t d)
+{
+    mp_srcptr ap, bp, cp, dp;
+    int asgn, bsgn, csgn, dsgn;
+    mp_size_t an, bn, cn, dn;
+    slong aexp, bexp, cexp, dexp;
+    fmpz texp, uexp;
+
+    fmpz_t za, zb, zc, zd, t, u, v;
+    slong abot, bbot, cbot, dbot;
+
+    ARF_GET_MPN_READONLY(ap, an, a);
+    asgn = ARF_SGNBIT(a);
+    aexp = ARF_EXP(a);
+
+    ARF_GET_MPN_READONLY(bp, bn, b);
+    bsgn = ARF_SGNBIT(b);
+    bexp = ARF_EXP(b);
+
+    ARF_GET_MPN_READONLY(cp, cn, c);
+    csgn = ARF_SGNBIT(c);
+    cexp = ARF_EXP(c);
+
+    ARF_GET_MPN_READONLY(dp, dn, d);
+    dsgn = ARF_SGNBIT(d);
+    dexp = ARF_EXP(d);
+
+    /* Gauss multiplication
+        e = ac - bd
+        f = (a+b)(c+d) - ac - bd */
+
+    abot = aexp - an * FLINT_BITS;
+    bbot = bexp - bn * FLINT_BITS;
+    cbot = cexp - cn * FLINT_BITS;
+    dbot = dexp - dn * FLINT_BITS;
+
+    texp = FLINT_MIN(abot, bbot);
+    uexp = FLINT_MIN(cbot, dbot);
+
+    fmpz_init(za);
+    fmpz_init(zb);
+    fmpz_init(zc);
+    fmpz_init(zd);
+    fmpz_init(t);
+    fmpz_init(u);
+    fmpz_init(v);
+
+    fmpz_lshift_mpn(za, ap, an, asgn, abot - texp);
+    fmpz_lshift_mpn(zb, bp, bn, bsgn, bbot - texp);
+    fmpz_lshift_mpn(zc, cp, cn, csgn, cbot - uexp);
+    fmpz_lshift_mpn(zd, dp, dn, dsgn, dbot - uexp);
+
+    fmpz_add(t, za, zb);
+    fmpz_add(v, zc, zd);
+    fmpz_mul(u, t, v);
+    fmpz_mul(t, za, zc);
+    fmpz_mul(v, zb, zd);
+    fmpz_sub(u, u, t);
+    fmpz_sub(u, u, v);
+    fmpz_sub(t, t, v);
+
+    texp += uexp;
+    arf_set_fmpz_2exp(e, t, &texp);
+    arf_set_fmpz_2exp(f, u, &texp);
+
+    fmpz_clear(za);
+    fmpz_clear(zb);
+    fmpz_clear(zc);
+    fmpz_clear(zd);
+    fmpz_clear(t);
+    fmpz_clear(u);
+    fmpz_clear(v);
+}
+
+ARB_DLL extern slong acb_dot_gauss_dot_cutoff;
+#define GAUSS_CUTOFF acb_dot_gauss_dot_cutoff
+
+void
+acb_approx_dot_simple(acb_t res, const acb_t initial, int subtract,
+    acb_srcptr x, slong xstep, acb_srcptr y, slong ystep, slong len, slong prec)
+{
+    slong i;
+
+    if (len <= 0)
+    {
+        if (initial == NULL)
+        {
+            arf_zero(arb_midref(acb_realref(res)));
+            arf_zero(arb_midref(acb_imagref(res)));
+        }
+        else
+        {
+            arf_set_round(arb_midref(acb_realref(res)), arb_midref(acb_realref(initial)), prec, ARB_RND);
+            arf_set_round(arb_midref(acb_imagref(res)), arb_midref(acb_imagref(initial)), prec, ARB_RND);
+        }
+        return;
+    }
+
+    if (initial == NULL && len == 1)
+    {
+        arf_complex_mul(arb_midref(acb_realref(res)),
+                        arb_midref(acb_imagref(res)),
+                        arb_midref(acb_realref(x)),
+                        arb_midref(acb_imagref(x)),
+                        arb_midref(acb_realref(y)),
+                        arb_midref(acb_imagref(y)), prec, ARB_RND);
+    }
+    else
+    {
+        arf_t e, f;
+
+        arf_init(e);
+        arf_init(f);
+
+        if (initial != NULL)
+        {
+            if (subtract)
+            {
+                arf_neg(arb_midref(acb_realref(res)), arb_midref(acb_realref(initial)));
+                arf_neg(arb_midref(acb_imagref(res)), arb_midref(acb_imagref(initial)));
+            }
+            else
+            {
+                arf_set(arb_midref(acb_realref(res)), arb_midref(acb_realref(initial)));
+                arf_set(arb_midref(acb_imagref(res)), arb_midref(acb_imagref(initial)));
+            }
+        }
+
+        for (i = 0; i < len; i++)
+        {
+            arf_complex_mul(e, f,
+                            arb_midref(acb_realref(x + i * xstep)),
+                            arb_midref(acb_imagref(x + i * xstep)),
+                            arb_midref(acb_realref(y + i * ystep)),
+                            arb_midref(acb_imagref(y + i * ystep)), prec, ARB_RND);
+
+
+            if (i == 0 && initial == NULL)
+            {
+                arf_set(arb_midref(acb_realref(res)), e);
+                arf_set(arb_midref(acb_imagref(res)), f);
+            }
+            else
+            {
+                arf_add(arb_midref(acb_realref(res)), arb_midref(acb_realref(res)), e, prec, ARB_RND);
+                arf_add(arb_midref(acb_imagref(res)), arb_midref(acb_imagref(res)), f, prec, ARB_RND);
+            }
+        }
+
+        arf_clear(e);
+        arf_clear(f);
+    }
+
+    if (subtract)
+    {
+        arf_neg(arb_midref(acb_realref(res)), arb_midref(acb_realref(res)));
+        arf_neg(arb_midref(acb_imagref(res)), arb_midref(acb_imagref(res)));
+    }
+}
+
+void
+acb_approx_dot(acb_t res, const acb_t initial, int subtract, acb_srcptr x, slong xstep, acb_srcptr y, slong ystep, slong len, slong prec)
+{
+    slong i, j, padding, extend;
+    slong xexp, yexp, exp;
+    slong re_nonzero, im_nonzero;
+    slong re_max_exp, re_min_exp, re_sum_exp;
+    slong im_max_exp, im_min_exp, im_sum_exp;
+    slong re_prec, im_prec;
+    int xnegative, ynegative;
+    mp_size_t xn, yn, re_sn, im_sn, alloc;
+    mp_bitcnt_t shift;
+    arb_srcptr xi, yi;
+    arf_srcptr xm, ym;
+    mp_limb_t re_serr, im_serr;   /* Sum over arithmetic errors */
+    mp_ptr tmp, re_sum, im_sum;   /* Workspace */
+    slong xoff, yoff;
+    char * use_gauss;
+    ARF_ADD_TMP_DECL;
+
+    /* todo: fast fma and fmma (len=2) code */
+    if (len <= 1)
+    {
+        acb_approx_dot_simple(res, initial, subtract, x, xstep, y, ystep, len, prec);
+        return;
+    }
+
+    /* Number of nonzero midpoint terms in sum. */
+    re_nonzero = 0;
+    im_nonzero = 0;
+
+    /* Terms are bounded by 2^max_exp (with WORD_MIN = -infty) */
+    re_max_exp = WORD_MIN;
+    im_max_exp = WORD_MIN;
+
+    /* Used to reduce the precision. */
+    re_min_exp = WORD_MAX;
+    im_min_exp = WORD_MAX;
+
+    /* Account for the initial term. */
+    if (initial != NULL)
+    {
+        if (!ARF_IS_LAGOM(arb_midref(acb_realref(initial))) || !ARF_IS_LAGOM(arb_midref(acb_imagref(initial))))
+        {
+            acb_approx_dot_simple(res, initial, subtract, x, xstep, y, ystep, len, prec);
+            return;
+        }
+
+        xm = arb_midref(acb_realref(initial));
+
+        if (!arf_is_special(xm))
+        {
+            re_max_exp = ARF_EXP(xm);
+            re_nonzero++;
+
+            if (prec > 2 * FLINT_BITS)
+                re_min_exp = ARF_EXP(xm) - ARF_SIZE(xm) * FLINT_BITS;
+        }
+
+        xm = arb_midref(acb_imagref(initial));
+
+        if (!arf_is_special(xm))
+        {
+            im_max_exp = ARF_EXP(xm);
+            im_nonzero++;
+
+            if (prec > 2 * FLINT_BITS)
+                im_min_exp = ARF_EXP(xm) - ARF_SIZE(xm) * FLINT_BITS;
+        }
+    }
+
+    for (xoff = 0; xoff < 2; xoff++)
+    {
+        for (yoff = 0; yoff < 2; yoff++)
+        {
+            slong nonzero, max_exp, min_exp;
+
+            if (xoff == yoff)
+            {
+                nonzero = re_nonzero;
+                max_exp = re_max_exp;
+                min_exp = re_min_exp;
+            }
+            else
+            {
+                nonzero = im_nonzero;
+                max_exp = im_max_exp;
+                min_exp = im_min_exp;
+            }
+
+            /* Determine maximum exponents for the main sum and the radius sum. */
+            for (i = 0; i < len; i++)
+            {
+                xi = ((arb_srcptr) x) + 2 * i * xstep + xoff;
+                yi = ((arb_srcptr) y) + 2 * i * ystep + yoff;
+
+                /* Fallback for huge exponents or non-finite values. */
+                if (!ARF_IS_LAGOM(arb_midref(xi)) || !ARF_IS_LAGOM(arb_midref(yi)))
+                {
+                    acb_approx_dot_simple(res, initial, subtract, x, xstep, y, ystep, len, prec);
+                    return;
+                }
+
+                xm = arb_midref(xi);
+                ym = arb_midref(yi);
+
+                /* (xm+xr)(ym+yr) = xm ym + [xr ym + xm yr + xr yr] */
+                if (!arf_is_special(xm))
+                {
+                    xexp = ARF_EXP(xm);
+
+                    if (!arf_is_special(ym))
+                    {
+                        yexp = ARF_EXP(ym);
+
+                        max_exp = FLINT_MAX(max_exp, xexp + yexp);
+                        nonzero++;
+
+                        if (prec > 2 * FLINT_BITS)
+                        {
+                            slong bot;
+                            bot = (xexp + yexp) - (ARF_SIZE(xm) + ARF_SIZE(ym)) * FLINT_BITS;
+                            min_exp = FLINT_MIN(min_exp, bot);
+                        }
+                    }
+                }
+            }
+
+            if (xoff == yoff)
+            {
+                re_nonzero = nonzero;
+                re_max_exp = max_exp;
+                re_min_exp = min_exp;
+            }
+            else
+            {
+                im_nonzero = nonzero;
+                im_max_exp = max_exp;
+                im_min_exp = min_exp;
+            }
+        }
+    }
+
+    re_prec = prec;
+    im_prec = prec;
+
+    if (re_max_exp == WORD_MIN && im_max_exp == WORD_MIN)
+    {
+        arf_zero(arb_midref(acb_realref(res)));
+        arf_zero(arb_midref(acb_imagref(res)));
+        return;
+    }
+
+    /* The midpoint sum is zero. */
+    if (re_max_exp == WORD_MIN)
+    {
+        re_prec = 2;
+    }
+    else
+    {
+        if (re_min_exp != WORD_MAX)
+            re_prec = FLINT_MIN(re_prec, re_max_exp - re_min_exp + MAG_BITS);
+        re_prec = FLINT_MAX(re_prec, 2);
+    }
+
+    if (im_max_exp == WORD_MIN)
+    {
+        im_prec = 2;
+    }
+    else
+    {
+        if (re_min_exp != WORD_MAX)
+            im_prec = FLINT_MIN(im_prec, im_max_exp - im_min_exp + MAG_BITS);
+        im_prec = FLINT_MAX(im_prec, 2);
+    }
+
+    extend = FLINT_BIT_COUNT(re_nonzero) + 1;
+    padding = 4 + FLINT_BIT_COUNT(len);
+    re_sn = (re_prec + extend + padding + FLINT_BITS - 1) / FLINT_BITS;
+    re_sn = FLINT_MAX(re_sn, 2);
+    re_sum_exp = re_max_exp + extend;
+
+    extend = FLINT_BIT_COUNT(im_nonzero) + 1;
+    padding = 4 + FLINT_BIT_COUNT(len);
+    im_sn = (im_prec + extend + padding + FLINT_BITS - 1) / FLINT_BITS;
+    im_sn = FLINT_MAX(im_sn, 2);
+    im_sum_exp = im_max_exp + extend;
+
+    /* We need sn + 1 limb for the sum (sn limbs + 1 dummy limb
+       for carry or borrow that avoids an extra branch). We need
+       2 * (sn + 2) limbs to store the product of two numbers
+       with up to (sn + 2) limbs, plus 1 extra limb for shifting
+       the product. */
+    alloc = (re_sn + 1) + (im_sn + 1) + 2 * (FLINT_MAX(re_sn, im_sn) + 2) + 1;
+    ARF_ADD_TMP_ALLOC(re_sum, alloc)
+    im_sum = re_sum + (re_sn + 1);
+    tmp = im_sum + (im_sn + 1);
+
+    /* Set sum to 0 */
+    re_serr = 0;
+    for (j = 0; j < re_sn + 1; j++)
+        re_sum[j] = 0;
+    im_serr = 0;
+    for (j = 0; j < im_sn + 1; j++)
+        im_sum[j] = 0;
+
+    if (initial != NULL)
+    {
+        xm = arb_midref(acb_realref(initial));
+
+        ARB_DOT_ADD(re_sum, re_serr, re_sn, re_sum_exp, subtract, xm);
+
+        xm = arb_midref(acb_imagref(initial));
+
+        ARB_DOT_ADD(im_sum, im_serr, im_sn, im_sum_exp, subtract, xm);
+    }
+
+    use_gauss = NULL;
+
+    if (re_prec >= GAUSS_CUTOFF * FLINT_BITS &&
+        im_prec >= GAUSS_CUTOFF * FLINT_BITS)
+    {
+        arf_t e, f;
+
+        for (i = 0; i < len; i++)
+        {
+            arb_srcptr ai, bi, ci, di;
+            mp_size_t an, bn, cn, dn;
+            slong aexp, bexp, cexp, dexp;
+
+            ai = ((arb_srcptr) x) + 2 * i * xstep;
+            bi = ((arb_srcptr) x) + 2 * i * xstep + 1;
+            ci = ((arb_srcptr) y) + 2 * i * ystep;
+            di = ((arb_srcptr) y) + 2 * i * ystep + 1;
+
+            an = ARF_SIZE(arb_midref(ai));
+            bn = ARF_SIZE(arb_midref(bi));
+            cn = ARF_SIZE(arb_midref(ci));
+            dn = ARF_SIZE(arb_midref(di));
+
+            aexp = ARF_EXP(arb_midref(ai));
+            bexp = ARF_EXP(arb_midref(bi));
+            cexp = ARF_EXP(arb_midref(ci));
+            dexp = ARF_EXP(arb_midref(di));
+
+            if (an >= GAUSS_CUTOFF && bn >= GAUSS_CUTOFF &&
+                bn >= GAUSS_CUTOFF && cn >= GAUSS_CUTOFF &&
+                FLINT_ABS(an - bn) <= 2 &&
+                FLINT_ABS(cn - dn) <= 2 &&
+                FLINT_ABS(aexp - bexp) <= 64 &&
+                FLINT_ABS(cexp - dexp) <= 64 &&
+                re_sum_exp - (aexp + cexp) < 0.1 * re_prec &&
+                im_sum_exp - (aexp + dexp) < 0.1 * im_prec &&
+                an + cn < 2.2 * re_sn && an + dn < 2.2 * im_sn)
+            {
+                if (use_gauss == NULL)
+                {
+                    use_gauss = flint_calloc(len, sizeof(char));
+                    arf_init(e);
+                    arf_init(f);
+                }
+
+                use_gauss[i] = 1;
+                _arf_complex_mul_gauss(e, f, arb_midref(ai), arb_midref(bi), arb_midref(ci), arb_midref(di));
+                ARB_DOT_ADD(re_sum, re_serr, re_sn, re_sum_exp, 0, e);
+                ARB_DOT_ADD(im_sum, im_serr, im_sn, im_sum_exp, 0, f);
+            }
+        }
+
+        if (use_gauss != NULL)
+        {
+            arf_clear(e);
+            arf_clear(f);
+        }
+    }
+
+    for (xoff = 0; xoff < 2; xoff++)
+    {
+        for (yoff = 0; yoff < 2; yoff++)
+        {
+            slong sum_exp;
+            mp_ptr sum;
+            mp_size_t sn;
+            mp_limb_t serr;
+            int flipsign;
+
+            if (xoff == yoff)
+            {
+                sum_exp = re_sum_exp;
+                sum = re_sum;
+                sn = re_sn;
+                if (re_max_exp == WORD_MIN)
+                    continue;
+            }
+            else
+            {
+                sum_exp = im_sum_exp;
+                sum = im_sum;
+                sn = im_sn;
+                if (im_max_exp == WORD_MIN)
+                    continue;
+            }
+
+            serr = 0;
+            flipsign = (xoff + yoff == 2);
+
+            for (i = 0; i < len; i++)
+            {
+                xi = ((arb_srcptr) x) + 2 * i * xstep + xoff;
+                yi = ((arb_srcptr) y) + 2 * i * ystep + yoff;
+
+                xm = arb_midref(xi);
+                ym = arb_midref(yi);
+
+                /* The midpoints of x[i] and y[i] are both nonzero. */
+                if (!arf_is_special(xm) && !arf_is_special(ym))
+                {
+                    xexp = ARF_EXP(xm);
+                    xn = ARF_SIZE(xm);
+                    xnegative = ARF_SGNBIT(xm);
+
+                    yexp = ARF_EXP(ym);
+                    yn = ARF_SIZE(ym);
+                    ynegative = ARF_SGNBIT(ym);
+
+                    exp = xexp + yexp;
+                    shift = sum_exp - exp;
+
+                    if (shift >= sn * FLINT_BITS)
+                    {
+                    }
+                    else if (xn <= 2 && yn <= 2 && sn <= 3)
+                    {
+                        mp_limb_t x1, x0, y1, y0;
+                        mp_limb_t u3, u2, u1, u0;
+
+                        if (xn == 1 && yn == 1)
+                        {
+                            x0 = ARF_NOPTR_D(xm)[0];
+                            y0 = ARF_NOPTR_D(ym)[0];
+                            umul_ppmm(u3, u2, x0, y0);
+                            u1 = u0 = 0;
+                        }
+                        else if (xn == 2 && yn == 2)
+                        {
+                            x0 = ARF_NOPTR_D(xm)[0];
+                            x1 = ARF_NOPTR_D(xm)[1];
+                            y0 = ARF_NOPTR_D(ym)[0];
+                            y1 = ARF_NOPTR_D(ym)[1];
+                            nn_mul_2x2(u3, u2, u1, u0, x1, x0, y1, y0);
+                        }
+                        else if (xn == 1)
+                        {
+                            x0 = ARF_NOPTR_D(xm)[0];
+                            y0 = ARF_NOPTR_D(ym)[0];
+                            y1 = ARF_NOPTR_D(ym)[1];
+                            nn_mul_2x1(u3, u2, u1, y1, y0, x0);
+                            u0 = 0;
+                        }
+                        else
+                        {
+                            x0 = ARF_NOPTR_D(xm)[0];
+                            x1 = ARF_NOPTR_D(xm)[1];
+                            y0 = ARF_NOPTR_D(ym)[0];
+                            nn_mul_2x1(u3, u2, u1, x1, x0, y0);
+                            u0 = 0;
+                        }
+
+                        if (sn == 2)
+                        {
+                            if (shift < FLINT_BITS)
+                            {
+                                u2 = (u2 >> shift) | (u3 << (FLINT_BITS - shift));
+                                u3 = (u3 >> shift);
+                            }
+                            else if (shift == FLINT_BITS)
+                            {
+                                u2 = u3;
+                                u3 = 0;
+                            }
+                            else /* FLINT_BITS < shift < 2 * FLINT_BITS */
+                            {
+                                u2 = (u3 >> (shift - FLINT_BITS));
+                                u3 = 0;
+                            }
+
+                            if (xnegative ^ ynegative ^ flipsign)
+                                sub_ddmmss(sum[1], sum[0], sum[1], sum[0], u3, u2);
+                            else
+                                add_ssaaaa(sum[1], sum[0], sum[1], sum[0], u3, u2);
+                        }
+                        else if (sn == 3)
+                        {
+                            if (shift < FLINT_BITS)
+                            {
+                                u1 = (u1 >> shift) | (u2 << (FLINT_BITS - shift));
+                                u2 = (u2 >> shift) | (u3 << (FLINT_BITS - shift));
+                                u3 = (u3 >> shift);
+                            }
+                            else if (shift == FLINT_BITS)
+                            {
+                                u1 = u2;
+                                u2 = u3;
+                                u3 = 0;
+                            }
+                            else if (shift < 2 * FLINT_BITS)
+                            {
+                                u1 = (u3 << (2 * FLINT_BITS - shift)) | (u2 >> (shift - FLINT_BITS));
+                                u2 = (u3 >> (shift - FLINT_BITS));
+                                u3 = 0;
+                            }
+                            else if (shift == 2 * FLINT_BITS)
+                            {
+                                u1 = u3;
+                                u2 = 0;
+                                u3 = 0;
+                            }
+                            else  /* 2 * FLINT_BITS < shift < 3 * FLINT_BITS */
+                            {
+                                u1 = (u3 >> (shift - 2 * FLINT_BITS));
+                                u2 = 0;
+                                u3 = 0;
+                            }
+
+                            if (xnegative ^ ynegative ^ flipsign)
+                                sub_dddmmmsss2(sum[2], sum[1], sum[0], sum[2], sum[1], sum[0], u3, u2, u1);
+                            else
+                                add_sssaaaaaa2(sum[2], sum[1], sum[0], sum[2], sum[1], sum[0], u3, u2, u1);
+                        }
+                    }
+                    else
+                    {
+                        mp_srcptr xptr, yptr;
+
+                        xptr = (xn <= ARF_NOPTR_LIMBS) ? ARF_NOPTR_D(xm) : ARF_PTR_D(xm);
+                        yptr = (yn <= ARF_NOPTR_LIMBS) ? ARF_NOPTR_D(ym) : ARF_PTR_D(ym);
+
+                        if (use_gauss == NULL || use_gauss[i] == 0)
+                            _arb_dot_addmul_generic(sum, &serr, tmp, sn, xptr, xn, yptr, yn, xnegative ^ ynegative ^ flipsign, shift);
+                    }
+                }
+            }
+        }
+    }
+
+    _arb_dot_output(acb_realref(res), re_sum, re_sn, subtract, re_sum_exp, re_prec);
+    _arb_dot_output(acb_imagref(res), im_sum, im_sn, subtract, im_sum_exp, im_prec);
+
+    ARF_ADD_TMP_FREE(re_sum, alloc);
+    if (use_gauss != NULL)
+        flint_free(use_gauss);
+}
--- a/acb/test/t-approx_dot.c
+++ b/acb/test/t-approx_dot.c
@ -0,0 +1,261 @@
+/*
+    Copyright (C) 2018 Fredrik Johansson
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "acb.h"
+
+ARB_DLL extern slong acb_dot_gauss_dot_cutoff;
+
+int main()
+{
+    slong iter;
+    flint_rand_t state;
+
+    flint_printf("approx_dot....");
+    fflush(stdout);
+
+    flint_randinit(state);
+
+    for (iter = 0; iter < 100000 * arb_test_multiplier(); iter++)
+    {
+        acb_ptr x, y;
+        acb_t s1, s2, z;
+        slong i, len, prec, xbits, ybits, ebits;
+        int initial, subtract, revx, revy;
+
+        if (n_randint(state, 100) == 0)
+            len = n_randint(state, 100);
+        else if (n_randint(state, 10) == 0)
+            len = n_randint(state, 10);
+        else
+            len = n_randint(state, 3);
+
+        acb_dot_gauss_dot_cutoff = 3 + n_randint(state, 30);
+
+        if (n_randint(state, 10) != 0 || len > 10)
+        {
+            prec = 2 + n_randint(state, 500);
+            xbits = 2 + n_randint(state, 500);
+            ybits = 2 + n_randint(state, 500);
+        }
+        else
+        {
+            prec = 2 + n_randint(state, 4000);
+            xbits = 2 + n_randint(state, 4000);
+            ybits = 2 + n_randint(state, 4000);
+        }
+
+        if (n_randint(state, 100) == 0)
+            ebits = 2 + n_randint(state, 100);
+        else
+            ebits = 2 + n_randint(state, 10);
+
+        initial = n_randint(state, 2);
+        subtract = n_randint(state, 2);
+        revx = n_randint(state, 2);
+        revy = n_randint(state, 2);
+
+        x = _acb_vec_init(len);
+        y = _acb_vec_init(len);
+        acb_init(s1);
+        acb_init(s2);
+        acb_init(z);
+
+        switch (n_randint(state, 3))
+        {
+            case 0:
+                for (i = 0; i < len; i++)
+                {
+                    acb_randtest(x + i, state, xbits, ebits);
+                    acb_randtest(y + i, state, ybits, ebits);
+                }
+                break;
+
+            /* Test with cancellation */
+            case 1:
+                for (i = 0; i < len; i++)
+                {
+                    if (i <= len / 2)
+                    {
+                        acb_randtest(x + i, state, xbits, ebits);
+                        acb_randtest(y + i, state, ybits, ebits);
+                    }
+                    else
+                    {
+                        acb_neg(x + i, x + len - i - 1);
+                        acb_set(y + i, y + len - i - 1);
+                    }
+                }
+                break;
+
+            default:
+                for (i = 0; i < len; i++)
+                {
+                    if (i <= len / 2)
+                    {
+                        acb_randtest(x + i, state, xbits, ebits);
+                        acb_randtest(y + i, state, ybits, ebits);
+                    }
+                    else
+                    {
+                        acb_neg_round(x + i, x + len - i - 1, 2 + n_randint(state, 500));
+                        acb_set_round(y + i, y + len - i - 1, 2 + n_randint(state, 500));
+                    }
+                }
+                break;
+        }
+
+        acb_randtest(s1, state, 200, 100);
+        acb_randtest(s2, state, 200, 100);
+        acb_randtest(z, state, xbits, ebits);
+
+        acb_approx_dot(s1, initial ? z : NULL, subtract,
+            revx ? (x + len - 1) : x, revx ? -1 : 1,
+            revy ? (y + len - 1) : y, revy ? -1 : 1,
+            len, prec);
+        mag_zero(arb_radref(acb_realref(s1)));
+        mag_zero(arb_radref(acb_imagref(s1)));
+
+        /* With the fast algorithm, we expect identical results when
+           reversing the vectors. */
+        if (ebits <= 12)
+        {
+            acb_approx_dot(s2, initial ? z : NULL, subtract,
+                !revx ? (x + len - 1) : x, !revx ? -1 : 1,
+                !revy ? (y + len - 1) : y, !revy ? -1 : 1,
+                len, prec);
+            mag_zero(arb_radref(acb_realref(s2)));
+            mag_zero(arb_radref(acb_imagref(s2)));
+
+            if (!acb_equal(s1, s2))
+            {
+                flint_printf("FAIL (reversal)\n\n");
+                flint_printf("iter = %wd, len = %wd, prec = %wd, ebits = %wd\n\n", iter, len, prec, ebits);
+
+                if (initial)
+                {
+                    flint_printf("z = ", i); acb_printn(z, 100, ARB_STR_MORE); flint_printf(" (%wd)\n\n", acb_bits(z));
+                }
+
+                for (i = 0; i < len; i++)
+                {
+                    flint_printf("x[%wd] = ", i); acb_printn(x + i, 100, ARB_STR_MORE); flint_printf(" (%wd)\n", acb_bits(x + i));
+                    flint_printf("y[%wd] = ", i); acb_printn(y + i, 100, ARB_STR_MORE); flint_printf(" (%wd)\n", acb_bits(y + i));
+                }
+                flint_printf("\n\n");
+                flint_printf("s1 = "); acb_printn(s1, 100, ARB_STR_MORE); flint_printf("\n\n");
+                flint_printf("s2 = "); acb_printn(s2, 100, ARB_STR_MORE); flint_printf("\n\n");
+                flint_abort();
+            }
+        }
+
+        /* Verify that radii are ignored */
+        for (i = 0; i < len; i++)
+        {
+            arb_get_mid_arb(acb_realref(x + i), acb_realref(x + i));
+            arb_get_mid_arb(acb_imagref(x + i), acb_imagref(x + i));
+            arb_get_mid_arb(acb_realref(y + i), acb_realref(y + i));
+            arb_get_mid_arb(acb_imagref(y + i), acb_imagref(y + i));
+        }
+        arb_get_mid_arb(acb_realref(z), acb_realref(z));
+        arb_get_mid_arb(acb_imagref(z), acb_imagref(z));
+
+        acb_approx_dot(s2, initial ? z : NULL, subtract,
+            revx ? (x + len - 1) : x, revx ? -1 : 1,
+            revy ? (y + len - 1) : y, revy ? -1 : 1,
+            len, prec);
+        mag_zero(arb_radref(acb_realref(s2)));
+        mag_zero(arb_radref(acb_imagref(s2)));
+
+        if (!acb_equal(s1, s2))
+        {
+            flint_printf("FAIL (radii)\n\n");
+            flint_printf("iter = %wd, len = %wd, prec = %wd, ebits = %wd\n\n", iter, len, prec, ebits);
+
+            if (initial)
+            {
+                flint_printf("z = ", i); acb_printn(z, 100, ARB_STR_MORE); flint_printf(" (%wd)\n\n", acb_bits(z));
+            }
+
+            for (i = 0; i < len; i++)
+            {
+                flint_printf("x[%wd] = ", i); acb_printn(x + i, 100, ARB_STR_MORE); flint_printf(" (%wd)\n", acb_bits(x + i));
+                flint_printf("y[%wd] = ", i); acb_printn(y + i, 100, ARB_STR_MORE); flint_printf(" (%wd)\n", acb_bits(y + i));
+            }
+            flint_printf("\n\n");
+            flint_printf("s1 = "); acb_printn(s1, 100, ARB_STR_MORE); flint_printf("\n\n");
+            flint_printf("s2 = "); acb_printn(s2, 100, ARB_STR_MORE); flint_printf("\n\n");
+            flint_abort();
+        }
+
+        /* Compare with acb_dot */
+        acb_approx_dot(s2, initial ? z : NULL, subtract,
+            revx ? (x + len - 1) : x, revx ? -1 : 1,
+            revy ? (y + len - 1) : y, revy ? -1 : 1,
+            len, prec);
+
+        {
+            mag_t err, xx, yy;
+
+            mag_init(err);
+            mag_init(xx);
+            mag_init(yy);
+
+            if (initial)
+                acb_get_mag(err, z);
+
+            for (i = 0; i < len; i++)
+            {
+                acb_get_mag(xx, revx ? x + len - 1 - i : x + i);
+                acb_get_mag(yy, revx ? y + len - 1 - i : y + i);
+                mag_addmul(err, xx, yy);
+            }
+
+            mag_mul_2exp_si(err, err, -prec + 2);
+            acb_add_error_mag(s2, err);
+
+            if (!acb_contains(s2, s1))
+            {
+                flint_printf("FAIL (inclusion)\n\n");
+                flint_printf("iter = %wd, len = %wd, prec = %wd, ebits = %wd\n\n", iter, len, prec, ebits);
+
+                if (initial)
+                {
+                    flint_printf("z = ", i); acb_printn(z, 100, ARB_STR_MORE); flint_printf(" (%wd)\n\n", acb_bits(z));
+                }
+
+                for (i = 0; i < len; i++)
+                {
+                    flint_printf("x[%wd] = ", i); acb_printn(x + i, 100, ARB_STR_MORE); flint_printf(" (%wd)\n", acb_bits(x + i));
+                    flint_printf("y[%wd] = ", i); acb_printn(y + i, 100, ARB_STR_MORE); flint_printf(" (%wd)\n", acb_bits(y + i));
+                }
+                flint_printf("\n\n");
+                flint_printf("s1 = "); acb_printn(s1, 100, ARB_STR_MORE); flint_printf("\n\n");
+                flint_printf("s2 = "); acb_printn(s2, 100, ARB_STR_MORE); flint_printf("\n\n");
+                flint_abort();
+            }
+
+            mag_clear(err);
+            mag_clear(xx);
+            mag_clear(yy);
+        }
+
+        acb_clear(s1);
+        acb_clear(s2);
+        acb_clear(z);
+        _acb_vec_clear(x, len);
+        _acb_vec_clear(y, len);
+    }
+
+    flint_randclear(state);
+    flint_cleanup();
+    flint_printf("PASS\n");
+    return EXIT_SUCCESS;
+}
--- a/acb_mat.h
+++ b/acb_mat.h
@ -389,6 +389,7 @@ int acb_mat_solve(acb_mat_t X, const acb_mat_t A, const acb_mat_t B, slong prec)

 int acb_mat_solve_precond(acb_mat_t X, const acb_mat_t A, const acb_mat_t B, slong prec);

+void acb_mat_approx_mul(acb_mat_t C, const acb_mat_t A, const acb_mat_t B, slong prec);
 void acb_mat_approx_solve_triu(acb_mat_t X, const acb_mat_t U, const acb_mat_t B, int unit, slong prec);
 void acb_mat_approx_solve_tril(acb_mat_t X, const acb_mat_t L, const acb_mat_t B, int unit, slong prec);
 int acb_mat_approx_lu(slong * P, acb_mat_t LU, const acb_mat_t A, slong prec);
--- a/acb_mat/approx_lu.c
+++ b/acb_mat/approx_lu.c
@ -194,8 +194,7 @@ acb_mat_approx_lu_recursive(slong * P, acb_mat_t LU, const acb_mat_t A, slong pr
        /* acb_mat_submul(A11, A11, A10, A01, prec); */
        acb_mat_t T;
        acb_mat_init(T, A10->r, A01->c);
-        acb_mat_mul(T, A10, A01, prec);
-        acb_mat_get_mid(T, T);
+        acb_mat_approx_mul(T, A10, A01, prec);
        acb_mat_sub(A11, A11, T, prec);
        acb_mat_get_mid(A11, A11);
        acb_mat_clear(T);
@ -227,4 +226,3 @@ acb_mat_approx_lu(slong * P, acb_mat_t LU, const acb_mat_t A, slong prec)
    else
        return acb_mat_approx_lu_recursive(P, LU, A, prec);
 }
-
--- a/acb_mat/approx_mul.c
+++ b/acb_mat/approx_mul.c
@ -0,0 +1,157 @@
+/*
+    Copyright (C) 2018 Fredrik Johansson
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "acb_mat.h"
+
+void
+acb_mat_approx_mul_classical(acb_mat_t C, const acb_mat_t A, const acb_mat_t B, slong prec)
+{
+    slong ar, br, bc, i, j;
+
+    ar = acb_mat_nrows(A);
+    br = acb_mat_nrows(B);
+    bc = acb_mat_ncols(B);
+
+    if (br == 0)
+    {
+        for (i = 0; i < ar; i++)
+        {
+            for (j = 0; j < bc; j++)
+            {
+                arf_zero(arb_midref(acb_realref(acb_mat_entry(C, i, j))));
+                arf_zero(arb_midref(acb_imagref(acb_mat_entry(C, i, j))));
+            }
+        }
+        return;
+    }
+
+    if (A == C || B == C)
+    {
+        acb_mat_t T;
+        acb_mat_init(T, ar, bc);
+        acb_mat_approx_mul_classical(T, A, B, prec);
+        acb_mat_swap(T, C);
+        acb_mat_clear(T);
+        return;
+    }
+
+    if (br == 1)
+    {
+        for (i = 0; i < ar; i++)
+        {
+            for (j = 0; j < bc; j++)
+            {
+                arf_complex_mul(
+                    arb_midref(acb_realref(acb_mat_entry(C, i, j))),
+                    arb_midref(acb_imagref(acb_mat_entry(C, i, j))),
+                    arb_midref(acb_realref(acb_mat_entry(A, i, 0))),
+                    arb_midref(acb_imagref(acb_mat_entry(A, i, 0))),
+                    arb_midref(acb_realref(acb_mat_entry(B, 0, j))),
+                    arb_midref(acb_imagref(acb_mat_entry(B, 0, j))), prec, ARB_RND);
+            }
+        }
+    }
+    else
+    {
+        acb_ptr tmp;
+        TMP_INIT;
+
+        TMP_START;
+        tmp = TMP_ALLOC(sizeof(acb_struct) * br * bc);
+
+        for (i = 0; i < br; i++)
+            for (j = 0; j < bc; j++)
+                tmp[j * br + i] = *acb_mat_entry(B, i, j);
+
+        for (i = 0; i < ar; i++)
+        {
+            for (j = 0; j < bc; j++)
+            {
+                acb_approx_dot(acb_mat_entry(C, i, j), NULL, 0,
+                    A->rows[i], 1, tmp + j * br, 1, br, prec);
+            }
+        }
+
+        TMP_END;
+    }
+}
+
+int
+acb_mat_is_exact(const acb_mat_t A)
+{
+    slong i, j;
+
+    for (i = 0; i < acb_mat_nrows(A); i++)
+        for (j = 0; j < acb_mat_ncols(A); j++)
+            if (!mag_is_zero(arb_radref(acb_realref(acb_mat_entry(A, i, j)))) ||
+                !mag_is_zero(arb_radref(acb_imagref(acb_mat_entry(A, i, j)))))
+                return 0;
+
+    return 1;
+}
+
+void
+acb_mat_approx_mul(acb_mat_t C, const acb_mat_t A, const acb_mat_t B, slong prec)
+{
+    slong cutoff;
+
+    /* todo: detect small-integer matrices */
+    if (prec <= 2 * FLINT_BITS)
+        cutoff = 120;
+    else if (prec <= 16 * FLINT_BITS)
+        cutoff = 60;
+    else
+        cutoff = 40;
+
+    if (acb_mat_nrows(A) <= cutoff || acb_mat_ncols(A) <= cutoff ||
+        acb_mat_ncols(B) <= cutoff)
+    {
+        acb_mat_approx_mul_classical(C, A, B, prec);
+    }
+    else
+    {
+        if (acb_mat_is_exact(A) && acb_mat_is_exact(B))
+        {
+            acb_mat_mul(C, A, B, prec);
+        }
+        else
+        {
+            acb_mat_t AM, BM;
+
+            if (acb_mat_is_exact(A))
+            {
+                acb_mat_init(BM, acb_mat_nrows(B), acb_mat_ncols(B));
+                acb_mat_get_mid(BM, B);
+                acb_mat_mul(C, A, BM, prec);
+                acb_mat_clear(BM);
+            }
+            else if (acb_mat_is_exact(A))
+            {
+                acb_mat_init(AM, acb_mat_nrows(A), acb_mat_ncols(A));
+                acb_mat_get_mid(AM, A);
+                acb_mat_mul(C, AM, B, prec);
+                acb_mat_clear(AM);
+            }
+            else
+            {
+                acb_mat_init(BM, acb_mat_nrows(B), acb_mat_ncols(B));
+                acb_mat_get_mid(BM, B);
+                acb_mat_init(AM, acb_mat_nrows(A), acb_mat_ncols(A));
+                acb_mat_get_mid(AM, A);
+                acb_mat_mul(C, AM, BM, prec);
+                acb_mat_clear(AM);
+                acb_mat_clear(BM);
+            }
+        }
+
+        acb_mat_get_mid(C, C);
+    }
+}
--- a/acb_mat/approx_solve_lu_precomp.c
+++ b/acb_mat/approx_solve_lu_precomp.c
@ -11,48 +11,11 @@

 #include "acb_mat.h"

-static void
-_acb_approx_mul(acb_t res, const acb_t x, const acb_t y, slong prec)
-{
-    arf_complex_mul(arb_midref(acb_realref(res)), arb_midref(acb_imagref(res)),
-        arb_midref(acb_realref(x)), arb_midref(acb_imagref(x)), 
-        arb_midref(acb_realref(y)), arb_midref(acb_imagref(y)), prec, ARB_RND);
-}
-
-static void
-_acb_approx_submul(acb_t z, const acb_t x, const acb_t y, acb_t t, slong prec)
-{
-    _acb_approx_mul(t, x, y, prec);
-
-    arf_sub(arb_midref(acb_realref(z)),
-        arb_midref(acb_realref(z)), 
-        arb_midref(acb_realref(t)), prec, ARB_RND);
-    arf_sub(arb_midref(acb_imagref(z)),
-        arb_midref(acb_imagref(z)), 
-        arb_midref(acb_imagref(t)), prec, ARB_RND);
-}
-
-/* note: the tmp variable t should have zero radius */
-static void
-_acb_approx_div(acb_t z, const acb_t x, const acb_t y, acb_t t, slong prec)
-{
-    arf_set(arb_midref(acb_realref(t)), arb_midref(acb_realref(y)));
-    arf_set(arb_midref(acb_imagref(t)), arb_midref(acb_imagref(y)));
-
-    acb_inv(t, t, prec);
-
-    mag_zero(arb_radref(acb_realref(t)));
-    mag_zero(arb_radref(acb_imagref(t)));
-
-    _acb_approx_mul(z, x, t, prec);
-}
-
 void
 acb_mat_approx_solve_lu_precomp(acb_mat_t X, const slong * perm,
    const acb_mat_t A, const acb_mat_t B, slong prec)
 {
-    acb_t t;
-    slong i, j, c, n, m;
+    slong i, c, n, m;

    n = acb_mat_nrows(X);
    m = acb_mat_ncols(X);
@ -84,44 +47,6 @@ acb_mat_approx_solve_lu_precomp(acb_mat_t X, const slong * perm,
    }

    acb_mat_get_mid(X, X);
-
-    /* todo: solve_tril and solve_triu have some overhead; should be
-       able to eliminate the basecase code below */
-    if (n >= 8 && m >= 8)
-    {
    acb_mat_approx_solve_tril(X, A, X, 1, prec);
    acb_mat_approx_solve_triu(X, A, X, 0, prec);
-        return;
-    }
-
-    acb_init(t);
-
-    for (c = 0; c < m; c++)
-    {
-        /* solve Ly = b */
-        for (i = 1; i < n; i++)
-        {
-            for (j = 0; j < i; j++)
-            {
-                _acb_approx_submul(acb_mat_entry(X, i, c),
-                    acb_mat_entry(A, i, j), acb_mat_entry(X, j, c), t, prec);
-            }
-        }
-
-        /* solve Ux = y */
-        for (i = n - 1; i >= 0; i--)
-        {
-            for (j = i + 1; j < n; j++)
-            {
-                _acb_approx_submul(acb_mat_entry(X, i, c),
-                    acb_mat_entry(A, i, j), acb_mat_entry(X, j, c), t, prec);
-            }
-
-            _acb_approx_div(acb_mat_entry(X, i, c), acb_mat_entry(X, i, c),
-                acb_mat_entry(A, i, i), t, prec);
-        }
-    }
-
-    acb_clear(t);
 }
-
--- a/acb_mat/approx_solve_tril.c
+++ b/acb_mat/approx_solve_tril.c
@ -11,13 +11,6 @@

 #include "acb_mat.h"

-static void
-acb_approx_set(acb_t z, const acb_t x)
-{
-    arf_set(arb_midref(acb_realref(z)), arb_midref(acb_realref(x)));
-    arf_set(arb_midref(acb_imagref(z)), arb_midref(acb_imagref(x)));
-}
-
 static void
 acb_approx_mul(acb_t res, const acb_t x, const acb_t y, slong prec)
 {
@ -26,26 +19,6 @@ acb_approx_mul(acb_t res, const acb_t x, const acb_t y, slong prec)
        arb_midref(acb_realref(y)), arb_midref(acb_imagref(y)), prec, ARB_RND);
 }

-static void
-acb_approx_sub(acb_t z, const acb_t x, const acb_t y, slong prec)
-{
-    arf_sub(arb_midref(acb_realref(z)), arb_midref(acb_realref(x)), arb_midref(acb_realref(y)), prec, ARF_RND_DOWN);
-    arf_sub(arb_midref(acb_imagref(z)), arb_midref(acb_imagref(x)), arb_midref(acb_imagref(y)), prec, ARF_RND_DOWN);
-}
-
-static void
-acb_approx_addmul(acb_t z, const acb_t x, const acb_t y, acb_t t, slong prec)
-{
-    acb_approx_mul(t, x, y, prec);
-
-    arf_add(arb_midref(acb_realref(z)),
-        arb_midref(acb_realref(z)), 
-        arb_midref(acb_realref(t)), prec, ARB_RND);
-    arf_add(arb_midref(acb_imagref(z)),
-        arb_midref(acb_imagref(z)), 
-        arb_midref(acb_imagref(t)), prec, ARB_RND);
-}
-
 /* note: the tmp variable t should have zero radius */
 static void
 acb_approx_div(acb_t z, const acb_t x, const acb_t y, acb_t t, slong prec)
@ -65,7 +38,7 @@ void
 acb_mat_approx_solve_tril_classical(acb_mat_t X,
        const acb_mat_t L, const acb_mat_t B, int unit, slong prec)
 {
-    slong i, j, k, n, m;
+    slong i, j, n, m;
    acb_ptr tmp;
    acb_t s, t;

@ -74,29 +47,28 @@ acb_mat_approx_solve_tril_classical(acb_mat_t X,

    acb_init(s);
    acb_init(t);
-    tmp = _acb_vec_init(n);
+    tmp = flint_malloc(sizeof(acb_struct) * n);

    for (i = 0; i < m; i++)
    {
        for (j = 0; j < n; j++)
-            acb_approx_set(tmp + j, acb_mat_entry(X, j, i));
+            tmp[j] = *acb_mat_entry(X, j, i);

        for (j = 0; j < n; j++)
        {
-            acb_zero(s);
-            for (k = 0; k < j; k++)
-                acb_approx_addmul(s, L->rows[j] + k, tmp + k, t, prec);
-            acb_approx_sub(s, acb_mat_entry(B, j, i), s, prec);
+            acb_approx_dot(s, acb_mat_entry(B, j, i), 1, L->rows[j], 1, tmp, 1, j, prec);
+
            if (!unit)
-                acb_approx_div(s, s, acb_mat_entry(L, j, j), t, prec);
-            acb_approx_set(tmp + j, s);
+                acb_approx_div(tmp + j, s, acb_mat_entry(L, j, j), t, prec);
+            else
+                acb_swap(tmp + j, s);
        }

        for (j = 0; j < n; j++)
-            acb_approx_set(acb_mat_entry(X, j, i), tmp + j);
+            *acb_mat_entry(X, j, i) = tmp[j];
    }

-    _acb_vec_clear(tmp, n);
+    flint_free(tmp);
    acb_clear(s);
    acb_clear(t);
 }
@ -133,8 +105,7 @@ acb_mat_approx_solve_tril_recursive(acb_mat_t X,

    /* acb_mat_submul(XY, BY, LC, XX); */
    acb_mat_init(T, LC->r, BX->c);
-    acb_mat_mul(T, LC, XX, prec);
-    acb_mat_get_mid(T, T);
+    acb_mat_approx_mul(T, LC, XX, prec);
    acb_mat_sub(XY, BY, T, prec);
    acb_mat_get_mid(XY, XY);
    acb_mat_clear(T);
@ -154,9 +125,8 @@ void
 acb_mat_approx_solve_tril(acb_mat_t X, const acb_mat_t L,
                                    const acb_mat_t B, int unit, slong prec)
 {
-    if (B->r < 8 || B->c < 8)
+    if (B->r < 40 || B->c < 40)
        acb_mat_approx_solve_tril_classical(X, L, B, unit, prec);
    else
        acb_mat_approx_solve_tril_recursive(X, L, B, unit, prec);
 }
-
--- a/acb_mat/approx_solve_triu.c
+++ b/acb_mat/approx_solve_triu.c
@ -11,13 +11,6 @@

 #include "acb_mat.h"

-static void
-acb_approx_set(acb_t z, const acb_t x)
-{
-    arf_set(arb_midref(acb_realref(z)), arb_midref(acb_realref(x)));
-    arf_set(arb_midref(acb_imagref(z)), arb_midref(acb_imagref(x)));
-}
-
 static void
 acb_approx_mul(acb_t res, const acb_t x, const acb_t y, slong prec)
 {
@ -26,26 +19,6 @@ acb_approx_mul(acb_t res, const acb_t x, const acb_t y, slong prec)
        arb_midref(acb_realref(y)), arb_midref(acb_imagref(y)), prec, ARB_RND);
 }

-static void
-acb_approx_sub(acb_t z, const acb_t x, const acb_t y, slong prec)
-{
-    arf_sub(arb_midref(acb_realref(z)), arb_midref(acb_realref(x)), arb_midref(acb_realref(y)), prec, ARF_RND_DOWN);
-    arf_sub(arb_midref(acb_imagref(z)), arb_midref(acb_imagref(x)), arb_midref(acb_imagref(y)), prec, ARF_RND_DOWN);
-}
-
-static void
-acb_approx_addmul(acb_t z, const acb_t x, const acb_t y, acb_t t, slong prec)
-{
-    acb_approx_mul(t, x, y, prec);
-
-    arf_add(arb_midref(acb_realref(z)),
-        arb_midref(acb_realref(z)), 
-        arb_midref(acb_realref(t)), prec, ARB_RND);
-    arf_add(arb_midref(acb_imagref(z)),
-        arb_midref(acb_imagref(z)), 
-        arb_midref(acb_imagref(t)), prec, ARB_RND);
-}
-
 /* note: the tmp variable t should have zero radius */
 static void
 acb_approx_div(acb_t z, const acb_t x, const acb_t y, acb_t t, slong prec)
@ -65,40 +38,37 @@ void
 acb_mat_approx_solve_triu_classical(acb_mat_t X, const acb_mat_t U,
    const acb_mat_t B, int unit, slong prec)
 {
-    slong i, j, k, n, m;
+    slong i, j, n, m;
    acb_ptr tmp;
    acb_t s, t;

    n = U->r;
    m = B->c;

-    tmp = _acb_vec_init(n);
    acb_init(s);
    acb_init(t);
+    tmp = flint_malloc(sizeof(acb_struct) * n);

    for (i = 0; i < m; i++)
    {
        for (j = 0; j < n; j++)
-            acb_approx_set(tmp + j, acb_mat_entry(X, j, i));
+            tmp[j] = *acb_mat_entry(X, j, i);

        for (j = n - 1; j >= 0; j--)
        {
-            acb_zero(s);
-            for (k = 0; k < n - j - 1; k++)
-                acb_approx_addmul(s, U->rows[j] + j + 1 + k, tmp + j + 1 + k, t, prec);
+            acb_approx_dot(s, acb_mat_entry(B, j, i), 1, U->rows[j] + j + 1, 1, tmp + j + 1, 1, n - j - 1, prec);

-            acb_approx_sub(s, acb_mat_entry(B, j, i), s, prec);
            if (!unit)
-                acb_approx_div(s, s, acb_mat_entry(U, j, j), t, prec);
-
-            acb_approx_set(tmp + j, s);
+                acb_approx_div(tmp + j, s, arb_mat_entry(U, j, j), t, prec);
+            else
+                acb_swap(tmp + j, s);
        }

        for (j = 0; j < n; j++)
-            acb_approx_set(acb_mat_entry(X, j, i), tmp + j);
+            *acb_mat_entry(X, j, i) = tmp[j];
    }

-    _acb_vec_clear(tmp, n);
+    flint_free(tmp);
    acb_clear(s);
    acb_clear(t);
 }
@ -134,8 +104,7 @@ acb_mat_approx_solve_triu_recursive(acb_mat_t X,
    acb_mat_approx_solve_triu(XY, UD, BY, unit, prec);

    acb_mat_init(T, UB->r, XY->c);
-    acb_mat_mul(T, UB, XY, prec);
-    acb_mat_get_mid(T, T);
+    acb_mat_approx_mul(T, UB, XY, prec);
    acb_mat_sub(XX, BX, T, prec);
    acb_mat_get_mid(XX, XX);
    acb_mat_clear(T);
@ -155,9 +124,8 @@ void
 acb_mat_approx_solve_triu(acb_mat_t X, const acb_mat_t U,
                                    const acb_mat_t B, int unit, slong prec)
 {
-    if (B->r < 8 || B->c < 8)
+    if (B->r < 40 || B->c < 40)
        acb_mat_approx_solve_triu_classical(X, U, B, unit, prec);
    else
        acb_mat_approx_solve_triu_recursive(X, U, B, unit, prec);
 }
-
--- a/acb_mat/solve_lu_precomp.c
+++ b/acb_mat/solve_lu_precomp.c
@ -15,7 +15,7 @@ void
 acb_mat_solve_lu_precomp(acb_mat_t X, const slong * perm,
    const acb_mat_t A, const acb_mat_t B, slong prec)
 {
-    slong i, c, n, m;
+    slong i, j, c, n, m;

    n = acb_mat_nrows(X);
    m = acb_mat_ncols(X);
@ -46,6 +46,37 @@ acb_mat_solve_lu_precomp(acb_mat_t X, const slong * perm,
        }
    }

+    /* solve_tril and solve_triu have some overhead */
+    if (n >= 4)
+    {
        acb_mat_solve_tril(X, A, X, 1, prec);
        acb_mat_solve_triu(X, A, X, 0, prec);
+        return;
+    }
+
+    for (c = 0; c < m; c++)
+    {
+        /* solve Ly = b */
+        for (i = 1; i < n; i++)
+        {
+            for (j = 0; j < i; j++)
+            {
+                acb_submul(acb_mat_entry(X, i, c),
+                    acb_mat_entry(A, i, j), acb_mat_entry(X, j, c), prec);
+            }
+        }
+
+        /* solve Ux = y */
+        for (i = n - 1; i >= 0; i--)
+        {
+            for (j = i + 1; j < n; j++)
+            {
+                acb_submul(acb_mat_entry(X, i, c),
+                    acb_mat_entry(A, i, j), acb_mat_entry(X, j, c), prec);
+            }
+
+            acb_div(acb_mat_entry(X, i, c), acb_mat_entry(X, i, c),
+                acb_mat_entry(A, i, i), prec);
+        }
+    }
 }
--- a/arb.h
+++ b/arb.h
@ -464,6 +464,9 @@ void arb_dot_precise(arb_t res, const arb_t initial, int subtract,
 void arb_dot(arb_t res, const arb_t initial, int subtract,
    arb_srcptr x, slong xstep, arb_srcptr y, slong ystep, slong len, slong prec);

+void arb_approx_dot(arb_t res, const arb_t initial, int subtract,
+    arb_srcptr x, slong xstep, arb_srcptr y, slong ystep, slong len, slong prec);
+
 void arb_div(arb_t z, const arb_t x, const arb_t y, slong prec);
 void arb_div_arf(arb_t z, const arb_t x, const arf_t y, slong prec);
 void arb_div_si(arb_t z, const arb_t x, slong y, slong prec);
--- a/arb/approx_dot.c
+++ b/arb/approx_dot.c
@ -0,0 +1,527 @@
+/*
+    Copyright (C) 2018 Fredrik Johansson
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "arb.h"
+
+/* We need uint64_t instead of mp_limb_t on 32-bit systems for
+   safe summation of 30-bit error bounds. */
+#include <stdint.h>
+
+
+/* The following macros are found in FLINT's longlong.h, but
+   the release version is out of date. */
+
+/* x86 : 64 bit */
+#if (GMP_LIMB_BITS == 64 && defined (__amd64__))
+
+#define add_sssaaaaaa2(sh, sm, sl, ah, am, al, bh, bm, bl)  \
+  __asm__ ("addq %8,%q2\n\tadcq %6,%q1\n\tadcq %4,%q0"     \
+       : "=r" (sh), "=&r" (sm), "=&r" (sl)                  \
+       : "0"  ((mp_limb_t)(ah)), "rme" ((mp_limb_t)(bh)),  \
+         "1"  ((mp_limb_t)(am)), "rme" ((mp_limb_t)(bm)),  \
+         "2"  ((mp_limb_t)(al)), "rme" ((mp_limb_t)(bl)))  \
+
+#define sub_dddmmmsss2(dh, dm, dl, mh, mm, ml, sh, sm, sl)  \
+  __asm__ ("subq %8,%q2\n\tsbbq %6,%q1\n\tsbbq %4,%q0"     \
+       : "=r" (dh), "=&r" (dm), "=&r" (dl)                  \
+       : "0"  ((mp_limb_t)(mh)), "rme" ((mp_limb_t)(sh)),  \
+         "1"  ((mp_limb_t)(mm)), "rme" ((mp_limb_t)(sm)),  \
+"2" ((mp_limb_t)(ml)), "rme" ((mp_limb_t)(sl))) \
+
+#endif /* x86_64 */
+
+/* x86 : 32 bit */
+#if (GMP_LIMB_BITS == 32 && (defined (__i386__) \
+   || defined (__i486__) || defined(__amd64__)))
+
+#define add_sssaaaaaa2(sh, sm, sl, ah, am, al, bh, bm, bl)  \
+  __asm__ ("addl %8,%k2\n\tadcl %6,%k1\n\tadcl %4,%k0"     \
+       : "=r" (sh), "=r" (sm), "=&r" (sl)                  \
+       : "0"  ((mp_limb_t)(ah)), "g" ((mp_limb_t)(bh)),    \
+         "1"  ((mp_limb_t)(am)), "g" ((mp_limb_t)(bm)),    \
+         "2"  ((mp_limb_t)(al)), "g" ((mp_limb_t)(bl)))    \
+
+#define sub_dddmmmsss2(dh, dm, dl, mh, mm, ml, sh, sm, sl)  \
+  __asm__ ("subl %8,%k2\n\tsbbl %6,%k1\n\tsbbl %4,%k0"     \
+       : "=r" (dh), "=r" (dm), "=&r" (dl)                  \
+       : "0"  ((mp_limb_t)(mh)), "g" ((mp_limb_t)(sh)),    \
+         "1"  ((mp_limb_t)(mm)), "g" ((mp_limb_t)(sm)),    \
+         "2"  ((mp_limb_t)(ml)), "g" ((mp_limb_t)(sl)))    \
+
+#endif /* x86 */
+
+
+#if !defined(add_sssaaaaaa2)
+
+#define add_sssaaaaaa2(sh, sm, sl, ah, am, al, bh, bm, bl)           \
+  do {                                                              \
+    mp_limb_t __t, __u;                                             \
+    add_ssaaaa(__t, sl, (mp_limb_t) 0, al, (mp_limb_t) 0, bl);      \
+    add_ssaaaa(__u, sm, (mp_limb_t) 0, am, (mp_limb_t) 0, bm);      \
+    add_ssaaaa(sh, sm, ah + bh, sm, __u, __t);                      \
+} while (0)
+
+#define sub_dddmmmsss2(dh, dm, dl, mh, mm, ml, sh, sm, sl)           \
+  do {                                                              \
+    mp_limb_t __t, __u;                                             \
+    sub_ddmmss(__t, dl, (mp_limb_t) 0, ml, (mp_limb_t) 0, sl);      \
+    sub_ddmmss(__u, dm, (mp_limb_t) 0, mm, (mp_limb_t) 0, sm);      \
+    sub_ddmmss(dh, dm, mh - sh, dm, __u, __t);                      \
+  } while (0)
+
+#endif
+
+
+void
+_arb_dot_addmul_generic(mp_ptr sum, mp_ptr serr, mp_ptr tmp, mp_size_t sn,
+    mp_srcptr xptr, mp_size_t xn, mp_srcptr yptr, mp_size_t yn,
+    int negative, mp_bitcnt_t shift);
+
+void
+_arb_dot_add_generic(mp_ptr sum, mp_ptr serr, mp_ptr tmp, mp_size_t sn,
+    mp_srcptr xptr, mp_size_t xn,
+    int negative, mp_bitcnt_t shift);
+
+void
+arb_approx_dot_simple(arb_t res, const arb_t initial, int subtract,
+    arb_srcptr x, slong xstep, arb_srcptr y, slong ystep, slong len, slong prec)
+{
+    slong i;
+
+    if (len <= 0)
+    {
+        if (initial == NULL)
+            arf_zero(arb_midref(res));
+        else
+            arf_set_round(arb_midref(res), arb_midref(initial), prec, ARB_RND);
+        return;
+    }
+
+    if (initial == NULL)
+    {
+        arf_mul(arb_midref(res), arb_midref(x), arb_midref(y), prec, ARB_RND);
+    }
+    else
+    {
+        if (subtract)
+            arf_neg(arb_midref(res), arb_midref(initial));
+        else
+            arf_set(arb_midref(res), arb_midref(initial));
+        arf_addmul(arb_midref(res), arb_midref(x), arb_midref(y), prec, ARB_RND);
+    }
+
+    for (i = 1; i < len; i++)
+        arf_addmul(arb_midref(res), arb_midref(x + i * xstep), arb_midref(y + i * ystep), prec, ARB_RND);
+
+    if (subtract)
+        arf_neg(arb_midref(res), arb_midref(res));
+}
+
+void
+arb_approx_dot(arb_t res, const arb_t initial, int subtract, arb_srcptr x, slong xstep, arb_srcptr y, slong ystep, slong len, slong prec)
+{
+    slong i, j, nonzero, padding, extend;
+    slong xexp, yexp, exp, max_exp, min_exp, sum_exp;
+    int xnegative, ynegative;
+    mp_size_t xn, yn, sn, alloc;
+    mp_bitcnt_t shift;
+    arb_srcptr xi, yi;
+    arf_srcptr xm, ym;
+    mp_limb_t serr;   /* Sum over arithmetic errors  - not used, but need dummy for calls */
+    mp_ptr tmp, sum;  /* Workspace */
+    ARF_ADD_TMP_DECL;
+
+    /* todo: fast fma and fmma (len=2) code */
+    if (len <= 1)
+    {
+        if (initial == NULL)
+        {
+            if (len <= 0)
+                arf_zero(arb_midref(res));
+            else
+            {
+                if (subtract)
+                    arf_neg_mul(arb_midref(res), arb_midref(x), arb_midref(y), prec, ARB_RND);
+                else
+                    arf_mul(arb_midref(res), arb_midref(x), arb_midref(y), prec, ARB_RND);
+            }
+            return;
+        }
+        else if (len <= 0)
+        {
+            arf_set_round(arb_midref(res), arb_midref(initial), prec, ARB_RND);
+            return;
+        }
+    }
+
+    /* Number of nonzero midpoint terms in sum. */
+    nonzero = 0;
+
+    /* Terms are bounded by 2^max_exp (with WORD_MIN = -infty) */
+    max_exp = WORD_MIN;
+
+    /* Used to reduce the precision. */
+    min_exp = WORD_MAX;
+
+    /* Account for the initial term. */
+    if (initial != NULL)
+    {
+        if (!ARF_IS_LAGOM(arb_midref(initial)))
+        {
+            arb_approx_dot_simple(res, initial, subtract, x, xstep, y, ystep, len, prec);
+            return;
+        }
+
+        xm = arb_midref(initial);
+
+        if (!arf_is_special(xm))
+        {
+            max_exp = ARF_EXP(xm);
+            nonzero++;
+
+            if (prec > 2 * FLINT_BITS)
+                min_exp = ARF_EXP(xm) - ARF_SIZE(xm) * FLINT_BITS;
+        }
+    }
+
+    /* Determine maximum exponents for the main sum and the radius sum. */
+    for (i = 0; i < len; i++)
+    {
+        xi = x + i * xstep;
+        yi = y + i * ystep;
+
+        /* Fallback for huge exponents or non-finite values. */
+        if (!ARF_IS_LAGOM(arb_midref(xi)) || !ARF_IS_LAGOM(arb_midref(yi)))
+        {
+            arb_approx_dot_simple(res, initial, subtract, x, xstep, y, ystep, len, prec);
+            return;
+        }
+
+        xm = arb_midref(xi);
+        ym = arb_midref(yi);
+
+        if (!arf_is_special(xm))
+        {
+            xexp = ARF_EXP(xm);
+
+            if (!arf_is_special(ym))
+            {
+                yexp = ARF_EXP(ym);
+
+                max_exp = FLINT_MAX(max_exp, xexp + yexp);
+                nonzero++;
+
+                if (prec > 2 * FLINT_BITS)
+                {
+                    slong bot;
+                    bot = (xexp + yexp) - (ARF_SIZE(xm) + ARF_SIZE(ym)) * FLINT_BITS;
+                    min_exp = FLINT_MIN(min_exp, bot);
+                }
+            }
+        }
+    }
+
+    /* The midpoint sum is zero. */
+    if (max_exp == WORD_MIN)
+    {
+        arf_zero(arb_midref(res));
+        return;
+    }
+    else
+    {
+        /* Reduce precision based on actual sizes. */
+        if (min_exp != WORD_MAX)
+            prec = FLINT_MIN(prec, max_exp - min_exp + MAG_BITS);
+
+        prec = FLINT_MAX(prec, 2);
+    }
+
+    /* Extend sum so that we can use two's complement addition. */
+    extend = FLINT_BIT_COUNT(nonzero) + 1;
+
+    /* Extra bits to improve accuracy (optional). */
+    padding = 4 + FLINT_BIT_COUNT(len);
+
+    /* Number of limbs. */
+    sn = (prec + extend + padding + FLINT_BITS - 1) / FLINT_BITS;
+
+    /* Avoid having to make a special case for sn = 1. */
+    sn = FLINT_MAX(sn, 2);
+
+    /* Exponent for the main sum. */
+    sum_exp = max_exp + extend;
+
+    /* We need sn + 1 limb for the sum (sn limbs + 1 dummy limb
+       for carry or borrow that avoids an extra branch). We need
+       2 * (sn + 2) limbs to store the product of two numbers
+       with up to (sn + 2) limbs, plus 1 extra limb for shifting
+       the product. */
+    alloc = (sn + 1) + 2 * (sn + 2) + 1;
+    ARF_ADD_TMP_ALLOC(sum, alloc)
+    tmp = sum + (sn + 1);
+
+    /* Set sum to 0 */
+    serr = 0;
+    for (j = 0; j < sn + 1; j++)
+        sum[j] = 0;
+
+    if (initial != NULL)
+    {
+        xm = arb_midref(initial);
+
+        if (!arf_is_special(xm))
+        {
+            mp_srcptr xptr;
+
+            xexp = ARF_EXP(xm);
+            xn = ARF_SIZE(xm);
+            xnegative = ARF_SGNBIT(xm);
+
+            shift = sum_exp - xexp;
+
+            if (shift < sn * FLINT_BITS)
+            {
+                xptr = (xn <= ARF_NOPTR_LIMBS) ? ARF_NOPTR_D(xm) : ARF_PTR_D(xm);
+                _arb_dot_add_generic(sum, &serr, tmp, sn, xptr, xn, xnegative ^ subtract, shift);
+            }
+        }
+    }
+
+    for (i = 0; i < len; i++)
+    {
+        xi = x + i * xstep;
+        yi = y + i * ystep;
+        xm = arb_midref(xi);
+        ym = arb_midref(yi);
+
+        /* The midpoints of x[i] and y[i] are both nonzero. */
+        if (!arf_is_special(xm) && !arf_is_special(ym))
+        {
+            xexp = ARF_EXP(xm);
+            xn = ARF_SIZE(xm);
+            xnegative = ARF_SGNBIT(xm);
+
+            yexp = ARF_EXP(ym);
+            yn = ARF_SIZE(ym);
+            ynegative = ARF_SGNBIT(ym);
+
+            exp = xexp + yexp;
+            shift = sum_exp - exp;
+
+            if (shift >= sn * FLINT_BITS)
+            {
+                /* do nothing */
+            }
+#if 0
+            else if (xn == 1 && yn == 1 && sn == 2 && shift < FLINT_BITS)  /* Fastest path. */
+            {
+                mp_limb_t hi, lo, x0, y0;
+
+                x0 = ARF_NOPTR_D(xm)[0];
+                y0 = ARF_NOPTR_D(ym)[0];
+
+                umul_ppmm(hi, lo, x0, y0);
+
+                lo = (lo >> shift) | (hi << (FLINT_BITS - shift));
+                hi = (hi >> shift);
+
+                if (xnegative ^ ynegative)
+                    sub_ddmmss(sum[1], sum[0], sum[1], sum[0], hi, lo);
+                else
+                    add_ssaaaa(sum[1], sum[0], sum[1], sum[0], hi, lo);
+            }
+            else if (xn == 2 && yn == 2 && shift < FLINT_BITS && sn <= 3)
+            {
+                mp_limb_t x1, x0, y1, y0;
+                mp_limb_t u3, u2, u1, u0;
+
+                x0 = ARF_NOPTR_D(xm)[0];
+                x1 = ARF_NOPTR_D(xm)[1];
+                y0 = ARF_NOPTR_D(ym)[0];
+                y1 = ARF_NOPTR_D(ym)[1];
+
+                nn_mul_2x2(u3, u2, u1, u0, x1, x0, y1, y0);
+
+                u1 = (u1 >> shift) | (u2 << (FLINT_BITS - shift));
+                u2 = (u2 >> shift) | (u3 << (FLINT_BITS - shift));
+                u3 = (u3 >> shift);
+
+                if (sn == 2)
+                {
+                    if (xnegative ^ ynegative)
+                        sub_ddmmss(sum[1], sum[0], sum[1], sum[0], u3, u2);
+                    else
+                        add_ssaaaa(sum[1], sum[0], sum[1], sum[0], u3, u2);
+                }
+                else
+                {
+                    if (xnegative ^ ynegative)
+                        sub_dddmmmsss2(sum[2], sum[1], sum[0], sum[2], sum[1], sum[0], u3, u2, u1);
+                    else
+                        add_sssaaaaaa2(sum[2], sum[1], sum[0], sum[2], sum[1], sum[0], u3, u2, u1);
+                }
+            }
+#endif
+            else if (xn <= 2 && yn <= 2 && sn <= 3)
+            {
+                mp_limb_t x1, x0, y1, y0;
+                mp_limb_t u3, u2, u1, u0;
+
+                if (xn == 1 && yn == 1)
+                {
+                    x0 = ARF_NOPTR_D(xm)[0];
+                    y0 = ARF_NOPTR_D(ym)[0];
+                    umul_ppmm(u3, u2, x0, y0);
+                    u1 = u0 = 0;
+                }
+                else if (xn == 2 && yn == 2)
+                {
+                    x0 = ARF_NOPTR_D(xm)[0];
+                    x1 = ARF_NOPTR_D(xm)[1];
+                    y0 = ARF_NOPTR_D(ym)[0];
+                    y1 = ARF_NOPTR_D(ym)[1];
+                    nn_mul_2x2(u3, u2, u1, u0, x1, x0, y1, y0);
+                }
+                else if (xn == 1)
+                {
+                    x0 = ARF_NOPTR_D(xm)[0];
+                    y0 = ARF_NOPTR_D(ym)[0];
+                    y1 = ARF_NOPTR_D(ym)[1];
+                    nn_mul_2x1(u3, u2, u1, y1, y0, x0);
+                    u0 = 0;
+                }
+                else
+                {
+                    x0 = ARF_NOPTR_D(xm)[0];
+                    x1 = ARF_NOPTR_D(xm)[1];
+                    y0 = ARF_NOPTR_D(ym)[0];
+                    nn_mul_2x1(u3, u2, u1, x1, x0, y0);
+                    u0 = 0;
+                }
+
+                if (sn == 2)
+                {
+                    if (shift < FLINT_BITS)
+                    {
+                        u2 = (u2 >> shift) | (u3 << (FLINT_BITS - shift));
+                        u3 = (u3 >> shift);
+                    }
+                    else if (shift == FLINT_BITS)
+                    {
+                        u2 = u3;
+                        u3 = 0;
+                    }
+                    else /* FLINT_BITS < shift < 2 * FLINT_BITS */
+                    {
+                        u2 = (u3 >> (shift - FLINT_BITS));
+                        u3 = 0;
+                    }
+
+                    if (xnegative ^ ynegative)
+                        sub_ddmmss(sum[1], sum[0], sum[1], sum[0], u3, u2);
+                    else
+                        add_ssaaaa(sum[1], sum[0], sum[1], sum[0], u3, u2);
+                }
+                else if (sn == 3)
+                {
+                    if (shift < FLINT_BITS)
+                    {
+                        u1 = (u1 >> shift) | (u2 << (FLINT_BITS - shift));
+                        u2 = (u2 >> shift) | (u3 << (FLINT_BITS - shift));
+                        u3 = (u3 >> shift);
+                    }
+                    else if (shift == FLINT_BITS)
+                    {
+                        u1 = u2;
+                        u2 = u3;
+                        u3 = 0;
+                    }
+                    else if (shift < 2 * FLINT_BITS)
+                    {
+                        u1 = (u3 << (2 * FLINT_BITS - shift)) | (u2 >> (shift - FLINT_BITS));
+                        u2 = (u3 >> (shift - FLINT_BITS));
+                        u3 = 0;
+                    }
+                    else if (shift == 2 * FLINT_BITS)
+                    {
+                        u1 = u3;
+                        u2 = 0;
+                        u3 = 0;
+                    }
+                    else  /* 2 * FLINT_BITS < shift < 3 * FLINT_BITS */
+                    {
+                        u1 = (u3 >> (shift - 2 * FLINT_BITS));
+                        u2 = 0;
+                        u3 = 0;
+                    }
+
+                    if (xnegative ^ ynegative)
+                        sub_dddmmmsss2(sum[2], sum[1], sum[0], sum[2], sum[1], sum[0], u3, u2, u1);
+                    else
+                        add_sssaaaaaa2(sum[2], sum[1], sum[0], sum[2], sum[1], sum[0], u3, u2, u1);
+                }
+            }
+            else
+            {
+                mp_srcptr xptr, yptr;
+
+                xptr = (xn <= ARF_NOPTR_LIMBS) ? ARF_NOPTR_D(xm) : ARF_PTR_D(xm);
+                yptr = (yn <= ARF_NOPTR_LIMBS) ? ARF_NOPTR_D(ym) : ARF_PTR_D(ym);
+
+                _arb_dot_addmul_generic(sum, &serr, tmp, sn, xptr, xn, yptr, yn, xnegative ^ ynegative, shift);
+            }
+        }
+    }
+
+    xnegative = 0;
+    if (sum[sn - 1] >= LIMB_TOP)
+    {
+        mpn_neg(sum, sum, sn);
+        xnegative = 1;
+    }
+
+    if (sum[sn - 1] == 0)
+    {
+        slong sum_exp2;
+        mp_size_t sn2;
+
+        sn2 = sn;
+        sum_exp2 = sum_exp; 
+
+        while (sn2 > 0 && sum[sn2 - 1] == 0)
+        {
+            sum_exp2 -= FLINT_BITS;
+            sn2--;
+        }
+
+        if (sn2 == 0)
+        {
+            arf_zero(arb_midref(res));
+        }
+        else
+        {
+            _arf_set_round_mpn(arb_midref(res), &exp, sum, sn2, xnegative ^ subtract, prec, ARF_RND_DOWN);
+            _fmpz_set_si_small(ARF_EXPREF(arb_midref(res)), exp + sum_exp2);
+        }
+    }
+    else
+    {
+
+        if (sn == 2)
+            _arf_set_round_uiui(arb_midref(res), &exp, sum[1], sum[0], xnegative ^ subtract, prec, ARF_RND_DOWN);
+        else
+            _arf_set_round_mpn(arb_midref(res), &exp, sum, sn, xnegative ^ subtract, prec, ARF_RND_DOWN);
+
+        _fmpz_set_si_small(ARF_EXPREF(arb_midref(res)), exp + sum_exp);
+    }
+
+    ARF_ADD_TMP_FREE(sum, alloc);
+}
--- a/arb/test/t-approx_dot.c
+++ b/arb/test/t-approx_dot.c
@ -0,0 +1,251 @@
+/*
+    Copyright (C) 2018 Fredrik Johansson
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "arb.h"
+
+int main()
+{
+    slong iter;
+    flint_rand_t state;
+
+    flint_printf("approx_dot....");
+    fflush(stdout);
+
+    flint_randinit(state);
+
+    for (iter = 0; iter < 100000 * arb_test_multiplier(); iter++)
+    {
+        arb_ptr x, y;
+        arb_t s1, s2, z;
+        slong i, len, prec, xbits, ybits, ebits;
+        int initial, subtract, revx, revy;
+
+        if (n_randint(state, 100) == 0)
+            len = n_randint(state, 100);
+        else if (n_randint(state, 10) == 0)
+            len = n_randint(state, 10);
+        else
+            len = n_randint(state, 3);
+
+        if (n_randint(state, 10) != 0 || len > 10)
+        {
+            prec = 2 + n_randint(state, 500);
+            xbits = 2 + n_randint(state, 500);
+            ybits = 2 + n_randint(state, 500);
+        }
+        else
+        {
+            prec = 2 + n_randint(state, 4000);
+            xbits = 2 + n_randint(state, 4000);
+            ybits = 2 + n_randint(state, 4000);
+        }
+
+        if (n_randint(state, 100) == 0)
+            ebits = 2 + n_randint(state, 100);
+        else
+            ebits = 2 + n_randint(state, 10);
+
+        initial = n_randint(state, 2);
+        subtract = n_randint(state, 2);
+        revx = n_randint(state, 2);
+        revy = n_randint(state, 2);
+
+        x = _arb_vec_init(len);
+        y = _arb_vec_init(len);
+        arb_init(s1);
+        arb_init(s2);
+        arb_init(z);
+
+        switch (n_randint(state, 3))
+        {
+            case 0:
+                for (i = 0; i < len; i++)
+                {
+                    arb_randtest(x + i, state, xbits, ebits);
+                    arb_randtest(y + i, state, ybits, ebits);
+                }
+                break;
+
+            /* Test with cancellation */
+            case 1:
+                for (i = 0; i < len; i++)
+                {
+                    if (i <= len / 2)
+                    {
+                        arb_randtest(x + i, state, xbits, ebits);
+                        arb_randtest(y + i, state, ybits, ebits);
+                    }
+                    else
+                    {
+                        arb_neg(x + i, x + len - i - 1);
+                        arb_set(y + i, y + len - i - 1);
+                    }
+                }
+                break;
+
+            default:
+                for (i = 0; i < len; i++)
+                {
+                    if (i <= len / 2)
+                    {
+                        arb_randtest(x + i, state, xbits, ebits);
+                        arb_randtest(y + i, state, ybits, ebits);
+                    }
+                    else
+                    {
+                        arb_neg_round(x + i, x + len - i - 1, 2 + n_randint(state, 500));
+                        arb_set_round(y + i, y + len - i - 1, 2 + n_randint(state, 500));
+                    }
+                }
+                break;
+        }
+
+        arb_randtest(s1, state, 200, 100);
+        arb_randtest(s2, state, 200, 100);
+        arb_randtest(z, state, xbits, ebits);
+
+        arb_approx_dot(s1, initial ? z : NULL, subtract,
+            revx ? (x + len - 1) : x, revx ? -1 : 1,
+            revy ? (y + len - 1) : y, revy ? -1 : 1,
+            len, prec);
+        mag_zero(arb_radref(s1));
+
+        /* With the fast algorithm, we expect identical results when
+           reversing the vectors. */
+        if (ebits <= 12)
+        {
+            arb_approx_dot(s2, initial ? z : NULL, subtract,
+                !revx ? (x + len - 1) : x, !revx ? -1 : 1,
+                !revy ? (y + len - 1) : y, !revy ? -1 : 1,
+                len, prec);
+            mag_zero(arb_radref(s2));
+
+            if (!arb_equal(s1, s2))
+            {
+                flint_printf("FAIL (reversal)\n\n");
+                flint_printf("iter = %wd, len = %wd, prec = %wd, ebits = %wd\n\n", iter, len, prec, ebits);
+
+                if (initial)
+                {
+                    flint_printf("z = ", i); arb_printn(z, 100, ARB_STR_MORE); flint_printf(" (%wd)\n\n", arb_bits(z));
+                }
+
+                for (i = 0; i < len; i++)
+                {
+                    flint_printf("x[%wd] = ", i); arb_printn(x + i, 100, ARB_STR_MORE); flint_printf(" (%wd)\n", arb_bits(x + i));
+                    flint_printf("y[%wd] = ", i); arb_printn(y + i, 100, ARB_STR_MORE); flint_printf(" (%wd)\n", arb_bits(y + i));
+                }
+                flint_printf("\n\n");
+                flint_printf("s1 = "); arb_printn(s1, 100, ARB_STR_MORE); flint_printf("\n\n");
+                flint_printf("s2 = "); arb_printn(s2, 100, ARB_STR_MORE); flint_printf("\n\n");
+                flint_abort();
+            }
+        }
+
+        /* Verify that radii are ignored */
+        for (i = 0; i < len; i++)
+        {
+            arb_get_mid_arb(x + i, x + i);
+            arb_get_mid_arb(y + i, y + i);
+        }
+        arb_get_mid_arb(z, z);
+
+        arb_approx_dot(s2, initial ? z : NULL, subtract,
+            revx ? (x + len - 1) : x, revx ? -1 : 1,
+            revy ? (y + len - 1) : y, revy ? -1 : 1,
+            len, prec);
+        mag_zero(arb_radref(s2));
+
+        if (!arb_equal(s1, s2))
+        {
+            flint_printf("FAIL (radii)\n\n");
+            flint_printf("iter = %wd, len = %wd, prec = %wd, ebits = %wd\n\n", iter, len, prec, ebits);
+
+            if (initial)
+            {
+                flint_printf("z = ", i); arb_printn(z, 100, ARB_STR_MORE); flint_printf(" (%wd)\n\n", arb_bits(z));
+            }
+
+            for (i = 0; i < len; i++)
+            {
+                flint_printf("x[%wd] = ", i); arb_printn(x + i, 100, ARB_STR_MORE); flint_printf(" (%wd)\n", arb_bits(x + i));
+                flint_printf("y[%wd] = ", i); arb_printn(y + i, 100, ARB_STR_MORE); flint_printf(" (%wd)\n", arb_bits(y + i));
+            }
+            flint_printf("\n\n");
+            flint_printf("s1 = "); arb_printn(s1, 100, ARB_STR_MORE); flint_printf("\n\n");
+            flint_printf("s2 = "); arb_printn(s2, 100, ARB_STR_MORE); flint_printf("\n\n");
+            flint_abort();
+        }
+
+        /* Compare with arb_dot */
+        arb_approx_dot(s2, initial ? z : NULL, subtract,
+            revx ? (x + len - 1) : x, revx ? -1 : 1,
+            revy ? (y + len - 1) : y, revy ? -1 : 1,
+            len, prec);
+
+        {
+            mag_t err, xx, yy;
+
+            mag_init(err);
+            mag_init(xx);
+            mag_init(yy);
+
+            if (initial)
+                arb_get_mag(err, z);
+
+            for (i = 0; i < len; i++)
+            {
+                arb_get_mag(xx, revx ? x + len - 1 - i : x + i);
+                arb_get_mag(yy, revx ? y + len - 1 - i : y + i);
+                mag_addmul(err, xx, yy);
+            }
+
+            mag_mul_2exp_si(err, err, -prec + 2);
+            arb_add_error_mag(s2, err);
+
+            if (!arb_contains(s2, s1))
+            {
+                flint_printf("FAIL (inclusion)\n\n");
+                flint_printf("iter = %wd, len = %wd, prec = %wd, ebits = %wd\n\n", iter, len, prec, ebits);
+
+                if (initial)
+                {
+                    flint_printf("z = ", i); arb_printn(z, 100, ARB_STR_MORE); flint_printf(" (%wd)\n\n", arb_bits(z));
+                }
+
+                for (i = 0; i < len; i++)
+                {
+                    flint_printf("x[%wd] = ", i); arb_printn(x + i, 100, ARB_STR_MORE); flint_printf(" (%wd)\n", arb_bits(x + i));
+                    flint_printf("y[%wd] = ", i); arb_printn(y + i, 100, ARB_STR_MORE); flint_printf(" (%wd)\n", arb_bits(y + i));
+                }
+                flint_printf("\n\n");
+                flint_printf("s1 = "); arb_printn(s1, 100, ARB_STR_MORE); flint_printf("\n\n");
+                flint_printf("s2 = "); arb_printn(s2, 100, ARB_STR_MORE); flint_printf("\n\n");
+                flint_abort();
+            }
+
+            mag_clear(err);
+            mag_clear(xx);
+            mag_clear(yy);
+        }
+
+        arb_clear(s1);
+        arb_clear(s2);
+        arb_clear(z);
+        _arb_vec_clear(x, len);
+        _arb_vec_clear(y, len);
+    }
+
+    flint_randclear(state);
+    flint_cleanup();
+    flint_printf("PASS\n");
+    return EXIT_SUCCESS;
+}
--- a/arb_mat.h
+++ b/arb_mat.h
@ -351,6 +351,7 @@ int arb_mat_solve_precond(arb_mat_t X, const arb_mat_t A, const arb_mat_t B, slo
 int arb_mat_solve_preapprox(arb_mat_t X, const arb_mat_t A,
    const arb_mat_t B, const arb_mat_t R, const arb_mat_t T, slong prec);

+void arb_mat_approx_mul(arb_mat_t C, const arb_mat_t A, const arb_mat_t B, slong prec);
 void arb_mat_approx_solve_triu(arb_mat_t X, const arb_mat_t U, const arb_mat_t B, int unit, slong prec);
 void arb_mat_approx_solve_tril(arb_mat_t X, const arb_mat_t L, const arb_mat_t B, int unit, slong prec);
 int arb_mat_approx_lu(slong * P, arb_mat_t LU, const arb_mat_t A, slong prec);
--- a/arb_mat/approx_lu.c
+++ b/arb_mat/approx_lu.c
@ -85,11 +85,11 @@ arb_mat_approx_lu_classical(slong * P, arb_mat_t LU, const arb_mat_t A, slong pr
        else if (r != row)
            arb_mat_swap_rows(LU, P, row, r);

-        arf_set(d, arb_midref(a[row] + col));
+        arf_ui_div(d, 1, arb_midref(a[row] + col), prec, ARB_RND);

        for (j = row + 1; j < m; j++)
        {
-            arf_div(arb_midref(e), arb_midref(a[j] + col), d, prec, ARB_RND);
+            arf_mul(arb_midref(e), arb_midref(a[j] + col), d, prec, ARB_RND);
            arb_neg(e, e);
            _arb_vec_approx_scalar_addmul(a[j] + col,
                a[row] + col, n - col, e, prec);
@ -156,11 +156,10 @@ arb_mat_approx_lu_recursive(slong * P, arb_mat_t LU, const arb_mat_t A, slong pr
    arb_mat_approx_solve_tril(A01, A00, A01, 1, prec);

    {
-        /* arb_mat_submul(A11, A11, A10, A01, prec); */
+        /* arb_mat_approx_submul(A11, A11, A10, A01, prec); */
        arb_mat_t T;
        arb_mat_init(T, A10->r, A01->c);
-        arb_mat_mul(T, A10, A01, prec);
-        arb_mat_get_mid(T, T);
+        arb_mat_approx_mul(T, A10, A01, prec);
        arb_mat_sub(A11, A11, T, prec);
        arb_mat_get_mid(A11, A11);
        arb_mat_clear(T);
@ -192,4 +191,3 @@ arb_mat_approx_lu(slong * P, arb_mat_t LU, const arb_mat_t A, slong prec)
    else
        return arb_mat_approx_lu_recursive(P, LU, A, prec);
 }
-
--- a/arb_mat/approx_mul.c
+++ b/arb_mat/approx_mul.c
@ -0,0 +1,152 @@
+/*
+    Copyright (C) 2018 Fredrik Johansson
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "arb_mat.h"
+
+void
+arb_mat_approx_mul_classical(arb_mat_t C, const arb_mat_t A, const arb_mat_t B, slong prec)
+{
+    slong ar, br, bc, i, j, k;
+
+    ar = arb_mat_nrows(A);
+    br = arb_mat_nrows(B);
+    bc = arb_mat_ncols(B);
+
+    if (br == 0)
+    {
+        arb_mat_zero(C);
+        return;
+    }
+
+    if (A == C || B == C)
+    {
+        arb_mat_t T;
+        arb_mat_init(T, ar, bc);
+        arb_mat_approx_mul_classical(T, A, B, prec);
+        arb_mat_swap(T, C);
+        arb_mat_clear(T);
+        return;
+    }
+
+    if (br <= 2)
+    {
+        for (i = 0; i < ar; i++)
+        {
+            for (j = 0; j < bc; j++)
+            {
+                arf_mul(arb_midref(arb_mat_entry(C, i, j)),
+                          arb_midref(arb_mat_entry(A, i, 0)),
+                          arb_midref(arb_mat_entry(B, 0, j)), prec, ARB_RND);
+
+                for (k = 1; k < br; k++)
+                {
+                    arf_addmul(arb_midref(arb_mat_entry(C, i, j)),
+                                 arb_midref(arb_mat_entry(A, i, k)),
+                                 arb_midref(arb_mat_entry(B, k, j)), prec, ARB_RND);
+                }
+            }
+        }
+    }
+    else
+    {
+        arb_ptr tmp;
+        TMP_INIT;
+
+        TMP_START;
+        tmp = TMP_ALLOC(sizeof(arb_struct) * br * bc);
+
+        for (i = 0; i < br; i++)
+            for (j = 0; j < bc; j++)
+                tmp[j * br + i] = *arb_mat_entry(B, i, j);
+
+        for (i = 0; i < ar; i++)
+        {
+            for (j = 0; j < bc; j++)
+            {
+                arb_approx_dot(arb_mat_entry(C, i, j), NULL, 0,
+                    A->rows[i], 1, tmp + j * br, 1, br, prec);
+            }
+        }
+
+        TMP_END;
+    }
+}
+
+int
+arb_mat_is_exact(const arb_mat_t A)
+{
+    slong i, j;
+
+    for (i = 0; i < arb_mat_nrows(A); i++)
+        for (j = 0; j < arb_mat_ncols(A); j++)
+            if (!mag_is_zero(arb_radref(arb_mat_entry(A, i, j))))
+                return 0;
+
+    return 1;
+}
+
+void
+arb_mat_approx_mul(arb_mat_t C, const arb_mat_t A, const arb_mat_t B, slong prec)
+{
+    slong cutoff;
+
+    /* todo: detect small-integer matrices */
+    if (prec <= 2 * FLINT_BITS)
+        cutoff = 120;
+    else if (prec <= 16 * FLINT_BITS)
+        cutoff = 60;
+    else
+        cutoff = 40;
+
+    if (arb_mat_nrows(A) <= cutoff || arb_mat_ncols(A) <= cutoff ||
+        arb_mat_ncols(B) <= cutoff)
+    {
+        arb_mat_approx_mul_classical(C, A, B, prec);
+    }
+    else
+    {
+        if (arb_mat_is_exact(A) && arb_mat_is_exact(B))
+        {
+            arb_mat_mul(C, A, B, prec);
+        }
+        else
+        {
+            arb_mat_t AM, BM;
+
+            if (arb_mat_is_exact(A))
+            {
+                arb_mat_init(BM, arb_mat_nrows(B), arb_mat_ncols(B));
+                arb_mat_get_mid(BM, B);
+                arb_mat_mul(C, A, BM, prec);
+                arb_mat_clear(BM);
+            }
+            else if (arb_mat_is_exact(A))
+            {
+                arb_mat_init(AM, arb_mat_nrows(A), arb_mat_ncols(A));
+                arb_mat_get_mid(AM, A);
+                arb_mat_mul(C, AM, B, prec);
+                arb_mat_clear(AM);
+            }
+            else
+            {
+                arb_mat_init(BM, arb_mat_nrows(B), arb_mat_ncols(B));
+                arb_mat_get_mid(BM, B);
+                arb_mat_init(AM, arb_mat_nrows(A), arb_mat_ncols(A));
+                arb_mat_get_mid(AM, A);
+                arb_mat_mul(C, AM, BM, prec);
+                arb_mat_clear(AM);
+                arb_mat_clear(BM);
+            }
+        }
+
+        arb_mat_get_mid(C, C);
+    }
+}
--- a/arb_mat/approx_solve_lu_precomp.c
+++ b/arb_mat/approx_solve_lu_precomp.c
@ -12,24 +12,11 @@

 #include "arb_mat.h"

-static void
-arb_approx_submul(arb_t z, const arb_t x, const arb_t y, slong prec)
-{
-    arf_submul(arb_midref(z),
-               arb_midref(x), arb_midref(y), prec, ARF_RND_DOWN);
-}
-
-static void
-arb_approx_div(arb_t z, const arb_t x, const arb_t y, slong prec)
-{
-    arf_div(arb_midref(z), arb_midref(x), arb_midref(y), prec, ARB_RND);
-}
-
 void
 arb_mat_approx_solve_lu_precomp(arb_mat_t X, const slong * perm,
    const arb_mat_t A, const arb_mat_t B, slong prec)
 {
-    slong i, j, c, n, m;
+    slong i, c, n, m;

    n = arb_mat_nrows(X);
    m = arb_mat_ncols(X);
@ -61,40 +48,6 @@ arb_mat_approx_solve_lu_precomp(arb_mat_t X, const slong * perm,
    }

    arb_mat_get_mid(X, X);
-
-    /* todo: solve_tril and solve_triu have some overhead; should be
-       able to eliminate the basecase code below */
-    if (n >= 8 && m >= 8)
-    {
    arb_mat_approx_solve_tril(X, A, X, 1, prec);
    arb_mat_approx_solve_triu(X, A, X, 0, prec);
-        return;
-    }
-
-    for (c = 0; c < m; c++)
-    {
-        /* solve Ly = b */
-        for (i = 1; i < n; i++)
-        {
-            for (j = 0; j < i; j++)
-            {
-                arb_approx_submul(arb_mat_entry(X, i, c),
-                    arb_mat_entry(A, i, j), arb_mat_entry(X, j, c), prec);
-            }
-        }
-
-        /* solve Ux = y */
-        for (i = n - 1; i >= 0; i--)
-        {
-            for (j = i + 1; j < n; j++)
-            {
-                arb_approx_submul(arb_mat_entry(X, i, c),
-                    arb_mat_entry(A, i, j), arb_mat_entry(X, j, c), prec);
-            }
-
-            arb_approx_div(arb_mat_entry(X, i, c), arb_mat_entry(X, i, c),
-                arb_mat_entry(A, i, i), prec);
-        }
-    }
 }
-
--- a/arb_mat/approx_solve_tril.c
+++ b/arb_mat/approx_solve_tril.c
@ -11,26 +11,6 @@

 #include "arb_mat.h"

-static void
-arb_approx_set(arb_t z, const arb_t x)
-{
-    arf_set(arb_midref(z), arb_midref(x));
-}
-
-static void
-arb_approx_sub(arb_t z, const arb_t x, const arb_t y, slong prec)
-{
-    arf_sub(arb_midref(z),
-               arb_midref(x), arb_midref(y), prec, ARF_RND_DOWN);
-}
-
-static void
-arb_approx_addmul(arb_t z, const arb_t x, const arb_t y, slong prec)
-{
-    arf_addmul(arb_midref(z),
-               arb_midref(x), arb_midref(y), prec, ARF_RND_DOWN);
-}
-
 static void
 arb_approx_div(arb_t z, const arb_t x, const arb_t y, slong prec)
 {
@ -41,7 +21,7 @@ void
 arb_mat_approx_solve_tril_classical(arb_mat_t X,
        const arb_mat_t L, const arb_mat_t B, int unit, slong prec)
 {
-    slong i, j, k, n, m;
+    slong i, j, n, m;
    arb_ptr tmp;
    arb_t s;

@ -49,29 +29,28 @@ arb_mat_approx_solve_tril_classical(arb_mat_t X,
    m = B->c;

    arb_init(s);
-    tmp = _arb_vec_init(n);
+    tmp = flint_malloc(sizeof(arb_struct) * n);

    for (i = 0; i < m; i++)
    {
        for (j = 0; j < n; j++)
-            arb_approx_set(tmp + j, arb_mat_entry(X, j, i));
+            tmp[j] = *arb_mat_entry(X, j, i);

        for (j = 0; j < n; j++)
        {
-            arb_zero(s);
-            for (k = 0; k < j; k++)
-                arb_approx_addmul(s, L->rows[j] + k, tmp + k, prec);
-            arb_approx_sub(s, arb_mat_entry(B, j, i), s, prec);
+            arb_approx_dot(s, arb_mat_entry(B, j, i), 1, L->rows[j], 1, tmp, 1, j, prec);
+
            if (!unit)
-                arb_approx_div(s, s, arb_mat_entry(L, j, j), prec);
-            arb_approx_set(tmp + j, s);
+                arb_approx_div(tmp + j, s, arb_mat_entry(L, j, j), prec);
+            else
+                arb_swap(tmp + j, s);
        }

        for (j = 0; j < n; j++)
-            arb_approx_set(arb_mat_entry(X, j, i), tmp + j);
+            *arb_mat_entry(X, j, i) = tmp[j];
    }

-    _arb_vec_clear(tmp, n);
+    flint_free(tmp);
    arb_clear(s);
 }

@ -107,8 +86,7 @@ arb_mat_approx_solve_tril_recursive(arb_mat_t X,

    /* arb_mat_submul(XY, BY, LC, XX); */
    arb_mat_init(T, LC->r, BX->c);
-    arb_mat_mul(T, LC, XX, prec);
-    arb_mat_get_mid(T, T);
+    arb_mat_approx_mul(T, LC, XX, prec);
    arb_mat_sub(XY, BY, T, prec);
    arb_mat_get_mid(XY, XY);
    arb_mat_clear(T);
@ -128,9 +106,8 @@ void
 arb_mat_approx_solve_tril(arb_mat_t X, const arb_mat_t L,
                                    const arb_mat_t B, int unit, slong prec)
 {
-    if (B->r < 8 || B->c < 8)
+    if (B->r < 40 || B->c < 40)
        arb_mat_approx_solve_tril_classical(X, L, B, unit, prec);
    else
        arb_mat_approx_solve_tril_recursive(X, L, B, unit, prec);
 }
-
--- a/arb_mat/approx_solve_triu.c
+++ b/arb_mat/approx_solve_triu.c
@ -11,26 +11,6 @@

 #include "arb_mat.h"

-static void
-arb_approx_set(arb_t z, const arb_t x)
-{
-    arf_set(arb_midref(z), arb_midref(x));
-}
-
-static void
-arb_approx_sub(arb_t z, const arb_t x, const arb_t y, slong prec)
-{
-    arf_sub(arb_midref(z),
-               arb_midref(x), arb_midref(y), prec, ARF_RND_DOWN);
-}
-
-static void
-arb_approx_addmul(arb_t z, const arb_t x, const arb_t y, slong prec)
-{
-    arf_addmul(arb_midref(z),
-               arb_midref(x), arb_midref(y), prec, ARF_RND_DOWN);
-}
-
 static void
 arb_approx_div(arb_t z, const arb_t x, const arb_t y, slong prec)
 {
@ -41,39 +21,36 @@ void
 arb_mat_approx_solve_triu_classical(arb_mat_t X, const arb_mat_t U,
    const arb_mat_t B, int unit, slong prec)
 {
-    slong i, j, k, n, m;
+    slong i, j, n, m;
    arb_ptr tmp;
    arb_t s;

    n = U->r;
    m = B->c;

-    tmp = _arb_vec_init(n);
    arb_init(s);
+    tmp = flint_malloc(sizeof(arb_struct) * n);

    for (i = 0; i < m; i++)
    {
        for (j = 0; j < n; j++)
-            arb_approx_set(tmp + j, arb_mat_entry(X, j, i));
+            tmp[j] = *arb_mat_entry(X, j, i);

        for (j = n - 1; j >= 0; j--)
        {
-            arb_zero(s);
-            for (k = 0; k < n - j - 1; k++)
-                arb_approx_addmul(s, U->rows[j] + j + 1 + k, tmp + j + 1 + k, prec);
+            arb_approx_dot(s, arb_mat_entry(B, j, i), 1, U->rows[j] + j + 1, 1, tmp + j + 1, 1, n - j - 1, prec);

-            arb_approx_sub(s, arb_mat_entry(B, j, i), s, prec);
            if (!unit)
-                arb_approx_div(s, s, arb_mat_entry(U, j, j), prec);
-
-            arb_approx_set(tmp + j, s);
+                arb_approx_div(tmp + j, s, arb_mat_entry(U, j, j), prec);
+            else
+                arb_swap(tmp + j, s);
        }

        for (j = 0; j < n; j++)
-            arb_approx_set(arb_mat_entry(X, j, i), tmp + j);
+            *arb_mat_entry(X, j, i) = tmp[j];
    }

-    _arb_vec_clear(tmp, n);
+    flint_free(tmp);
    arb_clear(s);
 }

@ -108,8 +85,7 @@ arb_mat_approx_solve_triu_recursive(arb_mat_t X,
    arb_mat_approx_solve_triu(XY, UD, BY, unit, prec);

    arb_mat_init(T, UB->r, XY->c);
-    arb_mat_mul(T, UB, XY, prec);
-    arb_mat_get_mid(T, T);
+    arb_mat_approx_mul(T, UB, XY, prec);
    arb_mat_sub(XX, BX, T, prec);
    arb_mat_get_mid(XX, XX);
    arb_mat_clear(T);
@ -129,9 +105,8 @@ void
 arb_mat_approx_solve_triu(arb_mat_t X, const arb_mat_t U,
                                    const arb_mat_t B, int unit, slong prec)
 {
-    if (B->r < 8 || B->c < 8)
+    if (B->r < 40 || B->c < 40)
        arb_mat_approx_solve_triu_classical(X, U, B, unit, prec);
    else
        arb_mat_approx_solve_triu_recursive(X, U, B, unit, prec);
 }
-
--- a/arb_mat/solve_lu_precomp.c
+++ b/arb_mat/solve_lu_precomp.c
@ -15,7 +15,7 @@ void
 arb_mat_solve_lu_precomp(arb_mat_t X, const slong * perm,
    const arb_mat_t A, const arb_mat_t B, slong prec)
 {
-    slong i, c, n, m;
+    slong i, j, c, n, m;

    n = arb_mat_nrows(X);
    m = arb_mat_ncols(X);
@ -46,6 +46,37 @@ arb_mat_solve_lu_precomp(arb_mat_t X, const slong * perm,
        }
    }

+    /* solve_tril and solve_triu have some overhead */
+    if (n >= 4)
+    {
        arb_mat_solve_tril(X, A, X, 1, prec);
        arb_mat_solve_triu(X, A, X, 0, prec);
+        return;
+    }
+
+    for (c = 0; c < m; c++)
+    {
+        /* solve Ly = b */
+        for (i = 1; i < n; i++)
+        {
+            for (j = 0; j < i; j++)
+            {
+                arb_submul(arb_mat_entry(X, i, c),
+                    arb_mat_entry(A, i, j), arb_mat_entry(X, j, c), prec);
+            }
+        }
+
+        /* solve Ux = y */
+        for (i = n - 1; i >= 0; i--)
+        {
+            for (j = i + 1; j < n; j++)
+            {
+                arb_submul(arb_mat_entry(X, i, c),
+                    arb_mat_entry(A, i, j), arb_mat_entry(X, j, c), prec);
+            }
+
+            arb_div(arb_mat_entry(X, i, c), arb_mat_entry(X, i, c),
+                arb_mat_entry(A, i, i), prec);
+        }
+    }
 }
--- a/doc/source/acb.rst
+++ b/doc/source/acb.rst
@ -539,6 +539,12 @@ Dot product
    final rounding. This can be extremely slow and is only intended
    for testing.

+.. function:: void acb_approx_dot(acb_t res, const acb_t s, int subtract, acb_srcptr x, slong xstep, acb_srcptr y, slong ystep, slong len, slong prec)
+
+    Computes an approximate dot product *without error bounds*.
+    The radii of the inputs are ignored (only the midpoints are read)
+    and only the midpoint of the output is written.
+
 Mathematical constants
 -------------------------------------------------------------------------------

--- a/doc/source/acb_mat.rst
+++ b/doc/source/acb_mat.rst
@ -487,6 +487,12 @@ Component and error operations
 Approximate solving
 -------------------------------------------------------------------------------

+.. function:: void acb_mat_approx_mul(acb_mat_t res, const acb_mat_t mat1, const acb_mat_t mat2, slong prec)
+
+    Approximate matrix multiplication. The input radii are ignored and
+    the output matrix is set to an approximate floating-point result.
+    The radii in the output matrix will *not* necessarily be zeroed.
+
 .. function:: void acb_mat_approx_solve_triu(acb_mat_t X, const acb_mat_t U, const acb_mat_t B, int unit, slong prec)

 .. function:: void acb_mat_approx_solve_tril(acb_mat_t X, const acb_mat_t L, const acb_mat_t B, int unit, slong prec)
--- a/doc/source/arb.rst
+++ b/doc/source/arb.rst
@ -822,6 +822,12 @@ Dot product
    final rounding. This can be extremely slow and is only intended
    for testing.

+.. function:: void arb_approx_dot(arb_t res, const arb_t s, int subtract, arb_srcptr x, slong xstep, arb_srcptr y, slong ystep, slong len, slong prec)
+
+    Computes an approximate dot product *without error bounds*.
+    The radii of the inputs are ignored (only the midpoints are read)
+    and only the midpoint of the output is written.
+
 Powers and roots
 -------------------------------------------------------------------------------

--- a/doc/source/arb_mat.rst
+++ b/doc/source/arb_mat.rst
@ -663,6 +663,12 @@ Component and error operations
 Approximate solving
 -------------------------------------------------------------------------------

+.. function:: void arb_mat_approx_mul(arb_mat_t res, const arb_mat_t mat1, const arb_mat_t mat2, slong prec)
+
+    Approximate matrix multiplication. The input radii are ignored and
+    the output matrix is set to an approximate floating-point result.
+    The radii in the output matrix will *not* necessarily be zeroed.
+
 .. function:: void arb_mat_approx_solve_triu(arb_mat_t X, const arb_mat_t U, const arb_mat_t B, int unit, slong prec)

 .. function:: void arb_mat_approx_solve_tril(arb_mat_t X, const arb_mat_t L, const arb_mat_t B, int unit, slong prec)