mirror of
https://github.com/vale981/arb
synced 2025-03-05 09:21:38 -05:00
fast arb dot product
This commit is contained in:
parent
35d6f7184a
commit
689d13c64d
6 changed files with 1357 additions and 0 deletions
7
arb.h
7
arb.h
|
@ -457,6 +457,13 @@ void arb_submul_si(arb_t z, const arb_t x, slong y, slong prec);
|
|||
void arb_submul_ui(arb_t z, const arb_t x, ulong y, slong prec);
|
||||
void arb_submul_fmpz(arb_t z, const arb_t x, const fmpz_t y, slong prec);
|
||||
|
||||
void arb_dot_simple(arb_t res, const arb_t initial, int subtract,
|
||||
arb_srcptr x, slong xstep, arb_srcptr y, slong ystep, slong len, slong prec);
|
||||
void arb_dot_precise(arb_t res, const arb_t initial, int subtract,
|
||||
arb_srcptr x, slong xstep, arb_srcptr y, slong ystep, slong len, slong prec);
|
||||
void arb_dot(arb_t res, const arb_t initial, int subtract,
|
||||
arb_srcptr x, slong xstep, arb_srcptr y, slong ystep, slong len, slong prec);
|
||||
|
||||
void arb_div(arb_t z, const arb_t x, const arb_t y, slong prec);
|
||||
void arb_div_arf(arb_t z, const arb_t x, const arf_t y, slong prec);
|
||||
void arb_div_si(arb_t z, const arb_t x, slong y, slong prec);
|
||||
|
|
988
arb/dot.c
Normal file
988
arb/dot.c
Normal file
|
@ -0,0 +1,988 @@
|
|||
/*
|
||||
Copyright (C) 2018 Fredrik Johansson
|
||||
|
||||
This file is part of Arb.
|
||||
|
||||
Arb is free software: you can redistribute it and/or modify it under
|
||||
the terms of the GNU Lesser General Public License (LGPL) as published
|
||||
by the Free Software Foundation; either version 2.1 of the License, or
|
||||
(at your option) any later version. See <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "arb.h"
|
||||
|
||||
/* We need uint64_t instead of mp_limb_t on 32-bit systems for
|
||||
safe summation of 30-bit error bounds. */
|
||||
#include <stdint.h>
|
||||
|
||||
void mpfr_mulhigh_n(mp_ptr rp, mp_srcptr np, mp_srcptr mp, mp_size_t n);
|
||||
void mpfr_sqrhigh_n(mp_ptr rp, mp_srcptr np, mp_size_t n);
|
||||
|
||||
|
||||
/* Add ((a * b) / 2^MAG_BITS) * 2^exp into srad * 2^srad_exp.
   Assumes that srad_exp >= exp and that overflow cannot occur.
   The unconditional "+ 1" compensates for the truncating shift so that
   the accumulated value remains an upper bound.
   Fix: the scratch identifiers were previously spelled __a, __b, __shift;
   identifiers containing a double underscore are reserved for the
   implementation (C11 7.1.3), so they are renamed here. */
#define RAD_ADDMUL(srad, srad_exp, a, b, exp) \
    do { \
        uint64_t _rad_a, _rad_b; \
        slong _rad_shift; \
        _rad_a = (a); \
        _rad_b = (b); \
        _rad_shift = (srad_exp) - (exp); \
        if (_rad_shift < MAG_BITS) \
            (srad) += ((_rad_a * _rad_b) >> (MAG_BITS + _rad_shift)) + 1; \
        else \
            (srad) += 1; \
    } while (0)
|
||||
|
||||
/* Like mag_set_ui_2exp_si, but assumes the resulting exponent fits in a
   small fmpz so that no promotion to a big exponent can occur.
   Sets z to an upper bound for x * 2^e.
   NOTE(review): only used by add_errors() below on 64-bit builds —
   presumably kept separate from mag_set_ui_2exp_si for speed; confirm
   whether it is needed at all ("Do we need this?" per the author). */
void
mag_set_ui_2exp_small(mag_t z, ulong x, slong e)
{
    /* Discard any heap-allocated exponent left in z before writing
       an inline exponent. */
    _fmpz_demote(MAG_EXPREF(z));

    if (x == 0)
    {
        MAG_EXP(z) = 0;
        MAG_MAN(z) = 0;
    }
    else
    {
        slong bits;
        mp_limb_t overflow;

        /* bits = position of the top set bit of x. */
        count_leading_zeros(bits, x);
        bits = FLINT_BITS - bits;

        if (bits <= MAG_BITS)
        {
            /* x fits in the MAG_BITS mantissa: shift up to normalize. */
            x = x << (MAG_BITS - bits);
        }
        else
        {
            /* Truncate to MAG_BITS bits, rounding up (+1) to preserve
               the upper-bound property. The increment can carry out of
               the mantissa; fold that carry into the exponent. */
            x = (x >> (bits - MAG_BITS)) + 1;
            overflow = x >> MAG_BITS;
            bits += overflow;
            x >>= overflow;
        }

        MAG_EXP(z) = bits + e;
        MAG_MAN(z) = x;
    }
}
|
||||
|
||||
/* Sets rad to an upper bound for
   (Aerr 2^Aexp + Berr 2^Bexp + Cerr 2^Cexp) 2^(-MAG_BITS).
   A zero error value means the corresponding term is absent (its
   exponent is then ignored and may be WORD_MIN).
   Assumes that overflow of the 64-bit accumulator cannot occur. */
static void
add_errors(mag_t rad, uint64_t Aerr, slong Aexp, uint64_t Berr, slong Bexp, uint64_t Cerr, slong Cexp)
{
    slong shift;

    /* Fold B into (Aerr, Aexp), aligning the smaller-exponent term to
       the larger exponent. The "+ 1" covers the bits lost to the
       truncating right shift, keeping the sum an upper bound. */
    if (Aerr && Berr)
    {
        if (Aexp >= Bexp)
        {
            shift = Aexp - Bexp;
            if (shift < 64)
                Aerr = Aerr + (Berr >> shift) + 1;
            else
                Aerr = Aerr + 1;
        }
        else
        {
            shift = Bexp - Aexp;
            if (shift < 64)
                Aerr = Berr + (Aerr >> shift) + 1;
            else
                Aerr = Berr + 1;
            Aexp = Bexp;
        }
    }
    else if (Berr)
    {
        Aerr = Berr;
        Aexp = Bexp;
    }

    /* Fold C into (Aerr, Aexp) in the same way. */
    if (Aerr && Cerr)
    {
        if (Aexp >= Cexp)
        {
            shift = Aexp - Cexp;
            if (shift < 64)
                Aerr = Aerr + (Cerr >> shift) + 1;
            else
                Aerr = Aerr + 1;
        }
        else
        {
            shift = Cexp - Aexp;
            if (shift < 64)
                Aerr = Cerr + (Aerr >> shift) + 1;
            else
                Aerr = Cerr + 1;
            Aexp = Cexp;
        }
    }
    else if (Cerr)
    {
        Aerr = Cerr;
        Aexp = Cexp;
    }

#if FLINT_BITS == 64
    mag_set_ui_2exp_small(rad, Aerr, Aexp - MAG_BITS);
#else
    /* On 32-bit systems the 64-bit accumulator may not fit in an ulong;
       go through a double with a small upward fudge factor so the
       result stays an upper bound. */
    mag_set_d(rad, Aerr * (1.0 + 1e-14));
    mag_mul_2exp_si(rad, rad, Aexp - MAG_BITS);
#endif
}
|
||||
|
||||
/* Writes to res a 2*nn-limb approximation of the product of
   {xptr, xn} and {yptr, yn}, using mpfr's mulhigh (or sqrhigh when the
   operands are identical) on operands padded or truncated to exactly
   nn limbs. Only the high limbs of the result are accurate; the caller
   accounts for the mulhigh error (see _arb_dot_addmul_generic). */
static void
mulhigh(mp_ptr res, mp_srcptr xptr, mp_size_t xn, mp_srcptr yptr, mp_size_t yn, mp_size_t nn)
{
    mp_ptr tmp, xxx, yyy;
    slong k;
    ARF_MUL_TMP_DECL;

    ARF_MUL_TMP_ALLOC(tmp, 2 * nn);
    xxx = tmp;
    yyy = tmp + nn;

    /* Copy the top min(xn, nn) (resp. min(yn, nn)) limbs of each
       operand to the top of the nn-limb buffers. */
    mpn_copyi(xxx + nn - FLINT_MIN(xn, nn), xptr + xn - FLINT_MIN(xn, nn), FLINT_MIN(xn, nn));
    mpn_copyi(yyy + nn - FLINT_MIN(yn, nn), yptr + yn - FLINT_MIN(yn, nn), FLINT_MIN(yn, nn));

    /* Zero-pad the low limbs when an operand is shorter than nn. */
    for (k = 0; k < nn - xn; k++)
        xxx[k] = 0;
    for (k = 0; k < nn - yn; k++)
        yyy[k] = 0;

    /* Use the squaring kernel when both operands are the same number. */
    if (xptr == yptr && xn == yn)
        mpfr_sqrhigh_n(res, xxx, nn);
    else
        mpfr_mulhigh_n(res, xxx, yyy, nn);

    ARF_MUL_TMP_FREE(tmp, 2 * nn);
}
|
||||
|
||||
/* Adds (or subtracts, when negative is set) the product
   {xptr, xn} * {yptr, yn}, right-shifted by "shift" bits, into the
   two's complement fixed-point accumulator {sum, sn} (which has one
   extra dummy limb above it for carry/borrow).
   Any bits discarded below the bottom of the accumulator are accounted
   for by incrementing serr[0]; each increment represents at most one
   ulp of the accumulator.
   tmp must provide 2 * (sn + 2) + 1 limbs of scratch space. */
void
_arb_dot_addmul_generic(mp_ptr sum, mp_ptr serr, mp_ptr tmp, mp_size_t sn,
    mp_srcptr xptr, mp_size_t xn, mp_srcptr yptr, mp_size_t yn,
    int negative, mp_bitcnt_t shift)
{
    slong shift_bits, shift_limbs, term_prec;
    mp_limb_t cy;
    mp_ptr sstart, tstart;
    mp_size_t tn, nn;

    shift_bits = shift % FLINT_BITS;
    shift_limbs = shift / FLINT_BITS;

    /*
         shift           term_prec     (discarded bits)
    |--------------------|--------------|
                         | t[tn-1] | ... | t[0] |      Term
    [ s[n-1] | s[n-2] | ... | s[0] ]                   Sum
    */

    /* Upper bound for the term bit length. The actual term bit length
       could be smaller if the product does not extend all
       the way down to s. */
    term_prec = sn * FLINT_BITS - shift;

    /* The mulhigh error relative to the top of the sum will
       be bounded by nn * 2^(-shift) * 2^(-FLINT_BITS*nn).
       One extra limb ensures that this is <1 sum-ulp. */
    term_prec += FLINT_BITS;
    nn = (term_prec + FLINT_BITS - 1) / FLINT_BITS;

    /* Sanity check; must conform to the pre-allocated memory! */
    if (nn > sn + 2)
    {
        flint_printf("nn > sn + 2\n");
        flint_abort();
    }

    /* Use mulhigh? Only worthwhile in mpfr's tuned size range, and only
       when both operands are nearly full length so that truncated
       multiplication actually saves work. */
    if (term_prec >= MUL_MPFR_MIN_LIMBS * FLINT_BITS &&
        term_prec <= MUL_MPFR_MAX_LIMBS * FLINT_BITS &&
        xn * FLINT_BITS > 0.9 * term_prec &&
        yn * FLINT_BITS > 0.9 * term_prec)
    {
        mulhigh(tmp + 1, xptr, xn, yptr, yn, nn);
        tn = 2 * nn;
        /* Account for the mulhigh approximation error. */
        serr[0]++;
    }
    else
    {
        /* Truncate operands that are longer than needed for the target
           precision, charging one ulp for the discarded low limbs. */
        if (xn > nn || yn > nn)
        {
            if (xn > nn)
            {
                xptr += (xn - nn);
                xn = nn;
            }

            if (yn > nn)
            {
                yptr += (yn - nn);
                yn = nn;
            }

            serr[0]++;
        }

        tn = xn + yn;
        ARF_MPN_MUL(tmp + 1, xptr, xn, yptr, yn);
    }

    /* Align the term to the bit position of the sum. */
    if (shift_bits == 0)
    {
        tstart = tmp + 1;
    }
    else
    {
        tmp[0] = mpn_rshift(tmp + 1, tmp + 1, tn, shift_bits);
        tstart = tmp;
        tn++;
    }

    /* Strip trailing zero limbs so the mpn addition below is minimal. */
    while (tstart[0] == 0)
    {
        tstart++;
        tn--;
    }

    if (shift_limbs + tn <= sn)
    {
        /* No truncation of the term. */
        sstart = sum + sn - shift_limbs - tn;
        nn = tn;
    }
    else
    {
        /* The term is truncated. */
        sstart = sum;
        tstart = tstart - (sn - shift_limbs - tn);
        nn = sn - shift_limbs;
        serr[0]++;
    }

    /* Likely case. Note: assumes 1 extra (dummy) limb in sum. */
    if (shift_limbs <= 1)
    {
        if (negative)
            sstart[nn] -= mpn_sub_n(sstart, sstart, tstart, nn);
        else
            sstart[nn] += mpn_add_n(sstart, sstart, tstart, nn);
    }
    else
    {
        /* Propagate the carry/borrow through the shift_limbs limbs
           above the added term (the dummy limb absorbs the final one). */
        if (negative)
        {
            cy = mpn_sub_n(sstart, sstart, tstart, nn);
            mpn_sub_1(sstart + nn, sstart + nn, shift_limbs, cy);
        }
        else
        {
            cy = mpn_add_n(sstart, sstart, tstart, nn);
            mpn_add_1(sstart + nn, sstart + nn, shift_limbs, cy);
        }
    }
}
|
||||
|
||||
/* Adds (or subtracts, when negative is set) the single mantissa
   {xptr, xn}, right-shifted by "shift" bits, into the two's complement
   fixed-point accumulator {sum, sn} (which has one extra dummy limb
   above it). Used for the "initial" term of a dot product.
   Discarded low bits are charged to serr[0], one ulp per increment.
   tmp must provide at least sn + 2 limbs of scratch space. */
void
_arb_dot_add_generic(mp_ptr sum, mp_ptr serr, mp_ptr tmp, mp_size_t sn,
    mp_srcptr xptr, mp_size_t xn,
    int negative, mp_bitcnt_t shift)
{
    slong shift_bits, shift_limbs, term_prec;
    mp_limb_t cy, err;
    mp_ptr sstart, tstart;
    mp_size_t tn, nn;

    shift_bits = shift % FLINT_BITS;
    shift_limbs = shift / FLINT_BITS;

    /* Number of limbs of the operand that can affect the accumulator
       (one guard limb included); see _arb_dot_addmul_generic. */
    term_prec = sn * FLINT_BITS - shift;
    term_prec += FLINT_BITS;
    nn = (term_prec + FLINT_BITS - 1) / FLINT_BITS;

    err = 0;

    /* Truncate an over-long operand, charging one ulp. */
    if (xn > nn)
    {
        xptr += (xn - nn);
        xn = nn;
        err = 1;
    }

    tn = xn;

    /* Align the term to the bit position of the sum. */
    if (shift_bits == 0)
    {
        mpn_copyi(tmp, xptr, tn);
        tstart = tmp;
    }
    else
    {
        tmp[0] = mpn_rshift(tmp + 1, xptr, tn, shift_bits);
        tstart = tmp;
        tn++;
    }

    /* Strip trailing zero limbs. */
    while (tstart[0] == 0)
    {
        tstart++;
        tn--;
    }

    if (shift_limbs + tn <= sn)
    {
        /* No truncation of the term. */
        sstart = sum + sn - shift_limbs - tn;
        nn = tn;
    }
    else
    {
        /* The term is truncated. */
        sstart = sum;
        tstart = tstart - (sn - shift_limbs - tn);
        nn = sn - shift_limbs;
        err = 1;
    }

    serr[0] += err;

    /* Likely case. Note: assumes 1 extra (dummy) limb in sum. */
    if (shift_limbs <= 1)
    {
        if (negative)
            sstart[nn] -= mpn_sub_n(sstart, sstart, tstart, nn);
        else
            sstart[nn] += mpn_add_n(sstart, sstart, tstart, nn);
    }
    else
    {
        /* Propagate carry/borrow through the limbs above the term. */
        if (negative)
        {
            cy = mpn_sub_n(sstart, sstart, tstart, nn);
            mpn_sub_1(sstart + nn, sstart + nn, shift_limbs, cy);
        }
        else
        {
            cy = mpn_add_n(sstart, sstart, tstart, nn);
            mpn_add_1(sstart + nn, sstart + nn, shift_limbs, cy);
        }
    }
}
|
||||
|
||||
void
|
||||
arb_dot(arb_t res, const arb_t initial, int subtract, arb_srcptr x, slong xstep, arb_srcptr y, slong ystep, slong len, slong prec)
|
||||
{
|
||||
slong i, j, nonzero, padding, extend;
|
||||
slong xexp, yexp, exp, max_exp, min_exp, sum_exp;
|
||||
slong xrexp, yrexp, srad_exp, max_rad_exp;
|
||||
int xnegative, ynegative, inexact;
|
||||
mp_size_t xn, yn, sn, alloc;
|
||||
mp_bitcnt_t shift;
|
||||
arb_srcptr xi, yi;
|
||||
arf_srcptr xm, ym;
|
||||
mag_srcptr xr, yr;
|
||||
mp_limb_t xtop, ytop;
|
||||
mp_limb_t xrad, yrad;
|
||||
mp_limb_t serr; /* Sum over arithmetic errors */
|
||||
uint64_t srad; /* Sum over propagated errors */
|
||||
mp_ptr tmp, sum; /* Workspace */
|
||||
ARF_ADD_TMP_DECL;
|
||||
|
||||
if (len <= 1)
|
||||
{
|
||||
if (initial == NULL)
|
||||
{
|
||||
if (len <= 0)
|
||||
arb_zero(res);
|
||||
else
|
||||
{
|
||||
arb_mul(res, x, y, prec);
|
||||
if (subtract)
|
||||
arb_neg(res, res);
|
||||
}
|
||||
return;
|
||||
}
|
||||
else if (len <= 0)
|
||||
{
|
||||
arb_set_round(res, initial, prec);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* Number of nonzero midpoint terms in sum. */
|
||||
nonzero = 0;
|
||||
|
||||
/* Terms are bounded by 2^max_exp (with WORD_MIN = -infty) */
|
||||
max_exp = WORD_MIN;
|
||||
|
||||
/* Propagated error terms are bounded by 2^max_rad_exp */
|
||||
max_rad_exp = WORD_MIN;
|
||||
|
||||
/* Used to reduce the precision. */
|
||||
min_exp = WORD_MAX;
|
||||
|
||||
/* Account for the initial term. */
|
||||
if (initial != NULL)
|
||||
{
|
||||
if (!ARB_IS_LAGOM(initial))
|
||||
{
|
||||
arb_dot_simple(res, initial, subtract, x, xstep, y, ystep, len, prec);
|
||||
return;
|
||||
}
|
||||
|
||||
xm = arb_midref(initial);
|
||||
xr = arb_radref(initial);
|
||||
|
||||
if (!arf_is_special(xm))
|
||||
{
|
||||
max_exp = ARF_EXP(xm);
|
||||
nonzero++;
|
||||
|
||||
if (prec > 2 * FLINT_BITS)
|
||||
min_exp = ARF_EXP(xm) - ARF_SIZE(xm) * FLINT_BITS;
|
||||
}
|
||||
|
||||
if (!mag_is_special(xr))
|
||||
max_rad_exp = MAG_EXP(xr);
|
||||
}
|
||||
|
||||
/* Determine maximum exponents for the main sum and the radius sum. */
|
||||
for (i = 0; i < len; i++)
|
||||
{
|
||||
xi = x + i * xstep;
|
||||
yi = y + i * ystep;
|
||||
|
||||
/* Fallback for huge exponents or non-finite values. */
|
||||
if (!ARB_IS_LAGOM(xi) || !ARB_IS_LAGOM(yi))
|
||||
{
|
||||
arb_dot_simple(res, initial, subtract, x, xstep, y, ystep, len, prec);
|
||||
return;
|
||||
}
|
||||
|
||||
xm = arb_midref(xi);
|
||||
ym = arb_midref(yi);
|
||||
xr = arb_radref(xi);
|
||||
yr = arb_radref(yi);
|
||||
|
||||
/* (xm+xr)(ym+yr) = xm ym + [xr ym + xm yr + xr yr] */
|
||||
if (!arf_is_special(xm))
|
||||
{
|
||||
xexp = ARF_EXP(xm);
|
||||
|
||||
if (!arf_is_special(ym))
|
||||
{
|
||||
yexp = ARF_EXP(ym);
|
||||
|
||||
max_exp = FLINT_MAX(max_exp, xexp + yexp);
|
||||
nonzero++;
|
||||
|
||||
if (prec > 2 * FLINT_BITS)
|
||||
{
|
||||
slong bot;
|
||||
bot = (xexp + yexp) - (ARF_SIZE(xm) + ARF_SIZE(ym)) * FLINT_BITS;
|
||||
min_exp = FLINT_MIN(min_exp, bot);
|
||||
}
|
||||
|
||||
if (!mag_is_special(xr))
|
||||
{
|
||||
xrexp = MAG_EXP(xr);
|
||||
max_rad_exp = FLINT_MAX(max_rad_exp, yexp + xrexp);
|
||||
|
||||
if (!mag_is_special(yr))
|
||||
{
|
||||
yrexp = MAG_EXP(yr);
|
||||
max_rad_exp = FLINT_MAX(max_rad_exp, xexp + yrexp);
|
||||
max_rad_exp = FLINT_MAX(max_rad_exp, xrexp + yrexp);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (!mag_is_special(yr))
|
||||
{
|
||||
yrexp = MAG_EXP(yr);
|
||||
max_rad_exp = FLINT_MAX(max_rad_exp, xexp + yrexp);
|
||||
}
|
||||
}
|
||||
}
|
||||
else /* if y = 0, something can happen only if yr != 0 */
|
||||
{
|
||||
if (!mag_is_special(yr))
|
||||
{
|
||||
yrexp = MAG_EXP(yr);
|
||||
max_rad_exp = FLINT_MAX(max_rad_exp, xexp + yrexp);
|
||||
|
||||
if (!mag_is_special(xr))
|
||||
{
|
||||
xrexp = MAG_EXP(xr);
|
||||
max_rad_exp = FLINT_MAX(max_rad_exp, xrexp + yrexp);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else /* if x = 0, something can happen only if xr != 0 */
|
||||
{
|
||||
if (!mag_is_special(xr))
|
||||
{
|
||||
xrexp = MAG_EXP(xr);
|
||||
|
||||
if (!arf_is_special(ym))
|
||||
{
|
||||
yexp = ARF_EXP(ym);
|
||||
max_rad_exp = FLINT_MAX(max_rad_exp, xrexp + yexp);
|
||||
}
|
||||
|
||||
if (!mag_is_special(yr))
|
||||
{
|
||||
yrexp = MAG_EXP(yr);
|
||||
max_rad_exp = FLINT_MAX(max_rad_exp, xrexp + yrexp);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* The midpoint sum is zero. */
|
||||
if (max_exp == WORD_MIN)
|
||||
{
|
||||
/* The sum is exactly zero. */
|
||||
if (max_rad_exp == WORD_MIN)
|
||||
{
|
||||
arb_zero(res);
|
||||
return;
|
||||
}
|
||||
|
||||
prec = 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Reduce precision based on errors. */
|
||||
if (max_rad_exp != WORD_MIN)
|
||||
prec = FLINT_MIN(prec, max_exp - max_rad_exp + MAG_BITS);
|
||||
|
||||
/* Reduce precision based on actual sizes. */
|
||||
if (min_exp != WORD_MAX)
|
||||
prec = FLINT_MIN(prec, max_exp - min_exp + MAG_BITS);
|
||||
|
||||
prec = FLINT_MAX(prec, 2);
|
||||
}
|
||||
|
||||
/* Extend sum so that we can use two's complement addition. */
|
||||
extend = FLINT_BIT_COUNT(nonzero) + 1;
|
||||
|
||||
/* Extra bits to improve accuracy (optional). */
|
||||
padding = 4 + FLINT_BIT_COUNT(len);
|
||||
|
||||
/* Number of limbs. */
|
||||
sn = (prec + extend + padding + FLINT_BITS - 1) / FLINT_BITS;
|
||||
|
||||
/* Avoid having to make a special case for sn = 1. */
|
||||
sn = FLINT_MAX(sn, 2);
|
||||
|
||||
/* Exponent for the main sum. */
|
||||
sum_exp = max_exp + extend;
|
||||
|
||||
/* We need sn + 1 limb for the sum (sn limbs + 1 dummy limb
|
||||
for carry or borrow that avoids an extra branch). We need
|
||||
2 * (sn + 2) limbs to store the product of two numbers
|
||||
with up to (sn + 2) limbs, plus 1 extra limb for shifting
|
||||
the product. */
|
||||
alloc = (sn + 1) + 2 * (sn + 2) + 1;
|
||||
ARF_ADD_TMP_ALLOC(sum, alloc)
|
||||
tmp = sum + (sn + 1);
|
||||
|
||||
/* Sum of propagated errors. */
|
||||
srad_exp = max_rad_exp;
|
||||
srad = 0;
|
||||
|
||||
/* Set sum to 0 */
|
||||
serr = 0;
|
||||
for (j = 0; j < sn + 1; j++)
|
||||
sum[j] = 0;
|
||||
|
||||
if (initial != NULL)
|
||||
{
|
||||
xm = arb_midref(initial);
|
||||
xr = arb_radref(initial);
|
||||
|
||||
if (!arf_is_special(xm))
|
||||
{
|
||||
mp_srcptr xptr;
|
||||
|
||||
xexp = ARF_EXP(xm);
|
||||
xn = ARF_SIZE(xm);
|
||||
xnegative = ARF_SGNBIT(xm);
|
||||
|
||||
shift = sum_exp - xexp;
|
||||
|
||||
if (shift >= sn * FLINT_BITS)
|
||||
{
|
||||
serr++;
|
||||
}
|
||||
else
|
||||
{
|
||||
xptr = (xn <= ARF_NOPTR_LIMBS) ? ARF_NOPTR_D(xm) : ARF_PTR_D(xm);
|
||||
_arb_dot_add_generic(sum, &serr, tmp, sn, xptr, xn, xnegative ^ subtract, shift);
|
||||
}
|
||||
}
|
||||
|
||||
if (!mag_is_special(xr))
|
||||
{
|
||||
xrad = MAG_MAN(xr);
|
||||
xrexp = MAG_EXP(xr);
|
||||
|
||||
shift = srad_exp - xrexp;
|
||||
if (shift < 64)
|
||||
srad += (xrad >> shift) + 1;
|
||||
else
|
||||
srad++;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < len; i++)
|
||||
{
|
||||
xi = x + i * xstep;
|
||||
yi = y + i * ystep;
|
||||
xm = arb_midref(xi);
|
||||
ym = arb_midref(yi);
|
||||
xr = arb_radref(xi);
|
||||
yr = arb_radref(yi);
|
||||
|
||||
/* The midpoints of x[i] and y[i] are both nonzero. */
|
||||
if (!arf_is_special(xm) && !arf_is_special(ym))
|
||||
{
|
||||
xexp = ARF_EXP(xm);
|
||||
xn = ARF_SIZE(xm);
|
||||
xnegative = ARF_SGNBIT(xm);
|
||||
|
||||
yexp = ARF_EXP(ym);
|
||||
yn = ARF_SIZE(ym);
|
||||
ynegative = ARF_SGNBIT(ym);
|
||||
|
||||
exp = xexp + yexp;
|
||||
shift = sum_exp - exp;
|
||||
|
||||
if (shift >= sn * FLINT_BITS)
|
||||
{
|
||||
/* We may yet need the top limbs for bounds. */
|
||||
ARF_GET_TOP_LIMB(xtop, xm);
|
||||
ARF_GET_TOP_LIMB(ytop, ym);
|
||||
serr++;
|
||||
}
|
||||
#if 0
|
||||
else if (xn == 1 && yn == 1 && sn == 2 && shift < FLINT_BITS) /* Fastest path. */
|
||||
{
|
||||
mp_limb_t hi, lo, out;
|
||||
|
||||
xtop = ARF_NOPTR_D(xm)[0];
|
||||
ytop = ARF_NOPTR_D(ym)[0];
|
||||
|
||||
umul_ppmm(hi, lo, xtop, ytop);
|
||||
|
||||
out = lo << (FLINT_BITS - shift);
|
||||
lo = (lo >> shift) | (hi << (FLINT_BITS - shift));
|
||||
hi = (hi >> shift);
|
||||
serr += (out != 0);
|
||||
|
||||
if (xnegative ^ ynegative)
|
||||
sub_ddmmss(sum[1], sum[0], sum[1], sum[0], hi, lo);
|
||||
else
|
||||
add_ssaaaa(sum[1], sum[0], sum[1], sum[0], hi, lo);
|
||||
}
|
||||
else if (xn == 2 && yn == 2 && shift < FLINT_BITS && sn <= 3)
|
||||
{
|
||||
mp_limb_t x1, x0, y1, y0;
|
||||
mp_limb_t u3, u2, u1, u0;
|
||||
|
||||
x0 = ARF_NOPTR_D(xm)[0];
|
||||
x1 = ARF_NOPTR_D(xm)[1];
|
||||
y0 = ARF_NOPTR_D(ym)[0];
|
||||
y1 = ARF_NOPTR_D(ym)[1];
|
||||
|
||||
xtop = x1;
|
||||
ytop = y1;
|
||||
|
||||
nn_mul_2x2(u3, u2, u1, u0, x1, x0, y1, y0);
|
||||
|
||||
u0 = (u0 != 0) || ((u1 << (FLINT_BITS - shift)) != 0);
|
||||
u1 = (u1 >> shift) | (u2 << (FLINT_BITS - shift));
|
||||
u2 = (u2 >> shift) | (u3 << (FLINT_BITS - shift));
|
||||
u3 = (u3 >> shift);
|
||||
|
||||
if (sn == 2)
|
||||
{
|
||||
serr += (u0 || (u1 != 0));
|
||||
if (xnegative ^ ynegative)
|
||||
sub_ddmmss(sum[1], sum[0], sum[1], sum[0], u3, u2);
|
||||
else
|
||||
add_ssaaaa(sum[1], sum[0], sum[1], sum[0], u3, u2);
|
||||
}
|
||||
else
|
||||
{
|
||||
serr += u0;
|
||||
if (xnegative ^ ynegative)
|
||||
sub_dddmmmsss(sum[2], sum[1], sum[0], sum[2], sum[1], sum[0], u3, u2, u1);
|
||||
else
|
||||
add_sssaaaaaa(sum[2], sum[1], sum[0], sum[2], sum[1], sum[0], u3, u2, u1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
else if (xn <= 2 && yn <= 2 && sn <= 3)
|
||||
{
|
||||
mp_limb_t x1, x0, y1, y0;
|
||||
mp_limb_t u3, u2, u1, u0;
|
||||
|
||||
if (xn == 1 && yn == 1)
|
||||
{
|
||||
xtop = ARF_NOPTR_D(xm)[0];
|
||||
ytop = ARF_NOPTR_D(ym)[0];
|
||||
umul_ppmm(u3, u2, xtop, ytop);
|
||||
u1 = u0 = 0;
|
||||
}
|
||||
else if (xn == 2 && yn == 2)
|
||||
{
|
||||
x0 = ARF_NOPTR_D(xm)[0];
|
||||
x1 = ARF_NOPTR_D(xm)[1];
|
||||
y0 = ARF_NOPTR_D(ym)[0];
|
||||
y1 = ARF_NOPTR_D(ym)[1];
|
||||
xtop = x1;
|
||||
ytop = y1;
|
||||
nn_mul_2x2(u3, u2, u1, u0, x1, x0, y1, y0);
|
||||
}
|
||||
else if (xn == 1)
|
||||
{
|
||||
x0 = ARF_NOPTR_D(xm)[0];
|
||||
y0 = ARF_NOPTR_D(ym)[0];
|
||||
y1 = ARF_NOPTR_D(ym)[1];
|
||||
xtop = x0;
|
||||
ytop = y1;
|
||||
nn_mul_2x1(u3, u2, u1, y1, y0, x0);
|
||||
u0 = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ARF_NOPTR_D(xm)[0];
|
||||
x1 = ARF_NOPTR_D(xm)[1];
|
||||
y0 = ARF_NOPTR_D(ym)[0];
|
||||
xtop = x1;
|
||||
ytop = y0;
|
||||
nn_mul_2x1(u3, u2, u1, x1, x0, y0);
|
||||
u0 = 0;
|
||||
}
|
||||
|
||||
if (sn == 2)
|
||||
{
|
||||
if (shift < FLINT_BITS)
|
||||
{
|
||||
serr += ((u2 << (FLINT_BITS - shift)) != 0) || (u1 != 0) || (u0 != 0);
|
||||
u2 = (u2 >> shift) | (u3 << (FLINT_BITS - shift));
|
||||
u3 = (u3 >> shift);
|
||||
}
|
||||
else if (shift == FLINT_BITS)
|
||||
{
|
||||
serr += (u2 != 0) || (u1 != 0) || (u0 != 0);
|
||||
u2 = u3;
|
||||
u3 = 0;
|
||||
}
|
||||
else /* FLINT_BITS < shift < 2 * FLINT_BITS */
|
||||
{
|
||||
serr += ((u3 << (2 * FLINT_BITS - shift)) != 0) || (u2 != 0) || (u1 != 0) || (u0 != 0);
|
||||
u2 = (u3 >> (shift - FLINT_BITS));
|
||||
u3 = 0;
|
||||
}
|
||||
|
||||
if (xnegative ^ ynegative)
|
||||
sub_ddmmss(sum[1], sum[0], sum[1], sum[0], u3, u2);
|
||||
else
|
||||
add_ssaaaa(sum[1], sum[0], sum[1], sum[0], u3, u2);
|
||||
}
|
||||
else if (sn == 3)
|
||||
{
|
||||
if (shift < FLINT_BITS)
|
||||
{
|
||||
serr += ((u1 << (FLINT_BITS - shift)) != 0) || (u0 != 0);
|
||||
u1 = (u1 >> shift) | (u2 << (FLINT_BITS - shift));
|
||||
u2 = (u2 >> shift) | (u3 << (FLINT_BITS - shift));
|
||||
u3 = (u3 >> shift);
|
||||
}
|
||||
else if (shift == FLINT_BITS)
|
||||
{
|
||||
serr += (u1 != 0) || (u0 != 0);
|
||||
u1 = u2;
|
||||
u2 = u3;
|
||||
u3 = 0;
|
||||
}
|
||||
else if (shift < 2 * FLINT_BITS)
|
||||
{
|
||||
serr += ((u2 << (2 * FLINT_BITS - shift)) != 0) || (u1 != 0) || (u0 != 0);
|
||||
u1 = (u3 << (2 * FLINT_BITS - shift)) | (u2 >> (shift - FLINT_BITS));
|
||||
u2 = (u3 >> (shift - FLINT_BITS));
|
||||
u3 = 0;
|
||||
}
|
||||
else if (shift == 2 * FLINT_BITS)
|
||||
{
|
||||
serr += (u2 != 0) || (u1 != 0) || (u0 != 0);
|
||||
u1 = u3;
|
||||
u2 = 0;
|
||||
u3 = 0;
|
||||
}
|
||||
else /* 2 * FLINT_BITS < shift < 3 * FLINT_BITS */
|
||||
{
|
||||
serr += ((u3 << (3 * FLINT_BITS - shift)) != 0) || (u2 != 0) || (u1 != 0) || (u0 != 0);
|
||||
u1 = (u3 >> (shift - 2 * FLINT_BITS));
|
||||
u2 = 0;
|
||||
u3 = 0;
|
||||
}
|
||||
|
||||
if (xnegative ^ ynegative)
|
||||
sub_dddmmmsss(sum[2], sum[1], sum[0], sum[2], sum[1], sum[0], u3, u2, u1);
|
||||
else
|
||||
add_sssaaaaaa(sum[2], sum[1], sum[0], sum[2], sum[1], sum[0], u3, u2, u1);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
mp_srcptr xptr, yptr;
|
||||
|
||||
xptr = (xn <= ARF_NOPTR_LIMBS) ? ARF_NOPTR_D(xm) : ARF_PTR_D(xm);
|
||||
yptr = (yn <= ARF_NOPTR_LIMBS) ? ARF_NOPTR_D(ym) : ARF_PTR_D(ym);
|
||||
|
||||
xtop = xptr[xn - 1];
|
||||
ytop = yptr[yn - 1];
|
||||
|
||||
_arb_dot_addmul_generic(sum, &serr, tmp, sn, xptr, xn, yptr, yn, xnegative ^ ynegative, shift);
|
||||
}
|
||||
|
||||
xrad = MAG_MAN(xr);
|
||||
yrad = MAG_MAN(yr);
|
||||
|
||||
if (xrad != 0 && yrad != 0)
|
||||
{
|
||||
xrexp = MAG_EXP(xr);
|
||||
yrexp = MAG_EXP(yr);
|
||||
|
||||
RAD_ADDMUL(srad, srad_exp, (xtop >> (FLINT_BITS - MAG_BITS)) + 1, yrad, xexp + yrexp);
|
||||
RAD_ADDMUL(srad, srad_exp, (ytop >> (FLINT_BITS - MAG_BITS)) + 1, xrad, yexp + xrexp);
|
||||
RAD_ADDMUL(srad, srad_exp, xrad, yrad, xrexp + yrexp);
|
||||
}
|
||||
else if (xrad != 0)
|
||||
{
|
||||
xrexp = MAG_EXP(xr);
|
||||
RAD_ADDMUL(srad, srad_exp, (ytop >> (FLINT_BITS - MAG_BITS)) + 1, xrad, yexp + xrexp);
|
||||
}
|
||||
else if (yrad != 0)
|
||||
{
|
||||
yrexp = MAG_EXP(yr);
|
||||
RAD_ADDMUL(srad, srad_exp, (xtop >> (FLINT_BITS - MAG_BITS)) + 1, yrad, xexp + yrexp);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
xrad = MAG_MAN(xr);
|
||||
yrad = MAG_MAN(yr);
|
||||
|
||||
xexp = ARF_EXP(xm);
|
||||
yexp = ARF_EXP(ym);
|
||||
|
||||
xrexp = MAG_EXP(xr);
|
||||
yrexp = MAG_EXP(yr);
|
||||
|
||||
/* (xm+xr)(ym+yr) = xm ym + [xm yr + ym xr + xr yr] */
|
||||
if (yrad && !arf_is_special(xm))
|
||||
{
|
||||
ARF_GET_TOP_LIMB(xtop, xm);
|
||||
RAD_ADDMUL(srad, srad_exp, (xtop >> (FLINT_BITS - MAG_BITS)) + 1, yrad, xexp + yrexp);
|
||||
}
|
||||
|
||||
if (xrad && !arf_is_special(ym))
|
||||
{
|
||||
ARF_GET_TOP_LIMB(ytop, ym);
|
||||
RAD_ADDMUL(srad, srad_exp, (ytop >> (FLINT_BITS - MAG_BITS)) + 1, xrad, yexp + xrexp);
|
||||
}
|
||||
|
||||
if (xrad && yrad)
|
||||
{
|
||||
RAD_ADDMUL(srad, srad_exp, xrad, yrad, xrexp + yrexp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
xnegative = 0;
|
||||
if (sum[sn - 1] >= LIMB_TOP)
|
||||
{
|
||||
mpn_neg(sum, sum, sn);
|
||||
xnegative = 1;
|
||||
}
|
||||
|
||||
if (sum[sn - 1] == 0)
|
||||
{
|
||||
slong sum_exp2;
|
||||
mp_size_t sn2;
|
||||
|
||||
sn2 = sn;
|
||||
sum_exp2 = sum_exp;
|
||||
|
||||
while (sn2 > 0 && sum[sn2 - 1] == 0)
|
||||
{
|
||||
sum_exp2 -= FLINT_BITS;
|
||||
sn2--;
|
||||
}
|
||||
|
||||
if (sn2 == 0)
|
||||
{
|
||||
arf_zero(arb_midref(res));
|
||||
inexact = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
inexact = _arf_set_round_mpn(arb_midref(res), &exp, sum, sn2, xnegative ^ subtract, prec, ARF_RND_DOWN);
|
||||
_fmpz_set_si_small(ARF_EXPREF(arb_midref(res)), exp + sum_exp2);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
if (sn == 2)
|
||||
inexact = _arf_set_round_uiui(arb_midref(res), &exp, sum[1], sum[0], xnegative ^ subtract, prec, ARF_RND_DOWN);
|
||||
else
|
||||
inexact = _arf_set_round_mpn(arb_midref(res), &exp, sum, sn, xnegative ^ subtract, prec, ARF_RND_DOWN);
|
||||
|
||||
_fmpz_set_si_small(ARF_EXPREF(arb_midref(res)), exp + sum_exp);
|
||||
}
|
||||
|
||||
/*
|
||||
Add the three sources of error.
|
||||
|
||||
Final rounding error = inexact * 2^(exp + sum_exp - prec)
|
||||
0 <= inexact <= 1
|
||||
|
||||
Arithmetic error = serr * 2^(sum_exp - sn * FLINT_BITS)
|
||||
0 <= serr <= 3 * n
|
||||
|
||||
Propagated error = srad * 2^(srad_exp - MAG_BITS)
|
||||
0 <= srad <= 3 * n * MAG_BITS
|
||||
|
||||
We shift the first two by MAG_BITS so that the magnitudes
|
||||
become similar and a reasonably accurate addition can be done
|
||||
by comparing exponents without normalizing first.
|
||||
*/
|
||||
|
||||
add_errors(arb_radref(res),
|
||||
inexact << MAG_BITS,
|
||||
exp + sum_exp - prec,
|
||||
((uint64_t) serr) << MAG_BITS,
|
||||
sum_exp - sn * FLINT_BITS,
|
||||
srad,
|
||||
srad_exp);
|
||||
|
||||
ARF_ADD_TMP_FREE(sum, alloc);
|
||||
}
|
79
arb/dot_precise.c
Normal file
79
arb/dot_precise.c
Normal file
|
@ -0,0 +1,79 @@
|
|||
/*
|
||||
Copyright (C) 2018 Fredrik Johansson
|
||||
|
||||
This file is part of Arb.
|
||||
|
||||
Arb is free software: you can redistribute it and/or modify it under
|
||||
the terms of the GNU Lesser General Public License (LGPL) as published
|
||||
by the Free Software Foundation; either version 2.1 of the License, or
|
||||
(at your option) any later version. See <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "arb.h"
|
||||
|
||||
/* Reference implementation of the dot product using exact summation:
   res = initial + (-1)^subtract * sum(x[i*xstep] * y[i*ystep]).
   The midpoint is an exact sum of the exact midpoint products rounded
   once to prec bits; the radius is an exact (upward-rounded) sum of the
   propagated error terms |xm| yr + |ym| xr + xr yr plus the final
   rounding error. Slow (exact arf arithmetic); intended for testing. */
void
arb_dot_precise(arb_t res, const arb_t initial, int subtract,
    arb_srcptr x, slong xstep, arb_srcptr y, slong ystep, slong len, slong prec)
{
    arf_ptr A, B;
    arf_t t, u;
    slong i;
    int inexact;

    if (len <= 0)
    {
        if (initial == NULL)
            arb_zero(res);
        else
            arb_set_round(res, initial, prec);
        return;
    }

    /* A: midpoint terms (plus optional initial term).
       B: radius terms, 3 per product, plus 1 slot for the final
          rounding error (plus optional initial radius).
       calloc zero-initializes, which is a valid arf zero. */
    A = flint_calloc(len + (initial != NULL), sizeof(arf_struct));
    B = flint_calloc(3 * len + 1 + (initial != NULL), sizeof(arf_struct));

    for (i = 0; i < len; i++)
    {
        arb_srcptr xp = x + i * xstep;
        arb_srcptr yp = y + i * ystep;

        /* Exact midpoint product, negated up front when subtracting. */
        arf_mul(A + i, arb_midref(xp), arb_midref(yp), ARF_PREC_EXACT, ARF_RND_DOWN);
        if (subtract)
            arf_neg(A + i, A + i);

        /* Shallow arf views of the radii (no allocation, no clear). */
        arf_init_set_mag_shallow(t, arb_radref(xp));
        arf_init_set_mag_shallow(u, arb_radref(yp));

        /* xr * yr */
        arf_mul(B + 3 * i, t, u, ARF_PREC_EXACT, ARF_RND_DOWN);

        /* xr * |ym| */
        arf_mul(B + 3 * i + 1, t, arb_midref(yp), ARF_PREC_EXACT, ARF_RND_DOWN);
        arf_abs(B + 3 * i + 1, B + 3 * i + 1);

        /* yr * |xm| */
        arf_mul(B + 3 * i + 2, u, arb_midref(xp), ARF_PREC_EXACT, ARF_RND_DOWN);
        arf_abs(B + 3 * i + 2, B + 3 * i + 2);
    }

    if (initial != NULL)
    {
        /* The initial term is never negated (the convention negates
           only the sum of products when subtract is set). */
        arf_set(A + len, arb_midref(initial));
        arf_set_mag(B + 3 * len + 1, arb_radref(initial));
    }

    /* Exact sum of midpoint terms, rounded once to prec bits. */
    inexact = arf_sum(arb_midref(res), A, len + (initial != NULL), prec, ARF_RND_DOWN);
    if (inexact)
        arf_mag_set_ulp(arb_radref(res), arb_midref(res), prec);
    else
        mag_zero(arb_radref(res));
    /* Feed the rounding error into the radius sum. */
    arf_set_mag(B + 3 * len, arb_radref(res));

    /* Sum the radius terms upward; A[0] is reused as scratch output
       (it is cleared below regardless). */
    arf_sum(A, B, 3 * len + 1 + (initial != NULL), 3 * MAG_BITS, ARF_RND_UP);
    arf_get_mag(arb_radref(res), A);

    for (i = 0; i < len + (initial != NULL); i++)
        arf_clear(A + i);
    for (i = 0; i < 3 * len + 1 + (initial != NULL); i++)
        arf_clear(B + i);

    flint_free(A);
    flint_free(B);
}
|
47
arb/dot_simple.c
Normal file
47
arb/dot_simple.c
Normal file
|
@ -0,0 +1,47 @@
|
|||
/*
|
||||
Copyright (C) 2018 Fredrik Johansson
|
||||
|
||||
This file is part of Arb.
|
||||
|
||||
Arb is free software: you can redistribute it and/or modify it under
|
||||
the terms of the GNU Lesser General Public License (LGPL) as published
|
||||
by the Free Software Foundation; either version 2.1 of the License, or
|
||||
(at your option) any later version. See <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "arb.h"
|
||||
|
||||
void
|
||||
arb_dot_simple(arb_t res, const arb_t initial, int subtract,
|
||||
arb_srcptr x, slong xstep, arb_srcptr y, slong ystep, slong len, slong prec)
|
||||
{
|
||||
slong i;
|
||||
|
||||
if (len <= 0)
|
||||
{
|
||||
if (initial == NULL)
|
||||
arb_zero(res);
|
||||
else
|
||||
arb_set_round(res, initial, prec);
|
||||
return;
|
||||
}
|
||||
|
||||
if (initial == NULL)
|
||||
{
|
||||
arb_mul(res, x, y, prec);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (subtract)
|
||||
arb_neg(res, initial);
|
||||
else
|
||||
arb_set(res, initial);
|
||||
arb_addmul(res, x, y, prec);
|
||||
}
|
||||
|
||||
for (i = 1; i < len; i++)
|
||||
arb_addmul(res, x + i * xstep, y + i * ystep, prec);
|
||||
|
||||
if (subtract)
|
||||
arb_neg(res, res);
|
||||
}
|
196
arb/test/t-dot.c
Normal file
196
arb/test/t-dot.c
Normal file
|
@ -0,0 +1,196 @@
|
|||
/*
|
||||
Copyright (C) 2018 Fredrik Johansson
|
||||
|
||||
This file is part of Arb.
|
||||
|
||||
Arb is free software: you can redistribute it and/or modify it under
|
||||
the terms of the GNU Lesser General Public License (LGPL) as published
|
||||
by the Free Software Foundation; either version 2.1 of the License, or
|
||||
(at your option) any later version. See <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "arb.h"
|
||||
|
||||
int main()
|
||||
{
|
||||
slong iter;
|
||||
flint_rand_t state;
|
||||
|
||||
flint_printf("dot....");
|
||||
fflush(stdout);
|
||||
|
||||
flint_randinit(state);
|
||||
|
||||
for (iter = 0; iter < 1000000 * arb_test_multiplier(); iter++)
|
||||
{
|
||||
arb_ptr x, y;
|
||||
arb_t s1, s2, z;
|
||||
slong i, len, prec, xbits, ybits, ebits;
|
||||
int ok, initial, subtract, revx, revy;
|
||||
|
||||
if (n_randint(state, 100) == 0)
|
||||
len = n_randint(state, 100);
|
||||
else if (n_randint(state, 10) == 0)
|
||||
len = n_randint(state, 10);
|
||||
else
|
||||
len = n_randint(state, 3);
|
||||
|
||||
if (n_randint(state, 100) == 0 || len > 10)
|
||||
{
|
||||
prec = 2 + n_randint(state, 500);
|
||||
xbits = 2 + n_randint(state, 500);
|
||||
ybits = 2 + n_randint(state, 500);
|
||||
}
|
||||
else
|
||||
{
|
||||
prec = 2 + n_randint(state, 4000);
|
||||
xbits = 2 + n_randint(state, 4000);
|
||||
ybits = 2 + n_randint(state, 4000);
|
||||
}
|
||||
|
||||
if (n_randint(state, 100) == 0)
|
||||
ebits = 2 + n_randint(state, 100);
|
||||
else
|
||||
ebits = 2 + n_randint(state, 10);
|
||||
|
||||
initial = n_randint(state, 2);
|
||||
subtract = n_randint(state, 2);
|
||||
revx = n_randint(state, 2);
|
||||
revy = n_randint(state, 2);
|
||||
|
||||
x = _arb_vec_init(len);
|
||||
y = _arb_vec_init(len);
|
||||
arb_init(s1);
|
||||
arb_init(s2);
|
||||
arb_init(z);
|
||||
|
||||
switch (n_randint(state, 3))
|
||||
{
|
||||
case 0:
|
||||
for (i = 0; i < len; i++)
|
||||
{
|
||||
arb_randtest(x + i, state, xbits, ebits);
|
||||
arb_randtest(y + i, state, ybits, ebits);
|
||||
}
|
||||
break;
|
||||
|
||||
/* Test with cancellation */
|
||||
case 1:
|
||||
for (i = 0; i < len; i++)
|
||||
{
|
||||
if (i <= len / 2)
|
||||
{
|
||||
arb_randtest(x + i, state, xbits, ebits);
|
||||
arb_randtest(y + i, state, ybits, ebits);
|
||||
}
|
||||
else
|
||||
{
|
||||
arb_neg(x + i, x + len - i - 1);
|
||||
arb_set(y + i, y + len - i - 1);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
for (i = 0; i < len; i++)
|
||||
{
|
||||
if (i <= len / 2)
|
||||
{
|
||||
arb_randtest(x + i, state, xbits, ebits);
|
||||
arb_randtest(y + i, state, ybits, ebits);
|
||||
}
|
||||
else
|
||||
{
|
||||
arb_neg_round(x + i, x + len - i - 1, 2 + n_randint(state, 500));
|
||||
arb_set_round(y + i, y + len - i - 1, 2 + n_randint(state, 500));
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
arb_randtest(s1, state, 200, 100);
|
||||
arb_randtest(s2, state, 200, 100);
|
||||
arb_randtest(z, state, xbits, ebits);
|
||||
|
||||
arb_dot(s1, initial ? z : NULL, subtract,
|
||||
revx ? (x + len - 1) : x, revx ? -1 : 1,
|
||||
revy ? (y + len - 1) : y, revy ? -1 : 1,
|
||||
len, prec);
|
||||
|
||||
arb_dot_precise(s2, initial ? z : NULL, subtract,
|
||||
revx ? (x + len - 1) : x, revx ? -1 : 1,
|
||||
revy ? (y + len - 1) : y, revy ? -1 : 1,
|
||||
len, ebits <= 12 ? ARF_PREC_EXACT : 2 * prec + 100);
|
||||
|
||||
if (ebits <= 12)
|
||||
ok = arb_contains(s1, s2);
|
||||
else
|
||||
ok = arb_overlaps(s1, s2);
|
||||
|
||||
if (!ok)
|
||||
{
|
||||
flint_printf("FAIL\n\n");
|
||||
flint_printf("iter = %wd, len = %wd, prec = %wd, ebits = %wd\n\n", iter, len, prec, ebits);
|
||||
|
||||
if (initial)
|
||||
{
|
||||
flint_printf("z = ", i); arb_printn(z, 100, ARB_STR_MORE); flint_printf(" (%wd)\n\n", arb_bits(z));
|
||||
}
|
||||
|
||||
for (i = 0; i < len; i++)
|
||||
{
|
||||
flint_printf("x[%wd] = ", i); arb_printn(x + i, 100, ARB_STR_MORE); flint_printf(" (%wd)\n", arb_bits(x + i));
|
||||
flint_printf("y[%wd] = ", i); arb_printn(y + i, 100, ARB_STR_MORE); flint_printf(" (%wd)\n", arb_bits(y + i));
|
||||
}
|
||||
flint_printf("\n\n");
|
||||
flint_printf("s1 = "); arb_printn(s1, 100, ARB_STR_MORE); flint_printf("\n\n");
|
||||
flint_printf("s2 = "); arb_printn(s2, 100, ARB_STR_MORE); flint_printf("\n\n");
|
||||
flint_abort();
|
||||
}
|
||||
|
||||
/* With the fast algorithm, we expect identical results when
|
||||
reversing the vectors. */
|
||||
if (ebits <= 12)
|
||||
{
|
||||
revx ^= 1;
|
||||
revy ^= 1;
|
||||
|
||||
arb_dot(s2, initial ? z : NULL, subtract,
|
||||
revx ? (x + len - 1) : x, revx ? -1 : 1,
|
||||
revy ? (y + len - 1) : y, revy ? -1 : 1,
|
||||
len, prec);
|
||||
|
||||
if (!arb_equal(s1, s2))
|
||||
{
|
||||
flint_printf("FAIL (reversal)\n\n");
|
||||
flint_printf("iter = %wd, len = %wd, prec = %wd, ebits = %wd\n\n", iter, len, prec, ebits);
|
||||
|
||||
if (initial)
|
||||
{
|
||||
flint_printf("z = ", i); arb_printn(z, 100, ARB_STR_MORE); flint_printf(" (%wd)\n\n", arb_bits(z));
|
||||
}
|
||||
|
||||
for (i = 0; i < len; i++)
|
||||
{
|
||||
flint_printf("x[%wd] = ", i); arb_printn(x + i, 100, ARB_STR_MORE); flint_printf(" (%wd)\n", arb_bits(x + i));
|
||||
flint_printf("y[%wd] = ", i); arb_printn(y + i, 100, ARB_STR_MORE); flint_printf(" (%wd)\n", arb_bits(y + i));
|
||||
}
|
||||
flint_printf("\n\n");
|
||||
flint_printf("s1 = "); arb_printn(s1, 100, ARB_STR_MORE); flint_printf("\n\n");
|
||||
flint_printf("s2 = "); arb_printn(s2, 100, ARB_STR_MORE); flint_printf("\n\n");
|
||||
flint_abort();
|
||||
}
|
||||
}
|
||||
|
||||
arb_clear(s1);
|
||||
arb_clear(s2);
|
||||
arb_clear(z);
|
||||
_arb_vec_clear(x, len);
|
||||
_arb_vec_clear(y, len);
|
||||
}
|
||||
|
||||
flint_randclear(state);
|
||||
flint_cleanup();
|
||||
flint_printf("PASS\n");
|
||||
return EXIT_SUCCESS;
|
||||
}
|
|
@@ -782,6 +782,46 @@ Arithmetic

    Sets `z = x / (2^n - 1)`, rounded to *prec* bits.

Sum and dot product
-------------------------------------------------------------------------------

.. function:: void arb_dot_precise(arb_t res, const arb_t s, int subtract, arb_srcptr x, slong xstep, arb_srcptr y, slong ystep, slong len, slong prec)

.. function:: void arb_dot_simple(arb_t res, const arb_t s, int subtract, arb_srcptr x, slong xstep, arb_srcptr y, slong ystep, slong len, slong prec)

.. function:: void arb_dot(arb_t res, const arb_t s, int subtract, arb_srcptr x, slong xstep, arb_srcptr y, slong ystep, slong len, slong prec)

    Computes the dot product of the vectors *x* and *y*, setting
    *res* to `s + (-1)^{subtract} \sum_{i=0}^{len-1} x_i y_i`.

    The initial term *s* is optional and can be
    omitted by passing *NULL* (equivalently, `s = 0`).
    The parameter *subtract* must be 0 or 1.
    The length *len* is allowed to be negative, which is equivalent
    to a length of zero.
    The parameters *xstep* or *ystep* specify a step length for
    traversing subsequences of the vectors *x* and *y*; either can be
    negative to step in the reverse direction starting from
    the initial pointer.
    Aliasing is allowed between *res* and *s* but not between
    *res* and the entries of *x* and *y*.

    The default version determines the optimal precision for each term
    and performs all internal calculations using mpn arithmetic
    with minimal overhead. This is the preferred way to compute a
    dot product; it is generally much faster and more precise
    than a simple loop.

    The *simple* version performs fused multiply-add operations in
    a simple loop. This can be used for
    testing purposes and is also used as a fallback by the
    default version when the exponents are out of range
    for the optimized code.

    The *precise* version computes the dot product exactly up to the
    final rounding. This can be extremely slow and is only intended
    for testing.

Powers and roots
-------------------------------------------------------------------------------
|
Loading…
Add table
Reference in a new issue