use doubles for faster error bounds in fmprb_poly_mullow_block2

2025-03-05 09:21:38 -05:00 · 2014-04-16 22:57:15 +02:00 · 2014-04-16 22:57:15 +02:00 · 462cd887ff
commit 462cd887ff
parent 8c71a3c8dc
1 changed files with 214 additions and 67 deletions
--- a/fmprb_poly/mullow_block2.c
+++ b/fmprb_poly/mullow_block2.c
@ -23,11 +23,24 @@

 ******************************************************************************/

+#include <math.h>
 #include "fmprb_poly.h"

 void _fmprb_poly_get_scale(fmpz_t scale, fmprb_srcptr x, long xlen,
                                         fmprb_srcptr y, long ylen);

+static int
+_fmprb_vec_is_finite(fmprb_srcptr x, long len)
+{
+    long i;
+
+    for (i = 0; i < len; i++)
+        if (!fmprb_is_finite(x + i))
+            return 0;
+
+    return 1;
+}
+
 /*static __inline__ */ void
 fmpr_add_abs_ubound(fmpr_t z, const fmpr_t x, const fmpr_t y, long prec)
 {
@ -60,16 +73,61 @@ fmpr_abs_round(fmpr_t z, const fmpr_t x, long prec, fmpr_rnd_t rnd)
 }

 /* Break fmpr vector into same-exponent blocks where the largest block
-   has a height of at most ALPHA*prec + BETA bits. */
+   has a height of at most ALPHA*prec + BETA bits. These are just
+   tuning parameters. Note that ALPHA * FMPRB_RAD_PREC + BETA
+   should be smaller than DOUBLE_BLOCK_MAX_HEIGHT if we want to use
+   doubles for error bounding. */
 #define ALPHA 3.0
 #define BETA 512

-void
-_fmpr_vec_get_fmpz_2exp_blocks(fmpz * coeffs, fmpz * exps, long * blocks,
-    const fmpz_t scale, fmpr_srcptr x, long len, long step, long prec)
+
+/* Maximum length of block for which we use double multiplication
+   (for longer blocks, we use fmpz_poly multiplication). This is essentially
+   just a tuning parameter, but note that it must be considered when
+   compensating for rounding error below. */
+#define DOUBLE_BLOCK_MAX_LENGTH 1000
+
+/* Computing a dot product of length DOUBLE_BLOCK_MAX_LENGTH involving
+   only nonnegative numbers, and then multiplying by this factor, must give
+   an upper bound for the exact dot product (we can assume that no
+   overflow or underflow occurs). The following is certainly
+   sufficient, but it would be nice to include a formal proof here. */
+#define DOUBLE_ROUNDING_FACTOR (1.0 + 1e-9)
+
+/* Maximum height for which we use double multiplication. Since the dynamic
+   exponent range of doubles is about +/- 1024, this must be less than about
+   1024 (to allow the product of two numbers). This must also
+   account for adding FMPRB_RAD_PREC bits. */
+#define DOUBLE_BLOCK_MAX_HEIGHT 800
+
+/* We divide coefficients by 2^DOUBLE_BLOCK_SHIFT when converting them to
+   doubles, in order to use the whole exponent range. Note that this means
+   numbers of size (2^(-DOUBLE_BLOCK_SHIFT))^2 must not underflow. */
+#define DOUBLE_BLOCK_SHIFT (DOUBLE_BLOCK_MAX_HEIGHT / 2)
+
+
+/* Converts fmpr vector to a vector of blocks, where each block is
+   a vector of fmpz integers on a common exponent.
+   Optionally, we also generate doubles on the same common exponent
+   (minus DOUBLE_BLOCK_SHIFT), where this can be done exactly. */
+
+static __inline__ int           /* returns new can_use_doubles status */
+_fmpr_vec_get_fmpz_2exp_blocks
+(
+    fmpz * coeffs,            /* output fmpz coefficients */
+    double * dblcoeffs,       /* output double coefficients (optional) */
+    fmpz * exps,              /* common exponent of each block */
+    long * blocks,            /* start positions of blocks (plus end marker) */
+    const fmpz_t scale,       /* compose input poly by x -> x/2^scale */
+    fmpr_srcptr x,            /* first in vector of input coefficient */
+    long len,                 /* number of input coefficients */
+    long step,                /* step length to read input coefficients*/
+    long prec,                /* prec */
+    int can_use_doubles
+)
 {
    fmpz_t top, bot, t, b, v, block_top, block_bot;
-    long i, j, s, block;
+    long i, j, s, block, bits, maxheight;
    int in_zero;

    fmpz_init(top);
@ -84,17 +142,26 @@ _fmpr_vec_get_fmpz_2exp_blocks(fmpz * coeffs, fmpz * exps, long * blocks,
    block = 0;
    in_zero = 1;

+    if (prec == FMPR_PREC_EXACT)
+        maxheight = FMPR_PREC_EXACT;
+    else
+        maxheight = ALPHA * prec + BETA;
+
+    can_use_doubles = can_use_doubles && (maxheight <= DOUBLE_BLOCK_MAX_HEIGHT);
+
    for (i = 0; i < len; i++)
    {
-        /* Skip (must be zero, since we assume there are no Infs/NaNs) */
+        /* Skip (must be zero, since we assume there are no Infs/NaNs). */
        if (fmpr_is_special(x + i * step))
            continue;

        /* Bottom and top exponent of current number */
        fmpz_set(bot, fmpr_expref(x + i * step));
-        /* Divide coefficient by 2^(scale * i) */
        fmpz_submul_ui(bot, scale, i);
-        fmpz_add_ui(top, bot, fmpz_bits(fmpr_manref(x + i * step)));
+        bits = fmpz_bits(fmpr_manref(x + i * step));
+        fmpz_add_ui(top, bot, bits);
+
+        can_use_doubles = can_use_doubles && (bits <= FMPRB_RAD_PREC);

        /* Extend current block. */
        if (in_zero)
@ -109,7 +176,7 @@ _fmpr_vec_get_fmpz_2exp_blocks(fmpz * coeffs, fmpz * exps, long * blocks,
            fmpz_sub(v, t, b);

            /* extend current block */
-            if (prec == FMPR_PREC_EXACT || fmpz_cmp_ui(v, ALPHA * prec + BETA) < 0)
+            if (fmpz_cmp_ui(v, maxheight) < 0)
            {
                fmpz_swap(block_top, t);
                fmpz_swap(block_bot, b);
@ -141,9 +208,12 @@ _fmpr_vec_get_fmpz_2exp_blocks(fmpz * coeffs, fmpz * exps, long * blocks,
    {
        for (j = blocks[i]; j < blocks[i + 1]; j++)
        {
-            if (fmpr_is_zero(x + j * step))
+            if (fmpr_is_special(x + j * step))
            {
                fmpz_zero(coeffs + j);
+
+                if (can_use_doubles)
+                    dblcoeffs[j] = 0.0;
            }
            else
            {
@ -155,6 +225,17 @@ _fmpr_vec_get_fmpz_2exp_blocks(fmpz * coeffs, fmpz * exps, long * blocks,
                if (s < 0) abort(); /* Bug catcher */

                fmpz_mul_2exp(coeffs + j, fmpr_manref(x + j * step), s);
+
+                if (can_use_doubles)
+                {
+                    double c = *fmpr_manref(x + j * step);
+                    c = ldexp(c, s - DOUBLE_BLOCK_SHIFT);
+
+                    if (c < 1e-150 || c > 1e150) /* Bug catcher */
+                        abort();
+
+                    dblcoeffs[j] = c;
+                }
            }
        }
    }
@ -166,32 +247,19 @@ _fmpr_vec_get_fmpz_2exp_blocks(fmpz * coeffs, fmpz * exps, long * blocks,
    fmpz_clear(v);
    fmpz_clear(block_top);
    fmpz_clear(block_bot);
+
+    return can_use_doubles;
 }

-static int
-has_infnan(fmprb_srcptr x, long len)
-{
-    long i;
-
-    for (i = 0; i < len; i++)
-    {
-        if (fmpr_is_nan(fmprb_midref(x + i)) || fmpr_is_inf(fmprb_midref(x + i))
-         || fmpr_is_nan(fmprb_radref(x + i)) || fmpr_is_inf(fmprb_radref(x + i)))
-        {
-            return 1;
-        }
-    }
-
-    return 0;
-}
-
-void
+static __inline__ void
 _fmprb_poly_addmullow_rad(fmprb_ptr z, fmpz * zz,
-    const fmpz * xz, const fmpz * xexps, const long * xblocks, long xlen,
-    const fmpz * yz, const fmpz * yexps, const long * yblocks, long ylen,
-    long n)
+    const fmpz * xz, const double * xdbl, const fmpz * xexps,
+    const long * xblocks, long xlen,
+    const fmpz * yz, const double * ydbl, const fmpz * yexps,
+    const long * yblocks, long ylen,
+    long n, int can_use_doubles)
 {
-    long i, j, k, xp, yp, xl, yl, bn;
+    long i, j, k, ii, xp, yp, xl, yl, bn;
    fmpz_t zexp;
    fmpr_t t;

@ -211,20 +279,49 @@ _fmprb_poly_addmullow_rad(fmprb_ptr z, fmpz * zz,
            xl = FLINT_MIN(xl, bn);
            yl = FLINT_MIN(yl, bn);

-            if (xl >= yl)
-                _fmpz_poly_mullow(zz, xz + xp, xl, yz + yp, yl, bn);
-            else
-                _fmpz_poly_mullow(zz, yz + yp, yl, xz + xp, xl, bn);
-
            fmpz_add_inline(zexp, xexps + i, yexps + j);

-            for (k = 0; k < bn; k++)
+            if (can_use_doubles && xl > 1 && yl > 1 &&
+                (xl < DOUBLE_BLOCK_MAX_LENGTH || yl < DOUBLE_BLOCK_MAX_LENGTH))
            {
-                fmpr_set_round_fmpz_2exp(t, zz + k, zexp,
-                    FMPRB_RAD_PREC, FMPR_RND_UP);
-                fmpr_add(fmprb_radref(z + xp + yp + k),
-                    fmprb_radref(z + xp + yp + k), t,
-                    FMPRB_RAD_PREC, FMPR_RND_UP);
+                fmpz_add_ui(zexp, zexp, 2 * DOUBLE_BLOCK_SHIFT);
+
+                for (k = 0; k < bn; k++)
+                {
+                    /* Classical multiplication (may round down!) */
+                    double ss = 0.0;
+
+                    for (ii = FLINT_MAX(0, k - yl + 1);
+                        ii <= FLINT_MIN(xl - 1, k); ii++)
+                    {
+                        ss += xdbl[xp + ii] * ydbl[yp + k - ii];
+                    }
+
+                    /* Compensate for rounding error */
+                    ss *= DOUBLE_ROUNDING_FACTOR;
+
+                    fmpr_set_d(t, ss);
+                    fmpr_mul_2exp_fmpz(t, t, zexp);
+                    fmpr_add(fmprb_radref(z + xp + yp + k),
+                        fmprb_radref(z + xp + yp + k), t,
+                        FMPRB_RAD_PREC, FMPR_RND_UP);
+                }
+            }
+            else
+            {
+                if (xl >= yl)
+                    _fmpz_poly_mullow(zz, xz + xp, xl, yz + yp, yl, bn);
+                else
+                    _fmpz_poly_mullow(zz, yz + yp, yl, xz + xp, xl, bn);
+
+                for (k = 0; k < bn; k++)
+                {
+                    fmpr_set_round_fmpz_2exp(t, zz + k, zexp,
+                        FMPRB_RAD_PREC, FMPR_RND_UP);
+                    fmpr_add(fmprb_radref(z + xp + yp + k),
+                        fmprb_radref(z + xp + yp + k), t,
+                        FMPRB_RAD_PREC, FMPR_RND_UP);
+                }
            }
        }
    }
@ -233,7 +330,7 @@ _fmprb_poly_addmullow_rad(fmprb_ptr z, fmpz * zz,
    fmpr_clear(t);
 }

-void
+static __inline__ void
 _fmprb_poly_addmullow_block(fmprb_ptr z, fmpz * zz,
    const fmpz * xz, const fmpz * xexps, const long * xblocks, long xlen,
    const fmpz * yz, const fmpz * yexps, const long * yblocks, long ylen,
@ -334,7 +431,8 @@ _fmprb_poly_mullow_block2(fmprb_ptr z, fmprb_srcptr x, long xlen,
    }

    /* We don't know how to deal with infinities or NaNs */
-    if (has_infnan(x, xlen) || (!squaring && has_infnan(y, ylen)))
+    if (!_fmprb_vec_is_finite(x, xlen) ||
+        (!squaring && !_fmprb_vec_is_finite(y, ylen)))
    {
        _fmprb_poly_mullow_classical(z, x, xlen, y, ylen, n, prec);
        return;
@ -369,74 +467,123 @@ _fmprb_poly_mullow_block2(fmprb_ptr z, fmprb_srcptr x, long xlen,
                           = (xm*ym) + (xm*yr + xr*(ym + yr))  */
    if (xrlen != 0 || yrlen != 0)
    {
-        fmpr_ptr tmp = _fmpr_vec_init(FLINT_MAX(xlen, ylen));
+        fmpr_ptr tmp;
+        double *xdbl, *ydbl;
+        int can_use_doubles = 1;
+
+        tmp = _fmpr_vec_init(FLINT_MAX(xlen, ylen));
+        xdbl = flint_malloc(sizeof(double) * xlen);
+        ydbl = flint_malloc(sizeof(double) * ylen);

        /* (xm + xr)^2 = (xm*ym) + (xr^2 + 2 xm xr)
                       = (xm*ym) + xr*(2 xm + xr)    */
        if (squaring)
        {
-            _fmpr_vec_get_fmpz_2exp_blocks(xz, xe, xblocks, scale, fmprb_radref(x), xrlen, 2, FMPRB_RAD_PREC);
+            can_use_doubles = _fmpr_vec_get_fmpz_2exp_blocks(xz, xdbl, xe,
+                xblocks, scale, fmprb_radref(x), xrlen, 2,
+                FMPRB_RAD_PREC, can_use_doubles);

            for (i = 0; i < xlen; i++)
            {
-                fmpr_abs_round(tmp + i, fmprb_midref(x + i), FMPRB_RAD_PREC, FMPR_RND_UP);
+                fmpr_abs_round(tmp + i, fmprb_midref(x + i),
+                    FMPRB_RAD_PREC, FMPR_RND_UP);
                fmpr_mul_2exp_si(tmp + i, tmp + i, 1);
-                fmpr_add(tmp + i, tmp + i, fmprb_radref(x + i), FMPRB_RAD_PREC, FMPR_RND_UP);
+                fmpr_add(tmp + i, tmp + i, fmprb_radref(x + i),
+                    FMPRB_RAD_PREC, FMPR_RND_UP);
            }
-            _fmpr_vec_get_fmpz_2exp_blocks(yz, ye, yblocks, scale, tmp, xlen, 1, FMPRB_RAD_PREC);

-            _fmprb_poly_addmullow_rad(z, zz, xz, xe, xblocks, xrlen, yz, ye, yblocks, xlen, n);
+            can_use_doubles = _fmpr_vec_get_fmpz_2exp_blocks(yz, ydbl, ye,
+                yblocks, scale, tmp, xlen, 1,
+                FMPRB_RAD_PREC, can_use_doubles);
+
+            _fmprb_poly_addmullow_rad(z, zz,
+                xz, xdbl, xe, xblocks, xrlen,
+                yz, ydbl, ye, yblocks, xlen, n, can_use_doubles);
        }
        else if (yrlen == 0)
        {
            /* xr * |ym| */
-            _fmpr_vec_get_fmpz_2exp_blocks(xz, xe, xblocks, scale, fmprb_radref(x), xrlen, 2, FMPRB_RAD_PREC);
+            can_use_doubles = _fmpr_vec_get_fmpz_2exp_blocks(xz, xdbl, xe,
+                xblocks, scale, fmprb_radref(x), xrlen, 2,
+                FMPRB_RAD_PREC, can_use_doubles);
+
            for (i = 0; i < ymlen; i++)
-                fmpr_abs_round(tmp + i, fmprb_midref(y + i), FMPRB_RAD_PREC, FMPR_RND_UP);
-            _fmpr_vec_get_fmpz_2exp_blocks(yz, ye, yblocks, scale, tmp, ymlen, 1, FMPRB_RAD_PREC);
-            _fmprb_poly_addmullow_rad(z, zz, xz, xe, xblocks, xrlen, yz, ye, yblocks, ymlen, n);
+                fmpr_abs_round(tmp + i, fmprb_midref(y + i),
+                    FMPRB_RAD_PREC, FMPR_RND_UP);
+
+            can_use_doubles = _fmpr_vec_get_fmpz_2exp_blocks(yz, ydbl, ye,
+                yblocks, scale, tmp, ymlen, 1,
+                FMPRB_RAD_PREC, can_use_doubles);
+
+            _fmprb_poly_addmullow_rad(z, zz,
+                xz, xdbl, xe, xblocks, xrlen,
+                yz, ydbl, ye, yblocks, ymlen, n, can_use_doubles);
        }
        else
        {
            /* |xm| * yr */
            for (i = 0; i < xmlen; i++)
-                fmpr_abs_round(tmp + i, fmprb_midref(x + i), FMPRB_RAD_PREC, FMPR_RND_UP);
+                fmpr_abs_round(tmp + i, fmprb_midref(x + i),
+                    FMPRB_RAD_PREC, FMPR_RND_UP);

-            _fmpr_vec_get_fmpz_2exp_blocks(xz, xe, xblocks, scale, tmp, xmlen, 1, FMPRB_RAD_PREC);
+            can_use_doubles = _fmpr_vec_get_fmpz_2exp_blocks(xz, xdbl, xe,
+                xblocks, scale, tmp, xmlen, 1,
+                FMPRB_RAD_PREC, can_use_doubles);

-            _fmpr_vec_get_fmpz_2exp_blocks(yz, ye, yblocks, scale, fmprb_radref(y), yrlen, 2, FMPRB_RAD_PREC);
+            can_use_doubles = _fmpr_vec_get_fmpz_2exp_blocks(yz, ydbl, ye,
+                yblocks, scale, fmprb_radref(y), yrlen, 2,
+                FMPRB_RAD_PREC, can_use_doubles);

-            _fmprb_poly_addmullow_rad(z, zz, xz, xe, xblocks, xmlen, yz, ye, yblocks, yrlen, n);
+            _fmprb_poly_addmullow_rad(z, zz,
+                xz, xdbl, xe, xblocks, xmlen,
+                yz, ydbl, ye, yblocks, yrlen, n, can_use_doubles);

            /* xr*(|ym| + yr) */
            if (xrlen != 0)
            {
-                _fmpr_vec_get_fmpz_2exp_blocks(xz, xe, xblocks, scale, fmprb_radref(x), xrlen, 2, FMPRB_RAD_PREC);
+                can_use_doubles = 1;
+                can_use_doubles = _fmpr_vec_get_fmpz_2exp_blocks(xz, xdbl, xe,
+                    xblocks, scale, fmprb_radref(x), xrlen, 2,
+                    FMPRB_RAD_PREC, can_use_doubles);

                for (i = 0; i < ylen; i++)
-                    fmpr_add_abs_ubound(tmp + i, fmprb_midref(y + i), fmprb_radref(y + i), FMPRB_RAD_PREC);
+                    fmpr_add_abs_ubound(tmp + i, fmprb_midref(y + i),
+                        fmprb_radref(y + i), FMPRB_RAD_PREC);

-                _fmpr_vec_get_fmpz_2exp_blocks(yz, ye, yblocks, scale, tmp, ylen, 1, FMPRB_RAD_PREC);
-                _fmprb_poly_addmullow_rad(z, zz, xz, xe, xblocks, xrlen, yz, ye, yblocks, ylen, n);
+                can_use_doubles = _fmpr_vec_get_fmpz_2exp_blocks(yz, ydbl, ye,
+                    yblocks, scale, tmp, ylen, 1,
+                    FMPRB_RAD_PREC, can_use_doubles);
+
+                _fmprb_poly_addmullow_rad(z, zz,
+                    xz, xdbl, xe, xblocks, xrlen,
+                    yz, ydbl, ye, yblocks, ylen, n, can_use_doubles);
            }
        }

        _fmpr_vec_clear(tmp, FLINT_MAX(xlen, ylen));
+        flint_free(xdbl);
+        flint_free(ydbl);
    }

    /* multiply midpoints */
    if (xmlen != 0 && ymlen != 0)
    {
-        _fmpr_vec_get_fmpz_2exp_blocks(xz, xe, xblocks, scale, fmprb_midref(x), xmlen, 2, prec);
+        _fmpr_vec_get_fmpz_2exp_blocks(xz, NULL, xe, xblocks,
+            scale, fmprb_midref(x), xmlen, 2, prec, 0);

        if (squaring)
        {
-            _fmprb_poly_addmullow_block(z, zz, xz, xe, xblocks, xmlen, xz, xe, xblocks, xmlen, n, prec, 1);
+            _fmprb_poly_addmullow_block(z, zz,
+                xz, xe, xblocks, xmlen, xz, xe, xblocks, xmlen, n, prec, 1);
        }
        else
        {
-            _fmpr_vec_get_fmpz_2exp_blocks(yz, ye, yblocks, scale, fmprb_midref(y), ymlen, 2, prec);
-            _fmprb_poly_addmullow_block(z, zz, xz, xe, xblocks, xmlen, yz, ye, yblocks, ymlen, n, prec, 0);
+            _fmpr_vec_get_fmpz_2exp_blocks(yz, NULL, ye, yblocks,
+                scale, fmprb_midref(y), ymlen, 2, prec, 0);
+
+            _fmprb_poly_addmullow_block(z, zz,
+                xz, xe, xblocks, xmlen,
+                yz, ye, yblocks, ymlen, n, prec, 0);
        }
    }