arb/arb_mat/addmul_rad_mag_fast.c

/*
    Copyright (C) 2018 Fredrik Johansson

    This file is part of Arb.

    Arb is free software: you can redistribute it and/or modify it under
    the terms of the GNU Lesser General Public License (LGPL) as published
    by the Free Software Foundation; either version 2.1 of the License, or
    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
*/

#include "arb_mat.h"

/* Block size for better cache locality. */
#define BLOCK_SIZE 32

/* Don't convert to doubles when smaller than this block size. */
#define MIN_D_BLOCK_SIZE 5

/* With doubles, we can have an exponent range of about 1024, minus some
   slack for accumulated sums. */
#define DOUBLE_MAX_OFFSET 900

static __inline__ double dot8(const double * A, const double * B)
{
    return ((A[0] * B[0] + A[1] * B[1]) + (A[2] * B[2] + A[3] * B[3])) +
           ((A[4] * B[4] + A[5] * B[5]) + (A[6] * B[6] + A[7] * B[7]));
}

/* Upper bound of matrix product, assuming nonnegative entries and
   no overflow/underflow. B is pre-transposed. Straightforward blocked
   implementation; could use BLAS, but this matrix product is rarely going
   to be the bottleneck. */

static void
_d_mat_addmul(double * C, const double * A, const double * B, slong ar, slong ac, slong bc)
{
    slong ii, jj, kk, i, j, k;
    double t, eps;

    eps = ldexp(1.0, -52);

    for (ii = 0; ii < ar; ii += BLOCK_SIZE)
    {
        for (jj = 0; jj < bc; jj += BLOCK_SIZE)
        {
            for (kk = 0; kk < ac; kk += BLOCK_SIZE)
            {
                for (i = ii; i < FLINT_MIN(ii + BLOCK_SIZE, ar); i++)
                {
                    for (j = jj; j < FLINT_MIN(jj + BLOCK_SIZE, bc); j++)
                    {
                        if (BLOCK_SIZE == 32 && kk + BLOCK_SIZE <= ac)
                        {
                            double t0, t1, t2, t3;

                            t0 = dot8(A + i * ac + kk + 0, B + j * ac + kk + 0);
                            t1 = dot8(A + i * ac + kk + 8, B + j * ac + kk + 8);
                            t2 = dot8(A + i * ac + kk + 16, B + j * ac + kk + 16);
                            t3 = dot8(A + i * ac + kk + 24, B + j * ac + kk + 24);

                            t = (t0 + t1) + (t2 + t3);
                        }
                        else
                        {
                            t = 0.0;

                            for (k = kk; k < FLINT_MIN(kk + BLOCK_SIZE, ac); k++)
                                t += A[i * ac + k] * B[j * ac + k];
                        }

                        C[i * bc + j] += t;
                    }
                }
            }
        }
    }

    /* Compensate for possible rounding errors */
    for (i = 0; i < ar; i++)
        for (j = 0; j < bc; j++)
            C[i * bc + j] *= (1.0 + 2.01 * (ac + 1) * eps);
}

/* We use WORD_MIN to represent zero here. */
static __inline__ slong _mag_get_exp(const mag_t x)
{
    if (mag_is_special(x))
        return WORD_MIN;
    else
        return MAG_EXP(x);
}

static double
mag_get_d_fixed_si(const mag_t x, slong e)
{
    return ldexp(MAG_MAN(x), MAG_EXP(x) - e - MAG_BITS);
}

void
_arb_mat_addmul_rad_mag_fast(arb_mat_t C, mag_srcptr A, mag_srcptr B,
    slong ar, slong ac, slong bc)
{
    slong i, j, k, M, N, P, top, n, block_start, block_end;
    slong *A_min, *A_max, *B_min, *B_max, max_offset;
    double *CC, *AA, *BB;

    M = ar;
    N = ac;
    P = bc;

    /* todo: could use TMP_ALLOC */
    A_min = flint_malloc(sizeof(slong) * M);
    A_max = flint_malloc(sizeof(slong) * M);
    B_min = flint_malloc(sizeof(slong) * P);
    B_max = flint_malloc(sizeof(slong) * P);

    AA = flint_malloc(ar * ac * sizeof(double));
    BB = flint_malloc(ac * bc * sizeof(double));
    CC = flint_malloc(ar * bc * sizeof(double));

    max_offset = DOUBLE_MAX_OFFSET;

    block_start = 0;
    while (block_start < N)
    {
        block_end = block_start + 1;  /* index is exclusive block_end */

        /* begin with this column of A and row of B */
        for (i = 0; i < M; i++)
            A_max[i] = A_min[i] = _mag_get_exp(A + i * N + block_start);
        for (i = 0; i < P; i++)
            B_max[i] = B_min[i] = _mag_get_exp(B + i * N + block_start);

        while (block_end < N)
        {
            /* check if we can extend with column [block_end] of A */
            for (i = 0; i < M; i++)
            {
                top = _mag_get_exp(A + i * N + block_end);
                /* zeros are irrelevant */
                if (top == WORD_MIN || A_max[i] == WORD_MIN)
                    continue;
                /* jump will be too big */
                if (top > A_min[i] + max_offset || top < A_max[i] - max_offset)
                    goto mblocks_built;
            }

            /* check if we can extend with row [block_end] of B */
            for (i = 0; i < P; i++)
            {
                top = _mag_get_exp(B + i * N + block_end);
                if (top == WORD_MIN || B_max[i] == WORD_MIN)
                    continue;
                if (top > B_min[i] + max_offset || top < B_max[i] - max_offset)
                    goto mblocks_built;
            }

            /* second pass to update the extreme values */
            for (i = 0; i < M; i++)
            {
                top = _mag_get_exp(A + i * N + block_end);
                if (A_max[i] == WORD_MIN)
                {
                    A_max[i] = top;
                    A_min[i] = top;
                }
                else if (top != WORD_MIN)
                {
                    if (top < A_min[i]) A_min[i] = top;
                    if (top > A_max[i]) A_max[i] = top;
                }
            }

            for (i = 0; i < P; i++)
            {
                top = _mag_get_exp(B + i * N + block_end);
                if (B_max[i] == WORD_MIN)
                {
                    B_max[i] = top;
                    B_min[i] = top;
                }
                else if (top != WORD_MIN)
                {
                    if (top < B_min[i]) B_min[i] = top;
                    if (top > B_max[i]) B_max[i] = top;
                }
            }

            block_end++;
        }

    mblocks_built:

        n = block_end - block_start;

        if (n <= MIN_D_BLOCK_SIZE)
        {
            /* increment so we don't just do steps of 1 in degenerate cases */
            block_end = FLINT_MIN(block_start + MIN_D_BLOCK_SIZE, N);
            n = block_end - block_start;

            for (i = 0; i < ar; i++)
            {
                for (j = 0; j < bc; j++)
                {
                    for (k = 0; k < n; k++)
                    {
                        mag_fast_addmul(arb_radref(arb_mat_entry(C, i, j)),
                            A + i * ac + block_start + k,
                            B + j * ac + block_start + k);
                    }
                }
            }
        }
        else
        {
            for (i = 0; i < ar; i++)
            {
                if (A_min[i] == WORD_MIN)  /* only zeros in this row */
                    continue;

                A_min[i] = (A_min[i] + A_max[i]) / 2;

                for (j = 0; j < n; j++)
                    AA[i * n + j] = mag_get_d_fixed_si(A + i * ac + block_start + j, A_min[i]);
            }

            /* Note: B and BB are both transposed in memory */
            for (i = 0; i < bc; i++)
            {
                if (B_min[i] == WORD_MIN)  /* only zeros in this column */
                    continue;

                B_min[i] = (B_min[i] + B_max[i]) / 2;

                for (j = 0; j < n; j++)
                    BB[i * n + j] = mag_get_d_fixed_si(B + i * ac + block_start + j, B_min[i]);
            }

            for (i = 0; i < ar * bc; i++)
                CC[i] = 0.0;

            _d_mat_addmul(CC, AA, BB, ar, n, bc);

            for (i = 0; i < ar; i++)
            {
                if (A_min[i] == WORD_MIN)
                    continue;

                for (j = 0; j < bc; j++)
                {
                    if (B_min[j] == WORD_MIN)
                        continue;

                    if (CC[i * bc + j] != 0.0)
                    {
                        mag_t t;
                        MAG_SET_D_2EXP(MAG_MAN(t), MAG_EXP(t), CC[i * bc + j], A_min[i] + B_min[j]);
                        mag_add(arb_radref(arb_mat_entry(C, i, j)),
                                arb_radref(arb_mat_entry(C, i, j)), t);
                    }
                }
            }
        }

        block_start = block_end;
    }

    flint_free(A_max);
    flint_free(A_min);
    flint_free(B_max);
    flint_free(B_min);

    flint_free(AA);
    flint_free(BB);
    flint_free(CC);
}
faster arb_mat_mul using blocks, scaling and multiplying via fmpz_mat_mul and doubles 2018-06-25 19:39:55 +02:00			`/*`
			`Copyright (C) 2018 Fredrik Johansson`

			`This file is part of Arb.`

			`Arb is free software: you can redistribute it and/or modify it under`
			`the terms of the GNU Lesser General Public License (LGPL) as published`
			`by the Free Software Foundation; either version 2.1 of the License, or`
			`(at your option) any later version. See <http://www.gnu.org/licenses/>.`
			`*/`

			`#include "arb_mat.h"`

			`/* Block size for better cache locality. */`
			`#define BLOCK_SIZE 32`

			`/* Don't convert to doubles when smaller than this block size. */`
			`#define MIN_D_BLOCK_SIZE 5`

			`/* With doubles, we can have an exponent range of about 1024, minus some`
			`slack for accumulated sums. */`
			`#define DOUBLE_MAX_OFFSET 900`

approximate computation of eigenvalues for acb_mat 2018-11-20 21:31:40 +01:00			`static __inline__ double dot8(const double * A, const double * B)`
			`{`
			`return ((A[0] * B[0] + A[1] * B[1]) + (A[2] * B[2] + A[3] * B[3])) +`
			`((A[4] * B[4] + A[5] * B[5]) + (A[6] * B[6] + A[7] * B[7]));`
			`}`

faster arb_mat_mul using blocks, scaling and multiplying via fmpz_mat_mul and doubles 2018-06-25 19:39:55 +02:00			`/* Upper bound of matrix product, assuming nonnegative entries and`
			`no overflow/underflow. B is pre-transposed. Straightforward blocked`
			`implementation; could use BLAS, but this matrix product is rarely going`
			`to be the bottleneck. */`
approximate computation of eigenvalues for acb_mat 2018-11-20 21:31:40 +01:00
faster arb_mat_mul using blocks, scaling and multiplying via fmpz_mat_mul and doubles 2018-06-25 19:39:55 +02:00			`static void`
			`_d_mat_addmul(double * C, const double * A, const double * B, slong ar, slong ac, slong bc)`
			`{`
			`slong ii, jj, kk, i, j, k;`
			`double t, eps;`

			`eps = ldexp(1.0, -52);`

			`for (ii = 0; ii < ar; ii += BLOCK_SIZE)`
			`{`
			`for (jj = 0; jj < bc; jj += BLOCK_SIZE)`
			`{`
			`for (kk = 0; kk < ac; kk += BLOCK_SIZE)`
			`{`
			`for (i = ii; i < FLINT_MIN(ii + BLOCK_SIZE, ar); i++)`
			`{`
			`for (j = jj; j < FLINT_MIN(jj + BLOCK_SIZE, bc); j++)`
			`{`
approximate computation of eigenvalues for acb_mat 2018-11-20 21:31:40 +01:00			`if (BLOCK_SIZE == 32 && kk + BLOCK_SIZE <= ac)`
			`{`
			`double t0, t1, t2, t3;`

			`t0 = dot8(A + i * ac + kk + 0, B + j * ac + kk + 0);`
			`t1 = dot8(A + i * ac + kk + 8, B + j * ac + kk + 8);`
			`t2 = dot8(A + i * ac + kk + 16, B + j * ac + kk + 16);`
			`t3 = dot8(A + i * ac + kk + 24, B + j * ac + kk + 24);`

			`t = (t0 + t1) + (t2 + t3);`
			`}`
			`else`
			`{`
			`t = 0.0;`

			`for (k = kk; k < FLINT_MIN(kk + BLOCK_SIZE, ac); k++)`
			`t += A[i * ac + k] * B[j * ac + k];`
			`}`
faster arb_mat_mul using blocks, scaling and multiplying via fmpz_mat_mul and doubles 2018-06-25 19:39:55 +02:00
			`C[i * bc + j] += t;`
			`}`
			`}`
			`}`
			`}`
			`}`

			`/* Compensate for possible rounding errors */`
			`for (i = 0; i < ar; i++)`
			`for (j = 0; j < bc; j++)`
			`C[i * bc + j] = (1.0 + 2.01 (ac + 1) * eps);`
			`}`

			`/* We use WORD_MIN to represent zero here. */`
			`static __inline__ slong _mag_get_exp(const mag_t x)`
			`{`
			`if (mag_is_special(x))`
			`return WORD_MIN;`
			`else`
			`return MAG_EXP(x);`
			`}`

			`static double`
			`mag_get_d_fixed_si(const mag_t x, slong e)`
			`{`
			`return ldexp(MAG_MAN(x), MAG_EXP(x) - e - MAG_BITS);`
			`}`

			`void`
			`_arb_mat_addmul_rad_mag_fast(arb_mat_t C, mag_srcptr A, mag_srcptr B,`
			`slong ar, slong ac, slong bc)`
			`{`
			`slong i, j, k, M, N, P, top, n, block_start, block_end;`
			`slong A_min, A_max, B_min, B_max, max_offset;`
			`double CC, AA, *BB;`

			`M = ar;`
			`N = ac;`
			`P = bc;`

			`/* todo: could use TMP_ALLOC */`
			`A_min = flint_malloc(sizeof(slong) * M);`
			`A_max = flint_malloc(sizeof(slong) * M);`
			`B_min = flint_malloc(sizeof(slong) * P);`
			`B_max = flint_malloc(sizeof(slong) * P);`

			`AA = flint_malloc(ar * ac * sizeof(double));`
			`BB = flint_malloc(ac * bc * sizeof(double));`
			`CC = flint_malloc(ar * bc * sizeof(double));`

			`max_offset = DOUBLE_MAX_OFFSET;`

			`block_start = 0;`
			`while (block_start < N)`
			`{`
			`block_end = block_start + 1; /* index is exclusive block_end */`

			`/* begin with this column of A and row of B */`
			`for (i = 0; i < M; i++)`
			`A_max[i] = A_min[i] = _mag_get_exp(A + i * N + block_start);`
			`for (i = 0; i < P; i++)`
			`B_max[i] = B_min[i] = _mag_get_exp(B + i * N + block_start);`

			`while (block_end < N)`
			`{`
			`/* check if we can extend with column [block_end] of A */`
			`for (i = 0; i < M; i++)`
			`{`
			`top = _mag_get_exp(A + i * N + block_end);`
			`/* zeros are irrelevant */`
			`if (top == WORD_MIN \|\| A_max[i] == WORD_MIN)`
			`continue;`
			`/* jump will be too big */`
			`if (top > A_min[i] + max_offset \|\| top < A_max[i] - max_offset)`
			`goto mblocks_built;`
			`}`

			`/* check if we can extend with row [block_end] of B */`
			`for (i = 0; i < P; i++)`
			`{`
			`top = _mag_get_exp(B + i * N + block_end);`
			`if (top == WORD_MIN \|\| B_max[i] == WORD_MIN)`
			`continue;`
			`if (top > B_min[i] + max_offset \|\| top < B_max[i] - max_offset)`
			`goto mblocks_built;`
			`}`

			`/* second pass to update the extreme values */`
			`for (i = 0; i < M; i++)`
			`{`
			`top = _mag_get_exp(A + i * N + block_end);`
			`if (A_max[i] == WORD_MIN)`
			`{`
			`A_max[i] = top;`
			`A_min[i] = top;`
			`}`
			`else if (top != WORD_MIN)`
			`{`
			`if (top < A_min[i]) A_min[i] = top;`
			`if (top > A_max[i]) A_max[i] = top;`
			`}`
			`}`

			`for (i = 0; i < P; i++)`
			`{`
			`top = _mag_get_exp(B + i * N + block_end);`
			`if (B_max[i] == WORD_MIN)`
			`{`
			`B_max[i] = top;`
			`B_min[i] = top;`
			`}`
			`else if (top != WORD_MIN)`
			`{`
			`if (top < B_min[i]) B_min[i] = top;`
			`if (top > B_max[i]) B_max[i] = top;`
			`}`
			`}`

			`block_end++;`
			`}`

			`mblocks_built:`

			`n = block_end - block_start;`

			`if (n <= MIN_D_BLOCK_SIZE)`
			`{`
			`/* increment so we don't just do steps of 1 in degenerate cases */`
			`block_end = FLINT_MIN(block_start + MIN_D_BLOCK_SIZE, N);`
			`n = block_end - block_start;`

			`for (i = 0; i < ar; i++)`
			`{`
			`for (j = 0; j < bc; j++)`
			`{`
			`for (k = 0; k < n; k++)`
			`{`
			`mag_fast_addmul(arb_radref(arb_mat_entry(C, i, j)),`
			`A + i * ac + block_start + k,`
			`B + j * ac + block_start + k);`
			`}`
			`}`
			`}`
			`}`
			`else`
			`{`
			`for (i = 0; i < ar; i++)`
			`{`
			`if (A_min[i] == WORD_MIN) /* only zeros in this row */`
			`continue;`

			`A_min[i] = (A_min[i] + A_max[i]) / 2;`

			`for (j = 0; j < n; j++)`
			`AA[i * n + j] = mag_get_d_fixed_si(A + i * ac + block_start + j, A_min[i]);`
			`}`

			`/* Note: B and BB are both transposed in memory */`
			`for (i = 0; i < bc; i++)`
			`{`
			`if (B_min[i] == WORD_MIN) /* only zeros in this column */`
			`continue;`

			`B_min[i] = (B_min[i] + B_max[i]) / 2;`

			`for (j = 0; j < n; j++)`
			`BB[i * n + j] = mag_get_d_fixed_si(B + i * ac + block_start + j, B_min[i]);`
			`}`

			`for (i = 0; i < ar * bc; i++)`
			`CC[i] = 0.0;`

			`_d_mat_addmul(CC, AA, BB, ar, n, bc);`

			`for (i = 0; i < ar; i++)`
			`{`
			`if (A_min[i] == WORD_MIN)`
			`continue;`

			`for (j = 0; j < bc; j++)`
			`{`
			`if (B_min[j] == WORD_MIN)`
			`continue;`

			`if (CC[i * bc + j] != 0.0)`
			`{`
			`mag_t t;`
			`MAG_SET_D_2EXP(MAG_MAN(t), MAG_EXP(t), CC[i * bc + j], A_min[i] + B_min[j]);`
			`mag_add(arb_radref(arb_mat_entry(C, i, j)),`
			`arb_radref(arb_mat_entry(C, i, j)), t);`
			`}`
			`}`
			`}`
			`}`

			`block_start = block_end;`
			`}`

			`flint_free(A_max);`
			`flint_free(A_min);`
			`flint_free(B_max);`
			`flint_free(B_min);`

			`flint_free(AA);`
			`flint_free(BB);`
			`flint_free(CC);`
			`}`