arb/arb_mat/addmul_rad_mag_fast.c

279 lines
8.5 KiB
C

/*
Copyright (C) 2018 Fredrik Johansson
This file is part of Arb.
Arb is free software: you can redistribute it and/or modify it under
the terms of the GNU Lesser General Public License (LGPL) as published
by the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version. See <http://www.gnu.org/licenses/>.
*/
#include "arb_mat.h"
/* Block size for better cache locality. */
#define BLOCK_SIZE 32
/* Don't convert to doubles when smaller than this block size. */
#define MIN_D_BLOCK_SIZE 5
/* With doubles, we can have an exponent range of about 1024, minus some
slack for accumulated sums. */
#define DOUBLE_MAX_OFFSET 900
static __inline__ double dot8(const double * A, const double * B)
{
return ((A[0] * B[0] + A[1] * B[1]) + (A[2] * B[2] + A[3] * B[3])) +
((A[4] * B[4] + A[5] * B[5]) + (A[6] * B[6] + A[7] * B[7]));
}
/* Upper bound of matrix product, assuming nonnegative entries and
no overflow/underflow. B is pre-transposed. Straightforward blocked
implementation; could use BLAS, but this matrix product is rarely going
to be the bottleneck. */
static void
_d_mat_addmul(double * C, const double * A, const double * B, slong ar, slong ac, slong bc)
{
slong ii, jj, kk, i, j, k;
double t, eps;
eps = ldexp(1.0, -52);
for (ii = 0; ii < ar; ii += BLOCK_SIZE)
{
for (jj = 0; jj < bc; jj += BLOCK_SIZE)
{
for (kk = 0; kk < ac; kk += BLOCK_SIZE)
{
for (i = ii; i < FLINT_MIN(ii + BLOCK_SIZE, ar); i++)
{
for (j = jj; j < FLINT_MIN(jj + BLOCK_SIZE, bc); j++)
{
if (BLOCK_SIZE == 32 && kk + BLOCK_SIZE <= ac)
{
double t0, t1, t2, t3;
t0 = dot8(A + i * ac + kk + 0, B + j * ac + kk + 0);
t1 = dot8(A + i * ac + kk + 8, B + j * ac + kk + 8);
t2 = dot8(A + i * ac + kk + 16, B + j * ac + kk + 16);
t3 = dot8(A + i * ac + kk + 24, B + j * ac + kk + 24);
t = (t0 + t1) + (t2 + t3);
}
else
{
t = 0.0;
for (k = kk; k < FLINT_MIN(kk + BLOCK_SIZE, ac); k++)
t += A[i * ac + k] * B[j * ac + k];
}
C[i * bc + j] += t;
}
}
}
}
}
/* Compensate for possible rounding errors */
for (i = 0; i < ar; i++)
for (j = 0; j < bc; j++)
C[i * bc + j] *= (1.0 + 2.01 * (ac + 1) * eps);
}
/* We use WORD_MIN to represent zero here. */
static __inline__ slong _mag_get_exp(const mag_t x)
{
if (mag_is_special(x))
return WORD_MIN;
else
return MAG_EXP(x);
}
static double
mag_get_d_fixed_si(const mag_t x, slong e)
{
return ldexp(MAG_MAN(x), MAG_EXP(x) - e - MAG_BITS);
}
void
_arb_mat_addmul_rad_mag_fast(arb_mat_t C, mag_srcptr A, mag_srcptr B,
slong ar, slong ac, slong bc)
{
slong i, j, k, M, N, P, top, n, block_start, block_end;
slong *A_min, *A_max, *B_min, *B_max, max_offset;
double *CC, *AA, *BB;
M = ar;
N = ac;
P = bc;
/* todo: could use TMP_ALLOC */
A_min = flint_malloc(sizeof(slong) * M);
A_max = flint_malloc(sizeof(slong) * M);
B_min = flint_malloc(sizeof(slong) * P);
B_max = flint_malloc(sizeof(slong) * P);
AA = flint_malloc(ar * ac * sizeof(double));
BB = flint_malloc(ac * bc * sizeof(double));
CC = flint_malloc(ar * bc * sizeof(double));
max_offset = DOUBLE_MAX_OFFSET;
block_start = 0;
while (block_start < N)
{
block_end = block_start + 1; /* index is exclusive block_end */
/* begin with this column of A and row of B */
for (i = 0; i < M; i++)
A_max[i] = A_min[i] = _mag_get_exp(A + i * N + block_start);
for (i = 0; i < P; i++)
B_max[i] = B_min[i] = _mag_get_exp(B + i * N + block_start);
while (block_end < N)
{
/* check if we can extend with column [block_end] of A */
for (i = 0; i < M; i++)
{
top = _mag_get_exp(A + i * N + block_end);
/* zeros are irrelevant */
if (top == WORD_MIN || A_max[i] == WORD_MIN)
continue;
/* jump will be too big */
if (top > A_min[i] + max_offset || top < A_max[i] - max_offset)
goto mblocks_built;
}
/* check if we can extend with row [block_end] of B */
for (i = 0; i < P; i++)
{
top = _mag_get_exp(B + i * N + block_end);
if (top == WORD_MIN || B_max[i] == WORD_MIN)
continue;
if (top > B_min[i] + max_offset || top < B_max[i] - max_offset)
goto mblocks_built;
}
/* second pass to update the extreme values */
for (i = 0; i < M; i++)
{
top = _mag_get_exp(A + i * N + block_end);
if (A_max[i] == WORD_MIN)
{
A_max[i] = top;
A_min[i] = top;
}
else if (top != WORD_MIN)
{
if (top < A_min[i]) A_min[i] = top;
if (top > A_max[i]) A_max[i] = top;
}
}
for (i = 0; i < P; i++)
{
top = _mag_get_exp(B + i * N + block_end);
if (B_max[i] == WORD_MIN)
{
B_max[i] = top;
B_min[i] = top;
}
else if (top != WORD_MIN)
{
if (top < B_min[i]) B_min[i] = top;
if (top > B_max[i]) B_max[i] = top;
}
}
block_end++;
}
mblocks_built:
n = block_end - block_start;
if (n <= MIN_D_BLOCK_SIZE)
{
/* increment so we don't just do steps of 1 in degenerate cases */
block_end = FLINT_MIN(block_start + MIN_D_BLOCK_SIZE, N);
n = block_end - block_start;
for (i = 0; i < ar; i++)
{
for (j = 0; j < bc; j++)
{
for (k = 0; k < n; k++)
{
mag_fast_addmul(arb_radref(arb_mat_entry(C, i, j)),
A + i * ac + block_start + k,
B + j * ac + block_start + k);
}
}
}
}
else
{
for (i = 0; i < ar; i++)
{
if (A_min[i] == WORD_MIN) /* only zeros in this row */
continue;
A_min[i] = (A_min[i] + A_max[i]) / 2;
for (j = 0; j < n; j++)
AA[i * n + j] = mag_get_d_fixed_si(A + i * ac + block_start + j, A_min[i]);
}
/* Note: B and BB are both transposed in memory */
for (i = 0; i < bc; i++)
{
if (B_min[i] == WORD_MIN) /* only zeros in this column */
continue;
B_min[i] = (B_min[i] + B_max[i]) / 2;
for (j = 0; j < n; j++)
BB[i * n + j] = mag_get_d_fixed_si(B + i * ac + block_start + j, B_min[i]);
}
for (i = 0; i < ar * bc; i++)
CC[i] = 0.0;
_d_mat_addmul(CC, AA, BB, ar, n, bc);
for (i = 0; i < ar; i++)
{
if (A_min[i] == WORD_MIN)
continue;
for (j = 0; j < bc; j++)
{
if (B_min[j] == WORD_MIN)
continue;
if (CC[i * bc + j] != 0.0)
{
mag_t t;
MAG_SET_D_2EXP(MAG_MAN(t), MAG_EXP(t), CC[i * bc + j], A_min[i] + B_min[j]);
mag_add(arb_radref(arb_mat_entry(C, i, j)),
arb_radref(arb_mat_entry(C, i, j)), t);
}
}
}
}
block_start = block_end;
}
flint_free(A_max);
flint_free(A_min);
flint_free(B_max);
flint_free(B_min);
flint_free(AA);
flint_free(BB);
flint_free(CC);
}