2018-06-25 19:39:55 +02:00
|
|
|
/*
|
|
|
|
Copyright (C) 2018 Fredrik Johansson
|
|
|
|
|
|
|
|
This file is part of Arb.
|
|
|
|
|
|
|
|
Arb is free software: you can redistribute it and/or modify it under
|
|
|
|
the terms of the GNU Lesser General Public License (LGPL) as published
|
|
|
|
by the Free Software Foundation; either version 2.1 of the License, or
|
|
|
|
(at your option) any later version. See <http://www.gnu.org/licenses/>.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "arb_mat.h"
|
|
|
|
|
|
|
|
/* Block size for better cache locality. */
|
|
|
|
#define BLOCK_SIZE 32
|
|
|
|
|
|
|
|
/* Don't convert to doubles when smaller than this block size. */
|
|
|
|
#define MIN_D_BLOCK_SIZE 5
|
|
|
|
|
|
|
|
/* With doubles, we can have an exponent range of about 1024, minus some
|
|
|
|
slack for accumulated sums. */
|
|
|
|
#define DOUBLE_MAX_OFFSET 900
|
|
|
|
|
2018-11-20 21:31:40 +01:00
|
|
|
static __inline__ double dot8(const double * A, const double * B)
|
|
|
|
{
|
|
|
|
return ((A[0] * B[0] + A[1] * B[1]) + (A[2] * B[2] + A[3] * B[3])) +
|
|
|
|
((A[4] * B[4] + A[5] * B[5]) + (A[6] * B[6] + A[7] * B[7]));
|
|
|
|
}
|
|
|
|
|
2018-06-25 19:39:55 +02:00
|
|
|
/* Upper bound of matrix product, assuming nonnegative entries and
|
|
|
|
no overflow/underflow. B is pre-transposed. Straightforward blocked
|
|
|
|
implementation; could use BLAS, but this matrix product is rarely going
|
|
|
|
to be the bottleneck. */
|
2018-11-20 21:31:40 +01:00
|
|
|
|
2018-06-25 19:39:55 +02:00
|
|
|
static void
|
|
|
|
_d_mat_addmul(double * C, const double * A, const double * B, slong ar, slong ac, slong bc)
|
|
|
|
{
|
|
|
|
slong ii, jj, kk, i, j, k;
|
|
|
|
double t, eps;
|
|
|
|
|
|
|
|
eps = ldexp(1.0, -52);
|
|
|
|
|
|
|
|
for (ii = 0; ii < ar; ii += BLOCK_SIZE)
|
|
|
|
{
|
|
|
|
for (jj = 0; jj < bc; jj += BLOCK_SIZE)
|
|
|
|
{
|
|
|
|
for (kk = 0; kk < ac; kk += BLOCK_SIZE)
|
|
|
|
{
|
|
|
|
for (i = ii; i < FLINT_MIN(ii + BLOCK_SIZE, ar); i++)
|
|
|
|
{
|
|
|
|
for (j = jj; j < FLINT_MIN(jj + BLOCK_SIZE, bc); j++)
|
|
|
|
{
|
2018-11-20 21:31:40 +01:00
|
|
|
if (BLOCK_SIZE == 32 && kk + BLOCK_SIZE <= ac)
|
|
|
|
{
|
|
|
|
double t0, t1, t2, t3;
|
|
|
|
|
|
|
|
t0 = dot8(A + i * ac + kk + 0, B + j * ac + kk + 0);
|
|
|
|
t1 = dot8(A + i * ac + kk + 8, B + j * ac + kk + 8);
|
|
|
|
t2 = dot8(A + i * ac + kk + 16, B + j * ac + kk + 16);
|
|
|
|
t3 = dot8(A + i * ac + kk + 24, B + j * ac + kk + 24);
|
|
|
|
|
|
|
|
t = (t0 + t1) + (t2 + t3);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
t = 0.0;
|
|
|
|
|
|
|
|
for (k = kk; k < FLINT_MIN(kk + BLOCK_SIZE, ac); k++)
|
|
|
|
t += A[i * ac + k] * B[j * ac + k];
|
|
|
|
}
|
2018-06-25 19:39:55 +02:00
|
|
|
|
|
|
|
C[i * bc + j] += t;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Compensate for possible rounding errors */
|
|
|
|
for (i = 0; i < ar; i++)
|
|
|
|
for (j = 0; j < bc; j++)
|
|
|
|
C[i * bc + j] *= (1.0 + 2.01 * (ac + 1) * eps);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* We use WORD_MIN to represent zero here. */
|
|
|
|
static __inline__ slong _mag_get_exp(const mag_t x)
|
|
|
|
{
|
|
|
|
if (mag_is_special(x))
|
|
|
|
return WORD_MIN;
|
|
|
|
else
|
|
|
|
return MAG_EXP(x);
|
|
|
|
}
|
|
|
|
|
|
|
|
static double
|
|
|
|
mag_get_d_fixed_si(const mag_t x, slong e)
|
|
|
|
{
|
|
|
|
return ldexp(MAG_MAN(x), MAG_EXP(x) - e - MAG_BITS);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
_arb_mat_addmul_rad_mag_fast(arb_mat_t C, mag_srcptr A, mag_srcptr B,
|
|
|
|
slong ar, slong ac, slong bc)
|
|
|
|
{
|
|
|
|
slong i, j, k, M, N, P, top, n, block_start, block_end;
|
|
|
|
slong *A_min, *A_max, *B_min, *B_max, max_offset;
|
|
|
|
double *CC, *AA, *BB;
|
|
|
|
|
|
|
|
M = ar;
|
|
|
|
N = ac;
|
|
|
|
P = bc;
|
|
|
|
|
|
|
|
/* todo: could use TMP_ALLOC */
|
|
|
|
A_min = flint_malloc(sizeof(slong) * M);
|
|
|
|
A_max = flint_malloc(sizeof(slong) * M);
|
|
|
|
B_min = flint_malloc(sizeof(slong) * P);
|
|
|
|
B_max = flint_malloc(sizeof(slong) * P);
|
|
|
|
|
|
|
|
AA = flint_malloc(ar * ac * sizeof(double));
|
|
|
|
BB = flint_malloc(ac * bc * sizeof(double));
|
|
|
|
CC = flint_malloc(ar * bc * sizeof(double));
|
|
|
|
|
|
|
|
max_offset = DOUBLE_MAX_OFFSET;
|
|
|
|
|
|
|
|
block_start = 0;
|
|
|
|
while (block_start < N)
|
|
|
|
{
|
|
|
|
block_end = block_start + 1; /* index is exclusive block_end */
|
|
|
|
|
|
|
|
/* begin with this column of A and row of B */
|
|
|
|
for (i = 0; i < M; i++)
|
|
|
|
A_max[i] = A_min[i] = _mag_get_exp(A + i * N + block_start);
|
|
|
|
for (i = 0; i < P; i++)
|
|
|
|
B_max[i] = B_min[i] = _mag_get_exp(B + i * N + block_start);
|
|
|
|
|
|
|
|
while (block_end < N)
|
|
|
|
{
|
|
|
|
/* check if we can extend with column [block_end] of A */
|
|
|
|
for (i = 0; i < M; i++)
|
|
|
|
{
|
|
|
|
top = _mag_get_exp(A + i * N + block_end);
|
|
|
|
/* zeros are irrelevant */
|
|
|
|
if (top == WORD_MIN || A_max[i] == WORD_MIN)
|
|
|
|
continue;
|
|
|
|
/* jump will be too big */
|
|
|
|
if (top > A_min[i] + max_offset || top < A_max[i] - max_offset)
|
|
|
|
goto mblocks_built;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* check if we can extend with row [block_end] of B */
|
|
|
|
for (i = 0; i < P; i++)
|
|
|
|
{
|
|
|
|
top = _mag_get_exp(B + i * N + block_end);
|
|
|
|
if (top == WORD_MIN || B_max[i] == WORD_MIN)
|
|
|
|
continue;
|
|
|
|
if (top > B_min[i] + max_offset || top < B_max[i] - max_offset)
|
|
|
|
goto mblocks_built;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* second pass to update the extreme values */
|
|
|
|
for (i = 0; i < M; i++)
|
|
|
|
{
|
|
|
|
top = _mag_get_exp(A + i * N + block_end);
|
|
|
|
if (A_max[i] == WORD_MIN)
|
|
|
|
{
|
|
|
|
A_max[i] = top;
|
|
|
|
A_min[i] = top;
|
|
|
|
}
|
|
|
|
else if (top != WORD_MIN)
|
|
|
|
{
|
|
|
|
if (top < A_min[i]) A_min[i] = top;
|
|
|
|
if (top > A_max[i]) A_max[i] = top;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i < P; i++)
|
|
|
|
{
|
|
|
|
top = _mag_get_exp(B + i * N + block_end);
|
|
|
|
if (B_max[i] == WORD_MIN)
|
|
|
|
{
|
|
|
|
B_max[i] = top;
|
|
|
|
B_min[i] = top;
|
|
|
|
}
|
|
|
|
else if (top != WORD_MIN)
|
|
|
|
{
|
|
|
|
if (top < B_min[i]) B_min[i] = top;
|
|
|
|
if (top > B_max[i]) B_max[i] = top;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
block_end++;
|
|
|
|
}
|
|
|
|
|
|
|
|
mblocks_built:
|
|
|
|
|
|
|
|
n = block_end - block_start;
|
|
|
|
|
|
|
|
if (n <= MIN_D_BLOCK_SIZE)
|
|
|
|
{
|
|
|
|
/* increment so we don't just do steps of 1 in degenerate cases */
|
|
|
|
block_end = FLINT_MIN(block_start + MIN_D_BLOCK_SIZE, N);
|
|
|
|
n = block_end - block_start;
|
|
|
|
|
|
|
|
for (i = 0; i < ar; i++)
|
|
|
|
{
|
|
|
|
for (j = 0; j < bc; j++)
|
|
|
|
{
|
|
|
|
for (k = 0; k < n; k++)
|
|
|
|
{
|
|
|
|
mag_fast_addmul(arb_radref(arb_mat_entry(C, i, j)),
|
|
|
|
A + i * ac + block_start + k,
|
|
|
|
B + j * ac + block_start + k);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
for (i = 0; i < ar; i++)
|
|
|
|
{
|
|
|
|
if (A_min[i] == WORD_MIN) /* only zeros in this row */
|
|
|
|
continue;
|
|
|
|
|
|
|
|
A_min[i] = (A_min[i] + A_max[i]) / 2;
|
|
|
|
|
|
|
|
for (j = 0; j < n; j++)
|
|
|
|
AA[i * n + j] = mag_get_d_fixed_si(A + i * ac + block_start + j, A_min[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Note: B and BB are both transposed in memory */
|
|
|
|
for (i = 0; i < bc; i++)
|
|
|
|
{
|
|
|
|
if (B_min[i] == WORD_MIN) /* only zeros in this column */
|
|
|
|
continue;
|
|
|
|
|
|
|
|
B_min[i] = (B_min[i] + B_max[i]) / 2;
|
|
|
|
|
|
|
|
for (j = 0; j < n; j++)
|
|
|
|
BB[i * n + j] = mag_get_d_fixed_si(B + i * ac + block_start + j, B_min[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i < ar * bc; i++)
|
|
|
|
CC[i] = 0.0;
|
|
|
|
|
|
|
|
_d_mat_addmul(CC, AA, BB, ar, n, bc);
|
|
|
|
|
|
|
|
for (i = 0; i < ar; i++)
|
|
|
|
{
|
|
|
|
if (A_min[i] == WORD_MIN)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
for (j = 0; j < bc; j++)
|
|
|
|
{
|
|
|
|
if (B_min[j] == WORD_MIN)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (CC[i * bc + j] != 0.0)
|
|
|
|
{
|
|
|
|
mag_t t;
|
|
|
|
MAG_SET_D_2EXP(MAG_MAN(t), MAG_EXP(t), CC[i * bc + j], A_min[i] + B_min[j]);
|
|
|
|
mag_add(arb_radref(arb_mat_entry(C, i, j)),
|
|
|
|
arb_radref(arb_mat_entry(C, i, j)), t);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
block_start = block_end;
|
|
|
|
}
|
|
|
|
|
|
|
|
flint_free(A_max);
|
|
|
|
flint_free(A_min);
|
|
|
|
flint_free(B_max);
|
|
|
|
flint_free(B_min);
|
|
|
|
|
|
|
|
flint_free(AA);
|
|
|
|
flint_free(BB);
|
|
|
|
flint_free(CC);
|
|
|
|
}
|
|
|
|
|