diff --git a/acb_mat/mul_classical.c b/acb_mat/mul_classical.c index c535d5d3..3472f2e7 100644 --- a/acb_mat/mul_classical.c +++ b/acb_mat/mul_classical.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2012 Fredrik Johansson + Copyright (C) 2012, 2018 Fredrik Johansson This file is part of Arb. @@ -49,21 +49,47 @@ acb_mat_mul_classical(acb_mat_t C, const acb_mat_t A, const acb_mat_t B, slong p return; } - for (i = 0; i < ar; i++) + if (br <= 2) { - for (j = 0; j < bc; j++) + for (i = 0; i < ar; i++) { - acb_mul(acb_mat_entry(C, i, j), - acb_mat_entry(A, i, 0), - acb_mat_entry(B, 0, j), prec); - - for (k = 1; k < br; k++) + for (j = 0; j < bc; j++) { - acb_addmul(acb_mat_entry(C, i, j), - acb_mat_entry(A, i, k), - acb_mat_entry(B, k, j), prec); + /* todo: efficient fmma code */ + acb_mul(acb_mat_entry(C, i, j), + acb_mat_entry(A, i, 0), + acb_mat_entry(B, 0, j), prec); + + for (k = 1; k < br; k++) + { + acb_addmul(acb_mat_entry(C, i, j), + acb_mat_entry(A, i, k), + acb_mat_entry(B, k, j), prec); + } } } } -} + else + { + acb_ptr tmp; + TMP_INIT; + TMP_START; + tmp = TMP_ALLOC(sizeof(acb_struct) * br * bc); + + for (i = 0; i < br; i++) + for (j = 0; j < bc; j++) + tmp[j * br + i] = *acb_mat_entry(B, i, j); + + for (i = 0; i < ar; i++) + { + for (j = 0; j < bc; j++) + { + acb_dot(acb_mat_entry(C, i, j), NULL, 0, + A->rows[i], 1, tmp + j * br, 1, br, prec); + } + } + + TMP_END; + } +} diff --git a/acb_mat/mul_threaded.c b/acb_mat/mul_threaded.c index 42bf4b5f..f6aac610 100644 --- a/acb_mat/mul_threaded.c +++ b/acb_mat/mul_threaded.c @@ -30,21 +30,30 @@ void * _acb_mat_mul_thread(void * arg_ptr) { acb_mat_mul_arg_t arg = *((acb_mat_mul_arg_t *) arg_ptr); - slong i, j, k; + slong i, j, br, bc; + acb_ptr tmp; + TMP_INIT; + + br = arg.br; + bc = arg.bc1 - arg.bc0; + + TMP_START; + tmp = TMP_ALLOC(sizeof(acb_struct) * br * bc); + + for (i = 0; i < br; i++) + for (j = 0; j < bc; j++) + tmp[j * br + i] = arg.B[i][arg.bc0 + j]; for (i = arg.ar0; i < arg.ar1; i++) { for (j = arg.bc0; j < arg.bc1; j++) { - acb_mul(arg.C[i] + j, arg.A[i] + 0, arg.B[0] + j, arg.prec); - - for (k = 1; k < arg.br; k++) - { - acb_addmul(arg.C[i] + j, arg.A[i] + k, arg.B[k] + j, arg.prec); - } + acb_dot(arg.C[i] + j, NULL, 0, + arg.A[i], 1, tmp + (j - arg.bc0) * br, 1, br, arg.prec); } } + TMP_END; flint_cleanup(); return NULL; } @@ -122,4 +131,3 @@ acb_mat_mul_threaded(acb_mat_t C, const acb_mat_t A, const acb_mat_t B, slong pr flint_free(threads); flint_free(args); } - diff --git a/arb_mat/mul_classical.c b/arb_mat/mul_classical.c index bd27e2d6..fe086477 100644 --- a/arb_mat/mul_classical.c +++ b/arb_mat/mul_classical.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2012 Fredrik Johansson + Copyright (C) 2012, 2018 Fredrik Johansson This file is part of Arb. @@ -49,21 +49,47 @@ arb_mat_mul_classical(arb_mat_t C, const arb_mat_t A, const arb_mat_t B, slong p return; } - for (i = 0; i < ar; i++) + if (br <= 2) { - for (j = 0; j < bc; j++) + for (i = 0; i < ar; i++) { - arb_mul(arb_mat_entry(C, i, j), - arb_mat_entry(A, i, 0), - arb_mat_entry(B, 0, j), prec); - - for (k = 1; k < br; k++) + for (j = 0; j < bc; j++) { - arb_addmul(arb_mat_entry(C, i, j), - arb_mat_entry(A, i, k), - arb_mat_entry(B, k, j), prec); + /* todo: efficient fmma code */ + arb_mul(arb_mat_entry(C, i, j), + arb_mat_entry(A, i, 0), + arb_mat_entry(B, 0, j), prec); + + for (k = 1; k < br; k++) + { + arb_addmul(arb_mat_entry(C, i, j), + arb_mat_entry(A, i, k), + arb_mat_entry(B, k, j), prec); + } } } } -} + else + { + arb_ptr tmp; + TMP_INIT; + TMP_START; + tmp = TMP_ALLOC(sizeof(arb_struct) * br * bc); + + for (i = 0; i < br; i++) + for (j = 0; j < bc; j++) + tmp[j * br + i] = *arb_mat_entry(B, i, j); + + for (i = 0; i < ar; i++) + { + for (j = 0; j < bc; j++) + { + arb_dot(arb_mat_entry(C, i, j), NULL, 0, + A->rows[i], 1, tmp + j * br, 1, br, prec); + } + } + + TMP_END; + } +} diff --git a/arb_mat/mul_threaded.c b/arb_mat/mul_threaded.c index 37a3f457..30ea5923 100644 --- a/arb_mat/mul_threaded.c +++ b/arb_mat/mul_threaded.c @@ -30,21 +30,30 @@ void * _arb_mat_mul_thread(void * arg_ptr) { arb_mat_mul_arg_t arg = *((arb_mat_mul_arg_t *) arg_ptr); - slong i, j, k; + slong i, j, br, bc; + arb_ptr tmp; + TMP_INIT; + + br = arg.br; + bc = arg.bc1 - arg.bc0; + + TMP_START; + tmp = TMP_ALLOC(sizeof(arb_struct) * br * bc); + + for (i = 0; i < br; i++) + for (j = 0; j < bc; j++) + tmp[j * br + i] = arg.B[i][arg.bc0 + j]; for (i = arg.ar0; i < arg.ar1; i++) { for (j = arg.bc0; j < arg.bc1; j++) { - arb_mul(arg.C[i] + j, arg.A[i] + 0, arg.B[0] + j, arg.prec); - - for (k = 1; k < arg.br; k++) - { - arb_addmul(arg.C[i] + j, arg.A[i] + k, arg.B[k] + j, arg.prec); - } + arb_dot(arg.C[i] + j, NULL, 0, + arg.A[i], 1, tmp + (j - arg.bc0) * br, 1, br, arg.prec); } } + TMP_END; flint_cleanup(); return NULL; } @@ -122,4 +131,3 @@ arb_mat_mul_threaded(arb_mat_t C, const arb_mat_t A, const arb_mat_t B, slong pr flint_free(threads); flint_free(args); } -