arb_mat: window matrices, block recursive LU factorization and triangular solving

2025-03-05 09:21:38 -05:00 · 2018-06-26 21:42:55 +02:00 · 2018-06-26 21:42:55 +02:00 · 33171f012b
commit 33171f012b
parent ee6883756b
14 changed files with 918 additions and 66 deletions
--- a/arb_mat.h
+++ b/arb_mat.h
@ -64,6 +64,16 @@ arb_mat_swap(arb_mat_t mat1, arb_mat_t mat2)
    *mat2 = t;
 }

+/* Window matrices */
+
+void arb_mat_window_init(arb_mat_t window, const arb_mat_t mat, slong r1, slong c1, slong r2, slong c2);
+
+ARB_MAT_INLINE void
+arb_mat_window_clear(arb_mat_t window)
+{
+    flint_free(window->rows);
+}
+
 /* Conversions */

 void arb_mat_set(arb_mat_t dest, const arb_mat_t src);
@ -318,6 +328,16 @@ arb_mat_swap_rows(arb_mat_t mat, slong * perm, slong r, slong s)
 slong arb_mat_find_pivot_partial(const arb_mat_t mat,
                                    slong start_row, slong end_row, slong c);

+void arb_mat_solve_tril_classical(arb_mat_t X, const arb_mat_t L, const arb_mat_t B, int unit, slong prec);
+void arb_mat_solve_tril_recursive(arb_mat_t X, const arb_mat_t L, const arb_mat_t B, int unit, slong prec);
+void arb_mat_solve_tril(arb_mat_t X, const arb_mat_t L, const arb_mat_t B, int unit, slong prec);
+
+void arb_mat_solve_triu_classical(arb_mat_t X, const arb_mat_t U, const arb_mat_t B, int unit, slong prec);
+void arb_mat_solve_triu_recursive(arb_mat_t X, const arb_mat_t U, const arb_mat_t B, int unit, slong prec);
+void arb_mat_solve_triu(arb_mat_t X, const arb_mat_t U, const arb_mat_t B, int unit, slong prec);
+
+int arb_mat_lu_classical(slong * P, arb_mat_t LU, const arb_mat_t A, slong prec);
+int arb_mat_lu_recursive(slong * P, arb_mat_t LU, const arb_mat_t A, slong prec);
 int arb_mat_lu(slong * P, arb_mat_t LU, const arb_mat_t A, slong prec);

 void arb_mat_solve_lu_precomp(arb_mat_t X, const slong * perm,
--- a/arb_mat/lu.c
+++ b/arb_mat/lu.c
@ -14,60 +14,9 @@
 int
 arb_mat_lu(slong * P, arb_mat_t LU, const arb_mat_t A, slong prec)
 {
-    arb_t d, e;
-    arb_ptr * a;
-    slong i, j, m, n, r, row, col;
-    int result;
-
-    if (arb_mat_is_empty(A))
-        return 1;
-
-    m = arb_mat_nrows(A);
-    n = arb_mat_ncols(A);
-
-    arb_mat_set(LU, A);
-
-    a = LU->rows;
-
-    row = col = 0;
-    for (i = 0; i < m; i++)
-        P[i] = i;
-
-    arb_init(d);
-    arb_init(e);
-
-    result = 1;
-
-    while (row < m && col < n)
-    {
-        r = arb_mat_find_pivot_partial(LU, row, m, col);
-
-        if (r == -1)
-        {
-            result = 0;
-            break;
-        }
-        else if (r != row)
-            arb_mat_swap_rows(LU, P, row, r);
-
-        arb_set(d, a[row] + col);
-
-        for (j = row + 1; j < m; j++)
-        {
-            arb_div(e, a[j] + col, d, prec);
-            arb_neg(e, e);
-            _arb_vec_scalar_addmul(a[j] + col,
-                a[row] + col, n - col, e, prec);
-            arb_zero(a[j] + col);
-            arb_neg(a[j] + row, e);
+    if (arb_mat_nrows(A) < 8 || arb_mat_ncols(A) < 8)
+        return arb_mat_lu_classical(P, LU, A, prec);
+    else
+        return arb_mat_lu_recursive(P, LU, A, prec);
 }

-        row++;
-        col++;
-    }
-
-    arb_clear(d);
-    arb_clear(e);
-
-    return result;
-}
--- a/arb_mat/lu_classical.c
+++ b/arb_mat/lu_classical.c
@ -0,0 +1,74 @@
+/*
+    Copyright (C) 2012 Fredrik Johansson
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "arb_mat.h"
+
+int
+arb_mat_lu_classical(slong * P, arb_mat_t LU, const arb_mat_t A, slong prec)
+{
+    arb_t d, e;
+    arb_ptr * a;
+    slong i, j, m, n, r, row, col;
+    int result;
+
+    if (arb_mat_is_empty(A))
+        return 1;
+
+    m = arb_mat_nrows(A);
+    n = arb_mat_ncols(A);
+
+    arb_mat_set(LU, A);
+
+    a = LU->rows;
+
+    row = col = 0;
+    for (i = 0; i < m; i++)
+        P[i] = i;
+
+    arb_init(d);
+    arb_init(e);
+
+    result = 1;
+
+    while (row < m && col < n)
+    {
+        r = arb_mat_find_pivot_partial(LU, row, m, col);
+
+        if (r == -1)
+        {
+            result = 0;
+            break;
+        }
+        else if (r != row)
+            arb_mat_swap_rows(LU, P, row, r);
+
+        arb_set(d, a[row] + col);
+
+        for (j = row + 1; j < m; j++)
+        {
+            arb_div(e, a[j] + col, d, prec);
+            arb_neg(e, e);
+            _arb_vec_scalar_addmul(a[j] + col,
+                a[row] + col, n - col, e, prec);
+            arb_zero(a[j] + col);
+            arb_neg(a[j] + row, e);
+        }
+
+        row++;
+        col++;
+    }
+
+    arb_clear(d);
+    arb_clear(e);
+
+    return result;
+}
+
--- a/arb_mat/lu_recursive.c
+++ b/arb_mat/lu_recursive.c
@ -0,0 +1,113 @@
+/*
+    Copyright (C) 2018 Fredrik Johansson
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "arb_mat.h"
+
+static void
+_apply_permutation(slong * AP, arb_mat_t A, slong * P,
+    slong n, slong offset)
+{
+    if (n != 0)
+    {
+        arb_ptr * Atmp;
+        slong * APtmp;
+        slong i;
+
+        Atmp = flint_malloc(sizeof(arb_ptr) * n);
+        APtmp = flint_malloc(sizeof(slong) * n);
+
+        for (i = 0; i < n; i++) Atmp[i] = A->rows[P[i] + offset];
+        for (i = 0; i < n; i++) A->rows[i + offset] = Atmp[i];
+
+        for (i = 0; i < n; i++) APtmp[i] = AP[P[i] + offset];
+        for (i = 0; i < n; i++) AP[i + offset] = APtmp[i];
+
+        flint_free(Atmp);
+        flint_free(APtmp);
+    }
+}
+
+int
+arb_mat_lu_recursive(slong * P, arb_mat_t LU, const arb_mat_t A, slong prec)
+{
+    slong i, m, n, r1, r2, n1;
+    arb_mat_t A0, A1, A00, A01, A10, A11;
+    slong * P1;
+
+    m = A->r;
+    n = A->c;
+
+    if (m <= 1 || n <= 1)
+    {
+        return arb_mat_lu_classical(P, LU, A, prec);
+    }
+
+    if (LU != A)
+        arb_mat_set(LU, A);
+
+    n1 = n / 2;
+
+    for (i = 0; i < m; i++)
+        P[i] = i;
+
+    P1 = flint_malloc(sizeof(slong) * m);
+    arb_mat_window_init(A0, LU, 0, 0, m, n1);
+    arb_mat_window_init(A1, LU, 0, n1, m, n);
+
+    r1 = arb_mat_lu(P1, A0, A0, prec);
+
+    if (!r1)
+    {
+        flint_free(P1);
+        arb_mat_window_clear(A0);
+        arb_mat_window_clear(A1);
+        return 0;
+    }
+
+    /* r1 = rank of A0 */
+    r1 = FLINT_MIN(m, n1);
+
+    _apply_permutation(P, LU, P1, m, 0);
+
+    arb_mat_window_init(A00, LU, 0, 0, r1, r1);
+    arb_mat_window_init(A10, LU, r1, 0, m, r1);
+    arb_mat_window_init(A01, LU, 0, n1, r1, n);
+    arb_mat_window_init(A11, LU, r1, n1, m, n);
+
+    arb_mat_solve_tril(A01, A00, A01, 1, prec);
+
+    {
+        /* arb_mat_submul(A11, A11, A10, A01, prec); */
+        arb_mat_t T;
+        arb_mat_init(T, A10->r, A01->c);
+        arb_mat_mul(T, A10, A01, prec);
+        arb_mat_sub(A11, A11, T, prec);
+        arb_mat_clear(T);
+    }
+
+    r2 = arb_mat_lu(P1, A11, A11, prec);
+
+    if (!r2)
+        r1 = r2 = 0;
+    else
+        _apply_permutation(P, LU, P1, m - r1, r1);
+
+    flint_free(P1);
+    arb_mat_window_clear(A00);
+    arb_mat_window_clear(A01);
+    arb_mat_window_clear(A10);
+    arb_mat_window_clear(A11);
+    arb_mat_window_clear(A0);
+    arb_mat_window_clear(A1);
+
+    return r1 && r2;
+}
+
--- a/arb_mat/solve_lu_precomp.c
+++ b/arb_mat/solve_lu_precomp.c
@ -1,5 +1,5 @@
 /*
-    Copyright (C) 2012 Fredrik Johansson
+    Copyright (C) 2012,2018 Fredrik Johansson

    This file is part of Arb.

@ -46,6 +46,15 @@ arb_mat_solve_lu_precomp(arb_mat_t X, const slong * perm,
        }
    }

+    /* todo: solve_tril and solve_triu have some overhead; should be
+       able to eliminate the basecase code below */
+    if (n >= 8 && m >= 8)
+    {
+        arb_mat_solve_tril(X, A, X, 1, prec);
+        arb_mat_solve_triu(X, A, X, 0, prec);
+        return;
+    }
+
    for (c = 0; c < m; c++)
    {
        /* solve Ly = b */
--- a/arb_mat/solve_tril.c
+++ b/arb_mat/solve_tril.c
@ -0,0 +1,108 @@
+/*
+    Copyright (C) 2018 Fredrik Johansson
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "arb_mat.h"
+
+void
+arb_mat_solve_tril_classical(arb_mat_t X,
+        const arb_mat_t L, const arb_mat_t B, int unit, slong prec)
+{
+    slong i, j, k, n, m;
+    arb_ptr tmp;
+    arb_t s;
+
+    n = L->r;
+    m = B->c;
+
+    arb_init(s);
+    tmp = _arb_vec_init(n);
+
+    for (i = 0; i < m; i++)
+    {
+        for (j = 0; j < n; j++)
+            arb_set(tmp + j, arb_mat_entry(X, j, i));
+
+        for (j = 0; j < n; j++)
+        {
+            arb_zero(s);
+            for (k = 0; k < j; k++)
+                arb_addmul(s, L->rows[j] + k, tmp + k, prec);
+            arb_sub(s, arb_mat_entry(B, j, i), s, prec);
+            if (!unit)
+                arb_div(s, s, arb_mat_entry(L, j, j), prec);
+            arb_set(tmp + j, s);
+        }
+
+        for (j = 0; j < n; j++)
+            arb_set(arb_mat_entry(X, j, i), tmp + j);
+    }
+
+    _arb_vec_clear(tmp, n);
+    arb_clear(s);
+}
+
+void
+arb_mat_solve_tril_recursive(arb_mat_t X,
+        const arb_mat_t L, const arb_mat_t B, int unit, slong prec)
+{
+    arb_mat_t LA, LC, LD, XX, XY, BX, BY, T;
+    slong r, n, m;
+
+    n = L->r;
+    m = B->c;
+    r = n / 2;
+
+    if (n == 0 || m == 0)
+        return;
+
+    /*
+    Denoting inv(M) by M^, we have:
+
+    [A 0]^ [X]  ==  [A^          0 ] [X]  ==  [A^ X]
+    [C D]  [Y]  ==  [-D^ C A^    D^] [Y]  ==  [D^ (Y - C A^ X)]
+    */
+    arb_mat_window_init(LA, L, 0, 0, r, r);
+    arb_mat_window_init(LC, L, r, 0, n, r);
+    arb_mat_window_init(LD, L, r, r, n, n);
+    arb_mat_window_init(BX, B, 0, 0, r, m);
+    arb_mat_window_init(BY, B, r, 0, n, m);
+    arb_mat_window_init(XX, X, 0, 0, r, m);
+    arb_mat_window_init(XY, X, r, 0, n, m);
+
+    arb_mat_solve_tril(XX, LA, BX, unit, prec);
+
+    /* arb_mat_submul(XY, BY, LC, XX); */
+    arb_mat_init(T, LC->r, BX->c);
+    arb_mat_mul(T, LC, XX, prec);
+    arb_mat_sub(XY, BY, T, prec);
+    arb_mat_clear(T);
+
+    arb_mat_solve_tril(XY, LD, XY, unit, prec);
+
+    arb_mat_window_clear(LA);
+    arb_mat_window_clear(LC);
+    arb_mat_window_clear(LD);
+    arb_mat_window_clear(BX);
+    arb_mat_window_clear(BY);
+    arb_mat_window_clear(XX);
+    arb_mat_window_clear(XY);
+}
+
+void
+arb_mat_solve_tril(arb_mat_t X, const arb_mat_t L,
+                                    const arb_mat_t B, int unit, slong prec)
+{
+    if (B->r < 8 || B->c < 8)
+        arb_mat_solve_tril_classical(X, L, B, unit, prec);
+    else
+        arb_mat_solve_tril_recursive(X, L, B, unit, prec);
+}
+
--- a/arb_mat/solve_triu.c
+++ b/arb_mat/solve_triu.c
@ -0,0 +1,110 @@
+/*
+    Copyright (C) 2018 Fredrik Johansson
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "arb_mat.h"
+
+void
+arb_mat_solve_triu_classical(arb_mat_t X, const arb_mat_t U,
+    const arb_mat_t B, int unit, slong prec)
+{
+    slong i, j, k, n, m;
+    arb_ptr tmp;
+    arb_t s;
+
+    n = U->r;
+    m = B->c;
+
+    tmp = _arb_vec_init(n);
+    arb_init(s);
+
+    for (i = 0; i < m; i++)
+    {
+        for (j = 0; j < n; j++)
+            arb_set(tmp + j, arb_mat_entry(X, j, i));
+
+        for (j = n - 1; j >= 0; j--)
+        {
+            arb_zero(s);
+            for (k = 0; k < n - j - 1; k++)
+                arb_addmul(s, U->rows[j] + j + 1 + k, tmp + j + 1 + k, prec);
+
+            arb_sub(s, arb_mat_entry(B, j, i), s, prec);
+            if (!unit)
+                arb_div(s, s, arb_mat_entry(U, j, j), prec);
+
+            arb_set(tmp + j, s);
+        }
+
+        for (j = 0; j < n; j++)
+            arb_set(arb_mat_entry(X, j, i), tmp + j);
+    }
+
+    _arb_vec_clear(tmp, n);
+    arb_clear(s);
+}
+
+void
+arb_mat_solve_triu_recursive(arb_mat_t X,
+        const arb_mat_t U, const arb_mat_t B, int unit, slong prec)
+{
+    arb_mat_t UA, UB, UD, XX, XY, BX, BY, T;
+    slong r, n, m;
+
+    n = U->r;
+    m = B->c;
+    r = n / 2;
+
+    if (n == 0 || m == 0)
+        return;
+
+    /*
+    Denoting inv(M) by M^, we have:
+    [A B]^ [X]  ==  [A^ (X - B D^ Y)]
+    [0 D]  [Y]  ==  [    D^ Y       ]
+    */
+
+    arb_mat_window_init(UA, U, 0, 0, r, r);
+    arb_mat_window_init(UB, U, 0, r, r, n);
+    arb_mat_window_init(UD, U, r, r, n, n);
+    arb_mat_window_init(BX, B, 0, 0, r, m);
+    arb_mat_window_init(BY, B, r, 0, n, m);
+    arb_mat_window_init(XX, X, 0, 0, r, m);
+    arb_mat_window_init(XY, X, r, 0, n, m);
+
+    arb_mat_solve_triu(XY, UD, BY, unit, prec);
+
+    /* arb_mat_submul(XX, BX, UB, XY); */
+    arb_mat_init(T, UB->r, XY->c);
+    arb_mat_mul(T, UB, XY, prec);
+    arb_mat_sub(XX, BX, T, prec);
+    arb_mat_clear(T);
+
+    arb_mat_solve_triu(XX, UA, XX, unit, prec);
+
+    arb_mat_window_clear(UA);
+    arb_mat_window_clear(UB);
+    arb_mat_window_clear(UD);
+    arb_mat_window_clear(BX);
+    arb_mat_window_clear(BY);
+    arb_mat_window_clear(XX);
+    arb_mat_window_clear(XY);
+}
+
+void
+arb_mat_solve_triu(arb_mat_t X, const arb_mat_t U,
+                                    const arb_mat_t B, int unit, slong prec)
+{
+    if (B->r < 8 || B->c < 8)
+        arb_mat_solve_triu_classical(X, U, B, unit, prec);
+    else
+        arb_mat_solve_triu_recursive(X, U, B, unit, prec);
+}
+
--- a/arb_mat/test/t-lu.c
+++ b/arb_mat/test/t-lu.c
@ -32,14 +32,14 @@ int main()

    flint_randinit(state);

-    for (iter = 0; iter < 100000 * arb_test_multiplier(); iter++)
+    for (iter = 0; iter < 10000 * arb_test_multiplier(); iter++)
    {
        fmpq_mat_t Q;
        arb_mat_t A, LU, P, L, U, T;
        slong i, j, n, qbits, prec, *perm;
        int q_invertible, r_invertible;

-        n = n_randint(state, 8);
+        n = n_randint(state, 10);
        qbits = 1 + n_randint(state, 100);
        prec = 2 + n_randint(state, 202);

--- a/arb_mat/test/t-lu_recursive.c
+++ b/arb_mat/test/t-lu_recursive.c
@ -0,0 +1,177 @@
+/*
+    Copyright (C) 2012 Fredrik Johansson
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "arb_mat.h"
+
+int fmpq_mat_is_invertible(const fmpq_mat_t A)
+{
+    int r;
+    fmpq_t t;
+    fmpq_init(t);
+    fmpq_mat_det(t, A);
+    r = !fmpq_is_zero(t);
+    fmpq_clear(t);
+    return r;
+}
+
+int main()
+{
+    slong iter;
+    flint_rand_t state;
+
+    flint_printf("lu_recursive....");
+    fflush(stdout);
+
+    flint_randinit(state);
+
+    /* Dummy test with rectangular matrices. Rectangular matrices are
+       not actually supported (the output may be bogus), but the algorithm
+       should at least not crash. */
+    for (iter = 0; iter < 1000 * arb_test_multiplier(); iter++)
+    {
+        slong m, n, prec;
+        slong *perm;
+        arb_mat_t A, LU;
+
+        n = n_randint(state, 20);
+        m = n_randint(state, 20);
+        prec = 2 + n_randint(state, 200);
+
+        arb_mat_init(A, n, m);
+        arb_mat_init(LU, n, m);
+        perm = _perm_init(n);
+
+        arb_mat_randtest(A, state, prec, 10);
+
+        if (n_randint(state, 2))
+        {
+            arb_mat_lu_recursive(perm, LU, A, prec);
+        }
+        else
+        {
+            arb_mat_set(LU, A);
+            arb_mat_lu_recursive(perm, LU, LU, prec);
+        }
+
+        arb_mat_clear(A);
+        arb_mat_clear(LU);
+        _perm_clear(perm);
+    }
+
+    for (iter = 0; iter < 2000 * arb_test_multiplier(); iter++)
+    {
+        fmpq_mat_t Q;
+        arb_mat_t A, LU, P, L, U, T;
+        slong i, j, n, qbits, prec, *perm;
+        int q_invertible, r_invertible;
+
+        n = n_randint(state, 20);
+        qbits = 1 + n_randint(state, 100);
+        prec = 2 + n_randint(state, 202);
+
+        fmpq_mat_init(Q, n, n);
+        arb_mat_init(A, n, n);
+        arb_mat_init(LU, n, n);
+        arb_mat_init(P, n, n);
+        arb_mat_init(L, n, n);
+        arb_mat_init(U, n, n);
+        arb_mat_init(T, n, n);
+        perm = _perm_init(n);
+
+        fmpq_mat_randtest(Q, state, qbits);
+        q_invertible = fmpq_mat_is_invertible(Q);
+
+        if (!q_invertible)
+        {
+            arb_mat_set_fmpq_mat(A, Q, prec);
+            r_invertible = arb_mat_lu_recursive(perm, LU, A, prec);
+            if (r_invertible)
+            {
+                flint_printf("FAIL: matrix is singular over Q but not over R\n");
+                flint_printf("n = %wd, prec = %wd\n", n, prec);
+                flint_printf("\n");
+
+                flint_printf("Q = \n"); fmpq_mat_print(Q); flint_printf("\n\n");
+                flint_printf("A = \n"); arb_mat_printd(A, 15); flint_printf("\n\n");
+                flint_printf("LU = \n"); arb_mat_printd(LU, 15); flint_printf("\n\n");
+            }
+        }
+        else
+        {
+            /* now this must converge */
+            while (1)
+            {
+                arb_mat_set_fmpq_mat(A, Q, prec);
+                r_invertible = arb_mat_lu_recursive(perm, LU, A, prec);
+                if (r_invertible)
+                {
+                    break;
+                }
+                else
+                {
+                    if (prec > 10000)
+                    {
+                        flint_printf("FAIL: failed to converge at 10000 bits\n");
+                        flint_abort();
+                    }
+                    prec *= 2;
+                }
+            }
+
+            arb_mat_one(L);
+            for (i = 0; i < n; i++)
+                for (j = 0; j < i; j++)
+                    arb_set(arb_mat_entry(L, i, j),
+                        arb_mat_entry(LU, i, j));
+
+            for (i = 0; i < n; i++)
+                for (j = i; j < n; j++)
+                    arb_set(arb_mat_entry(U, i, j),
+                        arb_mat_entry(LU, i, j));
+
+            for (i = 0; i < n; i++)
+                arb_one(arb_mat_entry(P, perm[i], i));
+
+            arb_mat_mul(T, P, L, prec);
+            arb_mat_mul(T, T, U, prec);
+
+            if (!arb_mat_contains_fmpq_mat(T, Q))
+            {
+                flint_printf("FAIL (containment, iter = %wd)\n", iter);
+                flint_printf("n = %wd, prec = %wd\n", n, prec);
+                flint_printf("\n");
+
+                flint_printf("Q = \n"); fmpq_mat_print(Q); flint_printf("\n\n");
+                flint_printf("A = \n"); arb_mat_printd(A, 15); flint_printf("\n\n");
+                flint_printf("LU = \n"); arb_mat_printd(LU, 15); flint_printf("\n\n");
+                flint_printf("L = \n"); arb_mat_printd(L, 15); flint_printf("\n\n");
+                flint_printf("U = \n"); arb_mat_printd(U, 15); flint_printf("\n\n");
+                flint_printf("P*L*U = \n"); arb_mat_printd(T, 15); flint_printf("\n\n");
+
+                flint_abort();
+            }
+        }
+
+        fmpq_mat_clear(Q);
+        arb_mat_clear(A);
+        arb_mat_clear(LU);
+        arb_mat_clear(P);
+        arb_mat_clear(L);
+        arb_mat_clear(U);
+        arb_mat_clear(T);
+        _perm_clear(perm);
+    }
+
+    flint_randclear(state);
+    flint_cleanup();
+    flint_printf("PASS\n");
+    return EXIT_SUCCESS;
+}
--- a/arb_mat/test/t-solve_lu.c
+++ b/arb_mat/test/t-solve_lu.c
@ -28,8 +28,16 @@ int main()
        slong n, m, qbits, prec;
        int q_invertible, r_invertible, r_invertible2;

+        if (n_randint(state, 50) == 0)
+        {
+            n = n_randint(state, 20);
+            m = n_randint(state, 20);
+        }
+        else
+        {
            n = n_randint(state, 8);
            m = n_randint(state, 8);
+        }
        qbits = 1 + n_randint(state, 30);
        prec = 2 + n_randint(state, 200);

--- a/arb_mat/test/t-solve_tril.c
+++ b/arb_mat/test/t-solve_tril.c
@ -0,0 +1,99 @@
+/*
+    Copyright (C) 2018 Fredrik Johansson
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "arb_mat.h"
+
+int main()
+{
+    slong iter;
+    flint_rand_t state;
+
+    flint_printf("solve_tril....");
+    fflush(stdout);
+
+    flint_randinit(state);
+
+    for (iter = 0; iter < 2000 * arb_test_multiplier(); iter++)
+    {
+        arb_mat_t A, X, B, Y;
+        slong rows, cols, prec, i, j;
+        int unit;
+
+        prec = 2 + n_randint(state, 200);
+        rows = n_randint(state, 30);
+        cols = n_randint(state, 30);
+        unit = n_randint(state, 2);
+
+        arb_mat_init(A, rows, rows);
+        arb_mat_init(B, rows, cols);
+        arb_mat_init(X, rows, cols);
+        arb_mat_init(Y, rows, cols);
+
+        arb_mat_randtest(A, state, prec, 10);
+        arb_mat_randtest(X, state, prec, 10);
+        arb_mat_randtest(Y, state, prec, 10);
+
+        for (i = 0; i < rows; i++)
+        {
+            if (unit)
+                arb_one(arb_mat_entry(A, i, i));
+            else
+                arb_set_ui(arb_mat_entry(A, i, i), 1 + n_randint(state, 100));
+
+            for (j = i + 1; j < rows; j++)
+                arb_zero(arb_mat_entry(A, i, j));
+        }
+
+        arb_mat_mul(B, A, X, prec);
+
+        if (unit)  /* check that diagonal entries are ignored */
+        {
+            for (i = 0; i < rows; i++)
+                arb_set_ui(arb_mat_entry(A, i, i), 1 + n_randint(state, 100));
+        }
+
+        /* Check Y = A^(-1) * (A * X) = X */
+        arb_mat_solve_tril(Y, A, B, unit, prec);
+
+        if (!arb_mat_overlaps(Y, X))
+        {
+            flint_printf("FAIL\n");
+            flint_printf("A = \n"); arb_mat_printd(A, 10); flint_printf("\n\n");
+            flint_printf("B = \n"); arb_mat_printd(B, 10); flint_printf("\n\n");
+            flint_printf("X = \n"); arb_mat_printd(X, 10); flint_printf("\n\n");
+            flint_printf("Y = \n"); arb_mat_printd(Y, 10); flint_printf("\n\n");
+            flint_abort();
+        }
+
+        /* Check aliasing */
+        arb_mat_solve_tril(B, A, B, unit, prec);
+        if (!arb_mat_equal(B, Y))
+        {
+            flint_printf("FAIL (aliasing)\n");
+            flint_printf("A = \n"); arb_mat_printd(A, 10); flint_printf("\n\n");
+            flint_printf("B = \n"); arb_mat_printd(B, 10); flint_printf("\n\n");
+            flint_printf("X = \n"); arb_mat_printd(X, 10); flint_printf("\n\n");
+            flint_printf("Y = \n"); arb_mat_printd(Y, 10); flint_printf("\n\n");
+            flint_abort();
+        }
+
+        arb_mat_clear(A);
+        arb_mat_clear(B);
+        arb_mat_clear(X);
+        arb_mat_clear(Y);
+    }
+
+    flint_randclear(state);
+    flint_cleanup();
+    flint_printf("PASS\n");
+    return EXIT_SUCCESS;
+}
+
--- a/arb_mat/test/t-solve_triu.c
+++ b/arb_mat/test/t-solve_triu.c
@ -0,0 +1,99 @@
+/*
+    Copyright (C) 2018 Fredrik Johansson
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "arb_mat.h"
+
+int main()
+{
+    slong iter;
+    flint_rand_t state;
+
+    flint_printf("solve_triu....");
+    fflush(stdout);
+
+    flint_randinit(state);
+
+    for (iter = 0; iter < 2000 * arb_test_multiplier(); iter++)
+    {
+        arb_mat_t A, X, B, Y;
+        slong rows, cols, prec, i, j;
+        int unit;
+
+        prec = 2 + n_randint(state, 200);
+        rows = n_randint(state, 30);
+        cols = n_randint(state, 30);
+        unit = n_randint(state, 2);
+
+        arb_mat_init(A, rows, rows);
+        arb_mat_init(B, rows, cols);
+        arb_mat_init(X, rows, cols);
+        arb_mat_init(Y, rows, cols);
+
+        arb_mat_randtest(A, state, prec, 10);
+        arb_mat_randtest(X, state, prec, 10);
+        arb_mat_randtest(Y, state, prec, 10);
+
+        for (i = 0; i < rows; i++)
+        {
+            if (unit)
+                arb_one(arb_mat_entry(A, i, i));
+            else
+                arb_set_ui(arb_mat_entry(A, i, i), 1 + n_randint(state, 100));
+
+            for (j = 0; j < i; j++)
+                arb_zero(arb_mat_entry(A, i, j));
+        }
+
+        arb_mat_mul(B, A, X, prec);
+
+        if (unit)  /* check that diagonal entries are ignored */
+        {
+            for (i = 0; i < rows; i++)
+                arb_set_ui(arb_mat_entry(A, i, i), 1 + n_randint(state, 100));
+        }
+
+        /* Check Y = A^(-1) * (A * X) = X */
+        arb_mat_solve_triu(Y, A, B, unit, prec);
+
+        if (!arb_mat_overlaps(Y, X))
+        {
+            flint_printf("FAIL\n");
+            flint_printf("A = \n"); arb_mat_printd(A, 10); flint_printf("\n\n");
+            flint_printf("B = \n"); arb_mat_printd(B, 10); flint_printf("\n\n");
+            flint_printf("X = \n"); arb_mat_printd(X, 10); flint_printf("\n\n");
+            flint_printf("Y = \n"); arb_mat_printd(Y, 10); flint_printf("\n\n");
+            flint_abort();
+        }
+
+        /* Check aliasing */
+        arb_mat_solve_triu(B, A, B, unit, prec);
+        if (!arb_mat_equal(B, Y))
+        {
+            flint_printf("FAIL (aliasing)\n");
+            flint_printf("A = \n"); arb_mat_printd(A, 10); flint_printf("\n\n");
+            flint_printf("B = \n"); arb_mat_printd(B, 10); flint_printf("\n\n");
+            flint_printf("X = \n"); arb_mat_printd(X, 10); flint_printf("\n\n");
+            flint_printf("Y = \n"); arb_mat_printd(Y, 10); flint_printf("\n\n");
+            flint_abort();
+        }
+
+        arb_mat_clear(A);
+        arb_mat_clear(B);
+        arb_mat_clear(X);
+        arb_mat_clear(Y);
+    }
+
+    flint_randclear(state);
+    flint_cleanup();
+    flint_printf("PASS\n");
+    return EXIT_SUCCESS;
+}
+
--- a/arb_mat/window_init.c
+++ b/arb_mat/window_init.c
@ -0,0 +1,29 @@
+/*
+    Copyright (C) 2018 Fredrik Johansson
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "arb_mat.h"
+
+void
+arb_mat_window_init(arb_mat_t window, const arb_mat_t mat,
+    slong r1, slong c1, slong r2, slong c2)
+{
+    slong i;
+    window->entries = NULL;
+
+    window->rows = flint_malloc((r2 - r1) * sizeof(arb_ptr));
+
+    for (i = 0; i < r2 - r1; i++)
+        window->rows[i] = mat->rows[r1 + i] + c1;
+
+    window->r = r2 - r1;
+    window->c = c2 - c1;
+}
+
--- a/doc/source/arb_mat.rst
+++ b/doc/source/arb_mat.rst
@ -58,6 +58,16 @@ Memory management
    The count excludes the size of the structure itself. Add
    ``sizeof(arb_mat_struct)`` to get the size of the object as a whole.

+.. function:: void arb_mat_window_init(arb_mat_t window, const arb_mat_t mat, slong r1, slong c1, slong r2, slong c2)
+
+    Initializes *window* to a window matrix into the submatrix of *mat*
+    starting at the corner at row *r1* and column *c1* (inclusive) and ending
+    at row *r2* and column *c2* (exclusive).
+
+.. function:: void arb_mat_window_clear(arb_mat_t window)
+
+    Frees the submatrix.
+
 Conversions
 -------------------------------------------------------------------------------

@ -239,16 +249,24 @@ Arithmetic

 .. function:: void arb_mat_mul_threaded(arb_mat_t C, const arb_mat_t A, const arb_mat_t B, slong prec)

+.. function:: void arb_mat_mul_block(arb_mat_t C, const arb_mat_t A, const arb_mat_t B, slong prec)
+
 .. function:: void arb_mat_mul(arb_mat_t res, const arb_mat_t mat1, const arb_mat_t mat2, slong prec)

    Sets *res* to the matrix product of *mat1* and *mat2*. The operands must have
    compatible dimensions for matrix multiplication.

-    The *threaded* version splits the computation
-    over the number of threads returned by *flint_get_num_threads()*.
-    The default version automatically calls the *threaded* version
-    if the matrices are sufficiently large and more than one thread
-    can be used.
+    The *classical* version performs matrix multiplication in the trivial way.
+
+    The *block* version decomposes the input matrices into one or several
+    blocks of uniformly scaled matrices and multiplies 
+    large blocks via *fmpz_mat_mul*. It also invokes
+    :func:`_arb_mat_addmul_rad_mag_fast` for the radius matrix multiplications.
+
+    The *threaded* version performs classical multiplication but splits the
+    computation over the number of threads returned by *flint_get_num_threads()*.
+
+    The default version chooses an algorithm automatically.

 .. function:: void arb_mat_mul_entrywise(arb_mat_t C, const arb_mat_t A, const arb_mat_t B, slong prec)

@ -267,6 +285,14 @@ Arithmetic
    Sets *res* to *mat* raised to the power *exp*. Requires that *mat*
    is a square matrix.

+.. function:: void _arb_mat_addmul_rad_mag_fast(arb_mat_t C, mag_srcptr A, mag_srcptr B, slong ar, slong ac, slong bc)
+
+    Helper function for matrix multiplication.
+    Adds to the radii of *C* the matrix product of the matrices represented
+    by *A* and *B*, where *A* is a linear array of coefficients in row-major
+    order and *B* is a linear array of coefficients in column-major order. 
+    This function assumes that all exponents are small and is unsafe
+    for general use.

 Scalar arithmetic
 -------------------------------------------------------------------------------
@ -303,6 +329,10 @@ Scalar arithmetic
 Gaussian elimination and solving
 -------------------------------------------------------------------------------

+.. function:: int arb_mat_lu_classical(slong * perm, arb_mat_t LU, const arb_mat_t A, slong prec)
+
+.. function:: int arb_mat_lu_recursive(slong * perm, arb_mat_t LU, const arb_mat_t A, slong prec)
+
 .. function:: int arb_mat_lu(slong * perm, arb_mat_t LU, const arb_mat_t A, slong prec)

    Given an `n \times n` matrix `A`, computes an LU decomposition `PLU = A`
@ -323,6 +353,33 @@ Gaussian elimination and solving
    computed to insufficient precision, or the LU decomposition was
    attempted at insufficient precision.

+    The *classical* version uses Gaussian elimination directly while
+    the *recursive* version performs the computation in a block recursive
+    way to benefit from fast matrix multiplication. The default version
+    chooses an algorithm automatically.
+
+.. function:: void arb_mat_solve_tril_classical(arb_mat_t X, const arb_mat_t L, const arb_mat_t B, int unit, slong prec)
+
+.. function:: void arb_mat_solve_tril_recursive(arb_mat_t X, const arb_mat_t L, const arb_mat_t B, int unit, slong prec)
+
+.. function:: void arb_mat_solve_tril(arb_mat_t X, const arb_mat_t L, const arb_mat_t B, int unit, slong prec)
+
+.. function:: void arb_mat_solve_triu_classical(arb_mat_t X, const arb_mat_t U, const arb_mat_t B, int unit, slong prec)
+
+.. function:: void arb_mat_solve_triu_recursive(arb_mat_t X, const arb_mat_t U, const arb_mat_t B, int unit, slong prec)
+
+.. function:: void arb_mat_solve_triu(arb_mat_t X, const arb_mat_t U, const arb_mat_t B, int unit, slong prec)
+
+    Solves the lower triangular system `LX = B` or the upper triangular system
+    `UX = B`, respectively. If *unit* is set, the main diagonal of *L* or *U*
+    is taken to consist of all ones, and in that case the actual entries on
+    the diagonal are not read at all and can contain other data.
+
+    The *classical* versions perform the computations iteratively while the
+    *recursive* versions perform the computations in a block recursive
+    way to benefit from fast matrix multiplication. The default versions
+    choose an algorithm automatically.
+
 .. function:: void arb_mat_solve_lu_precomp(arb_mat_t X, const slong * perm, const arb_mat_t LU, const arb_mat_t B, slong prec)

    Solves `AX = B` given the precomputed nonsingular LU decomposition `A = PLU`.