/*=============================================================================

    This file is part of ARB.

    ARB is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    ARB is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with ARB; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

=============================================================================*/
/******************************************************************************

    Copyright (C) 2013 Fredrik Johansson

******************************************************************************/

#include "elefun.h"

/* todo: dynamic alloc */
TLS_PREFIX fmpz_t ln2_cache;
TLS_PREFIX fmpz exp_cache[EXP_CACHE_LEVELS][EXP_CACHE_NUM];
TLS_PREFIX long exp_cache_exp[EXP_CACHE_LEVELS][EXP_CACHE_NUM];
TLS_PREFIX int exp_cache_init = 0;

void
fmprb_get_fmpz_fixed_si_check_1ulp(fmpz_t r, const fmprb_t x, long exponent)
{
    fmprb_t t;
    fmprb_init(t);

    fmpr_get_fmpz_fixed_si(r, fmprb_midref(x), exponent);

    /* now compute an upper bound for the error */
    fmprb_set_fmpz(t, r);
    fmprb_mul_2exp_si(t, t, exponent);

    fmprb_sub(t, t, x, FMPRB_RAD_PREC);
    fmprb_abs(t, t);

    fmpr_add(fmprb_radref(t), fmprb_midref(t), fmprb_radref(t),
        FMPRB_RAD_PREC, FMPR_RND_CEIL);

    if (fmpr_cmp_2exp_si(fmprb_radref(t), exponent) > 0)
    {
        printf("error larger than 1 ulp!\n");
        abort();
    }

    fmprb_clear(t);
}

void
exp_cache_cleanup()
{
    long i, j;

    for (i = 0; i < EXP_CACHE_LEVELS; i++)
        for (j = 0; j < EXP_CACHE_NUM; j++)
            fmpz_clear(exp_cache[i] + j);

    fmpz_clear(ln2_cache);

    exp_cache_init = 0;
}

void
compute_exp_cache()
{
    long i, j, wp;
    fmprb_t x, y;

    wp = EXP_CACHE_PREC + 30;

    fmprb_init(x);
    fmprb_init(y);

    for (i = 0; i < EXP_CACHE_LEVELS; i++)
    {
        fmpr_set_ui_2exp_si(fmprb_midref(x), 1, -(i+1) * EXP_CACHE_BITS);
        fmpr_zero(fmprb_radref(x));

        elefun_exp_via_mpfr(x, x, wp);
        fmprb_one(y);

        for (j = 0; j < EXP_CACHE_NUM; j++)
        {
            fmpz_init(exp_cache[i] + j);

            fmprb_get_fmpz_fixed_si_check_1ulp(exp_cache[i] + j, y, -EXP_CACHE_PREC);
            fmprb_mul(y, y, x, wp);
        }
    }

    fmprb_const_log2(x, wp);

    fmpz_init(ln2_cache);
    fmprb_get_fmpz_fixed_si_check_1ulp(ln2_cache, x, -EXP_CACHE_PREC);

    fmprb_clear(x);
    fmprb_clear(y);

    exp_cache_init = 1;

    flint_register_cleanup_function(exp_cache_cleanup);
}

void
elefun_exp_fixed_precomp(fmpz_t y, fmpz_t yerr, fmpz_t exponent,
    const fmpz_t x, const fmpz_t xerr, long prec)
{
    long i, n;
    ulong r, red;
    fmpz_t t, u, uerr;

    fmpz_init(t);
    fmpz_init(u);
    fmpz_init(uerr);

    if (!exp_cache_init)
        compute_exp_cache();

    /* x = u + exponent * ln2 */
    fmpz_tdiv_q_2exp(t, ln2_cache, EXP_CACHE_PREC - prec);
    fmpz_fdiv_qr(exponent, u, x, t);

    /* ln2 has 1 ulp error -> u gains exponent * ulp error */
    if (fmpz_sgn(exponent) >= 0)
        fmpz_add(uerr, xerr, exponent);
    else
        fmpz_sub(uerr, xerr, exponent);

    /* extract top bits of u */
    if (prec < EXP_CACHE_LEVELS * EXP_CACHE_BITS)
    {
        fmpz_mul_2exp(t, u, EXP_CACHE_LEVELS * EXP_CACHE_BITS - prec);
        red = fmpz_get_ui(t);
        fmpz_zero(u);
    }
    else
    {
        fmpz_tdiv_q_2exp(t, u, prec - EXP_CACHE_LEVELS * EXP_CACHE_BITS);
        red = fmpz_get_ui(t);
        fmpz_mul_2exp(t, t, prec - EXP_CACHE_LEVELS * EXP_CACHE_BITS);
        fmpz_sub(u, u, t);
    }

    /* Taylor series for remaining bits */
    n = elefun_exp_taylor_bound(-EXP_CACHE_REDUCTION, prec);
    elefun_exp_fixed_taylor_horner_precomp(y, yerr, u, n, prec);

    /* 1 ulp error for truncation of Taylor series */
    fmpz_add_ui(yerr, yerr, 1);

    /* cache for high bits */
    for (i = EXP_CACHE_LEVELS - 1; i >= 0; i--)
    {
        r = (red >> ((EXP_CACHE_LEVELS - i - 1) * EXP_CACHE_BITS));
        r &= ((1UL << (EXP_CACHE_BITS)) - 1);

        fmpz_tdiv_q_2exp(u, exp_cache[i] + r, EXP_CACHE_PREC - prec);
        fmpz_mul_tdiv_q_2exp(y, y, u, prec);
    }

    if (EXP_CACHE_LEVELS == 2)
    {
        fmpz_mul_ui(yerr, yerr, 7);
        fmpz_add_ui(yerr, yerr, 9);
    }
    else
        abort();

    /* propagated error <= (err(u) * exp(uerr)) * exp(x) */

    /* set uerr to a bound for err(u) * exp(err(u)) */
    if (fmpz_bits(uerr) <= prec)   /* for a < 1.79329, a*exp(a) < a + a^2 + a^3 */
    {
        fmpz_mul(u, uerr, uerr);  /* uerr^2 */
        fmpz_cdiv_q_2exp(u, u, prec);

        fmpz_mul(t, u, uerr);     /* uerr^3 */
        fmpz_cdiv_q_2exp(t, t, prec);

        fmpz_add(uerr, uerr, t);
        fmpz_add(uerr, uerr, u);
    }
    else
    {
        /* crude bound: a*exp(a) <= 4^ceil(a) */
        fmpz_cdiv_q_2exp(u, uerr, prec);

        if (fmpz_bits(u) > 20)
            abort();

        fmpz_one(uerr);
        fmpz_mul_2exp(uerr, uerr, prec + 2 * (*u));
    }

    fmpz_add(t, y, yerr); /* upper bound for exp(u) */

    fmpz_mul(uerr, uerr, t); /* multiply by (err(u) * exp(err(u))) */
    fmpz_cdiv_q_2exp(uerr, uerr, prec);

    fmpz_add(yerr, yerr, uerr);  /* add to final error for y = exp(x) */

    fmpz_clear(t);
    fmpz_clear(u);
    fmpz_clear(uerr);
}