arb/bernoulli/mod_p_harvey.c

/*
    Copyright (C) 2008, 2009, David Harvey
    Copyright (C) 2021 Fredrik Johansson

This file has been adapted from the BSD-licensed bernmm package by
David Harvey (see original copyright text below).

Changes in the Arb version:
* Use FLINT functions instead of NTL functions. (Some helper functions
  have been added -- these should probably be moved to FLINT.)
* C instead of C++; some renaming and reformatting for better consistency
  with FLINT/Arb coding conventions.

Important note on performance:
* The modular arithmetic in FLINT is quite a bit slower than NTL
  since it is designed to support full 64-bit moduli which we don't need
  here. This mainly affects bernoulli_sum_powg and bernsum_pow2, which
  fortunately don't get called often for multimodular computation.
  However, we'd want to re-optimize these in case they find some other use.

*/

/*
===============================================================================

bernmm: an implementation of the algorithm described in "A multimodular
        algorithm for computing Bernoulli numbers", by David Harvey, 2008.

version 1.1

Copyright (C) 2008, 2009, David Harvey

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

===============================================================================
*/

#include <string.h>
#include "flint/ulong_extras.h"
#include "bernoulli.h"

#define DEBUG 0
#define TIMING 1

/******************************************************************************

   Computing the main sum (general case)

******************************************************************************/

/* todo: this should be a function in ulong_extras */
ulong _bernoulli_n_muldivrem_precomp(ulong * q, ulong a, ulong b, ulong n, double bnpre)
{
    ulong qq, r;

    qq = (double) a * bnpre;
    r = a * b - qq * n;

    if ((slong) r < 0)
    {
        qq--;
        r += n;
    }

    if (r >= n)
    {
        qq++;
        r -= n;
    }

    *q = qq;
    return r;
}

/*
   Returns (1 - g^k) B_k / 2k mod p.

   PRECONDITIONS:
      5 <= p < 2^FLINT_D_BITS, p prime
      2 <= k <= p-3, k even
      pinv = n_preinvert_limb(p)
      g = a multiplicative generator of GF(p), in [0, p)
*/
static ulong
bernoulli_sum_powg(ulong p, ulong pinv, ulong k, ulong g)
{
    ulong half_gm1, sum, g_to_km1, g_to_jm1, g_to_km1_to_j, q, h;
    slong j;
    double g_pinv;

    g_pinv = (double) g / (double) p;
    half_gm1 = (g + ((g & 1) ? 0 : p) - 1) / 2;    /* (g-1)/2 mod p */
    g_to_km1 = n_powmod2_preinv(g, k-1, p, pinv);
    g_to_jm1 = 1;
    g_to_km1_to_j = g_to_km1;
    sum = 0;

    for (j = 1; j <= (p - 1) / 2; j++)
    {
        g_to_jm1 = _bernoulli_n_muldivrem_precomp(&q, g_to_jm1, g, p, g_pinv);
        h = n_submod(q, half_gm1, p);
        sum = n_submod(sum, n_mulmod2_preinv(h, g_to_km1_to_j, p, pinv), p);
        g_to_km1_to_j = n_mulmod2_preinv(g_to_km1_to_j, g_to_km1, p, pinv);
    }

    return sum;
}


/******************************************************************************

   Computing the main sum (c = 1/2 case)

******************************************************************************/

/*
    The Expander class stores precomputed information for a fixed integer p,
    that subsequently permits fast computation of the binary expansion of s/p
    for any 0 < s < p.

    The constructor takes p and max_words as input. Must have 1 <= max_words <=
    MAX_INV. It computes an approximation to 1/p.

    The function expand(mp_ptr res, long s, long n) computes n limbs of s/p.
    Must have 0 < s < p and 1 <= n <= max_words. The output is written to res.
    The first word of output is junk. The next n words are the digits of s/p,
    from least to most significant. The buffer must be at least n+2 words long
    (even though the first and last words are never used for output).
*/

#define MAX_INV 256

typedef struct
{
   /* Approximation to 1/p. We store (max_words + 1) limbs. */
   mp_limb_t pinv[MAX_INV + 2];
   mp_limb_t p;
   int max_words;
}
expander_t;

static void
expander_init(expander_t * this, ulong p, int max_words)
{
    mp_limb_t one;

    FLINT_ASSERT(max_words >= 1);
    FLINT_ASSERT(max_words <= MAX_INV);

    this->max_words = max_words;
    this->p = p;
    one = 1;
    mpn_divrem_1(this->pinv, max_words + 1, &one, 1, p);
}

static void
expander_expand(mp_ptr res, expander_t * this, ulong s, ulong n)
{
    slong i;

    FLINT_ASSERT(s > 0 && s < this->p);
    FLINT_ASSERT(n >= 1);
    FLINT_ASSERT(n <= this->max_words);

    if (s == 1)
    {
        /* already have 1/p; just copy it */
        for (i = 1; i <= n; i++)
            res[i] = this->pinv[this->max_words - n + i];
    }
    else
    {
        mpn_mul_1(res, this->pinv + this->max_words - n, n + 1, (mp_limb_t) s);

        /* If the first output limb is really close to 0xFFFF..., then there's
           a possibility of overflow, so fall back on doing division directly.
           This should happen extremely rarely --- essentially never on a
           64-bit system, and very occasionally on a 32-bit system. */
        if (res[0] > -((mp_limb_t) s))
        {
            mp_limb_t ss = s;
            mpn_divrem_1(res, n + 1, &ss, 1, this->p);
        }
    }
}


/*
   Returns (2^(-k) - 1) 2 B_k / k  mod p.

   (Note: this is useless if 2^k = 1 mod p.)

   PRECONDITIONS:
      5 <= p, p prime
      2 <= k <= p-3, k even
      pinv = n_preinvert_limb(p)
      g = a multiplicative generator of GF(p), in [0, p)
      n = multiplicative n_multiplicative_order of 2 in GF(p)
*/

#define TABLE_LG_SIZE 8
#define TABLE_SIZE (WORD(1) << TABLE_LG_SIZE)
#define TABLE_MASK (TABLE_SIZE - 1)
#define NUM_TABLES (FLINT_BITS / TABLE_LG_SIZE)

#if FLINT_BITS % TABLE_LG_SIZE != 0
#error Number of bits in a ulong must be divisible by TABLE_LG_SIZE
#endif

ulong bernsum_pow2(ulong p, ulong pinv, ulong k, ulong g, ulong n)
{
    slong i, m;
    ulong g_to_km1, two_to_km1, B_to_km1, s_jump;
    ulong tables[NUM_TABLES][TABLE_SIZE];
    ulong g_to_km1_to_i;
    ulong g_to_i;
    ulong sum;
    expander_t expander;

    slong h;
    ulong x;
    ulong weights[TABLE_SIZE];
    ulong x_jump;

    /* In the main summation loop we accumulate data into the _tables_ array;
      tables[y][z] contributes to the final answer with a weight of

      sum(-(-1)^z[t] * (2^(k-1))^(FLINT_BITS - 1 - y * TABLE_LG_SIZE - t) :
                                                       0 <= t < TABLE_LG_SIZE),

      where z[t] denotes the t-th binary digit of z (LSB is t = 0).
      The memory footprint for _tables_ is 4KB on a 32-bit machine, or 16KB
      on a 64-bit machine, so should fit easily into L1 cache. */
    memset(tables, 0, sizeof(ulong) * NUM_TABLES * TABLE_SIZE);

    m = (p-1) / n;

    /* take advantage of symmetry (n' and m' from the paper) */
    if (n & 1)
        m >>= 1;
    else
        n >>= 1;

    /* g^(k-1) */
    g_to_km1 = n_powmod2_preinv(g, k - 1, p, pinv);
    /* 2^(k-1) */
    two_to_km1 = n_powmod2_preinv(2, k - 1, p, pinv);
    /* B^(k-1), where B = 2^FLINT_BITS */
    B_to_km1 = n_powmod2_preinv(two_to_km1, FLINT_BITS, p, pinv);
    /* B^(MAX_INV) */
    s_jump = n_powmod2_preinv(2, MAX_INV * FLINT_BITS, p, pinv);

    /* todo - help speed up modmuls
    mulmod_precon_t g_pinv = PrepMulModPrecon(g, p, pinv);
    mulmod_precon_t g_to_km1_pinv = PrepMulModPrecon(g_to_km1, p, pinv);
    mulmod_precon_t two_to_km1_pinv = PrepMulModPrecon(two_to_km1, p, pinv);
    mulmod_precon_t B_to_km1_pinv = PrepMulModPrecon(B_to_km1, p, pinv);
    mulmod_precon_t s_jump_pinv = PrepMulModPrecon(s_jump, p, pinv);
    */

    g_to_km1_to_i = 1;
    g_to_i = 1;
    sum = 0;

    /* Precompute some of the binary expansion of 1/p; at most MAX_INV words,
      or possibly less if n is sufficiently small */
    expander_init(&expander, p, (n >= MAX_INV * FLINT_BITS)
                                       ? MAX_INV : ((n - 1) / FLINT_BITS + 1));

    /* =========== phase 1: main summation loop */

    /* loop over outer sum */
    for (i = 0; i < m; i++)
    {
        ulong s, x, y;
        slong nn;

        /* s keeps track of g^i*2^j mod p */
        s = g_to_i;
        /* x keeps track of (g^i*2^j)^(k-1) mod p */
        x = g_to_km1_to_i;

        /* loop over inner sum; break it up into chunks of length at most */
        /* MAX_INV * FLINT_BITS. If n is large, this allows us to do most of */
        /* the work with mpn_mul_1 instead of mpn_divrem_1, and also improves */
        /* memory locality. */
        for (nn = n; nn > 0; nn -= MAX_INV * FLINT_BITS)
        {
            mp_limb_t s_over_p[MAX_INV + 2];
            slong bits, words;
            mp_ptr next;

            if (nn >= MAX_INV * FLINT_BITS)
            {
                /* do one chunk of length exactly MAX_INV * FLINT_BITS */
                bits = MAX_INV * FLINT_BITS;
                words = MAX_INV;
            }
            else
            {
                /* last chunk of length less than MAX_INV * FLINT_BITS */
                bits = nn;
                words = (nn - 1) / FLINT_BITS + 1;
            }

            /* compute some bits of the binary expansion of s/p */
            expander_expand(s_over_p, &expander, s, words);

            next = s_over_p + words;

            /* loop over whole words */
            for (; bits >= FLINT_BITS; bits -= FLINT_BITS, next--)
            {
                mp_limb_t y;
#if NUM_TABLES != 8 && NUM_TABLES != 4
                mp_ptr target;
#else
                mp_ptr target0, target1, target2, target3, target4, target5, target6, target7;
#endif

                y = *next;

#if NUM_TABLES != 8 && NUM_TABLES != 4
                /* generic version */
                for (h = 0; h < NUM_TABLES; h++)
                {
                    target = &(tables[h][y & TABLE_MASK]);
                    *target = n_submod(*target, x, p);
                    y >>= TABLE_LG_SIZE;
                }
#else
                /* unrolled versions for 32-bit/64-bit machines */
                target0 = &(tables[0][y & TABLE_MASK]);
                *target0 = n_submod(*target0, x, p);

                target1 = &(tables[1][(y >> TABLE_LG_SIZE) & TABLE_MASK]);
                *target1 = n_submod(*target1, x, p);

                target2 = &(tables[2][(y >> (2*TABLE_LG_SIZE)) & TABLE_MASK]);
                *target2 = n_submod(*target2, x, p);

                target3 = &(tables[3][(y >> (3*TABLE_LG_SIZE)) & TABLE_MASK]);
                *target3 = n_submod(*target3, x, p);
#if NUM_TABLES == 8
                target4 = &(tables[4][(y >> (4*TABLE_LG_SIZE)) & TABLE_MASK]);
                *target4 = n_submod(*target4, x, p);

                target5 = &(tables[5][(y >> (5*TABLE_LG_SIZE)) & TABLE_MASK]);
                *target5 = n_submod(*target5, x, p);

                target6 = &(tables[6][(y >> (6*TABLE_LG_SIZE)) & TABLE_MASK]);
                *target6 = n_submod(*target6, x, p);

                target7 = &(tables[7][(y >> (7*TABLE_LG_SIZE)) & TABLE_MASK]);
                *target7 = n_submod(*target7, x, p);
#endif
#endif

                x = n_mulmod2_preinv(x, B_to_km1, p, pinv);  /* todo: precond */
            }

            /* loop over remaining bits in the last word */
            y = *next;
            for (; bits > 0; bits--)
            {
                if (y & (UWORD(1) << (FLINT_BITS - 1)))
                    sum = n_submod(sum, x, p);
                else
                    sum = n_addmod(sum, x, p);

                x = n_mulmod2_preinv(x, two_to_km1, p, pinv);   /* todo: precond */
                y <<= 1;
            }

            s = n_mulmod2_preinv(s, s_jump, p, pinv);   /* todo: precond */
        }

        /* update g^i and (g^(k-1))^i */
        g_to_i = n_mulmod2_preinv(g_to_i, g, p, pinv);
        g_to_km1_to_i = n_mulmod2_preinv(g_to_km1_to_i, g_to_km1, p, pinv);
    }

#if DEBUG
    {
        slong i, j;
        for (i = 0; i < NUM_TABLES; i++)
        {
            printf("tab[%lu] = ", i);
            for (j = 0; j < TABLE_SIZE; j++)
                printf("%lu ", tables[i][j]);
            printf("\n");
        }
    }
#endif


    /* =========== phase 2: consolidate table data */

    /* compute weights[z] = sum((-1)^z[t] * (2^(k-1))^(TABLE_LG_SIZE - 1 - t) :
                                                       0 <= t < TABLE_LG_SIZE). */

    weights[0] = 0;
    for (h = 0, x = 1; h < TABLE_LG_SIZE;
         h++, x = n_mulmod2_preinv(x, two_to_km1, p, pinv))
    {
        for (i = (WORD(1) << h) - 1; i >= 0; i--)
        {
            weights[2*i+1] = n_submod(weights[i], x, p);
            weights[2*i]   = n_addmod(weights[i], x, p);
        }
    }

    /* combine table data with weights */

    x_jump = n_powmod2_preinv(two_to_km1, TABLE_LG_SIZE, p, pinv);

    for (h = NUM_TABLES - 1, x = 1; h >= 0; h--)
    {
        for (i = 0; i < TABLE_SIZE; i++)
        {
            ulong y = n_mulmod2_preinv(tables[h][i], weights[i], p, pinv);
            y = n_mulmod2_preinv(y, x, p, pinv);
            sum = n_submod(sum, y, p);
        }

        x = n_mulmod2_preinv(x_jump, x, p, pinv);
    }

    return sum;
}

/******************************************************************************

   Computing the main sum (c = 1/2 case, with REDC arithmetic)

   Throughout this section F denotes 2^(FLINT_BITS / 2).

******************************************************************************/


/*
   Returns x/F mod n. Output is in [0, 2n), i.e. *not* reduced completely
   into [0, n).

   PRECONDITIONS:
      3 <= n < F, n odd
      0 <= x < nF    (if n < F/2)
      0 <= x < nF/2  (if n > F/2)
      ninv2 = -1/n mod F
*/
#define LOW_MASK ((UWORD(1) << (FLINT_BITS / 2)) - 1)
static __inline__ ulong RedcFast(ulong x, ulong n, ulong ninv2)
{
    ulong y = (x * ninv2) & LOW_MASK;
    ulong z = x + (n * y);
    return z >> (FLINT_BITS / 2);
}

/*
   Same as RedcFast(), but reduces output into [0, n).
*/
static __inline__ ulong Redc(ulong x, ulong n, ulong ninv2)
{
    ulong y = RedcFast(x, n, ninv2);
    if (y >= n)
       y -= n;
    return y;
}

/*
   Computes -1/n mod F, in [0, F).

   PRECONDITIONS:
      3 <= n < F, n odd
*/
static ulong PrepRedc(ulong n)
{
    ulong bits;
    ulong ninv2 = -n;   /* already correct mod 8 */

    /* Newton's method for 2-adic inversion */
    for (bits = 3; bits < FLINT_BITS/2; bits *= 2)
       ninv2 = 2*ninv2 + n * ninv2 * ninv2;

    return ninv2 & LOW_MASK;
}

/*
   Same as bernsum_pow2(), but uses REDC arithmetic, and various delayed
   reduction strategies.

   PRECONDITIONS:
      Same as bernsum_pow2(), and in addition:
      p < 2^(FLINT_BITS/2 - 1)

   (See bernsum_pow2() for code comments; we only add comments here where
   something is different from bernsum_pow2())
*/
ulong bernsum_pow2_redc(ulong p, ulong pinv, ulong k, ulong g, ulong n)
{
    ulong pinv2 = PrepRedc(p);
    ulong F = (UWORD(1) << (FLINT_BITS/2)) % p;
    ulong x;
    slong h, i, m;
    ulong weights[TABLE_SIZE];
    ulong x_jump;
    ulong x_jump_redc;

    ulong g_to_km1;
    ulong two_to_km1;
    ulong B_to_km1;
    ulong s_jump;

    ulong g_redc;
    ulong g_to_km1_redc;
    ulong two_to_km1_redc;
    ulong B_to_km1_redc;
    ulong s_jump_redc;

    ulong g_to_km1_to_i;
    ulong g_to_i;
    ulong sum;

    ulong tables[NUM_TABLES][TABLE_SIZE];

    expander_t expander;

    memset(tables, 0, sizeof(ulong) * NUM_TABLES * TABLE_SIZE);

    m = (p-1) / n;

    if (n & 1)
        m >>= 1;
    else
        n >>= 1;

    g_to_km1 = n_powmod2_preinv(g, k-1, p, pinv);
    two_to_km1 = n_powmod2_preinv(2, k-1, p, pinv);
    B_to_km1 = n_powmod2_preinv(two_to_km1, FLINT_BITS, p, pinv);
    s_jump = n_powmod2_preinv(2, MAX_INV * FLINT_BITS, p, pinv);

    g_redc = n_mulmod2_preinv(g, F, p, pinv);
    g_to_km1_redc = n_mulmod2_preinv(g_to_km1, F, p, pinv);
    two_to_km1_redc = n_mulmod2_preinv(two_to_km1, F, p, pinv);
    B_to_km1_redc = n_mulmod2_preinv(B_to_km1, F, p, pinv);
    s_jump_redc = n_mulmod2_preinv(s_jump, F, p, pinv);

    g_to_km1_to_i = 1;    /* always in [0, 2p)  */
    g_to_i = 1;           /* always in [0, 2p)  */
    sum = 0;

#if DEBUG
    printf("%lu %lu %lu %lu %lu  %lu %lu %lu %lu\n", F, g_to_km1, two_to_km1, B_to_km1, s_jump, g_redc, g_to_km1_redc, B_to_km1_redc, s_jump_redc);
#endif

    expander_init(&expander, p, (n >= MAX_INV * FLINT_BITS)
                                       ? MAX_INV : ((n - 1) / FLINT_BITS + 1));

    /* =========== phase 1: main summation loop */

    for (i = 0; i < m; i++)
    {
        ulong s, x, y;
        slong nn, bits, words;
        mp_ptr next;

        s = g_to_i;           /* always in [0, p) */
        if (s >= p)
            s -= p;

        x = g_to_km1_to_i;    /* always in [0, 2p) */

        for (nn = n; nn > 0; nn -= MAX_INV * FLINT_BITS)
        {
            ulong s_over_p[MAX_INV + 2];

            if (nn >= MAX_INV * FLINT_BITS)
            {
                bits = MAX_INV * FLINT_BITS;
                words = MAX_INV;
            }
            else
            {
                bits = nn;
                words = (nn - 1) / FLINT_BITS + 1;
            }

            expander_expand(s_over_p, &expander, s, words);
            next = s_over_p + words;

            for (; bits >= FLINT_BITS; bits -= FLINT_BITS, next--)
            {
                y = *next;

#if DEBUG
                printf("i = %lu  nn = %lu  words = %lu  bits = %lu  y = %lu\n", i, nn, words, bits, y);
#endif

                /* note: we add the values into tables *without* reduction mod p */

#if NUM_TABLES != 8 && NUM_TABLES != 4
                /* generic version */
                for (h = 0; h < NUM_TABLES; h++)
                {
                    tables[h][y & TABLE_MASK] += x;
                    y >>= TABLE_LG_SIZE;
                }
#else
                /* unrolled versions for 32-bit/64-bit machines */
                tables[0][ y                       & TABLE_MASK] += x;
                tables[1][(y >>    TABLE_LG_SIZE ) & TABLE_MASK] += x;
                tables[2][(y >> (2*TABLE_LG_SIZE)) & TABLE_MASK] += x;
                tables[3][(y >> (3*TABLE_LG_SIZE)) & TABLE_MASK] += x;
#if NUM_TABLES == 8
                tables[4][(y >> (4*TABLE_LG_SIZE)) & TABLE_MASK] += x;
                tables[5][(y >> (5*TABLE_LG_SIZE)) & TABLE_MASK] += x;
                tables[6][(y >> (6*TABLE_LG_SIZE)) & TABLE_MASK] += x;
                tables[7][(y >> (7*TABLE_LG_SIZE)) & TABLE_MASK] += x;
#endif
#endif

                x = RedcFast(x * B_to_km1_redc, p, pinv2);
            }

            /* bring x into [0, p) for next loop */
            if (x >= p)
                x -= p;

            y = *next;
            for (; bits > 0; bits--)
            {
                if (y & (UWORD(1) << (FLINT_BITS - 1)))
                    sum = n_submod(sum, x, p);
                else
                    sum = n_addmod(sum, x, p);

                x = Redc(x * two_to_km1_redc, p, pinv2);
                y <<= 1;
            }

            s = Redc(s * s_jump_redc, p, pinv2);
        }

        g_to_i = RedcFast(g_to_i * g_redc, p, pinv2);
        g_to_km1_to_i = RedcFast(g_to_km1_to_i * g_to_km1_redc, p, pinv2);
    }

    /* At this point, each table entry is at most p^2 (since x was always */
    /* in [0, 2p), and the inner loop was called at most (p/2) / FLINT_BITS */
    /* times, and 2p * p/2 / FLINT_BITS * TABLE_LG_SIZE <= p^2). */

#if DEBUG
    {
        slong i, j;
        for (i = 0; i < NUM_TABLES; i++)
        {
            printf("tab[%lu] = ", i);
            for (j = 0; j < TABLE_SIZE; j++)
                printf("%lu ", tables[i][j]);
            printf("\n");
        }
    }
#endif

    /* =========== phase 2: consolidate table data */

    weights[0] = 0;
    /* we store the weights multiplied by a factor of 2^(3*FLINT_BITS/2) to */
    /* compensate for the three rounds of REDC reduction in the loop below */
    for (h = 0, x = n_powmod2_preinv(2, 3*FLINT_BITS/2, p, pinv);
         h < TABLE_LG_SIZE; h++, x = Redc(x * two_to_km1_redc, p, pinv2))
    {
        for (i = (WORD(1) << h) - 1; i >= 0; i--)
        {
            weights[2*i+1] = n_submod(weights[i], x, p);
            weights[2*i]   = n_addmod(weights[i], x, p);
        }
    }

    x_jump = n_powmod2_preinv(two_to_km1, TABLE_LG_SIZE, p, pinv);
    x_jump_redc = n_mulmod2_preinv(x_jump, F, p, pinv);

    for (h = NUM_TABLES - 1, x = 1; h >= 0; h--)
    {
        for (i = 0; i < TABLE_SIZE; i++)
        {
            ulong y;
            y = RedcFast(tables[h][i], p, pinv2);
            y = RedcFast(y * weights[i], p, pinv2);
            y = RedcFast(y * x, p, pinv2);
            sum += y;
        }

        x = Redc(x * x_jump_redc, p, pinv2);
    }

    return sum % p;
}


/******************************************************************************

   Wrappers for bernsum_*

******************************************************************************/

/*
    Returns B_k/k mod p, in the range [0, p).

    PRECONDITIONS:
        5 <= p < NTL_SP_BOUND, p prime
        2 <= k <= p-3, k even
        pinv = PrepMulMod(p)

    Algorithm: uses bernoulli_sum_powg() to compute the main sum.
*/
ulong _bernoulli_mod_p_harvey_powg(ulong p, ulong pinv, ulong k)
{
    ulong x, g, t, g_to_k;

    g = n_primitive_root_prime(p);

    /* compute main sum */
    x = bernoulli_sum_powg(p, pinv, k, g);

    /* divide by (1 - g^k) and multiply by 2 */
    g_to_k = n_powmod2_preinv(g, k, p, pinv);
    t = n_invmod(p + 1 - g_to_k, p);
    x = n_mulmod2_preinv(x, t, p, pinv);
    x = n_addmod(x, x, p);

    return x;
}

static ulong
n_multiplicative_order(ulong x, ulong p, ulong pinv, n_factor_t * F)
{
    ulong m, q, mm;
    slong i;

    /* in the loop below, m is always some multiple of the n_multiplicative_order of x */
    m = p - 1;

    /* try to remove factors from m until we can't remove any more */
    for (i = 0; i < F->num; i++)
    {
        q = F->p[i];

        while (m % q == 0)
        {
            mm = m / q;
            if (n_powmod2_preinv(x, mm, p, pinv) != 1)
                break;
            m = mm;
        }
    }

    return m;
}

/*
   Returns B_k/k mod p, in the range [0, p).

   PRECONDITIONS:
      5 <= p < NTL_SP_BOUND, p prime
      2 <= k <= p-3, k even
      pinv = PrepMulMod(p)
      2^k != 1 mod p

   Algorithm: uses bernsum_pow2() (or bernsum_pow2_redc() if p is small
   enough) to compute the main sum.
*/
ulong _bernoulli_mod_p_harvey_pow2(ulong p, ulong pinv, ulong k)
{
    n_factor_t F;
    ulong g, n, x, t;

    n_factor_init(&F);
    n_factor(&F, p - 1, 1);

    g = n_primitive_root_prime_prefactor(p, &F);
    n = n_multiplicative_order(2, p, pinv, &F);

#if DEBUG
    printf("g = %lu, n = %lu\n", g, n);
#endif

    if (p < (UWORD(1) << (FLINT_BITS/2 - 1)))
        x = bernsum_pow2_redc(p, pinv, k, g, n);
    else
        x = bernsum_pow2(p, pinv, k, g, n);

    /* divide by 2*(2^(-k) - 1) */
    t = n_submod(n_invmod(n_powmod2_preinv(2, k, p, pinv), p), 1, p);
    t = n_addmod(t, t, p);
    t = n_invmod(t, p);
    x = n_mulmod2_preinv(x, t, p, pinv);

    return x;
}

/*
   Returns B_k/k mod p, in the range [0, p).

   PRECONDITIONS:
      5 <= p < NTL_SP_BOUND, p prime
      2 <= k <= p-3, k even
      pinv = PrepMulMod(p)
*/
ulong _bernoulli_mod_p_harvey(ulong p, ulong pinv, ulong k)
{
    if (n_powmod2_preinv(2, k, p, pinv) != 1)
    {
        /* 2^k != 1 mod p, so we use the faster version */
        return _bernoulli_mod_p_harvey_pow2(p, pinv, k);
    }
    else
    {
          /* forced to use slower version */
        return _bernoulli_mod_p_harvey_powg(p, pinv, k);
    }
}

/******************************************************************************

   Main bernoulli_mod_p routine

******************************************************************************/

/* beware: opposite argument order to bernmm */
ulong bernoulli_mod_p_harvey(ulong k, ulong p)
{
    ulong m, x, pinv;

    FLINT_ASSERT(k >= 0);
    FLINT_ASSERT(2 <= p && p < (UWORD(1) << FLINT_D_BITS));

    if (k == 0)
        return 1;

    if (k == 1)
    {
        if (p == 2)
            return -1;
        return (p - 1) / 2;
    }

    if (k & 1)
        return 0;

    /* denominator of B_k is always divisible by 6 for k >= 2 */
    if (p <= 3)
        return UWORD_MAX;

    /* use Kummer's congruence (k = m mod p-1  =>  B_k/k = B_m/m mod p) */
    m = k % (p - 1);
    if (m == 0)
        return UWORD_MAX;

    pinv = n_preinvert_limb(p);
    x = _bernoulli_mod_p_harvey(p, pinv, m);    /* = B_m/m mod p  */

    return n_mulmod2_preinv(x, k % p, p, pinv);
}