#ifndef MAD_TPSA_COMPOSE_PAR_TC
#define MAD_TPSA_COMPOSE_PAR_TC

/*
 o----------------------------------------------------------------------------o
 |
 | TPSA parallel map composition module implementation
 |
 | Methodical Accelerator Design - Copyright (c) 2016+
 | Support: http://cern.ch/mad  - mad at cern.ch
 | Authors: L. Deniau, laurent.deniau at cern.ch
 |          C. Tomoiaga
 | Contrib: -
 |
 o----------------------------------------------------------------------------o
 | You can redistribute this file and/or modify it under the terms of the GNU
 | General Public License GPLv3 (or later), as published by the Free Software
 | Foundation. This file is distributed in the hope that it will be useful, but
 | WITHOUT ANY WARRANTY OF ANY KIND. See http://gnu.org/licenses for details.
 o----------------------------------------------------------------------------o
*/

struct compose_ctx_par {
  const D *d;
  T **cached;
  ssz_t cached_size, sa;
};

#define MAX_CACHED_ORDS 4
#define CTX struct compose_ctx_par

static inline T*
get_mono(idx_t c, idx_t tmp_idx, T *tmps[2], ord_t complement[], CTX *ctx)
{
  if (c < ctx->cached_size && ctx->cached[c])
    return ctx->cached[c];

  const D *d = ctx->d;
  T **cached = ctx->cached;

  for (idx_t m = ctx->cached_size - 1; m > 0; --m)
    if (cached[m] && mad_mono_le(d->nv, d->To[m], d->To[c])) {
      mad_mono_sub(d->nv, d->To[c], d->To[m], complement);
      idx_t compl_idx = mad_desc_idxm(d, d->nv, complement);
      T *t = get_mono(compl_idx, tmp_idx ^ 1, tmps, complement, ctx);

#if DEBUG_COMPOSE
      mad_mono_print(d->nv, d->To[c], 0);
      printf(" = ");
      mad_mono_print(d->nv, d->To[m], 0);
      mad_mono_print(d->nv, d->To[compl_idx], 0);
      printf("m=%d", m);
      printf("\n");
#endif

      FUN(mul)(t, cached[m], tmps[tmp_idx]);
      break;
    }

  // update cache if needed
  if (c < ctx->cached_size) {
    assert(!cached[c]);             // no double alloc
    cached[c] = FUN(newd)(d, mad_tpsa_default);
    FUN(copy)(tmps[tmp_idx], cached[c]);
  }
  return tmps[tmp_idx];
}

static inline void
compose_ser(idx_t sa, const T *ma[sa], T *mc[sa], CTX *ctx)
{
  assert(ma && mc && ctx);

  // cleanup & ord 0
  for (idx_t i = 0; i < sa; ++i)
    FUN(setvar)(mc[i], ma[i]->coef[0],0,0);

  ord_t mono[ctx->d->nv];
  T *tmps[2] = { FUN(newd)(ctx->d, ctx->d->to),
                 FUN(newd)(ctx->d, ctx->d->to) }, *t = NULL;
  for (idx_t c = 1; c < ctx->cached_size; ++c) {
    // TODO: only cache what is needed
    t = get_mono(c, 0, tmps, mono, ctx);
    for (idx_t i = 0; i < sa; ++i)
      if (ma[i]->coef[c])
        FUN(acc)(t, ma[i]->coef[c], mc[i]);
  }
  FUN(del)(tmps[0]);
  FUN(del)(tmps[1]);
}

static inline void
compose_par(idx_t sa, const T *ma[sa], T *mc[sa], CTX *ctx)
{
  ord_t highest = 0;
  for (idx_t i = 0; i < sa; ++i)
    if (ma[i]->hi > highest)
      highest = ma[i]->hi;
  idx_t max_coeff = ctx->d->ord2idx[highest+1];

  T *mt[ctx->d->nth][sa];
  #pragma omp parallel num_threads(ctx->d->nth)
  {
    int id = omp_get_thread_num();

    // alloc private vars
    ord_t mono[ctx->d->nv];
    T *tmps[2] = { FUN(newd)(ctx->d, ctx->d->to),
                   FUN(newd)(ctx->d, ctx->d->to) }, *t = NULL;
    T **m_curr_thread = mt[id];
    for (idx_t i = 0; i < sa; ++i)
      m_curr_thread[i] = FUN(newd)(ctx->d, mad_tpsa_default);

    #pragma omp for
    for (idx_t c = ctx->cached_size; c < max_coeff; ++c) {
      int needed = 0;
      for (idx_t i = 0; i < sa; ++i)
        if (ma[i]->coef[c]) {
          needed = 1;
          break;
        }
      if (!needed) continue;

      t = get_mono(c, 0, tmps, mono, ctx);
      for (idx_t i = 0; i < sa; ++i)
        if (ma[i]->coef[c])
          FUN(acc)(t, ma[i]->coef[c], m_curr_thread[i]);
    }

    FUN(del)(tmps[0]);
    FUN(del)(tmps[1]);
  }

  for (int thread = 0; thread < ctx->d->nth; ++thread)
    for (idx_t i = 0; i < sa; ++i) {
      FUN(acc)(mt[thread][i], 1, mc[i]);
      FUN(del)(mt[thread][i]);
    }
}

static inline void
compose_parallel(ssz_t sa, const T *ma[sa], ssz_t sb, const T *mb[sb], T *mc[sa],
                 ord_t hi_ord)
{
  // locals
  const D *d = ma[0]->d;
  ssz_t nv = d->nv;

  ord_t to_cache = MIN(d->mo, MAX_CACHED_ORDS);
  ssz_t cached_size = d->ord2idx[to_cache+1];
  T *cached[cached_size];

  /* cached[0] not in use --> */              cached[0] = NULL;
  for (idx_t c =    1; c <= nv         ; ++c) cached[c] = (T *) mb[c-1];
  for (idx_t c = nv+1; c <  cached_size; ++c) cached[c] = NULL;

  CTX ctx = { .d = d, .cached_size = cached_size, .cached = cached };

  // compose
  compose_ser(sa, ma, mc, &ctx);
  compose_par(sa, ma, mc, &ctx);

  // finalize
  for (idx_t c = nv+1; c < cached_size; ++c)
    FUN(del)(cached[c]);
}

#undef MAX_CACHED_ORDS
#undef CTX

#endif // MAD_TPSA_COMPOSE_PAR_TC