#ifndef MAD_TPSA_COMPOSE_PAR_TC #define MAD_TPSA_COMPOSE_PAR_TC /* o----------------------------------------------------------------------------o | | TPSA parallel map composition module implementation | | Methodical Accelerator Design - Copyright (c) 2016+ | Support: http://cern.ch/mad - mad at cern.ch | Authors: L. Deniau, laurent.deniau at cern.ch | C. Tomoiaga | Contrib: - | o----------------------------------------------------------------------------o | You can redistribute this file and/or modify it under the terms of the GNU | General Public License GPLv3 (or later), as published by the Free Software | Foundation. This file is distributed in the hope that it will be useful, but | WITHOUT ANY WARRANTY OF ANY KIND. See http://gnu.org/licenses for details. o----------------------------------------------------------------------------o */ struct compose_ctx_par { const D *d; T **cached; ssz_t cached_size, sa; }; #define MAX_CACHED_ORDS 4 #define CTX struct compose_ctx_par static inline T* get_mono(idx_t c, idx_t tmp_idx, T *tmps[2], ord_t complement[], CTX *ctx) { if (c < ctx->cached_size && ctx->cached[c]) return ctx->cached[c]; const D *d = ctx->d; T **cached = ctx->cached; for (idx_t m = ctx->cached_size - 1; m > 0; --m) if (cached[m] && mad_mono_le(d->nv, d->To[m], d->To[c])) { mad_mono_sub(d->nv, d->To[c], d->To[m], complement); idx_t compl_idx = mad_desc_idxm(d, d->nv, complement); T *t = get_mono(compl_idx, tmp_idx ^ 1, tmps, complement, ctx); #if DEBUG_COMPOSE mad_mono_print(d->nv, d->To[c], 0); printf(" = "); mad_mono_print(d->nv, d->To[m], 0); mad_mono_print(d->nv, d->To[compl_idx], 0); printf("m=%d", m); printf("\n"); #endif FUN(mul)(t, cached[m], tmps[tmp_idx]); break; } // update cache if needed if (c < ctx->cached_size) { assert(!cached[c]); // no double alloc cached[c] = FUN(newd)(d, mad_tpsa_default); FUN(copy)(tmps[tmp_idx], cached[c]); } return tmps[tmp_idx]; } static inline void compose_ser(idx_t sa, const T *ma[sa], T *mc[sa], CTX *ctx) { assert(ma && mc && ctx); // cleanup & ord 0 for (idx_t i = 0; i < sa; ++i) FUN(setvar)(mc[i], ma[i]->coef[0],0,0); ord_t mono[ctx->d->nv]; T *tmps[2] = { FUN(newd)(ctx->d, ctx->d->to), FUN(newd)(ctx->d, ctx->d->to) }, *t = NULL; for (idx_t c = 1; c < ctx->cached_size; ++c) { // TODO: only cache what is needed t = get_mono(c, 0, tmps, mono, ctx); for (idx_t i = 0; i < sa; ++i) if (ma[i]->coef[c]) FUN(acc)(t, ma[i]->coef[c], mc[i]); } FUN(del)(tmps[0]); FUN(del)(tmps[1]); } static inline void compose_par(idx_t sa, const T *ma[sa], T *mc[sa], CTX *ctx) { ord_t highest = 0; for (idx_t i = 0; i < sa; ++i) if (ma[i]->hi > highest) highest = ma[i]->hi; idx_t max_coeff = ctx->d->ord2idx[highest+1]; T *mt[ctx->d->nth][sa]; #pragma omp parallel num_threads(ctx->d->nth) { int id = omp_get_thread_num(); // alloc private vars ord_t mono[ctx->d->nv]; T *tmps[2] = { FUN(newd)(ctx->d, ctx->d->to), FUN(newd)(ctx->d, ctx->d->to) }, *t = NULL; T **m_curr_thread = mt[id]; for (idx_t i = 0; i < sa; ++i) m_curr_thread[i] = FUN(newd)(ctx->d, mad_tpsa_default); #pragma omp for for (idx_t c = ctx->cached_size; c < max_coeff; ++c) { int needed = 0; for (idx_t i = 0; i < sa; ++i) if (ma[i]->coef[c]) { needed = 1; break; } if (!needed) continue; t = get_mono(c, 0, tmps, mono, ctx); for (idx_t i = 0; i < sa; ++i) if (ma[i]->coef[c]) FUN(acc)(t, ma[i]->coef[c], m_curr_thread[i]); } FUN(del)(tmps[0]); FUN(del)(tmps[1]); } for (int thread = 0; thread < ctx->d->nth; ++thread) for (idx_t i = 0; i < sa; ++i) { FUN(acc)(mt[thread][i], 1, mc[i]); FUN(del)(mt[thread][i]); } } static inline void compose_parallel(ssz_t sa, const T *ma[sa], ssz_t sb, const T *mb[sb], T *mc[sa], ord_t hi_ord) { // locals const D *d = ma[0]->d; ssz_t nv = d->nv; ord_t to_cache = MIN(d->mo, MAX_CACHED_ORDS); ssz_t cached_size = d->ord2idx[to_cache+1]; T *cached[cached_size]; /* cached[0] not in use --> */ cached[0] = NULL; for (idx_t c = 1; c <= nv ; ++c) cached[c] = (T *) mb[c-1]; for (idx_t c = nv+1; c < cached_size; ++c) cached[c] = NULL; CTX ctx = { .d = d, .cached_size = cached_size, .cached = cached }; // compose compose_ser(sa, ma, mc, &ctx); compose_par(sa, ma, mc, &ctx); // finalize for (idx_t c = nv+1; c < cached_size; ++c) FUN(del)(cached[c]); } #undef MAX_CACHED_ORDS #undef CTX #endif // MAD_TPSA_COMPOSE_PAR_TC