/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2007-2013 Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2007      Mellanox Technologies, Inc.  All rights reserved.
 * Copyright (c) 2012-2015 Los Alamos National Security, LLC.  All rights
 *                         reserved.
 * Copyright (c) 2013-2014 Intel, Inc. All rights reserved
 * Copyright (c) 2015      Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 *
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
#include "opal_config.h"

#include "btl_openib.h"
#include "btl_openib_proc.h"
#include "connect/base.h"
#include "connect/btl_openib_connect_empty.h"
#if OPAL_HAVE_RDMACM
#include "connect/btl_openib_connect_rdmacm.h"
#endif
#if OPAL_HAVE_UDCM
#include "connect/btl_openib_connect_udcm.h"
#endif

#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/proc.h"
#include "opal/util/show_help.h"

#include "opal/util/sys_limits.h"
#include "opal/align.h"

/*
 * Array of all possible connection functions
 */
static opal_btl_openib_connect_base_component_t *all[] = {
    /* Always have an entry here so that the CP indexes will always be
       the same: OOB has been removed, so use the "empty" CPC */
    &opal_btl_openib_connect_empty,

    /* Always have an entry here so that the CP indexes will always be
       the same: XOOB has been removed, so use the "empty" CPC */
    &opal_btl_openib_connect_empty,

    /* Always have an entry here so that the CP indexes will always be
       the same: if RDMA CM is not available, use the "empty" CPC */
#if OPAL_HAVE_RDMACM
    &opal_btl_openib_connect_rdmacm,
#else
    &opal_btl_openib_connect_empty,
#endif

    /* Always have an entry here so that the CP indexes will always be
       the same: if UD CM is not enabled, use the "empty" CPC */
#if OPAL_HAVE_UDCM
    &opal_btl_openib_connect_udcm,
#else
    &opal_btl_openib_connect_empty,
#endif

    NULL
};

/* increase this count if any more cpcs are added */
static opal_btl_openib_connect_base_component_t *available[5];
static int num_available = 0;

static char *btl_openib_cpc_include;
static char *btl_openib_cpc_exclude;

/*
 * Register MCA parameters
 */
int opal_btl_openib_connect_base_register(void)
{
    int i, j, save;
    char **temp = NULL, *string = NULL, *all_cpc_names = NULL;

    /* Make an MCA parameter to select which connect module to use */
    for (i = 0; NULL != all[i]; ++i) {
        /* The CPC name "empty" is reserved for "fake" CPC modules */
        if (0 != strcmp(all[i]->cbc_name, "empty")) {
            opal_argv_append_nosize(&temp, all[i]->cbc_name);
        }
    }
    all_cpc_names = opal_argv_join(temp, ',');
    opal_argv_free(temp);
    asprintf(&string,
             "Method used to select OpenFabrics connections (valid values: %s)",
             all_cpc_names);

    btl_openib_cpc_include = NULL;
    (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
                                           "cpc_include", string, MCA_BASE_VAR_TYPE_STRING,
                                           NULL, 0, 0, OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY,
                                           &btl_openib_cpc_include);
    free(string);

    asprintf(&string,
             "Method used to exclude OpenFabrics connections (valid values: %s)",
             all_cpc_names);

    btl_openib_cpc_exclude = NULL;
    (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
                                           "cpc_exclude", string, MCA_BASE_VAR_TYPE_STRING,
                                           NULL, 0, 0, OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY,
                                           &btl_openib_cpc_exclude);
    free(string);

    /* Parse the if_[in|ex]clude paramters to come up with a list of
       CPCs that are available */
    /* If we have an "include" list, then find all those CPCs and put
       them in available[] */
    if (NULL != btl_openib_cpc_include) {
        mca_btl_openib_component.cpc_explicitly_defined = true;
        temp = opal_argv_split(btl_openib_cpc_include, ',');
        for (save = j = 0; NULL != temp[j]; ++j) {
            for (i = 0; NULL != all[i]; ++i) {
                if (0 == strcmp(temp[j], all[i]->cbc_name)) {
                    opal_output(-1, "include: saving %s", all[i]->cbc_name);
                    available[save++] = all[i];
                    ++num_available;
                    break;
                }
            }
            if (NULL == all[i]) {
                opal_show_help("help-mpi-btl-openib-cpc-base.txt",
                               "cpc name not found", true,
                               "include", opal_process_info.nodename,
                               "include", btl_openib_cpc_include, temp[j],
                               all_cpc_names);
                opal_argv_free(temp);
                free(all_cpc_names);
                return OPAL_ERR_NOT_FOUND;
            }
        }
        opal_argv_free(temp);
    }

    /* Otherwise, if we have an "exclude" list, take all the CPCs that
       are not in that list and put them in available[] */
    else if (NULL != btl_openib_cpc_exclude) {
        mca_btl_openib_component.cpc_explicitly_defined = true;
        temp = opal_argv_split(btl_openib_cpc_exclude, ',');
        /* First: error check -- ensure that all the names are valid */
        for (j = 0; NULL != temp[j]; ++j) {
            for (i = 0; NULL != all[i]; ++i) {
                if (0 == strcmp(temp[j], all[i]->cbc_name)) {
                    break;
                }
            }
            if (NULL == all[i]) {
                opal_show_help("help-mpi-btl-openib-cpc-base.txt",
                               "cpc name not found", true,
                               "exclude", opal_process_info.nodename,
                               "exclude", btl_openib_cpc_exclude, temp[j],
                               all_cpc_names);
                opal_argv_free(temp);
                free(all_cpc_names);
                return OPAL_ERR_NOT_FOUND;
            }
        }

        /* Now do the exclude */
        for (save = i = 0; NULL != all[i]; ++i) {
            for (j = 0; NULL != temp[j]; ++j) {
                if (0 == strcmp(temp[j], all[i]->cbc_name)) {
                    break;
                }
            }
            if (NULL == temp[j]) {
                opal_output(-1, "exclude: saving %s", all[i]->cbc_name);
                available[save++] = all[i];
                ++num_available;
            }
        }
        opal_argv_free(temp);
    }

    /* If there's no include/exclude list, copy all[] into available[] */
    else {
        opal_output(-1, "no include or exclude: saving all");
        memcpy(available, all, sizeof(all));
        num_available = (sizeof(all) /
                         sizeof(opal_btl_openib_connect_base_module_t *)) - 1;
    }

    /* Call the register function on all the CPCs so that they may
       setup any MCA params specific to the connection type */
    for (i = 0; NULL != available[i]; ++i) {
        if (NULL != available[i]->cbc_register) {
            available[i]->cbc_register();
        }
    }

    free (all_cpc_names);
    return OPAL_SUCCESS;
}

/*
 * Called once during openib BTL component initialization to allow CPC
 * components to initialize.
 */
int opal_btl_openib_connect_base_init(void)
{
    int i, rc;

    /* Call each available CPC component's open function, if it has
       one.  If the CPC component open function returns OPAL_SUCCESS,
       keep it.  If it returns ERR_NOT_SUPPORTED, remove it from the
       available[] array.  If it returns something else, return that
       error upward. */
    for (i = num_available = 0; NULL != available[i]; ++i) {
        if (NULL == available[i]->cbc_init) {
            available[num_available++] = available[i];
            opal_output(-1, "found available cpc (NULL init): %s",
                        all[i]->cbc_name);
            continue;
        }

        rc = available[i]->cbc_init();
        if (OPAL_SUCCESS == rc) {
            available[num_available++] = available[i];
            opal_output(-1, "found available cpc (SUCCESS init): %s",
                        all[i]->cbc_name);
            continue;
        } else if (OPAL_ERR_NOT_SUPPORTED == rc) {
            continue;
        } else {
            return rc;
        }
    }
    available[num_available] = NULL;

    return (num_available > 0) ? OPAL_SUCCESS : OPAL_ERR_NOT_AVAILABLE;
}


/*
 * Find all the CPCs that are eligible for a single local port (i.e.,
 * openib module).
 */
int opal_btl_openib_connect_base_select_for_local_port(mca_btl_openib_module_t *btl)
{
    char *msg = NULL;
    int i, rc, cpc_index, len;
    opal_btl_openib_connect_base_module_t **cpcs;

    cpcs = (opal_btl_openib_connect_base_module_t **) calloc(num_available,
                  sizeof(opal_btl_openib_connect_base_module_t *));
    if (NULL == cpcs) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* Go through all available CPCs and query them to see if they
       want to run on this module.  If they do, save them to a running
       array. */
    for (len = 1, i = 0; NULL != available[i]; ++i) {
        len += strlen(available[i]->cbc_name) + 2;
    }
    msg = (char *) malloc(len);
    if (NULL == msg) {
        free(cpcs);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    msg[0] = '\0';
    for (cpc_index = i = 0; NULL != available[i]; ++i) {
        if (i > 0) {
            strcat(msg, ", ");
        }
        strcat(msg, available[i]->cbc_name);

        rc = available[i]->cbc_query(btl, &cpcs[cpc_index]);
        if (OPAL_ERR_NOT_SUPPORTED == rc || OPAL_ERR_UNREACH == rc) {
            continue;
        } else if (OPAL_SUCCESS != rc) {
            free(cpcs);
            free(msg);
            return rc;
        }
        opal_output(-1, "match cpc for local port: %s",
                    available[i]->cbc_name);

        /* If the CPC wants to use the CTS protocol, check to ensure
           that QP 0 is PP; if it's not, we can't use this CPC (or the
           CTS protocol) */
        if (cpcs[cpc_index]->cbm_uses_cts &&
            !BTL_OPENIB_QP_TYPE_PP(0)) {
            BTL_VERBOSE(("this CPC only supports when the first btl_openib_receive_queues QP is a PP QP"));
            continue;
        }

        /* This CPC has indicated that it wants to run on this openib
           BTL module.  Woo hoo! */
        ++cpc_index;
    }

    /* If we got an empty array, then no CPCs were eligible.  Doh! */
    if (0 == cpc_index) {
        opal_show_help("help-mpi-btl-openib-cpc-base.txt",
                       "no cpcs for port", true,
                       opal_process_info.nodename,
                       ibv_get_device_name(btl->device->ib_dev),
                       btl->port_num, msg);
        free(cpcs);
        free(msg);
        return OPAL_ERR_NOT_SUPPORTED;
    }
    free(msg);

    /* We got at least one eligible CPC; save the array into the
       module's port_info */
    btl->cpcs = cpcs;
    btl->num_cpcs = cpc_index;

    return OPAL_SUCCESS;
}

/*
 * This function is invoked when determining whether we have a CPC in
 * common with a specific remote port.  We already know that the
 * subnet ID is the same between a specific local port and the target
 * remote port; now we need to know if we can find a CPC in common
 * between the two.
 *
 * If yes, be sure to find the *same* CPC on both sides.  We know
 * which CPCs are available on each side, and we know the priorities
 * that were assigned on both sides.  So find a CPC that is common to
 * both sides and has the highest overall priority (between both
 * sides).
 *
 * Return the matching CPC, or NULL if not found.
 */
int
opal_btl_openib_connect_base_find_match(mca_btl_openib_module_t *btl,
                                        mca_btl_openib_proc_modex_t *peer_port,
                                        opal_btl_openib_connect_base_module_t **ret_local_cpc,
                                        opal_btl_openib_connect_base_module_data_t **ret_remote_cpc_data)
{
    int i, j, max = -1;
    opal_btl_openib_connect_base_module_t *local_cpc, *local_selected = NULL;
    opal_btl_openib_connect_base_module_data_t *local_cpcd, *remote_cpcd,
        *remote_selected = NULL;

    /* Iterate over all the CPCs on the local module */
    for (i = 0; i < btl->num_cpcs; ++i) {
        local_cpc = btl->cpcs[i];
        local_cpcd = &(local_cpc->data);

        /* Iterate over all the CPCs on the remote port */
        for (j = 0; j < peer_port->pm_cpc_data_count; ++j) {
            remote_cpcd = &(peer_port->pm_cpc_data[j]);

            /* Are the components the same? */
            if (local_cpcd->cbm_component == remote_cpcd->cbm_component) {
                /* If so, update the max priority found so far */
                if (max < local_cpcd->cbm_priority) {
                    max = local_cpcd->cbm_priority;
                    local_selected = local_cpc;
                    remote_selected = remote_cpcd;
                }
                if (max < remote_cpcd->cbm_priority) {
                    max = remote_cpcd->cbm_priority;
                    local_selected = local_cpc;
                    remote_selected = remote_cpcd;
                }
            }
        }
    }

    /* All done! */
    if (NULL != local_selected) {
        *ret_local_cpc = local_selected;
        *ret_remote_cpc_data = remote_selected;
        opal_output(-1, "find_match: found match!");
        return OPAL_SUCCESS;
    } else {
        opal_output(-1, "find_match: did NOT find match!");
        return OPAL_ERR_NOT_FOUND;
    }
}

/*
 * Lookup a CPC component's index in the all[] array so that we can
 * send it int the modex
 */
int opal_btl_openib_connect_base_get_cpc_index(opal_btl_openib_connect_base_component_t *cpc)
{
    int i;
    for (i = 0; NULL != all[i]; ++i) {
        if (all[i] == cpc) {
            return i;
        }
    }

    /* Not found */
    return -1;
}

/*
 * Lookup a CPC by its index (received from the modex)
 */
opal_btl_openib_connect_base_component_t *
opal_btl_openib_connect_base_get_cpc_byindex(uint8_t index)
{
    return (index >= (sizeof(all) /
                      sizeof(opal_btl_openib_connect_base_module_t *))) ?
        NULL : all[index];
}

int opal_btl_openib_connect_base_alloc_cts(mca_btl_base_endpoint_t *endpoint)
{
    opal_free_list_item_t *fli;
    int length = sizeof(mca_btl_openib_header_t) +
        sizeof(mca_btl_openib_header_coalesced_t) +
        sizeof(mca_btl_openib_control_header_t) +
        sizeof(mca_btl_openib_footer_t) +
        mca_btl_openib_component.qp_infos[mca_btl_openib_component.credits_qp].size;

    int align_it = 0;
    int page_size;

    page_size = opal_getpagesize();
    if (length >= page_size / 2) { align_it = 1; }
    if (align_it) {
// I think this is only active for ~64k+ buffers anyway, but I'm not
// positive, so I'm only increasing the buffer size and alignment if
// it's not too small. That way we'd avoid wasting excessive memory
// in case this code was active for tiny buffers.
        length = OPAL_ALIGN(length, page_size, int);
    }

    /* Explicitly don't use the mpool registration */
    fli = &(endpoint->endpoint_cts_frag.super.super.base.super);
    fli->registration = NULL;
    if (!align_it) {
        fli->ptr = malloc(length);
    } else {
        posix_memalign((void**)&(fli->ptr), page_size, length);
    }
    if (NULL == fli->ptr) {
        BTL_ERROR(("malloc failed"));
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    endpoint->endpoint_cts_mr =
        ibv_reg_mr(endpoint->endpoint_btl->device->ib_pd,
                   fli->ptr, length,
                   IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE |
                   IBV_ACCESS_REMOTE_READ);
    OPAL_OUTPUT((-1, "registered memory %p, length %d", fli->ptr, length));
    if (NULL == endpoint->endpoint_cts_mr) {
        free(fli->ptr);
        BTL_ERROR(("Failed to reg mr!"));
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    /* NOTE: We do not need to register this memory with the
       opal_memory subsystem, because this is OMPI-controlled memory
       -- we do not need to worry about this memory being freed out
       from underneath us. */

    /* Copy the lkey where it needs to go */
    endpoint->endpoint_cts_frag.super.sg_entry.lkey =
        endpoint->endpoint_cts_mr->lkey;
    endpoint->endpoint_cts_frag.super.sg_entry.length = length;

    /* Construct the rest of the recv_frag_t */
    OBJ_CONSTRUCT(&(endpoint->endpoint_cts_frag), mca_btl_openib_recv_frag_t);
    endpoint->endpoint_cts_frag.super.super.base.order =
        mca_btl_openib_component.credits_qp;
    endpoint->endpoint_cts_frag.super.endpoint = endpoint;
    OPAL_OUTPUT((-1, "Got a CTS frag for peer %s, addr %p, length %d, lkey %d",
                 opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal),
                 (void*) endpoint->endpoint_cts_frag.super.sg_entry.addr,
                 endpoint->endpoint_cts_frag.super.sg_entry.length,
                 endpoint->endpoint_cts_frag.super.sg_entry.lkey));

    return OPAL_SUCCESS;
}

int opal_btl_openib_connect_base_free_cts(mca_btl_base_endpoint_t *endpoint)
{
    /* NOTE: We don't need to deregister this memory with opal_memory
       because it was not registered there in the first place (see
       comment above, near call to ibv_reg_mr). */
    if (NULL != endpoint->endpoint_cts_mr) {
        ibv_dereg_mr(endpoint->endpoint_cts_mr);
        endpoint->endpoint_cts_mr = NULL;
    }
    if (NULL != endpoint->endpoint_cts_frag.super.super.base.super.ptr) {
        free(endpoint->endpoint_cts_frag.super.super.base.super.ptr);
        endpoint->endpoint_cts_frag.super.super.base.super.ptr = NULL;
        OPAL_OUTPUT((-1, "Freeing CTS frag"));
    }

    return OPAL_SUCCESS;
}

/*
 * Called to start a connection
 */
int opal_btl_openib_connect_base_start(
        opal_btl_openib_connect_base_module_t *cpc,
        mca_btl_base_endpoint_t *endpoint)
{
    /* If the CPC uses the CTS protocol, provide a frag buffer for the
       CPC to post.  Must allocate these frags up here in the main
       thread because the FREE_LIST_WAIT is not thread safe. */
    if (cpc->cbm_uses_cts) {
        int rc;
        rc = opal_btl_openib_connect_base_alloc_cts(endpoint);
        if (OPAL_SUCCESS != rc) {
            return rc;
        }
    }

    return cpc->cbm_start_connect(cpc, endpoint);
}

/*
 * Called during openib btl component close
 */
void opal_btl_openib_connect_base_finalize(void)
{
    int i;

    for (i = 0 ; i < num_available ; ++i) {
        if (NULL != available[i]->cbc_finalize) {
            available[i]->cbc_finalize();
        }
    }
}