/* * Copyright (c) 2011-2014 NVIDIA Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ #include "opal_config.h" #include #include #include #include "opal/align.h" #include "opal/util/output.h" #include "opal/datatype/opal_convertor.h" #include "opal/datatype/opal_datatype_cuda.h" static bool initialized = false; int opal_cuda_verbose = 0; static int opal_cuda_enabled = 0; /* Starts out disabled */ static int opal_cuda_output = 0; static void opal_cuda_support_init(void); static int (*common_cuda_initialization_function)(opal_common_cuda_function_table_t *) = NULL; static opal_common_cuda_function_table_t ftable; /* This function allows the common cuda code to register an * initialization function that gets called the first time an attempt * is made to send or receive a GPU pointer. This allows us to delay * some CUDA initialization until after MPI_Init(). */ void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *)) { common_cuda_initialization_function = fptr; } /** * This function is called when a convertor is instantiated. It has to call * the opal_cuda_support_init() function once to figure out if CUDA support * is enabled or not. If CUDA is not enabled, then short circuit out * for all future calls. */ void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf) { /* Only do the initialization on the first GPU access */ if (!initialized) { opal_cuda_support_init(); } /* This is needed to handle case where convertor is not fully initialized * like when trying to do a sendi with convertor on the statck */ convertor->cbmemcpy = (memcpy_fct_t)&opal_cuda_memcpy; /* If not enabled, then nothing else to do */ if (!opal_cuda_enabled) { return; } if (ftable.gpu_is_gpu_buffer(pUserBuf, convertor)) { convertor->flags |= CONVERTOR_CUDA; } } /* Checks the type of pointer * * @param dest One pointer to check * @param source Another pointer to check */ bool opal_cuda_check_bufs(char *dest, char *src) { /* Only do the initialization on the first GPU access */ if (!initialized) { opal_cuda_support_init(); } if (!opal_cuda_enabled) { return false; } if (ftable.gpu_is_gpu_buffer(dest, NULL) || ftable.gpu_is_gpu_buffer(src, NULL)) { return true; } else { return false; } } /* * With CUDA enabled, all contiguous copies will pass through this function. * Therefore, the first check is to see if the convertor is a GPU buffer. * Note that if there is an error with any of the CUDA calls, the program * aborts as there is no recovering. */ /* Checks the type of pointer * * @param buf check one pointer providing a convertor. * Provides aditional information, e.g. managed vs. unmanaged GPU buffer */ bool opal_cuda_check_one_buf(char *buf, opal_convertor_t *convertor ) { /* Only do the initialization on the first GPU access */ if (!initialized) { opal_cuda_support_init(); } if (!opal_cuda_enabled) { return false; } return ( ftable.gpu_is_gpu_buffer(buf, convertor)); } /* * With CUDA enabled, all contiguous copies will pass through this function. * Therefore, the first check is to see if the convertor is a GPU buffer. * Note that if there is an error with any of the CUDA calls, the program * aborts as there is no recovering. */ void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_t* convertor) { int res; if (!(convertor->flags & CONVERTOR_CUDA)) { return memcpy(dest, src, size); } if (convertor->flags & CONVERTOR_CUDA_ASYNC) { res = ftable.gpu_cu_memcpy_async(dest, (void *)src, size, convertor); } else { res = ftable.gpu_cu_memcpy(dest, (void *)src, size); } if (res != 0) { opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", res, dest, src, (int)size); abort(); } else { return dest; } } /* * This function is needed in cases where we do not have contiguous * datatypes. The current code has macros that cannot handle a convertor * argument to the memcpy call. */ void *opal_cuda_memcpy_sync(void *dest, const void *src, size_t size) { int res; res = ftable.gpu_cu_memcpy(dest, src, size); if (res != 0) { opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", res, dest, src, (int)size); abort(); } else { return dest; } } /* * In some cases, need an implementation of memmove. This is not fast, but * it is not often needed. */ void *opal_cuda_memmove(void *dest, void *src, size_t size) { int res; res = ftable.gpu_memmove(dest, src, size); if(res != 0){ opal_output(0, "CUDA: Error in gpu memmove: res=%d, dest=%p, src=%p, size=%d", res, dest, src, (int)size); abort(); } return dest; } /** * This function gets called once to check if the program is running in a cuda * environment. */ static void opal_cuda_support_init(void) { if (initialized) { return; } /* Set different levels of verbosity in the cuda related code. */ opal_cuda_output = opal_output_open(NULL); opal_output_set_verbosity(opal_cuda_output, opal_cuda_verbose); /* Callback into the common cuda initialization routine. This is only * set if some work had been done already in the common cuda code.*/ if (NULL != common_cuda_initialization_function) { if (0 == common_cuda_initialization_function(&ftable)) { opal_cuda_enabled = 1; } } if (1 == opal_cuda_enabled) { opal_output_verbose(10, opal_cuda_output, "CUDA: enabled successfully, CUDA device pointers will work"); } else { opal_output_verbose(10, opal_cuda_output, "CUDA: not enabled, CUDA device pointers will not work"); } initialized = true; } /** * Tell the convertor that copies will be asynchronous CUDA copies. The * flags are cleared when the convertor is reinitialized. */ void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream) { convertor->flags |= CONVERTOR_CUDA_ASYNC; convertor->stream = stream; }