/*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO USER:   
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and 
 * international Copyright laws.  Users and possessors of this source code 
 * are hereby granted a nonexclusive, royalty-free license to use this code 
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 
 * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 
 * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH 
 * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 
 * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 
 * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 
 * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 
 * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE 
 * OR PERFORMANCE OF THIS SOURCE CODE.  
 *
 * U.S. Government End Users.   This source code is a "commercial item" as 
 * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of 
 * "commercial computer  software"  and "commercial computer software 
 * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995) 
 * and is provided to the U.S. Government only as a commercial end item.  
 * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 
 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 
 * source code with only those rights set forth herein. 
 *
 * Any use of this source code in individual and commercial software must 
 * include, in the user documentation and internal comments to the code,
 * the above Disclaimer and U.S. Government End Users Notice.
 */

#ifndef __cuda_cuda_h__
#define __cuda_cuda_h__

#include <stdlib.h>

/**
 * \file
 * \name Data types used by CUDA driver
 * \author NVIDIA Corporation
 * \brief Data types used by CUDA driver
 */

/**
 * \defgroup CUDA_TYPES Data types used by CUDA driver
 * \ingroup CUDA_DRIVER
 * @{
 */

/**
 * CUDA API version number.
 * The value 3010 stands for release 3.1 (presumably major * 1000 +
 * minor * 10 -- confirm the encoding before decoding it in client code).
 */
#define CUDA_VERSION 3010 /* 3.1 */

#ifdef __cplusplus
extern "C" {
#endif
    /** CUDA device pointer */
    typedef unsigned int CUdeviceptr;

    /** CUDA device */
    typedef int CUdevice;

    /** CUDA context */
    typedef struct CUctx_st *CUcontext;

    /** CUDA module */
    typedef struct CUmod_st *CUmodule;

    /** CUDA function */
    typedef struct CUfunc_st *CUfunction;

    /** CUDA array */
    typedef struct CUarray_st *CUarray;

    /** CUDA texture reference */
    typedef struct CUtexref_st *CUtexref;

    /** CUDA surface reference */
    typedef struct CUsurfref_st *CUsurfref;

    /** CUDA event */
    typedef struct CUevent_st *CUevent;

    /** CUDA stream */
    typedef struct CUstream_st *CUstream;

    /** CUDA graphics interop resource */
    typedef struct CUgraphicsResource_st *CUgraphicsResource;

    /** CUDA definition of UUID */
    typedef struct CUuuid_st {
        char bytes[16];     /**< Raw storage, 16 bytes */
    } CUuuid;

/************************************
 **
 **    Enums
 **
 ***********************************/

/**
 * Flags accepted at context creation
 */
typedef enum CUctx_flags_enum {
    /** Automatic scheduling */
    CU_CTX_SCHED_AUTO = 0x00,
    /** Set spin as default scheduling */
    CU_CTX_SCHED_SPIN = 0x01,
    /** Set yield as default scheduling */
    CU_CTX_SCHED_YIELD = 0x02,
    /** Bits occupied by the CU_CTX_SCHED_* values */
    CU_CTX_SCHED_MASK = 0x03,
    /** Use blocking synchronization */
    CU_CTX_BLOCKING_SYNC = 0x04,
    /** Support mapped pinned allocations */
    CU_CTX_MAP_HOST = 0x08,
    /** Keep local memory allocation after launch */
    CU_CTX_LMEM_RESIZE_TO_MAX = 0x10,
    /** Union of every flag bit defined in this enum */
    CU_CTX_FLAGS_MASK = 0x1f
} CUctx_flags;

/**
 * Flags accepted at event creation
 */
typedef enum CUevent_flags_enum {
    /** Default event flag */
    CU_EVENT_DEFAULT = 0x0,
    /** Event uses blocking synchronization */
    CU_EVENT_BLOCKING_SYNC = 0x1
} CUevent_flags;

/**
 * Element formats of a CUDA array
 */
typedef enum CUarray_format_enum {
    /** Unsigned 8-bit integers */
    CU_AD_FORMAT_UNSIGNED_INT8 = 1,
    /** Unsigned 16-bit integers */
    CU_AD_FORMAT_UNSIGNED_INT16 = 2,
    /** Unsigned 32-bit integers */
    CU_AD_FORMAT_UNSIGNED_INT32 = 3,
    /** Signed 8-bit integers */
    CU_AD_FORMAT_SIGNED_INT8 = 8,
    /** Signed 16-bit integers */
    CU_AD_FORMAT_SIGNED_INT16 = 9,
    /** Signed 32-bit integers */
    CU_AD_FORMAT_SIGNED_INT32 = 10,
    /** 16-bit floating point */
    CU_AD_FORMAT_HALF = 16,
    /** 32-bit floating point */
    CU_AD_FORMAT_FLOAT = 32
} CUarray_format;

/**
 * Addressing modes of a texture reference
 */
typedef enum CUaddress_mode_enum {
    /** Wrapping address mode */
    CU_TR_ADDRESS_MODE_WRAP = 0x0,
    /** Clamp to edge address mode */
    CU_TR_ADDRESS_MODE_CLAMP = 0x1,
    /** Mirror address mode */
    CU_TR_ADDRESS_MODE_MIRROR = 0x2
} CUaddress_mode;

/**
 * Filtering modes of a texture reference
 */
typedef enum CUfilter_mode_enum {
    /** Point filter mode */
    CU_TR_FILTER_MODE_POINT = 0x0,
    /** Linear filter mode */
    CU_TR_FILTER_MODE_LINEAR = 0x1
} CUfilter_mode;

/**
 * Device properties
 *
 * Attribute selectors of the kind taken by ::cuDeviceGetAttribute().
 * The values 8 and 12 each appear under two names: the second name of each
 * pair is a deprecated alias of the first.
 */
typedef enum CUdevice_attribute_enum {
    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,  ///< Maximum number of threads per block
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,        ///< Maximum block dimension X
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3,        ///< Maximum block dimension Y
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4,        ///< Maximum block dimension Z
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,         ///< Maximum grid dimension X
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6,         ///< Maximum grid dimension Y
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7,         ///< Maximum grid dimension Z
    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8,    ///< Maximum shared memory available per block in bytes
    CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8,    ///< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK
    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,  ///< Memory available on device for __constant__ variables in a CUDA C kernel in bytes
    CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,             ///< Warp size in threads
    CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,             ///< Maximum pitch in bytes allowed by memory copies
    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,   ///< Maximum number of 32-bit registers available per block
    CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12,   ///< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK
    CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,            ///< Peak clock frequency in kilohertz
    CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,     ///< Alignment requirement for textures

    CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,           ///< Device can possibly copy memory and execute a kernel concurrently
    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,  ///< Number of multiprocessors on device
    CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17,   ///< Specifies whether there is a run time limit on kernels
    CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,            ///< Device is integrated with host memory
    CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,   ///< Device can map host memory into CUDA address space
    CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20,          ///< Compute mode (See ::CUcomputemode for details)
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, ///< Maximum 1D texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, ///< Maximum 2D texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23,///< Maximum 2D texture height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, ///< Maximum 3D texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25,///< Maximum 3D texture height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, ///< Maximum 3D texture depth
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, ///< Maximum texture array width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28,///< Maximum texture array height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, ///< Maximum slices in a texture array
    CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, ///< Alignment requirement for surfaces
    CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, ///< Device can possibly execute multiple kernels concurrently
    CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, ///< Device has ECC support enabled
    CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, ///< PCI bus ID of the device
    CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34 ///< PCI device ID of the device
} CUdevice_attribute;

/**
 * Legacy device property record
 */
typedef struct CUdevprop_st {
    /** Maximum number of threads per block */
    int maxThreadsPerBlock;
    /** Maximum size of each dimension of a block */
    int maxThreadsDim[3];
    /** Maximum size of each dimension of a grid */
    int maxGridSize[3];
    /** Shared memory available per block in bytes */
    int sharedMemPerBlock;
    /** Constant memory available on device in bytes */
    int totalConstantMemory;
    /** Warp size in threads */
    int SIMDWidth;
    /** Maximum pitch in bytes allowed by memory copies */
    int memPitch;
    /** 32-bit registers available per block */
    int regsPerBlock;
    /** Clock frequency in kilohertz */
    int clockRate;
    /** Alignment requirement for textures */
    int textureAlign;
} CUdevprop;

/**
 * Per-function properties
 */
typedef enum CUfunction_attribute_enum {
    /**
     * Thread count above which launching the function would fail. The
     * number depends on both the function and the device on which the
     * function is currently loaded.
     */
    CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0x0,

    /**
     * Bytes of statically-allocated shared memory the function requires.
     * Dynamically-allocated shared memory requested by the user at runtime
     * is not counted.
     */
    CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 0x1,

    /**
     * Bytes of user-allocated constant memory the function requires.
     */
    CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 0x2,

    /**
     * Bytes of thread local memory the function uses.
     */
    CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 0x3,

    /**
     * Registers used by each thread of the function.
     */
    CU_FUNC_ATTRIBUTE_NUM_REGS = 0x4,

    /**
     * PTX virtual architecture version the function was compiled for.
     */
    CU_FUNC_ATTRIBUTE_PTX_VERSION = 0x5,

    /**
     * Binary version the function was compiled for.
     */
    CU_FUNC_ATTRIBUTE_BINARY_VERSION = 0x6,

    /**
     * Takes the next value after the last attribute above (7).
     */
    CU_FUNC_ATTRIBUTE_MAX
} CUfunction_attribute;

/**
 * Cache configurations selectable per function
 */
typedef enum CUfunc_cache_enum {
    /** No cache preference */
    CU_FUNC_CACHE_PREFER_NONE = 0,
    /** Prefer shared memory */
    CU_FUNC_CACHE_PREFER_SHARED = 1,
    /** Prefer L1 */
    CU_FUNC_CACHE_PREFER_L1 = 2
} CUfunc_cache;

/**
 * Kinds of memory a copy endpoint can refer to
 */
typedef enum CUmemorytype_enum {
    /** Host memory */
    CU_MEMORYTYPE_HOST = 1,
    /** Device memory */
    CU_MEMORYTYPE_DEVICE = 2,
    /** Array memory */
    CU_MEMORYTYPE_ARRAY = 3
} CUmemorytype;

/**
 * Compute modes of a device
 */
typedef enum CUcomputemode_enum {
    /** Default compute mode (Multiple contexts allowed per device) */
    CU_COMPUTEMODE_DEFAULT = 0x0,
    /** Compute-exclusive mode (Only one context can be present on this device at a time) */
    CU_COMPUTEMODE_EXCLUSIVE = 0x1,
    /** Compute-prohibited mode (No contexts can be created on this device at this time) */
    CU_COMPUTEMODE_PROHIBITED = 0x2
} CUcomputemode;

/**
 * Online compiler options
 *
 * Option keys of the kind passed in the \p options array of
 * ::cuModuleLoadDataEx(). Only the first enumerator has an explicit value
 * (0); the rest take consecutive values in declaration order.
 */
typedef enum CUjit_option_enum
{
    /**
     * Max number of registers that a thread may use.\n
     * Option type: unsigned int
     */
    CU_JIT_MAX_REGISTERS            = 0,

    /**
     * IN: Specifies minimum number of threads per block to target compilation
     * for\n
     * OUT: Returns the number of threads the compiler actually targeted.
     * This restricts the resource utilization of the compiler (e.g. max
     * registers) such that a block with the given number of threads should be
     * able to launch based on register limitations. Note, this option does not
     * currently take into account any other resource limitations, such as
     * shared memory utilization.\n
     * Option type: unsigned int
     */
    CU_JIT_THREADS_PER_BLOCK,

    /**
     * Returns a float value in the option of the wall clock time, in
     * milliseconds, spent creating the cubin\n
     * Option type: float
     */
    CU_JIT_WALL_TIME,

    /**
     * Pointer to a buffer in which to print any log messages from PTXAS
     * that are informational in nature (the buffer size is specified via
     * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES) \n
     * Option type: char*
     */
    CU_JIT_INFO_LOG_BUFFER,

    /**
     * IN: Log buffer size in bytes.  Log messages will be capped at this size
     * (including null terminator)\n
     * OUT: Amount of log buffer filled with messages\n
     * Option type: unsigned int
     */
    CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,

    /**
     * Pointer to a buffer in which to print any log messages from PTXAS that
     * reflect errors (the buffer size is specified via option
     * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n
     * Option type: char*
     */
    CU_JIT_ERROR_LOG_BUFFER,

    /**
     * IN: Log buffer size in bytes.  Log messages will be capped at this size
     * (including null terminator)\n
     * OUT: Amount of log buffer filled with messages\n
     * Option type: unsigned int
     */
    CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,

    /**
     * Level of optimizations to apply to generated code (0 - 4), with 4
     * being the default and highest level of optimizations.\n
     * Option type: unsigned int
     */
    CU_JIT_OPTIMIZATION_LEVEL,

    /**
     * No option value required. Determines the target based on the current
     * attached context (default)\n
     * Option type: No option value needed
     */
    CU_JIT_TARGET_FROM_CUCONTEXT,

    /**
     * Target is chosen based on supplied ::CUjit_target_enum.\n
     * Option type: unsigned int for enumerated type ::CUjit_target_enum
     */
    CU_JIT_TARGET,

    /**
     * Specifies choice of fallback strategy if matching cubin is not found.
     * Choice is based on supplied ::CUjit_fallback_enum.\n
     * Option type: unsigned int for enumerated type ::CUjit_fallback_enum
     */
    CU_JIT_FALLBACK_STRATEGY
    
} CUjit_option;

/**
 * Targets available to the online compiler
 */
typedef enum CUjit_target_enum
{
    /** Compute device class 1.0 */
    CU_TARGET_COMPUTE_10 = 0,
    /** Compute device class 1.1 */
    CU_TARGET_COMPUTE_11 = 1,
    /** Compute device class 1.2 */
    CU_TARGET_COMPUTE_12 = 2,
    /** Compute device class 1.3 */
    CU_TARGET_COMPUTE_13 = 3,
    /** Compute device class 2.0 */
    CU_TARGET_COMPUTE_20 = 4
} CUjit_target;

/**
 * Fallback strategies for cubin matching
 */
typedef enum CUjit_fallback_enum
{
    CU_PREFER_PTX = 0,      /**< Prefer to compile ptx */
    CU_PREFER_BINARY = 1    /**< Prefer to fall back to compatible binary code */
} CUjit_fallback;

/**
 * Flags used when registering a graphics resource
 */
typedef enum CUgraphicsRegisterFlags_enum {
    /** No flags (value 0) */
    CU_GRAPHICS_REGISTER_FLAGS_NONE = 0
} CUgraphicsRegisterFlags;

/**
 * Flags used when mapping and unmapping interop resources
 */
typedef enum CUgraphicsMapResourceFlags_enum {
    /** No flags (value 0) */
    CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0,
    /** Read-only flag */
    CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 1,
    /** Write-discard flag */
    CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 2
} CUgraphicsMapResourceFlags;

/**
 * Indices of the faces of a cubemap array
 */
typedef enum CUarray_cubemap_face_enum {
    /** Positive X face of cubemap */
    CU_CUBEMAP_FACE_POSITIVE_X = 0,
    /** Negative X face of cubemap */
    CU_CUBEMAP_FACE_NEGATIVE_X = 1,
    /** Positive Y face of cubemap */
    CU_CUBEMAP_FACE_POSITIVE_Y = 2,
    /** Negative Y face of cubemap */
    CU_CUBEMAP_FACE_NEGATIVE_Y = 3,
    /** Positive Z face of cubemap */
    CU_CUBEMAP_FACE_POSITIVE_Z = 4,
    /** Negative Z face of cubemap */
    CU_CUBEMAP_FACE_NEGATIVE_Z = 5
} CUarray_cubemap_face;

/**
 * Configurable limits
 */
typedef enum CUlimit_enum {
    /** GPU thread stack size */
    CU_LIMIT_STACK_SIZE = 0,
    /** GPU printf FIFO size */
    CU_LIMIT_PRINTF_FIFO_SIZE = 1
} CUlimit;

/************************************
 **
 **    Error codes
 **
 ***********************************/

/**
 * Error codes
 *
 * Result type returned by every driver API prototype in this header.
 * ::CUDA_SUCCESS is 0; the remaining codes are spaced in blocks of one
 * hundred, one block per group of related errors.
 */
typedef enum cudaError_enum {

    /* 0-4: success, argument, memory and driver-state codes */
    CUDA_SUCCESS                              = 0,   ///< No errors
    CUDA_ERROR_INVALID_VALUE                  = 1,   ///< Invalid value
    CUDA_ERROR_OUT_OF_MEMORY                  = 2,   ///< Out of memory
    CUDA_ERROR_NOT_INITIALIZED                = 3,   ///< Driver not initialized
    CUDA_ERROR_DEINITIALIZED                  = 4,   ///< Driver deinitialized

    /* 1xx: device codes */
    CUDA_ERROR_NO_DEVICE                      = 100, ///< No CUDA-capable device available
    CUDA_ERROR_INVALID_DEVICE                 = 101, ///< Invalid device

    /* 2xx: image, context, mapping, ECC and limit codes (203 and 204 are not defined here) */
    CUDA_ERROR_INVALID_IMAGE                  = 200, ///< Invalid kernel image
    CUDA_ERROR_INVALID_CONTEXT                = 201, ///< Invalid context
    CUDA_ERROR_CONTEXT_ALREADY_CURRENT        = 202, ///< Context already current
    CUDA_ERROR_MAP_FAILED                     = 205, ///< Map failed
    CUDA_ERROR_UNMAP_FAILED                   = 206, ///< Unmap failed
    CUDA_ERROR_ARRAY_IS_MAPPED                = 207, ///< Array is mapped
    CUDA_ERROR_ALREADY_MAPPED                 = 208, ///< Already mapped
    CUDA_ERROR_NO_BINARY_FOR_GPU              = 209, ///< No binary for GPU
    CUDA_ERROR_ALREADY_ACQUIRED               = 210, ///< Already acquired
    CUDA_ERROR_NOT_MAPPED                     = 211, ///< Not mapped
    CUDA_ERROR_NOT_MAPPED_AS_ARRAY            = 212, ///< Mapped resource not available for access as an array
    CUDA_ERROR_NOT_MAPPED_AS_POINTER          = 213, ///< Mapped resource not available for access as a pointer
    CUDA_ERROR_ECC_UNCORRECTABLE              = 214, ///< Uncorrectable ECC error detected
    CUDA_ERROR_UNSUPPORTED_LIMIT              = 215, ///< CUlimit not supported by device

    /* 3xx: source, file and shared-object codes */
    CUDA_ERROR_INVALID_SOURCE                 = 300, ///< Invalid source
    CUDA_ERROR_FILE_NOT_FOUND                 = 301, ///< File not found
    CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302, ///< Link to a shared object failed to resolve
    CUDA_ERROR_SHARED_OBJECT_INIT_FAILED      = 303, ///< Shared object initialization failed

    /* 4xx: handle codes */
    CUDA_ERROR_INVALID_HANDLE                 = 400, ///< Invalid handle

    /* 5xx: lookup codes */
    CUDA_ERROR_NOT_FOUND                      = 500, ///< Not found

    /* 6xx: readiness codes */
    CUDA_ERROR_NOT_READY                      = 600, ///< CUDA not ready

    /* 7xx: launch codes */
    CUDA_ERROR_LAUNCH_FAILED                  = 700, ///< Launch failed
    CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES        = 701, ///< Launch exceeded resources
    CUDA_ERROR_LAUNCH_TIMEOUT                 = 702, ///< Launch exceeded timeout
    CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING  = 703, ///< Launch with incompatible texturing

    /* 8xx: 64-bit value requested through a 32-bit API function */
    CUDA_ERROR_POINTER_IS_64BIT               = 800, ///< Attempted to retrieve 64-bit pointer via 32-bit API function
    CUDA_ERROR_SIZE_IS_64BIT                  = 801, ///< Attempted to retrieve 64-bit size via 32-bit API function

    /* 999: catch-all */
    CUDA_ERROR_UNKNOWN                        = 999  ///< Unknown error
} CUresult;

/**
 * If set, host memory is portable between CUDA contexts.
 * Flag for ::cuMemHostAlloc()
 * (bit value 0x01; the three CU_MEMHOSTALLOC_* flags occupy distinct bits)
 */
#define CU_MEMHOSTALLOC_PORTABLE        0x01

/**
 * If set, host memory is mapped into CUDA address space and
 * ::cuMemHostGetDevicePointer() may be called on the host pointer.
 * Flag for ::cuMemHostAlloc()
 * (bit value 0x02)
 */
#define CU_MEMHOSTALLOC_DEVICEMAP       0x02

/**
 * If set, host memory is allocated as write-combined - fast to write,
 * faster to DMA, slow to read except via SSE4 streaming load instruction
 * (MOVNTDQA).
 * Flag for ::cuMemHostAlloc()
 * (bit value 0x04)
 */
#define CU_MEMHOSTALLOC_WRITECOMBINED   0x04

/**
 * Parameters describing a 2D memory copy
 */
typedef struct CUDA_MEMCPY2D_st {

    /* ---- source ---- */
    unsigned int srcXInBytes;       /**< Source X in bytes */
    unsigned int srcY;              /**< Source Y */
    CUmemorytype srcMemoryType;     /**< Source memory type (host, device, array) */
    const void *srcHost;            /**< Source host pointer */
    CUdeviceptr srcDevice;          /**< Source device pointer */
    CUarray srcArray;               /**< Source array reference */
    unsigned int srcPitch;          /**< Source pitch (ignored when src is array) */

    /* ---- destination ---- */
    unsigned int dstXInBytes;       /**< Destination X in bytes */
    unsigned int dstY;              /**< Destination Y */
    CUmemorytype dstMemoryType;     /**< Destination memory type (host, device, array) */
    void *dstHost;                  /**< Destination host pointer */
    CUdeviceptr dstDevice;          /**< Destination device pointer */
    CUarray dstArray;               /**< Destination array reference */
    unsigned int dstPitch;          /**< Destination pitch (ignored when dst is array) */

    /* ---- extent ---- */
    unsigned int WidthInBytes;      /**< Width of 2D memory copy in bytes */
    unsigned int Height;            /**< Height of 2D memory copy */
} CUDA_MEMCPY2D;

/**
 * Parameters describing a 3D memory copy
 */
typedef struct CUDA_MEMCPY3D_st {

    /* ---- source ---- */
    unsigned int srcXInBytes;       /**< Source X in bytes */
    unsigned int srcY;              /**< Source Y */
    unsigned int srcZ;              /**< Source Z */
    unsigned int srcLOD;            /**< Source LOD */
    CUmemorytype srcMemoryType;     /**< Source memory type (host, device, array) */
    const void *srcHost;            /**< Source host pointer */
    CUdeviceptr srcDevice;          /**< Source device pointer */
    CUarray srcArray;               /**< Source array reference */
    void *reserved0;                /**< Must be NULL */
    unsigned int srcPitch;          /**< Source pitch (ignored when src is array) */
    unsigned int srcHeight;         /**< Source height (ignored when src is array; may be 0 if Depth==1) */

    /* ---- destination ---- */
    unsigned int dstXInBytes;       /**< Destination X in bytes */
    unsigned int dstY;              /**< Destination Y */
    unsigned int dstZ;              /**< Destination Z */
    unsigned int dstLOD;            /**< Destination LOD */
    CUmemorytype dstMemoryType;     /**< Destination memory type (host, device, array) */
    void *dstHost;                  /**< Destination host pointer */
    CUdeviceptr dstDevice;          /**< Destination device pointer */
    CUarray dstArray;               /**< Destination array reference */
    void *reserved1;                /**< Must be NULL */
    unsigned int dstPitch;          /**< Destination pitch (ignored when dst is array) */
    unsigned int dstHeight;         /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */

    /* ---- extent ---- */
    unsigned int WidthInBytes;      /**< Width of 3D memory copy in bytes */
    unsigned int Height;            /**< Height of 3D memory copy */
    unsigned int Depth;             /**< Depth of 3D memory copy */
} CUDA_MEMCPY3D;

/**
 * Descriptor of a CUDA array
 */
typedef struct
{
    /** Width of array */
    unsigned int Width;
    /** Height of array */
    unsigned int Height;
    /** Array format */
    CUarray_format Format;
    /** Channels per array element */
    unsigned int NumChannels;
} CUDA_ARRAY_DESCRIPTOR;

/**
 * Descriptor of a 3D CUDA array
 */
typedef struct
{
    /** Width of 3D array */
    unsigned int Width;
    /** Height of 3D array */
    unsigned int Height;
    /** Depth of 3D array */
    unsigned int Depth;
    /** Array format */
    CUarray_format Format;
    /** Channels per array element */
    unsigned int NumChannels;
    /** Flags */
    unsigned int Flags;
} CUDA_ARRAY3D_DESCRIPTOR;

/**
 * If set, the CUDA array contains an array of 2D slices and the Depth
 * member of CUDA_ARRAY3D_DESCRIPTOR specifies the number of slices, not
 * the depth of a 3D array.
 */
#define CUDA_ARRAY3D_2DARRAY        0x01

/**
 * This flag must be set in order to bind a surface reference to the
 * CUDA array.
 */
#define CUDA_ARRAY3D_SURFACE_LDST   0x02

/**
 * Override the texref format with a format inferred from the array.
 * Flag for ::cuTexRefSetArray()
 */
#define CU_TRSA_OVERRIDE_FORMAT 0x01

/**
 * Read the texture as integers rather than promoting the values to floats
 * in the range [0,1].
 * Flag for ::cuTexRefSetFlags()
 * (bit value 0x01; the two CU_TRSF_* flags occupy distinct bits)
 */
#define CU_TRSF_READ_AS_INTEGER         0x01

/**
 * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
 * Flag for ::cuTexRefSetFlags()
 * (bit value 0x02)
 */
#define CU_TRSF_NORMALIZED_COORDINATES  0x02

/**
 * For texture references loaded into the module, use default texunit from
 * texture reference.
 *
 * The value is parenthesized so that the macro expands to a single
 * primary expression; without the parentheses the unary minus binds more
 * loosely than any postfix operator that follows the macro name.
 */
#define CU_PARAM_TR_DEFAULT (-1)

/** @} */ /* END CUDA_TYPES */

/*
 * Calling-convention decoration placed on the driver API prototypes in
 * this header: __stdcall when _WIN32 is defined, empty otherwise.
 */
#ifdef _WIN32
#define CUDAAPI __stdcall
#else
#define CUDAAPI 
#endif

    /*
     * Initialization
     */
    CUresult CUDAAPI cuInit(unsigned int Flags);

    /*
     * Driver Version Query
     */
    CUresult CUDAAPI cuDriverGetVersion(int *driverVersion);

    /*
     * Device management
     */

    CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal);
    CUresult CUDAAPI cuDeviceGetCount(int *count);
    CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev);
    CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
    CUresult CUDAAPI cuDeviceTotalMem(unsigned int *bytes, CUdevice dev);
    CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop, CUdevice dev);
    CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
        
    /*
     * Context management
     */

    CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
    CUresult CUDAAPI cuCtxDestroy(CUcontext ctx);
    CUresult CUDAAPI cuCtxAttach(CUcontext *pctx, unsigned int flags);
    CUresult CUDAAPI cuCtxDetach(CUcontext ctx);
    CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx);
    CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx);
    CUresult CUDAAPI cuCtxGetDevice(CUdevice *device);
    CUresult CUDAAPI cuCtxSynchronize(void);


    /*
     * Module management
     */

    CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname);
    CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image);
    CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module,
                                        const void *image,
                                        unsigned int numOptions,
                                        CUjit_option *options,
                                        void **optionValues);
    CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin);
    CUresult CUDAAPI cuModuleUnload(CUmodule hmod);
    CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
    CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, unsigned int *bytes, CUmodule hmod, const char *name);
    CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name);
    CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name);
    
    /*
     * Memory management
     *
     * NOTE(review): cuMemHostAlloc takes its byte count as size_t while
     * cuMemAllocHost and the other allocators here take unsigned int.
     */

    CUresult CUDAAPI cuMemGetInfo(unsigned int *free, unsigned int *total);

    CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, unsigned int bytesize);
    /* ElementSizeBytes: size of biggest r/w to be performed by kernels on
     * this memory -- 4, 8 or 16 bytes */
    CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr,
                                     unsigned int *pPitch,
                                     unsigned int WidthInBytes,
                                     unsigned int Height,
                                     unsigned int ElementSizeBytes);
    CUresult CUDAAPI cuMemFree(CUdeviceptr dptr);
    CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, unsigned int *psize, CUdeviceptr dptr);

    CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize);
    CUresult CUDAAPI cuMemFreeHost(void *p);

    CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags);

    CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags);
    CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p);

    /*
     * Synchronous Memcpy
     *
     * Intra-device memcpy's done with these functions may execute in
     * parallel with the CPU, but if host memory is involved, they wait
     * until the copy is done before returning.
     */

    /* 1D functions */
        /* system <-> device memory */
        CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount);
        CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount);

        /* device <-> device memory */
        CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount);

        /* device <-> array memory */
        CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr srcDevice, unsigned int ByteCount);
        CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);

        /* system <-> array memory */
        CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount);
        CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);

        /* array <-> array memory */
        CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);

    /* 2D memcpy */

        CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy);
        CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy);

    /* 3D memcpy */

        CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy);

    /*
     * Asynchronous Memcpy
     *
     * Any host memory involved must be DMA'able (e.g., allocated with
     * cuMemAllocHost). memcpy's done with these functions execute in
     * parallel with the CPU and, if the hardware is available, may execute
     * in parallel with the GPU. Asynchronous memcpy must be accompanied by
     * appropriate stream synchronization.
     */

    /* 1D functions */
        /* system <-> device memory */
        CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice,
                                           const void *srcHost,
                                           unsigned int ByteCount,
                                           CUstream hStream);
        CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost,
                                           CUdeviceptr srcDevice,
                                           unsigned int ByteCount,
                                           CUstream hStream);

        /* device <-> device memory */
        CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice,
                                           CUdeviceptr srcDevice,
                                           unsigned int ByteCount,
                                           CUstream hStream);

        /* system <-> array memory */
        CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray,
                                           unsigned int dstOffset,
                                           const void *srcHost,
                                           unsigned int ByteCount,
                                           CUstream hStream);
        CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost,
                                           CUarray srcArray,
                                           unsigned int srcOffset,
                                           unsigned int ByteCount,
                                           CUstream hStream);

        /* 2D memcpy */
        CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream);

        /* 3D memcpy */
        CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream);

    /************************************
     **
     **    Memset
     **
     ***********************************/
        // Linear memset entry points. The value argument is an unsigned char,
        // unsigned short or unsigned int for the D8, D16 and D32 variants
        // respectively; each also takes a device pointer and a count N.
        CUresult CUDAAPI cuMemsetD8(
            CUdeviceptr    dstDevice,
            unsigned char  uc,
            unsigned int   N);
        CUresult CUDAAPI cuMemsetD16(
            CUdeviceptr    dstDevice,
            unsigned short us,
            unsigned int   N);
        CUresult CUDAAPI cuMemsetD32(
            CUdeviceptr    dstDevice,
            unsigned int   ui,
            unsigned int   N);

        // 2D memset entry points. Same value types as above, but the extent is
        // given as dstPitch, Width and Height instead of a single count.
        CUresult CUDAAPI cuMemsetD2D8(
            CUdeviceptr    dstDevice,
            unsigned int   dstPitch,
            unsigned char  uc,
            unsigned int   Width,
            unsigned int   Height);
        CUresult CUDAAPI cuMemsetD2D16(
            CUdeviceptr    dstDevice,
            unsigned int   dstPitch,
            unsigned short us,
            unsigned int   Width,
            unsigned int   Height);
        CUresult CUDAAPI cuMemsetD2D32(
            CUdeviceptr    dstDevice,
            unsigned int   dstPitch,
            unsigned int   ui,
            unsigned int   Width,
            unsigned int   Height);

    /************************************
     **
     **    Function management
     **
     ***********************************/


    // Per-function settings and queries. Each call names its target with a
    // CUfunction handle and returns a CUresult status.
    CUresult CUDAAPI cuFuncSetBlockShape(
        CUfunction hfunc,
        int        x,
        int        y,
        int        z);
    CUresult CUDAAPI cuFuncSetSharedSize(
        CUfunction   hfunc,
        unsigned int bytes);
    // Note the argument order here: the int pointer comes first and the
    // function handle last.
    CUresult CUDAAPI cuFuncGetAttribute(
        int                  *pi,
        CUfunction_attribute  attrib,
        CUfunction            hfunc);
    CUresult CUDAAPI cuFuncSetCacheConfig(
        CUfunction    hfunc,
        CUfunc_cache  config);

    /************************************
     **
     **    Array management 
     **
     ***********************************/
   
    // Arrays described by CUDA_ARRAY_DESCRIPTOR: creation takes a CUarray
    // pointer plus a const descriptor, the descriptor query takes a
    // non-const descriptor pointer plus an array handle.
    CUresult CUDAAPI cuArrayCreate(
        CUarray                     *pHandle,
        const CUDA_ARRAY_DESCRIPTOR *pAllocateArray);
    CUresult CUDAAPI cuArrayGetDescriptor(
        CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor,
        CUarray                hArray);
    CUresult CUDAAPI cuArrayDestroy(
        CUarray hArray);

    // Same pair of calls for the CUDA_ARRAY3D_DESCRIPTOR descriptor type.
    CUresult CUDAAPI cuArray3DCreate(
        CUarray                       *pHandle,
        const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray);
    CUresult CUDAAPI cuArray3DGetDescriptor(
        CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor,
        CUarray                  hArray);


    /************************************
     **
     **    Texture reference management
     **
     ***********************************/
    // Texture reference create/destroy pair: creation takes a pointer to a
    // CUtexref, destruction takes the CUtexref handle by value.
    CUresult CUDAAPI cuTexRefCreate(
        CUtexref *pTexRef);
    CUresult CUDAAPI cuTexRefDestroy(
        CUtexref hTexRef);
    
    // Texture reference setters. All of them take the CUtexref handle as the
    // first argument except cuTexRefSetAddress, whose first argument is an
    // unsigned int pointer named ByteOffset.
    CUresult CUDAAPI cuTexRefSetArray(
        CUtexref     hTexRef,
        CUarray      hArray,
        unsigned int Flags);
    CUresult CUDAAPI cuTexRefSetAddress(
        unsigned int *ByteOffset,
        CUtexref      hTexRef,
        CUdeviceptr   dptr,
        unsigned int  bytes);
    CUresult CUDAAPI cuTexRefSetAddress2D(
        CUtexref                     hTexRef,
        const CUDA_ARRAY_DESCRIPTOR *desc,
        CUdeviceptr                  dptr,
        unsigned int                 Pitch);
    CUresult CUDAAPI cuTexRefSetFormat(
        CUtexref       hTexRef,
        CUarray_format fmt,
        int            NumPackedComponents);
    CUresult CUDAAPI cuTexRefSetAddressMode(
        CUtexref       hTexRef,
        int            dim,
        CUaddress_mode am);
    CUresult CUDAAPI cuTexRefSetFilterMode(
        CUtexref      hTexRef,
        CUfilter_mode fm);
    CUresult CUDAAPI cuTexRefSetFlags(
        CUtexref     hTexRef,
        unsigned int Flags);

    // Texture reference getters. Each one lists its non-const pointer
    // argument(s) first, followed by the CUtexref handle being queried.
    CUresult CUDAAPI cuTexRefGetAddress(
        CUdeviceptr *pdptr,
        CUtexref     hTexRef);
    CUresult CUDAAPI cuTexRefGetArray(
        CUarray  *phArray,
        CUtexref  hTexRef);
    CUresult CUDAAPI cuTexRefGetAddressMode(
        CUaddress_mode *pam,
        CUtexref        hTexRef,
        int             dim);
    CUresult CUDAAPI cuTexRefGetFilterMode(
        CUfilter_mode *pfm,
        CUtexref       hTexRef);
    CUresult CUDAAPI cuTexRefGetFormat(
        CUarray_format *pFormat,
        int            *pNumChannels,
        CUtexref        hTexRef);
    CUresult CUDAAPI cuTexRefGetFlags(
        unsigned int *pFlags,
        CUtexref      hTexRef);

    /************************************
     **
     **    Surface reference management
     **
     ***********************************/

    // Surface reference <-> array association: the setter takes the surface
    // reference, an array handle and flags; the getter takes a CUarray
    // pointer and the surface reference.
    CUresult CUDAAPI cuSurfRefSetArray(
        CUsurfref    hSurfRef,
        CUarray      hArray,
        unsigned int Flags);
    CUresult CUDAAPI cuSurfRefGetArray(
        CUarray   *phArray,
        CUsurfref  hSurfRef);

    /************************************
     **
     **    Parameter management
     **
     ***********************************/

    // Parameter setup calls, all keyed by a CUfunction handle. The Seti,
    // Setf and Setv variants take an int offset followed by an unsigned int,
    // a float, or a pointer-plus-size respectively.
    CUresult CUDAAPI cuParamSetSize(
        CUfunction   hfunc,
        unsigned int numbytes);
    CUresult CUDAAPI cuParamSeti(
        CUfunction   hfunc,
        int          offset,
        unsigned int value);
    CUresult CUDAAPI cuParamSetf(
        CUfunction hfunc,
        int        offset,
        float      value);
    CUresult CUDAAPI cuParamSetv(
        CUfunction    hfunc,
        int           offset,
        void         *ptr,
        unsigned int  numbytes);
    CUresult CUDAAPI cuParamSetTexRef(
        CUfunction hfunc,
        int        texunit,
        CUtexref   hTexRef);


    /************************************
     **
     **    Launch functions
     **
     ***********************************/

    // Launch entry points for a CUfunction: with no grid arguments, with a
    // grid width and height, and with grid width/height plus a stream.
    CUresult CUDAAPI cuLaunch(
        CUfunction f);
    CUresult CUDAAPI cuLaunchGrid(
        CUfunction f,
        int        grid_width,
        int        grid_height);
    CUresult CUDAAPI cuLaunchGridAsync(
        CUfunction f,
        int        grid_width,
        int        grid_height,
        CUstream   hStream);

    /************************************
     **
     **    Events
     **
     ***********************************/
    // Event API. Creation takes a CUevent pointer and flags; the remaining
    // calls take event handles by value. cuEventElapsedTime additionally
    // takes a float pointer named pMilliseconds and two events (start, end).
    CUresult CUDAAPI cuEventCreate(
        CUevent      *phEvent,
        unsigned int  Flags);
    CUresult CUDAAPI cuEventRecord(
        CUevent  hEvent,
        CUstream hStream);
    CUresult CUDAAPI cuEventQuery(
        CUevent hEvent);
    CUresult CUDAAPI cuEventSynchronize(
        CUevent hEvent);
    CUresult CUDAAPI cuEventDestroy(
        CUevent hEvent);
    CUresult CUDAAPI cuEventElapsedTime(
        float   *pMilliseconds,
        CUevent  hStart,
        CUevent  hEnd);

    /************************************
     **
     **    Streams
     **
     ***********************************/
    // Stream API. Creation takes a CUstream pointer and flags; query,
    // synchronize and destroy each take a single stream handle.
    CUresult CUDAAPI cuStreamCreate(
        CUstream     *phStream,
        unsigned int  Flags);
    CUresult CUDAAPI cuStreamQuery(
        CUstream hStream);
    CUresult CUDAAPI cuStreamSynchronize(
        CUstream hStream);
    CUresult CUDAAPI cuStreamDestroy(
        CUstream hStream);

    /************************************
     **
     **    Graphics interop
     **
     ***********************************/
    // Graphics interop calls operating on CUgraphicsResource handles. The
    // map/unmap pair takes a count, a pointer to resources and a stream.
    CUresult CUDAAPI cuGraphicsUnregisterResource(
        CUgraphicsResource resource);
    CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(
        CUarray            *pArray,
        CUgraphicsResource  resource,
        unsigned int        arrayIndex,
        unsigned int        mipLevel);
    CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(
        CUdeviceptr        *pDevPtr,
        unsigned int       *pSize,
        CUgraphicsResource  resource);
    CUresult CUDAAPI cuGraphicsResourceSetMapFlags(
        CUgraphicsResource resource,
        unsigned int       flags);
    CUresult CUDAAPI cuGraphicsMapResources(
        unsigned int        count,
        CUgraphicsResource *resources,
        CUstream            hStream);
    CUresult CUDAAPI cuGraphicsUnmapResources(
        unsigned int        count,
        CUgraphicsResource *resources,
        CUstream            hStream);

    /************************************
     **
     **    Export tables
     **
     ***********************************/
    // Export table lookup: takes a pointer to a const-void pointer and a
    // pointer to a const CUuuid identifying the table; returns a CUresult.
    CUresult CUDAAPI cuGetExportTable(
        const void   **ppExportTable,
        const CUuuid  *pExportTableId);

    /************************************
     **
     **    Limits
     **
     ***********************************/

    // Limit accessors keyed by a CUlimit selector; the value is a size_t,
    // passed by value to the setter and through a pointer to the getter.
    CUresult CUDAAPI cuCtxSetLimit(
        CUlimit limit,
        size_t  value);
    CUresult CUDAAPI cuCtxGetLimit(
        size_t  *pvalue,
        CUlimit  limit);

#ifdef __cplusplus
}
#endif

#endif /* __cuda_cuda_h__ */