/*
 *     Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto.  Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 *
 */

#ifndef NVHPC_OMP_RUNTIME_H
#define NVHPC_OMP_RUNTIME_H

#include "cuda_runtime.h"

/*
 * create_min_max_op(NAME, TY, OP)
 * Emits an inline device function `TY NAME(TY x, TY y)` that returns x when
 * `x OP y` holds and y otherwise. With OP `>` it selects the larger
 * argument, with OP `<` the smaller one.
 */
#define create_min_max_op(NAME, TY, OP)         \
    __inline__ __device__                       \
    TY NAME(TY x, TY y) {                       \
        if (x OP y)                             \
            return x;                           \
        return y;                               \
    }

// When compiling the bitcode file, the generated routines are declared
// extern "C" so that the compiler does not apply C++ name mangling, and they
// are marked as used so that it is clear these routines are used. In every
// other build they are declared static and carry no extra attribute.
//
// All three helper macros are undefined first so that a definition left over
// from a previously included header cannot cause a macro redefinition
// diagnostic. #undef on a name that is not defined is a no-op.
#undef EXTERNC
#undef STATIC
#undef USED
#if defined(PGI_COMPILE_BITCODE)
#define EXTERNC extern "C"
#define STATIC
#define USED __attribute__((used))
#else
#define EXTERNC
#define STATIC static
#define USED
#endif

// Scalar min/max helpers, one pair per arithmetic type. Each generated
// function takes two values of the listed type and returns one of them: the
// `>` instances yield the larger argument, the `<` instances the smaller one.
create_min_max_op(__pgi_icmaxic, signed char, > )
create_min_max_op(__pgi_icminic, signed char, < )
create_min_max_op(__pgi_ucmaxuc, unsigned char, > )
create_min_max_op(__pgi_ucminuc, unsigned char, < )
create_min_max_op(__pgi_ismaxis, signed short, > )
create_min_max_op(__pgi_isminis, signed short, < )
create_min_max_op(__pgi_usmaxus, unsigned short, > )
create_min_max_op(__pgi_usminus, unsigned short, < )
create_min_max_op(__pgi_imaxi, int, > )
create_min_max_op(__pgi_imini, int, < )
create_min_max_op(__pgi_umaxu, unsigned int, > )
create_min_max_op(__pgi_uminu, unsigned int, < )
create_min_max_op(__pgi_lmaxl, long long, > )
create_min_max_op(__pgi_lminl, long long, < )
create_min_max_op(__pgi_ulmaxul, unsigned long long, > )
create_min_max_op(__pgi_ulminul, unsigned long long, < )
create_min_max_op(__pgi_fminf, float, < )
create_min_max_op(__pgi_fmaxf, float, > )
create_min_max_op(__pgi_dmind, double, < )
create_min_max_op(__pgi_dmaxd, double, > )

// Single-precision complex value: real part first, imaginary part second.
// __nvhpc_fcomplex_atomicadd addresses the imaginary part at byte offset
// sizeof(real) from the start of the object, so the member order matters.
struct __fcomplex {
    float real;
    float imag;
};

// Component-wise sum of two single-precision complex values.
__inline__ __device__ __fcomplex __nvhpc_fcomplex_add(__fcomplex val1, __fcomplex val2)
{
    __fcomplex sum = { val1.real + val2.real,
                       val1.imag + val2.imag };
    return sum;
}

// Product of two single-precision complex values:
// (a + bi)(c + di) = (ac - bd) + (ad + bc)i.
__inline__ __device__ __fcomplex __nvhpc_fcomplex_mul(__fcomplex val1, __fcomplex val2)
{
    __fcomplex product = { val1.real * val2.real - val1.imag * val2.imag,
                           val1.real * val2.imag + val1.imag * val2.real };
    return product;
}

// Double-precision complex value: real part first, imaginary part second.
// __nvhpc_dcomplex_atomicadd addresses the imaginary part at byte offset
// sizeof(real) from the start of the object, so the member order matters.
struct __dcomplex {
    double real;
    double imag;
};

// Component-wise sum of two double-precision complex values.
__inline__ __device__ __dcomplex __nvhpc_dcomplex_add(__dcomplex val1, __dcomplex val2)
{
    __dcomplex sum = { val1.real + val2.real,
                       val1.imag + val2.imag };
    return sum;
}

// Product of two double-precision complex values:
// (a + bi)(c + di) = (ac - bd) + (ad + bc)i.
__inline__ __device__ __dcomplex __nvhpc_dcomplex_mul(__dcomplex val1, __dcomplex val2)
{
    __dcomplex product = { val1.real * val2.real - val1.imag * val2.imag,
                           val1.real * val2.imag + val1.imag * val2.real };
    return product;
}

#include "nvhpc_declarations.h"

#if !defined(PGI_COMPILE_BITCODE)

// omp.h declarations

// Loop schedule selector exchanged through omp_get_schedule() and
// omp_set_schedule(), both declared further down in this header.
typedef enum omp_sched_t {
    // schedule kinds
    omp_sched_static = 0x1,
    omp_sched_dynamic = 0x2,
    omp_sched_guided = 0x3,
    omp_sched_auto = 0x4,

    // schedule modifier: uses the top bit of a 32-bit value, so it does not
    // overlap any of the kind values above
    omp_sched_monotonic = 0x80000000u
} omp_sched_t;

// Lock object: storage of two pointer-sized slots. This header assigns no
// meaning to the individual slots.
typedef struct {
    void *impl[2];
} omp_lock_t;

// Nestable lock object: storage of four pointer-sized slots. This header
// assigns no meaning to the individual slots.
typedef struct {
    void *impl[4];
} omp_nest_lock_t;

// Binding policy values; this is the return type of omp_get_proc_bind()
// declared further down in this header.
typedef enum omp_proc_bind_t {
    omp_proc_bind_false = 0,
    omp_proc_bind_true = 1,
    omp_proc_bind_master = 2,
    omp_proc_bind_close = 3,
    omp_proc_bind_spread = 4
} omp_proc_bind_t;

// Kind argument accepted by omp_pause_resource() and
// omp_pause_resource_all(), both declared further down in this header.
typedef enum omp_pause_resource_t {
    omp_pause_soft = 1,
    omp_pause_hard = 2
} omp_pause_resource_t;

// Synchronization hints. The non-zero hints are distinct single bits
// (0x1, 0x2, 0x4, 0x8). Every omp_lock_hint_* enumerator is an alias that
// carries the same value as the omp_sync_hint_* enumerator it is defined from.
typedef enum omp_sync_hint_t {
    omp_sync_hint_none = 0x0,
    omp_lock_hint_none = omp_sync_hint_none,
    omp_sync_hint_uncontended = 0x1,
    omp_lock_hint_uncontended = omp_sync_hint_uncontended,
    omp_sync_hint_contended = 0x2,
    omp_lock_hint_contended = omp_sync_hint_contended,
    omp_sync_hint_nonspeculative = 0x4,
    omp_lock_hint_nonspeculative = omp_sync_hint_nonspeculative,
    omp_sync_hint_speculative = 0x8,
    omp_lock_hint_speculative = omp_sync_hint_speculative
} omp_sync_hint_t;

// Second name for the same hint type.
typedef omp_sync_hint_t omp_lock_hint_t;

// Depend object type, represented in this header as a plain int.
typedef int omp_depend_t;

// Unsigned integer type used for the `value` member of omp_alloctrait_t.
// NOTE(review): `unsigned long` is pointer-sized only where long is as wide
// as a pointer — confirm this header is never used on a target where that
// does not hold.
typedef unsigned long int omp_uintptr_t;

// Event handle type; this is the argument type of omp_fulfill_event()
// declared further down in this header.
// NOTE(review): `omp_task_fullfill_event` is spelled with a double "l",
// unlike omp_fulfill_event(). The enumerator name is part of the interface,
// so confirm with users of this header before renaming it.
typedef enum omp_event_handle_t {
    omp_allow_completion_event = 0,
    omp_task_fullfill_event = 1
} omp_event_handle_t;

// Allocator trait keys; used as the `key` member of omp_alloctrait_t.
typedef enum omp_alloctrait_key_t {
    omp_atk_sync_hint = 1,
    omp_atk_alignment = 2,
    omp_atk_access = 3,
    omp_atk_pool_size = 4,
    omp_atk_fallback = 5,
    omp_atk_fb_data = 6,
    omp_atk_pinned = 7,
    omp_atk_partition = 8
} omp_alloctrait_key_t;

// Named allocator trait values, numbered consecutively from 0.
// omp_atv_thread, omp_atv_pteam and omp_atv_cgroup are also used to define
// enumerator values in omp_allocator_handle_t, so renumbering them changes
// those handles as well.
typedef enum omp_alloctrait_value_t {
    omp_atv_false = 0,
    omp_atv_true = 1,
    omp_atv_default = 2,
    omp_atv_contended = 3,
    omp_atv_uncontended = 4,
    omp_atv_sequential = 5,
    omp_atv_private = 6,
    omp_atv_all = 7,
    omp_atv_thread = 8,
    omp_atv_pteam = 9,
    omp_atv_cgroup = 10,
    omp_atv_default_mem_fb = 11,
    omp_atv_null_fb = 12,
    omp_atv_abort_fb = 13,
    omp_atv_allocator_fb = 14,
    omp_atv_environment = 15,
    omp_atv_nearest = 16,
    omp_atv_blocked = 17,
    omp_atv_interleaved = 18
} omp_alloctrait_value_t;

// One allocator trait: a key paired with its value. omp_init_allocator()
// takes a pointer to these together with a count (ntraits).
typedef struct omp_alloctrait_t {
    omp_alloctrait_key_t key;
    omp_uintptr_t value;
} omp_alloctrait_t;

// Memory space handles. Every named space is defined as an alias of
// omp_default_mem_space, so all five enumerators have the value 0.
typedef enum omp_memspace_handle_t {
    omp_default_mem_space = 0,
    omp_large_cap_mem_space = omp_default_mem_space,
    omp_const_mem_space = omp_default_mem_space,
    omp_high_bw_mem_space = omp_default_mem_space,
    omp_low_lat_mem_space = omp_default_mem_space
} omp_memspace_handle_t;

// Allocator handles. The large_cap, const, high_bw and low_lat allocators
// are aliases of omp_default_mem_alloc (value 1). The thread, pteam and
// cgroup allocators take their values from the omp_atv_thread,
// omp_atv_pteam and omp_atv_cgroup trait values (8, 9 and 10).
typedef enum omp_allocator_handle_t {
    omp_null_allocator = 0,
    omp_default_mem_alloc = 1,
    omp_large_cap_mem_alloc = omp_default_mem_alloc,
    omp_const_mem_alloc = omp_default_mem_alloc,
    omp_high_bw_mem_alloc = omp_default_mem_alloc,
    omp_low_lat_mem_alloc = omp_default_mem_alloc,
    omp_thread_mem_alloc = omp_atv_thread,
    omp_pteam_mem_alloc = omp_atv_pteam,
    omp_cgroup_mem_alloc = omp_atv_cgroup,
} omp_allocator_handle_t;

// Device-side OpenMP runtime entry points. All of them are declared with C
// linkage and the __device__ qualifier; their definitions are not part of
// this header. The *_ftn lock variants take a pointer to a lock pointer
// (omp_lock_t **) instead of a pointer to the lock itself.
extern "C" __device__ double omp_get_wtick(void);
extern "C" __device__ double omp_get_wtime(void);
extern "C" __device__ int omp_get_active_level(void);
extern "C" __device__ int omp_get_ancestor_thread_num(int level);
extern "C" __device__ int omp_get_cancellation(void);
extern "C" __device__ int omp_get_default_device(void);
extern "C" __device__ int omp_get_device_num(void);
extern "C" __device__ int omp_get_dynamic(void);
extern "C" __device__ int omp_get_initial_device(void);
extern "C" __device__ int omp_get_level(void);
extern "C" __device__ int omp_get_max_active_levels(void);
extern "C" __device__ int omp_get_max_task_priority(void);
extern "C" __device__ int omp_get_max_threads(void);
extern "C" __device__ int omp_get_nested(void);
extern "C" __device__ int omp_get_num_devices(void);
extern "C" __device__ int omp_get_num_places(void);
extern "C" __device__ int omp_get_num_procs(void);
extern "C" __device__ int omp_get_num_teams(void);
extern "C" __device__ int omp_get_num_threads(void);
extern "C" __device__ int omp_get_partition_num_places(void);
extern "C" __device__ int omp_get_place_num(void);
extern "C" __device__ int omp_get_place_num_procs(int place_num);
extern "C" __device__ int omp_get_place_proc_ids(int place_num, int *ids);
extern "C" __device__ int omp_get_supported_active_levels(void);
extern "C" __device__ int omp_get_team_num(void);
extern "C" __device__ int omp_get_thread_limit(void);
extern "C" __device__ unsigned int omp_in_final(void);
extern "C" __device__ unsigned int omp_in_parallel(void);
extern "C" __device__ unsigned int omp_is_initial_device(void);
extern "C" __device__ int omp_pause_resource(omp_pause_resource_t kind, int device_num);
extern "C" __device__ int omp_pause_resource_all(omp_pause_resource_t kind);
extern "C" __device__ int omp_target_associate_ptr(const void *host_ptr, const void *device_ptr, size_t size, size_t device_offset, int device_num);
extern "C" __device__ int omp_target_disassociate_ptr(const void *ptr, int device_num);
extern "C" __device__ unsigned int omp_target_is_present(const void *ptr, int device_num);
extern "C" __device__ int omp_target_memcpy(void *dst, const void *src, size_t length, size_t dst_offset, size_t src_offset, int dst_device_num, int src_device_num);
extern "C" __device__ int omp_target_memcpy_rect(void *dst, const void *src, size_t element_size, int num_dims, const size_t *volume, const size_t *dst_offsets, const size_t *src_offsets, const size_t *dst_dimensions, const size_t *src_dimensions, int dst_device_num, int src_device_num);
extern "C" __device__ int omp_test_lock(omp_lock_t *lock);
extern "C" __device__ int omp_test_lock_ftn(omp_lock_t **lock);
extern "C" __device__ int omp_test_nest_lock(omp_nest_lock_t *lock);
extern "C" __device__ omp_allocator_handle_t omp_get_default_allocator(void);
extern "C" __device__ omp_allocator_handle_t omp_init_allocator(omp_memspace_handle_t memspace, int ntraits, const omp_alloctrait_t *traits);
extern "C" __device__ omp_proc_bind_t omp_get_proc_bind(void);
extern "C" __device__ size_t omp_capture_affinity(char *buffer, size_t size, const char*format);
extern "C" __device__ size_t omp_get_affinity_format(char *buffer, size_t size);
extern "C" __device__ void *omp_alloc(size_t size, omp_allocator_handle_t allocator);
extern "C" __device__ void nvomp_proc_bind_off(void);
extern "C" __device__ void nvomp_set_memory_preferred_location_device(void *ptr, int size);
extern "C" __device__ void omp_destroy_allocator(omp_allocator_handle_t allocator);
extern "C" __device__ void omp_destroy_lock(omp_lock_t *lock);
extern "C" __device__ void omp_destroy_lock_ftn(omp_lock_t **lock);
extern "C" __device__ void omp_destroy_nest_lock(omp_nest_lock_t *lock);
extern "C" __device__ void omp_display_affinity(const char *format);
extern "C" __device__ void omp_free(void *ptr, omp_allocator_handle_t allocator);
extern "C" __device__ void omp_fulfill_event(omp_event_handle_t event);
extern "C" __device__ void omp_get_partition_place_nums(int *place_nums);
extern "C" __device__ void omp_get_schedule(omp_sched_t *kind, int *chunk);
extern "C" __device__ void omp_init_lock(omp_lock_t *lock);
extern "C" __device__ void omp_init_lock_ftn(omp_lock_t **lock);
extern "C" __device__ void omp_init_lock_with_hint(omp_lock_t *lock, omp_sync_hint_t hint);
extern "C" __device__ void omp_init_lock_with_hint_ftn(omp_lock_t **lock, omp_sync_hint_t hint);
extern "C" __device__ void omp_init_nest_lock(omp_nest_lock_t *lock);
// Initializes a nestable lock with a synchronization hint. The lock
// parameter is omp_nest_lock_t *, matching the OpenMP API signature and the
// other *_nest_lock entry points declared in this header.
extern "C" __device__ void omp_init_nest_lock_with_hint(omp_nest_lock_t *lock, omp_sync_hint_t hint);
// Further device-side OpenMP entry points (C linkage, defined outside this
// header): setters, lock acquire/release, and target memory routines.
extern "C" __device__ void omp_set_affinity_format(const char *format);
extern "C" __device__ void omp_set_default_allocator(omp_allocator_handle_t allocator);
extern "C" __device__ void omp_set_default_device(int device_num);
extern "C" __device__ void omp_set_dynamic(int dynamic_threads);
extern "C" __device__ void omp_set_lock(omp_lock_t *lock);
extern "C" __device__ void omp_set_lock_ftn(omp_lock_t **lock);
extern "C" __device__ void omp_set_max_active_levels(int max_levels);
extern "C" __device__ void omp_set_nest_lock(omp_nest_lock_t *lock);
extern "C" __device__ void omp_set_nested(int nested);
extern "C" __device__ void omp_set_num_threads(int num_threads);
extern "C" __device__ void omp_set_schedule(omp_sched_t kind, int chunk);
extern "C" __device__ void omp_target_free(void *device_ptr, int device_num);
extern "C" __device__ void omp_unset_lock(omp_lock_t *lock);
extern "C" __device__ void omp_unset_lock_ftn(omp_lock_t **lock);
extern "C" __device__ void omp_unset_nest_lock(omp_nest_lock_t *lock);
extern "C" __device__ void* omp_target_alloc(size_t size, int device_num);

// Thin wrapper around the PTX `prmt.b32` instruction: x and y are the two
// 32-bit sources, s is the selector operand, and the instruction's
// destination register is returned.
// Per the PTX ISA (default mode), each of the four low nibbles of s picks one
// byte out of the 8-byte pool formed by x (bytes 0-3) and y (bytes 4-7);
// nibble i supplies byte i of the result.
// The statement is declared volatile with a "memory" clobber.
__inline__ __device__ int __nvhpc_int_permute(int x, int y, int s)
{
    int r;
    asm volatile ("{ prmt.b32 %0,%1,%2,%3; }\n"
                  : "=r"(r)
                  : "r"(x),
                    "r"(y),
                    "r"(s)
                  : "memory");

    return r;
}

// 16-bit compare-and-swap issued directly as PTX `atom.cas.b16`.
//   p - address of the 16-bit location (passed as a 64-bit register operand)
//   v - value the location is compared against
//   w - value stored when the comparison succeeds
// Returns the instruction's destination register, i.e. the value the
// instruction read from the location.
// NOTE(review): the PTX ISA lists atom.cas.b16 as requiring sm_70 or newer —
// confirm against the architectures this header is built for.
__inline__ __device__ unsigned short int __nvhpc_atomicCASushort_llvm(void *p, unsigned short int v, unsigned short int w)
{
    unsigned short r;
    asm volatile ("{ atom.cas.b16 %0,[%1],%2,%3; }\n"
                  : "=h"(r)
                  : "l"(p),
                    "h"(v),
                    "h"(w)
                  : "memory");

    return r;
}
#else
// Bitcode build: the permute and 16-bit CAS helpers are only declared here,
// with C linkage, and must be defined elsewhere.
// NOTE(review): these declarations use the __pgi_ prefix, while the inline
// definitions in the other branch and the create_atomic_* macros below use
// __nvhpc_int_permute / __nvhpc_atomicCASushort_llvm. Confirm that the
// __nvhpc_ names are declared for this build (e.g. by nvhpc_declarations.h).
extern "C" __device__ int __pgi_int_permute(int x, int y, int s);
extern "C" __device__ unsigned short int __pgi_atomicCASushort_llvm(void *p, unsigned short int v, unsigned short int w);
#endif

// Atomically adds val to the 64-bit unsigned integer stored at address and
// returns the result of atomicAdd, i.e. the value held before the addition.
__inline__ __device__ unsigned long long __pgi_atomicAdd_ull(void *address, unsigned long long val) {
    unsigned long long int *target = (unsigned long long int*)address;
    unsigned long long int addend = (unsigned long long int)val;
    return atomicAdd(target, addend);
}

// Signed 64-bit atomic add. Reinterprets val as unsigned, delegates to
// __pgi_atomicAdd_ull and converts the value it returns back to signed.
__inline__ __device__ long long __pgi_atomicAdd_ll(void *address, long long val) {
    const unsigned long long raw = __pgi_atomicAdd_ull(address, (unsigned long long)val);
    return (long long)raw;
}

// Atomic multiply on a 64-bit unsigned integer, built from atomicCAS.
// Reads the current value, then keeps trying to swap in (current * val)
// until the CAS reports the same value that the product was computed from.
// Returns that value, i.e. the contents before the multiplication.
__inline__ __device__ unsigned long long __pgi_atomicMul_ull(void *address, unsigned long long val)
{
    unsigned long long *target = (unsigned long long *)address;
    unsigned long long seen = *target;

    for (;;) {
        unsigned long long expected = seen;
        seen = atomicCAS(target, expected, expected * val);
        if (expected == seen)
            break;
    }
    return seen;
}

// Signed 64-bit atomic multiply. Reinterprets val as unsigned, delegates to
// __pgi_atomicMul_ull and converts the value it returns back to signed.
__inline__ __device__ long long __pgi_atomicMul_ll(void *address, long long val)
{
    const unsigned long long raw = __pgi_atomicMul_ull(address, (unsigned long long)val);
    return (long long)raw;
}

// Atomic multiply on a 32-bit unsigned integer, built from atomicCAS.
// Retries the swap of (current * val) until the CAS reports the same value
// the product was computed from, and returns that pre-multiplication value.
__inline__ __device__ unsigned int __pgi_atomicMul_u(void *address, unsigned int val)
{
    unsigned int *target = (unsigned int *)address;
    unsigned int seen = *target;

    for (;;) {
        unsigned int expected = seen;
        seen = atomicCAS(target, expected, expected * val);
        if (expected == seen)
            break;
    }
    return seen;
}

// Bitwise-OR atomic for an unsigned 64-bit operand. Forwards to the signed
// variant __pgi_atomicOril with val reinterpreted as long long and hands back
// whatever that call returns (presumably the previous contents — confirm
// against its declaration). The result passes through unsigned long long
// before being returned as long long.
__inline__ __device__ long long __pgi_atomicOr_ul(void *address, unsigned long long val)
{
    const unsigned long long result = (unsigned long long)__pgi_atomicOril(address, (long long)val);
    return result;
}

// Bitwise-AND atomic for an unsigned 64-bit operand. Forwards to the signed
// variant __pgi_atomicAndil with val reinterpreted as long long and hands
// back whatever that call returns (presumably the previous contents — confirm
// against its declaration). The result passes through unsigned long long
// before being returned as long long.
__inline__ __device__ long long __pgi_atomicAnd_ul(void *address, unsigned long long val)
{
    const unsigned long long result = (unsigned long long)__pgi_atomicAndil(address, (long long)val);
    return result;
}

// Adds a single-precision complex value to the one stored at address using
// two independent float atomic adds: the real part at address and the
// imaginary part sizeof(float) bytes after it. Each component update is
// atomic on its own; the pair is not updated as one atomic unit.
__inline__ __device__ void __nvhpc_fcomplex_atomicadd(void* address, __fcomplex val)
{
    void *imag_slot = (void*)((char*)address + sizeof(val.real));

    __pgi_atomicAddf(address, val.real);
    __pgi_atomicAddf(imag_slot, val.imag);
}


// Adds a double-precision complex value to the one stored at address using
// two independent double atomic adds: the real part at address and the
// imaginary part sizeof(double) bytes after it. Each component update is
// atomic on its own; the pair is not updated as one atomic unit.
__inline__ __device__ void __nvhpc_dcomplex_atomicadd(void* address, __dcomplex val)
{
    void *imag_slot = (void*)((char*)address + sizeof(val.real));

    __pgi_atomicAddd(address, val.real);
    __pgi_atomicAddd(imag_slot, val.imag);
}


/*
 * create_atomic_byte_op(NAME, TY, OP, OP2)
 * Emits `TY NAME(void *address, TY val)`: a read-modify-write on a single
 * byte, built from a 16-bit compare-and-swap on the 2-byte-aligned halfword
 * that contains the byte.
 *
 *   mod       - 0 or 1, the low bit of the address: position of the target
 *               byte inside the halfword.
 *   base_addr - the address moved back by `mod` bytes (halfword start).
 *   selector  - permute control that rebuilds the halfword with the byte at
 *               position `mod` taken from the low byte of the new value and
 *               the other byte taken from the old halfword.
 *
 * Each iteration extracts the current byte with __nvhpc_int_permute, masks it
 * with 0xFF (so it is zero-extended, also for signed TY), computes
 * OP2(current OP val), splices the low byte of the result back into the
 * halfword and tries to publish it with __nvhpc_atomicCASushort_llvm. The
 * loop repeats until the CAS reports the halfword that was assumed. The
 * function returns the target byte of that halfword, converted to TY.
 *
 * OP2 may be empty, a unary operator such as ~, or a function name. In the
 * last case OP is expected to expand to a comma so that the expression
 * becomes a two-argument call.
 */
#define create_atomic_byte_op(NAME, TY, OP, OP2)                               \
    __inline__ __device__                                                      \
    TY NAME(void *address, TY val) {                                           \
        unsigned int mod = (size_t) address & 1;                               \
        unsigned short* base_addr = (unsigned short*) ((char*) address - mod); \
        unsigned int selector = mod == 1 ? 0x3240 : 0x3214;                    \
        unsigned short old = *base_addr, assumed, newv;                        \
        do {                                                                   \
            assumed = old;                                                     \
            newv = OP2(((unsigned short)(__nvhpc_int_permute((int)assumed, 0, mod)) & 0xFF) OP val); \
            newv = (unsigned short)__nvhpc_int_permute((int)old, (int)newv, selector); \
            old = __nvhpc_atomicCASushort_llvm(base_addr, assumed, newv);      \
        } while (old != assumed);                                              \
        return __nvhpc_int_permute(old, 0, mod);                               \
    }

/*
 * create_atomic_op(NAME, TY, OP, ACAS, OP2)
 * Emits `TY NAME(void *address, TY val)`: a generic read-modify-write built
 * on the compare-and-swap routine ACAS. The current value is read once, then
 * OP2(current OP val) is offered to ACAS until ACAS reports the same value
 * the new value was computed from. That value (the contents before the
 * update) is returned.
 *
 * OP2 may be empty, a unary operator such as ~, or a function name. In the
 * last case OP is expected to expand to a comma so that the expression
 * becomes a two-argument call.
 */
#define create_atomic_op(NAME, TY, OP, ACAS, OP2)                              \
    __inline__ __device__                                                      \
    TY NAME(void *address, TY val) {                                           \
        TY *target = (TY *)address;                                            \
        TY seen = *target;                                                     \
        for (;;) {                                                             \
            TY expected = seen;                                                \
            seen = (TY)ACAS(target, expected, (TY)OP2(expected OP val));       \
            if (expected == seen)                                              \
                break;                                                         \
        }                                                                      \
        return seen;                                                           \
    }

/*
 * Reduction across whole grid.
 *
 * create_grid_and_block_reduce(NAME, TY, OP, INITVAL, AOP, OP2)
 * Emits `void NAME(signed char* out, TY in)` which folds `in` in three
 * stages:
 *   1. Within each warp, five __shfl_down_sync steps (offsets 16, 8, 4, 2, 1)
 *      combine values as OP2(shuffled OP in); the lane whose threadIdx.x is a
 *      multiple of warpSize ends up with the warp's partial result.
 *   2. Every thread stores INITVAL into the shared accumulator block_res[0];
 *      after a barrier, the first lane of each warp applies the atomic AOP to
 *      fold its partial result into block_res.
 *   3. After another barrier, the thread with threadIdx.x == 0 applies AOP to
 *      fold block_res[0] into *out. A final barrier follows, so block_res is
 *      not rewritten by a later call before it has been read.
 *
 * The shuffles use the full mask 0xFFFFFFFF and the function calls
 * __syncthreads() three times unconditionally, so it has to be called by
 * every thread of the block.
 * NOTE(review): only threadIdx.x is inspected — presumably blocks are
 * one-dimensional with a size that is a multiple of the warp size; confirm
 * against the launch configuration used by the callers.
 * `out` is typed signed char* for every TY and is passed to AOP unchanged.
 */
#define create_grid_and_block_reduce(NAME, TY, OP, INITVAL, AOP, OP2)          \
    EXTERNC __device__ STATIC USED __inline__                                  \
    void NAME ( signed char* out, TY in) {                                     \
        const unsigned mask = 0xFFFFFFFF;                                      \
        in = OP2 (__shfl_down_sync(mask, in, 16) OP in);                       \
        in = OP2 (__shfl_down_sync(mask, in, 8) OP in);                        \
        in = OP2 (__shfl_down_sync(mask, in, 4) OP in);                        \
        in = OP2 (__shfl_down_sync(mask, in, 2) OP in);                        \
        in = OP2 (__shfl_down_sync(mask, in, 1) OP in);                        \
        __shared__ TY block_res[1];                                            \
        block_res[0] = ( TY ) INITVAL ;                                        \
        __syncthreads();                                                       \
        if ((threadIdx.x & (warpSize - 1)) == 0)                               \
            AOP (block_res, in);                                               \
        __syncthreads();                                                       \
        if (threadIdx.x == 0)                                                  \
            AOP (out, block_res[0]);                                           \
        __syncthreads();                                                       \
    }

/*
 * Reduction across threads in block.
 *
 * create_block_reduce(NAME, TY, OP, INITVAL, AOP, OP2)
 * Emits `void NAME(signed char* out, TY in)`. Within each warp, five
 * __shfl_down_sync steps (offsets 16, 8, 4, 2, 1) combine values as
 * OP2(shuffled OP in); the lane whose threadIdx.x is a multiple of warpSize
 * then applies the atomic AOP to fold the warp's partial result directly
 * into *out. There is no shared-memory stage and no barrier.
 *
 * INITVAL is accepted for signature parity with
 * create_grid_and_block_reduce but is not used in the body.
 * The shuffles use the full mask 0xFFFFFFFF.
 * NOTE(review): presumably every lane of each participating warp calls this
 * function — confirm against the callers.
 */
#define create_block_reduce(NAME, TY, OP, INITVAL, AOP, OP2)                   \
    EXTERNC __device__ STATIC USED __inline__                                  \
    void NAME ( signed char* out, TY in) {                                     \
        const unsigned mask = 0xFFFFFFFF;                                      \
        in = OP2 (__shfl_down_sync(mask, in, 16) OP in);                       \
        in = OP2 (__shfl_down_sync(mask, in, 8) OP in);                        \
        in = OP2 (__shfl_down_sync(mask, in, 4) OP in);                        \
        in = OP2 (__shfl_down_sync(mask, in, 2) OP in);                        \
        in = OP2 (__shfl_down_sync(mask, in, 1) OP in);                        \
        if ((threadIdx.x & (warpSize - 1)) == 0)                               \
            AOP (out, in);                                                     \
    }

/*
 * Reduction across blocks.
 *
 * create_grid_reduce(NAME, TY, AOP)
 * Emits `void NAME(signed char* out, TY in)`. Only the thread for which
 * omp_get_thread_num() returns 0 contributes: it applies the atomic AOP to
 * fold `in` into *out. All other threads return without touching *out.
 */
#define create_grid_reduce(NAME, TY, AOP)                                      \
    EXTERNC __device__ STATIC USED __inline__                                  \
    void NAME ( signed char* out, TY in) {                                     \
        if (omp_get_thread_num() != 0)                                         \
            return;                                                            \
        AOP (out, in);                                                         \
    }

// `comma` lets a literal comma be passed as the OP argument of the generator
// macros: together with a function name as OP2, the expression
// OP2(a OP b) then expands to the two-argument call OP2(a , b).
#define comma ,
// Byte-wide atomics for unsigned char and signed char. Add, Mul, And, Or,
// Xor, LAnd and LOr apply the operator directly; Max and Min call the
// matching min/max helper; NXor complements the result of the xor.
create_atomic_byte_op(__pgi_atomicAdd_uchar, unsigned char, +, )
create_atomic_byte_op(__pgi_atomicAdd_schar, signed char, +, )
create_atomic_byte_op(__pgi_atomicMul_uchar, unsigned char, *, )
create_atomic_byte_op(__pgi_atomicMul_schar, signed char, *, )
create_atomic_byte_op(__pgi_atomicMax_uchar, unsigned char, comma, __pgi_ucmaxuc)
create_atomic_byte_op(__pgi_atomicMax_schar, signed char, comma, __pgi_icmaxic)
create_atomic_byte_op(__pgi_atomicMin_uchar, unsigned char, comma, __pgi_ucminuc)
create_atomic_byte_op(__pgi_atomicMin_schar, signed char, comma, __pgi_icminic)
create_atomic_byte_op(__pgi_atomicAnd_uchar, unsigned char, &, )
create_atomic_byte_op(__pgi_atomicAnd_schar, signed char, &, )
create_atomic_byte_op(__pgi_atomicOr_uchar, unsigned char, |, )
create_atomic_byte_op(__pgi_atomicOr_schar, signed char, |, )
create_atomic_byte_op(__pgi_atomicXor_uchar, unsigned char, ^, )
create_atomic_byte_op(__pgi_atomicXor_schar, signed char, ^, )
create_atomic_byte_op(__pgi_atomicNXor_uchar, unsigned char, ^, ~)
create_atomic_byte_op(__pgi_atomicNXor_schar, signed char, ^, ~)
create_atomic_byte_op(__pgi_atomicLAnd_uchar, unsigned char, &&, )
create_atomic_byte_op(__pgi_atomicLAnd_schar, signed char, &&, )
create_atomic_byte_op(__pgi_atomicLOr_uchar, unsigned char, ||, )
create_atomic_byte_op(__pgi_atomicLOr_schar, signed char, ||, )

// 16-bit atomics for unsigned short and signed short. All of them use
// __nvhpc_atomicCASushort_llvm as the compare-and-swap primitive; the signed
// variants pass their values through its unsigned short parameters.
create_atomic_op(__pgi_atomicAdd_ushort, unsigned short, +, __nvhpc_atomicCASushort_llvm, )
create_atomic_op(__pgi_atomicAdd_sshort, signed short, +, __nvhpc_atomicCASushort_llvm, )
create_atomic_op(__pgi_atomicMul_ushort, unsigned short, *, __nvhpc_atomicCASushort_llvm, )
create_atomic_op(__pgi_atomicMul_sshort, signed short, *, __nvhpc_atomicCASushort_llvm, )
create_atomic_op(__pgi_atomicMax_ushort, unsigned short, comma, __nvhpc_atomicCASushort_llvm, __pgi_usmaxus)
create_atomic_op(__pgi_atomicMax_sshort, signed short, comma, __nvhpc_atomicCASushort_llvm, __pgi_ismaxis)
create_atomic_op(__pgi_atomicMin_ushort, unsigned short, comma, __nvhpc_atomicCASushort_llvm, __pgi_usminus)
create_atomic_op(__pgi_atomicMin_sshort, signed short, comma, __nvhpc_atomicCASushort_llvm, __pgi_isminis)
create_atomic_op(__pgi_atomicAnd_ushort, unsigned short, &, __nvhpc_atomicCASushort_llvm, )
create_atomic_op(__pgi_atomicAnd_sshort, signed short, &, __nvhpc_atomicCASushort_llvm, )
create_atomic_op(__pgi_atomicOr_ushort, unsigned short, |, __nvhpc_atomicCASushort_llvm, )
create_atomic_op(__pgi_atomicOr_sshort, signed short, |, __nvhpc_atomicCASushort_llvm, )
create_atomic_op(__pgi_atomicXor_ushort, unsigned short, ^, __nvhpc_atomicCASushort_llvm, )
create_atomic_op(__pgi_atomicXor_sshort, signed short, ^, __nvhpc_atomicCASushort_llvm, )
create_atomic_op(__pgi_atomicNXor_ushort, unsigned short, ^, __nvhpc_atomicCASushort_llvm, ~)
create_atomic_op(__pgi_atomicNXor_sshort, signed short, ^, __nvhpc_atomicCASushort_llvm, ~)
create_atomic_op(__pgi_atomicLAnd_ushort, unsigned short, &&, __nvhpc_atomicCASushort_llvm, )
create_atomic_op(__pgi_atomicLAnd_sshort, signed short, &&, __nvhpc_atomicCASushort_llvm, )
create_atomic_op(__pgi_atomicLOr_ushort, unsigned short, ||, __nvhpc_atomicCASushort_llvm, )
create_atomic_op(__pgi_atomicLOr_sshort, signed short, ||, __nvhpc_atomicCASushort_llvm, )

// Logical AND / logical OR / NXOR atomics for the 32-bit and 64-bit integer
// types, plus logical AND / OR for float and double. int, unsigned int and
// unsigned long long use the CUDA atomicCAS overloads; long long, float and
// double use the __pgi_atomicCASil / __pgi_atomicCASf / __pgi_atomicCASd
// routines, which are not defined in this part of the file.
create_atomic_op(__pgi_atomicLAnd_int, int, &&, atomicCAS, )
create_atomic_op(__pgi_atomicLOr_int, int, ||, atomicCAS, )
create_atomic_op(__pgi_atomicNXor_int, int, ^, atomicCAS, ~)
create_atomic_op(__pgi_atomicLAnd_uint, unsigned int, &&, atomicCAS, )
create_atomic_op(__pgi_atomicLOr_uint, unsigned int, ||, atomicCAS, )
create_atomic_op(__pgi_atomicNXor_uint, unsigned int, ^, atomicCAS, ~)
create_atomic_op(__pgi_atomicLAnd_long, long long, &&, __pgi_atomicCASil, )
create_atomic_op(__pgi_atomicLOr_long, long long, ||, __pgi_atomicCASil, )
create_atomic_op(__pgi_atomicNXor_long, long long, ^, __pgi_atomicCASil, ~)
create_atomic_op(__pgi_atomicLAnd_ulong, unsigned long long, &&, atomicCAS, )
create_atomic_op(__pgi_atomicLOr_ulong, unsigned long long, ||, atomicCAS, )
create_atomic_op(__pgi_atomicNXor_ulong, unsigned long long, ^, atomicCAS, ~)
create_atomic_op(__pgi_atomicLAnd_float, float, &&, __pgi_atomicCASf, )
create_atomic_op(__pgi_atomicLOr_float, float, ||, __pgi_atomicCASf, )
create_atomic_op(__pgi_atomicLAnd_double, double, &&, __pgi_atomicCASd, )
create_atomic_op(__pgi_atomicLOr_double, double, ||, __pgi_atomicCASd, )

// Sum reductions (OP is +). The per-block accumulator starts at 0.
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_add_uchar, unsigned char, +, 0, __pgi_atomicAdd_uchar, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_add_schar, signed char, +, 0, __pgi_atomicAdd_schar, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_add_ushort, unsigned short, +, 0, __pgi_atomicAdd_ushort, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_add_sshort, signed short, +, 0, __pgi_atomicAdd_sshort, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_add_int, int, +, 0, __pgi_atomicAddi, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_add_long, long long, +, 0, __pgi_atomicAdd_ll, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_add_uint, unsigned int, +, 0, __pgi_atomicAddu, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_add_ulong, unsigned long long, +, 0, __pgi_atomicAdd_ull, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_add_float, float, +, 0.0f, __pgi_atomicAddf, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_add_double, double, +, 0.0, __pgi_atomicAddd, )
// Product reductions (OP is *). The per-block accumulator starts at 1.
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_mul_uchar, unsigned char, *, 1, __pgi_atomicMul_uchar, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_mul_schar, signed char, *, 1, __pgi_atomicMul_schar, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_mul_ushort, unsigned short, *, 1, __pgi_atomicMul_ushort, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_mul_sshort, signed short, *, 1, __pgi_atomicMul_sshort, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_mul_int, int, *, 1, __pgi_atomicMuli, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_mul_long, long long, *, 1, __pgi_atomicMul_ll, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_mul_uint, unsigned int, *, 1, __pgi_atomicMul_u, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_mul_ulong, unsigned long long, *, 1, __pgi_atomicMul_ull, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_mul_float, float, *, 1.0f, __pgi_atomicMulf, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_mul_double, double, *, 1.0, __pgi_atomicMuld, )
// Maximum reductions. Values are combined by calling the matching max helper
// (OP is `comma`). The per-block accumulator starts at the lowest value of
// each integer type; for float and double it starts at the negated largest
// finite value, not at negative infinity.
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_max_uchar, unsigned char, comma, 0, __pgi_atomicMax_uchar, __pgi_ucmaxuc)
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_max_schar, signed char, comma, -128, __pgi_atomicMax_schar, __pgi_icmaxic)
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_max_ushort, unsigned short, comma, 0, __pgi_atomicMax_ushort, __pgi_usmaxus)
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_max_sshort, signed short, comma, -32768, __pgi_atomicMax_sshort, __pgi_ismaxis)
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_max_int, int, comma , -2147483648, __pgi_atomicMaxi, __pgi_imaxi)
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_max_uint, unsigned int, comma , 0, __pgi_atomicMaxu, __pgi_umaxu)
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_max_long, long long, comma , 0x8000000000000000, __pgi_atomicMaxil, __pgi_lmaxl)
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_max_ulong, unsigned long long, comma , 0, __pgi_atomicMaxul, __pgi_ulmaxul)
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_max_float, float, comma , -3.4028235E38f, __pgi_atomicMaxf, __pgi_fmaxf)
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_max_double, double, comma , -1.7976931348623157E308, __pgi_atomicMaxd, __pgi_dmaxd)
// Minimum reductions. Values are combined by calling the matching min helper
// (OP is `comma`). The per-block accumulator starts at the highest value of
// each integer type; for float and double it starts at the largest finite
// value, not at positive infinity.
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_min_uchar, unsigned char, comma, 0xFF, __pgi_atomicMin_uchar, __pgi_ucminuc)
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_min_schar, signed char, comma, 127, __pgi_atomicMin_schar, __pgi_icminic)
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_min_ushort, unsigned short, comma, 0xFFFF, __pgi_atomicMin_ushort, __pgi_usminus)
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_min_sshort, signed short, comma, 32767, __pgi_atomicMin_sshort, __pgi_isminis)
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_min_int, int, comma , 2147483647 , __pgi_atomicMini, __pgi_imini)
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_min_uint, unsigned int, comma , 0xFFFFFFFF, __pgi_atomicMinu, __pgi_uminu)
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_min_long, long long, comma , 0x7FFFFFFFFFFFFFFF, __pgi_atomicMinil, __pgi_lminl)
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_min_ulong, unsigned long long, comma , 0xFFFFFFFFFFFFFFFF, __pgi_atomicMinul, __pgi_ulminul)
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_min_float, float, comma , 3.4028235E38f, __pgi_atomicMinf, __pgi_fminf)
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_min_double, double, comma , 1.7976931348623157E308, __pgi_atomicMind, __pgi_dmind)
// Bitwise AND reductions. The per-block accumulator starts with every bit of
// the type's width set; for the signed types the literal is converted to TY
// by the cast inside the macro.
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_and_uchar, unsigned char, &, 0xFF, __pgi_atomicAnd_uchar, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_and_schar, signed char, &, 0xFF, __pgi_atomicAnd_schar, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_and_ushort, unsigned short, &, 0xFFFF, __pgi_atomicAnd_ushort, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_and_sshort, signed short, &, 0xFFFF, __pgi_atomicAnd_sshort, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_and_int, int, &, 0xFFFFFFFF , __pgi_atomicAndi, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_and_uint, unsigned int, &, 0xFFFFFFFF , __pgi_atomicAndu, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_and_long, long long, &, 0xFFFFFFFFFFFFFFFF , __pgi_atomicAndil, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_and_ulong, unsigned long long, &, 0xFFFFFFFFFFFFFFFF , __pgi_atomicAnd_ul, )
// Bitwise OR reductions. The per-block accumulator starts at 0.
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_or_uchar, unsigned char, |, 0x0, __pgi_atomicOr_uchar, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_or_schar, signed char, |, 0x0, __pgi_atomicOr_schar, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_or_ushort, unsigned short, |, 0x0, __pgi_atomicOr_ushort, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_or_sshort, signed short, |, 0x0, __pgi_atomicOr_sshort, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_or_int, int, |, 0x0 , __pgi_atomicOri, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_or_uint, unsigned int, |, 0x0 , __pgi_atomicOru, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_or_long, long long, |, 0x0 , __pgi_atomicOril, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_or_ulong, unsigned long long, |, 0x0 , __pgi_atomicOr_ul, )
// Bitwise XOR reductions. The per-block accumulator starts at 0.
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_xor_uchar, unsigned char, ^, 0x0, __pgi_atomicXor_uchar, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_xor_schar, signed char, ^, 0x0, __pgi_atomicXor_schar, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_xor_ushort, unsigned short, ^, 0x0, __pgi_atomicXor_ushort, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_xor_sshort, signed short, ^, 0x0, __pgi_atomicXor_sshort, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_xor_int, int, ^, 0x0 , __pgi_atomicXori, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_xor_uint, unsigned int, ^, 0x0 , __pgi_atomicXoru, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_xor_long, long long, ^, 0x0 , __pgi_atomicXoril, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_xor_ulong, unsigned long long, ^, 0x0 , __pgi_atomicXorul, )
// Bitwise NXOR reductions: each combine step is ~(a ^ b). The per-block
// accumulator starts with every bit of the type's width set, which leaves
// the other operand unchanged under ~(a ^ b).
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_nxor_uchar, unsigned char, ^, 0xFF , __pgi_atomicNXor_uchar, ~ )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_nxor_schar, signed char, ^, 0xFF , __pgi_atomicNXor_schar, ~ )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_nxor_ushort, unsigned short, ^, 0xFFFF , __pgi_atomicNXor_ushort, ~ )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_nxor_sshort, signed short, ^, 0xFFFF , __pgi_atomicNXor_sshort, ~ )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_nxor_int, int, ^, 0xFFFFFFFF , __pgi_atomicNXor_int, ~ )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_nxor_uint, unsigned int, ^, 0xFFFFFFFF , __pgi_atomicNXor_uint, ~ )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_nxor_long, long long, ^, 0xFFFFFFFFFFFFFFFF , __pgi_atomicNXor_long, ~ )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_nxor_ulong, unsigned long long, ^, 0xFFFFFFFFFFFFFFFF , __pgi_atomicNXor_ulong, ~ )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_lor_uchar, unsigned char, ||, 0, __pgi_atomicLOr_uchar, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_lor_schar, signed char, ||, 0, __pgi_atomicLOr_schar, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_lor_ushort, unsigned short, ||, 0, __pgi_atomicLOr_ushort, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_lor_sshort, signed short, ||, 0, __pgi_atomicLOr_sshort, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_lor_int, int, ||, 0 , __pgi_atomicLOr_int, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_lor_uint, unsigned int, ||, 0 , __pgi_atomicLOr_uint, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_lor_long, long long, ||, 0 , __pgi_atomicLOr_long, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_lor_ulong, unsigned long long, ||, 0 , __pgi_atomicLOr_ulong, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_lor_float, float, ||, 0 , __pgi_atomicLOr_float, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_lor_double, double, ||, 0 , __pgi_atomicLOr_double, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_land_uchar, unsigned char, &&, 1, __pgi_atomicLAnd_uchar, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_land_schar, signed char, &&, 1, __pgi_atomicLAnd_schar, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_land_ushort, unsigned short, &&, 1, __pgi_atomicLAnd_ushort, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_land_sshort, signed short, &&, 1, __pgi_atomicLAnd_sshort, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_land_int, int, &&, 1, __pgi_atomicLAnd_int, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_land_uint, unsigned int, &&, 1, __pgi_atomicLAnd_uint, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_land_long, long long, &&, 1, __pgi_atomicLAnd_long, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_land_ulong, unsigned long long, &&, 1, __pgi_atomicLAnd_ulong, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_land_float, float, &&, 1, __pgi_atomicLAnd_float, )
create_grid_and_block_reduce(__nvhpc_grid_and_block_reduce_land_double, double, &&, 1, __pgi_atomicLAnd_double, )

// Single-precision complex add reduction, handled as two independent float
// reductions: the real part is forwarded to
// __nvhpc_grid_and_block_reduce_add_float at `out`, the imaginary part to the
// same routine at byte offset sizeof(float) from `out`.
//   out  - byte pointer to the destination; the imaginary component is
//          addressed immediately after one float (presumably an interleaved
//          real/imag layout -- confirm with the caller).
//   real - real component contributed by the calling thread.
//   imag - imaginary component contributed by the calling thread.
EXTERNC __device__ STATIC USED __inline__ void __nvhpc_grid_and_block_reduce_add_fcomplex ( signed char* out, float real, float imag) {
    // `out` is a byte pointer, so indexing by sizeof(float) skips the real part.
    signed char* const imag_out = &out[sizeof(float)];

    __nvhpc_grid_and_block_reduce_add_float(out, real);
    __nvhpc_grid_and_block_reduce_add_float(imag_out, imag);
}

// Double-precision complex add reduction, handled as two independent double
// reductions: the real part is forwarded to
// __nvhpc_grid_and_block_reduce_add_double at `out`, the imaginary part to the
// same routine at byte offset sizeof(double) from `out`.
//   out  - byte pointer to the destination; the imaginary component is
//          addressed immediately after one double (presumably an interleaved
//          real/imag layout -- confirm with the caller).
//   real - real component contributed by the calling thread.
//   imag - imaginary component contributed by the calling thread.
EXTERNC __device__ STATIC USED __inline__ void __nvhpc_grid_and_block_reduce_add_dcomplex ( signed char* out, double real, double imag) {
    // `out` is a byte pointer, so indexing by sizeof(double) skips the real part.
    signed char* const imag_out = &out[sizeof(double)];

    __nvhpc_grid_and_block_reduce_add_double(out, real);
    __nvhpc_grid_and_block_reduce_add_double(imag_out, imag);
}

// Instantiations of create_block_reduce for every supported operator / element
// type combination. The macro is defined outside this section; judging from
// the call sites its arguments are presumably (generated function name,
// element type, combining operator, initial value, atomic helper, optional
// function or unary operator) -- TODO confirm against the macro definition.
// The last argument is left empty when unused.
//
// Addition: operator +, fourth argument zero (0, 0.0f or 0.0).
create_block_reduce(__nvhpc_block_reduce_add_uchar, unsigned char, +, 0, __pgi_atomicAdd_uchar, )
create_block_reduce(__nvhpc_block_reduce_add_schar, signed char, +, 0, __pgi_atomicAdd_schar, )
create_block_reduce(__nvhpc_block_reduce_add_ushort, unsigned short, +, 0, __pgi_atomicAdd_ushort, )
create_block_reduce(__nvhpc_block_reduce_add_sshort, signed short, +, 0, __pgi_atomicAdd_sshort, )
create_block_reduce(__nvhpc_block_reduce_add_int, int, +, 0, __pgi_atomicAddi, )
create_block_reduce(__nvhpc_block_reduce_add_long, long long, +, 0, __pgi_atomicAdd_ll, )
create_block_reduce(__nvhpc_block_reduce_add_uint, unsigned int, +, 0, __pgi_atomicAddu, )
create_block_reduce(__nvhpc_block_reduce_add_ulong, unsigned long long, +, 0, __pgi_atomicAdd_ull, )
create_block_reduce(__nvhpc_block_reduce_add_float, float, +, 0.0f, __pgi_atomicAddf, )
create_block_reduce(__nvhpc_block_reduce_add_double, double, +, 0.0, __pgi_atomicAddd, )
// Multiplication: operator *, fourth argument one (1, 1.0f or 1.0).
create_block_reduce(__nvhpc_block_reduce_mul_uchar, unsigned char, *, 1, __pgi_atomicMul_uchar, )
create_block_reduce(__nvhpc_block_reduce_mul_schar, signed char, *, 1, __pgi_atomicMul_schar, )
create_block_reduce(__nvhpc_block_reduce_mul_ushort, unsigned short, *, 1, __pgi_atomicMul_ushort, )
create_block_reduce(__nvhpc_block_reduce_mul_sshort, signed short, *, 1, __pgi_atomicMul_sshort, )
create_block_reduce(__nvhpc_block_reduce_mul_int, int, *, 1, __pgi_atomicMuli, )
create_block_reduce(__nvhpc_block_reduce_mul_long, long long, *, 1, __pgi_atomicMul_ll, )
create_block_reduce(__nvhpc_block_reduce_mul_uint, unsigned int, *, 1, __pgi_atomicMul_u, )
create_block_reduce(__nvhpc_block_reduce_mul_ulong, unsigned long long, *, 1, __pgi_atomicMul_ull, )
create_block_reduce(__nvhpc_block_reduce_mul_float, float, *, 1.0f, __pgi_atomicMulf, )
create_block_reduce(__nvhpc_block_reduce_mul_double, double, *, 1.0, __pgi_atomicMuld, )
// Maximum: the operator slot holds the token `comma` and the last argument
// names a two-argument __pgi_*max* helper; `comma` is presumably a macro
// expanding to `,` so the helper is invoked as helper(a, b) -- TODO confirm.
// The fourth argument is the lowest value of each type: 0 for unsigned types,
// the most negative integer for signed types (0x8000000000000000 is that bit
// pattern for long long), and the most negative finite float / double.
// NOTE(review): the floating-point start values are finite, not -infinity;
// confirm this is intended for inputs that may contain -infinity.
create_block_reduce(__nvhpc_block_reduce_max_uchar, unsigned char, comma, 0, __pgi_atomicMax_uchar, __pgi_ucmaxuc)
create_block_reduce(__nvhpc_block_reduce_max_schar, signed char, comma, -128, __pgi_atomicMax_schar, __pgi_icmaxic)
create_block_reduce(__nvhpc_block_reduce_max_ushort, unsigned short, comma, 0, __pgi_atomicMax_ushort, __pgi_usmaxus)
create_block_reduce(__nvhpc_block_reduce_max_sshort, signed short, comma, -32768, __pgi_atomicMax_sshort, __pgi_ismaxis)
create_block_reduce(__nvhpc_block_reduce_max_int, int, comma , -2147483648, __pgi_atomicMaxi, __pgi_imaxi)
create_block_reduce(__nvhpc_block_reduce_max_uint, unsigned int, comma , 0, __pgi_atomicMaxu, __pgi_umaxu)
create_block_reduce(__nvhpc_block_reduce_max_long, long long, comma , 0x8000000000000000, __pgi_atomicMaxil, __pgi_lmaxl)
create_block_reduce(__nvhpc_block_reduce_max_ulong, unsigned long long, comma , 0, __pgi_atomicMaxul, __pgi_ulmaxul)
create_block_reduce(__nvhpc_block_reduce_max_float, float, comma , -3.4028235E38f, __pgi_atomicMaxf, __pgi_fmaxf)
create_block_reduce(__nvhpc_block_reduce_max_double, double, comma , -1.7976931348623157E308, __pgi_atomicMaxd, __pgi_dmaxd)
// Minimum: same shape as the maximum group, with __pgi_*min* helpers. The
// fourth argument is the highest value of each type: all-ones for unsigned
// types, the largest positive integer for signed types, and the largest
// finite float / double.
// NOTE(review): the floating-point start values are finite, not +infinity;
// confirm this is intended for inputs that may contain +infinity.
create_block_reduce(__nvhpc_block_reduce_min_uchar, unsigned char, comma, 0xFF, __pgi_atomicMin_uchar, __pgi_ucminuc)
create_block_reduce(__nvhpc_block_reduce_min_schar, signed char, comma, 127, __pgi_atomicMin_schar, __pgi_icminic)
create_block_reduce(__nvhpc_block_reduce_min_ushort, unsigned short, comma, 0xFFFF, __pgi_atomicMin_ushort, __pgi_usminus)
create_block_reduce(__nvhpc_block_reduce_min_sshort, signed short, comma, 32767, __pgi_atomicMin_sshort, __pgi_isminis)
create_block_reduce(__nvhpc_block_reduce_min_int, int, comma , 2147483647 , __pgi_atomicMini, __pgi_imini)
create_block_reduce(__nvhpc_block_reduce_min_uint, unsigned int, comma , 0xFFFFFFFF, __pgi_atomicMinu, __pgi_uminu)
create_block_reduce(__nvhpc_block_reduce_min_long, long long, comma , 0x7FFFFFFFFFFFFFFF, __pgi_atomicMinil, __pgi_lminl)
create_block_reduce(__nvhpc_block_reduce_min_ulong, unsigned long long, comma , 0xFFFFFFFFFFFFFFFF, __pgi_atomicMinul, __pgi_ulminul)
create_block_reduce(__nvhpc_block_reduce_min_float, float, comma , 3.4028235E38f, __pgi_atomicMinf, __pgi_fminf)
create_block_reduce(__nvhpc_block_reduce_min_double, double, comma , 1.7976931348623157E308, __pgi_atomicMind, __pgi_dmind)
// Bitwise AND: operator &, fourth argument an all-ones hexadecimal literal
// sized for each element width.
// NOTE(review): for the signed char / signed short variants, 0xFF / 0xFFFF
// exceed the positive range of the element type; presumably they are meant to
// convert to the all-ones bit pattern (-1) -- confirm how the macro uses them.
create_block_reduce(__nvhpc_block_reduce_and_uchar, unsigned char, &, 0xFF, __pgi_atomicAnd_uchar, )
create_block_reduce(__nvhpc_block_reduce_and_schar, signed char, &, 0xFF, __pgi_atomicAnd_schar, )
create_block_reduce(__nvhpc_block_reduce_and_ushort, unsigned short, &, 0xFFFF, __pgi_atomicAnd_ushort, )
create_block_reduce(__nvhpc_block_reduce_and_sshort, signed short, &, 0xFFFF, __pgi_atomicAnd_sshort, )
create_block_reduce(__nvhpc_block_reduce_and_int, int, &, 0xFFFFFFFF , __pgi_atomicAndi, )
create_block_reduce(__nvhpc_block_reduce_and_uint, unsigned int, &, 0xFFFFFFFF , __pgi_atomicAndu, )
create_block_reduce(__nvhpc_block_reduce_and_long, long long, &, 0xFFFFFFFFFFFFFFFF , __pgi_atomicAndil, )
create_block_reduce(__nvhpc_block_reduce_and_ulong, unsigned long long, &, 0xFFFFFFFFFFFFFFFF , __pgi_atomicAnd_ul, )
// Bitwise OR: operator |, fourth argument 0x0.
create_block_reduce(__nvhpc_block_reduce_or_uchar, unsigned char, |, 0x0, __pgi_atomicOr_uchar, )
create_block_reduce(__nvhpc_block_reduce_or_schar, signed char, |, 0x0, __pgi_atomicOr_schar, )
create_block_reduce(__nvhpc_block_reduce_or_ushort, unsigned short, |, 0x0, __pgi_atomicOr_ushort, )
create_block_reduce(__nvhpc_block_reduce_or_sshort, signed short, |, 0x0, __pgi_atomicOr_sshort, )
create_block_reduce(__nvhpc_block_reduce_or_int, int, |, 0x0 , __pgi_atomicOri, )
create_block_reduce(__nvhpc_block_reduce_or_uint, unsigned int, |, 0x0 , __pgi_atomicOru, )
create_block_reduce(__nvhpc_block_reduce_or_long, long long, |, 0x0 , __pgi_atomicOril, )
create_block_reduce(__nvhpc_block_reduce_or_ulong, unsigned long long, |, 0x0 , __pgi_atomicOr_ul, )
// Bitwise XOR: operator ^, fourth argument 0x0.
create_block_reduce(__nvhpc_block_reduce_xor_uchar, unsigned char, ^, 0x0, __pgi_atomicXor_uchar, )
create_block_reduce(__nvhpc_block_reduce_xor_schar, signed char, ^, 0x0, __pgi_atomicXor_schar, )
create_block_reduce(__nvhpc_block_reduce_xor_ushort, unsigned short, ^, 0x0, __pgi_atomicXor_ushort, )
create_block_reduce(__nvhpc_block_reduce_xor_sshort, signed short, ^, 0x0, __pgi_atomicXor_sshort, )
create_block_reduce(__nvhpc_block_reduce_xor_int, int, ^, 0x0 , __pgi_atomicXori, )
create_block_reduce(__nvhpc_block_reduce_xor_uint, unsigned int, ^, 0x0 , __pgi_atomicXoru, )
create_block_reduce(__nvhpc_block_reduce_xor_long, long long, ^, 0x0 , __pgi_atomicXoril, )
create_block_reduce(__nvhpc_block_reduce_xor_ulong, unsigned long long, ^, 0x0 , __pgi_atomicXorul, )
// XNOR ("nxor"): operator ^ with ~ passed as the last argument (presumably
// applied to the result of the ^, i.e. ~(a ^ b) -- TODO confirm). The fourth
// argument is an all-ones hexadecimal literal sized for each element width.
create_block_reduce(__nvhpc_block_reduce_nxor_uchar, unsigned char, ^, 0xFF , __pgi_atomicNXor_uchar, ~ )
create_block_reduce(__nvhpc_block_reduce_nxor_schar, signed char, ^, 0xFF , __pgi_atomicNXor_schar, ~ )
create_block_reduce(__nvhpc_block_reduce_nxor_ushort, unsigned short, ^, 0xFFFF , __pgi_atomicNXor_ushort, ~ )
create_block_reduce(__nvhpc_block_reduce_nxor_sshort, signed short, ^, 0xFFFF , __pgi_atomicNXor_sshort, ~ )
create_block_reduce(__nvhpc_block_reduce_nxor_int, int, ^, 0xFFFFFFFF , __pgi_atomicNXor_int, ~ )
create_block_reduce(__nvhpc_block_reduce_nxor_uint, unsigned int, ^, 0xFFFFFFFF , __pgi_atomicNXor_uint, ~ )
create_block_reduce(__nvhpc_block_reduce_nxor_long, long long, ^, 0xFFFFFFFFFFFFFFFF , __pgi_atomicNXor_long, ~ )
create_block_reduce(__nvhpc_block_reduce_nxor_ulong, unsigned long long, ^, 0xFFFFFFFFFFFFFFFF , __pgi_atomicNXor_ulong, ~ )
// Logical OR: operator ||, fourth argument 0. Float and double variants are
// instantiated in addition to the integer types.
create_block_reduce(__nvhpc_block_reduce_lor_uchar, unsigned char, ||, 0, __pgi_atomicLOr_uchar, )
create_block_reduce(__nvhpc_block_reduce_lor_schar, signed char, ||, 0, __pgi_atomicLOr_schar, )
create_block_reduce(__nvhpc_block_reduce_lor_ushort, unsigned short, ||, 0, __pgi_atomicLOr_ushort, )
create_block_reduce(__nvhpc_block_reduce_lor_sshort, signed short, ||, 0, __pgi_atomicLOr_sshort, )
create_block_reduce(__nvhpc_block_reduce_lor_int, int, ||, 0 , __pgi_atomicLOr_int, )
create_block_reduce(__nvhpc_block_reduce_lor_uint, unsigned int, ||, 0 , __pgi_atomicLOr_uint, )
create_block_reduce(__nvhpc_block_reduce_lor_long, long long, ||, 0 , __pgi_atomicLOr_long, )
create_block_reduce(__nvhpc_block_reduce_lor_ulong, unsigned long long, ||, 0 , __pgi_atomicLOr_ulong, )
create_block_reduce(__nvhpc_block_reduce_lor_float, float, ||, 0 , __pgi_atomicLOr_float, )
create_block_reduce(__nvhpc_block_reduce_lor_double, double, ||, 0 , __pgi_atomicLOr_double, )
// Logical AND: operator &&, fourth argument 1. Float and double variants are
// instantiated in addition to the integer types.
create_block_reduce(__nvhpc_block_reduce_land_uchar, unsigned char, &&, 1, __pgi_atomicLAnd_uchar, )
create_block_reduce(__nvhpc_block_reduce_land_schar, signed char, &&, 1, __pgi_atomicLAnd_schar, )
create_block_reduce(__nvhpc_block_reduce_land_ushort, unsigned short, &&, 1, __pgi_atomicLAnd_ushort, )
create_block_reduce(__nvhpc_block_reduce_land_sshort, signed short, &&, 1, __pgi_atomicLAnd_sshort, )
create_block_reduce(__nvhpc_block_reduce_land_int, int, &&, 1, __pgi_atomicLAnd_int, )
create_block_reduce(__nvhpc_block_reduce_land_uint, unsigned int, &&, 1, __pgi_atomicLAnd_uint, )
create_block_reduce(__nvhpc_block_reduce_land_long, long long, &&, 1, __pgi_atomicLAnd_long, )
create_block_reduce(__nvhpc_block_reduce_land_ulong, unsigned long long, &&, 1, __pgi_atomicLAnd_ulong, )
create_block_reduce(__nvhpc_block_reduce_land_float, float, &&, 1, __pgi_atomicLAnd_float, )
create_block_reduce(__nvhpc_block_reduce_land_double, double, &&, 1, __pgi_atomicLAnd_double, )

// Single-precision complex add reduction, handled as two independent float
// reductions: the real part is forwarded to __nvhpc_block_reduce_add_float at
// `out`, the imaginary part to the same routine at byte offset sizeof(float)
// from `out`.
//   out  - byte pointer to the destination; the imaginary component is
//          addressed immediately after one float (presumably an interleaved
//          real/imag layout -- confirm with the caller).
//   real - real component contributed by the calling thread.
//   imag - imaginary component contributed by the calling thread.
EXTERNC __device__ STATIC USED __inline__ void __nvhpc_block_reduce_add_fcomplex ( signed char* out, float real, float imag) {
    // `out` is a byte pointer, so indexing by sizeof(float) skips the real part.
    signed char* const imag_out = &out[sizeof(float)];

    __nvhpc_block_reduce_add_float(out, real);
    __nvhpc_block_reduce_add_float(imag_out, imag);
}

// Double-precision complex add reduction, handled as two independent double
// reductions: the real part is forwarded to __nvhpc_block_reduce_add_double at
// `out`, the imaginary part to the same routine at byte offset sizeof(double)
// from `out`.
//   out  - byte pointer to the destination; the imaginary component is
//          addressed immediately after one double (presumably an interleaved
//          real/imag layout -- confirm with the caller).
//   real - real component contributed by the calling thread.
//   imag - imaginary component contributed by the calling thread.
EXTERNC __device__ STATIC USED __inline__ void __nvhpc_block_reduce_add_dcomplex ( signed char* out, double real, double imag) {
    // `out` is a byte pointer, so indexing by sizeof(double) skips the real part.
    signed char* const imag_out = &out[sizeof(double)];

    __nvhpc_block_reduce_add_double(out, real);
    __nvhpc_block_reduce_add_double(imag_out, imag);
}

// Instantiations of create_grid_reduce for every supported operator / element
// type combination. The macro is defined outside this section; judging from
// the call sites its arguments are presumably (generated function name,
// element type, atomic helper used to combine values) -- TODO confirm against
// the macro definition. Unlike the block-level instantiations, no operator or
// initial value is passed here.
//
// Addition.
create_grid_reduce(__nvhpc_grid_reduce_add_uchar, unsigned char, __pgi_atomicAdd_uchar)
create_grid_reduce(__nvhpc_grid_reduce_add_schar, signed char, __pgi_atomicAdd_schar)
create_grid_reduce(__nvhpc_grid_reduce_add_ushort, unsigned short, __pgi_atomicAdd_ushort)
create_grid_reduce(__nvhpc_grid_reduce_add_sshort, signed short, __pgi_atomicAdd_sshort)
create_grid_reduce(__nvhpc_grid_reduce_add_int, int, __pgi_atomicAddi)
create_grid_reduce(__nvhpc_grid_reduce_add_uint, unsigned int, __pgi_atomicAddu)
create_grid_reduce(__nvhpc_grid_reduce_add_long, long long, __pgi_atomicAdd_ll)
create_grid_reduce(__nvhpc_grid_reduce_add_ulong, unsigned long long, __pgi_atomicAdd_ull)
create_grid_reduce(__nvhpc_grid_reduce_add_float, float, __pgi_atomicAddf)
create_grid_reduce(__nvhpc_grid_reduce_add_double, double, __pgi_atomicAddd)
// Multiplication.
create_grid_reduce(__nvhpc_grid_reduce_mul_uchar, unsigned char, __pgi_atomicMul_uchar)
create_grid_reduce(__nvhpc_grid_reduce_mul_schar, signed char, __pgi_atomicMul_schar)
create_grid_reduce(__nvhpc_grid_reduce_mul_ushort, unsigned short, __pgi_atomicMul_ushort)
create_grid_reduce(__nvhpc_grid_reduce_mul_sshort, signed short, __pgi_atomicMul_sshort)
create_grid_reduce(__nvhpc_grid_reduce_mul_int, int, __pgi_atomicMuli)
create_grid_reduce(__nvhpc_grid_reduce_mul_uint, unsigned int, __pgi_atomicMul_u)
create_grid_reduce(__nvhpc_grid_reduce_mul_long, long long, __pgi_atomicMul_ll)
create_grid_reduce(__nvhpc_grid_reduce_mul_ulong, unsigned long long, __pgi_atomicMul_ull)
create_grid_reduce(__nvhpc_grid_reduce_mul_float, float, __pgi_atomicMulf)
create_grid_reduce(__nvhpc_grid_reduce_mul_double, double, __pgi_atomicMuld)
// Maximum.
create_grid_reduce(__nvhpc_grid_reduce_max_uchar, unsigned char, __pgi_atomicMax_uchar)
create_grid_reduce(__nvhpc_grid_reduce_max_schar, signed char, __pgi_atomicMax_schar)
create_grid_reduce(__nvhpc_grid_reduce_max_ushort, unsigned short, __pgi_atomicMax_ushort)
create_grid_reduce(__nvhpc_grid_reduce_max_sshort, signed short, __pgi_atomicMax_sshort)
create_grid_reduce(__nvhpc_grid_reduce_max_int, int, __pgi_atomicMaxi)
create_grid_reduce(__nvhpc_grid_reduce_max_uint, unsigned int, __pgi_atomicMaxu)
create_grid_reduce(__nvhpc_grid_reduce_max_long, long long, __pgi_atomicMaxil)
create_grid_reduce(__nvhpc_grid_reduce_max_ulong, unsigned long long, __pgi_atomicMaxul)
create_grid_reduce(__nvhpc_grid_reduce_max_float, float, __pgi_atomicMaxf)
create_grid_reduce(__nvhpc_grid_reduce_max_double, double, __pgi_atomicMaxd)
// Minimum.
create_grid_reduce(__nvhpc_grid_reduce_min_uchar, unsigned char, __pgi_atomicMin_uchar)
create_grid_reduce(__nvhpc_grid_reduce_min_schar, signed char, __pgi_atomicMin_schar)
create_grid_reduce(__nvhpc_grid_reduce_min_ushort, unsigned short, __pgi_atomicMin_ushort)
create_grid_reduce(__nvhpc_grid_reduce_min_sshort, signed short, __pgi_atomicMin_sshort)
create_grid_reduce(__nvhpc_grid_reduce_min_int, int, __pgi_atomicMini)
create_grid_reduce(__nvhpc_grid_reduce_min_uint, unsigned int, __pgi_atomicMinu)
create_grid_reduce(__nvhpc_grid_reduce_min_long, long long, __pgi_atomicMinil)
create_grid_reduce(__nvhpc_grid_reduce_min_ulong, unsigned long long, __pgi_atomicMinul)
create_grid_reduce(__nvhpc_grid_reduce_min_float, float, __pgi_atomicMinf)
create_grid_reduce(__nvhpc_grid_reduce_min_double, double, __pgi_atomicMind)
// Bitwise AND (integer types only).
create_grid_reduce(__nvhpc_grid_reduce_and_uchar, unsigned char, __pgi_atomicAnd_uchar)
create_grid_reduce(__nvhpc_grid_reduce_and_schar, signed char, __pgi_atomicAnd_schar)
create_grid_reduce(__nvhpc_grid_reduce_and_ushort, unsigned short, __pgi_atomicAnd_ushort)
create_grid_reduce(__nvhpc_grid_reduce_and_sshort, signed short, __pgi_atomicAnd_sshort)
create_grid_reduce(__nvhpc_grid_reduce_and_int, int, __pgi_atomicAndi)
create_grid_reduce(__nvhpc_grid_reduce_and_uint, unsigned int, __pgi_atomicAndu)
create_grid_reduce(__nvhpc_grid_reduce_and_long, long long, __pgi_atomicAndil)
create_grid_reduce(__nvhpc_grid_reduce_and_ulong, unsigned long long, __pgi_atomicAnd_ul)
// Bitwise OR (integer types only).
create_grid_reduce(__nvhpc_grid_reduce_or_uchar, unsigned char, __pgi_atomicOr_uchar)
create_grid_reduce(__nvhpc_grid_reduce_or_schar, signed char, __pgi_atomicOr_schar)
create_grid_reduce(__nvhpc_grid_reduce_or_ushort, unsigned short, __pgi_atomicOr_ushort)
create_grid_reduce(__nvhpc_grid_reduce_or_sshort, signed short, __pgi_atomicOr_sshort)
create_grid_reduce(__nvhpc_grid_reduce_or_int, int, __pgi_atomicOri)
create_grid_reduce(__nvhpc_grid_reduce_or_uint, unsigned int, __pgi_atomicOru)
create_grid_reduce(__nvhpc_grid_reduce_or_long, long long, __pgi_atomicOril)
create_grid_reduce(__nvhpc_grid_reduce_or_ulong, unsigned long long, __pgi_atomicOr_ul)
// Bitwise XOR (integer types only).
create_grid_reduce(__nvhpc_grid_reduce_xor_uchar, unsigned char, __pgi_atomicXor_uchar)
create_grid_reduce(__nvhpc_grid_reduce_xor_schar, signed char, __pgi_atomicXor_schar)
create_grid_reduce(__nvhpc_grid_reduce_xor_ushort, unsigned short, __pgi_atomicXor_ushort)
create_grid_reduce(__nvhpc_grid_reduce_xor_sshort, signed short, __pgi_atomicXor_sshort)
create_grid_reduce(__nvhpc_grid_reduce_xor_int, int, __pgi_atomicXori)
create_grid_reduce(__nvhpc_grid_reduce_xor_uint, unsigned int, __pgi_atomicXoru)
create_grid_reduce(__nvhpc_grid_reduce_xor_long, long long, __pgi_atomicXoril)
create_grid_reduce(__nvhpc_grid_reduce_xor_ulong, unsigned long long, __pgi_atomicXorul)
// XNOR ("nxor", integer types only).
create_grid_reduce(__nvhpc_grid_reduce_nxor_uchar, unsigned char, __pgi_atomicNXor_uchar)
create_grid_reduce(__nvhpc_grid_reduce_nxor_schar, signed char, __pgi_atomicNXor_schar)
create_grid_reduce(__nvhpc_grid_reduce_nxor_ushort, unsigned short, __pgi_atomicNXor_ushort)
create_grid_reduce(__nvhpc_grid_reduce_nxor_sshort, signed short, __pgi_atomicNXor_sshort)
create_grid_reduce(__nvhpc_grid_reduce_nxor_int, int, __pgi_atomicNXor_int)
create_grid_reduce(__nvhpc_grid_reduce_nxor_uint, unsigned int, __pgi_atomicNXor_uint)
create_grid_reduce(__nvhpc_grid_reduce_nxor_long, long long, __pgi_atomicNXor_long)
create_grid_reduce(__nvhpc_grid_reduce_nxor_ulong, unsigned long long, __pgi_atomicNXor_ulong)
// Logical OR (integer, float and double types).
create_grid_reduce(__nvhpc_grid_reduce_lor_uchar, unsigned char, __pgi_atomicLOr_uchar)
create_grid_reduce(__nvhpc_grid_reduce_lor_schar, signed char, __pgi_atomicLOr_schar)
create_grid_reduce(__nvhpc_grid_reduce_lor_ushort, unsigned short, __pgi_atomicLOr_ushort)
create_grid_reduce(__nvhpc_grid_reduce_lor_sshort, signed short, __pgi_atomicLOr_sshort)
create_grid_reduce(__nvhpc_grid_reduce_lor_int, int, __pgi_atomicLOr_int)
create_grid_reduce(__nvhpc_grid_reduce_lor_uint, unsigned int, __pgi_atomicLOr_uint)
create_grid_reduce(__nvhpc_grid_reduce_lor_long, long long, __pgi_atomicLOr_long)
create_grid_reduce(__nvhpc_grid_reduce_lor_ulong, unsigned long long, __pgi_atomicLOr_ulong)
create_grid_reduce(__nvhpc_grid_reduce_lor_float, float, __pgi_atomicLOr_float)
create_grid_reduce(__nvhpc_grid_reduce_lor_double, double, __pgi_atomicLOr_double)
// Logical AND (integer, float and double types).
create_grid_reduce(__nvhpc_grid_reduce_land_uchar, unsigned char, __pgi_atomicLAnd_uchar)
create_grid_reduce(__nvhpc_grid_reduce_land_schar, signed char, __pgi_atomicLAnd_schar)
create_grid_reduce(__nvhpc_grid_reduce_land_ushort, unsigned short, __pgi_atomicLAnd_ushort)
create_grid_reduce(__nvhpc_grid_reduce_land_sshort, signed short, __pgi_atomicLAnd_sshort)
create_grid_reduce(__nvhpc_grid_reduce_land_int, int, __pgi_atomicLAnd_int)
create_grid_reduce(__nvhpc_grid_reduce_land_uint, unsigned int, __pgi_atomicLAnd_uint)
create_grid_reduce(__nvhpc_grid_reduce_land_long, long long, __pgi_atomicLAnd_long)
create_grid_reduce(__nvhpc_grid_reduce_land_ulong, unsigned long long, __pgi_atomicLAnd_ulong)
create_grid_reduce(__nvhpc_grid_reduce_land_float, float, __pgi_atomicLAnd_float)
create_grid_reduce(__nvhpc_grid_reduce_land_double, double, __pgi_atomicLAnd_double)

// Single-precision complex add reduction, handled as two independent float
// reductions: the real part is forwarded to __nvhpc_grid_reduce_add_float at
// `out`, the imaginary part to the same routine at byte offset sizeof(float)
// from `out`.
//   out  - byte pointer to the destination; the imaginary component is
//          addressed immediately after one float (presumably an interleaved
//          real/imag layout -- confirm with the caller).
//   real - real component contributed by the calling thread.
//   imag - imaginary component contributed by the calling thread.
EXTERNC __device__ STATIC USED __inline__ void __nvhpc_grid_reduce_add_fcomplex( signed char* out, float real, float imag)
{
    // `out` is a byte pointer, so indexing by sizeof(float) skips the real part.
    signed char* const imag_out = &out[sizeof(float)];

    __nvhpc_grid_reduce_add_float(out, real);
    __nvhpc_grid_reduce_add_float(imag_out, imag);
}

// Double-precision complex add reduction, handled as two independent double
// reductions: the real part is forwarded to __nvhpc_grid_reduce_add_double at
// `out`, the imaginary part to the same routine at byte offset sizeof(double)
// from `out`.
//   out  - byte pointer to the destination; the imaginary component is
//          addressed immediately after one double (presumably an interleaved
//          real/imag layout -- confirm with the caller).
//   real - real component contributed by the calling thread.
//   imag - imaginary component contributed by the calling thread.
EXTERNC __device__ STATIC USED __inline__ void __nvhpc_grid_reduce_add_dcomplex( signed char* out, double real, double imag)
{
    // `out` is a byte pointer, so indexing by sizeof(double) skips the real part.
    signed char* const imag_out = &out[sizeof(double)];

    __nvhpc_grid_reduce_add_double(out, real);
    __nvhpc_grid_reduce_add_double(imag_out, imag);
}

#endif /* NVHPC_OMP_RUNTIME_H */
