// Copyright 2019-2024, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto.  Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

#ifndef _NVCOMPILER_STDPAR_CONFIG_
#define _NVCOMPILER_STDPAR_CONFIG_

// Derive the combined GPU + multicore mode flag.  Unless it is already
// defined, _NVHPC_STDPAR_GPU_MULTICORE is set to 1 when both
// _NVHPC_STDPAR_GPU and _NVHPC_STDPAR_MULTICORE evaluate to nonzero.
#if !defined(_NVHPC_STDPAR_GPU_MULTICORE) && \
    _NVHPC_STDPAR_GPU && _NVHPC_STDPAR_MULTICORE
  #define _NVHPC_STDPAR_GPU_MULTICORE 1
#endif

// _NVHPC_FIRST_ARG(list) expands to the first element of a comma-separated
// list.  The extra level of indirection through _NVHPC_FIRST_ARG1 lets a
// macro argument such as NV_TARGET_SM_INTEGER_LIST be expanded into its
// individual elements before the first one is selected.
// NOTE(review): the name _NVHPC_LOWEST_GPU_ARCH assumes that
// NV_TARGET_SM_INTEGER_LIST is sorted in ascending order -- confirm.
#define _NVHPC_FIRST_ARG1(x, ...) x
#define _NVHPC_FIRST_ARG(x, ...) _NVHPC_FIRST_ARG1(x)
#define _NVHPC_LOWEST_GPU_ARCH _NVHPC_FIRST_ARG(NV_TARGET_SM_INTEGER_LIST)

#if _NVHPC_STDPAR_GPU_MULTICORE
  #include <openacc.h>
#endif

// Attribute macros used to annotate the parallel algorithm framework.  They
// expand to nv:: attributes when either _NVHPC_STDPAR_MULTICORE or
// _NVHPC_STDPAR_GPU is defined, and to nothing otherwise so that the
// annotated declarations still compile when stdpar is not enabled.
#if defined(_NVHPC_STDPAR_MULTICORE) || defined(_NVHPC_STDPAR_GPU)
  #define _NVHPC_PARALLEL_ALGORITHM [[nv::__parallel_algorithm]]
  #define _NVHPC_PARALLEL_IMPL_THRUST [[nv::__parallel_impl_thrust]]
  #define _NVHPC_PARALLEL_IMPL_OPENACC [[nv::__parallel_impl_openacc]]
  #define _NVHPC_PARALLEL_FRAMEWORK_IMPL [[nv::__parallel_framework_impl]]
#else
  #define _NVHPC_PARALLEL_ALGORITHM
  #define _NVHPC_PARALLEL_IMPL_THRUST
  #define _NVHPC_PARALLEL_IMPL_OPENACC
  #define _NVHPC_PARALLEL_FRAMEWORK_IMPL
#endif

namespace std {
namespace __stdpar {

// Identifies one implementation strategy ("back end") for a parallel
// algorithm.  The enumerators are numbered consecutively starting at 1.
//
// __gpu_multicore is not an implementation by itself: when it appears in a
// back end list, the choice between the GPU and CPU back end lists is made
// at run time (see __stdpar_call_impl).
enum class __back_end : int {
  __seq              = 1,
  __thrust_multicore = 2,
  __thrust_gpu       = 3,
  __gpu_multicore    = 4,
  __openacc          = 5,
  __openacc_errors   = 6
};

// Compile-time, ordered list of back ends.  The dispatch code tries the back
// ends in the order in which they appear in the list.
template <__back_end...> struct __back_end_list { };

// Some commonly used back end lists.
//
// The "_seq" lists end with __back_end::__seq as a last resort; the "_only"
// lists do not.  The GPU lists additionally try __back_end::__openacc first
// when __NVCOMPILER_STDPAR_OPENACC_GPU is nonzero.

// Sequential execution only.
using __seq_only = __back_end_list<__back_end::__seq>;
// GPU back ends, then sequential.
using __gpu_seq = __back_end_list<
#if __NVCOMPILER_STDPAR_OPENACC_GPU
                                  __back_end::__openacc,
#endif
                                  __back_end::__thrust_gpu,
                                  __back_end::__seq>;
// GPU back ends only.
using __gpu_only = __back_end_list<
#if __NVCOMPILER_STDPAR_OPENACC_GPU
                                   __back_end::__openacc,
#endif
                                   __back_end::__thrust_gpu>;
// Thrust multicore back end, then sequential.
using __cpu_seq = __back_end_list<__back_end::__thrust_multicore,
                                  __back_end::__seq>;
// Thrust multicore back end only.
using __cpu_only = __back_end_list<__back_end::__thrust_multicore>;

// Map each execution policy to the list of back ends to use for that policy.
// For some policies the back ends to use depends on the compilation mode.

// Primary template: a policy type without a specialization maps to an empty
// back end list.  Dispatching on an empty list is a compilation error (see
// the __stdpar_call specialization for __back_end_list<>).
template <typename _EP> struct __back_ends_for_impl {
  using type = __back_end_list<>;
};
// std::execution::sequenced_policy always uses the sequential back end,
// regardless of the compilation mode.
template <> struct __back_ends_for_impl<std::execution::sequenced_policy> {
  using type = __seq_only;
};
// std::execution::unsequenced_policy always uses the sequential back end,
// regardless of the compilation mode.
template <> struct __back_ends_for_impl<std::execution::unsequenced_policy> {
  using type = __seq_only;
};

// std::execution::parallel_policy.  The GPU is used only when
// _NVHPC_LOWEST_GPU_ARCH is at least 70; otherwise this falls back to the
// multicore back ends (if enabled) or to sequential execution.
// NOTE(review): the >= 70 requirement is presumably tied to the forward
// progress guarantees that this policy needs -- confirm.
//
// In GPU + multicore mode `type` contains only the __gpu_multicore marker;
// the __gpu and __cpu members name the lists used after the run-time choice
// between GPU and CPU has been made.
template <> struct __back_ends_for_impl<std::execution::parallel_policy> {
#if _NVHPC_STDPAR_GPU_MULTICORE && _NVHPC_LOWEST_GPU_ARCH >= 70
  using type = __back_end_list<__back_end::__gpu_multicore>;
  using __gpu = __gpu_seq;
  using __cpu = __cpu_seq;
#elif _NVHPC_STDPAR_GPU && _NVHPC_LOWEST_GPU_ARCH >= 70
  using type = __gpu_seq;
#elif _NVHPC_STDPAR_MULTICORE
  using type = __cpu_seq;
#else
  using type = __seq_only;
#endif
};

// std::execution::parallel_unsequenced_policy.  Unlike parallel_policy there
// is no GPU architecture requirement: the GPU lists are used whenever GPU
// mode is enabled.  Each list ends with the sequential back end.
//
// In GPU + multicore mode `type` contains only the __gpu_multicore marker;
// the __gpu and __cpu members name the lists used after the run-time choice
// between GPU and CPU has been made.
template <>
struct __back_ends_for_impl<std::execution::parallel_unsequenced_policy> {
#if _NVHPC_STDPAR_GPU_MULTICORE
  using type = __back_end_list<__back_end::__gpu_multicore>;
  using __gpu = __gpu_seq;
  using __cpu = __cpu_seq;
#elif _NVHPC_STDPAR_GPU
  using type = __gpu_seq;
#elif _NVHPC_STDPAR_MULTICORE
  using type = __cpu_seq;
#else
  using type = __seq_only;
#endif
};

// nv::execution::parallel_openacc_policy.  In GPU mode the OpenACC back end
// is tried first, then the Thrust GPU back end, then sequential execution.
// Without GPU mode only sequential execution is available.
template <>
struct __back_ends_for_impl<::nv::execution::parallel_openacc_policy> {
#if _NVHPC_STDPAR_GPU
  using type = __back_end_list<__back_end::__openacc,
                               __back_end::__thrust_gpu,
                               __back_end::__seq>;
#else
  using type = __seq_only;
#endif
};

// nv::execution::parallel_required_policy.  Uses the "_only" lists, so when
// GPU or multicore mode is enabled there is no sequential fallback and an
// algorithm that no listed back end implements fails to compile.  When
// neither mode is enabled the sequential back end is used.
//
// In GPU + multicore mode `type` contains only the __gpu_multicore marker;
// the __gpu and __cpu members name the lists used after the run-time choice
// between GPU and CPU has been made.
template <>
struct __back_ends_for_impl<::nv::execution::parallel_required_policy> {
#if _NVHPC_STDPAR_GPU_MULTICORE
  using type = __back_end_list<__back_end::__gpu_multicore>;
  using __gpu = __gpu_only;
  using __cpu = __cpu_only;
#elif _NVHPC_STDPAR_GPU
  using type = __gpu_only;
#elif _NVHPC_STDPAR_MULTICORE
  using type = __cpu_only;
#else
  using type = __seq_only;
#endif
};

// nv::execution::parallel_on_cpu_policy.  Uses the multicore back end with a
// sequential fallback when multicore mode is enabled, otherwise sequential
// execution.  The GPU is never used.
template <>
struct __back_ends_for_impl<::nv::execution::parallel_on_cpu_policy> {
#if _NVHPC_STDPAR_MULTICORE
  using type = __cpu_seq;
#else
  using type = __seq_only;
#endif
};

// nv::execution::parallel_on_gpu_policy.  Uses the GPU back ends with a
// sequential fallback when GPU mode is enabled, otherwise sequential
// execution.  The multicore back end is never used.
template <>
struct __back_ends_for_impl<::nv::execution::parallel_on_gpu_policy> {
#if _NVHPC_STDPAR_GPU
  using type = __gpu_seq;
#else
  using type = __seq_only;
#endif
};

// nv::execution::parallel_required_openacc_policy.
//  - GPU mode (or __NVCOMPILER_STDPAR_OPENACC_GPU): the OpenACC back end,
//    followed by __openacc_errors.
//    NOTE(review): __openacc_errors presumably exists to produce a targeted
//    diagnostic when the OpenACC back end cannot be used -- confirm.
//  - Multicore-only mode: an empty list, so using this policy is a
//    compilation error.
//  - Otherwise: sequential execution.
template <>
struct __back_ends_for_impl<::nv::execution::parallel_required_openacc_policy> {
#if __NVCOMPILER_STDPAR_OPENACC_GPU || _NVHPC_STDPAR_GPU
  using type = __back_end_list<__back_end::__openacc,
                               __back_end::__openacc_errors>;
#elif _NVHPC_STDPAR_MULTICORE
  using type = __back_end_list<>;
#else
  using type = __seq_only;
#endif
};

// nv::execution::parallel_required_on_cpu_policy.
//  - Multicore mode: the multicore back end only, with no fallback.
//  - GPU-only mode: an empty list, so using this policy is a compilation
//    error.
//  - Otherwise: sequential execution.
template <>
struct __back_ends_for_impl<::nv::execution::parallel_required_on_cpu_policy> {
#if _NVHPC_STDPAR_MULTICORE
  using type = __cpu_only;
#elif __NVCOMPILER_STDPAR_OPENACC_GPU || _NVHPC_STDPAR_GPU
  using type = __back_end_list<>;
#else
  using type = __seq_only;
#endif
};

// nv::execution::parallel_required_on_gpu_policy.
//  - GPU mode: the GPU back ends only, with no fallback.
//  - Multicore-only mode: an empty list, so using this policy is a
//    compilation error.
//  - Otherwise: sequential execution.
template <>
struct __back_ends_for_impl<::nv::execution::parallel_required_on_gpu_policy> {
#if _NVHPC_STDPAR_GPU
  using type = __gpu_only;
#elif _NVHPC_STDPAR_MULTICORE
  using type = __back_end_list<>;
#else
  using type = __seq_only;
#endif
};

// __back_ends_for<_EP> is the back end list for execution policy _EP.  The
// policy type is passed through std::decay first, so references and
// cv-qualifiers on _EP do not affect the lookup.
template <typename _EP>
using __back_ends_for =
  typename __back_ends_for_impl<typename std::decay<_EP>::type>::type;

// For -stdpar=gpu,multicore, __gpu_backend_available() is used to decide
// at runtime whether to run the GPU version or the CPU version.
#if _NVHPC_STDPAR_GPU_MULTICORE
// Asks the OpenACC runtime how many NVIDIA devices are present and reports
// whether there is at least one.  The runtime is queried on every call.
inline bool __gpu_backend_available_uncached() {
  int const __nvidia_device_count = acc_get_num_devices(acc_device_nvidia);
  return __nvidia_device_count > 0;
}
// Cached form of __gpu_backend_available_uncached().  The answer is stored
// in a thread_local static, so the underlying query runs at most once per
// thread.
inline bool __gpu_backend_available() {
  static thread_local bool __has_gpu = __gpu_backend_available_uncached();
  return __has_gpu;
}
#endif

// __stdpar_call_impl, __stdpar_call, and __dispatch are used to choose the
// correct implementation of a parallel algorithm call.

// Forward declarations.
//
// __stdpar_call<_F, _EP, _BE_list> walks the back end list _BE_list for the
// call object _F and the execution policy _EP.
//
// __stdpar_call_impl handles the first back end of the list (_First):
//   _Is_gpu_multicore - _First is the __gpu_multicore marker.
//   _Valid            - _F has a valid implementation for _First.
//   _Rest             - the remaining back ends to try.
template <typename _F, typename _EP, typename _BE_list> struct __stdpar_call;
template <typename _F, typename _EP, bool _Is_gpu_multicore, bool _Valid,
          __back_end _First, __back_end... _Rest>
struct __stdpar_call_impl;

// Dispatch step for the __back_end::__gpu_multicore marker
// (_Is_gpu_multicore == true).  _Valid and _Rest are ignored here; the lists
// to continue with are the __gpu and __cpu members of the policy's
// __back_ends_for_impl specialization.
template <typename _F, typename _EP, bool _Valid, __back_end _First,
          __back_end... _Rest>
struct __stdpar_call_impl<_F, _EP, true, _Valid, _First, _Rest...> {
  // Forwards the call object __f to the GPU or CPU back end list and returns
  // whatever the selected implementation returns.
  _NVHPC_PARALLEL_FRAMEWORK_IMPL
  static typename _F::__return_type __call(_F const& __f) {
    // The -stdpar=gpu,multicore case.  Decide at runtime whether to run this
    // on the GPU or CPU, then use the same dispatch mechanism to forward the
    // call to the correct GPU or CPU back end.
#if _NVHPC_STDPAR_GPU_MULTICORE
    using _Policy_back_ends =
        __back_ends_for_impl<typename std::decay<_EP>::type>;
    if (__gpu_backend_available()) {
      return __stdpar_call<_F, _EP, typename _Policy_back_ends::__gpu>
                 ::__call(__f);
    } else {
      return __stdpar_call<_F, _EP, typename _Policy_back_ends::__cpu>
                 ::__call(__f);
    }
#else
    // The condition is made dependent on _F so that the assertion is only
    // evaluated when this function is instantiated.  A plain
    // static_assert(false, ...) in a template may be rejected when the
    // template is defined, even if it is never instantiated.
    static_assert(sizeof(_F) < 1,
                  "Internal error: stdpar gpu-multicore back end used "
                  "when GPU/multicore mode is not enabled.");
    throw 0;
#endif
  }
};

// Dispatch step for an ordinary back end (_Is_gpu_multicore == false) that
// implements the algorithm (_Valid == true).  The search stops here and
// _Rest is not examined.
template <typename _F, typename _EP, __back_end _First, __back_end... _Rest>
struct __stdpar_call_impl<_F, _EP, false, true, _First, _Rest...> {
  _NVHPC_PARALLEL_FRAMEWORK_IMPL
  static typename _F::__return_type __call(_F const& __f) {
    // The back end _First has a valid implementation of the algorithm. Call it.
    return __f.template __call<_First>();
  }
};

// Dispatch step for an ordinary back end (_Is_gpu_multicore == false) that
// does not implement the algorithm (_Valid == false).  _First is dropped and
// the search continues with the remaining back ends.
template <typename _F, typename _EP, __back_end _First, __back_end... _Rest>
struct __stdpar_call_impl<_F, _EP, false, false, _First, _Rest...> {
  _NVHPC_PARALLEL_FRAMEWORK_IMPL
  static typename _F::__return_type __call(_F const& __f) {
    // The back end _First does not have a valid implementation of the
    // algorithm.  Pass it on to the next back end in the list.
    return __stdpar_call<_F, _EP, __back_end_list<_Rest...>>::__call(__f);
  }
};

// End of the search: the back end list is empty.
template <typename _F, typename _EP>
struct __stdpar_call<_F, _EP, __back_end_list<>> {
  static typename _F::__return_type __call(_F const& __f) {
    // The list of back ends is empty.  No back end in the original list of
    // back ends implements this algorithm.  This is a compilation error.
    // The condition depends on _F so that the assertion is only evaluated
    // when this function is actually instantiated.
    static_assert(
      sizeof(_F) < 1,
      "This stdpar algorithm is not implemented for this execution policy");
    // Never reached in a program that compiles; it gives the function a
    // body that does not fall off the end without returning a value.
    throw 0;
  }
};

// Non-empty back end list: classify the first back end and hand off to the
// matching __stdpar_call_impl specialization.  Two properties of _First are
// computed here: whether it is the __gpu_multicore marker, and whether _F
// reports a valid implementation for it via _F::__is_valid<_First>.
template <typename _F, typename _EP, __back_end _First, __back_end... _Rest>
struct __stdpar_call<_F, _EP, __back_end_list<_First, _Rest...>> {
  _NVHPC_PARALLEL_FRAMEWORK_IMPL
  static typename _F::__return_type __call(_F const& __f) {
    return __stdpar_call_impl<_F, _EP, _First == __back_end::__gpu_multicore,
                              _F::template __is_valid<_First>::value, _First,
                              _Rest...>::__call(__f);
  }
};

// Entry point of the dispatch mechanism.  Looks up the back end list for the
// execution policy _EP and forwards the call object __f to the first back
// end in that list for which _F has a valid implementation.
//
// _EP must be given explicitly; _F is deduced from the argument.  Returns
// whatever the selected implementation returns (_F::__return_type).
template <typename _EP, typename _F>
_NVHPC_PARALLEL_FRAMEWORK_IMPL
typename _F::__return_type __dispatch(_F const& __f) {
  return __stdpar_call<_F, _EP, __back_ends_for<_EP>>::__call(__f);
}

// Implementation of std::void_t, which is C++17.
//
// __test_type<_T> is always void, but substitution fails when _T is not a
// valid type, which is what the __is_valid trait relies on.  The alias goes
// through a helper struct so that its parameter is actually used: an alias
// template with an unused parameter is not guaranteed to take part in SFINAE
// by compilers that predate the resolution of CWG issue 1558.
template <class...> struct __test_type_impl { using type = void; };
template <class _T>
using __test_type = typename __test_type_impl<_T>::type;

// Create a trait named __is_valid that tests whether or not a particular call
// is implemented by a particular back end.
//
//   impl_struct - class template parameterized on a __back_end value.
//   call        - expression naming a call to a static member of
//                 impl_struct<_BE>.
//
// __is_valid<_BE> derives from std::true_type when the expression
// impl_struct<_BE>::call is well-formed, and from std::false_type otherwise.
// The macro is meant to be expanded inside a class and deliberately omits
// the trailing semicolon, which the user of the macro supplies.
#define _NVHPC_CALL_IS_VALID(impl_struct, call)                                \
  template <__back_end, class = void> struct __is_valid : std::false_type { }; \
  template <__back_end _BE>                                                    \
  struct __is_valid<_BE, __test_type<decltype(impl_struct<_BE>::call)>>        \
      : std::true_type { }

// Shortcut for checking if a type is an execution policy.
// __enable_if_EP<_EP, _T> is _T when std::decay<_EP>::type is an execution
// policy according to std::is_execution_policy; otherwise substitution
// fails, which removes the declaration from overload resolution.
template <class _EP, class _T>
using __enable_if_EP = typename std::enable_if<
  std::is_execution_policy<typename std::decay<_EP>::type>::value, _T>::type;

} // namespace __stdpar
} // namespace std

// Thrust configuration.  Only done when at least one of the stdpar modes
// (multicore or GPU) is enabled.
#if defined(_NVHPC_STDPAR_MULTICORE) || defined(_NVHPC_STDPAR_GPU)

  #define _NVHPC_INCLUDE_THRUST 1

  // Select the Thrust host and device systems for the compilation mode:
  //   gpu,multicore : host = OpenMP, device = CUDA
  //   gpu           : host = C++,    device = CUDA
  //   multicore     : host = C++,    device = OpenMP
  // _NVHPC_STDPAR_OPENMP is defined in the two modes that use OpenMP.
  #if _NVHPC_STDPAR_GPU_MULTICORE
    #define _NVHPC_STDPAR_OPENMP 1
    #define THRUST_HOST_SYSTEM   THRUST_HOST_SYSTEM_OMP
    #define THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_CUDA
  #elif _NVHPC_STDPAR_GPU
    #define THRUST_HOST_SYSTEM   THRUST_HOST_SYSTEM_CPP
    #define THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_CUDA
  #elif _NVHPC_STDPAR_MULTICORE
    #define _NVHPC_STDPAR_OPENMP 1
    #define THRUST_HOST_SYSTEM   THRUST_HOST_SYSTEM_CPP
    #define THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_OMP
  #endif

  // Stdpar has its own checks for these. Suppress the CCCL checks.
  #define THRUST_IGNORE_DEPRECATED_CPP_11
  #define THRUST_IGNORE_DEPRECATED_CPP_DIALECT 1
  #define CUB_IGNORE_DEPRECATED_CPP_DIALECT 1
  #define CCCL_IGNORE_DEPRECATED_CPP_DIALECT 1
  #define CCCL_IGNORE_DEPRECATED_CUDA_BELOW_12 1

  // __NVCOMPILER_PROCESSING_THRUST_INCLUDES is defined only while the Thrust
  // headers below are being included.
  #define __NVCOMPILER_PROCESSING_THRUST_INCLUDES
  #include <thrust/version.h>
  // Reject Thrust versions older than the minimum this header supports.
  #if THRUST_VERSION < 200800
    #error An unexpected version of Thrust which is incompatible with NVC++ is on the include path. NVC++ includes its own version of Thrust; no user-supplied version should be needed.
  #endif

  // With Thrust version 300000 or later, GPU mode requires CUDA 12 or later.
  #if _NVHPC_STDPAR_GPU && THRUST_VERSION >= 300000 && CUDA_VERSION < 12000
    #error CUDA 12 or newer is required for GPU offloading of parallel algorithms (-stdpar=gpu)
  #endif

  #include <thrust/execution_policy.h>
  // The caching allocator is only needed for the Thrust GPU policy.
  #if _NVHPC_STDPAR_GPU
    #include <thrust/detail/caching_allocator.h>
  #endif
  #undef __NVCOMPILER_PROCESSING_THRUST_INCLUDES
#endif

namespace std {
namespace __stdpar {

// __no_policy is used for back ends where the execution policy doesn't matter.
// It is an empty tag type and is what the primary __policy_for template
// returns.
struct __no_policy { };

// Maps a back end to the policy object that its implementation should use.
// The primary template covers every back end without a specialization and
// returns the empty __no_policy tag.
template <__back_end _BE> struct __policy_for {
  static __no_policy __policy() { return __no_policy{}; }
};

#if _NVHPC_STDPAR_MULTICORE
// The Thrust multicore back end runs with Thrust's OpenMP parallel policy.
template <> struct __policy_for<__back_end::__thrust_multicore> {
  static decltype(::thrust::omp::par) __policy() { return ::thrust::omp::par; }
};
#endif

#if _NVHPC_STDPAR_GPU
// The Thrust GPU back end runs with Thrust's CUDA parallel policy, bound to
// the allocator returned by
// ::thrust::detail::single_device_tls_caching_allocator().
// NOTE(review): this presumably lets temporary device storage be reused
// between algorithm calls -- confirm against the Thrust sources.
template <> struct __policy_for<__back_end::__thrust_gpu> {
  // The return type is spelled with decltype because the type of the bound
  // policy object is an implementation detail of Thrust.
  static decltype(::thrust::cuda::par(
    ::thrust::detail::single_device_tls_caching_allocator()))
  __policy() {
    return ::thrust::cuda::par(
      ::thrust::detail::single_device_tls_caching_allocator());
  }
};
#endif

} // namespace __stdpar
} // namespace std

#include <exception>
#include <iterator>

// _ASSERT_NOT_FUNC_PTR(F): in GPU mode without
// __NVCOMPILER_GPU_UNIFIED_FUNCTIONS, reject at compile time any type F that
// is a pointer to function.  In every other configuration the macro expands
// to a no-op expression.  The expansion has no trailing semicolon; the user
// of the macro supplies it.
#if _NVHPC_STDPAR_GPU && !__NVCOMPILER_GPU_UNIFIED_FUNCTIONS
  #define _ASSERT_NOT_FUNC_PTR(F)                                             \
    static_assert(                                                            \
        !std::is_pointer<F>::value ||                                         \
        !std::is_function<typename std::remove_pointer<F>::type>::value,      \
        "This configuration of the compiler does not support function "       \
        "pointers in parallel algorithms. Use a lambda or a function object " \
        "instead.")
#else
  #define _ASSERT_NOT_FUNC_PTR(F) (void)0
#endif

// _ASSERT_RANDOM_ACCESS(It): compile-time check that It is a random access
// iterator.  With C++20 or later the std::random_access_iterator concept is
// used; earlier dialects check that the iterator_category of It derives from
// std::random_access_iterator_tag.  The expansion has no trailing semicolon;
// the user of the macro supplies it.
#if __cplusplus >= 202002L
  #define _ASSERT_RANDOM_ACCESS(It)                                       \
    static_assert(std::random_access_iterator<It>,                        \
        "This configuration of the compiler only supports random access " \
        "iterators and raw pointers in parallel algorithms.")
#else
  #define _ASSERT_RANDOM_ACCESS(It)                                       \
    static_assert(                                                        \
        std::is_base_of<                                                  \
          std::random_access_iterator_tag,                                \
          typename std::iterator_traits<It>::iterator_category>::value,   \
        "This configuration of the compiler only supports random access " \
        "iterators and raw pointers in parallel algorithms.")
#endif

// NVTX instrumentation.  When Thrust is in use and the user has not defined
// NVHPC_STDPAR_DISABLE_NVTX_RANGES, _NVHPC_STDPAR_NVTX_RANGE declares a
// scoped NVTX range in the "STDPAR" domain, named after the enclosing
// function.  Otherwise the macro expands to nothing.
#if _NVHPC_INCLUDE_THRUST && !defined(NVHPC_STDPAR_DISABLE_NVTX_RANGES)
  // __NVCOMPILER_INCLUDING_NVTX3_HEADER is defined only while nvtx3.hpp is
  // being included.
  #define __NVCOMPILER_INCLUDING_NVTX3_HEADER
  #include <nvtx3/nvtx3.hpp>
  #undef __NVCOMPILER_INCLUDING_NVTX3_HEADER
  // Tag type naming the NVTX domain.  It is declared in the global namespace.
  struct __stdpar_nvtx_domain { static constexpr char const* name{"STDPAR"}; };
  // The range object is declared unconditionally, but it is only started
  // inside the `if target (nv::target::is_host)` block.  The registered
  // string and the event attributes are function-local statics, so they are
  // created once per function that expands the macro.
  #define _NVHPC_STDPAR_NVTX_RANGE                                         \
    ::nvtx3::v1::detail::optional_scoped_range_in<__stdpar_nvtx_domain>    \
      __optional_nvtx3_range;                                              \
    if target (nv::target::is_host) {                                      \
      static ::nvtx3::v1::registered_string_in<__stdpar_nvtx_domain> const \
        __nvtx3_func_name{__func__};                                       \
      static ::nvtx3::v1::event_attributes const __nvtx3_func_attr{        \
        __nvtx3_func_name};                                                \
      __optional_nvtx3_range.begin(__nvtx3_func_attr);                     \
    }
#else
  #define _NVHPC_STDPAR_NVTX_RANGE
#endif

namespace std { 
namespace __stdpar { 
namespace __detail {

// min and max functions where the arguments may be of different types and
// where things are passed and returned by value.  These are designed for
// when the arguments are arithmetic types.

// Returns the smaller of __a and __b, converted to the common type of _T and
// _U.  When neither argument is less than the other, __a is returned.
template <class _T, class _U>
inline typename std::common_type<_T, _U>::type min(_T __a, _U __b) {
  using _Result = typename std::common_type<_T, _U>::type;
  if (__b < __a) {
    return static_cast<_Result>(__b);
  }
  return static_cast<_Result>(__a);
}

// Returns the larger of __a and __b, converted to the common type of _T and
// _U.  The arguments may have different types and are passed and returned by
// value.  When neither argument is less than the other, __b is returned.
template <class _T, class _U>
inline typename std::common_type<_T, _U>::type max(_T __a, _U __b) {
  using _Result = typename std::common_type<_T, _U>::type;
  if (__b < __a) {
    return static_cast<_Result>(__a);
  }
  return static_cast<_Result>(__b);
}

// Integer division, but round up instead of down.
// The numerator is increased by (__denom - 1) before dividing, so any
// nonzero remainder pushes the quotient up by one.
template <class _T> inline _T __div_round_up(_T __num, _T __denom) {
  auto const __biased_num = __num + (__denom - 1);
  return __biased_num / __denom;
}

#if _NVHPC_STDPAR_GPU || __NVCOMPILER_STDPAR_OPENACC_GPU

// Queries the CUDA runtime for the properties of device 0 and returns a
// pointer to the function-local static object that holds them.  The runtime
// is queried on every call.
// Throws thrust::system_error if the query does not succeed.
inline cudaDeviceProp* __getDeviceProperties_uncached() {
  static cudaDeviceProp __props;
  cudaError_t const __status = cudaGetDeviceProperties(&__props, 0);
  if (__status == cudaSuccess) {
    return &__props;
  }
  throw thrust::system_error(__status, thrust::cuda_category(),
                             "Failed to get GPU properties\n");
}
// Cached form of __getDeviceProperties_uncached().  The pointer is stored in
// a function-local static, so the device is queried only the first time this
// function completes successfully.
inline cudaDeviceProp* __getDeviceProperties() {
  static cudaDeviceProp* __cached_props = __getDeviceProperties_uncached();
  return __cached_props;
}

inline int __get_device_sm_count() {
  return __getDeviceProperties()->multiProcessorCount;
}
inline int __get_device_total_thread_count() {
  cudaDeviceProp* __device_properties = __getDeviceProperties();
  return __device_properties->multiProcessorCount *
      __device_properties->maxThreadsPerMultiProcessor;
}

// Heuristic that maps an input size to an iteration count for reduce and
// scan, scaled by the number of multiprocessors on the device.
//   - fewer than 1024 elements: 0, without querying the device;
//   - fewer than 2^22 elements: sm_count * round(0.0614 * sqrt(size));
//   - otherwise: (sm_count * 128) * round(0.95 * sqrt(size) / 8192 + 0.95).
// NOTE(review): the constants look empirically tuned; their origin is not
// visible in this file -- confirm before changing them.
template <class _Index>
_Index __iterations_for_reduce_or_scan(_Index __input_size) {
  if (__input_size < 1024) {
    return 0;
  }
  if (__input_size < (1 << 22)) {
    auto const __per_sm = std::lround(0.0614 * std::sqrt(__input_size));
    return __get_device_sm_count() * __per_sm;
  }
  auto const __scale =
      std::lround(0.95 * std::sqrt(__input_size) / (8192) + 0.95);
  return (__get_device_sm_count() * 128) * __scale;
}

#endif

// Index of the first element of chunk __chunk_no when a range is split so
// that the first __leftover chunks hold (__chunk_size + 1) elements and the
// remaining chunks hold __chunk_size elements.
//
// The correction term is multiplied by a 0/1 flag instead of being selected
// with a branch.
template <class _Index>
inline _Index __chunk_start(_Index __chunk_no, _Index __chunk_size,
                            _Index __leftover) {
  // Start index if every preceding chunk had the larger size.
  auto const __oversized_start = __chunk_no * (__chunk_size + 1);
  // 1 when this chunk comes after the larger chunks, otherwise 0.
  auto const __past_leftover = _Index(__chunk_no >= __leftover);
  // Number of preceding chunks that have the smaller size.
  auto const __small_chunks_before = __chunk_no - __leftover;
  return __oversized_start - (__small_chunks_before * __past_leftover);
}
// Index one past the last element of chunk __chunk_no, using the same
// splitting scheme as the chunk start computation: the first __leftover
// chunks hold (__chunk_size + 1) elements and the rest hold __chunk_size.
// The result is the start index of chunk (__chunk_no + 1).
template <class _Index>
inline _Index __chunk_end(_Index __chunk_no, _Index __chunk_size,
                          _Index __leftover) {
  auto const __next_chunk = __chunk_no + 1;
  // End index if this chunk and every preceding one had the larger size.
  auto const __oversized_end = __next_chunk * (__chunk_size + 1);
  // 1 when the next chunk comes after the larger chunks, otherwise 0.
  auto const __past_leftover = _Index(__next_chunk >= __leftover);
  return __oversized_end - ((__next_chunk - __leftover) * __past_leftover);
}

// Iterator-like wrapper that holds one iterator of each of two types plus a
// run-time flag (__use1) that selects which of the two is active.  Both
// iterator types must have the same value_type.
//
// Only the operations defined below are provided: dereference, subscript,
// and addition of an offset.
template <class _It1, class _It2> struct __either_or_iterator {
  static_assert(
      std::is_same<typename std::iterator_traits<_It1>::value_type,
                   typename std::iterator_traits<_It2>::value_type>::value,
      "internal error: iterator value types must match");
  using value_type = typename std::iterator_traits<_It1>::value_type;
  using difference_type = typename std::common_type<
      typename std::iterator_traits<_It1>::difference_type,
      typename std::iterator_traits<_It2>::difference_type>::type;
  _It1 __it1;
  _It2 __it2;
  bool __use1;
  // Wrap a single iterator.  The inactive member is value-initialized so
  // that it never holds an indeterminate value (e.g. an uninitialized raw
  // pointer) that would be read when the wrapper is copied.
  explicit __either_or_iterator(_It1 __it)
      : __it1(__it), __it2(), __use1(true) { }
  explicit __either_or_iterator(_It2 __it)
      : __it1(), __it2(__it), __use1(false) { }
  // Store both iterators; __use1 selects the active one.
  __either_or_iterator(_It1 __it1, _It2 __it2, bool __use1)
      : __it1(__it1), __it2(__it2), __use1(__use1) { }
  // Dereference the active iterator.
  value_type& operator*() {
    if (__use1) {
      return *__it1;
    } else {
      return *__it2;
    }
  }
  // Subscript the active iterator.
  value_type& operator[](difference_type __n) {
    if (__use1) {
      return __it1[__n];
    } else {
      return __it2[__n];
    }
  }
  // Returns a wrapper around the active iterator advanced by __n.  Only the
  // active iterator is carried over to the result.
  __either_or_iterator operator+(difference_type __n) const {
    if (__use1) {
      return __either_or_iterator{ __it1 + __n };
    } else {
      return __either_or_iterator{ __it2 + __n };
    }
  }
};

// Specialization for the case where both iterator types are the same.  No
// run-time flag is needed: the choice is made once, in the constructor, and
// only the chosen iterator is stored.
template <class _It> struct __either_or_iterator<_It, _It> {
  using value_type = typename std::iterator_traits<_It>::value_type;
  using difference_type = typename std::iterator_traits<_It>::difference_type;
  _It __it;
  // Wrap a single iterator.
  explicit __either_or_iterator(_It __chosen) : __it(__chosen) { }
  // Keep __first when __pick_first is true, otherwise keep __second.
  __either_or_iterator(_It __first, _It __second, bool __pick_first)
      : __it(__pick_first ? __first : __second) { }
  // Dereference the stored iterator.
  value_type& operator*() { return *__it; }
  // Subscript the stored iterator.
  value_type& operator[](difference_type __offset) { return __it[__offset]; }
  // Returns a wrapper around the stored iterator advanced by __offset.
  __either_or_iterator operator+(difference_type __offset) const {
    return __either_or_iterator{__it + __offset};
  }
};

// Helper that deduces the iterator types and builds an __either_or_iterator
// from both iterators; __use1 selects the first one when true and the second
// one when false.
template <class _It1, class _It2>
__either_or_iterator<_It1, _It2> __make_either_or(_It1 __it1, _It2 __it2,
                                                  bool __use1) {
  using _Result = __either_or_iterator<_It1, _It2>;
  return _Result{__it1, __it2, __use1};
}

}}}

#endif
