diff --git a/CMakeLists.txt b/CMakeLists.txt index 0ee6ab944..4fbaf0d90 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -86,7 +86,7 @@ if(NOT USE_HIP_CPU) if(GPU_TARGETS STREQUAL "all") rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS - TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102" + TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" ) set(GPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "GPU architectures to compile for" FORCE) endif() diff --git a/rocprim/include/rocprim/iterator/texture_cache_iterator.hpp b/rocprim/include/rocprim/iterator/texture_cache_iterator.hpp index b4b013d8c..6e5c916f3 100644 --- a/rocprim/include/rocprim/iterator/texture_cache_iterator.hpp +++ b/rocprim/include/rocprim/iterator/texture_cache_iterator.hpp @@ -221,9 +221,9 @@ class texture_cache_iterator #else texture_type words[multiple]; - #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) - #pragma message "Texture cache iterator is not supported on gfx94x as the texture fetch functions in HIP are not available." - ROCPRIM_PRINT_ERROR_ONCE("WARNING: Usage of texture_cache_iterator on gfx94x device is not supported and will not produce valid results.") + #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx1200__) || defined(__gfx1201__) + #pragma message "Texture cache iterator is not supported on gfx94x or gfx120x as the texture fetch functions in HIP are not available." 
+ ROCPRIM_PRINT_ERROR_ONCE("WARNING: Usage of texture_cache_iterator on gfx94x or gfx120x devices is not supported and will not produce valid results.") #else ROCPRIM_UNROLL for(unsigned int i = 0; i < multiple; i++) diff --git a/rocprim/include/rocprim/thread/thread_load.hpp b/rocprim/include/rocprim/thread/thread_load.hpp index cb724f5ab..8339f0997 100644 --- a/rocprim/include/rocprim/thread/thread_load.hpp +++ b/rocprim/include/rocprim/thread/thread_load.hpp @@ -59,59 +59,65 @@ ROCPRIM_DEVICE __forceinline__ T AsmThreadLoad(void * ptr) #if ROCPRIM_THREAD_LOAD_USE_CACHE_MODIFIERS == 1 - // Important for syncing. Check section 9.2.2 or 7.3 in the following document - // http://developer.amd.com/wordpress/media/2013/12/AMD_GCN3_Instruction_Set_Architecture_rev1.1.pdf - #define ROCPRIM_ASM_THREAD_LOAD(cache_modifier, \ - llvm_cache_modifier, \ - type, \ - interim_type, \ - asm_operator, \ - output_modifier, \ - wait_cmd) \ - template<> \ - ROCPRIM_DEVICE __forceinline__ type AsmThreadLoad(void* ptr) \ - { \ - interim_type retval; \ - asm volatile(#asm_operator " %0, %1 " llvm_cache_modifier "\n\t" \ - "s_waitcnt " wait_cmd "(%2)" \ - : "=" #output_modifier(retval) \ - : "v"(ptr), "I"(0x00)); \ - return retval; \ - } +// Important for syncing. 
Check section 9.2.2 or 7.3 in the following document +// http://developer.amd.com/wordpress/media/2013/12/AMD_GCN3_Instruction_Set_Architecture_rev1.1.pdf +#define ROCPRIM_ASM_THREAD_LOAD(cache_modifier, \ + llvm_cache_modifier, \ + type, \ + interim_type, \ + asm_operator, \ + output_modifier, \ + wait_inst, \ + wait_cmd) \ + template<> \ + ROCPRIM_DEVICE __forceinline__ type AsmThreadLoad(void* ptr) \ + { \ + interim_type retval; \ + asm volatile(#asm_operator " %0, %1 " llvm_cache_modifier "\n\t" \ + wait_inst " " wait_cmd "(%2)" /* " " keeps the wait mnemonic and the counter name apart, e.g. "s_waitcnt vmcnt(0)" */ \ + : "=" #output_modifier(retval) \ + : "v"(ptr), "I"(0x00)); \ + return retval; \ + } // TODO Add specialization for custom larger data types -#define ROCPRIM_ASM_THREAD_LOAD_GROUP(cache_modifier, llvm_cache_modifier, wait_cmd) \ - ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, int8_t, int16_t, flat_load_sbyte, v, wait_cmd); \ - ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, int16_t, int16_t, flat_load_sshort, v, wait_cmd); \ - ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, uint8_t, uint16_t, flat_load_ubyte, v, wait_cmd); \ - ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, uint16_t, uint16_t, flat_load_ushort, v, wait_cmd); \ - ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, uint32_t, uint32_t, flat_load_dword, v, wait_cmd); \ - ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, float, uint32_t, flat_load_dword, v, wait_cmd); \ - ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, uint64_t, uint64_t, flat_load_dwordx2, v, wait_cmd); \ - ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, double, uint64_t, flat_load_dwordx2, v, wait_cmd); +#define ROCPRIM_ASM_THREAD_LOAD_GROUP(cache_modifier, llvm_cache_modifier, wait_inst, wait_cmd) \ + ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, int8_t, int16_t, flat_load_sbyte, v, wait_inst, wait_cmd); \ + ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, int16_t, int16_t, 
flat_load_sshort, v, wait_inst, wait_cmd); \ + ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, uint8_t, uint16_t, flat_load_ubyte, v, wait_inst, wait_cmd); \ + ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, uint16_t, uint16_t, flat_load_ushort, v, wait_inst, wait_cmd); \ + ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, uint32_t, uint32_t, flat_load_dword, v, wait_inst, wait_cmd); \ + ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, float, uint32_t, flat_load_dword, v, wait_inst, wait_cmd); \ + ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, uint64_t, uint64_t, flat_load_dwordx2, v, wait_inst, wait_cmd); \ + ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, double, uint64_t, flat_load_dwordx2, v, wait_inst, wait_cmd); // [HIP-CPU] MSVC: erronous inline assembly specification (Triggers error C2059: syntax error: 'volatile') #ifndef __HIP_CPU_RT__ #if defined(__gfx940__) || defined(__gfx941__) -ROCPRIM_ASM_THREAD_LOAD_GROUP(load_ca, "sc0", ""); -ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cg, "sc1", ""); -ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cv, "sc0 sc1", "vmcnt"); -ROCPRIM_ASM_THREAD_LOAD_GROUP(load_volatile, "sc0 sc1", "vmcnt"); +ROCPRIM_ASM_THREAD_LOAD_GROUP(load_ca, "sc0", "s_waitcnt", ""); +ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cg, "sc1", "s_waitcnt", ""); +ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cv, "sc0 sc1", "s_waitcnt", "vmcnt"); +ROCPRIM_ASM_THREAD_LOAD_GROUP(load_volatile, "sc0 sc1", "s_waitcnt", "vmcnt"); #elif defined(__gfx942__) -ROCPRIM_ASM_THREAD_LOAD_GROUP(load_ca, "sc0", ""); -ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cg, "sc0 nt", ""); -ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cv, "sc0", "vmcnt"); -ROCPRIM_ASM_THREAD_LOAD_GROUP(load_volatile, "sc0", "vmcnt"); +ROCPRIM_ASM_THREAD_LOAD_GROUP(load_ca, "sc0", "s_waitcnt", ""); +ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cg, "sc0 nt", "s_waitcnt", ""); +ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cv, "sc0", "s_waitcnt", "vmcnt"); +ROCPRIM_ASM_THREAD_LOAD_GROUP(load_volatile, 
"sc0", "s_waitcnt", "vmcnt"); +#elif defined(__gfx1200__) || defined(__gfx1201__) +ROCPRIM_ASM_THREAD_LOAD_GROUP(load_ca, "scope:SCOPE_DEV", "s_wait_loadcnt_dscnt", ""); +ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cg, "th:TH_DEFAULT scope:SCOPE_DEV", "s_wait_loadcnt_dscnt", ""); +ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cv, "th:TH_DEFAULT scope:SCOPE_DEV", "s_wait_loadcnt_dscnt", ""); +ROCPRIM_ASM_THREAD_LOAD_GROUP(load_volatile, "th:TH_DEFAULT scope:SCOPE_DEV", "s_wait_loadcnt_dscnt", ""); #else -ROCPRIM_ASM_THREAD_LOAD_GROUP(load_ca, "glc", ""); -ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cg, "glc slc", ""); -ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cv, "glc", "vmcnt"); -ROCPRIM_ASM_THREAD_LOAD_GROUP(load_volatile, "glc", "vmcnt"); +ROCPRIM_ASM_THREAD_LOAD_GROUP(load_ca, "glc", "s_waitcnt", ""); +ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cg, "glc slc", "s_waitcnt", ""); +ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cv, "glc", "s_waitcnt", "vmcnt"); +ROCPRIM_ASM_THREAD_LOAD_GROUP(load_volatile, "glc", "s_waitcnt", "vmcnt"); #endif // TODO find correct modifiers to match these -ROCPRIM_ASM_THREAD_LOAD_GROUP(load_ldg, "", ""); -ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cs, "", ""); +ROCPRIM_ASM_THREAD_LOAD_GROUP(load_ldg, "", "s_waitcnt", ""); +ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cs, "", "s_waitcnt", ""); #endif // __HIP_CPU_RT__ #endif diff --git a/rocprim/include/rocprim/thread/thread_store.hpp b/rocprim/include/rocprim/thread/thread_store.hpp index 917f6bf01..b1f962c1e 100644 --- a/rocprim/include/rocprim/thread/thread_store.hpp +++ b/rocprim/include/rocprim/thread/thread_store.hpp @@ -62,54 +62,60 @@ ROCPRIM_DEVICE __forceinline__ void AsmThreadStore(void * ptr, T val) // Important for syncing. 
Check section 9.2.2 or 7.3 in the following document // http://developer.amd.com/wordpress/media/2013/12/AMD_GCN3_Instruction_Set_Architecture_rev1.1.pdf -#define ROCPRIM_ASM_THREAD_STORE(cache_modifier, \ +#define ROCPRIM_ASM_THREAD_STORE(cache_modifier, \ llvm_cache_modifier, \ type, \ interim_type, \ asm_operator, \ output_modifier, \ + wait_inst, \ wait_cmd) \ template<> \ - ROCPRIM_DEVICE __forceinline__ void AsmThreadStore(void * ptr, type val) \ + ROCPRIM_DEVICE __forceinline__ void AsmThreadStore(void * ptr, type val) \ { \ - interim_type temp_val = val; \ - asm volatile(#asm_operator " %0, %1 " llvm_cache_modifier : : "v"(ptr), #output_modifier(temp_val)); \ - asm volatile("s_waitcnt " wait_cmd "(%0)" : : "I"(0x00)); \ + interim_type temp_val = val; \ + asm volatile(#asm_operator " %0, %1 " llvm_cache_modifier "\n\t" \ + wait_inst " " wait_cmd "(%2)" /* " " keeps the wait mnemonic and the counter name apart, e.g. "s_waitcnt vmcnt(0)" */ \ + : : "v"(ptr), #output_modifier(temp_val), "I"(0x00)); \ } // TODO fix flat_store_ubyte and flat_store_sbyte issues // TODO Add specialization for custom larger data types -#define ROCPRIM_ASM_THREAD_STORE_GROUP(cache_modifier, llvm_cache_modifier, wait_cmd) \ - ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, int8_t, int16_t, flat_store_byte, v, wait_cmd); \ - ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, int16_t, int16_t, flat_store_short, v, wait_cmd); \ - ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, uint8_t, uint16_t, flat_store_byte, v, wait_cmd); \ - ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, uint16_t, uint16_t, flat_store_short, v, wait_cmd); \ - ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, uint32_t, uint32_t, flat_store_dword, v, wait_cmd); \ - ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, float, uint32_t, flat_store_dword, v, wait_cmd); \ - ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, uint64_t, uint64_t, flat_store_dwordx2, v, wait_cmd); \ - ROCPRIM_ASM_THREAD_STORE(cache_modifier, 
llvm_cache_modifier, double, uint64_t, flat_store_dwordx2, v, wait_cmd); +#define ROCPRIM_ASM_THREAD_STORE_GROUP(cache_modifier, llvm_cache_modifier, wait_inst, wait_cmd) \ + ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, int8_t, int16_t, flat_store_byte, v, wait_inst, wait_cmd); \ + ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, int16_t, int16_t, flat_store_short, v, wait_inst, wait_cmd); \ + ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, uint8_t, uint16_t, flat_store_byte, v, wait_inst, wait_cmd); \ + ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, uint16_t, uint16_t, flat_store_short, v, wait_inst, wait_cmd); \ + ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, uint32_t, uint32_t, flat_store_dword, v, wait_inst, wait_cmd); \ + ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, float, uint32_t, flat_store_dword, v, wait_inst, wait_cmd); \ + ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, uint64_t, uint64_t, flat_store_dwordx2, v, wait_inst, wait_cmd); \ + ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, double, uint64_t, flat_store_dwordx2, v, wait_inst, wait_cmd); // [HIP-CPU] MSVC: erronous inline assembly specification (Triggers error C2059: syntax error: 'volatile') - #ifndef __HIP_CPU_RT__ #if defined(__gfx940__) || defined(__gfx941__) -ROCPRIM_ASM_THREAD_STORE_GROUP(store_wb, "sc0 sc1", ""); -ROCPRIM_ASM_THREAD_STORE_GROUP(store_cg, "sc0 sc1", ""); -ROCPRIM_ASM_THREAD_STORE_GROUP(store_wt, "sc0 sc1", "vmcnt"); -ROCPRIM_ASM_THREAD_STORE_GROUP(store_volatile, "sc0 sc1", "vmcnt"); +ROCPRIM_ASM_THREAD_STORE_GROUP(store_wb, "sc0 sc1", "s_waitcnt", ""); // TODO: gfx942 validation +ROCPRIM_ASM_THREAD_STORE_GROUP(store_cg, "sc0 sc1", "s_waitcnt", ""); +ROCPRIM_ASM_THREAD_STORE_GROUP(store_wt, "sc0 sc1", "s_waitcnt", "vmcnt"); +ROCPRIM_ASM_THREAD_STORE_GROUP(store_volatile, "sc0 sc1", "s_waitcnt", "vmcnt"); #elif defined(__gfx942__) 
-ROCPRIM_ASM_THREAD_STORE_GROUP(store_wb, "sc0", ""); -ROCPRIM_ASM_THREAD_STORE_GROUP(store_cg, "sc0 nt", ""); -ROCPRIM_ASM_THREAD_STORE_GROUP(store_wt, "sc0", "vmcnt"); -ROCPRIM_ASM_THREAD_STORE_GROUP(store_volatile, "sc0", "vmcnt"); +ROCPRIM_ASM_THREAD_STORE_GROUP(store_wb, "sc0", "s_waitcnt", ""); +ROCPRIM_ASM_THREAD_STORE_GROUP(store_cg, "sc0 nt", "s_waitcnt", ""); +ROCPRIM_ASM_THREAD_STORE_GROUP(store_wt, "sc0", "s_waitcnt", "vmcnt"); +ROCPRIM_ASM_THREAD_STORE_GROUP(store_volatile, "sc0", "s_waitcnt", "vmcnt"); +#elif defined(__gfx1200__) || defined(__gfx1201__) +ROCPRIM_ASM_THREAD_STORE_GROUP(store_wb, "scope:SCOPE_DEV", "s_wait_storecnt_dscnt", ""); // TODO(review): gfx120x validation (note was copied from the gfx94x branch; confirm intent) +ROCPRIM_ASM_THREAD_STORE_GROUP(store_cg, "th:TH_DEFAULT scope:SCOPE_DEV", "s_wait_storecnt_dscnt", ""); +ROCPRIM_ASM_THREAD_STORE_GROUP(store_wt, "scope:SCOPE_DEV", "s_wait_storecnt_dscnt", ""); +ROCPRIM_ASM_THREAD_STORE_GROUP(store_volatile, "scope:SCOPE_DEV", "s_wait_storecnt_dscnt", ""); #else -ROCPRIM_ASM_THREAD_STORE_GROUP(store_wb, "glc", ""); -ROCPRIM_ASM_THREAD_STORE_GROUP(store_cg, "glc slc", ""); -ROCPRIM_ASM_THREAD_STORE_GROUP(store_wt, "glc", "vmcnt"); -ROCPRIM_ASM_THREAD_STORE_GROUP(store_volatile, "glc", "vmcnt"); +ROCPRIM_ASM_THREAD_STORE_GROUP(store_wb, "glc", "s_waitcnt", ""); +ROCPRIM_ASM_THREAD_STORE_GROUP(store_cg, "glc slc", "s_waitcnt", ""); +ROCPRIM_ASM_THREAD_STORE_GROUP(store_wt, "glc", "s_waitcnt", "vmcnt"); +ROCPRIM_ASM_THREAD_STORE_GROUP(store_volatile, "glc", "s_waitcnt", "vmcnt"); #endif // TODO find correct modifiers to match these -ROCPRIM_ASM_THREAD_STORE_GROUP(store_cs, "", ""); +ROCPRIM_ASM_THREAD_STORE_GROUP(store_cs, "", "s_waitcnt", ""); #endif // __HIP_CPU_RT__ #endif diff --git a/test/rocprim/test_texture_cache_iterator.cpp b/test/rocprim/test_texture_cache_iterator.cpp index 62d91abe0..8b0e5b24a 100644 --- a/test/rocprim/test_texture_cache_iterator.cpp +++ b/test/rocprim/test_texture_cache_iterator.cpp @@ -74,9 +74,9 @@ 
TYPED_TEST(RocprimTextureCacheIteratorTests, Transform) hipDeviceProp_t props; HIP_CHECK(hipGetDeviceProperties(&props, device_id)); std::string deviceName = std::string(props.gcnArchName); - if (deviceName.rfind("gfx94", 0) == 0) { - // This is a gfx94x device, so skip this test - GTEST_SKIP() << "Test not run on gfx94x as texture cache API is not supported"; + if (deviceName.rfind("gfx94", 0) == 0 || deviceName.rfind("gfx120", 0) == 0) { + // This is a gfx94x or gfx120x device (prefix match anchored at index 0), so skip this test + GTEST_SKIP() << "Test not run on gfx94x or gfx120x as texture cache API is not supported"; } HIP_CHECK(hipSetDevice(device_id));