This commit is contained in:
Felix Marty 2023-08-04 15:00:12 +00:00
parent d0608b09db
commit c203a85dee
8 changed files with 13 additions and 11 deletions

View file

@ -69,7 +69,7 @@ jobs:
- name: Install dependencies - name: Install dependencies
run: | run: |
sudo apt-get update sudo apt-get update
sudo apt-get install -y --no-install-recommends rocthrust-dev sudo apt-get install -y --no-install-recommends rocsparse-dev rocthrust-dev rocblas-dev hipblas-dev hipsparse-dev
python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm${{ matrix.rocm }} python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm${{ matrix.rocm }}
python -m pip install --upgrade build setuptools wheel ninja python -m pip install --upgrade build setuptools wheel ninja

View file

@ -105,6 +105,8 @@ To install from source for AMD GPUs supporting RoCm, please specify the `ROCM_VE
ROCM_VERSION=5.6 pip install . ROCM_VERSION=5.6 pip install .
``` ```
For RoCm systems, the packages `rocsparse-dev`, `hipsparse-dev`, `rocthrust-dev`, `rocblas-dev` and `hipblas-dev` are required to build.
</details> </details>
## Quick Tour ## Quick Tour

View file

@ -30,7 +30,7 @@
// } // }
// #endif // #endif
#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700) || defined(ROCM_VERSION) #if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700) || defined(USE_ROCM)
// adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh // adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh
__device__ __forceinline__ void atomicAdd(c10::Half* address, c10::Half val) { __device__ __forceinline__ void atomicAdd(c10::Half* address, c10::Half val) {

View file

@ -31,7 +31,7 @@
// #endif // #endif
#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700) || defined(ROCM_VERSION) #if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700) || defined(USE_ROCM)
// adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh // adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh
__device__ __forceinline__ void atomicAdd(c10::Half* address, c10::Half val) { __device__ __forceinline__ void atomicAdd(c10::Half* address, c10::Half val) {
unsigned int *address_as_ui = reinterpret_cast<unsigned int *>(reinterpret_cast<char *>(address) - (reinterpret_cast<size_t>(address) & 2)); unsigned int *address_as_ui = reinterpret_cast<unsigned int *>(reinterpret_cast<char *>(address) - (reinterpret_cast<size_t>(address) & 2));

View file

@ -43,12 +43,12 @@ __device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val)
// //
#if defined(__CUDA_ARCH__) || defined(ROCM_VERSION) #if defined(__CUDA_ARCH__) || defined(USE_ROCM)
#if __CUDA_ARCH__ < 700 || defined(ROCM_VERSION) #if __CUDA_ARCH__ < 700 || defined(USE_ROCM)
__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); } __device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
#if __CUDA_ARCH__ < 600 || defined(ROCM_VERSION) #if __CUDA_ARCH__ < 600 || defined(USE_ROCM)
__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); } __device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
#endif #endif

View file

@ -4,9 +4,9 @@
#include "column_remap.cuh" #include "column_remap.cuh"
#include "../util.cuh" #include "../util.cuh"
#include "../matrix.cuh" #include "../matrix.cuh"
#include "../cuda_compat.cuh" #include "../cu_compat.cuh"
#include "../cuda_buffers.cuh" #include "../cuda_buffers.cuh"
#if defined(ROCM_VERSION) #if defined(USE_ROCM)
#include "../hip_compat.cuh" #include "../hip_compat.cuh"
#endif #endif
@ -133,7 +133,7 @@ __global__ void q4_matmul_kernel
if constexpr (use_half2) if constexpr (use_half2)
{ {
half result = __hadd(acc.x, acc.y); half result = __hadd(__low2half(acc), __high2half(acc));
atomicAdd(out_.item_ptr(x_row, w_column), result); atomicAdd(out_.item_ptr(x_row, w_column), result);
} }
else else

View file

@ -13,7 +13,7 @@
#include "../tuning.h" #include "../tuning.h"
// Workaround for hipify_python using rocblas instead of hipblas. // Workaround for hipify_python using rocblas instead of hipblas.
#if defined(ROCM_VERSION) #if defined(USE_ROCM)
#include <hipblas/hipblas.h> #include <hipblas/hipblas.h>
#define rocblas_handle hipblasHandle_t #define rocblas_handle hipblasHandle_t
#endif #endif

View file

@ -8,7 +8,7 @@
#include <cstdint> #include <cstdint>
#include <cstdio> #include <cstdio>
#if defined(ROCM_VERSION) #if defined(USE_ROCM)
#define cudaUnspecified hipErrorUnknown #define cudaUnspecified hipErrorUnknown
#else #else
#define cudaUnspecified cudaErrorApiFailureBase #define cudaUnspecified cudaErrorApiFailureBase