This commit is contained in:
Felix Marty 2023-08-04 15:00:12 +00:00
parent d0608b09db
commit c203a85dee
8 changed files with 13 additions and 11 deletions

View file

@ -69,7 +69,7 @@ jobs:
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends rocthrust-dev
sudo apt-get install -y --no-install-recommends rocsparse-dev rocthrust-dev rocblas-dev hipblas-dev hipsparse-dev
python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm${{ matrix.rocm }}
python -m pip install --upgrade build setuptools wheel ninja

View file

@ -105,6 +105,8 @@ To install from source for AMD GPUs supporting RoCm, please specify the `ROCM_VE
ROCM_VERSION=5.6 pip install .
```
For ROCm systems, the packages `rocsparse-dev`, `hipsparse-dev`, `rocthrust-dev`, `rocblas-dev` and `hipblas-dev` are required to build.
</details>
## Quick Tour

View file

@ -30,7 +30,7 @@
// }
// #endif
#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700) || defined(ROCM_VERSION)
#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700) || defined(USE_ROCM)
// adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh
__device__ __forceinline__ void atomicAdd(c10::Half* address, c10::Half val) {

View file

@ -31,7 +31,7 @@
// #endif
#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700) || defined(ROCM_VERSION)
#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700) || defined(USE_ROCM)
// adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh
__device__ __forceinline__ void atomicAdd(c10::Half* address, c10::Half val) {
unsigned int *address_as_ui = reinterpret_cast<unsigned int *>(reinterpret_cast<char *>(address) - (reinterpret_cast<size_t>(address) & 2));

View file

@ -43,12 +43,12 @@ __device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val)
//
#if defined(__CUDA_ARCH__) || defined(ROCM_VERSION)
#if __CUDA_ARCH__ < 700 || defined(ROCM_VERSION)
#if defined(__CUDA_ARCH__) || defined(USE_ROCM)
#if __CUDA_ARCH__ < 700 || defined(USE_ROCM)
// atomicAdd overload for half: forwards both arguments unchanged to atomicAdd_half.
__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
#if __CUDA_ARCH__ < 600 || defined(ROCM_VERSION)
#if __CUDA_ARCH__ < 600 || defined(USE_ROCM)
// atomicAdd overload for half2: forwards both arguments unchanged to atomicAdd_half2.
__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
#endif

View file

@ -4,9 +4,9 @@
#include "column_remap.cuh"
#include "../util.cuh"
#include "../matrix.cuh"
#include "../cuda_compat.cuh"
#include "../cu_compat.cuh"
#include "../cuda_buffers.cuh"
#if defined(ROCM_VERSION)
#if defined(USE_ROCM)
#include "../hip_compat.cuh"
#endif
@ -133,7 +133,7 @@ __global__ void q4_matmul_kernel
if constexpr (use_half2)
{
half result = __hadd(acc.x, acc.y);
half result = __hadd(__low2half(acc), __high2half(acc));
atomicAdd(out_.item_ptr(x_row, w_column), result);
}
else

View file

@ -13,7 +13,7 @@
#include "../tuning.h"
// Workaround for hipify_python using rocblas instead of hipblas.
#if defined(ROCM_VERSION)
#if defined(USE_ROCM)
#include <hipblas/hipblas.h>
#define rocblas_handle hipblasHandle_t
#endif

View file

@ -8,7 +8,7 @@
#include <cstdint>
#include <cstdio>
#if defined(ROCM_VERSION)
#if defined(USE_ROCM)
#define cudaUnspecified hipErrorUnknown
#else
#define cudaUnspecified cudaErrorApiFailureBase