fix
This commit is contained in:
parent
d0608b09db
commit
c203a85dee
8 changed files with 13 additions and 11 deletions
2
.github/workflows/build_wheels_rocm.yml
vendored
2
.github/workflows/build_wheels_rocm.yml
vendored
|
@ -69,7 +69,7 @@ jobs:
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
sudo apt-get update
|
sudo apt-get update
|
||||||
sudo apt-get install -y --no-install-recommends rocthrust-dev
|
sudo apt-get install -y --no-install-recommends rocsparse-dev rocthrust-dev rocblas-dev hipblas-dev hipsparse-dev
|
||||||
|
|
||||||
python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm${{ matrix.rocm }}
|
python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm${{ matrix.rocm }}
|
||||||
python -m pip install --upgrade build setuptools wheel ninja
|
python -m pip install --upgrade build setuptools wheel ninja
|
||||||
|
|
|
@ -105,6 +105,8 @@ To install from source for AMD GPUs supporting RoCm, please specify the `ROCM_VE
|
||||||
ROCM_VERSION=5.6 pip install .
|
ROCM_VERSION=5.6 pip install .
|
||||||
```
|
```
|
||||||
|
|
||||||
|
For RoCm systems, the packages `rocsparse-dev`, `hipsparse-dev`, `rocthrust-dev`, `rocblas-dev` and `hipblas-dev` are required to build.
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
## Quick Tour
|
## Quick Tour
|
||||||
|
|
|
@ -30,7 +30,7 @@
|
||||||
// }
|
// }
|
||||||
// #endif
|
// #endif
|
||||||
|
|
||||||
#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700) || defined(ROCM_VERSION)
|
#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700) || defined(USE_ROCM)
|
||||||
// adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh
|
// adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh
|
||||||
|
|
||||||
__device__ __forceinline__ void atomicAdd(c10::Half* address, c10::Half val) {
|
__device__ __forceinline__ void atomicAdd(c10::Half* address, c10::Half val) {
|
||||||
|
|
|
@ -31,7 +31,7 @@
|
||||||
// #endif
|
// #endif
|
||||||
|
|
||||||
|
|
||||||
#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700) || defined(ROCM_VERSION)
|
#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700) || defined(USE_ROCM)
|
||||||
// adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh
|
// adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh
|
||||||
__device__ __forceinline__ void atomicAdd(c10::Half* address, c10::Half val) {
|
__device__ __forceinline__ void atomicAdd(c10::Half* address, c10::Half val) {
|
||||||
unsigned int *address_as_ui = reinterpret_cast<unsigned int *>(reinterpret_cast<char *>(address) - (reinterpret_cast<size_t>(address) & 2));
|
unsigned int *address_as_ui = reinterpret_cast<unsigned int *>(reinterpret_cast<char *>(address) - (reinterpret_cast<size_t>(address) & 2));
|
||||||
|
|
|
@ -43,12 +43,12 @@ __device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val)
|
||||||
|
|
||||||
//
|
//
|
||||||
|
|
||||||
#if defined(__CUDA_ARCH__) || defined(ROCM_VERSION)
|
#if defined(__CUDA_ARCH__) || defined(USE_ROCM)
|
||||||
#if __CUDA_ARCH__ < 700 || defined(ROCM_VERSION)
|
#if __CUDA_ARCH__ < 700 || defined(USE_ROCM)
|
||||||
|
|
||||||
__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
|
__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
|
||||||
|
|
||||||
#if __CUDA_ARCH__ < 600 || defined(ROCM_VERSION)
|
#if __CUDA_ARCH__ < 600 || defined(USE_ROCM)
|
||||||
__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
|
__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -4,9 +4,9 @@
|
||||||
#include "column_remap.cuh"
|
#include "column_remap.cuh"
|
||||||
#include "../util.cuh"
|
#include "../util.cuh"
|
||||||
#include "../matrix.cuh"
|
#include "../matrix.cuh"
|
||||||
#include "../cuda_compat.cuh"
|
#include "../cu_compat.cuh"
|
||||||
#include "../cuda_buffers.cuh"
|
#include "../cuda_buffers.cuh"
|
||||||
#if defined(ROCM_VERSION)
|
#if defined(USE_ROCM)
|
||||||
#include "../hip_compat.cuh"
|
#include "../hip_compat.cuh"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -133,7 +133,7 @@ __global__ void q4_matmul_kernel
|
||||||
|
|
||||||
if constexpr (use_half2)
|
if constexpr (use_half2)
|
||||||
{
|
{
|
||||||
half result = __hadd(acc.x, acc.y);
|
half result = __hadd(__low2half(acc), __high2half(acc));
|
||||||
atomicAdd(out_.item_ptr(x_row, w_column), result);
|
atomicAdd(out_.item_ptr(x_row, w_column), result);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
|
|
@ -13,7 +13,7 @@
|
||||||
#include "../tuning.h"
|
#include "../tuning.h"
|
||||||
|
|
||||||
// Workaround for hipify_python using rocblas instead of hipblas.
|
// Workaround for hipify_python using rocblas instead of hipblas.
|
||||||
#if defined(ROCM_VERSION)
|
#if defined(USE_ROCM)
|
||||||
#include <hipblas/hipblas.h>
|
#include <hipblas/hipblas.h>
|
||||||
#define rocblas_handle hipblasHandle_t
|
#define rocblas_handle hipblasHandle_t
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -8,7 +8,7 @@
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
|
|
||||||
#if defined(ROCM_VERSION)
|
#if defined(USE_ROCM)
|
||||||
#define cudaUnspecified hipErrorUnknown
|
#define cudaUnspecified hipErrorUnknown
|
||||||
#else
|
#else
|
||||||
#define cudaUnspecified cudaErrorApiFailureBase
|
#define cudaUnspecified cudaErrorApiFailureBase
|
||||||
|
|
Loading…
Add table
Reference in a new issue