Fix ROCm build: guard on USE_ROCM instead of ROCM_VERSION and install the required ROCm dev packages

2023-08-04 15:00:12 +00:00 · 2023-08-04 15:00:12 +00:00 · c203a85dee
commit c203a85dee
parent d0608b09db
8 changed files with 13 additions and 11 deletions
--- a/.github/workflows/build_wheels_rocm.yml
+++ b/.github/workflows/build_wheels_rocm.yml
@@ -69,7 +69,7 @@ jobs:
      - name: Install dependencies
        run: |
          sudo apt-get update
-          sudo apt-get install -y --no-install-recommends rocthrust-dev
+          sudo apt-get install -y --no-install-recommends rocsparse-dev rocthrust-dev rocblas-dev hipblas-dev hipsparse-dev

          python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm${{ matrix.rocm }}
          python -m pip install --upgrade build setuptools wheel ninja
--- a/README.md
+++ b/README.md
@@ -105,6 +105,8 @@ To install from source for AMD GPUs supporting RoCm, please specify the `ROCM_VE
 ROCM_VERSION=5.6 pip install .
 ```

+For ROCm systems, the packages `rocsparse-dev`, `hipsparse-dev`, `rocthrust-dev`, `rocblas-dev` and `hipblas-dev` must be installed to build from source.
+
 </details>

 ## Quick Tour
--- a/autogptq_cuda/autogptq_cuda_kernel_256.cu
+++ b/autogptq_cuda/autogptq_cuda_kernel_256.cu
@@ -30,7 +30,7 @@
 // }
 // #endif

-#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700) || defined(ROCM_VERSION)
+#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700) || defined(USE_ROCM)
 // adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh

 __device__ __forceinline__ void atomicAdd(c10::Half* address, c10::Half val) {
--- a/autogptq_cuda/autogptq_cuda_kernel_64.cu
+++ b/autogptq_cuda/autogptq_cuda_kernel_64.cu
@@ -31,7 +31,7 @@
 // #endif


-#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700) || defined(ROCM_VERSION)
+#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700) || defined(USE_ROCM)
 // adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh
 __device__ __forceinline__ void atomicAdd(c10::Half* address, c10::Half val) {
    unsigned int *address_as_ui = reinterpret_cast<unsigned int *>(reinterpret_cast<char *>(address) - (reinterpret_cast<size_t>(address) & 2));
--- a/autogptq_cuda/exllama/cuda_compat.cuh
+++ b/autogptq_cuda/exllama/cuda_compat.cuh
@@ -43,12 +43,12 @@ __device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val)

 //

-#if defined(__CUDA_ARCH__) || defined(ROCM_VERSION)
-#if __CUDA_ARCH__ < 700 || defined(ROCM_VERSION)
+#if defined(__CUDA_ARCH__) || defined(USE_ROCM)
+#if __CUDA_ARCH__ < 700 || defined(USE_ROCM)

 __device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }

-#if __CUDA_ARCH__ < 600 || defined(ROCM_VERSION)
+#if __CUDA_ARCH__ < 600 || defined(USE_ROCM)
 __device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
 #endif

--- a/autogptq_cuda/exllama/cuda_func/q4_matmul.cu
+++ b/autogptq_cuda/exllama/cuda_func/q4_matmul.cu
@@ -4,9 +4,9 @@
 #include "column_remap.cuh"
 #include "../util.cuh"
 #include "../matrix.cuh"
-#include "../cuda_compat.cuh"
+#include "../cu_compat.cuh"
 #include "../cuda_buffers.cuh"
-#if defined(ROCM_VERSION)
+#if defined(USE_ROCM)
 #include "../hip_compat.cuh"
 #endif

@@ -133,7 +133,7 @@ __global__ void q4_matmul_kernel

    if constexpr (use_half2)
    {
-        half result = __hadd(acc.x, acc.y);
+        half result = __hadd(__low2half(acc), __high2half(acc));
        atomicAdd(out_.item_ptr(x_row, w_column), result);
    }
    else
--- a/autogptq_cuda/exllama/cuda_func/q4_matmul.cuh
+++ b/autogptq_cuda/exllama/cuda_func/q4_matmul.cuh
@@ -13,7 +13,7 @@
 #include "../tuning.h"

 // Workaround for hipify_python using rocblas instead of hipblas.
-#if defined(ROCM_VERSION)
+#if defined(USE_ROCM)
 #include <hipblas/hipblas.h>
 #define rocblas_handle hipblasHandle_t
 #endif
--- a/autogptq_cuda/exllama/util.cuh
+++ b/autogptq_cuda/exllama/util.cuh
@@ -8,7 +8,7 @@
 #include <cstdint>
 #include <cstdio>

-#if defined(ROCM_VERSION)
+#if defined(USE_ROCM)
 #define cudaUnspecified hipErrorUnknown
 #else
 #define cudaUnspecified cudaErrorApiFailureBase