install qigen and move file

qwopqwop200 2023-08-10 10:06:08 +09:00
parent 69cdfe80fd
commit 1b3723a584
23 changed files with 2048 additions and 11 deletions



@@ -0,0 +1,149 @@
# Tiny emitters: each function returns one line of C that uses an AVX/AVX2
# or SSE intrinsic, with `to` as the destination variable name.
def load_int(to, address, const=True):
    """Emit an unaligned 256-bit integer load."""
    qual = "const " if const else ""
    return f"{qual}__m256i {to} = _mm256_loadu_si256({address});"

def load_fp(to, address, const=True):
    """Emit an unaligned 256-bit float load."""
    qual = "const " if const else ""
    return f"{qual}__m256 {to} = _mm256_loadu_ps({address});"

# to = a * b + c
def vfma(to, a, b, c):
    return f"__m256 {to} = _mm256_fmadd_ps({a}, {b}, {c});"

def vsrli(to, a, b):
    return f"const __m256i {to} = _mm256_srli_epi32({a}, {b});"

def vand(to, a, b):
    return f"const __m256i {to} = _mm256_and_si256({a}, {b});"

def vbroadcast_fp(to, a):
    return f"const __m256 {to} = _mm256_set1_ps({a});"

def vbroadcast_int32(to, a):
    return f"__m256i {to} = _mm256_set1_epi32({a});"

def vsetzero(to):
    return f"__m256 {to} = _mm256_setzero_ps();"

def vcvtepi32_ps(to, a):
    return f"const __m256 {to} = _mm256_cvtepi32_ps({a});"

def _256extractf128_ps(to, a, imm):
    return f"const __m128 {to} = _mm256_extractf128_ps({a}, {imm});"

def _256castps256_ps128(to, a):
    return f"const __m128 {to} = _mm256_castps256_ps128({a});"

def _add_ps(to, a, b):
    return f"const __m128 {to} = _mm_add_ps({a}, {b});"

def _movehl_ps(to, a, b):
    return f"const __m128 {to} = _mm_movehl_ps({a}, {b});"

def _shuffle_ps(to, a, b, imm):
    return f"const __m128 {to} = _mm_shuffle_ps({a}, {b}, {imm});"

def _cvtss_f32(to, a):
    return f"const float {to} = _mm_cvtss_f32({a});"
def _reduce8_acc(a, b, c, d, e, f, g, h):
    # Horizontally reduce eight __m256 accumulators to scalars f<name>,
    # stage by stage: 256 -> 128 bits, 128 -> 64, 64 -> 32.
    accs = [a, b, c, d, e, f, g, h]
    res = ""
    for i, acc in enumerate(accs):
        res += _256extractf128_ps(f"hi_quad{i}", acc, 1)
    for i, acc in enumerate(accs):
        res += _256castps256_ps128(f"lo_quad{i}", acc)
    for i in range(8):
        res += _add_ps(f"sum_quad{i}", f"lo_quad{i}", f"hi_quad{i}")
    for i in range(8):
        res += _movehl_ps(f"hi_dual{i}", f"sum_quad{i}", f"sum_quad{i}")
    for i in range(8):
        res += _add_ps(f"sum_dual{i}", f"sum_quad{i}", f"hi_dual{i}")
    for i in range(8):
        res += _shuffle_ps(f"hi{i}", f"sum_dual{i}", f"sum_dual{i}", 0x1)
    for i in range(8):
        res += _add_ps(f"sum{i}", f"sum_dual{i}", f"hi{i}")
    for i, acc in enumerate(accs):
        res += _cvtss_f32(f"f{acc}", f"sum{i}")
    return res
acc_idx = 0

def _reduce_add(a):
    # Same reduction ladder for a single accumulator; the global counter
    # keeps the emitted temporaries uniquely named across calls.
    global acc_idx
    res = ""
    res += _256extractf128_ps(f"hi_quad{acc_idx}", a, 1)
    res += _256castps256_ps128(f"lo_quad{acc_idx}", a)
    res += _add_ps(f"sum_quad{acc_idx}", f"lo_quad{acc_idx}", f"hi_quad{acc_idx}")
    res += _movehl_ps(f"hi_dual{acc_idx}", f"sum_quad{acc_idx}", f"sum_quad{acc_idx}")
    res += _add_ps(f"sum_dual{acc_idx}", f"sum_quad{acc_idx}", f"hi_dual{acc_idx}")
    res += _shuffle_ps(f"hi{acc_idx}", f"sum_dual{acc_idx}", f"sum_dual{acc_idx}", 0x1)
    res += _add_ps(f"sum{acc_idx}", f"sum_dual{acc_idx}", f"hi{acc_idx}")
    res += _cvtss_f32(f"f{a}", f"sum{acc_idx}")
    acc_idx += 1
    return res
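
For orientation, here is a minimal sketch of how these emitters compose. The names w, x, and &weights[0] are illustrative stand-ins, not identifiers from the generated kernels:

    # Appended to this module, this prints a small C fragment: one load,
    # one fused multiply-add, then the 256-bit -> scalar reduction ladder.
    if __name__ == "__main__":
        print(load_fp("w", "&weights[0]"))    # hypothetical weight pointer
        print(vsetzero("acc"))
        print(vfma("acc1", "w", "x", "acc"))  # acc1 = w * x + acc
        print(_reduce_add("acc1"))            # declares the scalar facc1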


@@ -0,0 +1,302 @@
#include <iostream>
#include "forward.h"
#include <cstring>
#include <algorithm>
#include <vector>
#include <chrono>
#include <fstream>

#define mymin(a,b) ((a)<(b)?(a):(b))
#define mymax(a,b) ((a)>(b)?(a):(b))

void print_matrix(std::string name, float* A, int N, int M){
    std::cout << name << std::endl;
    for(int i = 0; i < N; i++){
        for(int j = 0; j < M; j++){
            std::cout << A[i*M+j] << " ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}

// Reference result: triple-loop matmul plus bias.
void oracle_mmadd(float* A, float* B, float* bias, float* C, int n, int m, int t){
    for (int i = 0; i < n; i++){
        for (int j = 0; j < t; j++){
            float sum = 0;
            for (int k = 0; k < m; k++){
                sum += A[i*m+k] * B[k*t+j];
            }
            C[i*t+j] += sum + bias[j];
        }
    }
}
// Per-row, per-group sums of the input: ng groups of size gs per row.
void compute_reduction(float *in, float *out, int n, int m, int gs){
    int ng;
    if(gs == -1){
        ng = 1;
        gs = m;
    }else{
        ng = m/gs;
    }
    for(int i = 0; i < n; i++){
        for(int j0 = 0; j0 < m; j0 += gs){
            int j = j0/gs;
            out[i*ng+j] = 0;
            for(int j1 = j0; j1 < j0+gs; j1++){
                out[i*ng+j] += in[i*m+j1];
            }
        }
    }
}
// Simulated quantization: find per-column, per-group scales and zeros
// (min/max affine scheme), then write the dequantized values back in fp32.
void quantize_sim(float* A, float* BQ, float* scales, float* zeros, int n, int m, int bits, int gs){
    if(gs == -1){
        gs = n;
    }
    float range = (1<<bits) - 1;
    for(int i0 = 0; i0 < n; i0 += gs){
        int row = i0/gs;
        for(int j = 0; j < m; j++){
            float min = A[i0*m + j];
            float max = A[i0*m + j];
            for(int i1 = i0; i1 < i0+gs; i1++){
                min = mymin(min, A[i1*m+j]);
                max = mymax(max, A[i1*m+j]);
            }
            scales[row*m + j] = (max-min)/range;
            zeros[row*m + j] = min;
        }
        for(int j = 0; j < m; j++){
            for (int i1 = i0; i1 < i0+gs; i1++){
                int temp = (A[i1*m+j] - zeros[row*m+j])/scales[row*m+j];
                float val = ((float) temp + zeros[row*m+j]) * scales[row*m+j];
                BQ[i1*m+j] = val;
            }
        }
    }
}
// Real quantization: pack the quantized values of B into 32-bit words.
// For widths that divide 32 evenly, `packed` values go into each word.
// For 3-bit, 32 values span three words, with values 10 and 21 straddling
// word boundaries; this branch assumes gs is a multiple of 32.
void quantize(float* A, int* BQ, float* scales, float* zeros, int n, int m, int bits, int gs){
    if(gs == -1){
        gs = n;
    }
    float range = (1<<bits) - 1;
    int packed = 32 / bits;
    for(int i0 = 0; i0 < n; i0 += gs){
        int row = i0/gs;
        for(int j = 0; j < m; j++){
            float min = A[i0*m + j];
            float max = A[i0*m + j];
            for(int i1 = i0; i1 < i0+gs; i1++){
                min = mymin(min, A[i1*m+j]);
                max = mymax(max, A[i1*m+j]);
            }
            scales[row*m + j] = (max-min)/range;
            zeros[row*m + j] = min;
        }
        for(int j = 0; j < m; j++){
            if(bits == 3){
                for (int i1 = i0; i1 < i0+gs; i1 += 32){
                    uint32_t acc[3] = {0, 0, 0};
                    for (int k = 0; k < 32; k++){
                        uint32_t q = (uint32_t)((A[(i1+k)*m+j] - zeros[row*m+j])/scales[row*m+j]);
                        int bit = 3*k;            // absolute bit offset in the 96-bit group
                        acc[bit/32] |= q << (bit%32);
                        if (bit%32 > 29){         // value straddles a word boundary
                            acc[bit/32 + 1] |= q >> (32 - bit%32);
                        }
                    }
                    for (int w = 0; w < 3; w++){
                        BQ[(3*i1/32 + w)*m+j] = (int)acc[w];
                    }
                }
            }else{
                for (int i1 = i0; i1 < i0+gs; i1 += packed){
                    uint32_t acc = 0;
                    for (int i2 = i1; i2 < i1+packed; i2++){
                        uint32_t temp = (uint32_t)((A[i2*m+j] - zeros[row*m+j])/scales[row*m+j]);
                        acc |= temp << (bits*(i2-i1));
                    }
                    BQ[(i1/packed)*m+j] = (int)acc;
                }
            }
        }
    }
}
int main(int argc, char *argv[]){
    // usage: <binary> n m t bits gs
    if(argc < 6){
        std::cout << "Parameters not given\n";
        return 1;
    }
    int n = atoi(argv[1]);
    int m = atoi(argv[2]);
    int t = atoi(argv[3]);
    int bits = atoi(argv[4]);
    int gs = atoi(argv[5]);
    int ng = (gs == -1) ? 1 : m/gs;

    float* A = new float[n*m];
    float* AB = new float[n*m];
    float* B = new float[m*t];
    float* BQS = new float[m*t];
    float* scales = new float[t*ng];
    float* zeros = new float[t*ng];
    int* BQ = new int[m*t/8];
    int* BQB = new int[m*t/8];
    float* sums = new float[n*ng];
    float* bias = new float[t];
    float* C = new float[n*t];
    float* CB = new float[n*t];
    float* C2 = new float[n*t];

    srand(1);
    for (int i = 0; i < n*m; i++){
        A[i] = (float)rand() / RAND_MAX;
    }
    for (int i = 0; i < t*m; i++){
        B[i] = (float)rand() / RAND_MAX;
    }
    for (int i = 0; i < t; i++){
        bias[i] = (float)rand() / RAND_MAX;
    }
    for (int i = 0; i < n*t; i++){
        C[i] = 0.0;
        C2[i] = 0.0;
    }

    quantize_sim(B, BQS, scales, zeros, m, t, bits, gs);
    quantize(B, BQ, scales, zeros, m, t, bits, gs);
    oracle_mmadd(A, BQS, bias, C, n, m, t);

    pack_input(A, AB);
    pack_qw(BQ, BQB);
    pack_output(C, CB);
    compute_reduction(A, sums, n, m, gs);
    qforward(AB, BQB, scales, zeros, bias, sums, C2, n, m, t);

    // mean squared error against the fp32 oracle
    float norm = 0.0;
    for (int i = 0; i < n*t; i++){
        norm += (C[i] - C2[i]) * (C[i] - C2[i]);
    }

    if(norm / (n*t) < 0.0001){
        int iter = 30;
        // warm-up
        for(int _ = 0; _ < iter; _++){
            qforward(AB, BQB, scales, zeros, bias, sums, C2, n, m, t);
        }
        int num_runs = 15;
        std::vector<long int> runs(num_runs);
        for(int r = 0; r < num_runs; r++){
            auto start = std::chrono::high_resolution_clock::now();
            for(int _ = 0; _ < iter; _++){
                qforward(AB, BQB, scales, zeros, bias, sums, C2, n, m, t);
            }
            auto end = std::chrono::high_resolution_clock::now();
            runs[r] = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count();
        }
        std::sort(runs.begin(), runs.end());
        // per-call nanoseconds, taken from the run just above the median
        float cycles_final = runs[num_runs/2 + 1] / iter;
        std::ofstream outfile;
        outfile.open("./autogptq_extension/qigen/tmp.csv", std::ios_base::app);
        print_parameters();
        outfile << cycles_final << std::endl;
    }else{
        // correctness check failed: report a sentinel so this config is never picked
        float cycles_final = 1e13f;
        std::ofstream outfile;
        outfile.open("./autogptq_extension/qigen/tmp.csv", std::ios_base::app);
        print_parameters();
        outfile << cycles_final << std::endl;
    }
    return 0;
}
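
The shift pattern in the 3-bit branch of quantize() is easiest to verify in isolation. Below is a small, self-contained Python sketch of the same pack/unpack scheme; pack3 and unpack3 are hypothetical helpers written for this note, not part of the commit:

    def pack3(vals):
        # 32 three-bit values -> three 32-bit words; values 10 and 21
        # straddle a word boundary and contribute to two words.
        acc = [0, 0, 0]
        for k, q in enumerate(vals):
            bit = 3 * k
            acc[bit // 32] |= (q << (bit % 32)) & 0xFFFFFFFF
            if bit % 32 > 29:
                acc[bit // 32 + 1] |= q >> (32 - bit % 32)
        return acc

    def unpack3(acc):
        out = []
        for k in range(32):
            bit = 3 * k
            q = acc[bit // 32] >> (bit % 32)
            if bit % 32 > 29:
                q |= acc[bit // 32 + 1] << (32 - bit % 32)
            out.append(q & 0x7)
        return out

    vals = [k % 8 for k in range(32)]
    assert unpack3(pack3(vals)) == vals  # round-trips, boundary values included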


@@ -0,0 +1,85 @@
def includes():
    # Shared prologue for every generated kernel file.
    out = " \
#include <torch/all.h>\n \
#include <torch/python.h>\n \
#include <omp.h>\n \
#include <cmath>\n \
#include <immintrin.h>\n \
\n \
#define mymin(a,b) ((a)<(b)?(a):(b))\n \
#define mymax(a,b) ((a)>(b)?(a):(b))\n \
"
    return out

def module(bits_list=[4, 2]):
    # pybind11 registration block for every generated entry point.
    out = 'PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {\n'
    for bits in bits_list:
        out += ' m.def("forward{}", &forward{}_cpu);\n'.format(bits, bits)
    for bits in bits_list:
        out += ' m.def("unpack_zeros{}", &unpack_zeros{});\n'.format(bits, bits)
    for bits in bits_list:
        out += ' m.def("forward_gs{}", &forward{}_gs_cpu);\n'.format(bits, bits)
    for bits in bits_list:
        out += ' m.def("pack{}", &pack{}_w_cpu);\n'.format(bits, bits)
    out += 'm.def("compute_reduction_cpp", &compute_reduction);\n'
    out += 'm.def("unquantize_sim", &unquantize_sim);\n'
    # if oracle:
    #     out += ' m.def("forward4_oracle", &forward4_oracle_cpu);\n'
    out += 'm.def("quant_scalar_scaled", &quant_scalar_cpu);\n'
    out += '}\n'
    return out

def quant_scalar():
    # Scalar quantizer plus its torch-facing wrapper, emitted as C++ source.
    out = " \
void quantize_scalar(float* A, int* BQ, float* scales, float* zeros, int n, int m, int bits){ \n \
    // pack `pack` quantized values per 32-bit word, column by column \n \
    int pack = 32/bits;\n \
    for (int j = 0; j < m; j++){\n \
        for (int i = 0; i < n; i+=pack){\n \
            uint32_t acc = 0;\n \
            for (int ii = i; ii < i+pack; ii++){\n \
                float ftemp = std::round((A[ii*m+j] + zeros[j])/scales[j]);\n \
                int temp = (int)ftemp;\n \
                acc = acc | (temp << (bits*(ii-i)));\n \
            }\n \
            BQ[(i/pack)*m+j] = acc;\n \
        }\n \
    }\n \
}\n \
\n \
void quant_scalar_cpu(\n \
    torch::Tensor in, torch::Tensor out, \n \
    torch::Tensor scales, torch::Tensor zeros, int bits\n \
) {\n \
    int N = in.size(0);\n \
    int M = in.size(1);\n \
\n \
    float* input = in.data_ptr<float>(); \n \
    float* s = scales.data_ptr<float>();\n \
    float* z = zeros.data_ptr<float>();\n \
    int* O = out.data_ptr<int>();\n \
\n \
    quantize_scalar(input, O, s, z, N, M, bits);\n \
}\n"
    return out
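
As a quick sanity check of the template output, print(module([4])) produces this registration block (the exact output of the function above; the mixed indentation mirrors the format strings verbatim, and pybind11 does not care either way):

    PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
     m.def("forward4", &forward4_cpu);
     m.def("unpack_zeros4", &unpack_zeros4);
     m.def("forward_gs4", &forward4_gs_cpu);
     m.def("pack4", &pack4_w_cpu);
    m.def("compute_reduction_cpp", &compute_reduction);
    m.def("unquantize_sim", &unquantize_sim);
    m.def("quant_scalar_scaled", &quant_scalar_cpu);
    }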


@@ -1,8 +1,12 @@
 import os
 import sys
 from pathlib import Path
-from setuptools import setup, find_packages
+from setuptools import setup, Extension, find_packages
+import subprocess
+import math
+
+os.environ["CC"] = "g++"
+os.environ["CXX"] = "g++"
 
 common_setup_kwargs = {
     "version": "0.4.0",
@@ -66,12 +70,15 @@ if BUILD_CUDA_EXT:
 requirements = [
     "accelerate>=0.19.0",
     "datasets",
+    "sentencepiece",
     "numpy",
     "rouge",
+    "gekko",
     "torch>=1.13.0",
     "safetensors",
     "transformers>=4.31.0",
-    "peft"
+    "peft",
+    "tqdm",
 ]
 
 extras_require = {
@@ -85,6 +92,9 @@ additional_setup_kwargs = dict()
 if BUILD_CUDA_EXT:
     from torch.utils import cpp_extension
 
+    p = int(subprocess.run("cat /proc/cpuinfo | grep cores | head -1", shell=True, check=True, text=True, stdout=subprocess.PIPE).stdout.split(" ")[2])
+    subprocess.call(["python", "./autogptq_extension/qigen/generate.py", "--module", "--search", "--p", str(p)])
+
     if not ROCM_VERSION:
         from distutils.sysconfig import get_python_lib
         conda_cuda_include_dir = os.path.join(get_python_lib(), "nvidia/cuda_runtime/include")
@@ -97,16 +107,23 @@ if BUILD_CUDA_EXT:
             cpp_extension.CUDAExtension(
                 "autogptq_cuda_64",
                 [
-                    "autogptq_cuda/autogptq_cuda_64.cpp",
-                    "autogptq_cuda/autogptq_cuda_kernel_64.cu"
+                    "autogptq_extension/cuda_64/autogptq_cuda_64.cpp",
+                    "autogptq_extension/cuda_64/autogptq_cuda_kernel_64.cu"
                 ]
             ),
             cpp_extension.CUDAExtension(
                 "autogptq_cuda_256",
                 [
-                    "autogptq_cuda/autogptq_cuda_256.cpp",
-                    "autogptq_cuda/autogptq_cuda_kernel_256.cu"
+                    "autogptq_extension/cuda_256/autogptq_cuda_256.cpp",
+                    "autogptq_extension/cuda_256/autogptq_cuda_kernel_256.cu"
                 ]
+            ),
+            cpp_extension.CppExtension(
+                "cQIGen",
+                [
+                    'autogptq_extension/qigen/backend.cpp'
+                ],
+                extra_compile_args = ["-O3", "-mavx", "-mavx2", "-mfma", "-march=native", "-ffast-math", "-ftree-vectorize", "-faligned-new", "-std=c++17", "-fopenmp", "-fno-signaling-nans", "-fno-trapping-math"]
             )
         ]
@@ -115,11 +132,11 @@ if BUILD_CUDA_EXT:
         cpp_extension.CUDAExtension(
             "exllama_kernels",
             [
-                "autogptq_cuda/exllama/exllama_ext.cpp",
-                "autogptq_cuda/exllama/cuda_buffers.cu",
-                "autogptq_cuda/exllama/cuda_func/column_remap.cu",
-                "autogptq_cuda/exllama/cuda_func/q4_matmul.cu",
-                "autogptq_cuda/exllama/cuda_func/q4_matrix.cu"
+                "autogptq_extension/exllama/exllama_ext.cpp",
+                "autogptq_extension/exllama/cuda_buffers.cu",
+                "autogptq_extension/exllama/cuda_func/column_remap.cu",
+                "autogptq_extension/exllama/cuda_func/q4_matmul.cu",
+                "autogptq_extension/exllama/cuda_func/q4_matrix.cu"
             ]
         )
     )
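
One note on the core-count probe added to setup.py: parsing /proc/cpuinfo is Linux-only and reads the "cpu cores" field of the first processor entry (physical cores per socket). A more portable sketch, offered as an alternative rather than what the commit does:

    import os
    p = os.cpu_count() or 1  # logical cores; a conservative stand-in for the grep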