New coder model, also move some to pascal again
parent aa392f5bea
commit 4d3bea71a8
3 changed files with 47 additions and 35 deletions
config.toml (60 changed lines)
@@ -24,20 +24,20 @@ openai_key="12345"
 model_dir = "/app/models/TheBloke" # should this be automatic? part of the top level?

 [text_gen.models.guard1]
-image_name="gitea.simcop2387.info/simcop2387/text-gen-python-ampere-base"
+image_name="gitea.simcop2387.info/simcop2387/text-gen-python-pascal-base"
 api = true
 auto_devices = true
 chat_buttons = true
-desc_act = true
-disable_exllama = true
+desc_act = false
+disable_exllama = false
 extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
 families = ["guard"]
-gpu_list = [6]
+gpu_list = [4]
 listen = true
 loader = "llamacpp"
 memory_split = [20]
 model = "LlamaGuard-7B-GGUF"
-no_inject_fused_attention = true
+no_inject_fused_attention = false
 no_use_cuda_fp16 = false
 openai_api = true
 trust_remote_code = true
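
Since the remaining config.toml hunks start in the middle of their model blocks, here is the full guard1 block as it reads once this hunk is applied. This is purely a readability aid: every value is copied from the context and "+" lines above, with the inline TODO comment omitted.

[text_gen.models.guard1]
image_name="gitea.simcop2387.info/simcop2387/text-gen-python-pascal-base"
api = true
auto_devices = true
chat_buttons = true
desc_act = false
disable_exllama = false
extensions = ["api", "openai"]
families = ["guard"]
gpu_list = [4]
listen = true
loader = "llamacpp"
memory_split = [20]
model = "LlamaGuard-7B-GGUF"
no_inject_fused_attention = false
no_use_cuda_fp16 = false
openai_api = true
trust_remote_code = true
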
@@ -54,16 +54,16 @@ openai_key="12345"
 api = true
 auto_devices = true
 chat_buttons = true
-desc_act = true
-disable_exllama = true
+desc_act = false
+disable_exllama = false
 extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
 families = ["chat"]
-gpu_list = [6]
+gpu_list = [3]
 listen = true
 loader = "llamacpp"
 memory_split = [20]
 model = "dolphin-2.6-mistral-7B-GGUF"
-no_inject_fused_attention = true
+no_inject_fused_attention = false
 no_use_cuda_fp16 = false
 openai_api = true
 trust_remote_code = true
@@ -80,16 +80,16 @@ openai_key="12345"
 api = true
 auto_devices = true
 chat_buttons = true
-desc_act = true
-disable_exllama = true
+desc_act = false
+disable_exllama = false
 extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
 families = ["coder", "embeddings"]
 gpu_list = [6]
 listen = true
 loader = "llamacpp"
-memory_split = [20, 20]
-model = "deepseek-coder-33B-instruct-GGUF/deepseek-coder-33b-instruct.Q4_K_M.gguf"
-no_inject_fused_attention = true
+memory_split = [20]
+model = "Codestral-22B-v0.1-GGUF/Codestral-22B-v0.1-Q6_K.gguf"
+no_inject_fused_attention = false
 no_use_cuda_fp16 = false
 openai_api = true
 trust_remote_code = true
@@ -106,8 +106,8 @@ openai_key="12345"
 api = true
 auto_devices = true
 chat_buttons = true
-desc_act = true
-disable_exllama = true
+desc_act = false
+disable_exllama = false
 extensions = ["openai","api"] # TODO this way? or flags for each named extension in each one?
 families = ["embeddings"]
 gpu_list = [5]
@@ -115,7 +115,7 @@ openai_key="12345"
 # loader = "llamacpp"
 memory_split = [2]
 # model = "null"
-no_inject_fused_attention = true
+no_inject_fused_attention = false
 no_use_cuda_fp16 = false
 openai_api = true
 trust_remote_code = true
@@ -133,8 +133,8 @@ openai_key="12345"
 api = true
 auto_devices = true
 chat_buttons = true
-desc_act = true
-disable_exllama = true
+desc_act = false
+disable_exllama = false
 extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
 families = ["mixtral", "embeddings"]
 gpu_list = [0,1,2]
@@ -142,7 +142,7 @@ openai_key="12345"
 loader = "llamacpp"
 memory_split = [14,14,14]
 model = "dolphin-2.5-mixtral-8x7b-GGUF"
-no_inject_fused_attention = true
+no_inject_fused_attention = false
 no_use_cuda_fp16 = false
 openai_api = true
 trust_remote_code = true
@@ -155,20 +155,20 @@ openai_key="12345"
 multi_user = true

 [text_gen.models.tiny1]
-image_name="gitea.simcop2387.info/simcop2387/text-gen-python-ampere-base"
+image_name="gitea.simcop2387.info/simcop2387/text-gen-python-pascal-base"
 api = true
 auto_devices = true
 chat_buttons = true
-desc_act = true
-disable_exllama = true
+desc_act = false
+disable_exllama = false
 extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
 families = ["tiny", "embeddings"]
-gpu_list = [6]
+gpu_list = [4]
 listen = true
 loader = "llamacpp"
 memory_split = [20, 20]
 model = "TinyLlama-1.1B-Chat-v1.0-GGUF"
-no_inject_fused_attention = true
+no_inject_fused_attention = false
 no_use_cuda_fp16 = false
 openai_api = true
 trust_remote_code = true
@@ -181,20 +181,20 @@ openai_key="12345"
 multi_user = true

 [text_gen.models.phi1]
-image_name="gitea.simcop2387.info/simcop2387/text-gen-python-ampere-base"
+image_name="gitea.simcop2387.info/simcop2387/text-gen-python-pascal-base"
 api = true
 auto_devices = true
 chat_buttons = true
-desc_act = true
-disable_exllama = true
+desc_act = false
+disable_exllama = false
 extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
 families = ["phi", "embeddings"]
-gpu_list = [6]
+gpu_list = [5]
 listen = true
 loader = "llamacpp"
 memory_split = [20, 20]
 model = "dolphin-2_6-phi-2-GGUF"
-no_inject_fused_attention = true
+no_inject_fused_attention = false
 no_use_cuda_fp16 = false
 openai_api = true
 trust_remote_code = true
Second changed file — nginx reverse-proxy template (filename not shown in this view):
@@ -57,6 +57,9 @@ server {
     location / {
         proxy_pass http://<%= "$name" %>-<%= $type %>/;
+        proxy_http_version 1.1;

         proxy_set_header Host $host;
         proxy_set_header X-Forwarded-For $remote_addr;
+        proxy_set_header Upgrade $http_upgrade;
+        proxy_set_header Connection $connection_upgrade;
     }
@@ -75,6 +78,9 @@ server {
     location / {
         proxy_pass http://family-<%= "$fam" %>-<%= $type %>/;
+        proxy_http_version 1.1;

         proxy_set_header Host $host;
         proxy_set_header X-Forwarded-For $remote_addr;
+        proxy_set_header Upgrade $http_upgrade;
+        proxy_set_header Connection $connection_upgrade;
     }
@@ -92,6 +98,9 @@ server {
     location / {
         proxy_pass http://lobe-<%= $name %>:3210/;
+        proxy_http_version 1.1;

         proxy_set_header Host $host;
         proxy_set_header X-Forwarded-For $remote_addr;
+        proxy_set_header Upgrade $http_upgrade;
+        proxy_set_header Connection $connection_upgrade;
     }
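
All three location blocks above now send "Connection $connection_upgrade" upstream. nginx does not define $connection_upgrade by itself; it is conventionally produced by a map block in the http context, which this diff does not show. A typical definition, included here only as an assumed prerequisite:

# Assumed to live elsewhere in the http {} context; not part of this commit.
# Mirrors the client's Upgrade header: websocket requests get "upgrade",
# everything else falls back to "close".
map $http_upgrade $connection_upgrade {
    default upgrade;
    ''      close;
}

Together with proxy_http_version 1.1 and the Upgrade header set above, this is the standard pattern for letting WebSocket connections pass through the proxy.
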
Third changed file — Perl module defining class AITextGenModelSettings (filename not shown in this view):
@@ -63,6 +63,7 @@ class AITextGenModelSettings :Struct {
     field $no_use_cuda_fp16 = 0; # TODO boolean, default true
     field $no_inject_fused_attention = undef;
     field $disable_exllama = undef;
     field $multi_user = 0;

+    field $m_name = undef;

@@ -78,18 +79,20 @@ class AITextGenModelSettings :Struct {
        my @args = ("--verbose");

        push @args, "--desc_act" if $self->desc_act;
-       push @args, "--no_use_cuda_fp16" if $self->no_use_cuda_fp16;
+       print "cuda: ",$self->no_use_cuda_fp16, "\n";
+       push @args, "--no_use_cuda_fp16" if $self->no_use_cuda_fp16 eq 'true';
        push @args, "--listen" if $self->listen;
        push @args, "--auto-devices" if $self->auto_devices;
-       push @args, "--disable_exllama" if $self->disable_exllama;
-       push @args, "--no_inject_fused_attention" if $self->no_inject_fused_attention;
+       push @args, "--disable_exllama" if $self->disable_exllama eq 'true';
+       push @args, "--no_inject_fused_attention" if $self->no_inject_fused_attention eq 'true';

-       push @args, "--logits_all" if $self->logits_all;
+       push @args, "--multi-user" if $self->multi_user eq 'true';
+       push @args, "--logits_all" if $self->logits_all eq 'true';
        push @args, "--n-gpu-layers", $self->n_gpu_layers if $self->n_gpu_layers;
        push @args, "--numa" if $self->numa;
        push @args, "--no-mmap" if $self->no_mmap;
        push @args, "--tensor_split", join(",", $self->tensor_split->@*) if $self->tensor_split;
-       push @args, "--no_mul_mat_q" if $self->no_mul_mat_q;
+       push @args, "--no_mul_mat_q" if $self->no_mul_mat_q eq 'true';

        my @mem_split = ($self->memory_split // $conf->model_split)->@*;
        push @args, "--gpu-memory", @mem_split;
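
The switch from bare truthiness checks (if $self->flag) to string comparisons (if $self->flag eq 'true') matters because of how Perl evaluates strings: if the TOML booleans reach these fields as the strings 'true' and 'false' — which the eq comparisons suggest — then 'false' is still a true value, and a bare check would add the flag either way. A small self-contained sketch of that behaviour; the helper name is hypothetical and not part of this commit:

#!/usr/bin/env perl
use v5.36;   # enables strict, warnings, say and sub signatures

# Hypothetical helper illustrating the check used above: only the literal
# string 'true' counts as on; 'false', undef and '' all count as off.
sub flag_is_true ($value) {
    return defined($value) && "$value" eq 'true';
}

say "bare 'false' is truthy: ", ('false' ? "yes" : "no");             # yes — the pitfall
say "flag_is_true('true'):  ", flag_is_true('true')  ? "on" : "off";  # on
say "flag_is_true('false'): ", flag_is_true('false') ? "on" : "off";  # off
say "flag_is_true(undef):   ", flag_is_true(undef)   ? "on" : "off";  # off

If that assumption holds, the checks above that still use bare truthiness (for example --listen and --auto-devices) only behave as intended while their configured values are actually true.
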