New coder model, also move some to pascal again

Author: Ryan Voots
Date:   2024-05-30 14:15:44 -04:00
Commit: 4d3bea71a8 (parent aa392f5bea)
3 changed files with 47 additions and 35 deletions

File 1 of 3: text-generation model configuration (TOML)

@@ -24,20 +24,20 @@ openai_key="12345"
 model_dir = "/app/models/TheBloke" # should this be automatic? part of the top level?
 [text_gen.models.guard1]
-image_name="gitea.simcop2387.info/simcop2387/text-gen-python-ampere-base"
+image_name="gitea.simcop2387.info/simcop2387/text-gen-python-pascal-base"
 api = true
 auto_devices = true
 chat_buttons = true
-desc_act = true
-disable_exllama = true
+desc_act = false
+disable_exllama = false
 extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
 families = ["guard"]
-gpu_list = [6]
+gpu_list = [4]
 listen = true
 loader = "llamacpp"
 memory_split = [20]
 model = "LlamaGuard-7B-GGUF"
-no_inject_fused_attention = true
+no_inject_fused_attention = false
 no_use_cuda_fp16 = false
 openai_api = true
 trust_remote_code = true
@@ -54,16 +54,16 @@ openai_key="12345"
 api = true
 auto_devices = true
 chat_buttons = true
-desc_act = true
-disable_exllama = true
+desc_act = false
+disable_exllama = false
 extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
 families = ["chat"]
-gpu_list = [6]
+gpu_list = [3]
 listen = true
 loader = "llamacpp"
 memory_split = [20]
 model = "dolphin-2.6-mistral-7B-GGUF"
-no_inject_fused_attention = true
+no_inject_fused_attention = false
 no_use_cuda_fp16 = false
 openai_api = true
 trust_remote_code = true
@@ -80,16 +80,16 @@ openai_key="12345"
 api = true
 auto_devices = true
 chat_buttons = true
-desc_act = true
-disable_exllama = true
+desc_act = false
+disable_exllama = false
 extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
 families = ["coder", "embeddings"]
 gpu_list = [6]
 listen = true
 loader = "llamacpp"
-memory_split = [20, 20]
-model = "deepseek-coder-33B-instruct-GGUF/deepseek-coder-33b-instruct.Q4_K_M.gguf"
-no_inject_fused_attention = true
+memory_split = [20]
+model = "Codestral-22B-v0.1-GGUF/Codestral-22B-v0.1-Q6_K.gguf"
+no_inject_fused_attention = false
 no_use_cuda_fp16 = false
 openai_api = true
 trust_remote_code = true
@@ -106,8 +106,8 @@ openai_key="12345"
 api = true
 auto_devices = true
 chat_buttons = true
-desc_act = true
-disable_exllama = true
+desc_act = false
+disable_exllama = false
 extensions = ["openai","api"] # TODO this way? or flags for each named extension in each one?
 families = ["embeddings"]
 gpu_list = [5]
@@ -115,7 +115,7 @@ openai_key="12345"
 # loader = "llamacpp"
 memory_split = [2]
 # model = "null"
-no_inject_fused_attention = true
+no_inject_fused_attention = false
 no_use_cuda_fp16 = false
 openai_api = true
 trust_remote_code = true
@@ -133,8 +133,8 @@ openai_key="12345"
 api = true
 auto_devices = true
 chat_buttons = true
-desc_act = true
-disable_exllama = true
+desc_act = false
+disable_exllama = false
 extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
 families = ["mixtral", "embeddings"]
 gpu_list = [0,1,2]
@@ -142,7 +142,7 @@ openai_key="12345"
 loader = "llamacpp"
 memory_split = [14,14,14]
 model = "dolphin-2.5-mixtral-8x7b-GGUF"
-no_inject_fused_attention = true
+no_inject_fused_attention = false
 no_use_cuda_fp16 = false
 openai_api = true
 trust_remote_code = true
@@ -155,20 +155,20 @@ openai_key="12345"
 multi_user = true
 [text_gen.models.tiny1]
-image_name="gitea.simcop2387.info/simcop2387/text-gen-python-ampere-base"
+image_name="gitea.simcop2387.info/simcop2387/text-gen-python-pascal-base"
 api = true
 auto_devices = true
 chat_buttons = true
-desc_act = true
-disable_exllama = true
+desc_act = false
+disable_exllama = false
 extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
 families = ["tiny", "embeddings"]
-gpu_list = [6]
+gpu_list = [4]
 listen = true
 loader = "llamacpp"
 memory_split = [20, 20]
 model = "TinyLlama-1.1B-Chat-v1.0-GGUF"
-no_inject_fused_attention = true
+no_inject_fused_attention = false
 no_use_cuda_fp16 = false
 openai_api = true
 trust_remote_code = true
@@ -181,20 +181,20 @@ openai_key="12345"
 multi_user = true
 [text_gen.models.phi1]
-image_name="gitea.simcop2387.info/simcop2387/text-gen-python-ampere-base"
+image_name="gitea.simcop2387.info/simcop2387/text-gen-python-pascal-base"
 api = true
 auto_devices = true
 chat_buttons = true
-desc_act = true
-disable_exllama = true
+desc_act = false
+disable_exllama = false
 extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
 families = ["phi", "embeddings"]
-gpu_list = [6]
+gpu_list = [5]
 listen = true
 loader = "llamacpp"
 memory_split = [20, 20]
 model = "dolphin-2_6-phi-2-GGUF"
-no_inject_fused_attention = true
+no_inject_fused_attention = false
 no_use_cuda_fp16 = false
 openai_api = true
 trust_remote_code = true
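Note on the model file: the headline change swaps the coder family from deepseek-coder-33B-instruct (Q4_K_M, two 20-unit memory reservations) to Codestral-22B-v0.1 (Q6_K), and memory_split shrinks from [20, 20] to [20] to match. Reading across the hunks, gpu_list and memory_split appear to be parallel arrays, one memory figure per listed device. A minimal sketch of an entry under that assumption (the section name is hypothetical, and the memory unit is not stated anywhere in the diff):

[text_gen.models.coder-example]   # hypothetical name, not from the repo
families = ["coder"]
gpu_list = [6]        # device index to run on
memory_split = [20]   # assumed: one reservation per device in gpu_list
loader = "llamacpp"
model = "Codestral-22B-v0.1-GGUF/Codestral-22B-v0.1-Q6_K.gguf"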

File 2 of 3: nginx reverse-proxy configuration template

@@ -57,6 +57,9 @@ server {
     location / {
         proxy_pass http://<%= "$name" %>-<%= $type %>/;
         proxy_http_version 1.1;
+        proxy_set_header Host $host;
+        proxy_set_header X-Forwarded-For $remote_addr;
         proxy_set_header Upgrade $http_upgrade;
         proxy_set_header Connection $connection_upgrade;
     }
@@ -75,6 +78,9 @@ server {
     location / {
         proxy_pass http://family-<%= "$fam" %>-<%= $type %>/;
         proxy_http_version 1.1;
+        proxy_set_header Host $host;
+        proxy_set_header X-Forwarded-For $remote_addr;
         proxy_set_header Upgrade $http_upgrade;
         proxy_set_header Connection $connection_upgrade;
     }
@@ -92,6 +98,9 @@ server {
     location / {
         proxy_pass http://lobe-<%= $name %>:3210/;
         proxy_http_version 1.1;
+        proxy_set_header Host $host;
+        proxy_set_header X-Forwarded-For $remote_addr;
         proxy_set_header Upgrade $http_upgrade;
         proxy_set_header Connection $connection_upgrade;
     }
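All three proxied locations gain Host and X-Forwarded-For headers, so the upstream apps see the original request host and client address rather than the proxy's. The existing Upgrade/Connection pair relies on a $connection_upgrade variable that nginx does not predefine; it is presumably produced elsewhere in the template by the conventional WebSocket map block, something like:

# Assumed, not shown in this diff: the standard http{}-level map that sets
# $connection_upgrade to "upgrade" when the client sent an Upgrade header
# and to "close" otherwise.
map $http_upgrade $connection_upgrade {
    default upgrade;
    ''      close;
}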

File 3 of 3: Perl model-settings class (AITextGenModelSettings)

@@ -63,6 +63,7 @@ class AITextGenModelSettings :Struct {
     field $no_use_cuda_fp16 = 0; # TODO boolean, default true
     field $no_inject_fused_attention = undef;
     field $disable_exllama = undef;
+    field $multi_user = 0;
     field $m_name = undef;
@@ -78,18 +79,20 @@ class AITextGenModelSettings :Struct {
         my @args = ("--verbose");
         push @args, "--desc_act" if $self->desc_act;
-        push @args, "--no_use_cuda_fp16" if $self->no_use_cuda_fp16;
+        print "cuda: ",$self->no_use_cuda_fp16, "\n";
+        push @args, "--no_use_cuda_fp16" if $self->no_use_cuda_fp16 eq 'true';
         push @args, "--listen" if $self->listen;
         push @args, "--auto-devices" if $self->auto_devices;
-        push @args, "--disable_exllama" if $self->disable_exllama;
-        push @args, "--no_inject_fused_attention" if $self->no_inject_fused_attention;
-        push @args, "--logits_all" if $self->logits_all;
+        push @args, "--disable_exllama" if $self->disable_exllama eq 'true';
+        push @args, "--no_inject_fused_attention" if $self->no_inject_fused_attention eq 'true';
+        push @args, "--multi-user" if $self->multi_user eq 'true';
+        push @args, "--logits_all" if $self->logits_all eq 'true';
         push @args, "--n-gpu-layers", $self->n_gpu_layers if $self->n_gpu_layers;
         push @args, "--numa" if $self->numa;
         push @args, "--no-mmap" if $self->no_mmap;
         push @args, "--tensor_split", join(",", $self->tensor_split->@*) if $self->tensor_split;
-        push @args, "--no_mul_mat_q" if $self->no_mul_mat_q;
+        push @args, "--no_mul_mat_q" if $self->no_mul_mat_q eq 'true';
         my @mem_split = ($self->memory_split // $conf->model_split)->@*;
         push @args, "--gpu-memory", @mem_split;