New coder model, also move some to pascal again

Author: Ryan Voots
Date:   2024-05-30 14:15:44 -04:00
Commit: 4d3bea71a8 (parent aa392f5bea)
3 changed files with 47 additions and 35 deletions

File 1 of 3: text-generation model configuration (TOML)

@@ -24,20 +24,20 @@ openai_key="12345"
 model_dir = "/app/models/TheBloke" # should this be automatic? part of the top level?
 [text_gen.models.guard1]
-image_name="gitea.simcop2387.info/simcop2387/text-gen-python-ampere-base"
+image_name="gitea.simcop2387.info/simcop2387/text-gen-python-pascal-base"
 api = true
 auto_devices = true
 chat_buttons = true
-desc_act = true
-disable_exllama = true
+desc_act = false
+disable_exllama = false
 extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
 families = ["guard"]
-gpu_list = [6]
+gpu_list = [4]
 listen = true
 loader = "llamacpp"
 memory_split = [20]
 model = "LlamaGuard-7B-GGUF"
-no_inject_fused_attention = true
+no_inject_fused_attention = false
 no_use_cuda_fp16 = false
 openai_api = true
 trust_remote_code = true
@@ -54,16 +54,16 @@ openai_key="12345"
 api = true
 auto_devices = true
 chat_buttons = true
-desc_act = true
-disable_exllama = true
+desc_act = false
+disable_exllama = false
 extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
 families = ["chat"]
-gpu_list = [6]
+gpu_list = [3]
 listen = true
 loader = "llamacpp"
 memory_split = [20]
 model = "dolphin-2.6-mistral-7B-GGUF"
-no_inject_fused_attention = true
+no_inject_fused_attention = false
 no_use_cuda_fp16 = false
 openai_api = true
 trust_remote_code = true
@@ -80,16 +80,16 @@ openai_key="12345"
 api = true
 auto_devices = true
 chat_buttons = true
-desc_act = true
-disable_exllama = true
+desc_act = false
+disable_exllama = false
 extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
 families = ["coder", "embeddings"]
 gpu_list = [6]
 listen = true
 loader = "llamacpp"
-memory_split = [20, 20]
-model = "deepseek-coder-33B-instruct-GGUF/deepseek-coder-33b-instruct.Q4_K_M.gguf"
-no_inject_fused_attention = true
+memory_split = [20]
+model = "Codestral-22B-v0.1-GGUF/Codestral-22B-v0.1-Q6_K.gguf"
+no_inject_fused_attention = false
 no_use_cuda_fp16 = false
 openai_api = true
 trust_remote_code = true
@@ -106,8 +106,8 @@ openai_key="12345"
 api = true
 auto_devices = true
 chat_buttons = true
-desc_act = true
-disable_exllama = true
+desc_act = false
+disable_exllama = false
 extensions = ["openai","api"] # TODO this way? or flags for each named extension in each one?
 families = ["embeddings"]
 gpu_list = [5]
@@ -115,7 +115,7 @@ openai_key="12345"
 # loader = "llamacpp"
 memory_split = [2]
 # model = "null"
-no_inject_fused_attention = true
+no_inject_fused_attention = false
 no_use_cuda_fp16 = false
 openai_api = true
 trust_remote_code = true
@@ -133,8 +133,8 @@ openai_key="12345"
 api = true
 auto_devices = true
 chat_buttons = true
-desc_act = true
-disable_exllama = true
+desc_act = false
+disable_exllama = false
 extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
 families = ["mixtral", "embeddings"]
 gpu_list = [0,1,2]
@@ -142,7 +142,7 @@ openai_key="12345"
 loader = "llamacpp"
 memory_split = [14,14,14]
 model = "dolphin-2.5-mixtral-8x7b-GGUF"
-no_inject_fused_attention = true
+no_inject_fused_attention = false
 no_use_cuda_fp16 = false
 openai_api = true
 trust_remote_code = true
@@ -155,20 +155,20 @@ openai_key="12345"
 multi_user = true
 [text_gen.models.tiny1]
-image_name="gitea.simcop2387.info/simcop2387/text-gen-python-ampere-base"
+image_name="gitea.simcop2387.info/simcop2387/text-gen-python-pascal-base"
 api = true
 auto_devices = true
 chat_buttons = true
-desc_act = true
-disable_exllama = true
+desc_act = false
+disable_exllama = false
 extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
 families = ["tiny", "embeddings"]
-gpu_list = [6]
+gpu_list = [4]
 listen = true
 loader = "llamacpp"
 memory_split = [20, 20]
 model = "TinyLlama-1.1B-Chat-v1.0-GGUF"
-no_inject_fused_attention = true
+no_inject_fused_attention = false
 no_use_cuda_fp16 = false
 openai_api = true
 trust_remote_code = true
@@ -181,20 +181,20 @@ openai_key="12345"
 multi_user = true
 [text_gen.models.phi1]
-image_name="gitea.simcop2387.info/simcop2387/text-gen-python-ampere-base"
+image_name="gitea.simcop2387.info/simcop2387/text-gen-python-pascal-base"
 api = true
 auto_devices = true
 chat_buttons = true
-desc_act = true
-disable_exllama = true
+desc_act = false
+disable_exllama = false
 extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
 families = ["phi", "embeddings"]
-gpu_list = [6]
+gpu_list = [5]
 listen = true
 loader = "llamacpp"
 memory_split = [20, 20]
 model = "dolphin-2_6-phi-2-GGUF"
-no_inject_fused_attention = true
+no_inject_fused_attention = false
 no_use_cuda_fp16 = false
 openai_api = true
 trust_remote_code = true
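Note on the model file: the headline change swaps the coder family from deepseek-coder-33B-instruct (Q4_K_M, two 20-unit memory reservations) to Codestral-22B-v0.1 (Q6_K), and memory_split shrinks from [20, 20] to [20] to match. Reading across the hunks, gpu_list and memory_split appear to be parallel arrays, one memory figure per listed device. A minimal sketch of an entry under that assumption (the section name is hypothetical, and the memory unit is not stated anywhere in the diff):

[text_gen.models.coder-example]   # hypothetical name, not from the repo
families = ["coder"]
gpu_list = [6]        # device index to run on
memory_split = [20]   # assumed: one reservation per device in gpu_list
loader = "llamacpp"
model = "Codestral-22B-v0.1-GGUF/Codestral-22B-v0.1-Q6_K.gguf"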

File 2 of 3: nginx reverse-proxy configuration template

@@ -57,6 +57,9 @@ server {
     location / {
         proxy_pass http://<%= "$name" %>-<%= $type %>/;
         proxy_http_version 1.1;
+        proxy_set_header Host $host;
+        proxy_set_header X-Forwarded-For $remote_addr;
         proxy_set_header Upgrade $http_upgrade;
         proxy_set_header Connection $connection_upgrade;
     }
@@ -75,6 +78,9 @@ server {
     location / {
         proxy_pass http://family-<%= "$fam" %>-<%= $type %>/;
         proxy_http_version 1.1;
+        proxy_set_header Host $host;
+        proxy_set_header X-Forwarded-For $remote_addr;
         proxy_set_header Upgrade $http_upgrade;
         proxy_set_header Connection $connection_upgrade;
     }
@@ -92,6 +98,9 @@ server {
     location / {
         proxy_pass http://lobe-<%= $name %>:3210/;
         proxy_http_version 1.1;
+        proxy_set_header Host $host;
+        proxy_set_header X-Forwarded-For $remote_addr;
         proxy_set_header Upgrade $http_upgrade;
         proxy_set_header Connection $connection_upgrade;
     }
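All three proxied locations gain Host and X-Forwarded-For headers, so the upstream apps see the original request host and client address rather than the proxy's. The existing Upgrade/Connection pair relies on a $connection_upgrade variable that nginx does not predefine; it is presumably produced elsewhere in the template by the conventional WebSocket map block, something like:

# Assumed, not shown in this diff: the standard http{}-level map that sets
# $connection_upgrade to "upgrade" when the client sent an Upgrade header
# and to "close" otherwise.
map $http_upgrade $connection_upgrade {
    default upgrade;
    ''      close;
}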

File 3 of 3: Perl model-settings class (AITextGenModelSettings)

@@ -63,6 +63,7 @@ class AITextGenModelSettings :Struct {
     field $no_use_cuda_fp16 = 0; # TODO boolean, default true
     field $no_inject_fused_attention = undef;
     field $disable_exllama = undef;
+    field $multi_user = 0;
     field $m_name = undef;
@@ -78,18 +79,20 @@ class AITextGenModelSettings :Struct {
         my @args = ("--verbose");
         push @args, "--desc_act" if $self->desc_act;
-        push @args, "--no_use_cuda_fp16" if $self->no_use_cuda_fp16;
+        print "cuda: ",$self->no_use_cuda_fp16, "\n";
+        push @args, "--no_use_cuda_fp16" if $self->no_use_cuda_fp16 eq 'true';
         push @args, "--listen" if $self->listen;
         push @args, "--auto-devices" if $self->auto_devices;
-        push @args, "--disable_exllama" if $self->disable_exllama;
-        push @args, "--no_inject_fused_attention" if $self->no_inject_fused_attention;
-        push @args, "--logits_all" if $self->logits_all;
+        push @args, "--disable_exllama" if $self->disable_exllama eq 'true';
+        push @args, "--no_inject_fused_attention" if $self->no_inject_fused_attention eq 'true';
+        push @args, "--multi-user" if $self->multi_user eq 'true';
+        push @args, "--logits_all" if $self->logits_all eq 'true';
         push @args, "--n-gpu-layers", $self->n_gpu_layers if $self->n_gpu_layers;
         push @args, "--numa" if $self->numa;
         push @args, "--no-mmap" if $self->no_mmap;
         push @args, "--tensor_split", join(",", $self->tensor_split->@*) if $self->tensor_split;
-        push @args, "--no_mul_mat_q" if $self->no_mul_mat_q;
+        push @args, "--no_mul_mat_q" if $self->no_mul_mat_q eq 'true';
         my @mem_split = ($self->memory_split // $conf->model_split)->@*;
         push @args, "--gpu-memory", @mem_split;