Adding lobe chat support

commit ea8b7fccbe, parent d385272ab6
4 changed files with 183 additions and 60 deletions
build.pl (2 changes)

@@ -37,4 +37,4 @@ for my $file (path("files")->children(qr/\.tmpl/)) {
     $output_file->spew_utf8($data);
 }
 
-data_print($conf->text_gen->models->{general1}->gen_cli($conf))
+#data_print($conf->text_gen->models->{general1}->gen_cli($conf))
config.toml (221 changes)

@@ -6,40 +6,24 @@
 # Todo mounts?
 model_dir = "/app/models/TheBloke" # should this be automatic? part of the top level?
 
-#[text_gen.models.highcontext]
-# api = true
-# auto_devices = true
-# chat_buttons = true
-# desc_act = true
-# disable_exllama = true
-# extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
-# families = ["highcontext"]
-# gpu_list = []
-# listen = true
-# loader = "llamacpp"
-# memory_split = [0]
-# model = "Wizard-Vicuna-13B-Uncensored-SuperHOT-8K-GPTQ"
-# no_inject_fused_attention = true
-# no_use_cuda_fp16 = true
-# trust_remote_code = true
-
-[text_gen.models.general1]
+[text_gen.models.guard1]
 api = true
 auto_devices = true
 chat_buttons = true
 desc_act = true
 disable_exllama = true
-extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
-families = ["general", "embeddings"]
-gpu_list = ["3"]
+extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
+families = ["guard"]
+gpu_list = ["2"]
 listen = true
 loader = "llamacpp"
 memory_split = [20]
-model = "LlongOrca-13B-16K-GGUF"
+model = "LlamaGuard-7B-GGUF"
 no_inject_fused_attention = true
 no_use_cuda_fp16 = true
+openai_api = true
 trust_remote_code = true
-no_mul_mat_q = true
+no_mul_mat_q = false
 no_mmap = true
 n_gpu_layers = 128
 numa = true
@@ -52,24 +36,49 @@
 chat_buttons = true
 desc_act = true
 disable_exllama = true
-extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
-families = ["chat", "embeddings"]
+extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
+families = ["chat"]
 gpu_list = ["0"]
 listen = true
 loader = "llamacpp"
 memory_split = [20]
-model = "Chronos-Hermes-13b-v2-GGUF"
+model = "dolphin-2.6-mistral-7B-GGUF"
 no_inject_fused_attention = true
 no_use_cuda_fp16 = true
+openai_api = true
 trust_remote_code = true
-no_mul_mat_q = true
+no_mul_mat_q = false
 no_mmap = true
 n_gpu_layers = 128
 numa = true
 logits_all = true
 tensor_split = [1]
 
-[text_gen.models.guard1]
+[text_gen.models.guard3]
+api = true
+auto_devices = true
+chat_buttons = true
+desc_act = true
+disable_exllama = true
+extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
+families = ["guard"]
+gpu_list = ["2"]
+listen = true
+loader = "llamacpp"
+memory_split = [20]
+model = "LlamaGuard-7B-GGUF"
+no_inject_fused_attention = true
+no_use_cuda_fp16 = true
+openai_api = true
+trust_remote_code = true
+no_mul_mat_q = false
+no_mmap = true
+n_gpu_layers = 128
+numa = true
+logits_all = true
+tensor_split = [1]
+
+[text_gen.models.guard4]
 api = true
 auto_devices = true
 chat_buttons = true
@@ -86,35 +95,12 @@
 no_use_cuda_fp16 = true
 openai_api = true
 trust_remote_code = true
-no_mul_mat_q = true
+no_mul_mat_q = false
 no_mmap = true
 n_gpu_layers = 128
 numa = true
 logits_all = true
 tensor_split = [1]
-#[text_gen.models.censored1]
-# api = true
-# auto_devices = true
-# chat_buttons = true
-# desc_act = true
-# disable_exllama = true
-# extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
-# families = ["censored", "embeddings"]
-# gpu_list = ["1"]
-# listen = true
-# loader = "llamacpp"
-# memory_split = [20]
-# model = "WizardLM-13B-V1.2-GGUF"
-# no_inject_fused_attention = true
-# no_use_cuda_fp16 = true
-# openai_api = true
-# trust_remote_code = true
-# no_mul_mat_q = true
-# no_mmap = true
-# n_gpu_layers = 128
-# numa = true
-# logits_all = true
-# tensor_split = [1]
 
 [text_gen.models.coder1]
 api = true
@@ -124,7 +110,7 @@
 disable_exllama = true
 extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
 families = ["coder", "embeddings"]
-gpu_list = [0, 1, 3, 4]
+gpu_list = [2,3]
 listen = true
 loader = "llamacpp"
 memory_split = [20, 20]
@@ -133,13 +119,12 @@
 no_use_cuda_fp16 = true
 openai_api = true
 trust_remote_code = true
-no_mul_mat_q = true
+no_mul_mat_q = false
 no_mmap = true
 n_gpu_layers = 128
 numa = true
 logits_all = true
-tensor_split = [1,1,1,5]
+tensor_split = [1,3]
 
-
 [text_gen.models.embedding1]
 api = true
@@ -158,13 +143,135 @@
 no_use_cuda_fp16 = true
 openai_api = true
 trust_remote_code = true
-no_mul_mat_q = true
+no_mul_mat_q = false
 no_mmap = true
 n_gpu_layers = 128
 numa = true
 logits_all = true
 tensor_split = [1]
 
+#dolphin-2.5-mixtral-8x7b-GGUF
+[text_gen.models.mixtral1]
+api = true
+auto_devices = true
+chat_buttons = true
+desc_act = true
+disable_exllama = true
+extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
+families = ["mixtral", "embeddings"]
+gpu_list = [0,1]
+listen = true
+loader = "llamacpp"
+memory_split = [20, 20]
+model = "dolphin-2.5-mixtral-8x7b-GGUF"
+no_inject_fused_attention = true
+no_use_cuda_fp16 = true
+openai_api = true
+trust_remote_code = true
+no_mul_mat_q = false
+no_mmap = true
+n_gpu_layers = 128
+numa = true
+logits_all = true
+tensor_split = [2,4]
+
+[text_gen.models.tiny1]
+api = true
+auto_devices = true
+chat_buttons = true
+desc_act = true
+disable_exllama = true
+extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
+families = ["tiny", "embeddings"]
+gpu_list = [3]
+listen = true
+loader = "llamacpp"
+memory_split = [20, 20]
+model = "TinyLlama-1.1B-Chat-v1.0-GGUF"
+no_inject_fused_attention = true
+no_use_cuda_fp16 = true
+openai_api = true
+trust_remote_code = true
+no_mul_mat_q = false
+no_mmap = true
+n_gpu_layers = 128
+numa = true
+logits_all = true
+tensor_split = [1]
+
+[text_gen.models.tiny2]
+api = true
+auto_devices = true
+chat_buttons = true
+desc_act = true
+disable_exllama = true
+extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
+families = ["tiny", "embeddings"]
+gpu_list = [3]
+listen = true
+loader = "llamacpp"
+memory_split = [20, 20]
+model = "TinyLlama-1.1B-Chat-v1.0-GGUF"
+no_inject_fused_attention = true
+no_use_cuda_fp16 = true
+openai_api = true
+trust_remote_code = true
+no_mul_mat_q = false
+no_mmap = true
+n_gpu_layers = 128
+numa = true
+logits_all = true
+tensor_split = [1]
+
+[text_gen.models.phi1]
+api = true
+auto_devices = true
+chat_buttons = true
+desc_act = true
+disable_exllama = true
+extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
+families = ["phi", "embeddings"]
+gpu_list = [3]
+listen = true
+loader = "llamacpp"
+memory_split = [20, 20]
+model = "dolphin-2_6-phi-2-GGUF"
+no_inject_fused_attention = true
+no_use_cuda_fp16 = true
+openai_api = true
+trust_remote_code = true
+no_mul_mat_q = false
+no_mmap = true
+n_gpu_layers = 128
+numa = true
+logits_all = true
+tensor_split = [1]
+
+[text_gen.models.phi2]
+api = true
+auto_devices = true
+chat_buttons = true
+desc_act = true
+disable_exllama = true
+extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
+families = ["phi", "embeddings"]
+gpu_list = [0]
+listen = true
+loader = "llamacpp"
+memory_split = [20, 20]
+model = "dolphin-2_6-phi-2-GGUF"
+no_inject_fused_attention = true
+no_use_cuda_fp16 = true
+openai_api = true
+trust_remote_code = true
+no_mul_mat_q = false
+no_mmap = true
+n_gpu_layers = 128
+numa = true
+logits_all = true
+tensor_split = [1]
+
+
 [agnai]
 anonymous = false
 
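Several of the model tables above turn on text-generation-webui's OpenAI-compatible API (`openai_api = true` plus the "openai" extension), which is what downstream consumers such as lobe-chat talk to. As a rough illustration only, here is a minimal Perl sketch of a chat-completion request against such an endpoint; the base URL and the model string are placeholders, not values taken from this commit.

#!/usr/bin/env perl
use strict;
use warnings;
use HTTP::Tiny;
use JSON::PP qw(encode_json decode_json);

# Placeholder: point this at wherever a text-gen container's
# OpenAI-compatible API port is actually exposed.
my $base = "http://localhost:5000/v1";

my $res = HTTP::Tiny->new->post("$base/chat/completions", {
    headers => { "Content-Type" => "application/json" },
    content => encode_json({
        model    => "dolphin-2.6-mistral-7B-GGUF",   # illustrative; the server answers with whatever model it has loaded
        messages => [ { role => "user", content => "Say hello." } ],
    }),
});
die "request failed: $res->{status} $res->{reason}" unless $res->{success};

# Print the assistant's reply from the first choice.
my $reply = decode_json($res->{content});
print $reply->{choices}[0]{message}{content}, "\n";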
docker-compose template (file name not shown in the captured diff)

@@ -2,7 +2,7 @@ version: "3.9"
 services:
 <% for my ($name, $gen_config) ($config->text_gen->models->%*) { =%>
   text-<%= $name %>:
-    image: gitea.simcop2387.info/simcop2387/text-gen-python-base:<%= $gen_config->image_tag %>
+    image: <%= $gen_config->image_name %>:<%= $gen_config->image_tag %>
     restart: unless-stopped
     environment:
       CONTAINER_PORT: 7860
@@ -35,6 +35,19 @@ services:
              device_ids: ['<%= join(',', $gen_config->gpu_list->@*) %>']
              capabilities: [gpu]
 <% } =%>
+
+<% for my ($name, $lobe_config) ($config->lobe_chat->%*) { =%>
+  lobe-<%= $name %>:
+    image: <%= $lobe_config->image_name %>:<%= $lobe_config->image_tag %>
+    restart: unless-stopped
+    environment: # TODO This needs the proxy really, and config in the thing
+      - OPENAI_API_KEY=11111111
+      - OPENAI_PROXY_URL=http://openai.mixtral1-model.brainiac.ai.simcop2387.info/v1
+      - ACCESS_CODE=12345
+    ports:
+      - "<%= $lobe_config->lobe_host_port %>:3210"
+<% } =%>
+
   nginx:
     image: nginx:latest
     restart: unless-stopped
@@ -42,3 +55,4 @@ services:
       - 80:80
     volumes:
       - ./nginx/nginx.conf:/etc/nginx/conf.d/default.conf:ro
+
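The new loop reads per-instance settings from a `lobe_chat` table in config.toml, with one sub-table per instance exposing at least the accessors the template calls (`image_name`, `image_tag`, `lobe_host_port`). That table is not part of this diff, so the block below is only a sketch of the assumed shape; the instance name and every value are placeholders.

# Assumed shape only -- not part of this commit.
[lobe_chat.main]
image_name = "lobehub/lobe-chat"   # placeholder image reference
image_tag = "latest"               # placeholder tag
lobe_host_port = 3210              # host port published to the container's 3210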
Perl configuration module defining class AIConfig (file name not shown in the captured diff)

@@ -182,7 +182,9 @@ class AIConfig :does(Object::PadX::Role::AutoMarshal) :Struct {
   method from_file :common ($file) {
     my $file_p = path($file);
 
-    my $hr = from_toml($file_p->slurp_utf8);
+    my ($hr, $toml_error) = from_toml($file_p->slurp_utf8);
 
+    croak $toml_error if $toml_error;
+
     my ($conf, $error) = AIConfig->new($hr->%*);
 
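For reference, here is a self-contained sketch of the parse-error handling pattern the new lines adopt, assuming a `from_toml` that returns a `(data, error)` pair in list context (as the classic TOML CPAN module does); the file name and the lookup at the end are illustrative only.

use strict;
use warnings;
use Carp qw(croak);
use Path::Tiny qw(path);
use TOML qw(from_toml);   # assumed parser; returns (data, error) in list context

# Read and parse the config, dying with the parser's message instead of
# continuing with an undefined hashref when the TOML is malformed.
my ($hr, $toml_error) = from_toml(path("config.toml")->slurp_utf8);
croak "config.toml failed to parse: $toml_error" if $toml_error;

# On success $hr is a plain hashref mirroring the TOML structure.
print "loaded ", scalar keys %{ $hr->{text_gen}{models} }, " model definitions\n";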