Adding lobe chat support

Ryan Voots 2024-01-22 10:59:37 -05:00
parent d385272ab6
commit ea8b7fccbe
4 changed files with 183 additions and 60 deletions

View file

@@ -37,4 +37,4 @@ for my $file (path("files")->children(qr/\.tmpl/)) {
$output_file->spew_utf8($data);
}
data_print($conf->text_gen->models->{general1}->gen_cli($conf))
#data_print($conf->text_gen->models->{general1}->gen_cli($conf))
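
Only the tail of the render loop survives in this hunk. For context, a minimal sketch of what the whole pass plausibly looks like, assuming Mojo::Template (the <% ... =%> tags in the compose template below point that way), an output path that just drops the .tmpl suffix, and a guessed config filename:

use v5.36;
use Path::Tiny qw(path);
use Mojo::Template;
use AIConfig;

my $conf = AIConfig->from_file("config.toml");   # filename assumed
my $mt   = Mojo::Template->new(vars => 1);       # pass named variables to the templates

for my $file (path("files")->children(qr/\.tmpl/)) {
    # Render files/*.tmpl with $config bound to the loaded AIConfig object.
    my $data        = $mt->render_file("$file", { config => $conf });
    my $output_file = path($file =~ s/\.tmpl$//r);   # output location assumed
    $output_file->spew_utf8($data);
}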

View file

@@ -6,40 +6,24 @@
# Todo mounts?
model_dir = "/app/models/TheBloke" # should this be automatic? part of the top level?
#[text_gen.models.highcontext]
# api = true
# auto_devices = true
# chat_buttons = true
# desc_act = true
# disable_exllama = true
# extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
# families = ["highcontext"]
# gpu_list = []
# listen = true
# loader = "llamacpp"
# memory_split = [0]
# model = "Wizard-Vicuna-13B-Uncensored-SuperHOT-8K-GPTQ"
# no_inject_fused_attention = true
# no_use_cuda_fp16 = true
# trust_remote_code = true
[text_gen.models.general1]
[text_gen.models.guard1]
api = true
auto_devices = true
chat_buttons = true
desc_act = true
disable_exllama = true
extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
families = ["general", "embeddings"]
gpu_list = ["3"]
extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
families = ["guard"]
gpu_list = ["2"]
listen = true
loader = "llamacpp"
memory_split = [20]
model = "LlongOrca-13B-16K-GGUF"
model = "LlamaGuard-7B-GGUF"
no_inject_fused_attention = true
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
@@ -52,24 +36,49 @@
chat_buttons = true
desc_act = true
disable_exllama = true
extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
families = ["chat", "embeddings"]
extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
families = ["chat"]
gpu_list = ["0"]
listen = true
loader = "llamacpp"
memory_split = [20]
model = "Chronos-Hermes-13b-v2-GGUF"
model = "dolphin-2.6-mistral-7B-GGUF"
no_inject_fused_attention = true
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [1]
[text_gen.models.guard1]
[text_gen.models.guard3]
api = true
auto_devices = true
chat_buttons = true
desc_act = true
disable_exllama = true
extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
families = ["guard"]
gpu_list = ["2"]
listen = true
loader = "llamacpp"
memory_split = [20]
model = "LlamaGuard-7B-GGUF"
no_inject_fused_attention = true
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [1]
[text_gen.models.guard4]
api = true
auto_devices = true
chat_buttons = true
@@ -86,35 +95,12 @@
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [1]
#[text_gen.models.censored1]
# api = true
# auto_devices = true
# chat_buttons = true
# desc_act = true
# disable_exllama = true
# extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
# families = ["censored", "embeddings"]
# gpu_list = ["1"]
# listen = true
# loader = "llamacpp"
# memory_split = [20]
# model = "WizardLM-13B-V1.2-GGUF"
# no_inject_fused_attention = true
# no_use_cuda_fp16 = true
# openai_api = true
# trust_remote_code = true
# no_mul_mat_q = true
# no_mmap = true
# n_gpu_layers = 128
# numa = true
# logits_all = true
# tensor_split = [1]
[text_gen.models.coder1]
api = true
@@ -124,7 +110,7 @@
disable_exllama = true
extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
families = ["coder", "embeddings"]
gpu_list = [0, 1, 3, 4]
gpu_list = [2,3]
listen = true
loader = "llamacpp"
memory_split = [20, 20]
@@ -133,13 +119,12 @@
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [1,1,1,5]
tensor_split = [1,3]
[text_gen.models.embedding1]
api = true
@@ -158,13 +143,135 @@
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [1]
#dolphin-2.5-mixtral-8x7b-GGUF
[text_gen.models.mixtral1]
api = true
auto_devices = true
chat_buttons = true
desc_act = true
disable_exllama = true
extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
families = ["mixtral", "embeddings"]
gpu_list = [0,1]
listen = true
loader = "llamacpp"
memory_split = [20, 20]
model = "dolphin-2.5-mixtral-8x7b-GGUF"
no_inject_fused_attention = true
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [2,4]
[text_gen.models.tiny1]
api = true
auto_devices = true
chat_buttons = true
desc_act = true
disable_exllama = true
extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
families = ["tiny", "embeddings"]
gpu_list = [3]
listen = true
loader = "llamacpp"
memory_split = [20, 20]
model = "TinyLlama-1.1B-Chat-v1.0-GGUF"
no_inject_fused_attention = true
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [1]
[text_gen.models.tiny2]
api = true
auto_devices = true
chat_buttons = true
desc_act = true
disable_exllama = true
extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
families = ["tiny", "embeddings"]
gpu_list = [3]
listen = true
loader = "llamacpp"
memory_split = [20, 20]
model = "TinyLlama-1.1B-Chat-v1.0-GGUF"
no_inject_fused_attention = true
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [1]
[text_gen.models.phi1]
api = true
auto_devices = true
chat_buttons = true
desc_act = true
disable_exllama = true
extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
families = ["phi", "embeddings"]
gpu_list = [3]
listen = true
loader = "llamacpp"
memory_split = [20, 20]
model = "dolphin-2_6-phi-2-GGUF"
no_inject_fused_attention = true
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [1]
[text_gen.models.phi2]
api = true
auto_devices = true
chat_buttons = true
desc_act = true
disable_exllama = true
extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
families = ["phi", "embeddings"]
gpu_list = [0]
listen = true
loader = "llamacpp"
memory_split = [20, 20]
model = "dolphin-2_6-phi-2-GGUF"
no_inject_fused_attention = true
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [1]
[agnai]
anonymous = false
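
Most of the model entries above differ only in GPU placement, so a worked reading of the new mixtral1 block may help. The interpretation assumes llama.cpp semantics (tensor_split values are proportions, and an n_gpu_layers larger than the model's layer count simply offloads everything); the values themselves are copied from the diff:

[text_gen.models.mixtral1]
gpu_list = [0,1]       # the container is handed GPUs 0 and 1
tensor_split = [2,4]   # proportions: roughly 1/3 of the layers on GPU 0, 2/3 on GPU 1
n_gpu_layers = 128     # more layers than the model has, i.e. offload all of them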

View file

@@ -2,7 +2,7 @@ version: "3.9"
services:
<% for my ($name, $gen_config) ($config->text_gen->models->%*) { =%>
text-<%= $name %>:
image: gitea.simcop2387.info/simcop2387/text-gen-python-base:<%= $gen_config->image_tag %>
image: <%= $gen_config->image_name %>:<%= $gen_config->image_tag %>
restart: unless-stopped
environment:
CONTAINER_PORT: 7860
@@ -35,6 +35,19 @@ services:
device_ids: ['<%= join(',', $gen_config->gpu_list->@*) %>']
capabilities: [gpu]
<% } =%>
<% for my ($name, $lobe_config) ($config->lobe_chat->%*) { =%>
lobe-<%= $name %>:
image: <%= $lobe_config->image_name %>:<%= $lobe_config->image_tag %>
restart: unless-stopped
environment: # TODO: this really needs to go through the proxy, and these values should come from the config
- OPENAI_API_KEY=11111111
- OPENAI_PROXY_URL=http://openai.mixtral1-model.brainiac.ai.simcop2387.info/v1
- ACCESS_CODE=12345
ports:
- "<%= $lobe_config->lobe_host_port %>:3210"
<% } =%>
nginx:
image: nginx:latest
restart: unless-stopped
@@ -42,3 +55,4 @@ services:
- 80:80
volumes:
- ./nginx/nginx.conf:/etc/nginx/conf.d/default.conf:ro
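
The new lobe-<name> loop only reads three keys from each lobe_chat entry: image_name, image_tag, and lobe_host_port. The config diff above does not show a matching [lobe_chat.*] table, so the sketch below is a guess; the section name, image, and port value are assumptions, only the key names come from the template:

[lobe_chat.main]
image_name = "lobehub/lobe-chat"   # assumed upstream image
image_tag = "latest"
lobe_host_port = 3210              # host side of the "<lobe_host_port>:3210" mapping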

View file

@@ -182,7 +182,9 @@ class AIConfig :does(Object::PadX::Role::AutoMarshal) :Struct {
method from_file :common ($file) {
my $file_p = path($file);
my $hr = from_toml($file_p->slurp_utf8);
my ($hr, $toml_error) = from_toml($file_p->slurp_utf8);
croak $toml_error if $toml_error;
my ($conf, $error) = AIConfig->new($hr->%*);
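
The two added lines switch from_toml to list context so a parse failure surfaces immediately instead of handing undef to AIConfig->new. This matches the TOML module's documented interface (data plus error message in list context); a small illustration with a deliberately broken string:

use TOML qw(from_toml);
use Carp qw(croak);

# "model_dir =" is not valid TOML, so $hr comes back undef and $toml_error is set.
my ($hr, $toml_error) = from_toml("model_dir = \n");
croak "config parse failed: $toml_error" if $toml_error;

# Without the check, the old code passed undef straight to AIConfig->new($hr->%*),
# which dies with "Can't use an undefined value as a HASH reference" rather than
# pointing at the actual TOML problem.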