Adding lobe chat support

Ryan Voots 2024-01-22 10:59:37 -05:00
parent d385272ab6
commit ea8b7fccbe
4 changed files with 183 additions and 60 deletions

View file

@@ -37,4 +37,4 @@ for my $file (path("files")->children(qr/\.tmpl/)) {
$output_file->spew_utf8($data);
}
data_print($conf->text_gen->models->{general1}->gen_cli($conf))
#data_print($conf->text_gen->models->{general1}->gen_cli($conf))
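
Only the tail of the render loop survives in this hunk. For context, a minimal sketch of what the whole pass plausibly looks like, assuming Mojo::Template (the <% ... =%> tags in the compose template below point that way), an output path that just drops the .tmpl suffix, and a guessed config filename:

use v5.36;
use Path::Tiny qw(path);
use Mojo::Template;
use AIConfig;

my $conf = AIConfig->from_file("config.toml");   # filename assumed
my $mt   = Mojo::Template->new(vars => 1);       # pass named variables to the templates

for my $file (path("files")->children(qr/\.tmpl/)) {
    # Render files/*.tmpl with $config bound to the loaded AIConfig object.
    my $data        = $mt->render_file("$file", { config => $conf });
    my $output_file = path($file =~ s/\.tmpl$//r);   # output location assumed
    $output_file->spew_utf8($data);
}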

View file

@@ -6,40 +6,24 @@
# Todo mounts?
model_dir = "/app/models/TheBloke" # should this be automatic? part of the top level?
#[text_gen.models.highcontext]
# api = true
# auto_devices = true
# chat_buttons = true
# desc_act = true
# disable_exllama = true
# extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
# families = ["highcontext"]
# gpu_list = []
# listen = true
# loader = "llamacpp"
# memory_split = [0]
# model = "Wizard-Vicuna-13B-Uncensored-SuperHOT-8K-GPTQ"
# no_inject_fused_attention = true
# no_use_cuda_fp16 = true
# trust_remote_code = true
[text_gen.models.general1]
[text_gen.models.guard1]
api = true
auto_devices = true
chat_buttons = true
desc_act = true
disable_exllama = true
extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
families = ["general", "embeddings"]
gpu_list = ["3"]
extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
families = ["guard"]
gpu_list = ["2"]
listen = true
loader = "llamacpp"
memory_split = [20]
model = "LlongOrca-13B-16K-GGUF"
model = "LlamaGuard-7B-GGUF"
no_inject_fused_attention = true
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
@@ -52,24 +36,49 @@
chat_buttons = true
desc_act = true
disable_exllama = true
extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
families = ["chat", "embeddings"]
extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
families = ["chat"]
gpu_list = ["0"]
listen = true
loader = "llamacpp"
memory_split = [20]
model = "Chronos-Hermes-13b-v2-GGUF"
model = "dolphin-2.6-mistral-7B-GGUF"
no_inject_fused_attention = true
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [1]
[text_gen.models.guard1]
[text_gen.models.guard3]
api = true
auto_devices = true
chat_buttons = true
desc_act = true
disable_exllama = true
extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
families = ["guard"]
gpu_list = ["2"]
listen = true
loader = "llamacpp"
memory_split = [20]
model = "LlamaGuard-7B-GGUF"
no_inject_fused_attention = true
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [1]
[text_gen.models.guard4]
api = true
auto_devices = true
chat_buttons = true
@@ -86,35 +95,12 @@
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [1]
#[text_gen.models.censored1]
# api = true
# auto_devices = true
# chat_buttons = true
# desc_act = true
# disable_exllama = true
# extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
# families = ["censored", "embeddings"]
# gpu_list = ["1"]
# listen = true
# loader = "llamacpp"
# memory_split = [20]
# model = "WizardLM-13B-V1.2-GGUF"
# no_inject_fused_attention = true
# no_use_cuda_fp16 = true
# openai_api = true
# trust_remote_code = true
# no_mul_mat_q = true
# no_mmap = true
# n_gpu_layers = 128
# numa = true
# logits_all = true
# tensor_split = [1]
[text_gen.models.coder1]
api = true
@@ -124,7 +110,7 @@
disable_exllama = true
extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
families = ["coder", "embeddings"]
gpu_list = [0, 1, 3, 4]
gpu_list = [2,3]
listen = true
loader = "llamacpp"
memory_split = [20, 20]
@@ -133,13 +119,12 @@
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [1,1,1,5]
tensor_split = [1,3]
[text_gen.models.embedding1]
api = true
@@ -158,13 +143,135 @@
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [1]
#dolphin-2.5-mixtral-8x7b-GGUF
[text_gen.models.mixtral1]
api = true
auto_devices = true
chat_buttons = true
desc_act = true
disable_exllama = true
extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
families = ["mixtral", "embeddings"]
gpu_list = [0,1]
listen = true
loader = "llamacpp"
memory_split = [20, 20]
model = "dolphin-2.5-mixtral-8x7b-GGUF"
no_inject_fused_attention = true
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [2,4]
[text_gen.models.tiny1]
api = true
auto_devices = true
chat_buttons = true
desc_act = true
disable_exllama = true
extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
families = ["tiny", "embeddings"]
gpu_list = [3]
listen = true
loader = "llamacpp"
memory_split = [20, 20]
model = "TinyLlama-1.1B-Chat-v1.0-GGUF"
no_inject_fused_attention = true
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [1]
[text_gen.models.tiny2]
api = true
auto_devices = true
chat_buttons = true
desc_act = true
disable_exllama = true
extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
families = ["tiny", "embeddings"]
gpu_list = [3]
listen = true
loader = "llamacpp"
memory_split = [20, 20]
model = "TinyLlama-1.1B-Chat-v1.0-GGUF"
no_inject_fused_attention = true
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [1]
[text_gen.models.phi1]
api = true
auto_devices = true
chat_buttons = true
desc_act = true
disable_exllama = true
extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
families = ["phi", "embeddings"]
gpu_list = [3]
listen = true
loader = "llamacpp"
memory_split = [20, 20]
model = "dolphin-2_6-phi-2-GGUF"
no_inject_fused_attention = true
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [1]
[text_gen.models.phi2]
api = true
auto_devices = true
chat_buttons = true
desc_act = true
disable_exllama = true
extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
families = ["phi", "embeddings"]
gpu_list = [0]
listen = true
loader = "llamacpp"
memory_split = [20, 20]
model = "dolphin-2_6-phi-2-GGUF"
no_inject_fused_attention = true
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [1]
[agnai]
anonymous = false
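
Most of the model entries above differ only in GPU placement, so a worked reading of the new mixtral1 block may help. The interpretation assumes llama.cpp semantics (tensor_split values are proportions, and an n_gpu_layers larger than the model's layer count simply offloads everything); the values themselves are copied from the diff:

[text_gen.models.mixtral1]
gpu_list = [0,1]       # the container is handed GPUs 0 and 1
tensor_split = [2,4]   # proportions: roughly 1/3 of the layers on GPU 0, 2/3 on GPU 1
n_gpu_layers = 128     # more layers than the model has, i.e. offload all of them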

View file

@@ -2,7 +2,7 @@ version: "3.9"
services:
<% for my ($name, $gen_config) ($config->text_gen->models->%*) { =%>
text-<%= $name %>:
image: gitea.simcop2387.info/simcop2387/text-gen-python-base:<%= $gen_config->image_tag %>
image: <%= $gen_config->image_name %>:<%= $gen_config->image_tag %>
restart: unless-stopped
environment:
CONTAINER_PORT: 7860
@@ -35,6 +35,19 @@ services:
device_ids: ['<%= join(',', $gen_config->gpu_list->@*) %>']
capabilities: [gpu]
<% } =%>
<% for my ($name, $lobe_config) ($config->lobe_chat->%*) { =%>
lobe-<%= $name %>:
image: <%= $lobe_config->image_name %>:<%= $lobe_config->image_tag %>
restart: unless-stopped
environment: # TODO: this really needs to go through the proxy, and these values should come from the config
- OPENAI_API_KEY=11111111
- OPENAI_PROXY_URL=http://openai.mixtral1-model.brainiac.ai.simcop2387.info/v1
- ACCESS_CODE=12345
ports:
- "<%= $lobe_config->lobe_host_port %>:3210"
<% } =%>
nginx:
image: nginx:latest
restart: unless-stopped
@@ -42,3 +55,4 @@ services:
- 80:80
volumes:
- ./nginx/nginx.conf:/etc/nginx/conf.d/default.conf:ro
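
The new lobe-<name> loop only reads three keys from each lobe_chat entry: image_name, image_tag, and lobe_host_port. The config diff above does not show a matching [lobe_chat.*] table, so the sketch below is a guess; the section name, image, and port value are assumptions, only the key names come from the template:

[lobe_chat.main]
image_name = "lobehub/lobe-chat"   # assumed upstream image
image_tag = "latest"
lobe_host_port = 3210              # host side of the "<lobe_host_port>:3210" mapping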

View file

@@ -182,7 +182,9 @@ class AIConfig :does(Object::PadX::Role::AutoMarshal) :Struct {
method from_file :common ($file) {
my $file_p = path($file);
my $hr = from_toml($file_p->slurp_utf8);
my ($hr, $toml_error) = from_toml($file_p->slurp_utf8);
croak $toml_error if $toml_error;
my ($conf, $error) = AIConfig->new($hr->%*);
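
The two added lines switch from_toml to list context so a parse failure surfaces immediately instead of handing undef to AIConfig->new. This matches the TOML module's documented interface (data plus error message in list context); a small illustration with a deliberately broken string:

use TOML qw(from_toml);
use Carp qw(croak);

# "model_dir =" is not valid TOML, so $hr comes back undef and $toml_error is set.
my ($hr, $toml_error) = from_toml("model_dir = \n");
croak "config parse failed: $toml_error" if $toml_error;

# Without the check, the old code passed undef straight to AIConfig->new($hr->%*),
# which dies with "Can't use an undefined value as a HASH reference" rather than
# pointing at the actual TOML problem.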