Adding lobe chat support
parent d385272ab6
commit ea8b7fccbe
4 changed files with 183 additions and 60 deletions
build.pl (2 changes)
@@ -37,4 +37,4 @@ for my $file (path("files")->children(qr/\.tmpl/)) {
$output_file->spew_utf8($data);
}

data_print($conf->text_gen->models->{general1}->gen_cli($conf))
#data_print($conf->text_gen->models->{general1}->gen_cli($conf))
config.toml (221 changes)
@@ -6,40 +6,24 @@
# Todo mounts?
model_dir = "/app/models/TheBloke" # should this be automatic? part of the top level?

#[text_gen.models.highcontext]
# api = true
# auto_devices = true
# chat_buttons = true
# desc_act = true
# disable_exllama = true
# extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
# families = ["highcontext"]
# gpu_list = []
# listen = true
# loader = "llamacpp"
# memory_split = [0]
# model = "Wizard-Vicuna-13B-Uncensored-SuperHOT-8K-GPTQ"
# no_inject_fused_attention = true
# no_use_cuda_fp16 = true
# trust_remote_code = true

[text_gen.models.general1]
[text_gen.models.guard1]
api = true
auto_devices = true
chat_buttons = true
desc_act = true
disable_exllama = true
extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
families = ["general", "embeddings"]
gpu_list = ["3"]
extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
families = ["guard"]
gpu_list = ["2"]
listen = true
loader = "llamacpp"
memory_split = [20]
model = "LlongOrca-13B-16K-GGUF"
model = "LlamaGuard-7B-GGUF"
no_inject_fused_attention = true
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
@@ -52,24 +36,49 @@
chat_buttons = true
desc_act = true
disable_exllama = true
extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
families = ["chat", "embeddings"]
extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
families = ["chat"]
gpu_list = ["0"]
listen = true
loader = "llamacpp"
memory_split = [20]
model = "Chronos-Hermes-13b-v2-GGUF"
model = "dolphin-2.6-mistral-7B-GGUF"
no_inject_fused_attention = true
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [1]

[text_gen.models.guard1]
[text_gen.models.guard3]
api = true
auto_devices = true
chat_buttons = true
desc_act = true
disable_exllama = true
extensions = ["api", "openai"] # TODO this way? or flags for each named extension in each one?
families = ["guard"]
gpu_list = ["2"]
listen = true
loader = "llamacpp"
memory_split = [20]
model = "LlamaGuard-7B-GGUF"
no_inject_fused_attention = true
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [1]

[text_gen.models.guard4]
api = true
auto_devices = true
chat_buttons = true
@@ -86,35 +95,12 @@
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [1]
#[text_gen.models.censored1]
# api = true
# auto_devices = true
# chat_buttons = true
# desc_act = true
# disable_exllama = true
# extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
# families = ["censored", "embeddings"]
# gpu_list = ["1"]
# listen = true
# loader = "llamacpp"
# memory_split = [20]
# model = "WizardLM-13B-V1.2-GGUF"
# no_inject_fused_attention = true
# no_use_cuda_fp16 = true
# openai_api = true
# trust_remote_code = true
# no_mul_mat_q = true
# no_mmap = true
# n_gpu_layers = 128
# numa = true
# logits_all = true
# tensor_split = [1]

[text_gen.models.coder1]
api = true
@@ -124,7 +110,7 @@
disable_exllama = true
extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
families = ["coder", "embeddings"]
gpu_list = [0, 1, 3, 4]
gpu_list = [2,3]
listen = true
loader = "llamacpp"
memory_split = [20, 20]
@@ -133,13 +119,12 @@
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [1,1,1,5]

tensor_split = [1,3]

[text_gen.models.embedding1]
api = true
@@ -158,13 +143,135 @@
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [1]

#dolphin-2.5-mixtral-8x7b-GGUF
[text_gen.models.mixtral1]
api = true
auto_devices = true
chat_buttons = true
desc_act = true
disable_exllama = true
extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
families = ["mixtral", "embeddings"]
gpu_list = [0,1]
listen = true
loader = "llamacpp"
memory_split = [20, 20]
model = "dolphin-2.5-mixtral-8x7b-GGUF"
no_inject_fused_attention = true
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [2,4]

[text_gen.models.tiny1]
api = true
auto_devices = true
chat_buttons = true
desc_act = true
disable_exllama = true
extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
families = ["tiny", "embeddings"]
gpu_list = [3]
listen = true
loader = "llamacpp"
memory_split = [20, 20]
model = "TinyLlama-1.1B-Chat-v1.0-GGUF"
no_inject_fused_attention = true
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [1]

[text_gen.models.tiny2]
api = true
auto_devices = true
chat_buttons = true
desc_act = true
disable_exllama = true
extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
families = ["tiny", "embeddings"]
gpu_list = [3]
listen = true
loader = "llamacpp"
memory_split = [20, 20]
model = "TinyLlama-1.1B-Chat-v1.0-GGUF"
no_inject_fused_attention = true
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [1]

[text_gen.models.phi1]
api = true
auto_devices = true
chat_buttons = true
desc_act = true
disable_exllama = true
extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
families = ["phi", "embeddings"]
gpu_list = [3]
listen = true
loader = "llamacpp"
memory_split = [20, 20]
model = "dolphin-2_6-phi-2-GGUF"
no_inject_fused_attention = true
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [1]

[text_gen.models.phi2]
api = true
auto_devices = true
chat_buttons = true
desc_act = true
disable_exllama = true
extensions = ["api", "openai", "superboogav2"] # TODO this way? or flags for each named extension in each one?
families = ["phi", "embeddings"]
gpu_list = [0]
listen = true
loader = "llamacpp"
memory_split = [20, 20]
model = "dolphin-2_6-phi-2-GGUF"
no_inject_fused_attention = true
no_use_cuda_fp16 = true
openai_api = true
trust_remote_code = true
no_mul_mat_q = false
no_mmap = true
n_gpu_layers = 128
numa = true
logits_all = true
tensor_split = [1]


[agnai]
anonymous = false

@@ -2,7 +2,7 @@ version: "3.9"
services:
<% for my ($name, $gen_config) ($config->text_gen->models->%*) { =%>
text-<%= $name %>:
image: gitea.simcop2387.info/simcop2387/text-gen-python-base:<%= $gen_config->image_tag %>
image: <%= $gen_config->image_name %>:<%= $gen_config->image_tag %>
restart: unless-stopped
environment:
CONTAINER_PORT: 7860
@@ -35,6 +35,19 @@ services:
device_ids: ['<%= join(',', $gen_config->gpu_list->@*) %>']
capabilities: [gpu]
<% } =%>

<% for my ($name, $lobe_config) ($config->lobe_chat->%*) { =%>
lobe-<%= $name %>:
image: <%= $lobe_config->image_name %>:<%= $lobe_config->image_tag %>
restart: unless-stopped
environment: # TODO This needs the proxy really, and config in the thing
- OPENAI_API_KEY=11111111
- OPENAI_PROXY_URL=http://openai.mixtral1-model.brainiac.ai.simcop2387.info/v1
- ACCESS_CODE=12345
ports:
- "<%= $lobe_config->lobe_host_port %>:3210"
<% } =%>

nginx:
image: nginx:latest
restart: unless-stopped
@@ -42,3 +55,4 @@ services:
- 80:80
volumes:
- ./nginx/nginx.conf:/etc/nginx/conf.d/default.conf:ro

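Note: the lobe-chat service block above reads image_name, image_tag, and lobe_host_port from each entry under $config->lobe_chat, but the matching config.toml table is not visible in this diff. A minimal sketch of what such an entry could look like; the table name "main" and all values below are assumptions for illustration only:

[lobe_chat.main]
image_name = "lobehub/lobe-chat"   # assumed image; not confirmed by this diff
image_tag = "latest"               # assumed tag
lobe_host_port = 3210              # host port mapped to the container's 3210

The key names mirror the accessors the template calls ($lobe_config->image_name, ->image_tag, ->lobe_host_port); everything else is a guess.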
@@ -182,7 +182,9 @@ class AIConfig :does(Object::PadX::Role::AutoMarshal) :Struct {
method from_file :common ($file) {
my $file_p = path($file);

my $hr = from_toml($file_p->slurp_utf8);
my ($hr, $toml_error) = from_toml($file_p->slurp_utf8);

croak $toml_error if $toml_error;

my ($conf, $error) = AIConfig->new($hr->%*);

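The change above calls from_toml in list context so a parse error is reported instead of silently yielding an undefined config. A minimal standalone sketch of the same pattern, assuming the classic TOML module (the module actually imported by this file is not shown in the diff):

use strict;
use warnings;
use Carp qw(croak);
use Path::Tiny qw(path);
use TOML qw(from_toml);

# In list context, from_toml returns the parsed data plus an error string (empty on success).
my ($hr, $toml_error) = from_toml(path('config.toml')->slurp_utf8);
croak $toml_error if $toml_error;

# $hr is now a plain hashref of the TOML document.
printf "loaded %d text_gen models\n", scalar keys %{ $hr->{text_gen}{models} // {} };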