Local changes for ml-host

2023-06-26 21:37:00 -04:00 · 2023-06-26 21:37:00 -04:00 · 65b8ef44c8
commit 65b8ef44c8
parent 2661c9899a
3 changed files with 127 additions and 0 deletions
--- a/.env
+++ b/.env
@ -0,0 +1,27 @@
+TORCH_CUDA_ARCH_LIST=8.6
+
+# arguments passed to server.py when the container starts (--gpu-memory 20 caps GPU memory at 20 GiB)
+# NOTE(review): the earlier "roughly 4.5GB of vram" remark does not fit these args; also confirm --wbits 4 plus --load-in-8bit is intended
+CLI_ARGS=--model airoboros-13B-gpt4-1.2-GPTQ --wbits 4 --listen --auto-devices --extensions api --gpu-memory 20 --load-in-8bit
+# the following examples have been tested with the files linked in docs/README_docker.md:
+# example running 13b with 4bit/128 groupsize        : CLI_ARGS=--model llama-13b-4bit-128g --wbits 4 --listen --groupsize 128 --pre_layer 25
+# example with loading api extension and public share: CLI_ARGS=--model llama-7b-4bit --wbits 4 --listen --auto-devices --no-stream --extensions api --share
+# example running 7b with 8bit loading               : CLI_ARGS=--model llama-7b --load-in-8bit --listen --auto-devices
+
+# the port the webui binds to on the host
+HOST_PORT=7860
+# the port the webui binds to inside the container
+CONTAINER_PORT=7860
+
+# the port the api binds to on the host
+HOST_API_PORT=5000
+# the port the api binds to inside the container
+CONTAINER_API_PORT=5000
+
+# the port the api stream endpoint binds to on the host
+HOST_API_STREAM_PORT=5005
+# the port the api stream endpoint binds to inside the container
+CONTAINER_API_STREAM_PORT=5005
+
+# the version used to install text-generation-webui from
+WEBUI_VERSION=HEAD
--- a/68
+++ b/68
@ -0,0 +1,68 @@
+# Build stage: compiles the GPTQ-for-LLaMa CUDA extension into a wheel inside /build.
+FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 AS builder
+
+RUN apt-get update && \
+    apt-get install --no-install-recommends -y build-essential git python3-dev python3-venv && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN git clone https://github.com/oobabooga/GPTQ-for-LLaMa /build
+WORKDIR /build
+
+# The venv lives outside /build so that copying /build out of this stage does not carry a full torch install along.
+RUN python3 -m venv /opt/venv && \
+    . /opt/venv/bin/activate && \
+    pip3 install --no-cache-dir --upgrade pip setuptools wheel && \
+    pip3 install --no-cache-dir torch torchvision torchaudio && \
+    pip3 install --no-cache-dir -r requirements.txt
+
+# compute capabilities to compile for, see https://developer.nvidia.com/cuda-gpus (for a rtx 2060: ARG TORCH_CUDA_ARCH_LIST="7.5")
+ARG TORCH_CUDA_ARCH_LIST="3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX"
+RUN . /opt/venv/bin/activate && \
+    python3 setup_cuda.py bdist_wheel -d .
+
+# Runtime stage: CUDA runtime image with the webui venv plus the GPTQ-for-LLaMa wheel built in the builder stage.
+FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04
+
+LABEL maintainer="Your Name <your.email@example.com>"
+LABEL description="Docker image for GPTQ-for-LLaMa and Text Generation WebUI"
+
+RUN apt-get update && \
+    apt-get install --no-install-recommends -y python3-dev libportaudio2 libasound-dev git python3 python3-pip make g++ && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN --mount=type=cache,target=/root/.cache/pip pip3 install virtualenv
+WORKDIR /app
+
+# NOTE(review): /app is not a git checkout at this point, so the reset cannot succeed and the echo branch always runs.
+ARG WEBUI_VERSION
+RUN test -n "${WEBUI_VERSION}" && git reset --hard ${WEBUI_VERSION} || echo "Using provided webui source"
+
+# Every pip step uses the cache mount so downloaded wheels (torch is large) are not stored in the image layers.
+RUN --mount=type=cache,target=/root/.cache/pip virtualenv /app/venv && . /app/venv/bin/activate && \
+    pip3 install --upgrade pip setuptools wheel && \
+    pip3 install torch torchvision torchaudio
+
+COPY --from=builder /build /app/repositories/GPTQ-for-LLaMa
+RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && pip3 install /app/repositories/GPTQ-for-LLaMa/*.whl
+
+COPY extensions/api/requirements.txt /app/extensions/api/requirements.txt
+COPY extensions/elevenlabs_tts/requirements.txt /app/extensions/elevenlabs_tts/requirements.txt
+COPY extensions/google_translate/requirements.txt /app/extensions/google_translate/requirements.txt
+COPY extensions/silero_tts/requirements.txt /app/extensions/silero_tts/requirements.txt
+COPY extensions/whisper_stt/requirements.txt /app/extensions/whisper_stt/requirements.txt
+RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && pip3 install -r extensions/api/requirements.txt
+RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && pip3 install -r extensions/elevenlabs_tts/requirements.txt
+RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && pip3 install -r extensions/google_translate/requirements.txt
+RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && pip3 install -r extensions/silero_tts/requirements.txt
+RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && pip3 install -r extensions/whisper_stt/requirements.txt
+
+COPY requirements.txt /app/requirements.txt
+RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && pip3 install -r requirements.txt
+
+# Overwrites bitsandbytes' CPU library with its CUDA 11.8 build. NOTE(review): presumably so the GPU code is loaded even when the CPU binary gets selected — confirm.
+RUN cp /app/venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so /app/venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so
+
+COPY . /app/
+ENV CLI_ARGS=""
+# Shell form keeps ${CLI_ARGS} word-split into arguments; exec lets python3 replace the shell so it receives container signals directly.
+CMD . /app/venv/bin/activate && exec python3 server.py ${CLI_ARGS}
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -0,0 +1,32 @@
+version: "3.3"
+services:
+  text-generation-webui:
+    build:
+      context: .
+      args:
+        # interpolated from the environment/.env; TORCH_CUDA_ARCH_LIST is the GPU compute capability: https://developer.nvidia.com/cuda-gpus
+        - TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
+        - WEBUI_VERSION=${WEBUI_VERSION}
+    env_file:
+      - .env
+    # host:container pairs for the web UI, the API and the API stream endpoint
+    ports:
+      - "${HOST_PORT}:${CONTAINER_PORT}"
+      - "${HOST_API_PORT}:${CONTAINER_API_PORT}"
+      - "${HOST_API_STREAM_PORT}:${CONTAINER_API_STREAM_PORT}"
+    stdin_open: true
+    tty: true
+    volumes:
+      - ./characters:/app/characters
+      - ./extensions:/app/extensions
+      - ./loras:/app/loras
+      - /mnt/ml_models/text-gen/:/app/models
+      - ./presets:/app/presets
+      - ./prompts:/app/prompts
+      - ./softprompts:/app/softprompts
+      - ./training:/app/training
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - {driver: nvidia, device_ids: ["0"], capabilities: [gpu]}