Mirror of https://github.com/JarodMica/ai-voice-cloning.git (synced 2025-06-07 14:15:57 -04:00)

Merge pull request #81 from Fmstrat/linux-and-docker: Add Linux support via Docker
Commit 24892dc2cc
15 changed files with 174 additions and 45 deletions
@@ -2,3 +2,4 @@
 /training
 /voices
 /bin
+Dockerfile
.gitignore (vendored): 2 changes
@@ -149,3 +149,5 @@ dmypy.json
 .custom/*
 results/*
 debug_states/*
+bin/*
Dockerfile: 74 changes
@@ -1,18 +1,36 @@
-FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
+FROM nvidia/cuda:12.2.0-devel-ubuntu22.04

 ARG DEBIAN_FRONTEND=noninteractive
 ARG TZ=UTC
 ARG MINICONDA_VERSION=23.1.0-1
-ARG PYTHON_VERSION=3.9.13
+ARG PYTHON_VERSION=3.11
+ARG UID=1000
+ARG GID=1000

 # TZ
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

 # Prereqs
 RUN apt-get update
-RUN apt install -y curl wget git ffmpeg
-RUN adduser --disabled-password --gecos '' --shell /bin/bash user
+RUN apt-get install -y \
+    curl \
+    wget \
+    git \
+    ffmpeg \
+    p7zip-full \
+    gcc \
+    g++ \
+    vim
+
+# User
+RUN groupadd --gid $GID user
+RUN useradd --no-log-init --create-home --shell /bin/bash --uid $UID --gid $GID user
 USER user
 ENV HOME=/home/user
 WORKDIR $HOME
 RUN mkdir $HOME/.cache $HOME/.config && chmod -R 777 $HOME

 # Python
 RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py39_$MINICONDA_VERSION-Linux-x86_64.sh
 RUN chmod +x Miniconda3-py39_$MINICONDA_VERSION-Linux-x86_64.sh
 RUN ./Miniconda3-py39_$MINICONDA_VERSION-Linux-x86_64.sh -b -p /home/user/miniconda

@@ -20,18 +38,60 @@ ENV PATH="$HOME/miniconda/bin:$PATH"
 RUN conda init
 RUN conda install python=$PYTHON_VERSION
 RUN python3 -m pip install --upgrade pip
-RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

 # Base path
 RUN mkdir $HOME/ai-voice-cloning
 WORKDIR $HOME/ai-voice-cloning
-COPY --chown=user:user modules modules

 # Built in modules
+COPY --chown=user:user modules modules
 RUN python3 -m pip install -r ./modules/tortoise-tts/requirements.txt
 RUN python3 -m pip install -e ./modules/tortoise-tts/
 RUN python3 -m pip install -r ./modules/dlas/requirements.txt
 RUN python3 -m pip install -e ./modules/dlas/

+# RVC
+RUN \
+    curl -L -o /tmp/rvc.zip https://huggingface.co/Jmica/rvc/resolve/main/rvc_lightweight.zip?download=true &&\
+    7z x /tmp/rvc.zip &&\
+    rm -f /tmp/rvc.zip
+USER root
+RUN \
+    chown user:user rvc -R &&\
+    chmod -R u+rwX,go+rX,go-w rvc
+USER user
+RUN python3 -m pip install -r ./rvc/requirements.txt
+
+# Fairseq
+# Using patched version for Python 3.11 due to https://github.com/facebookresearch/fairseq/issues/5012
+RUN python3 -m pip install git+https://github.com/liyaodev/fairseq
+
+# RVC Pipeline
+RUN python3 -m pip install git+https://github.com/JarodMica/rvc-tts-pipeline.git@lightweight#egg=rvc_tts_pipe
+
+# Deepspeed
+RUN python3 -m pip install deepspeed
+
+# PyFastMP3Decoder
+RUN python3 -m pip install cython
+RUN git clone https://github.com/neonbjb/pyfastmp3decoder.git
+RUN \
+    cd pyfastmp3decoder &&\
+    git submodule update --init --recursive &&\
+    python setup.py install &&\
+    cd ..
+
+# WhisperX
+RUN python3 -m pip install git+https://github.com/m-bain/whisperx.git
+
 # Main requirements
 ADD requirements.txt requirements.txt
 RUN python3 -m pip install -r ./requirements.txt

 # The app
 ADD --chown=user:user . $HOME/ai-voice-cloning

-CMD ["python", "./src/main.py", "--listen", "0.0.0.0:7680"]
+ENV IN_DOCKER=true
+
+CMD ["./start.sh"]
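Since both the base image and the PyTorch wheels move to CUDA 12.x in this Dockerfile, a quick sanity check after building is to confirm that torch inside the container actually sees the GPU. A minimal sketch, not part of the PR: the `check_gpu.py` filename and the `docker run` invocation in the comment are illustrative, while the `ai-voice-cloning` tag and `--gpus all` flag follow the scripts later in this diff.

```
# check_gpu.py - sanity check for the CUDA 12.x PyTorch install inside the image.
# Run with something like (image name and flags mirror setup-docker.sh / start-docker.sh):
#   docker run --rm --gpus all ai-voice-cloning python3 check_gpu.py
import torch

print("torch version:", torch.__version__)
print("built against CUDA:", torch.version.cuda)   # expect 12.1 for the cu121 wheels
print("GPU visible:", torch.cuda.is_available())   # False usually means --gpus all was missing
if torch.cuda.is_available():
    print("device:", torch.cuda.get_device_name(0))
```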
README.md: 72 changes
@@ -14,21 +14,20 @@ That being said, some enhancements added compared to the original repo:

 This is a fork of the repo originally located here: https://git.ecker.tech/mrq/ai-voice-cloning. All of the work that was put into it to incorporate training with DLAS and inference with Tortoise belongs to mrq, the author of the original ai-voice-cloning repo.

-## Setup (Windows + Nvidia)
-This repo only works on **Windows with NVIDIA GPUs**. I don't have any plans on making it compatible with other systems, but it shouldn't be too difficult to port if you have experience in coding or are an expert level ChatGPT user. If you do successfully do this and want to share, pull requests are always welcome!
-> **Tips for developers:** setup-cuda.bat should have everything that you need for the packages to be installed. All of the different requirements files make it quite a mess in the script, but each repo has its requirements installed, and then at the end, the requirements.txt in the root is needed to change the version *back* to compatible versions for this repo.
+## Setup
+This repo works on **Windows with NVIDIA GPUs** and **Linux running Docker with NVIDIA GPUs**.

-### Package Installation (Recommended)
-Install 7zip on your computer: https://www.7-zip.org/
+### Windows Package (Recommended)
+1. Optional, but recommended: Install 7zip on your computer: https://www.7-zip.org/
+    - If you run into any extraction issues, most likely it's due to your 7zip being out-of-date OR you're using a different extractor.
+2. Head over to the releases tab and download the latest package on Hugging Face: https://github.com/JarodMica/ai-voice-cloning/releases/tag/v3.0
+3. Extract the 7zip archive.
+4. Open up ai-voice-cloning and then run ```start.bat```

-1. Head over to the releases tab and download the latest package on Hugging Face: https://github.com/JarodMica/ai-voice-cloning/releases/tag/v2.0
-2. Extract the 7zip archive.
-3. Open up ai-voice-cloning and then run ```start.bat```
-#### Alternative Manual Installation
+### Manual Installation
 If you are installing this manually, you will need:
-- Python 3.9: https://www.python.org/downloads/release/python-3913/
+- Python 3.11: https://www.python.org/downloads/release/python-311/
 - Git: https://www.git-scm.com/downloads

 1. Clone the repository
@@ -36,15 +35,45 @@
    git clone https://github.com/JarodMica/ai-voice-cloning.git
    ```
 2. Run the ```setup-cuda.bat``` file and it will start running through all of the python packages needed
-    - If you don't have python 3.9, it won't work and you'll need to go download it
+    - If you don't have python 3.11, it won't work and you'll need to go download it
 3. After it finishes, run ```start.bat``` and this will start downloading most of the models you'll need.
     - Some models are downloaded when you first use them. You'll incur additional downloads during generation and when training (for whisper). However, once they are finished, you won't ever have to download them again as long as you don't delete them. They are located in the ```models``` folder of the root.
 4. **(Optional)** You can opt to install whisperx for training by running ```setup-whipserx.bat```
     - Check out the whisperx github page for more details, but it's much faster for longer audio files. If you're processing one-by-one with an already split dataset, it doesn't improve speeds that much.

+### Docker for Linux (or WSL2)
+
+#### Linux Specific Setup
+1. Make sure the latest nvidia drivers are installed: `sudo ubuntu-drivers install`
+2. Install Docker your preferred way
+
+#### Windows Specific Setup
+> Make sure your Nvidia drivers are up to date: https://www.nvidia.com/download/index.aspx
+1. Install WSL2 in PowerShell with `wsl --install` and restart
+2. Open PowerShell, type and enter ```ubuntu```. It should now load you into WSL2
+3. Remove the original nvidia cache key: `sudo apt-key del 7fa2af80`
+4. Download the CUDA toolkit keyring: `wget https://developer.download.nvidia.com/compute/cuda/repos/wsl-ubuntu/x86_64/cuda-keyring_1.1-1_all.deb`
+5. Install the keyring: `sudo dpkg -i cuda-keyring_1.1-1_all.deb`
+6. Update the package list: `sudo apt-get update`
+7. Install the CUDA toolkit: `sudo apt-get -y install cuda-toolkit-12-4`
+8. Install Docker Desktop using WSL2 as the backend
+9. Restart
+10. If you wish to monitor the terminal remotely via SSH, follow [this guide](https://www.hanselman.com/blog/how-to-ssh-into-wsl2-on-windows-10-from-an-external-machine).
+11. Open PowerShell, type ```ubuntu```, [then follow below](#building-and-running-in-docker)
+
+#### Building and Running in Docker
+1. Open a terminal (or Ubuntu WSL)
+2. Clone the repository: `git clone https://github.com/JarodMica/ai-voice-cloning.git && cd ai-voice-cloning`
+3. Build the image with `./setup-docker.sh`
+4. Start the container with `./start-docker.sh`
+5. Visit `http://localhost:7860` or remotely with `http://<ip>:7860`

 ## Instructions
 Check out the YouTube video:
-Watch First: https://www.youtube.com/watch?v=p31Ax_A5VKA&t=158s
+Watch First: https://youtu.be/WWhNqJEmF9M?si=RhUZhYersAvSZ4wf

 Watch Second (RVC update): https://www.youtube.com/watch?v=7tpWH8_S8es&t=504s

 Everything is pretty much the same as before if you've used this repository in the past, however, there is a new option to convert text output using ```rvc```. Before you can use it, you will need a **trained** RVC .pth file that you get from RVC or online, and then you will need to place it in ```models/rvc_models/```. Both .index and .pth files can be placed in here and they'll show up correctly in their respective dropdown menus.
@@ -57,12 +86,12 @@ You will now have access to parameters you could adjust in RVC for the RVC voice
 ## Updating Your Installation
 Below is how you can update the package for the latest updates

-### Package
+### Windows
 >**NOTE:** If there are major feature changes, check the latest release to see if ```update_package.bat``` will work. If NOT, you will need to re-download and re-extract the package from Hugging Face.
-1. Run the update_package.bat file
+1. Run the `update_package.bat` file
     - It will clone the repo and will copy the src folder from the repo to the package.

-### Manual Installation
+#### Alternative Manual Installation
 You should be able to navigate into the folder and then pull the repo to update it.
 ```
 cd ai-voice-cloning

@@ -70,6 +99,14 @@ git pull
 ```
 If there are large features added, you may need to delete the venv and then re-run the setup-cuda script to make sure there are no package issues

+### Linux via Docker
+You should be able to navigate into the folder, pull the repo to update it, and then rebuild your Docker image.
+```
+cd ai-voice-cloning
+git pull
+./setup-docker.sh
+```

 ## Documentation

 ### Troubleshooting Manual Installation

@@ -82,8 +119,9 @@ pip uninstall torch
 pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
 ```

 Other documentation tbd

 ## Bug Reporting

 If you run into any problems, please open up a new issue on the issues tab.

+## Tips for developers
+`setup-cuda.bat` should have everything that you need for the packages to be installed. All of the different requirements files make it quite a mess in the script, but each repo has its requirements installed, and then at the end, the `requirements.txt` in the root is needed to change the version *back* to compatible versions for this repo.
setup-cuda.sh: 0 changes (Normal file → Executable file)

setup-docker.sh: 17 changes (Normal file → Executable file)
@@ -1,4 +1,15 @@
 #!/bin/bash
-git submodule init
-git submodule update --remote
-docker build -t ai-voice-cloning .
+
+function main() {
+    if [ ! -f modules/tortoise-tts/README.md ]; then
+        git submodule init
+        git submodule update --remote
+    fi
+    docker build \
+        --build-arg UID=$(id -u) \
+        --build-arg GID=$(id -g) \
+        -t ai-voice-cloning \
+        .
+}
+
+main
setup-rocm-bnb.sh: 0 changes (Normal file → Executable file)
setup-rocm.sh: 0 changes (Normal file → Executable file)
@@ -43,6 +43,8 @@ from tortoise.api_fast import TextToSpeech as Toroise_TTS_Hifi
 from tortoise.utils.audio import load_audio, load_voice, load_voices, get_voice_dir, get_voices
 from tortoise.utils.text import split_and_recombine_text
 from tortoise.utils.device import get_device_name, set_device_name, get_device_count, get_device_vram, get_device_batch_size, do_gc
+# TODO: The below import blocks any CLI parameters.
+# Try running with --low-vram
 from rvc_pipe.rvc_infer import rvc_convert

 MODELS['dvae.pth'] = "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/3704aea61678e7e468a06d8eea121dba368a798e/.models/dvae.pth"

@@ -1181,6 +1183,7 @@ def generate_tortoise(**kwargs):
     model_hash = settings["model_hash"][:8] if settings is not None and "model_hash" in settings else tts.autoregressive_model_hash[:8]

     dir = f'{get_voice_dir()}/{voice}/'
+    # TODO: Use of model_hash here causes issues in development as new hashes are added to the repo.
     latents_path = f'{dir}/cond_latents_{model_hash}.pth'

     if voice == "random" or voice == "microphone":
@@ -3485,7 +3488,7 @@ def setup_args(cli=False):
     try:
         match = re.findall(r"^(?:(.+?):(\d+))?(\/.*?)?$", args.listen)[0]

-        args.listen_host = match[0] if match[0] != "" else "127.0.0.1"
+        args.listen_host = match[0] if match[0] != "" else "0.0.0.0"
         args.listen_port = match[1] if match[1] != "" else None
         args.listen_path = match[2] if match[2] != "" else "/"
     except Exception as e:
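For reference, the regex above is what splits the `--listen` value into host, port, and path; the only change here is that an omitted host now falls back to `0.0.0.0` instead of `127.0.0.1`, so the web UI is reachable through the port published by `start-docker.sh`. A standalone sketch of that parsing (not the repo's code, just the same pattern and defaults):

```
# Sketch of how the --listen value is split; pattern and fallbacks match the diff above.
import re

LISTEN_RE = r"^(?:(.+?):(\d+))?(\/.*?)?$"

def parse_listen(listen: str):
    host, port, path = re.findall(LISTEN_RE, listen)[0]
    return (
        host if host != "" else "0.0.0.0",  # was 127.0.0.1 before this change
        port if port != "" else None,
        path if path != "" else "/",
    )

print(parse_listen("0.0.0.0:7860"))  # ('0.0.0.0', '7860', '/')
print(parse_listen("/subpath"))      # ('0.0.0.0', None, '/subpath')
```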
@@ -3789,6 +3792,8 @@ def unload_tts():
     do_gc()

 def reload_tts():
+    in_docker = os.environ.get("IN_DOCKER", "false")
+    if in_docker == "false":
         subprocess.Popen(["start.bat"])
     with open("reload_flag.txt", "w") as f:
         f.write("reload")
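This is where the `ENV IN_DOCKER=true` set in the Dockerfile comes into play: outside Docker the function spawns a fresh `start.bat`, while inside the container it only writes `reload_flag.txt`, presumably letting the restart loop added to `start.sh` (its diff appears further down) bring the app back up. A small self-contained sketch of that pattern; the `request_restart` name is illustrative, not the repo's.

```
import os
import subprocess

def request_restart() -> None:
    """Illustrative restart request mirroring the reload_tts() branch above."""
    in_docker = os.environ.get("IN_DOCKER", "false")
    if in_docker == "false":
        # Native Windows install: spawn a new start.bat so a fresh process takes over.
        subprocess.Popen(["start.bat"])
    # In Docker, no new process is spawned here; the `while` loop in the new
    # start.sh restarts main.py after it exits, and the flag file signals the reload.
    with open("reload_flag.txt", "w") as f:
        f.write("reload")
```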
start-docker.sh: 34 changes (Normal file → Executable file)
@@ -1,14 +1,24 @@
 #!/bin/bash
-CMD="python3 ./src/main.py $@"
-# CMD="bash"
-CPATH="/home/user/ai-voice-cloning"
-docker run --rm --gpus all \
-    --mount "type=bind,src=$PWD/models,dst=$CPATH/models" \
-    --mount "type=bind,src=$PWD/training,dst=$CPATH/training" \
-    --mount "type=bind,src=$PWD/voices,dst=$CPATH/voices" \
-    --mount "type=bind,src=$PWD/bin,dst=$CPATH/bin" \
-    --workdir $CPATH \
-    --user "$(id -u):$(id -g)" \
-    --net host \
-    -it ai-voice-cloning $CMD
+
+docker run \
+    -ti \
+    --rm \
+    --gpus all \
+    --name ai-voice-cloning \
+    -v "${PWD}/models:/home/user/ai-voice-cloning/models" \
+    -v "${PWD}/training:/home/user/ai-voice-cloning/training" \
+    -v "${PWD}/voices:/home/user/ai-voice-cloning/voices" \
+    -v "${PWD}/bin:/home/user/ai-voice-cloning/bin" \
+    -v "${PWD}/config:/home/user/ai-voice-cloning/config" \
+    --user "$(id -u):$(id -g)" \
+    -p "7860:7860" \
+    ai-voice-cloning $@
+
+# For dev:
+# -v "${PWD}/src:/home/user/ai-voice-cloning/src" \
+# -v "${PWD}/modules/tortoise_dataset_tools/dataset_whisper_tools:/home/user/ai-voice-cloning/modules/tortoise_dataset_tools/dataset_whisper_tools" \
+# -v "${PWD}/modules/dlas/dlas:/home/user/ai-voice-cloning/modules/dlas/dlas" \
+# -v "/home/user/ai-voice-cloning/src/__pycache__" \
+
+# For testing:
+# -e "PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True" \
start.sh: 8 changes (Normal file → Executable file)
@@ -1,5 +1,7 @@
 #!/bin/bash
-ulimit -Sn `ulimit -Hn` # ROCm is a bitch
-source ./venv/bin/activate
-python3 ./src/main.py "$@"
-deactivate
+while [ true ]; do
+    python3 ./src/main.py "$@"
+    echo "Press Cntrl-C to quit or application will restart... (5s)"
+    sleep 5
+done
train-docker.sh: 0 changes (Normal file → Executable file)
train.sh: 0 changes (Normal file → Executable file)
update-force.sh: 0 changes (Normal file → Executable file)
update.sh: 0 changes (Normal file → Executable file)