fixes to setup-cuda and auto-manual install

Jarod Mica 2024-03-29 00:39:29 -07:00
parent 84c0c7c36e
commit 15b27a9e14
10 changed files with 959 additions and 25 deletions

4
.gitignore vendored

@@ -1,7 +1,6 @@
# ignores user files
/venv/
/venv_cpu/
/models/*
/training/*
/config/*
output/
@@ -13,6 +12,7 @@ pyfastmp3decoder/
.vscode/
onnx_test_stuff/
*.whl
*.exe
# Byte-compiled / optimized / DLL files
__pycache__/
@@ -145,7 +145,7 @@ dmypy.json
.pyre/
.idea/*
.models/*
.custom/*
results/*
debug_states/*

34
download_ffmpeg.bat Normal file

@@ -0,0 +1,34 @@
@echo off
set ffmpeg_url=https://www.gyan.dev/ffmpeg/builds/ffmpeg-git-essentials.7z?download=true
set ffmpeg_folder=ffmpeg
set ffmpeg_zip=ffmpeg.7z
curl -o 7z.exe "https://www.7-zip.org/a/7zr.exe"
if not exist "%ffmpeg_folder%" (
if not exist "%ffmpeg_zip%" (
echo Downloading %ffmpeg_zip%...
curl -L -o "%ffmpeg_zip%" "%ffmpeg_url%"
if errorlevel 1 (
echo Download failed. Please check your internet connection or the URL and try again.
exit /b 1
)
) else (
echo File %ffmpeg_zip% already exists, skipping download.
)
echo Extracting %ffmpeg_zip%...
7z.exe x %ffmpeg_zip% -o%ffmpeg_folder%
echo FFmpeg has finished downloading and extracting.
) else (
echo FFmpeg folder %ffmpeg_folder% already exists, skipping download and extraction.
)
:: Move ffmpeg.exe and ffprobe.exe to the ffmpeg folder root
for /D %%i in ("%ffmpeg_folder%\*") do (
if exist "%%i\bin\ffmpeg.exe" move "%%i\bin\ffmpeg.exe" "ffmpeg.exe"
if exist "%%i\bin\ffprobe.exe" move "%%i\bin\ffprobe.exe" "ffprobe.exe"
)
echo FFmpeg moved out of downloaded folder
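
The script leaves ffmpeg.exe and ffprobe.exe in the repository root. A minimal Python sanity check, assuming it is run from that same root:

import subprocess

# Both tools print a version banner and exit 0 when healthy.
for exe in ("ffmpeg.exe", "ffprobe.exe"):
    result = subprocess.run([exe, "-version"], capture_output=True, text=True)
    print(exe, "OK" if result.returncode == 0 else "FAILED")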

5
models/.gitignore vendored

@@ -1,3 +1,6 @@
*
!tokenizers/
!.gitignore
!rvc_models/
!.template.dlas.yaml
!tortoise/

139
models/.template.dlas.yaml Normal file

@@ -0,0 +1,139 @@
name: '${voice}'
model: extensibletrainer
scale: 1
gpu_ids: [0] # Manually edit this if the GPU you want to train on is not your primary, as this will set the env var that exposes CUDA devices
start_step: 0
checkpointing_enabled: true
fp16: ${half_p}
bitsandbytes: ${bitsandbytes}
gpus: ${gpus}

datasets:
  train:
    name: training
    n_workers: ${workers}
    batch_size: ${batch_size}
    mode: paired_voice_audio
    path: ${dataset_path}
    fetcher_mode: ['lj']
    phase: train
    max_wav_length: 255995 # ~11.6 seconds
    max_text_length: 200
    sample_rate: 22050
    load_conditioning: True
    num_conditioning_candidates: 2
    conditioning_length: 44000
    use_bpe_tokenizer: True
    tokenizer_vocab: ${tokenizer_json} # ./models/tortoise/bpe_lowercase_asr_256.json
    load_aligned_codes: False
  val:
    name: validation
    n_workers: ${workers}
    batch_size: ${validation_batch_size}
    mode: paired_voice_audio
    path: ${validation_path}
    fetcher_mode: ['lj']
    phase: val
    max_wav_length: 255995
    max_text_length: 200
    sample_rate: 22050
    load_conditioning: True
    num_conditioning_candidates: 2
    conditioning_length: 44000
    use_bpe_tokenizer: True
    tokenizer_vocab: ${tokenizer_json} # ./models/tortoise/bpe_lowercase_asr_256.json
    load_aligned_codes: False

steps:
  gpt_train:
    training: gpt
    loss_log_buffer: 500

    # Generally follows the recipe from the DALLE paper.
    optimizer: ${optimizer} # this should be adamw_zero if you're using distributed training
    optimizer_params:
      lr: !!float ${learning_rate} # originally: 1e-4
      weight_decay: !!float 1e-2
      beta1: 0.9
      beta2: 0.96
    clip_grad_eps: 4

    injectors:
      paired_to_mel:
        type: torch_mel_spectrogram
        mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth # ./models/tortoise/clips_mel_norms.pth
        in: wav
        out: paired_mel
      paired_cond_to_mel:
        type: for_each
        subtype: torch_mel_spectrogram
        mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth # ./models/tortoise/clips_mel_norms.pth
        in: conditioning
        out: paired_conditioning_mel
      to_codes:
        type: discrete_token
        in: paired_mel
        out: paired_mel_codes
        dvae_config: "./models/tortoise/train_diffusion_vocoder_22k_level.yml"
      paired_fwd_text:
        type: generator
        generator: gpt
        in: [paired_conditioning_mel, padded_text, text_lengths, paired_mel_codes, wav_lengths]
        out: [loss_text_ce, loss_mel_ce, logits]

    losses:
      text_ce:
        type: direct
        weight: ${text_lr_weight}
        key: loss_text_ce
      mel_ce:
        type: direct
        weight: ${mel_lr_weight}
        key: loss_mel_ce

networks:
  gpt:
    type: generator
    which_model_G: unified_voice2
    kwargs:
      layers: 30 # originally: 8
      model_dim: 1024 # originally: 512
      heads: 16 # originally: 8
      max_text_tokens: 402 # originally: 120
      max_mel_tokens: 604 # originally: 250
      max_conditioning_inputs: 2 # originally: 1
      mel_length_compression: 1024
      number_text_tokens: 256 # supposed to be 255 for newer unified_voice files
      number_mel_codes: 8194
      start_mel_token: 8192
      stop_mel_token: 8193
      start_text_token: 255
      train_solo_embeddings: False # missing in uv3/4
      use_mel_codes_as_input: True # ditto
      checkpointing: True
      tortoise_compat: True
      # freeze_everything_but_position_embeddings: True

path:
  strict_load: true
  ${source_model}
  ${resume_state}

train:
  niter: ${iterations}
  warmup_iter: -1
  mega_batch_factor: ${gradient_accumulation_size}
  val_freq: ${validation_rate}
  ema_enabled: false # I really don't think EMA matters
  ${learning_rate_scheme}

eval:
  pure: ${validation_enabled}
  output_state: gen

logger:
  save_checkpoint_freq: ${save_rate}
  visuals: [gen, mel]
  visual_debug_rate: ${save_rate}
  is_mel_spectrogram: true
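
The webui expands the ${...} placeholders above into a concrete per-voice config. A minimal sketch of that expansion, assuming plain string substitution (the mechanism actually used by the webui is not shown in this diff, and the settings values are illustrative):

from pathlib import Path

# Illustrative values; one entry per ${placeholder} in the template.
settings = {
    "voice": "example_voice",
    "half_p": "true",
    "batch_size": "64",
    "learning_rate": "1e-5",
    # ...fill the remaining placeholders the same way
}

yaml_text = Path("models/.template.dlas.yaml").read_text(encoding="utf-8")
for key, value in settings.items():
    yaml_text = yaml_text.replace("${" + key + "}", value)
Path("training/example_voice/train.yaml").write_text(yaml_text, encoding="utf-8")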

144
models/tokenizers/ipa.json Normal file

@@ -0,0 +1,144 @@
{
"version": "1.0",
"truncation": null,
"padding": null,
"normalizer": null,
"pre_tokenizer": null,
"post_processor": null,
"decoder": null,
"added_tokens": [
{
"id": 0,
"special": true,
"content": "[STOP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false
},
{
"id": 1,
"special": true,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false
},
{
"id": 2,
"special": true,
"content": "[SPACE]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false
}
],
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "[UNK]",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"vocab": {
"[STOP]": 0,
"[UNK]": 1,
"[SPACE]": 2,
"!": 3,
"'": 4,
"(": 5,
")": 6,
",": 7,
"-": 8,
".": 9,
"/": 10,
":": 11,
";": 12,
"?": 13,
"a": 14,
"b": 15,
"c": 16,
"d": 17,
"e": 18,
"f": 19,
"g": 20,
"h": 21,
"i": 22,
"j": 23,
"k": 24,
"l": 25,
"m": 26,
"n": 27,
"o": 28,
"p": 29,
"q": 30,
"r": 31,
"s": 32,
"t": 33,
"u": 34,
"v": 35,
"w": 36,
"x": 37,
"y": 38,
"z": 39,
"d͡": 40,
"t͡": 41,
"|": 42,
"æ": 43,
"ð": 44,
"ŋ": 45,
"ɑ": 46,
"ɔ": 47,
"ə": 48,
"ɚ": 49,
"ɛ": 50,
"ɡ": 51,
"ɪ": 52,
"ɹ": 53,
"ʃ": 54,
"ʊ": 55,
"ʌ": 56,
"ʒ": 57,
"θ": 58,
"ɐ": 59,
"ɜ": 60,
"ᵻ": 61,
"ɾ": 62,
"n\u0329": 63,
"ː": 64,
"ˈ": 65,
"ˌ": 66,
"ʔ": 67,
"d͡ʒ": 68,
"aɪ": 69,
"aʊ": 70,
"eɪ": 71,
"oʊ": 72,
"t͡ʃ": 73,
"ɔɪ": 74,
"ɔː": 75,
"uː": 76,
"iː": 77,
"ɑː": 78,
"oː": 79,
"ɜː": 80
},
"merges": [
"a ɪ",
"e ɪ",
ɪ",
"a ʊ",
"o ʊ",
"d͡ ʒ",
"t͡ ʃ",
"i ː",
"o ː",
"u ː",
"ɑ ː",
ː",
ː"
]
}
}
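
The file can be loaded with the Hugging Face tokenizers library; a short sketch, with an assumed IPA input string:

from tokenizers import Tokenizer

tok = Tokenizer.from_file("models/tokenizers/ipa.json")
encoding = tok.encode("həˈloʊ")  # assumed example input, IPA for "hello"
print(encoding.tokens)  # merged symbols such as "oʊ"
print(encoding.ids)     # ids from the vocab above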


@@ -0,0 +1,583 @@
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[STOP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[SPACE]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "0",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "1",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 5,
"content": "2",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 6,
"content": "3",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 7,
"content": "4",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 8,
"content": "5",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 9,
"content": "6",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 10,
"content": "7",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 11,
"content": "8",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 12,
"content": "9",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "Whitespace"
},
"post_processor": null,
"decoder": null,
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "[UNK]",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"vocab": {
"[STOP]": 0,
"[UNK]": 1,
"[SPACE]": 2,
"0": 3,
"1": 4,
"2": 5,
"3": 6,
"4": 7,
"5": 8,
"6": 9,
"7": 10,
"8": 11,
"9": 12,
"!": 13,
"\"": 14,
"%": 15,
"'": 16,
",": 17,
"-": 18,
".": 19,
"?": 20,
"a": 21,
"b": 22,
"c": 23,
"d": 24,
"e": 25,
"f": 26,
"g": 27,
"h": 28,
"i": 29,
"j": 30,
"k": 31,
"l": 32,
"m": 33,
"n": 34,
"o": 35,
"p": 36,
"q": 37,
"r": 38,
"s": 39,
"t": 40,
"u": 41,
"v": 42,
"w": 43,
"x": 44,
"y": 45,
"z": 46,
"¡": 47,
"ª": 48,
"°": 49,
"º": 50,
"»": 51,
"¿": 52,
"à": 53,
"á": 54,
"â": 55,
"å": 56,
"ç": 57,
"è": 58,
"é": 59,
"ê": 60,
"ë": 61,
"í": 62,
"ñ": 63,
"ó": 64,
"ô": 65,
"ö": 66,
"ú": 67,
"û": 68,
"ü": 69,
"œ": 70,
"—": 71,
"…": 72,
"de": 73,
"en": 74,
"os": 75,
"es": 76,
"la": 77,
"er": 78,
"qu": 79,
"an": 80,
"ar": 81,
"on": 82,
"as": 83,
"or": 84,
"que": 85,
"el": 86,
"do": 87,
"al": 88,
"ci": 89,
"re": 90,
"in": 91,
"un": 92,
"ab": 93,
"to": 94,
"te": 95,
"se": 96,
"di": 97,
"tr": 98,
"con": 99,
"ad": 100,
"su": 101,
"los": 102,
"le": 103,
"ta": 104,
"co": 105,
"ti": 106,
"mi": 107,
"no": 108,
"lo": 109,
"cu": 110,
"ía": 111,
"me": 112,
"ri": 113,
"por": 114,
"vi": 115,
"si": 116,
"ch": 117,
"ca": 118,
"mo": 119,
"ra": 120,
"par": 121,
"las": 122,
"ro": 123,
"th": 124,
"da": 125,
"pu": 126,
"per": 127,
"ón": 128,
"ent": 129,
"des": 130,
"li": 131,
"so": 132,
"ma": 133,
"est": 134,
"del": 135,
"aba": 136,
"gu": 137,
"men": 138,
"mp": 139,
"is": 140,
"mb": 141,
"ha": 142,
"mu": 143,
"tu": 144,
"una": 145,
"era": 146,
"hab": 147,
"go": 148,
"res": 149,
"dos": 150,
"man": 151,
"ás": 152,
"para": 153,
"po": 154,
"ve": 155,
"tra": 156,
"pa": 157,
"vo": 158,
"mos": 159,
"the": 160,
"ten": 161,
"ando": 162,
"pi": 163,
"fu": 164,
"ada": 165,
"jo": 166,
"ce": 167,
"ver": 168,
"bi": 169,
"qui": 170,
"como": 171,
"tan": 172,
"us": 173,
"más": 174,
"pe": 175,
"dad": 176,
"ción": 177,
"ho": 178,
"hi": 179,
"car": 180,
"ter": 181,
"án": 182,
"cer": 183,
"cia": 184,
"sus": 185,
"cas": 186,
"he": 187,
"ado": 188,
"fi": 189,
"yo": 190,
"gr": 191,
"mente": 192,
"ba": 193,
"sa": 194,
"ni": 195,
"ser": 196,
"tro": 197,
"cor": 198,
"mar": 199,
"pro": 200,
"tar": 201,
"lla": 202,
"lu": 203,
"za": 204,
"ces": 205,
"les": 206,
"sin": 207,
"ex": 208,
"pero": 209,
"and": 210,
"mis": 211,
"ven": 212,
"cho": 213,
"tes": 214,
"ya": 215,
"of": 216,
"cos": 217,
"ia": 218,
"du": 219,
"lan": 220,
"señ": 221,
"esta": 222,
"ga": 223,
"va": 224,
"ir": 225,
"ing": 226,
"endo": 227,
"bre": 228,
"min": 229,
"aque": 230,
"na": 231,
"lle": 232,
"bu": 233,
"pas": 234,
"tos": 235,
"io": 236,
"mas": 237,
"for": 238,
"emp": 239,
"ente": 240,
"és": 241,
"había": 242,
"ora": 243,
"ían": 244,
"ones": 245,
"je": 246,
"tor": 247,
"cab": 248,
"pre": 249,
"all": 250,
"nu": 251,
"én": 252,
"eron": 253,
"él": 254,
"ero": 255
},
"merges": [
"d e",
"e n",
"o s",
"e s",
"l a",
"e r",
"q u",
"a n",
"a r",
"o n",
"a s",
"o r",
"qu e",
"e l",
"d o",
"a l",
"c i",
"r e",
"i n",
"u n",
"a b",
"t o",
"t e",
"s e",
"d i",
"t r",
"c on",
"a d",
"s u",
"l os",
"l e",
"t a",
"c o",
"t i",
"m i",
"n o",
"l o",
"c u",
"í a",
"m e",
"r i",
"p or",
"v i",
"s i",
"c h",
"c a",
"m o",
"r a",
"p ar",
"la s",
"r o",
"t h",
"d a",
"p u",
"p er",
"ó n",
"en t",
"de s",
"l i",
"s o",
"m a",
"es t",
"de l",
"ab a",
"g u",
"m en",
"m p",
"i s",
"m b",
"h a",
"m u",
"t u",
"un a",
"er a",
"h ab",
"g o",
"r es",
"d os",
"m an",
"á s",
"par a",
"p o",
"v e",
"tr a",
"p a",
"v o",
"m os",
"th e",
"t en",
"an do",
"p i",
"f u",
"ad a",
"j o",
"c e",
"v er",
"b i",
"qu i",
"co mo",
"t an",
"u s",
"m ás",
"p e",
"d ad",
"ci ón",
"h o",
"h i",
"c ar",
"t er",
"á n",
"c er",
"ci a",
"su s",
"c as",
"h e",
"a do",
"f i",
"y o",
"g r",
"men te",
"b a",
"s a",
"n i",
"s er",
"tr o",
"c or",
"m ar",
"p ro",
"t ar",
"l la",
"l u",
"z a",
"c es",
"l es",
"s in",
"e x",
"per o",
"an d",
"mi s",
"v en",
"ch o",
"t es",
"y a",
"o f",
"c os",
"i a",
"d u",
"la n",
"se ñ",
"es ta",
"g a",
"v a",
"i r",
"in g",
"en do",
"b re",
"m in",
"a que",
"n a",
"l le",
"b u",
"p as",
"t os",
"i o",
"m as",
"f or",
"e mp",
"en te",
"é s",
"hab ía",
"or a",
"í an",
"on es",
"j e",
"t or",
"c ab",
"p re",
"al l",
"n u",
"é n",
"er on",
"é l",
"er o"
],
"language": "es"
}
}
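
One consistency property of these BPE files is that joining the two halves of each merge must yield an existing vocab token. A small sketch that checks it, run against ipa.json since the Spanish tokenizer's path is not shown in this diff:

import json

def check_merges(path):
    with open(path, encoding="utf-8") as f:
        model = json.load(f)["model"]
    for merge in model["merges"]:
        left, right = merge.split(" ", 1)
        assert left + right in model["vocab"], f"merge has no vocab entry: {merge}"
    print(path, "ok:", len(model["merges"]), "merges resolve to vocab entries")

check_merges("models/tokenizers/ipa.json")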


@@ -0,0 +1,18 @@
path:
  pretrain_model_dvae: './models/tortoise/dvae.pth'
  strict_load: true
  #resume_state: ../experiments/train_imgnet_vqvae_stage1/training_state/0.state # <-- Set this to resume from a previous training state.
networks:
  dvae:
    type: generator
    which_model_G: lucidrains_dvae
    kwargs:
      channels: 80
      codebook_dim: 512
      hidden_dim: 512
      kernel_size: 3
      num_layers: 2
      num_resnet_blocks: 3
      num_tokens: 8192
      positional_dims: 1
      use_transposed_convs: false

@@ -1 +1 @@
Subproject commit 5bd8624d24be8331fe512c4f8f143d18721bfceb
Subproject commit 42a4fa99fee363a6ba1d643f8100f52914de04f0


@@ -90,24 +90,35 @@ if not exist "%fileds_name%" (
    echo File %fileds_name% already exists, skipping download.
)

:: Install Fairseq, Deepspeed and RVC TTS Pipeline
set download_pyfastmp3decoder=
set filepyfast_name=pyfastmp3decoder-0.0.1-cp311-cp311-win_amd64.whl
if not exist "%filepyfast_name%" (
    echo Downloading %filepyfast_name%...
    curl -L -O "%download_pyfastmp3decoder%"
    if errorlevel 1 (
        echo Download failed. Please check your internet connection or the URL and try again.
        exit /b 1
    )
) else (
    echo File %filepyfast_name% already exists, skipping download.
)
:: Install Fairseq, Deepspeed, pyfast, and RVC TTS Pipeline
python -m pip install .\fairseq-0.12.4-cp311-cp311-win_amd64.whl
python -m pip install git+https://github.com/JarodMica/rvc-tts-pipeline.git@lightweight#egg=rvc_tts_pipe
python -m pip install deepspeed-0.14.0-cp311-cp311-win_amd64.whl
python -m pip install pyfastmp3decoder-0.0.1-cp311-cp311-win_amd64.whl
:: Install whisperx
python -m pip install git+https://github.com/m-bain/whisperx.git
:: Install JBetker's repo for mp3 training
git clone https://github.com/neonbjb/pyfastmp3decoder.git
cd pyfastmp3decoder
git submodule update --init --recursive
python setup.py install
cd ..
:: Install other requirements (this is done last due to potential package conflicts)
python -m pip install -r requirements.txt
:: Download and install ffmpeg
call download_ffmpeg.bat
:: Setup BnB
.\setup-cuda-bnb.bat
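
A quick post-install check that the packages the script installs actually import; the module names below are the usual import names and are an assumption wherever a wheel name differs from its module name:

import importlib

for module in ("fairseq", "deepspeed", "whisperx", "pyfastmp3decoder"):
    try:
        importlib.import_module(module)
        print(module, "imported OK")
    except ImportError as err:
        print(module, "failed:", err)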


@@ -535,15 +535,28 @@ def save_training_settings_proxy(*args):
    settings, messages = save_training_settings(**kwargs)
    return "\n".join(messages)

def get_dataset_continuation(voice):
    try:
        training_dir = f"training/{voice}/processed"
        if os.path.exists(training_dir):
            processed_dataset_list = [folder for folder in os.listdir(training_dir) if os.path.isdir(os.path.join(training_dir, folder))]
            if processed_dataset_list:
                processed_dataset_list.append("")
                return gr.Dropdown(choices=processed_dataset_list, value="", interactive=True)
    except Exception as e:
        print(f"Error getting dataset continuation: {str(e)}")
    return gr.Dropdown(choices=[], value="", interactive=True)

def update_voices():
def update_voices(voice):
    return (
        gr.Dropdown(choices=get_voice_list(append_defaults=True)),
        gr.Dropdown(choices=get_voice_list()),
        gr.Dropdown(choices=get_voice_list(args.results_folder)),
        gr.Dropdown(choices=get_rvc_models()), # Update for RVC models
        gr.Dropdown(choices=get_rvc_indexes()), # Update for RVC models
        gr.Dropdown(choices=get_voice_list())
        gr.Dropdown(choices=get_voice_list()),
        get_dataset_continuation(voice)
    )
@@ -579,17 +592,6 @@ def setup_gradio():
    voice_list = get_voice_list()
    result_voices = get_voice_list(args.results_folder)

    def get_dataset_continuation(voice):
        try:
            training_dir = f"training/{voice}/processed"
            if os.path.exists(training_dir):
                processed_dataset_list = [folder for folder in os.listdir(training_dir) if os.path.isdir(os.path.join(training_dir, folder))]
                if processed_dataset_list:
                    processed_dataset_list.append("")
                    return gr.Dropdown(choices=processed_dataset_list, value="", interactive=True)
        except Exception as e:
            print(f"Error getting dataset continuation: {str(e)}")
        return gr.Dropdown(choices=[], value="", interactive=True)

    valle_models = get_valle_models()
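
Since update_voices now returns seven components, any event wiring must list the processed-dataset dropdown as an extra output. A sketch of the hookup inside setup_gradio, where every component variable below is hypothetical:

# hypothetical component names; the real ones are defined in setup_gradio()
refresh_voices.click(
    update_voices,
    inputs=voice_dropdown,
    outputs=[voice_dropdown, voice_dropdown_settings, results_voices_dropdown,
             rvc_model_dropdown, rvc_index_dropdown, dataset_voice_dropdown,
             dataset_continuation_dropdown],
)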