fixes to setup-cuda and auto-manual install

Jarod Mica 2024-03-29 00:39:29 -07:00
parent 84c0c7c36e
commit 15b27a9e14
10 changed files with 959 additions and 25 deletions

4
.gitignore vendored

@@ -1,7 +1,6 @@
# ignores user files
/venv/
/venv_cpu/
/models/*
/training/*
/config/*
output/
@@ -13,6 +12,7 @@ pyfastmp3decoder/
.vscode/
onnx_test_stuff/
*.whl
*.exe
# Byte-compiled / optimized / DLL files
__pycache__/
@@ -145,7 +145,7 @@ dmypy.json
.pyre/
.idea/*
.models/*
.custom/*
results/*
debug_states/*

34
download_ffmpeg.bat Normal file

@@ -0,0 +1,34 @@
@echo off
set ffmpeg_url=https://www.gyan.dev/ffmpeg/builds/ffmpeg-git-essentials.7z?download=true
set ffmpeg_folder=ffmpeg
set ffmpeg_zip=ffmpeg.7z
curl -o 7z.exe "https://www.7-zip.org/a/7zr.exe"
if not exist "%ffmpeg_folder%" (
if not exist "%ffmpeg_zip%" (
echo Downloading %ffmpeg_zip%...
curl -L -o "%ffmpeg_zip%" "%ffmpeg_url%"
if errorlevel 1 (
echo Download failed. Please check your internet connection or the URL and try again.
exit /b 1
)
) else (
echo File %ffmpeg_zip% already exists, skipping download.
)
echo Extracting %ffmpeg_zip%...
7z.exe x %ffmpeg_zip% -o%ffmpeg_folder%
echo FFmpeg has finished downloading and extracting.
) else (
echo FFmpeg folder %ffmpeg_folder% already exists, skipping download and extraction.
)
:: Move ffmpeg.exe and ffprobe.exe to the ffmpeg folder root
for /D %%i in ("%ffmpeg_folder%\*") do (
if exist "%%i\bin\ffmpeg.exe" move "%%i\bin\ffmpeg.exe" "ffmpeg.exe"
if exist "%%i\bin\ffprobe.exe" move "%%i\bin\ffprobe.exe" "ffprobe.exe"
)
echo FFmpeg moved out of downloaded folder
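
The script leaves ffmpeg.exe and ffprobe.exe in the repository root. A minimal Python sanity check, assuming it is run from that same root:

import subprocess

# Both tools print a version banner and exit 0 when healthy.
for exe in ("ffmpeg.exe", "ffprobe.exe"):
    result = subprocess.run([exe, "-version"], capture_output=True, text=True)
    print(exe, "OK" if result.returncode == 0 else "FAILED")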

5
models/.gitignore vendored

@@ -1,3 +1,6 @@
*
!tokenizers/
!.gitignore
!rvc_models/
!.template.dlas.yaml
!tortoise/

139
models/.template.dlas.yaml Normal file

@@ -0,0 +1,139 @@
name: '${voice}'
model: extensibletrainer
scale: 1
gpu_ids: [0] # Manually edit this if the GPU you want to train on is not your primary, as this will set the env var that exposes CUDA devices
start_step: 0
checkpointing_enabled: true
fp16: ${half_p}
bitsandbytes: ${bitsandbytes}
gpus: ${gpus}

datasets:
  train:
    name: training
    n_workers: ${workers}
    batch_size: ${batch_size}
    mode: paired_voice_audio
    path: ${dataset_path}
    fetcher_mode: ['lj']
    phase: train
    max_wav_length: 255995 # ~11.6 seconds
    max_text_length: 200
    sample_rate: 22050
    load_conditioning: True
    num_conditioning_candidates: 2
    conditioning_length: 44000
    use_bpe_tokenizer: True
    tokenizer_vocab: ${tokenizer_json} # ./models/tortoise/bpe_lowercase_asr_256.json
    load_aligned_codes: False
  val:
    name: validation
    n_workers: ${workers}
    batch_size: ${validation_batch_size}
    mode: paired_voice_audio
    path: ${validation_path}
    fetcher_mode: ['lj']
    phase: val
    max_wav_length: 255995
    max_text_length: 200
    sample_rate: 22050
    load_conditioning: True
    num_conditioning_candidates: 2
    conditioning_length: 44000
    use_bpe_tokenizer: True
    tokenizer_vocab: ${tokenizer_json} # ./models/tortoise/bpe_lowercase_asr_256.json
    load_aligned_codes: False

steps:
  gpt_train:
    training: gpt
    loss_log_buffer: 500

    # Generally follows the recipe from the DALLE paper.
    optimizer: ${optimizer} # this should be adamw_zero if you're using distributed training
    optimizer_params:
      lr: !!float ${learning_rate} # originally: 1e-4
      weight_decay: !!float 1e-2
      beta1: 0.9
      beta2: 0.96
    clip_grad_eps: 4

    injectors:
      paired_to_mel:
        type: torch_mel_spectrogram
        mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth # ./models/tortoise/clips_mel_norms.pth
        in: wav
        out: paired_mel
      paired_cond_to_mel:
        type: for_each
        subtype: torch_mel_spectrogram
        mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth # ./models/tortoise/clips_mel_norms.pth
        in: conditioning
        out: paired_conditioning_mel
      to_codes:
        type: discrete_token
        in: paired_mel
        out: paired_mel_codes
        dvae_config: "./models/tortoise/train_diffusion_vocoder_22k_level.yml"
      paired_fwd_text:
        type: generator
        generator: gpt
        in: [paired_conditioning_mel, padded_text, text_lengths, paired_mel_codes, wav_lengths]
        out: [loss_text_ce, loss_mel_ce, logits]

    losses:
      text_ce:
        type: direct
        weight: ${text_lr_weight}
        key: loss_text_ce
      mel_ce:
        type: direct
        weight: ${mel_lr_weight}
        key: loss_mel_ce

networks:
  gpt:
    type: generator
    which_model_G: unified_voice2
    kwargs:
      layers: 30 # originally: 8
      model_dim: 1024 # originally: 512
      heads: 16 # originally: 8
      max_text_tokens: 402 # originally: 120
      max_mel_tokens: 604 # originally: 250
      max_conditioning_inputs: 2 # originally: 1
      mel_length_compression: 1024
      number_text_tokens: 256 # supposed to be 255 for newer unified_voice files
      number_mel_codes: 8194
      start_mel_token: 8192
      stop_mel_token: 8193
      start_text_token: 255
      train_solo_embeddings: False # missing in uv3/4
      use_mel_codes_as_input: True # ditto
      checkpointing: True
      tortoise_compat: True
      # freeze_everything_but_position_embeddings: True

path:
  strict_load: true
  ${source_model}
  ${resume_state}

train:
  niter: ${iterations}
  warmup_iter: -1
  mega_batch_factor: ${gradient_accumulation_size}
  val_freq: ${validation_rate}
  ema_enabled: false # I really don't think EMA matters
  ${learning_rate_scheme}

eval:
  pure: ${validation_enabled}
  output_state: gen

logger:
  save_checkpoint_freq: ${save_rate}
  visuals: [gen, mel]
  visual_debug_rate: ${save_rate}
  is_mel_spectrogram: true
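
The webui expands the ${...} placeholders above into a concrete per-voice config. A minimal sketch of that expansion, assuming plain string substitution (the mechanism actually used by the webui is not shown in this diff, and the settings values are illustrative):

from pathlib import Path

# Illustrative values; one entry per ${placeholder} in the template.
settings = {
    "voice": "example_voice",
    "half_p": "true",
    "batch_size": "64",
    "learning_rate": "1e-5",
    # ...fill the remaining placeholders the same way
}

yaml_text = Path("models/.template.dlas.yaml").read_text(encoding="utf-8")
for key, value in settings.items():
    yaml_text = yaml_text.replace("${" + key + "}", value)
Path("training/example_voice/train.yaml").write_text(yaml_text, encoding="utf-8")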

144
models/tokenizers/ipa.json Normal file

@@ -0,0 +1,144 @@
{
"version": "1.0",
"truncation": null,
"padding": null,
"normalizer": null,
"pre_tokenizer": null,
"post_processor": null,
"decoder": null,
"added_tokens": [
{
"id": 0,
"special": true,
"content": "[STOP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false
},
{
"id": 1,
"special": true,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false
},
{
"id": 2,
"special": true,
"content": "[SPACE]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false
}
],
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "[UNK]",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"vocab": {
"[STOP]": 0,
"[UNK]": 1,
"[SPACE]": 2,
"!": 3,
"'": 4,
"(": 5,
")": 6,
",": 7,
"-": 8,
".": 9,
"/": 10,
":": 11,
";": 12,
"?": 13,
"a": 14,
"b": 15,
"c": 16,
"d": 17,
"e": 18,
"f": 19,
"g": 20,
"h": 21,
"i": 22,
"j": 23,
"k": 24,
"l": 25,
"m": 26,
"n": 27,
"o": 28,
"p": 29,
"q": 30,
"r": 31,
"s": 32,
"t": 33,
"u": 34,
"v": 35,
"w": 36,
"x": 37,
"y": 38,
"z": 39,
"d͡": 40,
"t͡": 41,
"|": 42,
"æ": 43,
"ð": 44,
"ŋ": 45,
"ɑ": 46,
"ɔ": 47,
"ə": 48,
"ɚ": 49,
"ɛ": 50,
"ɡ": 51,
"ɪ": 52,
"ɹ": 53,
"ʃ": 54,
"ʊ": 55,
"ʌ": 56,
"ʒ": 57,
"θ": 58,
"ɐ": 59,
"ɜ": 60,
"ᵻ": 61,
"ɾ": 62,
"n\u0329": 63,
"ː": 64,
"ˈ": 65,
"ˌ": 66,
"ʔ": 67,
"d͡ʒ": 68,
"aɪ": 69,
"aʊ": 70,
"eɪ": 71,
"oʊ": 72,
"t͡ʃ": 73,
"ɔɪ": 74,
"ɔː": 75,
"uː": 76,
"iː": 77,
"ɑː": 78,
"oː": 79,
"ɜː": 80
},
"merges": [
"a ɪ",
"e ɪ",
ɪ",
"a ʊ",
"o ʊ",
"d͡ ʒ",
"t͡ ʃ",
"i ː",
"o ː",
"u ː",
"ɑ ː",
ː",
ː"
]
}
}
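
The file can be loaded with the Hugging Face tokenizers library; a short sketch, with an assumed IPA input string:

from tokenizers import Tokenizer

tok = Tokenizer.from_file("models/tokenizers/ipa.json")
encoding = tok.encode("həˈloʊ")  # assumed example input, IPA for "hello"
print(encoding.tokens)  # merged symbols such as "oʊ"
print(encoding.ids)     # ids from the vocab above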


@@ -0,0 +1,583 @@
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[STOP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[SPACE]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "0",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "1",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 5,
"content": "2",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 6,
"content": "3",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 7,
"content": "4",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 8,
"content": "5",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 9,
"content": "6",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 10,
"content": "7",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 11,
"content": "8",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 12,
"content": "9",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "Whitespace"
},
"post_processor": null,
"decoder": null,
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "[UNK]",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"vocab": {
"[STOP]": 0,
"[UNK]": 1,
"[SPACE]": 2,
"0": 3,
"1": 4,
"2": 5,
"3": 6,
"4": 7,
"5": 8,
"6": 9,
"7": 10,
"8": 11,
"9": 12,
"!": 13,
"\"": 14,
"%": 15,
"'": 16,
",": 17,
"-": 18,
".": 19,
"?": 20,
"a": 21,
"b": 22,
"c": 23,
"d": 24,
"e": 25,
"f": 26,
"g": 27,
"h": 28,
"i": 29,
"j": 30,
"k": 31,
"l": 32,
"m": 33,
"n": 34,
"o": 35,
"p": 36,
"q": 37,
"r": 38,
"s": 39,
"t": 40,
"u": 41,
"v": 42,
"w": 43,
"x": 44,
"y": 45,
"z": 46,
"¡": 47,
"ª": 48,
"°": 49,
"º": 50,
"»": 51,
"¿": 52,
"à": 53,
"á": 54,
"â": 55,
"å": 56,
"ç": 57,
"è": 58,
"é": 59,
"ê": 60,
"ë": 61,
"í": 62,
"ñ": 63,
"ó": 64,
"ô": 65,
"ö": 66,
"ú": 67,
"û": 68,
"ü": 69,
"œ": 70,
"—": 71,
"…": 72,
"de": 73,
"en": 74,
"os": 75,
"es": 76,
"la": 77,
"er": 78,
"qu": 79,
"an": 80,
"ar": 81,
"on": 82,
"as": 83,
"or": 84,
"que": 85,
"el": 86,
"do": 87,
"al": 88,
"ci": 89,
"re": 90,
"in": 91,
"un": 92,
"ab": 93,
"to": 94,
"te": 95,
"se": 96,
"di": 97,
"tr": 98,
"con": 99,
"ad": 100,
"su": 101,
"los": 102,
"le": 103,
"ta": 104,
"co": 105,
"ti": 106,
"mi": 107,
"no": 108,
"lo": 109,
"cu": 110,
"ía": 111,
"me": 112,
"ri": 113,
"por": 114,
"vi": 115,
"si": 116,
"ch": 117,
"ca": 118,
"mo": 119,
"ra": 120,
"par": 121,
"las": 122,
"ro": 123,
"th": 124,
"da": 125,
"pu": 126,
"per": 127,
"ón": 128,
"ent": 129,
"des": 130,
"li": 131,
"so": 132,
"ma": 133,
"est": 134,
"del": 135,
"aba": 136,
"gu": 137,
"men": 138,
"mp": 139,
"is": 140,
"mb": 141,
"ha": 142,
"mu": 143,
"tu": 144,
"una": 145,
"era": 146,
"hab": 147,
"go": 148,
"res": 149,
"dos": 150,
"man": 151,
"ás": 152,
"para": 153,
"po": 154,
"ve": 155,
"tra": 156,
"pa": 157,
"vo": 158,
"mos": 159,
"the": 160,
"ten": 161,
"ando": 162,
"pi": 163,
"fu": 164,
"ada": 165,
"jo": 166,
"ce": 167,
"ver": 168,
"bi": 169,
"qui": 170,
"como": 171,
"tan": 172,
"us": 173,
"más": 174,
"pe": 175,
"dad": 176,
"ción": 177,
"ho": 178,
"hi": 179,
"car": 180,
"ter": 181,
"án": 182,
"cer": 183,
"cia": 184,
"sus": 185,
"cas": 186,
"he": 187,
"ado": 188,
"fi": 189,
"yo": 190,
"gr": 191,
"mente": 192,
"ba": 193,
"sa": 194,
"ni": 195,
"ser": 196,
"tro": 197,
"cor": 198,
"mar": 199,
"pro": 200,
"tar": 201,
"lla": 202,
"lu": 203,
"za": 204,
"ces": 205,
"les": 206,
"sin": 207,
"ex": 208,
"pero": 209,
"and": 210,
"mis": 211,
"ven": 212,
"cho": 213,
"tes": 214,
"ya": 215,
"of": 216,
"cos": 217,
"ia": 218,
"du": 219,
"lan": 220,
"señ": 221,
"esta": 222,
"ga": 223,
"va": 224,
"ir": 225,
"ing": 226,
"endo": 227,
"bre": 228,
"min": 229,
"aque": 230,
"na": 231,
"lle": 232,
"bu": 233,
"pas": 234,
"tos": 235,
"io": 236,
"mas": 237,
"for": 238,
"emp": 239,
"ente": 240,
"és": 241,
"había": 242,
"ora": 243,
"ían": 244,
"ones": 245,
"je": 246,
"tor": 247,
"cab": 248,
"pre": 249,
"all": 250,
"nu": 251,
"én": 252,
"eron": 253,
"él": 254,
"ero": 255
},
"merges": [
"d e",
"e n",
"o s",
"e s",
"l a",
"e r",
"q u",
"a n",
"a r",
"o n",
"a s",
"o r",
"qu e",
"e l",
"d o",
"a l",
"c i",
"r e",
"i n",
"u n",
"a b",
"t o",
"t e",
"s e",
"d i",
"t r",
"c on",
"a d",
"s u",
"l os",
"l e",
"t a",
"c o",
"t i",
"m i",
"n o",
"l o",
"c u",
"í a",
"m e",
"r i",
"p or",
"v i",
"s i",
"c h",
"c a",
"m o",
"r a",
"p ar",
"la s",
"r o",
"t h",
"d a",
"p u",
"p er",
"ó n",
"en t",
"de s",
"l i",
"s o",
"m a",
"es t",
"de l",
"ab a",
"g u",
"m en",
"m p",
"i s",
"m b",
"h a",
"m u",
"t u",
"un a",
"er a",
"h ab",
"g o",
"r es",
"d os",
"m an",
"á s",
"par a",
"p o",
"v e",
"tr a",
"p a",
"v o",
"m os",
"th e",
"t en",
"an do",
"p i",
"f u",
"ad a",
"j o",
"c e",
"v er",
"b i",
"qu i",
"co mo",
"t an",
"u s",
"m ás",
"p e",
"d ad",
"ci ón",
"h o",
"h i",
"c ar",
"t er",
"á n",
"c er",
"ci a",
"su s",
"c as",
"h e",
"a do",
"f i",
"y o",
"g r",
"men te",
"b a",
"s a",
"n i",
"s er",
"tr o",
"c or",
"m ar",
"p ro",
"t ar",
"l la",
"l u",
"z a",
"c es",
"l es",
"s in",
"e x",
"per o",
"an d",
"mi s",
"v en",
"ch o",
"t es",
"y a",
"o f",
"c os",
"i a",
"d u",
"la n",
"se ñ",
"es ta",
"g a",
"v a",
"i r",
"in g",
"en do",
"b re",
"m in",
"a que",
"n a",
"l le",
"b u",
"p as",
"t os",
"i o",
"m as",
"f or",
"e mp",
"en te",
"é s",
"hab ía",
"or a",
"í an",
"on es",
"j e",
"t or",
"c ab",
"p re",
"al l",
"n u",
"é n",
"er on",
"é l",
"er o"
],
"language": "es"
}
}
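
One consistency property of these BPE files is that joining the two halves of each merge must yield an existing vocab token. A small sketch that checks it, run against ipa.json since the Spanish tokenizer's path is not shown in this diff:

import json

def check_merges(path):
    with open(path, encoding="utf-8") as f:
        model = json.load(f)["model"]
    for merge in model["merges"]:
        left, right = merge.split(" ", 1)
        assert left + right in model["vocab"], f"merge has no vocab entry: {merge}"
    print(path, "ok:", len(model["merges"]), "merges resolve to vocab entries")

check_merges("models/tokenizers/ipa.json")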


@@ -0,0 +1,18 @@
path:
  pretrain_model_dvae: './models/tortoise/dvae.pth'
  strict_load: true
  #resume_state: ../experiments/train_imgnet_vqvae_stage1/training_state/0.state # <-- Set this to resume from a previous training state.
networks:
  dvae:
    type: generator
    which_model_G: lucidrains_dvae
    kwargs:
      channels: 80
      codebook_dim: 512
      hidden_dim: 512
      kernel_size: 3
      num_layers: 2
      num_resnet_blocks: 3
      num_tokens: 8192
      positional_dims: 1
      use_transposed_convs: false

@@ -1 +1 @@
Subproject commit 5bd8624d24be8331fe512c4f8f143d18721bfceb
Subproject commit 42a4fa99fee363a6ba1d643f8100f52914de04f0


@@ -90,24 +90,35 @@ if not exist "%fileds_name%" (
    echo File %fileds_name% already exists, skipping download.
)

:: Install Fairseq, Deepspeed and RVC TTS Pipeline
set download_pyfastmp3decoder=
set filepyfast_name=pyfastmp3decoder-0.0.1-cp311-cp311-win_amd64.whl
if not exist "%filepyfast_name%" (
    echo Downloading %filepyfast_name%...
    curl -L -O "%download_pyfastmp3decoder%"
    if errorlevel 1 (
        echo Download failed. Please check your internet connection or the URL and try again.
        exit /b 1
    )
) else (
    echo File %filepyfast_name% already exists, skipping download.
)
:: Install Fairseq, Deepspeed, pyfast, and RVC TTS Pipeline
python -m pip install .\fairseq-0.12.4-cp311-cp311-win_amd64.whl
python -m pip install git+https://github.com/JarodMica/rvc-tts-pipeline.git@lightweight#egg=rvc_tts_pipe
python -m pip install deepspeed-0.14.0-cp311-cp311-win_amd64.whl
python -m pip install pyfastmp3decoder-0.0.1-cp311-cp311-win_amd64.whl
:: Install whisperx
python -m pip install git+https://github.com/m-bain/whisperx.git
:: Install JBetker's repo for mp3 training
git clone https://github.com/neonbjb/pyfastmp3decoder.git
cd pyfastmp3decoder
git submodule update --init --recursive
python setup.py install
cd ..
:: Install other requirements (this is done last due to potential package conflicts)
python -m pip install -r requirements.txt
:: Download and install ffmpeg
call download_ffmpeg.bat
:: Setup BnB
.\setup-cuda-bnb.bat
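
A quick post-install check that the packages the script installs actually import; the module names below are the usual import names and are an assumption wherever a wheel name differs from its module name:

import importlib

for module in ("fairseq", "deepspeed", "whisperx", "pyfastmp3decoder"):
    try:
        importlib.import_module(module)
        print(module, "imported OK")
    except ImportError as err:
        print(module, "failed:", err)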


@@ -535,15 +535,28 @@ def save_training_settings_proxy(*args):
    settings, messages = save_training_settings(**kwargs)
    return "\n".join(messages)

def get_dataset_continuation(voice):
    try:
        training_dir = f"training/{voice}/processed"
        if os.path.exists(training_dir):
            processed_dataset_list = [folder for folder in os.listdir(training_dir) if os.path.isdir(os.path.join(training_dir, folder))]
            if processed_dataset_list:
                processed_dataset_list.append("")
                return gr.Dropdown(choices=processed_dataset_list, value="", interactive=True)
    except Exception as e:
        print(f"Error getting dataset continuation: {str(e)}")
    return gr.Dropdown(choices=[], value="", interactive=True)

def update_voices():
def update_voices(voice):
    return (
        gr.Dropdown(choices=get_voice_list(append_defaults=True)),
        gr.Dropdown(choices=get_voice_list()),
        gr.Dropdown(choices=get_voice_list(args.results_folder)),
        gr.Dropdown(choices=get_rvc_models()), # Update for RVC models
        gr.Dropdown(choices=get_rvc_indexes()), # Update for RVC models
        gr.Dropdown(choices=get_voice_list())
        gr.Dropdown(choices=get_voice_list()),
        get_dataset_continuation(voice)
    )
@@ -579,17 +592,6 @@ def setup_gradio():
    voice_list = get_voice_list()
    result_voices = get_voice_list(args.results_folder)

    def get_dataset_continuation(voice):
        try:
            training_dir = f"training/{voice}/processed"
            if os.path.exists(training_dir):
                processed_dataset_list = [folder for folder in os.listdir(training_dir) if os.path.isdir(os.path.join(training_dir, folder))]
                if processed_dataset_list:
                    processed_dataset_list.append("")
                    return gr.Dropdown(choices=processed_dataset_list, value="", interactive=True)
        except Exception as e:
            print(f"Error getting dataset continuation: {str(e)}")
        return gr.Dropdown(choices=[], value="", interactive=True)

    valle_models = get_valle_models()
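
Since update_voices now returns seven components, any event wiring must list the processed-dataset dropdown as an extra output. A sketch of the hookup inside setup_gradio, where every component variable below is hypothetical:

# hypothetical component names; the real ones are defined in setup_gradio()
refresh_voices.click(
    update_voices,
    inputs=voice_dropdown,
    outputs=[voice_dropdown, voice_dropdown_settings, results_voices_dropdown,
             rvc_model_dropdown, rvc_index_dropdown, dataset_voice_dropdown,
             dataset_continuation_dropdown],
)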