Add RVC inference capabilities

This commit is contained in:
Jarod Mica 2024-01-15 02:53:07 -08:00
parent 09acc1a1d8
commit b7879cc8dc
6 changed files with 204 additions and 6 deletions

2
.gitignore vendored
View file

@ -3,6 +3,8 @@
/models/*
/training/*
/config/*
output/
*.wav
# Byte-compiled / optimized / DLL files
__pycache__/

View file

@ -1,4 +1,20 @@
# Changelogs & Notes
## 1/14/2024
- Look at 48khz hifigan
- NOTE TO SELF, when installing rvc-tts-pipeline at this time, you modified the rvc_infer to look for the rvc package inside of modules instead of having it in the parent folder.
- Also modified imports in rvc: find and replace all rvc.infer. with modules.rvc.infer.
- Modified hardcoded paths in rvc configs
- utils.py for hubert and target_folder
- pipeline.py for rmvpe path
- Add option to be able to use RVC voice model
- EXEC_SETTINGS used instead as we just need to set a parameter to be able to use it
- Add a function that handles rvc voice models for the drop down menu. Voice models are located at models/rvc_models
- Add rvc voice model refresh to update_voices and the refresh_voices button
## Some date in December
- Added whisper large-v3 to the list of whisper models available.
## 12/17/2023
- Changed the
- Resolved an import error caused by a newer version of rotary_embedding_torch.
- Modified portions of the code in dlas to use broadcast_tensors instead of broadcat. In the latest version of rotary_embedding_torch (0.5.0 and higher), broadcat was removed due to redundancy, as it looks like broadcast_tensors is a part of torch.

1
modules/rvc Submodule

@ -0,0 +1 @@
Subproject commit e17294b634674de8925c4bf4607adc15004bacce

1
reload_flag.txt Normal file
View file

@ -0,0 +1 @@
reload

View file

@ -42,11 +42,11 @@ from tortoise.api_fast import TextToSpeech as Toroise_TTS_Hifi
from tortoise.utils.audio import load_audio, load_voice, load_voices, get_voice_dir, get_voices
from tortoise.utils.text import split_and_recombine_text
from tortoise.utils.device import get_device_name, set_device_name, get_device_count, get_device_vram, get_device_batch_size, do_gc
from rvc_pipe.rvc_infer import rvc_convert
MODELS['dvae.pth'] = "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/3704aea61678e7e468a06d8eea121dba368a798e/.models/dvae.pth"
WHISPER_MODELS = ["tiny", "base", "small", "medium", "large", "large-v1", "large-v2"]
WHISPER_MODELS = ["tiny", "base", "small", "medium", "large", "large-v1", "large-v2", "large-v3"]
WHISPER_SPECIALIZED_MODELS = ["tiny.en", "base.en", "small.en", "medium.en"]
WHISPER_BACKENDS = ["openai/whisper", "lightmare/whispercpp", "m-bain/whisperx"]
VOCODERS = ['univnet', 'bigvgan_base_24khz_100band', 'bigvgan_24khz_100band']
@ -1236,7 +1236,8 @@ def generate_tortoise(**kwargs):
parameters['seed'] = additionals[0]
except Exception as e:
raise RuntimeError(f'Possible latent mismatch: click the "(Re)Compute Voice Latents" button and then try again. Error: {e}')
# print(type(gen))
# print(gen)
run_time = time.time()-start_time
print(f"Generating line took {run_time} seconds")
@ -1358,6 +1359,32 @@ def generate_tortoise(**kwargs):
sample_voice = (tts.input_sample_rate, sample_voice.numpy())
info = get_info(voice=voice, latents=False)
#insert rvc stuff
if args.use_rvc:
rvc_settings = load_rvc_settings()
rvc_model_path = os.path.join("models", "rvc_models", rvc_settings['rvc_model'])
rvc_index_path = os.path.join("models", "rvc_models", rvc_settings['file_index'])
print (rvc_model_path)
rvc_out_path = rvc_convert(model_path=rvc_model_path,
input_path=output_voices[0],
f0_up_key=rvc_settings['f0_up_key'],
file_index=rvc_index_path,
index_rate=rvc_settings['index_rate'],
filter_radius=rvc_settings['filter_radius'],
resample_sr=rvc_settings['resample_sr'],
rms_mix_rate=rvc_settings['rms_mix_rate'],
protect=rvc_settings['protect'])
# Read the contents from rvc_out_path
with open(rvc_out_path, 'rb') as file:
content = file.read()
# Write the contents to output_voices[0], effectively replacing its contents
with open(output_voices[0], 'wb') as file:
file.write(content)
print(f"Generation took {info['time']} seconds, saved to '{output_voices[0]}'\n")
info['seed'] = usedSeed
@ -3307,7 +3334,11 @@ def setup_args(cli=False):
'latents-lean-and-mean': True,
'voice-fixer': False, # getting tired of long initialization times in a Colab for downloading a large dataset for it
'use-deepspeed': False,
#stuff that jarod has added
'use-hifigan': False,
'use-rvc' : False,
'rvc-model' : None,
'voice-fixer-use-cuda': True,
@ -3368,6 +3399,18 @@ def setup_args(cli=False):
parser.add_argument("--voice-fixer-use-cuda", action='store_true', default=default_arguments['voice-fixer-use-cuda'], help="Hints to voicefixer to use CUDA, if available.")
parser.add_argument("--use-deepspeed", action='store_true', default=default_arguments['use-deepspeed'], help="Use deepspeed for speed bump.")
parser.add_argument("--use-hifigan", action='store_true', default=default_arguments['use-hifigan'], help="Use Hifigan instead of Diffusion")
parser.add_argument("--use-rvc", action='store_true', default=default_arguments['use-rvc'], help="Run the outputted audio thorugh RVC")
# --rvc-model carries a value (the model file name), so it must accept a string
# rather than act as a boolean store_true flag.
parser.add_argument("--rvc-model", type=str, default=default_arguments['rvc-model'], help="Specifies RVC model to use")
# parser.add_argument("--f0_up_key", action='store_true', default=default_arguments['f0_up_key'], help="transpose of the audio file, changes pitch (positive makes voice higher pitch)")
# parser.add_argument("--f0method", action='store_true', default=default_arguments['f0method'], help="picks which f0 method to use: dio, harvest, crepe, rmvpe (requires rmvpe.pt)")
# parser.add_argument("--file_index", action='store_true', default=default_arguments['file_index'], help="path to file_index, defaults to None")
# parser.add_argument("--index_rate", action='store_true', default=default_arguments['index_rate'], help="strength of the index file if provided")
# parser.add_argument("--filter_radius", action='store_true', default=default_arguments['filter_radius'], help="if >=3: apply median filtering to the harvested pitch results. The value represents the filter radius and can reduce breathiness.")
# parser.add_argument("--resample_sr", action='store_true', default=default_arguments['resample_sr'], help="quality at which to resample audio to, defaults to no resample")
# parser.add_argument("--rms_mix_rate", action='store_true', default=default_arguments['rms_mix_rate'], help="adjust the volume envelope scaling. Closer to 0, the more it mimicks the volume of the original vocals. Can help mask noise and make volume sound more natural when set relatively low. Closer to 1 will be more of a consistently loud volume")
# parser.add_argument("--protect", action='store_true', default=default_arguments['protect'], help="protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy")
parser.add_argument("--force-cpu-for-conditioning-latents", default=default_arguments['force-cpu-for-conditioning-latents'], action='store_true', help="Forces computing conditional latents to be done on the CPU (if you constantyl OOM on low chunk counts)")
parser.add_argument("--defer-tts-load", default=default_arguments['defer-tts-load'], action='store_true', help="Defers loading TTS model")
@ -3455,6 +3498,8 @@ def get_default_settings( hypenated=True ):
'voice-fixer': args.voice_fixer,
'use-deepspeed': args.use_deepspeed,
'use-hifigan': args.use_hifigan,
'use-rvc': args.use_rvc,
'rvc-model' : args.rvc_model,
'voice-fixer-use-cuda': args.voice_fixer_use_cuda,
'concurrency-count': args.concurrency_count,
'output-sample-rate': args.output_sample_rate,
@ -3510,6 +3555,8 @@ def update_args( **kwargs ):
args.voice_fixer_use_cuda = settings['voice_fixer_use_cuda']
args.use_deepspeed = settings['use_deepspeed']
args.use_hifigan = settings['use_hifigan']
args.use_rvc = settings['use_rvc']
args.rvc_model = settings['rvc_model']
args.concurrency_count = settings['concurrency_count']
args.output_sample_rate = 44000
args.autocalculate_voice_chunk_duration_size = settings['autocalculate_voice_chunk_duration_size']
@ -4017,3 +4064,20 @@ def merge_models( primary_model_name, secondary_model_name, alpha, progress=gr.P
message = f"Saved to {output_path}"
print(message)
return message
#Stuff added by Jarod
def get_rvc_models():
    """Return the file names of the RVC voice models in ``models/rvc_models``.

    Only regular files ending in ``.pth`` are listed. The names are sorted so
    the result is stable between calls, and an empty list is returned when
    the folder does not exist instead of raising ``FileNotFoundError``.
    """
    folder_path = 'models/rvc_models'
    if not os.path.isdir(folder_path):
        return []
    return sorted(
        f for f in os.listdir(folder_path)
        if f.endswith('.pth') and os.path.isfile(os.path.join(folder_path, f))
    )
def get_rvc_indexes():
    """Return the file names of the RVC index files in ``models/rvc_models``.

    Only regular files ending in ``.index`` are listed. The names are sorted
    so the result is stable between calls, and an empty list is returned when
    the folder does not exist instead of raising ``FileNotFoundError``.
    """
    folder_path = 'models/rvc_models'
    if not os.path.isdir(folder_path):
        return []
    return sorted(
        f for f in os.listdir(folder_path)
        if f.endswith('.index') and os.path.isfile(os.path.join(folder_path, f))
    )
def load_rvc_settings():
    """Read the RVC settings stored in ``./config/rvc.json``.

    Returns the parsed JSON content, or an empty dict when the file is absent.
    """
    rvc_settings_path = './config/rvc.json'
    # Guard clause: no settings file means there is nothing to load.
    if not os.path.exists(rvc_settings_path):
        return {}
    with open(rvc_settings_path, 'r') as settings_file:
        stored = json.load(settings_file)
    return stored

View file

@ -24,12 +24,25 @@ from utils import *
args = setup_args()
GENERATE_SETTINGS = {}
# Default RVC conversion settings. load_rvc_settings() overlays the values
# saved in ./config/rvc.json on top of these, and setup_gradio() later replaces
# each value with the Gradio component that edits it.
# NOTE: key order matters: update_rvc_settings_proxy() pairs its positional
# arguments with these keys in insertion order.
RVC_SETTINGS = {
# File name of the voice model, chosen from get_rvc_models()
'rvc_model': '',
# Pitch transpose of the audio; positive values raise the pitch
'f0_up_key': 0,
# File name of the index file, chosen from get_rvc_indexes()
'file_index': '',
# Strength of the index file, if one is provided
'index_rate' : 0,
# If >= 3, median filtering is applied to the harvested pitch results
'filter_radius': 3,
# Sample rate the converted audio is resampled to
'resample_sr': 48000,
# Volume envelope scaling; closer to 0 mimics the original volume more
'rms_mix_rate': 0.25,
# Protection of voiceless consonants and breath sounds; 0.5 disables it
'protect': 0.33,
}
TRANSCRIBE_SETTINGS = {}
EXEC_SETTINGS = {}
TRAINING_SETTINGS = {}
MERGER_SETTINGS = {}
GENERATE_SETTINGS_ARGS = []
PRESETS = {
'Ultra Fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 30, 'cond_free': False},
'Fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 80},
@ -55,6 +68,55 @@ HISTORY_HEADERS = {
"Model Hash": "model_hash",
}
# Load settings from a file if it exists
def load_rvc_settings():
    """Merge the settings saved in ``./config/rvc.json`` into ``RVC_SETTINGS``.

    Loading is best-effort: a missing, unreadable or malformed file leaves
    ``RVC_SETTINGS`` untouched. Unlike a bare ``except``, only I/O and parsing
    errors are handled, and the failure is printed rather than hidden.
    """
    settings_path = './config/rvc.json'
    if not os.path.exists(settings_path):
        return
    try:
        with open(settings_path, 'r', encoding='utf-8') as f:
            loaded = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Failed to load RVC settings from '{settings_path}': {e}")
        return
    if not isinstance(loaded, dict):
        # Valid JSON that is not an object cannot be merged into the settings.
        print(f"Ignoring RVC settings in '{settings_path}': expected a JSON object")
        return
    RVC_SETTINGS.update(loaded)
def update_rvc_settings(**kwargs):
    """Apply the given keyword overrides to ``RVC_SETTINGS`` and save them."""
    for name, value in kwargs.items():
        RVC_SETTINGS[name] = value
    save_rvc_settings()
def save_rvc_settings():
    """Write ``RVC_SETTINGS`` to ``./config/rvc.json`` as tab-indented JSON.

    Raises:
        TypeError: if a value in ``RVC_SETTINGS`` is not JSON-serializable.
            The existing settings file is left untouched in that case.
    """
    # Serialize before opening the file: opening with 'w' truncates it, so a
    # serialization failure afterwards would wipe the previously saved settings.
    serialized = json.dumps(RVC_SETTINGS, indent='\t')
    os.makedirs('./config/', exist_ok=True)
    with open('./config/rvc.json', 'w', encoding="utf-8") as f:
        f.write(serialized)
def save_rvc_settings_to_json():
    """Merge the current ``RVC_SETTINGS`` values into the RVC JSON config file.

    Settings already stored in the file are kept unless ``RVC_SETTINGS`` holds
    the same key. Entries that are Gradio ``Slider`` or ``Dropdown`` components
    are stored by their ``.value``; every other entry is stored as-is.
    """
    # Built with os.path.join rather than a hard-coded backslash so the same
    # config/rvc.json file is addressed on non-Windows systems as well.
    config_file = os.path.join('config', 'rvc.json')

    try:
        with open(config_file, 'r') as json_file:
            existing_settings = json.load(json_file)
    except (FileNotFoundError, json.JSONDecodeError):
        # Missing or unreadable file: start from an empty set of settings.
        existing_settings = {}
    if not isinstance(existing_settings, dict):
        # Valid JSON that is not an object cannot take key assignments.
        existing_settings = {}

    for key, value in RVC_SETTINGS.items():
        if isinstance(value, (gr.Slider, gr.Dropdown)):
            # UI components are not JSON-serializable; persist their value.
            existing_settings[key] = value.value
        else:
            existing_settings[key] = value

    # The config folder may not exist yet on the first save.
    os.makedirs(os.path.dirname(config_file), exist_ok=True)
    with open(config_file, 'w') as json_file:
        json.dump(existing_settings, json_file, indent=4)
# can't use *args OR **kwargs if I want to retain the ability to use progress
def generate_proxy(
text,
@ -83,7 +145,7 @@ def generate_proxy(
progress=gr.Progress(track_tqdm=True)
):
kwargs = locals()
# save_rvc_settings_to_json()
try:
sample, outputs, stats = generate(**kwargs)
except Exception as e:
@ -261,6 +323,15 @@ def update_args_proxy( *args ):
kwargs[k] = v
update_args(**kwargs)
def update_rvc_settings_proxy(*args):
    """Pair positional values with the ``RVC_SETTINGS`` keys and apply them.

    The values are matched to the keys by position, in the key order of
    ``RVC_SETTINGS``; the resulting mapping is passed to ``update_rvc_settings``.
    """
    settings = {name: args[position] for position, name in enumerate(RVC_SETTINGS)}
    update_rvc_settings(**settings)
def optimize_training_settings_proxy( *args ):
kwargs = {}
keys = list(TRAINING_SETTINGS.keys())
@ -327,6 +398,8 @@ def update_voices():
gr.Dropdown.update(choices=get_voice_list(append_defaults=True)),
gr.Dropdown.update(choices=get_voice_list()),
gr.Dropdown.update(choices=get_voice_list(args.results_folder)),
gr.Dropdown.update(choices=get_rvc_models()), # Update for RVC models
gr.Dropdown.update(choices=get_rvc_indexes()) # Update for RVC models
)
def history_copy_settings( voice, file ):
@ -365,6 +438,8 @@ def setup_gradio():
dataset_list = get_dataset_list()
training_list = get_training_list()
load_rvc_settings()
global GENERATE_SETTINGS_ARGS
GENERATE_SETTINGS_ARGS = list(inspect.signature(generate_proxy).parameters.keys())[:-1]
for i in range(len(GENERATE_SETTINGS_ARGS)):
@ -424,6 +499,20 @@ def setup_gradio():
["P", "DDIM"], # + ["K_Euler_A", "DPM++2M"],
value="DDIM", label="Diffusion Samplers", type="value"
)
EXEC_SETTINGS['use_rvc'] = gr.Checkbox(label="Run the outputted audio through RVC", value=args.use_rvc)
with gr.Column(visible=args.use_rvc) as rvc_column:
RVC_SETTINGS['rvc_model'] = gr.Dropdown(choices=get_rvc_models(), label="RVC Voice Model", value=RVC_SETTINGS['rvc_model'], interactive=True)
RVC_SETTINGS['file_index'] = gr.Dropdown(choices=get_rvc_indexes(), label="RVC Index File", value=RVC_SETTINGS["file_index"], interactive=True)
RVC_SETTINGS['index_rate'] = gr.Slider(minimum=0, maximum=1, label="Index Rate", value=RVC_SETTINGS["index_rate"], interactive=True)
RVC_SETTINGS['f0_up_key'] = gr.Slider(minimum=-24, maximum=24, label="Voice Pitch (f0 key)", value=RVC_SETTINGS["f0_up_key"], interactive=True)
# RVC_SETTINGS['f0_method'] = gr.Dropdown(choices=get_rvc_models(), label="RVC Voice Model", value=args.rvc_model)
RVC_SETTINGS['filter_radius'] = gr.Slider(minimum=0, maximum=7, label="Filter Radius", value=RVC_SETTINGS["filter_radius"], interactive=True)
RVC_SETTINGS['resample_sr'] = gr.Slider(minimum=0, maximum=48000, label="Resample sample rate", value=RVC_SETTINGS["resample_sr"], interactive=True)
RVC_SETTINGS['rms_mix_rate'] = gr.Slider(minimum=0, maximum=1, label="RMS Mix Rate (Volume Envelope)", value=RVC_SETTINGS["rms_mix_rate"], interactive=True)
RVC_SETTINGS['protect'] = gr.Slider(minimum=0, maximum=0.5, label="Protect Voiceless Consonants", value=RVC_SETTINGS["protect"], interactive=True)
GENERATE_SETTINGS["cvvp_weight"] = gr.Slider(value=0, minimum=0, maximum=1, label="CVVP Weight")
GENERATE_SETTINGS["top_p"] = gr.Slider(value=0.8, minimum=0, maximum=1, label="Top P")
GENERATE_SETTINGS["diffusion_temperature"] = gr.Slider(value=1.0, minimum=0, maximum=1, label="Diffusion Temperature")
@ -719,6 +808,18 @@ def setup_gradio():
exec_inputs = list(EXEC_SETTINGS.values())
for k in EXEC_SETTINGS:
EXEC_SETTINGS[k].change( fn=update_args_proxy, inputs=exec_inputs )
rvc_inputs = list(RVC_SETTINGS.values())
# for k in RVC_SETTINGS:
# RVC_SETTINGS[k].change(fn=update_rvc_settings_proxy, inputs=rvc_inputs)
for k, component in RVC_SETTINGS.items():
if isinstance(component, gr.Dropdown):
component.change(fn=update_rvc_settings_proxy, inputs=rvc_inputs)
elif isinstance(component, gr.Slider):
component.release(fn=update_rvc_settings_proxy, inputs=rvc_inputs)
EXEC_SETTINGS['autoregressive_model'].change(
fn=update_autoregressive_model,
@ -773,6 +874,13 @@ def setup_gradio():
inputs=show_experimental_settings,
outputs=experimental_column
)
EXEC_SETTINGS['use_rvc'].change(
fn=lambda use_rvc_checked: gr.update(visible=use_rvc_checked),
inputs=EXEC_SETTINGS['use_rvc'],
outputs=rvc_column
)
if preset:
preset.change(fn=update_presets,
inputs=preset,
@ -807,11 +915,17 @@ def setup_gradio():
outputs=[
GENERATE_SETTINGS['voice'],
DATASET_SETTINGS['voice'],
history_voices
history_voices,
RVC_SETTINGS['rvc_model'], # Add this line
RVC_SETTINGS['file_index']
]
)
generate_settings = list(GENERATE_SETTINGS.values())
rvc_settings = list(RVC_SETTINGS.values())
print(generate_settings)
print(rvc_settings)
submit.click(
lambda: (gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)),
outputs=[source_sample, candidates_list, generation_results],