Add RVC inference capabilities

2025-06-07 06:05:52 -04:00 · 2024-01-15 02:53:07 -08:00 · 2024-01-15 02:53:07 -08:00 · b7879cc8dc
commit b7879cc8dc
parent 09acc1a1d8
6 changed files with 204 additions and 6 deletions
--- a/.gitignore
+++ b/.gitignore
@ -3,6 +3,8 @@
 /models/*
 /training/*
 /config/*
+output/
+*.wav

 # Byte-compiled / optimized / DLL files
 __pycache__/
--- a/changelog.md
+++ b/changelog.md
@ -1,4 +1,20 @@
 # Changelogs & Notes

+## 1/14/2024
+- Look at 48khz hifigan
+- NOTE TO SELF, when installing rvc-tts-pipeline at this time, you modified the rvc_infer to look for the rvc package inside of modules instead of having it in the parent folder.
+    - Also modified imports in rvc: find and replace all rvc.infer. with modules.rvc.infer.
+    - Modified hardcoded paths in rvc configs
+    - utils.py for hubert and target_folder
+    - pipeline.py for rmvpe path
+- Add option to be able to use RVC voice model
+    - EXEC_SETTINGS used instead as we just need to set a parameter to be able to use it
+- Add a function that handles rvc voice models for the drop down menu.  Voice models are located at models/rvc_models
+- Add rvc voice model refresh to update_voices and the refresh_voices button
+
+## Some date in December
+- Added whisper large-v3 to the list of whisper models available.  
+
 ## 12/17/2023
- Changed the 
+- Resolved an import error caused by a newer version of rotary_embedding_torch.
+    - Modified portions of the code in dlas to use broadcast_tensors instead of broadcat.  In the latest versions of rotary_embedding_torch (0.5.0 and higher), broadcat was removed due to redundancy, as broadcast_tensors is already a part of torch
--- a/modules/rvc
+++ b/modules/rvc
@ -0,0 +1 @@
+Subproject commit e17294b634674de8925c4bf4607adc15004bacce
--- a/reload_flag.txt
+++ b/reload_flag.txt
@ -0,0 +1 @@
+reload
--- a/src/utils.py
+++ b/src/utils.py
@ -42,11 +42,11 @@ from tortoise.api_fast import TextToSpeech as Toroise_TTS_Hifi
 from tortoise.utils.audio import load_audio, load_voice, load_voices, get_voice_dir, get_voices
 from tortoise.utils.text import split_and_recombine_text
 from tortoise.utils.device import get_device_name, set_device_name, get_device_count, get_device_vram, get_device_batch_size, do_gc
-
+from rvc_pipe.rvc_infer import rvc_convert

 MODELS['dvae.pth'] = "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/3704aea61678e7e468a06d8eea121dba368a798e/.models/dvae.pth"

-WHISPER_MODELS = ["tiny", "base", "small", "medium", "large", "large-v1", "large-v2"]
+WHISPER_MODELS = ["tiny", "base", "small", "medium", "large", "large-v1", "large-v2", "large-v3"]
 WHISPER_SPECIALIZED_MODELS = ["tiny.en", "base.en", "small.en", "medium.en"]
 WHISPER_BACKENDS = ["openai/whisper", "lightmare/whispercpp", "m-bain/whisperx"]
 VOCODERS = ['univnet', 'bigvgan_base_24khz_100band', 'bigvgan_24khz_100band']
@ -1236,7 +1236,8 @@ def generate_tortoise(**kwargs):
 				parameters['seed'] = additionals[0]
 		except Exception as e:
 			raise RuntimeError(f'Possible latent mismatch: click the "(Re)Compute Voice Latents" button and then try again. Error: {e}')
-		
+		# print(type(gen))
+		# print(gen)
 		run_time = time.time()-start_time
 		print(f"Generating line took {run_time} seconds")

@ -1358,6 +1359,32 @@ def generate_tortoise(**kwargs):
 		sample_voice = (tts.input_sample_rate, sample_voice.numpy())

 	info = get_info(voice=voice, latents=False)
+
+	#insert rvc stuff
+	if args.use_rvc:
+		rvc_settings = load_rvc_settings()
+		rvc_model_path = os.path.join("models", "rvc_models", rvc_settings['rvc_model'])
+		rvc_index_path = os.path.join("models", "rvc_models", rvc_settings['file_index'])
+		print (rvc_model_path)
+		rvc_out_path = rvc_convert(model_path=rvc_model_path, 
+							 		input_path=output_voices[0],
+									f0_up_key=rvc_settings['f0_up_key'],
+									file_index=rvc_index_path,
+									index_rate=rvc_settings['index_rate'],
+									filter_radius=rvc_settings['filter_radius'],
+									resample_sr=rvc_settings['resample_sr'],
+									rms_mix_rate=rvc_settings['rms_mix_rate'],
+									protect=rvc_settings['protect'])
+		
+		# Read the contents from rvc_out_path
+		with open(rvc_out_path, 'rb') as file:
+			content = file.read()
+
+		# Write the contents to output_voices[0], effectively replacing its contents
+		with open(output_voices[0], 'wb') as file:
+			file.write(content)
+
+
 	print(f"Generation took {info['time']} seconds, saved to '{output_voices[0]}'\n")

 	info['seed'] = usedSeed
@ -3307,7 +3334,11 @@ def setup_args(cli=False):
 		'latents-lean-and-mean': True,
 		'voice-fixer': False, # getting tired of long initialization times in a Colab for downloading a large dataset for it
 		'use-deepspeed': False,
+		#stuff that jarod has added
 		'use-hifigan': False,
+		'use-rvc' : False,
+		'rvc-model' : None,
+
 		'voice-fixer-use-cuda': True,

 		
@ -3368,6 +3399,18 @@ def setup_args(cli=False):
 	parser.add_argument("--voice-fixer-use-cuda", action='store_true', default=default_arguments['voice-fixer-use-cuda'], help="Hints to voicefixer to use CUDA, if available.")
 	parser.add_argument("--use-deepspeed", action='store_true', default=default_arguments['use-deepspeed'], help="Use deepspeed for speed bump.")
 	parser.add_argument("--use-hifigan", action='store_true', default=default_arguments['use-hifigan'], help="Use Hifigan instead of Diffusion")
+	parser.add_argument("--use-rvc", action='store_true', default=default_arguments['use-rvc'], help="Run the outputted audio thorugh RVC")
+	parser.add_argument("--rvc-model", action='store_true', default=default_arguments['rvc-model'], help="Specifies RVC model to use")
+
+	# parser.add_argument("--f0_up_key", action='store_true', default=default_arguments['f0_up_key'], help="transpose of the audio file, changes pitch (positive makes voice higher pitch)")
+	# parser.add_argument("--f0method", action='store_true', default=default_arguments['f0method'], help="picks which f0 method to use: dio, harvest, crepe, rmvpe (requires rmvpe.pt)")
+	# parser.add_argument("--file_index", action='store_true', default=default_arguments['file_index'], help="path to file_index, defaults to None")
+	# parser.add_argument("--index_rate", action='store_true', default=default_arguments['index_rate'], help="strength of the index file if provided")
+	# parser.add_argument("--filter_radius", action='store_true', default=default_arguments['filter_radius'], help="if >=3: apply median filtering to the harvested pitch results. The value represents the filter radius and can reduce breathiness.")
+	# parser.add_argument("--resample_sr", action='store_true', default=default_arguments['resample_sr'], help="quality at which to resample audio to, defaults to no resample")
+	# parser.add_argument("--rms_mix_rate", action='store_true', default=default_arguments['rms_mix_rate'], help="adjust the volume envelope scaling. Closer to 0, the more it mimicks the volume of the original vocals. Can help mask noise and make volume sound more natural when set relatively low. Closer to 1 will be more of a consistently loud volume")
+	# parser.add_argument("--protect", action='store_true', default=default_arguments['protect'], help="protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy")
+	

 	parser.add_argument("--force-cpu-for-conditioning-latents", default=default_arguments['force-cpu-for-conditioning-latents'], action='store_true', help="Forces computing conditional latents to be done on the CPU (if you constantyl OOM on low chunk counts)")
 	parser.add_argument("--defer-tts-load", default=default_arguments['defer-tts-load'], action='store_true', help="Defers loading TTS model")
@ -3455,6 +3498,8 @@ def get_default_settings( hypenated=True ):
 		'voice-fixer': args.voice_fixer,
 		'use-deepspeed': args.use_deepspeed,
 		'use-hifigan': args.use_hifigan,
+		'use-rvc': args.use_rvc,
+		'rvc-model' : args.rvc_model,
 		'voice-fixer-use-cuda': args.voice_fixer_use_cuda,
 		'concurrency-count': args.concurrency_count,
 		'output-sample-rate': args.output_sample_rate,
@ -3510,6 +3555,8 @@ def update_args( **kwargs ):
 	args.voice_fixer_use_cuda = settings['voice_fixer_use_cuda']
 	args.use_deepspeed = settings['use_deepspeed']
 	args.use_hifigan = settings['use_hifigan']
+	args.use_rvc = settings['use_rvc']
+	args.rvc_model = settings['rvc_model']
 	args.concurrency_count = settings['concurrency_count']
 	args.output_sample_rate = 44000
 	args.autocalculate_voice_chunk_duration_size = settings['autocalculate_voice_chunk_duration_size']
@ -4017,3 +4064,20 @@ def merge_models( primary_model_name, secondary_model_name, alpha, progress=gr.P
 	message = f"Saved to {output_path}"
 	print(message)
 	return message
+
+
+#Stuff added by Jarod
+def _list_rvc_files(extension, folder_path='models/rvc_models'):
+	"""Return the sorted names of regular files in folder_path ending with extension.
+
+	A missing folder yields an empty list instead of raising, so building the
+	UI dropdowns does not crash before any RVC model has been installed.
+	"""
+	if not os.path.isdir(folder_path):
+		return []
+	# Sorted so the dropdown order does not depend on the filesystem's listing order.
+	return sorted(
+		name for name in os.listdir(folder_path)
+		if name.endswith(extension) and os.path.isfile(os.path.join(folder_path, name))
+	)
+
+def get_rvc_models():
+	"""List the RVC voice model files (*.pth) found in models/rvc_models."""
+	return _list_rvc_files('.pth')
+
+def get_rvc_indexes():
+	"""List the RVC index files (*.index) found in models/rvc_models."""
+	return _list_rvc_files('.index')
+
+def load_rvc_settings():
+    """Read the RVC settings saved in ./config/rvc.json.
+
+    Returns the parsed JSON content, or an empty dict when the file is absent.
+    """
+    settings_file = './config/rvc.json'
+    # Nothing has been saved yet: hand back an empty mapping.
+    if not os.path.exists(settings_file):
+        return {}
+    with open(settings_file, 'r') as handle:
+        return json.load(handle)
--- a/src/webui.py
+++ b/src/webui.py
@ -24,12 +24,25 @@ from utils import *
 args = setup_args()

 GENERATE_SETTINGS = {}
+# Default RVC conversion settings. load_rvc_settings() overlays the values saved
+# in ./config/rvc.json, and setup_gradio() uses each value to initialise the
+# matching widget, replacing the entry with that Gradio component.
+# NOTE: key order matters - update_rvc_settings_proxy() matches its positional
+# arguments to these keys by position.
+RVC_SETTINGS = {
+    'rvc_model': '',  # UI label: "RVC Voice Model"
+    'f0_up_key': 0,  # UI label: "Voice Pitch (f0 key)"
+    'file_index': '',  # UI label: "RVC Index File"
+	'index_rate' : 0,  # UI label: "Index Rate"
+    'filter_radius': 3,  # UI label: "Filter Radius"
+    'resample_sr': 48000,  # UI label: "Resample sample rate"
+    'rms_mix_rate': 0.25,  # UI label: "RMS Mix Rate (Volume Envelope)"
+    'protect': 0.33,  # UI label: "Protect Voiceless Consonants"
+}
 TRANSCRIBE_SETTINGS = {}
 EXEC_SETTINGS = {}
 TRAINING_SETTINGS = {}
 MERGER_SETTINGS = {}
 GENERATE_SETTINGS_ARGS = []

+
+
+
 PRESETS = {
 	'Ultra Fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 30, 'cond_free': False},
 	'Fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 80},
@ -55,6 +68,55 @@ HISTORY_HEADERS = {
 	"Model Hash": "model_hash",
 }

+# Load settings from a file if it exists
+def load_rvc_settings():
+	"""Merge the settings saved in ./config/rvc.json into RVC_SETTINGS.
+
+	Loading is best-effort: a missing file is ignored silently, while an
+	unreadable or malformed file is reported on stdout and the current
+	RVC_SETTINGS values are left untouched.
+	"""
+	settings_path = './config/rvc.json'
+	if not os.path.exists(settings_path):
+		return
+	try:
+		# Read as UTF-8 to match the encoding save_rvc_settings() writes with.
+		with open(settings_path, 'r', encoding='utf-8') as f:
+			loaded = json.load(f)
+	except (OSError, ValueError) as e:
+		# ValueError covers json.JSONDecodeError and UnicodeDecodeError.
+		print(f"Failed to load RVC settings from '{settings_path}': {e}")
+		return
+	if not isinstance(loaded, dict):
+		print(f"Ignoring RVC settings in '{settings_path}': expected a JSON object")
+		return
+	# RVC_SETTINGS is mutated in place, so no `global` declaration is needed.
+	RVC_SETTINGS.update(loaded)
+
+def update_rvc_settings(**kwargs):
+    """Store the given values in RVC_SETTINGS, then save the settings."""
+    # RVC_SETTINGS is mutated in place, so no `global` declaration is needed.
+    for name, value in kwargs.items():
+        RVC_SETTINGS[name] = value
+    save_rvc_settings()
+
+def save_rvc_settings():
+    """Write RVC_SETTINGS to ./config/rvc.json as tab-indented JSON.
+
+    Raises:
+        TypeError: if RVC_SETTINGS holds a value that json cannot serialize.
+    """
+    # Serialize before opening the file: opening with 'w' truncates it, so a
+    # serialization error must not be able to wipe the previously saved settings.
+    payload = json.dumps(RVC_SETTINGS, indent='\t')
+    os.makedirs('./config/', exist_ok=True)
+    with open('./config/rvc.json', 'w', encoding="utf-8") as f:
+        f.write(payload)
+
+def save_rvc_settings_to_json():
+    """Merge the current RVC_SETTINGS into config/rvc.json.
+
+    Keys already present in the file but absent from RVC_SETTINGS are kept.
+    Entries that are Gradio Slider or Dropdown components are stored by their
+    `.value` attribute; every other entry is stored as-is.
+    """
+    # Build the path with os.path.join: the previous literal 'config\\rvc.json'
+    # only addressed the config folder on Windows.
+    config_dir = 'config'
+    config_file = os.path.join(config_dir, 'rvc.json')
+
+    try:
+        with open(config_file, 'r') as json_file:
+            existing_settings = json.load(json_file)
+    except (FileNotFoundError, json.JSONDecodeError):
+        # No usable file yet: start from an empty mapping.
+        existing_settings = {}
+    if not isinstance(existing_settings, dict):
+        # Valid JSON that is not an object cannot be merged into; start over.
+        existing_settings = {}
+
+    for key, value in RVC_SETTINGS.items():
+        if isinstance(value, (gr.Slider, gr.Dropdown)):
+            # Store the component's value rather than the component itself.
+            existing_settings[key] = value.value
+        else:
+            existing_settings[key] = value
+
+    # Make sure the folder exists so the first save does not fail.
+    os.makedirs(config_dir, exist_ok=True)
+    with open(config_file, 'w') as json_file:
+        json.dump(existing_settings, json_file, indent=4)
+
 # can't use *args OR **kwargs if I want to retain the ability to use progress
 def generate_proxy(
 	text,
@ -83,7 +145,7 @@ def generate_proxy(
 	progress=gr.Progress(track_tqdm=True)
 ):
 	kwargs = locals()
-
+	# save_rvc_settings_to_json()
 	try:
 		sample, outputs, stats = generate(**kwargs)
 	except Exception as e:
@ -261,6 +323,15 @@ def update_args_proxy( *args ):
 		kwargs[k] = v

 	update_args(**kwargs)
+
+def update_rvc_settings_proxy(*args):
+    """Pair positional values with the RVC_SETTINGS keys and store them.
+
+    Values are matched to keys by position, following the key order of
+    RVC_SETTINGS; the resulting mapping is handed to update_rvc_settings().
+    """
+    named_values = {name: args[position] for position, name in enumerate(RVC_SETTINGS)}
+    update_rvc_settings(**named_values)
+
 def optimize_training_settings_proxy( *args ):
 	kwargs = {}
 	keys = list(TRAINING_SETTINGS.keys())
@ -327,6 +398,8 @@ def update_voices():
 		gr.Dropdown.update(choices=get_voice_list(append_defaults=True)),
 		gr.Dropdown.update(choices=get_voice_list()),
 		gr.Dropdown.update(choices=get_voice_list(args.results_folder)),
+		gr.Dropdown.update(choices=get_rvc_models()),  # Update for RVC models
+		gr.Dropdown.update(choices=get_rvc_indexes())  # Update for RVC models
 	)

 def history_copy_settings( voice, file ):
@ -365,6 +438,8 @@ def setup_gradio():
 	dataset_list = get_dataset_list()
 	training_list = get_training_list()

+	load_rvc_settings()
+
 	global GENERATE_SETTINGS_ARGS
 	GENERATE_SETTINGS_ARGS = list(inspect.signature(generate_proxy).parameters.keys())[:-1]
 	for i in range(len(GENERATE_SETTINGS_ARGS)):
@ -424,6 +499,20 @@ def setup_gradio():
 						["P", "DDIM"], # + ["K_Euler_A", "DPM++2M"],
 						value="DDIM", label="Diffusion Samplers", type="value"
 					)
+
+					EXEC_SETTINGS['use_rvc'] = gr.Checkbox(label="Run the outputted audio through RVC", value=args.use_rvc)
+					with gr.Column(visible=args.use_rvc) as rvc_column:
+						RVC_SETTINGS['rvc_model'] = gr.Dropdown(choices=get_rvc_models(), label="RVC Voice Model", value=RVC_SETTINGS['rvc_model'], interactive=True)
+						RVC_SETTINGS['file_index'] = gr.Dropdown(choices=get_rvc_indexes(), label="RVC Index File", value=RVC_SETTINGS["file_index"], interactive=True)
+						RVC_SETTINGS['index_rate'] = gr.Slider(minimum=0, maximum=1, label="Index Rate", value=RVC_SETTINGS["index_rate"], interactive=True)
+						RVC_SETTINGS['f0_up_key'] = gr.Slider(minimum=-24, maximum=24, label="Voice Pitch (f0 key)", value=RVC_SETTINGS["f0_up_key"], interactive=True)
+						# RVC_SETTINGS['f0_method'] = gr.Dropdown(choices=get_rvc_models(), label="RVC Voice Model", value=args.rvc_model)
+						RVC_SETTINGS['filter_radius'] = gr.Slider(minimum=0, maximum=7, label="Filter Radius", value=RVC_SETTINGS["filter_radius"], interactive=True)
+						RVC_SETTINGS['resample_sr'] = gr.Slider(minimum=0, maximum=48000, label="Resample sample rate", value=RVC_SETTINGS["resample_sr"], interactive=True)
+						RVC_SETTINGS['rms_mix_rate'] = gr.Slider(minimum=0, maximum=1, label="RMS Mix Rate (Volume Envelope)", value=RVC_SETTINGS["rms_mix_rate"], interactive=True)
+						RVC_SETTINGS['protect'] = gr.Slider(minimum=0, maximum=0.5, label="Protect Voiceless Consonants", value=RVC_SETTINGS["protect"], interactive=True)
+
+
 					GENERATE_SETTINGS["cvvp_weight"] = gr.Slider(value=0, minimum=0, maximum=1, label="CVVP Weight")
 					GENERATE_SETTINGS["top_p"] = gr.Slider(value=0.8, minimum=0, maximum=1, label="Top P")
 					GENERATE_SETTINGS["diffusion_temperature"] = gr.Slider(value=1.0, minimum=0, maximum=1, label="Diffusion Temperature")
@ -719,6 +808,18 @@ def setup_gradio():
 				exec_inputs = list(EXEC_SETTINGS.values())
 				for k in EXEC_SETTINGS:
 					EXEC_SETTINGS[k].change( fn=update_args_proxy, inputs=exec_inputs )
+
+				rvc_inputs = list(RVC_SETTINGS.values())
+				# for k in RVC_SETTINGS:
+				# 	RVC_SETTINGS[k].change(fn=update_rvc_settings_proxy, inputs=rvc_inputs)
+
+				for k, component in RVC_SETTINGS.items():
+					if isinstance(component, gr.Dropdown):
+						component.change(fn=update_rvc_settings_proxy, inputs=rvc_inputs)
+					elif isinstance(component, gr.Slider):
+						component.release(fn=update_rvc_settings_proxy, inputs=rvc_inputs)
+
+
 				
 				EXEC_SETTINGS['autoregressive_model'].change(
 					fn=update_autoregressive_model,
@ -773,6 +874,13 @@ def setup_gradio():
 			inputs=show_experimental_settings,
 			outputs=experimental_column
 		)
+
+		EXEC_SETTINGS['use_rvc'].change(
+			fn=lambda use_rvc_checked: gr.update(visible=use_rvc_checked),
+			inputs=EXEC_SETTINGS['use_rvc'],
+			outputs=rvc_column
+		)
+
 		if preset:
 			preset.change(fn=update_presets,
 				inputs=preset,
@ -807,11 +915,17 @@ def setup_gradio():
 			outputs=[
 				GENERATE_SETTINGS['voice'],
 				DATASET_SETTINGS['voice'],
-				history_voices
+				history_voices,
+				RVC_SETTINGS['rvc_model'],  # Add this line
+				RVC_SETTINGS['file_index']
+
 			]
 		)

 		generate_settings = list(GENERATE_SETTINGS.values())
+		rvc_settings = list(RVC_SETTINGS.values())
+		print(generate_settings)
+		print(rvc_settings)
 		submit.click(
 			lambda: (gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)),
 			outputs=[source_sample, candidates_list, generation_results],
				`@ -0,0 +1 @@`
				`Subproject commit e17294b634674de8925c4bf4607adc15004bacce`