From b7879cc8dc9b426087d43d33985cd3f64068f35f Mon Sep 17 00:00:00 2001
From: Jarod Mica <jarodmica@gmail.com>
Date: Mon, 15 Jan 2024 02:53:07 -0800
Subject: [PATCH] Add RVC inference capabilities

---
 .gitignore      |   2 +
 changelog.md    |  18 +++++++-
 modules/rvc     |   1 +
 reload_flag.txt |   1 +
 src/utils.py    |  70 ++++++++++++++++++++++++++--
 src/webui.py    | 118 +++++++++++++++++++++++++++++++++++++++++++++++-
 6 files changed, 204 insertions(+), 6 deletions(-)
 create mode 160000 modules/rvc
 create mode 100644 reload_flag.txt

diff --git a/.gitignore b/.gitignore
index 17672a9..d810bb4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,8 @@
 /models/*
 /training/*
 /config/*
+output/
+*.wav
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/changelog.md b/changelog.md
index faedff8..8c4e78e 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,4 +1,20 @@
 # Changelogs & Notes
 
+## 1/14/2024
+- Look at 48khz hifigan
+- NOTE TO SELF, when installing rvc-tts-pipeline at this time, you modified the rvc_infer to look for the rvc package inside of modules instead of having it in the parent folder.
+    - Also modified imports in rvc: find and replace all rvc.infer. with modules.rvc.infer.
+    - Modified hardcoded paths in rvc configs
+    - utils.py for hubert and target_folder
+    - pipeline.py for rmvpe path
+- Add option to be able to use RVC voice model
+    - EXEC_SETTINGS used instead as we just need to set a parameter to be able to use it
+- Add a function that handles rvc voice models for the drop down menu.  Voice models are located at models/rvc_models
+- Add rvc voice model refresh to update_voices and the refresh_voices button
+
+## Some date in December
+- Added whisper large-v3 to the list of whisper models available.  
+
 ## 12/17/2023
-- Changed the 
\ No newline at end of file
+- Resolved an import error caused by a newer version of rotary_embedding_torch.
+    - Modified portions of the code in dlas to use broadcast_tensors instead of broadcat.  In the latest versions of rotary_embedding_torch (0.5.0 and higher), broadcat was removed due to redundancy, as it looks like broadcast_tensors is already a part of torch
\ No newline at end of file
diff --git a/modules/rvc b/modules/rvc
new file mode 160000
index 0000000..e17294b
--- /dev/null
+++ b/modules/rvc
@@ -0,0 +1 @@
+Subproject commit e17294b634674de8925c4bf4607adc15004bacce
diff --git a/reload_flag.txt b/reload_flag.txt
new file mode 100644
index 0000000..e76285e
--- /dev/null
+++ b/reload_flag.txt
@@ -0,0 +1 @@
+reload
\ No newline at end of file
diff --git a/src/utils.py b/src/utils.py
index e593ee5..cb85177 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -42,11 +42,11 @@ from tortoise.api_fast import TextToSpeech as Toroise_TTS_Hifi
 from tortoise.utils.audio import load_audio, load_voice, load_voices, get_voice_dir, get_voices
 from tortoise.utils.text import split_and_recombine_text
 from tortoise.utils.device import get_device_name, set_device_name, get_device_count, get_device_vram, get_device_batch_size, do_gc
-
+from rvc_pipe.rvc_infer import rvc_convert
 
 MODELS['dvae.pth'] = "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/3704aea61678e7e468a06d8eea121dba368a798e/.models/dvae.pth"
 
-WHISPER_MODELS = ["tiny", "base", "small", "medium", "large", "large-v1", "large-v2"]
+WHISPER_MODELS = ["tiny", "base", "small", "medium", "large", "large-v1", "large-v2", "large-v3"]
 WHISPER_SPECIALIZED_MODELS = ["tiny.en", "base.en", "small.en", "medium.en"]
 WHISPER_BACKENDS = ["openai/whisper", "lightmare/whispercpp", "m-bain/whisperx"]
 VOCODERS = ['univnet', 'bigvgan_base_24khz_100band', 'bigvgan_24khz_100band']
@@ -1236,7 +1236,8 @@ def generate_tortoise(**kwargs):
 				parameters['seed'] = additionals[0]
 		except Exception as e:
 			raise RuntimeError(f'Possible latent mismatch: click the "(Re)Compute Voice Latents" button and then try again. Error: {e}')
-		
+		# print(type(gen))
+		# print(gen)
 		run_time = time.time()-start_time
 		print(f"Generating line took {run_time} seconds")
 
@@ -1358,6 +1359,32 @@ def generate_tortoise(**kwargs):
 		sample_voice = (tts.input_sample_rate, sample_voice.numpy())
 
 	info = get_info(voice=voice, latents=False)
+
+	#insert rvc stuff
+	if args.use_rvc:
+		rvc_settings = load_rvc_settings()
+		rvc_model_path = os.path.join("models", "rvc_models", rvc_settings['rvc_model'])
+		rvc_index_path = os.path.join("models", "rvc_models", rvc_settings['file_index'])
+		print (rvc_model_path)
+		rvc_out_path = rvc_convert(model_path=rvc_model_path, 
+							 		input_path=output_voices[0],
+									f0_up_key=rvc_settings['f0_up_key'],
+									file_index=rvc_index_path,
+									index_rate=rvc_settings['index_rate'],
+									filter_radius=rvc_settings['filter_radius'],
+									resample_sr=rvc_settings['resample_sr'],
+									rms_mix_rate=rvc_settings['rms_mix_rate'],
+									protect=rvc_settings['protect'])
+		
+		# Read the contents from rvc_out_path
+		with open(rvc_out_path, 'rb') as file:
+			content = file.read()
+
+		# Write the contents to output_voices[0], effectively replacing its contents
+		with open(output_voices[0], 'wb') as file:
+			file.write(content)
+
+
 	print(f"Generation took {info['time']} seconds, saved to '{output_voices[0]}'\n")
 
 	info['seed'] = usedSeed
@@ -3307,7 +3334,11 @@ def setup_args(cli=False):
 		'latents-lean-and-mean': True,
 		'voice-fixer': False, # getting tired of long initialization times in a Colab for downloading a large dataset for it
 		'use-deepspeed': False,
+		#stuff that jarod has added
 		'use-hifigan': False,
+		'use-rvc' : False,
+		'rvc-model' : None,
+
 		'voice-fixer-use-cuda': True,
 
 		
@@ -3368,6 +3399,18 @@ def setup_args(cli=False):
 	parser.add_argument("--voice-fixer-use-cuda", action='store_true', default=default_arguments['voice-fixer-use-cuda'], help="Hints to voicefixer to use CUDA, if available.")
 	parser.add_argument("--use-deepspeed", action='store_true', default=default_arguments['use-deepspeed'], help="Use deepspeed for speed bump.")
 	parser.add_argument("--use-hifigan", action='store_true', default=default_arguments['use-hifigan'], help="Use Hifigan instead of Diffusion")
+	parser.add_argument("--use-rvc", action='store_true', default=default_arguments['use-rvc'], help="Run the outputted audio through RVC")
+	parser.add_argument("--rvc-model", default=default_arguments['rvc-model'], help="Specifies RVC model to use")  # takes a model file name, so it cannot be a store_true flag
+
+	# parser.add_argument("--f0_up_key", action='store_true', default=default_arguments['f0_up_key'], help="transpose of the audio file, changes pitch (positive makes voice higher pitch)")
+	# parser.add_argument("--f0method", action='store_true', default=default_arguments['f0method'], help="picks which f0 method to use: dio, harvest, crepe, rmvpe (requires rmvpe.pt)")
+	# parser.add_argument("--file_index", action='store_true', default=default_arguments['file_index'], help="path to file_index, defaults to None")
+	# parser.add_argument("--index_rate", action='store_true', default=default_arguments['index_rate'], help="strength of the index file if provided")
+	# parser.add_argument("--filter_radius", action='store_true', default=default_arguments['filter_radius'], help="if >=3: apply median filtering to the harvested pitch results. The value represents the filter radius and can reduce breathiness.")
+	# parser.add_argument("--resample_sr", action='store_true', default=default_arguments['resample_sr'], help="quality at which to resample audio to, defaults to no resample")
+	# parser.add_argument("--rms_mix_rate", action='store_true', default=default_arguments['rms_mix_rate'], help="adjust the volume envelope scaling. Closer to 0, the more it mimicks the volume of the original vocals. Can help mask noise and make volume sound more natural when set relatively low. Closer to 1 will be more of a consistently loud volume")
+	# parser.add_argument("--protect", action='store_true', default=default_arguments['protect'], help="protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy")
+	
 
 	parser.add_argument("--force-cpu-for-conditioning-latents", default=default_arguments['force-cpu-for-conditioning-latents'], action='store_true', help="Forces computing conditional latents to be done on the CPU (if you constantyl OOM on low chunk counts)")
 	parser.add_argument("--defer-tts-load", default=default_arguments['defer-tts-load'], action='store_true', help="Defers loading TTS model")
@@ -3455,6 +3498,8 @@ def get_default_settings( hypenated=True ):
 		'voice-fixer': args.voice_fixer,
 		'use-deepspeed': args.use_deepspeed,
 		'use-hifigan': args.use_hifigan,
+		'use-rvc': args.use_rvc,
+		'rvc-model' : args.rvc_model,
 		'voice-fixer-use-cuda': args.voice_fixer_use_cuda,
 		'concurrency-count': args.concurrency_count,
 		'output-sample-rate': args.output_sample_rate,
@@ -3510,6 +3555,8 @@ def update_args( **kwargs ):
 	args.voice_fixer_use_cuda = settings['voice_fixer_use_cuda']
 	args.use_deepspeed = settings['use_deepspeed']
 	args.use_hifigan = settings['use_hifigan']
+	args.use_rvc = settings['use_rvc']
+	args.rvc_model = settings['rvc_model']
 	args.concurrency_count = settings['concurrency_count']
 	args.output_sample_rate = 44000
 	args.autocalculate_voice_chunk_duration_size = settings['autocalculate_voice_chunk_duration_size']
@@ -4017,3 +4064,20 @@ def merge_models( primary_model_name, secondary_model_name, alpha, progress=gr.P
 	message = f"Saved to {output_path}"
 	print(message)
 	return message
+
+#Stuff added by Jarod
+def get_rvc_models(folder_path='models/rvc_models'):
+	"""Return the names of the .pth files directly inside folder_path ([] if the folder is missing)."""
+	names = os.listdir(folder_path) if os.path.isdir(folder_path) else []
+	return [f for f in names if os.path.isfile(os.path.join(folder_path, f)) and f.endswith('.pth')]
+def get_rvc_indexes(folder_path='models/rvc_models'):
+	"""Return the names of the .index files directly inside folder_path ([] if the folder is missing)."""
+	names = os.listdir(folder_path) if os.path.isdir(folder_path) else []
+	return [f for f in names if os.path.isfile(os.path.join(folder_path, f)) and f.endswith('.index')]
+
+def load_rvc_settings():
+	"""Return the parsed contents of ./config/rvc.json, or an empty dict when that file does not exist."""
+	if not os.path.exists('./config/rvc.json'):
+		return {}
+	with open('./config/rvc.json', 'r') as file:
+		return json.load(file)
\ No newline at end of file
diff --git a/src/webui.py b/src/webui.py
index 68b4ccf..01e4c18 100644
--- a/src/webui.py
+++ b/src/webui.py
@@ -24,12 +24,25 @@ from utils import *
 args = setup_args()
 
 GENERATE_SETTINGS = {}
+RVC_SETTINGS = {  # key order matters: update_rvc_settings_proxy() pairs values with these keys by position
+    'rvc_model': '',  # file name of the voice model; generate_tortoise() joins it onto models/rvc_models
+    'f0_up_key': 0,  # pitch transpose passed to rvc_convert (UI label: "Voice Pitch (f0 key)")
+    'file_index': '',  # file name of the index file; also joined onto models/rvc_models
+	'index_rate' : 0,  # UI label: "Index Rate"
+    'filter_radius': 3,  # UI label: "Filter Radius"
+    'resample_sr': 48000,  # UI label: "Resample sample rate"
+    'rms_mix_rate': 0.25,  # UI label: "RMS Mix Rate (Volume Envelope)"
+    'protect': 0.33,  # UI label: "Protect Voiceless Consonants"
+}
 TRANSCRIBE_SETTINGS = {}
 EXEC_SETTINGS = {}
 TRAINING_SETTINGS = {}
 MERGER_SETTINGS = {}
 GENERATE_SETTINGS_ARGS = []
 
+
+
+
 PRESETS = {
 	'Ultra Fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 30, 'cond_free': False},
 	'Fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 80},
@@ -55,6 +68,55 @@ HISTORY_HEADERS = {
 	"Model Hash": "model_hash",
 }
 
+# Load settings from a file if it exists
+def load_rvc_settings():
+	"""Merge the settings saved in ./config/rvc.json into RVC_SETTINGS; the defaults stay on any failure."""
+	try:
+		if os.path.exists('./config/rvc.json'):
+			with open('./config/rvc.json', 'r', encoding="utf-8") as f:
+				RVC_SETTINGS.update(json.load(f))
+	except (OSError, ValueError, TypeError) as e:  # unreadable, malformed or non-object JSON
+		print(f"Could not load ./config/rvc.json, keeping the current RVC settings: {e}")
+
+def update_rvc_settings(**kwargs):
+	"""Overwrite the given keys in RVC_SETTINGS, then write the whole dict to disk."""
+	RVC_SETTINGS.update(kwargs)
+	save_rvc_settings()
+
+def save_rvc_settings():
+	"""Serialize RVC_SETTINGS as tab-indented JSON into ./config/rvc.json, creating ./config/ if needed."""
+	os.makedirs('./config/', exist_ok=True)
+	with open('./config/rvc.json', 'w', encoding="utf-8") as settings_file:
+		settings_file.write(json.dumps(RVC_SETTINGS, indent='\t'))
+
+def save_rvc_settings_to_json():
+	"""Merge the current RVC_SETTINGS into ./config/rvc.json, keeping unrelated keys already stored there.
+
+	Entries of RVC_SETTINGS that are Gradio Slider/Dropdown components are
+	stored by their `.value` attribute; every other entry is stored as-is.
+	"""
+	# Same location load_rvc_settings()/save_rvc_settings() use. Forward slashes
+	# work on every platform; a backslash-separated path only resolves on Windows.
+	config_file = './config/rvc.json'
+
+	try:
+		with open(config_file, 'r', encoding="utf-8") as json_file:
+			existing_settings = json.load(json_file)
+	except (FileNotFoundError, json.JSONDecodeError):
+		# Missing or malformed file: start from an empty dictionary
+		existing_settings = {}
+
+	for key, value in RVC_SETTINGS.items():
+		# Components are not JSON-serializable, so store the value they hold
+		if isinstance(value, (gr.Slider, gr.Dropdown)):
+			value = value.value
+		existing_settings[key] = value
+
+	# Create the folder on first save, then write the merged settings back
+	os.makedirs('./config/', exist_ok=True)
+	with open(config_file, 'w', encoding="utf-8") as json_file:
+		json.dump(existing_settings, json_file, indent=4)
+
 # can't use *args OR **kwargs if I want to retain the ability to use progress
 def generate_proxy(
 	text,
@@ -83,7 +145,7 @@ def generate_proxy(
 	progress=gr.Progress(track_tqdm=True)
 ):
 	kwargs = locals()
-
+	# save_rvc_settings_to_json()
 	try:
 		sample, outputs, stats = generate(**kwargs)
 	except Exception as e:
@@ -261,6 +323,15 @@ def update_args_proxy( *args ):
 		kwargs[k] = v
 
 	update_args(**kwargs)
+
+def update_rvc_settings_proxy(*args):
+	"""Pair positional widget values with the RVC_SETTINGS keys, in key order, and save them."""
+	names = list(RVC_SETTINGS)
+	# Index rather than zip(): too few values must raise IndexError, not be silently dropped
+	overrides = {name: args[position] for position, name in enumerate(names)}
+
+	update_rvc_settings(**overrides)
+
 def optimize_training_settings_proxy( *args ):
 	kwargs = {}
 	keys = list(TRAINING_SETTINGS.keys())
@@ -327,6 +398,8 @@ def update_voices():
 		gr.Dropdown.update(choices=get_voice_list(append_defaults=True)),
 		gr.Dropdown.update(choices=get_voice_list()),
 		gr.Dropdown.update(choices=get_voice_list(args.results_folder)),
+		gr.Dropdown.update(choices=get_rvc_models()),  # Update for RVC models
+		gr.Dropdown.update(choices=get_rvc_indexes())  # Update for RVC models
 	)
 
 def history_copy_settings( voice, file ):
@@ -365,6 +438,8 @@ def setup_gradio():
 	dataset_list = get_dataset_list()
 	training_list = get_training_list()
 
+	load_rvc_settings()
+
 	global GENERATE_SETTINGS_ARGS
 	GENERATE_SETTINGS_ARGS = list(inspect.signature(generate_proxy).parameters.keys())[:-1]
 	for i in range(len(GENERATE_SETTINGS_ARGS)):
@@ -424,6 +499,20 @@ def setup_gradio():
 						["P", "DDIM"], # + ["K_Euler_A", "DPM++2M"],
 						value="DDIM", label="Diffusion Samplers", type="value"
 					)
+
+					EXEC_SETTINGS['use_rvc'] = gr.Checkbox(label="Run the outputted audio through RVC", value=args.use_rvc)
+					with gr.Column(visible=args.use_rvc) as rvc_column:
+						RVC_SETTINGS['rvc_model'] = gr.Dropdown(choices=get_rvc_models(), label="RVC Voice Model", value=RVC_SETTINGS['rvc_model'], interactive=True)
+						RVC_SETTINGS['file_index'] = gr.Dropdown(choices=get_rvc_indexes(), label="RVC Index File", value=RVC_SETTINGS["file_index"], interactive=True)
+						RVC_SETTINGS['index_rate'] = gr.Slider(minimum=0, maximum=1, label="Index Rate", value=RVC_SETTINGS["index_rate"], interactive=True)
+						RVC_SETTINGS['f0_up_key'] = gr.Slider(minimum=-24, maximum=24, label="Voice Pitch (f0 key)", value=RVC_SETTINGS["f0_up_key"], interactive=True)
+						# RVC_SETTINGS['f0_method'] = gr.Dropdown(choices=get_rvc_models(), label="RVC Voice Model", value=args.rvc_model)
+						RVC_SETTINGS['filter_radius'] = gr.Slider(minimum=0, maximum=7, label="Filter Radius", value=RVC_SETTINGS["filter_radius"], interactive=True)
+						RVC_SETTINGS['resample_sr'] = gr.Slider(minimum=0, maximum=48000, label="Resample sample rate", value=RVC_SETTINGS["resample_sr"], interactive=True)
+						RVC_SETTINGS['rms_mix_rate'] = gr.Slider(minimum=0, maximum=1, label="RMS Mix Rate (Volume Envelope)", value=RVC_SETTINGS["rms_mix_rate"], interactive=True)
+						RVC_SETTINGS['protect'] = gr.Slider(minimum=0, maximum=0.5, label="Protect Voiceless Consonants", value=RVC_SETTINGS["protect"], interactive=True)
+
+
 					GENERATE_SETTINGS["cvvp_weight"] = gr.Slider(value=0, minimum=0, maximum=1, label="CVVP Weight")
 					GENERATE_SETTINGS["top_p"] = gr.Slider(value=0.8, minimum=0, maximum=1, label="Top P")
 					GENERATE_SETTINGS["diffusion_temperature"] = gr.Slider(value=1.0, minimum=0, maximum=1, label="Diffusion Temperature")
@@ -719,6 +808,18 @@ def setup_gradio():
 				exec_inputs = list(EXEC_SETTINGS.values())
 				for k in EXEC_SETTINGS:
 					EXEC_SETTINGS[k].change( fn=update_args_proxy, inputs=exec_inputs )
+
+				rvc_inputs = list(RVC_SETTINGS.values())
+				# for k in RVC_SETTINGS:
+				# 	RVC_SETTINGS[k].change(fn=update_rvc_settings_proxy, inputs=rvc_inputs)
+
+				for k, component in RVC_SETTINGS.items():
+					if isinstance(component, gr.Dropdown):
+						component.change(fn=update_rvc_settings_proxy, inputs=rvc_inputs)
+					elif isinstance(component, gr.Slider):
+						component.release(fn=update_rvc_settings_proxy, inputs=rvc_inputs)
+
+
 				
 				EXEC_SETTINGS['autoregressive_model'].change(
 					fn=update_autoregressive_model,
@@ -773,6 +874,13 @@ def setup_gradio():
 			inputs=show_experimental_settings,
 			outputs=experimental_column
 		)
+
+		EXEC_SETTINGS['use_rvc'].change(
+			fn=lambda use_rvc_checked: gr.update(visible=use_rvc_checked),
+			inputs=EXEC_SETTINGS['use_rvc'],
+			outputs=rvc_column
+		)
+
 		if preset:
 			preset.change(fn=update_presets,
 				inputs=preset,
@@ -807,11 +915,17 @@ def setup_gradio():
 			outputs=[
 				GENERATE_SETTINGS['voice'],
 				DATASET_SETTINGS['voice'],
-				history_voices
+				history_voices,
+				RVC_SETTINGS['rvc_model'],  # Add this line
+				RVC_SETTINGS['file_index']
+
 			]
 		)
 
 		generate_settings = list(GENERATE_SETTINGS.values())
+		rvc_settings = list(RVC_SETTINGS.values())
+		print(generate_settings)
+		print(rvc_settings)
 		submit.click(
 			lambda: (gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)),
 			outputs=[source_sample, candidates_list, generation_results],