cleaning up things

This commit is contained in:
Jarod Mica 2024-03-25 11:28:39 -07:00
parent 8e01cd8fad
commit acbf1b2b2f
3 changed files with 108 additions and 33 deletions

View file

@ -9,6 +9,17 @@ AR Quantization
- ggml - https://github.com/ggerganov/ggml/issues/59
- TortoiseCPP https://github.com/balisujohn/tortoise.cpp
## 3/24/2024
Just cleaning up some things and running tests on the code to make sure it functions as it should. I should think of maybe a way to automate this... but that's a problem for another time.
- Added some values, like the number of processes (num_processes) to spawn based on your CPU cores, for conversion tasks
- Changed tab "Prepare Other Language" to "Prepare Dataset for Large Files"
- Moved all of the imports inside of main.py into the __name__ check to reduce overhead of multiprocessing
- Ironing out continuation of transcription in case it is interrupted; so far, in the cases I've tested, I've fixed the issues and added appropriate code to accommodate these situations. The only test case that doesn't work correctly would be if a file is interrupted in the middle of splitting segments based on the srt script, since the segments never get written to train.txt...
- Maybe have a way of mapping what has already been segmented to the srt file that exists there? I'll have to think about this one.
Other stuff
- Removes the "temp" file that is created for rename
- Modified the dataset script maker to ignore folders that contain mp3 segments already
## 3/23/2024
- Comment out valle and bark instantiations to clean up console

View file

@ -1,29 +1,29 @@
import os
import sys
if os.path.exists("runtime"):
# Get the directory where the script is located
script_dir = os.path.dirname(os.path.abspath(__file__))
# Add this directory to sys.path
if script_dir not in sys.path:
sys.path.insert(0, script_dir)
if 'TORTOISE_MODELS_DIR' not in os.environ:
os.environ['TORTOISE_MODELS_DIR'] = os.path.realpath(os.path.join(os.getcwd(), './models/tortoise/'))
if 'TRANSFORMERS_CACHE' not in os.environ:
os.environ['TRANSFORMERS_CACHE'] = os.path.realpath(os.path.join(os.getcwd(), './models/transformers/'))
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
from utils import *
from webui import *
from api.websocket_server import start_websocket_server
# Moved all of the imports into __name__ == "__main__" due to how multiprocessing spawns instances, makes multiprocessing faster as it reduces import overhead
if __name__ == "__main__":
import os
import sys
if os.path.exists("runtime"):
# Get the directory where the script is located
script_dir = os.path.dirname(os.path.abspath(__file__))
# Add this directory to sys.path
if script_dir not in sys.path:
sys.path.insert(0, script_dir)
if 'TORTOISE_MODELS_DIR' not in os.environ:
os.environ['TORTOISE_MODELS_DIR'] = os.path.realpath(os.path.join(os.getcwd(), './models/tortoise/'))
if 'TRANSFORMERS_CACHE' not in os.environ:
os.environ['TRANSFORMERS_CACHE'] = os.path.realpath(os.path.join(os.getcwd(), './models/transformers/'))
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
from utils import *
from webui import *
from api.websocket_server import start_websocket_server
args = setup_args()
if args.listen_path is not None and args.listen_path != "/":

View file

@ -314,7 +314,8 @@ def prepare_dataset_proxy(voice, language, validation_text_length, validation_au
return "\n".join(messages)
def transcribe_other_language_proxy(voice, language, chunk_size, continuation_directory, align, rename, progress=gr.Progress(track_tqdm=True)):
def transcribe_other_language_proxy(voice, language, chunk_size, continuation_directory, align, rename, num_processes, keep_originals, progress=gr.Progress(track_tqdm=True)):
num_processes = int(num_processes)
training_folder = get_training_folder(voice)
processed_folder = os.path.join(training_folder,"processed")
dataset_dir = os.path.join(processed_folder, "run")
@ -323,6 +324,8 @@ def transcribe_other_language_proxy(voice, language, chunk_size, continuation_di
train_text_path = os.path.join(dataset_dir, 'dataset/train.txt')
validation_text_path = os.path.join(dataset_dir, 'dataset/validation.txt')
large_file_num_processes = int(num_processes/2) # Used for instances where larger files are being processed, as to not run out of RAM
items_to_move = [audio_dataset_path, train_text_path, validation_text_path]
for item in items_to_move:
@ -339,21 +342,74 @@ def transcribe_other_language_proxy(voice, language, chunk_size, continuation_di
from modules.tortoise_dataset_tools.audio_conversion_tools.split_long_file import get_duration, process_folder
chosen_directory = os.path.join("./voices", voice)
file_durations = [get_duration(os.path.join(chosen_directory, item)) for item in os.listdir(chosen_directory) if os.path.isfile(os.path.join(chosen_directory, item))]
items = os.listdir(chosen_directory)
# In case of sudden restart, removes this intermediary file used for rename
for file in items:
if "file___" in file:
os.remove(os.path.join(chosen_directory, file))
file_durations = [get_duration(os.path.join(chosen_directory, item)) for item in items if os.path.isfile(os.path.join(chosen_directory, item))]
progress(0.0, desc="Splitting long files")
if any(duration > 3600*2 for duration in file_durations):
process_folder(chosen_directory)
process_folder(chosen_directory, large_file_num_processes)
if not keep_originals:
originals_pre_split_path = os.path.join(chosen_directory, "original_pre_split")
try:
shutil.rmtree(originals_pre_split_path)
except:
# There is no directory to delete
pass
progress(0.0, desc="Converting to MP3 files") # add tqdm later
import modules.tortoise_dataset_tools.audio_conversion_tools.convert_to_mp3 as c2mp3
# Hacky way to get the functions working without changing where they output to...
for item in os.listdir(chosen_directory):
if os.path.isfile(os.path.join(chosen_directory, item)):
original_dir = os.path.join(chosen_directory, "original_files")
if not os.path.exists(original_dir):
os.makedirs(original_dir)
item_path = os.path.join(chosen_directory, item)
try:
shutil.move(item_path, original_dir)
except:
os.remove(item_path)
try:
c2mp3.process_folder(original_dir, large_file_num_processes)
except:
raise gr.Error('No files found in the voice folder specified, make sure it is not empty. If you interrupted the process, the files may be in the "original_files" folder')
# Hacky way to move the files back into the main voice folder
for item in os.listdir(os.path.join(original_dir, "converted")):
item_path = os.path.join(original_dir, "converted", item)
if os.path.isfile(item_path):
try:
shutil.move(item_path, chosen_directory)
except:
os.remove(item_path)
if not keep_originals:
originals_files = os.path.join(chosen_directory, "original_files")
try:
shutil.rmtree(originals_files)
except:
# There is no directory to delete
pass
progress(0.1, desc="Processing audio files")
progress(0.4, desc="Processing audio files")
process_audio_files(base_directory=dataset_dir,
language=language,
audio_dir=chosen_directory,
chunk_size=chunk_size,
no_align=align,
rename_files=rename)
progress(0.5, desc="Audio processing completed")
rename_files=rename,
num_processes=num_processes)
progress(0.7, desc="Audio processing completed")
progress(0.5, desc="Merging segments")
progress(0.7, desc="Merging segments")
merge_segments(merge_dir)
progress(0.9, desc="Segment merging completed")
@ -791,7 +847,7 @@ def setup_gradio():
with gr.Column():
prepare_dataset_output = gr.TextArea(
label="Console Output", interactive=False, max_lines=8)
with gr.Tab("Prepare Large Files"):
with gr.Tab("Prepare Dataset for Large Files"):
with gr.Row():
with gr.Column():
DATASET2_SETTINGS = {}
@ -810,6 +866,9 @@ def setup_gradio():
label="Language", value="en")
DATASET2_SETTINGS['chunk_size'] = gr.Textbox(
label="Chunk Size", value="20")
DATASET2_SETTINGS['num_processes'] = gr.Textbox(
label="Processes to Use", value=int(max(1, multiprocessing.cpu_count())))
with gr.Row():
DATASET2_SETTINGS['align'] = gr.Checkbox(
label="Disable WhisperX Alignment", value=False
@ -817,6 +876,9 @@ def setup_gradio():
DATASET2_SETTINGS['rename'] = gr.Checkbox(
label="Rename Audio Files", value=True
)
DATASET2_SETTINGS['keep_originals'] = gr.Checkbox(
label="Keep Original Files", value=True
)
transcribe2_button = gr.Button(
value="Transcribe and Process")
@ -1323,7 +1385,9 @@ def setup_gradio():
DATASET2_SETTINGS['chunk_size'],
DATASET2_SETTINGS['continue_directory'],
DATASET2_SETTINGS["align"],
DATASET2_SETTINGS["rename"]
DATASET2_SETTINGS["rename"],
DATASET2_SETTINGS['num_processes'],
DATASET2_SETTINGS['keep_originals']
],
outputs=transcribe2_output
)