From 70e522732c12f441718e2c5ea3e7cde33df366f9 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 27 Feb 2023 23:50:16 -0300
Subject: [PATCH] Move RWKV loader into a separate file

---
 modules/RWKV.py            | 26 ++++++++++++++++++++++++++
 modules/models.py          | 22 ++--------------------
 modules/text_generation.py |  5 +----
 3 files changed, 29 insertions(+), 24 deletions(-)
 create mode 100644 modules/RWKV.py

diff --git a/modules/RWKV.py b/modules/RWKV.py
new file mode 100644
index 00000000..a4a406ee
--- /dev/null
+++ b/modules/RWKV.py
@@ -0,0 +1,26 @@
+import os, time, types, torch
+from pathlib import Path
+import numpy as np
+np.set_printoptions(precision=4, suppress=True, linewidth=200)
+
+os.environ['RWKV_JIT_ON'] = '1'
+os.environ["RWKV_CUDA_ON"] = '0' # '1' : use CUDA kernel for seq mode (much faster)
+
+import repositories.ChatRWKV.v2.rwkv as rwkv
+from rwkv.model import RWKV
+from rwkv.utils import PIPELINE, PIPELINE_ARGS
+
+def load_RWKV_model(path):
+    os.system("ls")
+    model = RWKV(model=path.as_posix(), strategy='cuda fp16')
+
+    out, state = model.forward([187, 510, 1563, 310, 247], None)   # use 20B_tokenizer.json
+    print(out.detach().cpu().numpy())                   # get logits
+    out, state = model.forward([187, 510], None)
+    out, state = model.forward([1563], state)           # RNN has state (use deepcopy if you want to clone it)
+    out, state = model.forward([310, 247], state)
+    print(out.detach().cpu().numpy())                   # same result as above
+
+    pipeline = PIPELINE(model, Path("repositories/ChatRWKV/20B_tokenizer.json").as_posix())
+
+    return pipeline
diff --git a/modules/models.py b/modules/models.py
index 9ce94f6b..0ba584a5 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -79,27 +79,9 @@ def load_model(model_name):
 
     # RMKV model (not on HuggingFace)
     elif shared.is_RWKV:
-        import types
-        np.set_printoptions(precision=4, suppress=True, linewidth=200)
+        from modules.RWKV import load_RWKV_model
 
-        os.environ['RWKV_JIT_ON'] = '1'
-        os.environ["RWKV_CUDA_ON"] = '0' # '1' : use CUDA kernel for seq mode (much faster)
-
-        from rwkv.model import RWKV
-        from rwkv.utils import PIPELINE, PIPELINE_ARGS
-
-        model = RWKV(model='models/RWKV-4-Pile-169M-20220807-8023.pth', strategy='cuda fp16')
-
-        out, state = model.forward([187, 510, 1563, 310, 247], None)   # use 20B_tokenizer.json
-        print(out.detach().cpu().numpy())                   # get logits
-        out, state = model.forward([187, 510], None)
-        out, state = model.forward([1563], state)           # RNN has state (use deepcopy if you want to clone it)
-        out, state = model.forward([310, 247], state)
-        print(out.detach().cpu().numpy())                   # same result as above
-
-        pipeline = PIPELINE(model, "20B_tokenizer.json")
-
-        return pipeline, None
+        return load_RWKV_model(Path('models/RWKV-4-Pile-169M-20220807-8023.pth')), None
 
     # Custom
     else:
diff --git a/modules/text_generation.py b/modules/text_generation.py
index ebe6ed35..d879e14e 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -82,17 +82,14 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
         torch.cuda.empty_cache()
 
     if shared.is_RWKV:
-        def my_print(s):
-            print(s, end='', flush=True)
         args = PIPELINE_ARGS(temperature = temperature, top_p = top_p,
                              alpha_frequency = 0.25, # Frequency Penalty (as in GPT-3)
                              alpha_presence = 0.25, # Presence Penalty (as in GPT-3)
                              token_ban = [0], # ban the generation of some tokens
                              token_stop = []) # stop generation whenever you see any token here
         reply = question + shared.model.generate(question, token_count=max_new_tokens, args=args, callback=None)
-        print(formatted_outputs(reply, None))
         yield formatted_outputs(reply, None)
-        return formatted_outputs(reply, None)
+        return formatted_outputs(reply, None)
 
     original_question = question
     if not (shared.args.chat or shared.args.cai_chat):
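
Usage sketch (reviewer note, not part of the patch): the snippet below shows how the refactored loader would be consumed after this change, mirroring the models.py and text_generation.py hunks above. It assumes the rwkv package is installed, the repositories/ChatRWKV checkout and the .pth checkpoint exist at the paths used in the patch, the script runs from the web UI root folder, and a CUDA GPU is available; the prompt text, sampling values, and token count are illustrative placeholders.

from pathlib import Path

from rwkv.utils import PIPELINE_ARGS

from modules.RWKV import load_RWKV_model

# load_RWKV_model() returns a rwkv PIPELINE (model + tokenizer); after this patch,
# models.load_model() stores that object in shared.model for RWKV checkpoints.
pipeline = load_RWKV_model(Path('models/RWKV-4-Pile-169M-20220807-8023.pth'))

# Sampling settings mirror the PIPELINE_ARGS block in text_generation.py;
# the temperature/top_p values here are illustrative, not taken from the patch.
args = PIPELINE_ARGS(temperature=0.7, top_p=0.9,
                     alpha_frequency=0.25,  # frequency penalty (as in GPT-3)
                     alpha_presence=0.25,   # presence penalty (as in GPT-3)
                     token_ban=[0],         # ban the generation of some tokens
                     token_stop=[])         # stop on any of these tokens

prompt = "In a shocking finding,"  # hypothetical prompt
# PIPELINE.generate() returns only the continuation, so the prompt is prepended,
# exactly as generate_reply() does above.
reply = prompt + pipeline.generate(prompt, token_count=200, args=args, callback=None)
print(reply)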