extra slow chat

This commit is contained in:
Oleksandr Kuvshynov 2024-03-25 11:18:03 -04:00
parent 4c2996d654
commit fe6169cb0f
2 changed files with 40 additions and 0 deletions

29
chat.py Normal file
View File

@@ -0,0 +1,29 @@
import logging
import torch
import sys
import os
from loader import load_frozen
from utils import Tokenizer, greedy_gen2
from conf_fp16 import *
logging.basicConfig(format='%(asctime)s %(message)s', level=logging.WARN)

# Optional path to LoRA adapter weights, passed as the first CLI argument.
lora_weights = sys.argv[1] if len(sys.argv) > 1 else None

tokenizer_path = os.path.join(frozen_model_path, 'tokenizer.model')
tokenizer = Tokenizer(tokenizer_path)

model = load_frozen(frozen_model_path, dropout=0.0, lora_rank=4, frozen_dtype=frozen_dtype, compute_dtype=compute_dtype).to(device)
if lora_weights is not None:
    # strict=False: the checkpoint contains only the LoRA tensors, not the
    # frozen base weights; log the (expected) missing-keys report at DEBUG.
    logging.debug(model.load_state_dict(torch.load(lora_weights), strict=False))
print(f'Model {frozen_model_path} loaded')

# Simple chat REPL: read a prompt, stream greedily generated tokens to stdout.
# BUG FIX: the original wrapped the generation in a second `while True:`,
# which regenerated for the same prompt forever and never returned to input().
while True:
    prompt = input("> ")
    # `token_text` instead of `next` — avoid shadowing the builtin.
    for token_text in greedy_gen2(model, tokenizer, device, prompt, max_new_tokens=100):
        sys.stdout.write(token_text)
        sys.stdout.flush()
    # Finish the generated line so the next "> " prompt starts cleanly.
    sys.stdout.write('\n')

View File

@@ -61,6 +61,17 @@ def greedy_gen(model, tokenizer, device, prompt, max_new_tokens=50):
for i, output in enumerate(tokens):
logging.info(f'{i} - {tokenizer.decode(output.tolist())}')
def greedy_gen2(model, tokenizer, device, prompt, max_new_tokens=50):
    """Stream greedy (argmax) decoding one token at a time.

    Encodes *prompt* (with BOS, without EOS), then repeatedly runs a full
    forward pass over the growing sequence and yields the decoded text of
    each newly selected token.

    NOTE: the whole sequence is re-run through the model on every step
    (no KV cache), so per-token cost grows with generated length.

    Args:
        model: maps a (1, seq) token tensor to (1, seq, vocab) logits.
        tokenizer: provides encode(text, bos, eos) and decode(ids).
        device: device to place the token tensor on.
        prompt: text to continue.
        max_new_tokens: maximum number of tokens to generate.

    Yields:
        str: decoded text of each generated token.
    """
    tokens = torch.tensor(tokenizer.encode(prompt, True, False)).view(1, -1).to(device)
    model.eval()
    for _ in range(max_new_tokens):
        # Inference only: disable autograd so activations are not retained.
        # Scoped inside the loop (not around it) because a `with` block that
        # spans a `yield` would leak grad-mode state to the caller while the
        # generator is suspended.
        with torch.no_grad():
            logits = model(tokens)[:, -1, :]
            # argmax with keepdim gives the same (1, 1) tensor as topk(k=1).
            next_token = torch.argmax(logits, dim=-1, keepdim=True)
        # Lazy %-style args: decode only runs if INFO logging is enabled.
        logging.info('next token: %s %s', next_token, tokenizer.decode(next_token.tolist()))
        yield tokenizer.decode(next_token.tolist())[0]
        tokens = torch.cat((tokens, next_token), dim=1)
def cleanup_cache(device='cpu'):
if device.startswith('mps'):
import torch.mps