slowllama: comments in conf.py
parent 048669061b
commit cffad8c619

conf.py | 32
@@ -1,30 +1,54 @@
import logging

offload_to = 'disk'

# which device to use for finetuning
# 'cpu', 'mps' (for Apple devices) or 'cuda'
device = 'mps'

# random seed to use. Makes runs reproducible.
seed = 54321

# learning rate
lr = 1e-4

# logging gradient and weight distribution to log file
# useful for debugging, but generates more log output
log_lora_grad = False
log_lora_weight = False

# how wide should the LoRA layers be? (N x lora_rank) and (lora_rank x M).
# Larger number - larger layer - more capacity.
lora_rank = 4

log_level = logging.DEBUG

# training settings

# total number of iterations to run. No microbatching so far.
iters = 20

# how long should the sequence to train on be?
# we pick seq_len tokens and try to predict token [seq_len + 1]
seq_len = 128

# how large should the batch size be?
batch_size = 16

# the current script doesn't have a validation set at all;
# instead, we run prompt completion every eval_period iterations
# and check what the completion looks like
eval_before_training = False
eval_period = 20

# how many tokens to generate for such a test completion
gen_tokens = 32

# what prompt to use for the test completion
prompt = 'Cubestat reports the following metrics: '

# where to save LoRA snapshots
snapshots_path = 'out'

# plaintext input file which will be tokenized and used for training
finetune_file = './test_data/cubestat.txt'

# which model to use - path to the raw model
llama2_model_path = '/Volumes/LLAMAS/llama-2-7b'
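As a side note on the seq_len / batch_size / seed settings above: a minimal sketch of how such a next-token training batch could be drawn from the tokenized finetune_file (an illustration under assumptions, not slowllama's actual code; sample_batch is a hypothetical name):

import torch

# Pick batch_size random windows of seq_len tokens; the targets are the
# same windows shifted by one token, so position i is trained to predict
# token i + 1 (i.e. we try to predict token [seq_len + 1]).
def sample_batch(tokens: torch.Tensor, seq_len: int, batch_size: int):
    starts = torch.randint(0, len(tokens) - seq_len - 1, (batch_size,)).tolist()
    xs = torch.stack([tokens[s:s + seq_len] for s in starts])
    ys = torch.stack([tokens[s + 1:s + seq_len + 1] for s in starts])
    return xs, ys  # both have shape (batch_size, seq_len)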
@@ -15,7 +15,6 @@ device = 'mps' # mps for macbooks
seq_len = 1024
batch_size = 4
lr = 1e-4
offload_to = 'disk'

# type used for computation. Might be different from storage type (which is bfloat16)
compute_dtype = torch.float32 # float32 for macbooks
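To make the storage-vs-compute distinction concrete, here is a small illustration (an assumption-level sketch, not code from the repo): weights can be kept in the compact bfloat16 storage type and upcast to compute_dtype only for the actual computation.

import torch

storage_dtype = torch.bfloat16  # how frozen weights are stored
compute_dtype = torch.float32   # what the forward pass runs in

w = torch.randn(16, 16, dtype=storage_dtype)  # stored weight
x = torch.randn(4, 16, dtype=compute_dtype)   # activations
y = x @ w.to(compute_dtype)                    # upcast only for the matmul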
@@ -15,7 +15,6 @@ class ModelArgs:
    dropout: float = 0.0 # unless we bring back
    ffn_dim_multiplier: Optional[float] = None
    compute_dtype: torch.dtype = torch.float32
    offload_location: str = 'disk' # 'disk' or 'ram'
    rope_theta: float = 10000.0
    lora_rank: int = 8
    lora_alpha: int = 64
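lora_rank and lora_alpha here follow the usual LoRA convention: a frozen base weight plus a trainable low-rank update (N x lora_rank) @ (lora_rank x M), scaled by lora_alpha / lora_rank. A self-contained sketch of that convention (the standard formulation, assumed rather than copied from slowllama's sources):

import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    def __init__(self, in_f: int, out_f: int, lora_rank: int = 8, lora_alpha: int = 64):
        super().__init__()
        self.base = nn.Linear(in_f, out_f, bias=False)
        self.base.weight.requires_grad_(False)  # frozen pretrained weight
        self.A = nn.Parameter(torch.randn(in_f, lora_rank) * 0.01)  # small init
        self.B = nn.Parameter(torch.zeros(lora_rank, out_f))        # zero init
        self.scale = lora_alpha / lora_rank

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.base(x) + (x @ self.A @ self.B) * self.scale

With B initialized to zero, the adapter starts as an exact no-op and only gradually deviates from the frozen model as training proceeds.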
@@ -11,4 +11,4 @@ logging.basicConfig(format='%(asctime)s %(message)s',
 torch.random.manual_seed(seed)
 
 prepare_model(llama2_path=llama2_model_path, frozen_path=frozen_model_path, compute_dtype=compute_dtype,
-              offload_location=offload_to, lora_rank=lora_rank, frozen_dtype=frozen_dtype)
+              lora_rank=lora_rank, frozen_dtype=frozen_dtype)