diff --git a/conf.py b/conf.py
index d6f9dfa..5b7b7f7 100644
--- a/conf.py
+++ b/conf.py
@@ -1,30 +1,54 @@
 import logging
 
-offload_to = 'disk'
+# which device to use for finetuning
+# 'cpu', 'mps' (for Apple devices) or 'cuda'
 device = 'mps'
+
+# random seed to use. Makes runs reproducible.
 seed = 54321
 
+# learning rate
 lr = 1e-4
 
+# logging gradient and weight distribution to log file
+# useful for debugging, but makes more log output
 log_lora_grad = False
 log_lora_weight = False
 
+# how wide should LoRA layers be? (N x lora_rank) and (lora_rank x M).
+# Larger rank means larger layers and more capacity.
 lora_rank = 4
 
 log_level = logging.DEBUG
 
 # training settings
+# total number of iterations to run. No microbatching so far
 iters = 20
+
+# how long should the training sequence be?
+# we pick seq_len tokens and try to predict token [seq_len + 1]
 seq_len = 128
+
+# how large should the batch size be?
 batch_size = 16
 
+# the current script doesn't have a validation set at all.
+# instead, we run prompt completion every eval_period iterations
+# and check what the completion looks like
 eval_before_training = False
 eval_period = 20
 
-gen_tokens = 32
-snapshots_path = 'out'
-finetune_file = './test_data/cubestat.txt'
+# how many tokens to generate for such a test completion
+gen_tokens = 32
+# what prompt to use for test completion
 prompt = 'Cubestat reports the following metrics: '
 
+# where to save LoRA snapshots
+snapshots_path = 'out'
+
+# plaintext input file which will be tokenized and used for training
+finetune_file = './test_data/cubestat.txt'
+
+# which model to use - path to the raw model
 llama2_model_path = '/Volumes/LLAMAS/llama-2-7b'
diff --git a/finetune_dolly.py b/finetune_dolly.py
index 6c9d4a0..5a4e22c 100644
--- a/finetune_dolly.py
+++ b/finetune_dolly.py
@@ -15,7 +15,6 @@ device = 'mps' # mps for macbooks
 seq_len = 1024
 batch_size = 4
 lr = 1e-4
-offload_to = 'disk'
 
 # type used for computation. Might be different from storage type (which is bfloat16)
 compute_dtype = torch.float32 # float32 for macbooks
diff --git a/model_config.py b/model_config.py
index 8ac9e14..c98dad4 100644
--- a/model_config.py
+++ b/model_config.py
@@ -15,7 +15,6 @@ class ModelArgs:
     dropout: float = 0.0 # unless we bring back
     ffn_dim_multiplier: Optional[float] = None
     compute_dtype: torch.dtype = torch.float32
-    offload_location: str = 'disk' # 'disk' or 'ram'
     rope_theta: float = 10000.0
     lora_rank: int = 8
     lora_alpha: int = 64
diff --git a/prepare_model.py b/prepare_model.py
index b8933ef..b90efc7 100644
--- a/prepare_model.py
+++ b/prepare_model.py
@@ -11,4 +11,4 @@ logging.basicConfig(format='%(asctime)s %(message)s',
 
 torch.random.manual_seed(seed)
 prepare_model(llama2_path=llama2_model_path, frozen_path=frozen_model_path, compute_dtype=compute_dtype,
-              offload_location=offload_to, lora_rank=lora_rank, frozen_dtype=frozen_dtype)
+              lora_rank=lora_rank, frozen_dtype=frozen_dtype)