learning_rate |
Learning rate for training |
float |
[0.00001-0.001] |
0.00001 |
batch_size |
Batch size for training. In case of distributed training, this is batch size on each device |
int |
updating |
1 |
epochs |
Number of training epochs |
int |
updating |
1 |
gradient_accumulation_steps |
Number of updates steps to accumulate the gradients for, before performing a backward/update pass |
int |
updating |
4 |
checkpoint_steps |
Number of training steps between two checkpoint saves if checkpoint_strategy="steps". |
int |
updating |
1000 |
max_sequence_length |
Max input length, longer sequences will be cut-off to this value. |
int |
updating |
2048 |
finetuning_type |
Which parameter mode to use. |
enum[string] |
lora/full |
lora |
distributed_backend |
Backend to use for distributed training. Default is ddp |
enum[string] |
ddp/deepspeed |
ddp |
deepspeed_zero_stage |
Stage to apply DeepSpeed ZeRO algorithm. Only applies when distributed_backend=deepspeed |
enum[int] |
1/2/3 |
1 |
lr_scheduler_type |
Learning rate scheduler to use |
enum[string] |
linear/cosine/constant |
linear |
lr_warmup_steps |
Number of steps used for a linear warmup from 0 to learning_rate |
int |
updating |
0 |
disable_gradient_checkpointing |
Whether or not to disable gradient checkpointing |
bool |
true/false |
false |
eval_strategy |
The evaluation strategy to adopt during training. |
enum[string] |
no/epoch/steps |
epoch |
eval_steps |
Number of update steps between two evaluations if eval_strategy="steps". Will default to the same value as logging_steps if not set. Should be an integer or a float in range [0,1). If smaller than 1, will be interpreted as ratio of total training steps. Only applies when eval_strategy="steps". |
int |
updating |
1000 |
mixed_precision |
Type of mixed precision to use |
enum[string] |
bf16/fp16/none |
bf16 |
optimizer |
Optimizer to use for training |
enum[string] |
adamw/sgd |
adamw |
lora_alpha |
Alpha parameter for LoRA |
int |
updating |
32 |
lora_dropout |
Dropout rate for LoRA |
float |
updating |
0.05 |
lora_rank |
Rank of the LoRA matrices |
int |
updating |
16 |
quantization_bit |
The number of bits to quantize the model using on-the-fly quantization. Only applies when finetuning_type=lora (called QLoRA). |
enum[string] |
int4/int8/none |
none |
flash_attention_v2 |
Whether to use flash attention version 2 |
bool |
true/false |
false |
logging_steps |
Number of steps between logging events including stdout logs and MLflow data point. logging_steps=-1 means log on every step. |
int |
updating |
10 |
checkpoint_strategy |
The checkpoint save strategy to adopt during training. "best" only applicable when eval_strategy is not "no". |
enum[string] |
no/epoch/steps |
epoch |
max_grad_norm |
Maximum norm for gradient clipping |
float |
updating |
1 |
number_of_checkpoints |
If a value is passed, will limit the total amount of checkpoints |
int |
updating |
5 |
seed |
Random seed for reproducibility |
int |
updating |
1309 |
full_determinism |
Ensure reproducible results in distributed training. Important: this will negatively impact the performance, so only use it for debugging. If true, setting seed will not take effect. |
bool |
true/false |
false |
weight_decay |
Weight decay to apply to the optimizer |
float |
updating |
0 |
target_modules |
Target modules for quantization or fine-tuning. |
string |
updating |
all-linear |