
    h5                     P    S SK JrJr  S SKJrJr  S SKJr  \ " S S\5      5       rg)    )	dataclassfield)AnyOptional)TrainingArgumentsc                     ^  \ rS rSr% Sr\R                  S/-   r\" SSS0S9r\	\
S'   \" S	SS
0S9r\	\
S'   \" SSS0S9r\\
S'   \" SSS0S9r\\   \
S'   \" SSS0S9r\\\\4      \
S'   \" SSS0S9r\\   \
S'   \" SSS0S9r\\
S'   \" SSS0S9r\\\\4      \
S'   \" SSS0S9r\\   \
S'   \" SSS0S9r\\   \
S'   \" SSS0S9r\\   \
S'   \" S SS!0S9r\\   \
S"'   \" S#SS$0S9r\\
S%'   \" S&SS'0S9r\\
S('   \" S#SS)0S9r\\
S*'   \" SSS+0S9r\\   \
S,'   \" SSS-0S9r \\   \
S.'   \" SSS/0S9r!\\   \
S0'   \" S#SS10S9r"\\
S2'   \" S3SS40S9r#\\
S5'   \" S#SS60S9r$\\
S7'   U 4S8 jr%S9r&U =r'$ ):	SFTConfig   a  
Configuration class for the [`SFTTrainer`].

This class includes only the parameters that are specific to SFT training. For a full list of training arguments,
please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may
differ from those in [`~transformers.TrainingArguments`].

Using [`~transformers.HfArgumentParser`] we can turn this class into
[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
command line.

Parameters:
    > Parameters that control the model

    model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`):
        Keyword arguments for [`~transformers.AutoModelForCausalLM.from_pretrained`], used when the `model`
        argument of the [`SFTTrainer`] is provided as a string. If you're training a MoE architecture and want to
        include the load balancing/auxilliary loss as a part of the final loss, remember to set
        `output_router_logits=True` in this dictionary.
    chat_template_path (`str` or `None`, *optional*, defaults to `None`):
        If specified, sets the model's chat template. This can either be the path to a tokenizer (local directory
        or Hugging Face Hub model) or a direct path to a Jinja template file. When using a Jinja file, you must
        ensure that any special tokens referenced in the template are added to the tokenizer and that the model's
        embedding layer is resized accordingly.

    > Parameters that control the data preprocessing

    dataset_text_field (`str`, *optional*, defaults to `"text"`):
        Name of the column that contains text data in the dataset.
    dataset_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`):
        Dictionary of optional keyword arguments for the dataset preparation. The only supported key is
        `skip_prepare_dataset`. When the model is a VLM, `skip_prepare_dataset` is automatically treated as `True`
        regardless of the provided value, since preprocessing is done on the fly.
    dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
        Number of processes to use for processing the dataset.
    eos_token (`str` or `None`, *optional*, defaults to `None`):
        Token used to indicate the end of a turn or sequence. If `None`, it defaults to
        `processing_class.eos_token`.
    pad_token (`int` or `None`, *optional*, defaults to `None`):
        Token used for padding. If `None`, it defaults to `processing_class.pad_token`, or if that is also `None`,
        it falls back to `processing_class.eos_token`.
    max_length (`int` or `None`, *optional*, defaults to `1024`):
        Maximum length of the tokenized sequence. Sequences longer than `max_length` are truncated from the right.
        If `None`, no truncation is applied. When packing is enabled, this value sets the sequence length.
    packing (`bool`, *optional*, defaults to `False`):
        Whether to group multiple sequences into fixed-length blocks to improve computational efficiency and reduce
        padding. Uses `max_length` to define sequence length.
    packing_strategy (`str`, *optional*, defaults to `"bfd"`):
        Strategy for packing sequences. Can be either `"bfd"` (best-fit decreasing, default), or `"wrapped"`.
    padding_free (`bool`, *optional*, defaults to `False`):
        Whether to perform forward passes without padding by flattening all sequences in the batch into a single
        continuous sequence. This reduces memory usage by eliminating padding overhead. Currently, this is only
        supported with the FlashAttention 2 or 3, which can efficiently handle the flattened batch structure. When
        packing is enabled with strategy `"bfd"`, padding-free is enabled, regardless of the value of this
        parameter.
    pad_to_multiple_of (`int` or `None`, *optional*, defaults to `None`):
        If set, the sequences will be padded to a multiple of this value.
    eval_packing (`bool` or `None`, *optional*, defaults to `None`):
        Whether to pack the eval dataset. If `None`, uses the same value as `packing`.

    > Parameters that control the training

    completion_only_loss (`bool` or `None`, *optional*, defaults to `None`):
        Whether to compute loss only on the completion part of the sequence. If set to `True`, loss is computed
        only on the completion, which is supported only for [prompt-completion](#prompt-completion) datasets. If
        `False`, loss is computed on the entire sequence. If `None` (default), the behavior depends on the dataset:
        loss is computed on the completion for [prompt-completion](#prompt-completion) datasets, and on the full
        sequence for [language modeling](#language-modeling) datasets.
    assistant_only_loss (`bool`, *optional*, defaults to `False`):
        Whether to compute loss only on the assistant part of the sequence. If set to `True`, loss is computed only
        on the assistant responses, which is supported only for [conversational](#conversational) datasets. If
        `False`, loss is computed on the entire sequence.
    loss_type (`str`, *optional*, defaults to `"nll"`):
        Type of loss to use. Possible values are `"nll"` (negative log-likelihood, default) and `"dft"` (Dynamic
        Fine-Tuning, as described in [this paper](https://huggingface.co/papers/2508.05629)).
    activation_offloading (`bool`, *optional*, defaults to `False`):
        Whether to offload the activations to the CPU.
model_init_kwargsgh㈵>helpz$The initial learning rate for AdamW.)defaultmetadatalearning_rate
   zLog every X updates steps. Should be an integer or a float in range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps.logging_stepsTzZIf True, use gradient checkpointing to save memory at the expense of slower backward pass.gradient_checkpointingNzWhether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA architecture or Intel XPU or using CPU (use_cpu) or Ascend NPU. If not set, it defaults to `True` if `fp16` is not set.bf16aC  Keyword arguments for `AutoModelForCausalLM.from_pretrained`, used when the `model` argument of the `SFTTrainer` is provided as a string. If you're training a MoE architecture and want to include the load balancing/auxilliary loss as a part of the final loss, remember to set `output_router_logits=True` in this dictionary.ac  If specified, sets the model's chat template. This can either be the path to a tokenizer (local directory or Hugging Face Hub model) or a direct path to a Jinja template file. When using a Jinja file, you must ensure that any special tokens referenced in the template are added to the tokenizer and that the model's embedding layer is resized accordingly.chat_template_pathtextz:Name of the column that contains text data in the dataset.dataset_text_fieldaT  Dictionary of optional keyword arguments for the dataset preparation. The only supported key is `skip_prepare_dataset`. If the model is a VLM, `skip_prepare_dataset` value is ignored. When the model is a VLM, `skip_prepare_dataset` is automatically treated as `True` regardless of the provided value, since preprocessing is done on the fly.dataset_kwargsz6Number of processes to use for processing the dataset.dataset_num_proczmToken used to indicate the end of a turn or sequence. If `None`, it defaults to `processing_class.eos_token`.	eos_tokenzToken used for padding. If `None`, it defaults to `processing_class.pad_token`, or if that is also `None`, it falls back to `processing_class.eos_token`.	pad_tokeni   zMaximum length of the tokenized sequence. Sequences longer than `max_length` are truncated fromthe right. If `None`, no truncation is applied. When packing is enabled, this value sets the sequence length.
max_lengthFzWhether to group multiple sequences into fixed-length blocks to improve computational efficiency and reduce padding. Uses `max_length` to define sequence length.packingbfdzeStrategy for packing sequences. Can be either `'bfd'` (best-fit decreasing, default), or `'wrapped'`.packing_strategya  Whether to perform forward passes without padding by flattening all sequences in the batch into a single continuous sequence. This reduces memory usage by eliminating padding overhead. Currently, this is only supported with the FlashAttention 2 or 3, which can efficiently handle the flattened batch structure. When packing is enabled with strategy `'bfd'`, padding-free is enabled, regardless of the value of this parameter.padding_freezAIf set, the sequences will be padded to a multiple of this value.pad_to_multiple_ofzNWhether to pack the eval dataset. If `None`, uses the same value as `packing`.eval_packinga  Whether to compute loss only on the completion part of the sequence. If set to `True`, loss is computed only on the completion, which is supported only for prompt-completion datasets. If `False`, loss is computed on the entire sequence. If `None` (default), the behavior depends on the dataset: loss is computed on the completion for prompt-completion datasets, and on the full sequence for language modeling datasets.completion_only_losszWhether to compute loss only on the assistant part of the sequence. If set to `True`, loss is computed only on the assistant responses, which is supported only for conversational datasets. If `False`, loss is computed on the entire sequence.assistant_only_lossnllzType of loss to use. Possible values are `"nll"` (negative log-likelihood, default) and `"dft"` (Dynamic Fine-Tuning, as described in https://huggingface.co/papers/2508.05629).	loss_typez.Whether to offload the activations to the CPU.activation_offloadingc                    > U R                   c  U R                  (       + OU R                   U l         [        TU ]  5         g )N)r   fp16super__post_init__)self	__class__s    P/home/james-whalen/.local/lib/python3.13/site-packages/trl/trainer/sft_config.pyr*   SFTConfig.__post_init__  s*    '+yy'8Odii	    )r   )(__name__
__module____qualname____firstlineno____doc__r   _VALID_DICT_FIELDSr   r   float__annotations__r   r   boolr   r   r   dictstrr   r   r   r   r   intr   r   r   r   r   r   r    r!   r"   r#   r%   r&   r*   __static_attributes____classcell__)r,   s   @r-   r	   r	      s>   M^ +==AT@UU !@AM5  ! D
M5  $)p
$D  ! !
D(4.  38 "
3xS#X/  ). G
)  $VW  05 6
0NHT#s(^,  ',RS'hsm   %  D
 Ix}   % M
 Ix}  !& 
!J   O
GT  " 
c   '
	L$ 	 ).]^)  $)jk$L(4.  ,1.
,(4.  !&;
	! 	 c
Is  #(JK#4 
   r/   r	   N)	dataclassesr   r   typingr   r   transformersr   r	    r/   r-   <module>rB      s/    )   * p ! p  p r/   