
    h{                    R   S SK r S SKrS SKJr  S SKJr  S SKJr  S SKJ	r	  S SK
JrJrJrJrJr  S SKrS SKJr  S SKrS SKJrJr  S SKJrJr  S S	KJrJrJrJrJrJrJ r J!r!J"r"J#r#J$r$  S S
K%J&r&  S SK'J(r(  S SK)J*r*  S SK+J,r,  SSK-J.r.J/r/J0r0J1r1J2r2J3r3J4r4  SSK5J6r6J7r7J8r8  SSK9J:r:  SSK;J<r<J=r=J>r>J?r?J@r@JArA  \," 5       (       a  S SKBJCrCJDrD  \$" 5       (       a  S SKErE\R                  " \G5      rH\" S\I\5      rJS\JS\J4S jrK\ " S S\&5      5       rL\ " S S\&5      5       rMS rN " S S\"5      rOg)     N)defaultdict)Mapping)	dataclass)Path)AnyCallableOptionalTypeVarUnion)PartialStatelogging)DatasetIterableDataset)
AutoConfigAutoProcessorBaseImageProcessorDataCollatorFeatureExtractionMixinPreTrainedModelPreTrainedTokenizerBaseProcessorMixinTrainerTrainingArgumentsis_wandb_available)DataCollatorMixin)TrainerCallback)EvalPrediction)is_peft_available   )apply_chat_templateis_conversationalis_conversational_from_valuemaybe_convert_to_chatmlpack_datasetprepare_multimodal_messagestruncate_dataset)clone_chat_templateget_act_offloading_ctx_managerprepare_peft_model   )	SFTConfig)entropy_from_logits
flush_leftgenerate_model_cardget_comet_experiment_urlpadselective_log_softmax)
PeftConfig	PeftModelTListOrMappingexamplereturnc           
         [        U [        5      (       a9  U  Vs/ s H+  n[        U[        [        45      (       a  [        U5      OUPM-     sn$ [        U [        5      (       aQ  U R                  5        VVs0 s H3  u  p!Uc  M
  U[        U[        [        45      (       a  [        U5      OU_M5     snn$ [        S5      es  snf s  snnf )a  
Recursively removes entries with `None` values from a nested structure (list or dictionary).

Args:
    example (`list` or `Mapping`):
        Input nested structure (list or dictionary) from which to remove `None`.

Example:
```python
>>> [
...     {
...         "a": {"aa": None, "ab": 1},
...         "b": "my_string",
...     }
... ]
>>> remove_none_values(example)
[{'a': {'ab': 1}, 'b': 'my_string'}]
```
z%Input must be a list or a dictionary.)
isinstancelistdictremove_none_valuesr   items	TypeError)r5   valuekeys      Q/home/james-whalen/.local/lib/python3.13/site-packages/trl/trainer/sft_trainer.pyr;   r;   N   s    ( '4  elmel\aZd|-L-L"5)RWWelmm	GW	%	% &mmo
-
 YCjt.M.M#E*SXX-
 	
 ?@@ n
s   2C 7	C-Cc                       \ rS rSr% Sr\\S'   Sr\\S'   Sr	\\S'   Sr
\\S'   S	r\\   \S
'   Sr\\S'   S\\\\   \\\\4   4      S\\\4   4S jr\S\\\      S\\R,                     4S j5       rSrg	)DataCollatorForLanguageModelingn   a  
Data collator used for language modeling data. Inputs are dynamically padded to the maximum length of a batch.

This collator expects each example in the input list to be a dictionary containing at least the `"input_ids"` key.
If the input contains a `"completion_mask"`, it is used to set the labels to `-100` for tokens that are not in the
completion. If `"assistant_masks"` are present, they are used to set the labels to `-100` for tokens that are not
in the assistant part of the sequence. The collator returns a dictionary containing the following keys:
- `"input_ids"`: Tensor of input IDs, padded to the maximum length of the batch.
- `"attention_mask"`: Tensor of attention mask, padded to the maximum length of the batch.
- `"position_ids"`: Tensor of position IDs, padded to the maximum length of the batch.
- `"labels"`: Tensor of labels, padded to the maximum length of the batch. If `completion_only_loss` is set to
`True`, tokens that are not in the completion are set to -100. If `assistant_masks` are present, tokens that are
not in the assistant part of the sequence are set to -100.

Args:
    pad_token_id (`int`):
        Token ID to use for padding.
    completion_only_loss (`bool`, *optional*, defaults to `True`):
        When the input contains a completion mask (`completion_mask`), the labels are set to -100 for the tokens
        that are no in the completion.
    padding_free (`bool`, *optional*, defaults to `False`):
        If set to `True`, the sequences will be flattened into a single sequence, and the position IDs will be
        generated accordingly.
    pad_to_multiple_of (`int` or `None`, *optional*, defaults to `None`):
        If set, the sequences will be padded to a multiple of this value.
    return_tensors (`str`, *optional*, defaults to `"pt"`):
        Type of Tensor to return. Only `"pt"` is currently supported.

Examples:
```python
>>> from trl.trainer.sft_trainer import DataCollatorForLanguageModeling

>>> collator = DataCollatorForLanguageModeling(pad_token_id=0)
>>> examples = [{"input_ids": [1, 2, 3]}, {"input_ids": [4, 5]}]
>>> collator(examples)
{'input_ids': tensor([[  1,  2,  3],
                      [  4,  5,  0]]),
 'attention_mask': tensor([[  1,  1,  1],
                           [  1,  1,  0]]),
 'position_ids': tensor([[0, 1, 2],
                         [0, 1, 0]]),
 'labels': tensor([[   1,    2,    3],
                   [   4,    5, -100]])}

>>> # With completion mask
>>> examples = [
...     {"input_ids": [1, 2, 3], "completion_mask": [0, 1, 1]},
...     {"input_ids": [4, 5], "completion_mask": [0, 1]},
... ]
>>> collator(examples)
{'input_ids': tensor([[  1,  2,  3],
                      [  4,  5,  0]]),
 'attention_mask': tensor([[  1,  1,  1],
                           [  1,  1,  0]]),
 'position_ids': tensor([[0, 1, 2],
                         [0, 1, 0]]),
 'labels': tensor([[-100,    2,    3],
                   [-100,    5, -100]])}

>>> # With padding_free
>>> collator = DataCollatorForLanguageModeling(pad_token_id=0, padding_free=True)
>>> collator(examples)
{'input_ids': tensor([[ 1, 2, 3, 4, 5]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1]]),
 'position_ids': tensor([[0, 1, 2, 0, 1]]),
 'labels': tensor([[1, 2, 3, 4, 5]])}
```
pad_token_idTcompletion_only_lossFpadding_freereturn_position_idsNpad_to_multiple_ofptreturn_tensorsexamplesr6   c                 V   U Vs/ s H  n[         R                  " US   5      PM     nnU R                  =(       a    SUS   ;   =(       a    U R                  nU(       d&  U Vs/ s H  n[         R                  " U5      PM     nnU R                  (       a]  SUS   ;   a%  U R                  U Vs/ s H  o"S   PM	     sn5      nO/W Vs/ s H"  n[         R                  " [        U5      5      PM$     nnSUS   ;   a*  U Vs/ s H  n[         R                  " US   5      PM     nnO)U Vs/ s H  n[         R                  " US   5      PM     nnU R                  (       a2  SUS   ;   a)  U Vs/ s H  n[         R                  " US   5      PM     n	nSUS   ;   a)  U Vs/ s H  n[         R                  " US   5      PM     n
n0 nU R                  (       a  [         R                  " WSS9/nU(       d  [         R                  " WSS9/nU R                  (       a  [         R                  " WSS9/n[         R                  " USS9/nU R                  (       a  SUS   ;   a  [         R                  " W	SS9/n	SUS   ;   a  [         R                  " W
SS9/n
[        WU R                  SU R                  S	9US'   U(       d  [        WSSU R                  S	9US
'   U R                  (       a  [        WSSU R                  S	9US'   [        USSU R                  S	9US'   U R                  (       a*  SUS   ;   a!  [        W	SSU R                  S	9n	SUS   U	S:H  '   SUS   ;   a!  [        W
SSU R                  S	9n
SUS   U
S:H  '   U$ s  snf s  snf s  snf s  snf s  snf s  snf s  snf s  snf )N	input_idsseq_lengthsr   labelscompletion_maskassistant_masksdimright)padding_valuepadding_siderH   attention_maskposition_ids)torchtensorrG   rF   	ones_like(get_position_ids_from_packed_seq_lengthsarangelenrE   catr0   rD   rH   )selfrK   r5   rM   has_packed_position_idsrW   rX   idsrO   rP   rQ   outputs               r@   
torch_call*DataCollatorForLanguageModeling.torch_call   s   GOPxGU\\'+"67x	P #'":":"q}PXYZP[?["q`d`q`q 'JST)Yeooi8)NT##+#LL;CD8]+8D  CLL)3SX 6)Lx{"EMNX'ell78#45XFNFHPQWell7;#78FQ$$):hqk)IW_`W_Gu||G4E,FGW_O`+W_`W_Gu||G4E,FGW_O` 9!45I*"'))N"B!C'' %		,A >?iiA./F((->(1+-M#(99_!#D"E HQK/#(99_!#D"E "++ #66	
{ ''*agZ^ZqZq(F#$ ##%(AGX\XoXo&F>" $WQUQhQh
x $$):hqk)I!qw[_[r[rO 6:F8_12+!qw[_[r[rO 6:F8_12I Q U
 E  MNQ``s/   #N# N.N)N #N*#N-#N!#N&batch_seq_lengthsc                    U  Vs/ s H  n[        U5      PM     nn[        R                  " U  VVs/ s H  o  H  o3PM     M     snn5      n [        R                  " [        U5      U R                  S9nSUS'   U SS S-
  * X@SS R                  S5      '   UR                  S5      n[        UR                  U5      5      $ s  snf s  snnf )a-  
Get position IDs for packed sequences.

Args:
    batch_seq_lengths (`list[list[int]]`):
        A list of lists containing the lengths of each individual document in the packed batch.

Return:
    `list[torch.Tensor]`:
        A list of tensors containing the position IDs for each packed sequence.
)dtyper   Nr*   )sumrZ   r[   onesri   cumsumr9   split)rg   rN   example_lengths
seq_lengthrX   s        r@   r]   HDataCollatorForLanguageModeling.get_position_ids_from_packed_seq_lengths  s     @QQ?P3{+?PQ!LL+<[+<K{Z{Z+<[
 zz#o"6>O>U>UVQ;LSb;QTU;U9Vs+22156#**1-L&&788 R \s
   CC
 )__name__
__module____qualname____firstlineno____doc__int__annotations__rE   boolrF   rG   rH   r	   rJ   strr9   r   r   r:   re   staticmethodrZ   Tensorr]   __static_attributes__rr       r@   rB   rB   n   s    CJ !%$%L$ $$(,,NCF4d3id38n.L(M#N FSWX[]`X`Sa FP 9DcO 9X\]b]i]iXj 9 9r   rB   c                   0   \ rS rSr% Sr\\S'   Sr\\	   \S'   Sr
\\S'   Sr\\	   \S'   S	r\\S
'   Sr\\S'   S\\\\	   \\\\4   4      S\\\4   4S jrS\\\\	   \\\\4   4      S\\\4   4S jrS\\\\	   \\\\4   4      S\\\4   4S jrSrg)%DataCollatorForVisionLanguageModelingi   u  
Data collator for vision-language modeling tasks.

Unlike text-only datasets—where the collator typically receives pre-tokenized inputs ready for batching,
vision-language data processing involves converting images into pixel values. This conversion is disk-intensive,
making upfront preprocessing of the entire dataset impractical. Therefore, this collator performs tokenization and
image processing on-the-fly to efficiently prepare batches.

Each input example should be a dictionary containing at least:
- An `"images"` key holding the image data.
- [language modeling](#language-modeling) type: either a `"messages"` key for conversational inputs or a `"text"`
  key for standard text inputs.
- [prompt-completion](#prompt-completion) type: keys `"prompt"` and `"completion"` for the prompt and completion.

The collator outputs a dictionary including:
- `"input_ids"`: Tensor of token IDs.
- `"attention_mask"`: Tensor indicating attention mask.
- `"pixel_values"`: Tensor representing image pixel values.
- `"labels"`: Tensor for training labels.

Additional keys may be present depending on the processor, such as `"image_grid_thw"`.

Args:
    processor (`ProcessorMixin`):
        The processor used to tokenize text and process images. It must be a subclass of `ProcessorMixin` and
        include a `tokenizer` with a defined `pad_token_id`.
    max_length (`int` or `None`, optional, defaults to `None`):
        Maximum sequence length for input tokens. If `None`, no truncation is applied.
    completion_only_loss (`bool`, *optional*, defaults to `False`):
        Whether to compute loss only on the completion part of the sequence. When `True`, the labels for the prompt
        part are set to -100. It requires the dataset type to be prompt-completion.
    pad_to_multiple_of (`int` or `None`, optional, defaults to `None`):
        If set, the sequences will be padded to a multiple of this value.
    dataset_text_field (`str`, optional, defaults to `"text"`):
        Name of the column that contains text data in the dataset. This parameter is only relevant for [standard
        datasets format](dataset_formats#standard).
    return_tensors (`str`, optional, defaults to `"pt"`):
        The tensor type to return. Currently, only `"pt"` (PyTorch tensors) is supported.

Example:
```python
>>> from trl.trainer.sft_trainer import DataCollatorForVisionLanguageModeling
>>> from transformers import AutoProcessor

>>> processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
>>> collator = DataCollatorForVisionLanguageModeling(processor)
>>> examples = [
...     {"images": [Image.open("image_0.png")], "messages": [{"role": "user", "content": "What is this?"}]},
...     {"images": [Image.open("image_1.png")], "messages": [{"role": "user", "content": "Describe this image."}]},
... ]
>>> collator(examples)
{'input_ids': tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,  151645,    198,
                       151644,    872,    198, 151652, 151655, 151655, 151655,  151655, 151653,   3838,    374,
                          419,     30, 151645,    198],
                      [151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,  151645,    198,
                       151644,    872,    198, 151652, 151655, 151655, 151655,  151655, 151653,  74785,    419,
                         2168,     13, 151645,    198]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'pixel_values': tensor([[-0.9893,  0.1785,  1.5362,  ..., -0.0582,  0.8661, -0.2431],
                         [-0.2302,  0.9522, -1.1061,  ...,  0.0555,  1.3354, -0.6412],
                         [ 1.2150,  0.9084,  0.7041,  ...,  0.2404, -0.8403, -0.5133],
                         ...,
                         [ 0.6895,  0.2807,  0.2515,  ..., -0.2004, -1.2100,  0.0555],
                         [ 0.8209, -0.9748,  1.5654,  ...,  1.6055, -0.4706,  0.5817],
                         [-1.0915,  0.4559,  0.9230,  ...,  0.5106,  0.0982, -0.1720]]),
 'image_grid_thw': tensor([[1, 4, 4],
                           [1, 4, 4]]),
 'labels': tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,  151645,    198,
                    151644,    872,    198, 151652, 151655, 151655, 151655,  151655, 151653,   3838,    374,
                       419,     30, 151645,    198],
                    [151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,  151645,    198,
                     151644,    872,    198, 151652, 151655, 151655, 151655,  151655, 151653,  74785,    419,
                       2168,     13, 151645,    198]])}
```
	processorN
max_lengthFrE   rH   textdataset_text_fieldrI   rJ   rK   r6   c                 ,   SUS   ;   d  U R                   US   ;   a-  U R                  (       a  [        S5      eU R                  U5      $ SUS   ;   a  SUS   ;   a  U R	                  U5      $ [        S[        US   R                  5       5       S35      e)Nmessagesr   zTThe `completion_only_loss` argument is not supported for language modeling datasets.prompt
completionz#Unexpected input keys in examples: .)r   rE   
ValueError_collate_language_modeling_collate_prompt_completionKeyErrorr9   keys)ra   rK   s     r@   re   0DataCollatorForVisionLanguageModeling.torch_callv  s    !$(?(?8A;(N(( j  228<<!$!)D228<<@hqkFVFVFXAY@ZZ[\]]r   c                 D   U Vs/ s H  o"S   PM	     nnSUS   ;   aT  U H  n[        US   [        US   5      5        M      U Vs/ s H  o"S   PM	     nnU R                  R                  U5      nO=U R                  US   ;   a  U Vs/ s H  o"U R                     PM     nnO[        S5      eU R                  UUSSU R                  U R                  S LU R                  U R                  SS9	nUS	   R                  5       nS
XvS   S:H  '   XvS'   U$ s  snf s  snf s  snf )Nimagesr   r   zfThe input examples must contain either 'messages' for conversational data or 'text' for standard data.TrT   F)	r   r   paddingrV   rH   
truncationr   rJ   add_special_tokensrM   rY   rW   rO   )
r%   r_   r   r    r   r   rH   r   rJ   clone)ra   rK   r5   r   r   textsrd   rO   s           r@   r   @DataCollatorForVisionLanguageModeling._collate_language_modeling  sH   3;<8(#8<!$#+GJ,?WXEVAWX $;CD8
+8HDNN66x@E$$3EMNX'T445XENE 
  #66d2..$   

 $**,04&'1,- "xA =
 E Os   DD	Dc           	         U R                   b  [        S5      eU Vs/ s H  o"S   PM	     nn[        US   5      (       aO  U H$  n[        US   US   -   [	        US   5      5        M&     U Vs/ s H  n[        X R                  5      PM     nnU Vs/ s H  o"S   PM	     nnU Vs/ s H  o"S   PM	     nnU R                  UUSSU R                  SS	9nU R                  USS
U R                  SS9nUS   US   pUS   US   p[        R                  " X4SS9n[        R                  " X4SS9n[        R                  " [        R                  " U
5      U4SS9n[        XU5      u  pnU R                  b?  US S 2S U R                  24   nUS S 2S U R                  24   nUS S 2S U R                  24   nUR                  5       nSXS:H  '   U R                  (       a  SXS:H  '   UnXS'   XS'   XS'   U$ s  snf s  snf s  snf s  snf )NztPadding to a multiple of a value is not yet implemented for vision-language modeling and prompt-completion data yet.r   r   r   r   TleftF)r   r   r   rV   rJ   r   rT   )r   r   rV   rJ   r   rM   rW   r*   rR   rY   rO   )rH   NotImplementedErrorr!   r%   r_   r    r   rJ   rZ   r`   
zeros_liker-   r   r   rE   )ra   rK   r5   r   promptscompletionsprocessed_promptsprocessed_completions
prompt_idscompletion_idsprompt_maskrP   rM   rW   rO   rd   s                   r@   r   @DataCollatorForVisionLanguageModeling._collate_prompt_completion  sa   "".%.  4<<8(#8<Xa[))#+GH,=@U,UWZ[bck[lWmn $T\]T\+G^^DT\H]4<=H8$H=<DEH|,HE NN..$ + 
 !% ..$ !/ !
 &7{%CEZ[fEgN'89I'JLabrLs_IIz:B	K#AqI))U%5%5k%BO$TZ[\ 6@[j5k2? ??&!!%6t%6"67I+A/@/@,@AN-a1B4??1B.BCO "&*"#$$+/Fa'( #'{#1 !xi = ^=Es   G5.G:G?'Hrr   )rs   rt   ru   rv   rw   r   ry   r   r	   rx   rE   rz   rH   r   r{   rJ   r9   r   r   r:   re   r   r   r~   rr   r   r@   r   r      s   KZ  $J$!&$&(,,$$NC
^4d3id38n.L(M#N 
^SWX[]`X`Sa 
^!4d3idSVX[S[n>\8]3^ !cghkmphpcq !F:4d3idSVX[S[n>\8]3^ :cghkmphpcq :r   r   c                     [         R                  R                  USSS9nUSSS24   R                  5       nUS:g  nSX4) '   [	        U R
                  U5      nUR                  5       R                  5       * U-  nXd-  R                  5       U-  nU$ )z
DFT loss function, as presented in [On the Generalization of SFT: A Reinforcement Learning Perspective with Reward
Rectification](https://huggingface.co/papers/2508.05629)
)r   r*   rY   )r>   .r*   Nr   )	nn
functionalr0   
contiguousr1   logitsexpdetachrk   )outputsrO   num_items_in_batchshift_labels	loss_masklogprobsper_token_losslosss           r@   dft_lossr     s    
 ]]vvT:F#qr'?--/L$I L$W^^\BHlln++--8N&++-0BBDKr   c                    B  ^  \ rS rSrSrSS/r             S&S\\\R                  \
4   S\\\\4      S\\   S	\\\\4      S
\\\\\\4   4      S\\\\4      S\\   S\\\/\4      S\\\      S\\\R4                  R6                     \\R4                  R8                  R:                     4   S\\\\R4                  R6                     \\\4   4      S\\\R@                  \R@                  /\R@                  4      S\S   S\\\/\4      4U 4S jjjr!S\\\4   S\\\"\#\4   S\S\$S\\\/\4      S\S\\\4   4S jr%S r&S'U 4S jjr'U 4S jr(S(S\\\)4   S\\)   SS4U 4S  jjjr*U 4S! jr+   S)S"\\   S\\   S#\\\\   S4   4S$ jjr,S%r-U =r.$ )*
SFTTraineri  aZ  
Trainer for Supervised Fine-Tuning (SFT) method.

This class is a wrapper around the [`~transformers.Trainer`] class and inherits all of its attributes and methods.

Example:

```python
from datasets import load_dataset
from trl import SFTTrainer

dataset = load_dataset("roneneldan/TinyStories", split="train[:1%]")

trainer = SFTTrainer(model="Qwen/Qwen2-0.5B-Instruct", train_dataset=dataset)
trainer.train()
```

Args:
    model (`Union[str, PreTrainedModel]`):
        Model to be trained. Can be either:

        - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a
          path to a *directory* containing model weights saved using
          [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
          using `<ModelArchitecture>.from_pretrained` (where `<ModelArchitecture>` is derived from the model
          config) with the keyword arguments in `args.model_init_kwargs`.
        - A [`~transformers.PreTrainedModel`] object.
        If you're training a model with an MoE architecture and want to include the load balancing/auxilliary loss
        as a part of the final loss, remember to set the `output_router_logits` config of the model to `True`.
    args ([`SFTConfig`], *optional*, defaults to `None`):
        Configuration for this trainer. If `None`, a default configuration is used.
    data_collator ([`~transformers.DataCollator`] or `None`, *optional*):
        Function to use to form a batch from a list of elements of the processed `train_dataset` or `eval_dataset`.
        Will default to [`~trainer.sft_trainer.DataCollatorForLanguageModeling`] if the model is a language model
        and [`~trainer.sft_trainer.DataCollatorForVisionLanguageModeling`] if the model is a vision-language model.
    train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
        Dataset to use for training. SFT supports both [language modeling](#language-modeling) type and
        [prompt-completion](#prompt-completion) type. The format of the samples can be either:

        - [Standard](dataset_formats#standard): Each sample contains plain text.
        - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role
          and content).

        The trainer also supports processed datasets (tokenized) as long as they contain an `input_ids` field.
    eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`):
        Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
    processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.ProcessorMixin`] or `None`, *optional*, defaults to `None`):
        Processing class used to process the data. If `None`, the processing class is loaded from the model's name
        with [`~transformers.AutoProcessor.from_pretrained`]. A padding token, `tokenizer.pad_token`, must be set.
        If the processing class has not set a padding token, `tokenizer.eos_token` will be used as the default.
    compute_loss_func (`Callable` or `None`, *optional*, defaults to `None`):
        A function that accepts the raw model outputs, labels, and the number of items in the entire accumulated
        batch (batch_size * gradient_accumulation_steps) and returns the loss. For example, see the default [loss
        function](https://github.com/huggingface/transformers/blob/052e652d6d53c2b26ffde87e039b723949a53493/src/transformers/trainer.py#L3618)
        used by [`Trainer`].
    compute_metrics (`Callable[[EvalPrediction], dict]` or `None`, *optional*, defaults to `None`):
        The function that will be used to compute metrics at evaluation. Must take a
        [`~transformers.EvalPrediction`] and return a dictionary string to metric values. When passing
        [`SFTConfig`] with `batch_eval_metrics` set to `True`, your `compute_metrics` function must take a boolean
        `compute_result` argument. This will be triggered after the last eval batch to signal that the function
        needs to calculate and return the global summary statistics rather than accumulating the batch-level
        statistics.
    callbacks (list of [`~transformers.TrainerCallback`] or `None`, *optional*, defaults to `None`):
        List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed
        in [here](https://huggingface.co/docs/transformers/main_classes/callback).

        If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`]
        method.
    optimizers (`tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]]`, *optional*, defaults to `(None, None)`):
        A tuple containing the optimizer and the scheduler to use. Will default to an instance of `AdamW` on your
        model and a scheduler given by [`~transformers.get_linear_schedule_with_warmup`] controlled by `args`.
    optimizer_cls_and_kwargs (`tuple[Type[torch.optim.Optimizer], Dict[str, Any]]`, *optional*, defaults to `None`):
        A tuple containing the optimizer class and keyword arguments to use. Overrides `optim` and `optim_args` in
        `args`. Incompatible with the `optimizers` argument.

        Unlike `optimizers`, this argument avoids the need to place model parameters on the correct devices before
        initializing the Trainer.
    preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*, defaults to `None`):
        A function that preprocess the logits right before caching them at each evaluation step. Must take two
        tensors, the logits and the labels, and return the logits once processed as desired. The modifications made
        by this function will be reflected in the predictions received by `compute_metrics`.

        Note that the labels (second parameter) will be `None` if the dataset does not have them.
    peft_config ([`~peft.PeftConfig`] or `None`, *optional*, defaults to `None`):
        PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
    formatting_func (`Callable` or `None`, *optional*, defaults to `None`):
        Formatting function applied to the dataset before tokenization. Applying the formatting function explicitly
        converts the dataset into a [language modeling](#language-modeling) type.
trlsftNmodelargsdata_collatortrain_dataseteval_datasetprocessing_classcompute_loss_funccompute_metrics	callbacks
optimizersoptimizer_cls_and_kwargspreprocess_logits_for_metricspeft_configr2   formatting_funcc                   > UcP  [        U[        5      (       a  UOUR                  R                  nUR	                  S5      S   n[        U S35      nOe[        U[        5      (       aP  [        U[
        5      (       d;  UR                  5       nUR                  US'   UR                  S5        [        S>0 UD6nUR                  =(       d    0 n[        U[        5      (       a  UnUR                  S5      n[        U[        R                  5      (       d	  US:X  d  Uc  O@[        U[        5      (       a  US;   a  [        [        U5      nUUS'   O[        S	U S
35      e[         R"                  " U5      n[        [$        UR&                  S   5      nUR"                  " U40 UD6nO8UR                  R                  nUR                  b  [(        R+                  S5        Uc  [,        R"                  " U5      n[        U[.        5      (       a  UR0                  nSU l        O*[        U[4        5      (       a
  UnSU l        O[7        S5      eUR8                  bM  UR8                  nUR;                  U5      nUc&  [        SU SUR<                  R>                   S35      eUUl         URB                  b  [D        RF                  RI                  URB                  5      (       aU  URB                  RK                  S5      (       a5  [M        URB                  SS9 nURO                  5       Ul(        S S S 5        / nO[S        XURB                  5      u  pnO/ nU R2                  (       a  URT                  (       a  [        S5      eU R2                  (       a  URV                  (       a  [        S5      eU R2                  (       a  URX                  (       a  [        S5      eUb  U(       a  URZ                  c
  SU0Ul-        O>SURZ                  ;  a  UURZ                  S'   OURZ                  S   R]                  U5        UR^                  b  SUR^                  ;  aF  [(        R+                  S5        UR^                  c	  S/Ul/        OUR^                  Ra                  S5        SU l1        Uc$  [e        5       (       af  [        U[f        5      (       aQ  [i        XU5      nURj                  URl                  ;   a+  URl                  URj                     n[        USS5      U l1        URV                  =(       d"    URT                  =(       a    URn                  S:H  U l+        UR                  Rp                  S;   nU RV                  (       a  Ub  [        S5      eURT                  (       a%  URn                  S :X  a  [(        R+                  S!5        U(       d  [(        R+                  S"5        URr                  S#:X  a&  URT                  (       d  [(        R+                  S$5        [u        [w        U5      5      nURx                  c  S%U;   =(       a    S&U;   U l<        OURx                  U l<        Uc  U R2                  (       d  URz                  =(       d    URz                  =(       d    UR8                  nUR;                  U5      nUc&  [        S'U SUR<                  R>                   S(35      e[}        UU Rx                  U RV                  UUR~                  S)9nOIUcF  U R2                  (       a5  [        UUR                  U Rx                  UR~                  UR                  S*9nURT                  (       a,  URn                  S:X  a  U(       d  [(        R+                  S+5        URX                  (       a  [        U5      (       d  [        S,5      eUR                  S L=(       a    UR                  R                  S-S5      =(       d    U R2                  n U (       d  U Rx                  (       a  U(       a  [        S.5      eU R                  XFX"RT                  US/5      nUb  UR                  c  URT                  OUR                  n![        U[        5      (       a:  UR                  5        V"V#s0 s H  u  n"n#U"U R                  U#XbU!UU"5      _M     nn"n#OU R                  XVUU!US05      nUR                  S1:X  a  O>UR                  S2:X  a  Ub  [        S35      e[        nO[        S4UR                   S535      e[        [        5      [        [        5      S6.U lM        SU lN        [        T$U G]A  UUUUUUUUU	U
UUS79  U R                  R                  (       a  [        U R                  S89U lU        O[        R                  " 5       U lU        [        U R                  S95      (       a%  U R                  R                  U R                  5        [        UR                  S:S5      U l[        [        UR                  S;S<5      U l\        U R                  (       a'  U R                  S<:X  a  [(        R+                  S=5        g g g ! , (       d  f       GNA= fs  sn#n"f )?N/rj   z-SFT	hub_tokenpush_to_hub_tokenri   auto)bfloat16float16float32zInvalid `dtype` passed to `SFTConfig`. Expected either 'auto' or a string representing a valid `torch.dtype` (e.g., 'float32'), but got r   r   zYou passed `model_init_kwargs` to the `SFTConfig`, but your model is already instantiated. The `model_init_kwargs` will be ignored.TFzWThe `processing_class` must be either a `PreTrainedTokenizerBase` or a `ProcessorMixin`zThe specified `eos_token` ('zC') is not found in the vocabulary of the given `processing_class` (zX). Ensure that the `eos_token` exists in the vocabulary before using it as an EOS token.)z.jinjaz.j2zutf-8)encodingzaPacking is not supported for vision-language models. Please set `packing=False` in the SFTConfig.zzPadding-free training is yet not supported for vision-language models. Please set `padding_free=False` in the `SFTConfig`.zAssistant-only loss is not yet supported for vision-language models. Please set `assistant_only_loss=False` in the `SFTConfig`.embed_tokenslm_heada-  Cloning chat template added new tokens to the tokenizer, but 'lm_head' is not in PEFT's `modules_to_save`. As a result, the model may not learn to generate outputs with these new tokens, leading to degraded generation quality. To fix this, add `modules_to_save=['lm_head']` to your PEFT configuration.num_virtual_tokensbfd)flash_attention_2flash_attention_3z"kernels-community/vllm-flash-attn3zHPassing a custom data collator is not supported when using padding-free.wrappedzYou are passing `padding_free=True` with the 'wrapped' packing strategy, which is not recommended. Please refer to the documentation to understand why this is not recommended.a  Padding-free training is enabled, but the attention implementation is not set to 'flash_attention_2'. Padding-free training flattens batches into a single sequence, and 'flash_attention_2' is the only known attention mechanism that reliably supports this. Using other implementations may lead to unexpected behavior. To ensure compatibility, set `attn_implementation='flash_attention_2'` in the model configuration, or verify that your attention mechanism can handle flattened sequences.r*   zYou are using a per_device_train_batch_size of 1 with padding-free training. Using a batch size of 1 anihilate the benefits of padding-free training. Please consider increasing the batch size to at least 2.r   r   zThe specified `pad_token` ('z[). Ensure that the `pad_token` exists in the vocabulary before using it as a padding token.)rD   rE   rF   rG   rH   )r   r   rE   rH   r   a$  You are using packing, but the attention implementation is not set to 'flash_attention_2' or 'kernels-community/vllm-flash-attn3'. Packing flattens batches into a single sequence, and Flash Attention is the only known attention mechanisms that reliably support this. Using other implementations may lead to cross-contamination between batches. To avoid this, either disable packing by setting `packing=False`, or set `attn_implementation='flash_attention_2'` or `attn_implementation='kernels-community/vllm-flash-attn3'` in the model configuration.zYou set `assistant_only_loss=True`, but the dataset is not conversational. This option is only supported for conversational datasets.skip_prepare_datasetaE  A formatting function was provided while `completion_only_loss=True`, which is incompatible. Using a formatter converts the dataset to a language modeling type, conflicting with completion-only loss. To resolve this, apply your formatting function before passing the dataset, or disable `completion_only_loss` in `SFTConfig`.trainevalnlldftzYou passed a `compute_loss_func` together with `loss_type='dft'` to the `SFTTrainer`. When using `loss_type='dft'`, the loss function is internally set to the DFT loss, so passing a `compute_loss_func` is not allowed.zInvalid `loss_type` z. passed. Supported values are 'nll' and 'dft'.)r   r   )r   r   r   r   r   r   r   r   r   r   r   r   )r   add_model_tagsoutput_router_logitsrouter_aux_loss_coef        a-  You set `output_router_logits` to `True` in the model config, but `router_aux_loss_coef` is set to `0.0`, meaning the auxiliary loss will not be used. Either set `router_aux_loss_coef` to a value greater than `0.0`, or set `output_router_logits` to `False` if you don't want to use the auxiliary loss.rr   )]r8   r{   config_name_or_pathrn   r+   r   to_dictr   popmodel_init_kwargsgetrZ   ri   getattrr   r   from_pretrainedtransformersarchitecturesloggerwarningr   r   	tokenizer_is_vlmr   r=   	eos_tokenconvert_tokens_to_ids	__class__rs   eos_token_idchat_template_pathospathisfileendswithopenreadchat_templater'   packingrF   assistant_only_losstrainable_token_indicesextendmodules_to_saveappendr   r   r3   r)   active_adapterr   packing_strategy_attn_implementationper_device_train_batch_sizenextiterrE   	pad_tokenrB   rH   r   r   r   r!   dataset_kwargs_prepare_dataseteval_packingr:   r<   	loss_typer   r   r9   _metrics_total_train_tokenssuper__init__r   activation_offloadingr(   r    maybe_activation_offload_context
contextlibnullcontexthasattrr   
_tag_namesaux_loss_enabledaux_loss_coef)%ra   r   r   r   r   r   r   r   r   r   r   r   r   r   r   
model_name	dict_argsr   model_idri   r   architecturer   r   r   chat_template_fileadded_tokenspeft_model_configuse_flash_attentiondataset_sampler  rD   r   r   r?   datasetr   s%                                       r@   r  SFTTrainer.__init__N  s]	   $ <",UC"8"8ell>X>XJ#))#.r2J
|401D/00D)9T9TI%)^^Ik"MM-.)y)D !228beS!!H%))'2E%--&EME3''E5W,Wu--2!'* HHMwaQ   //9F"<1E1Ea1HIL 00O=NOE||11H%%1? #,<<XF &77(22IDL(*ABB(I DLuvv>>%I$::9EL# 29+ >++;+E+E+N+N*O PII 
 &2I""".ww~~d55664;R;R;[;[\m;n;n$11GDHZ5G5L5L5N$2 E!8KT-D-D95 L <<DLLs  <<D--;  <<D44B  "66>;I<:XK7#;+N+NNJVK77G77GNN|\ ..6);KfKf:fNNT #22:7@k3#33::9E #$"'8':':z%QZ?[?[&u4@E##u'8'88$)$5$5e6J6J$K!*12CEY[\*]'
 !--b$,,2a4CXCX\aCa#ll?? D
 

 ( !kll|| 5 5 Bp 'J //14T\\% d=12$$,(0N(B(e|WeGeD%(,(A(AD%  T)*=*=TATATI$::9EL# 29+ >++;+E+E+N+N*O PLL 
 <)%)%>%>!..$7#'#:#:M "t||A*??%)%>%>#'#:#:#'#:#:M <<D11U:CVNNi ##,=n,M,M9  t+f0C0C0G0GH^`e0fvjnjvjv 	 $((_ Q  !11||_V]M '*.*;*;*C$,,IZIZlD11 -9,>,>,@$,@LC T227<LT[]lnqrr,@ ! $L
 $(#8#8$gX^$L
 >>U"^^u$ , : 
 !)3DNN3CCqrss #.d"3[=NO#$  	''%-/+!%=*G 	 	
  99**4RY]YcYc4dD14>4J4J4LD1 4::/00JJ%%doo6 '6Le T$U\\3I3O  T%7%73%>NN &? C EDJ$s   i$#i6$
i3r  r   dataset_namer6   c           	      0  ^ [        U[        5      (       a  UR                  [        5      n[	        [        [        U5      5      R                  5       5      nSU;   n0 n	[        U[        5      (       a  UR                  U	S'   [        5       R                  5          Tb  U(       a  [        R                  S5        Tb@  U(       d9  [        U[        5      (       a	  SU S3U	S'   U4S jn
UR                  " U
4SS	0U	D6nU(       GdB  [        [        U5      5      n[        U5      (       aa  [        U[        5      (       a	  S
U S3U	S'   [        [        U5      5      R                  5       nUR                  " [        4SSU;   a  SOS 0U	D6n[        [        U5      5      n[!        U5      (       dK  [        U[        5      (       a	  SU S3U	S'   S nUR                  " U4SUR"                  0SU;   a  SOS S.U	D6n[        U[        5      (       a	  SU S3U	S'   S nUR                  " U4SUUR$                  UR&                  S.0U	D6nU(       a  UR(                  c  [+        S5      e[        U[        5      (       a	  SU S3U	S'   S/nSUR,                  ;   a  UR/                  S5        SUR,                  ;   a  UR/                  S5        UR1                  U5      n[3        XR(                  UR4                  U	5      nOAUR(                  b4  [        U[        5      (       a	  SU S3U	S'   [7        XR(                  U	5      nUR8                  (       a.  1 SknUR1                  UR;                  UR,                  5      5      nS S S 5        U$ ! , (       d  f       U$ = f)NrM   num_proczYou passed a dataset that is already processed (contains an `input_ids` field) together with a formatting function. Therefore `formatting_func` will be ignored. Either remove the `formatting_func` or pass a dataset that is not already processed.z Applying formatting function to z datasetdescc                    > ST" U 5      0$ )Nr   rr   )r5   r   s    r@   _func*SFTTrainer._prepare_dataset.<locals>._func  s    "OG$<==r   batchedFzConverting z dataset to ChatMLremove_columnsconversationszAdding EOS to c                     SU ;   a&  U S   R                  U5      (       d  U S   U-   U S'   U $ SU ;   a$  U S   R                  U5      (       d  U S   U-   U S'   U $ )Nr   r   )r   )r5   r   s     r@   add_eos,SFTTrainer._prepare_dataset.<locals>.add_eos  sp    !W,WV_5M5Mi5X5X.5fo	.IGFO  ' *W4W\=R=[=[\e=f=f4;L4II4UGL1&r   r   r   )	fn_kwargsr)  zTokenizing c                 f   SU ;   Ga  0 n[        U 5      (       a  UR                  " U S   4SU R                  S5      0U R                  S0 5      D6nUR                  " U S   U S   -   4SUU R                  S5      S.U R                  S0 5      D6nUS   nSU;   a  US   US'   OU" U S   S	9S   nU" U S   U S   -   S	9S   nUS [        U5       U:X  d  [        R                  S
5        S/[        U5      -  S/[        U5      [        U5      -
  -  -   nXtS'   XS'   U$ [        U 5      (       aq  UR                  " U S   4SUU R                  S5      S.U R                  S0 5      D6n	SU	;   a  SU	S   ;  a  [        S5      eS V
s0 s H  oU	;   d  M
  XU
   _M     nn
U$ SU" X   S	9S   0nU$ s  sn
f )Nr   toolschat_template_kwargsr   T)return_dictreturn_assistant_tokens_maskr0  rM   rQ   )r   zMismatch between tokenized prompt and the start of tokenized prompt+completion. This may be due to unexpected tokenizer behavior, whitespace issues, or special token handling. Verify that the tokenizer is processing text consistently.r   r*   rP   r   u?  You're using `assistant_only_loss=True`, but at least one example has no assistant tokens. This usually means the tokenizer's chat template doesn't generate assistant masks — it may be missing the `{% generation %}` keyword. Please check the template and ensure it's correctly configured to support assistant masking.)rM   rQ   )r!   r    r   r_   r   r   RuntimeError)r5   r   r   r   rd   r   prompt_completion_processedprompt_completion_idsrP   	processedks              r@   tokenize-SFTTrainer._prepare_dataset.<locals>.tokenize  sP   7*!#,W55)9)M)M ' 1*&-kk'&:* #*++.Db"I*J
 ;K:^:^ ' 1GL4I I;,0=P&-kk'&:	;
 #*++.Db"I;7 5PP[4\104OO<WXi<j'8 9)9wx?P)QR])^J4D'RZJ[^efr^sJs4t +51
  55Fs:G:U"NN!m ,-#J*?1#MbIcfijtfuIuBv*v.C{+4C01, "M' -W55(8(L(L '
 3),0=P&-kk'&:	)
 #*++.Db"I)I  1I=!9UfKgBg&2%/'" !" @`%r?_!hqcqoa1o?_F%r "M '23CId3efq3r%sF!M &ss   	F.	F.r.  )r   r   r   z6When packing is enabled, `max_length` can't be `None`.zPacking rP   rQ   zTruncating >   rM   rN   rQ   rP   )r8   r   with_transformr;   r9   r  r  r   dataset_num_procr   main_process_firstr   r   mapr"   r#   r!   r   r   r   r   r   column_namesr   select_columnsr$   r   r&   use_liger_kernelintersection)ra   r  r   r   r   r   r!  r?  is_processed
map_kwargsr&  first_exampler,  r9  columnscollator_expected_keyss        `          r@   r  SFTTrainer._prepare_dataset  su    gw'',,-?@G Dg/4467"l2 
gw''%)%:%:Jz"^..0*|Y *<gw//+KL>Ya)bJv&> "++eIUIjI $T'] 3/>>!'733/:<.HZ-[
6*#'W#6#;#;#=L%kk/:I\:Y_c %G !%T'] 3(77!'733/=l^8-T
6*' &kk#.0@0J0J"K5?<5OzUY %	G gw//+6|nH)MJv&:"x "++ -=.2.E.E/3/G/G ! ??*$%]^^gw//+3L>)JJv&&-$(<(<<NN#45$(<(<<NN#45!009 'wAVAVXbc,gw//+6|nH)MJv&*7OOZP$$)k&!001G1T1TU\UiUi1jkk 1n o 10n s   K"N
Nc                 h    U R                   c%  U R                  (       a
  / SQU l         g / SQU l         g g )N)r   r   r   r   )rM   rO   rN   rP   rQ   )_signature_columnsr   )ra   s    r@    _set_signature_columns_if_needed+SFTTrainer._set_signature_columns_if_needed8  s,    
 ""*||*X'*v'	 +r   c                 	  > U R                   R                  (       a  SOSnUS   nSUS'   [        TU ]  XSUS9u  pxU R                  (       a.  U R
                  (       a  U R
                  UR                  -  n	Xy-  nU R                  R                  (       Gd7  [        R                  " 5          [        UR                  5      n
SU;   a~  US   n[        R                  " UR                  S	5      U R                  UR                   S
9n[        R"                  " X4SS9n[        R$                  " X-  5      UR%                  5       -  nO(SU;   a  [        R&                  " U
5      nO[)        S5      eU R*                  R-                  U5      R'                  5       R/                  5       nSSS5        U R0                  U   S   R3                  W5        US:X  a  SU;   aI  U R*                  R-                  US   R%                  5       5      R%                  5       R/                  5       nO}SU;   al  [        R4                  " US   R                  S5      US   R                   S
9nU R*                  R-                  U5      R%                  5       R/                  5       nO[)        S5      eU =R6                  U-  sl        U R6                  /U R0                  U   S'   U R                  R                  (       Gd  [        R                  " 5          SU;   a   UR                  R9                  5       nUS   nO;UR                  SSS2SS24   R9                  5       nUSSS24   R9                  5       nUSS2U R                  S2SS24   nUR;                  SS9nUS:g  nUU:H  U-  nUR%                  5       nUR%                  5       nU R*                  R-                  U5      nU R*                  R-                  U5      nUR%                  5       nUS	:  a!  UR%                  5       U-  R/                  5       OSnU R0                  U   S   R3                  U5        U R                  (       aX  U R*                  R-                  W	5      R'                  5       R/                  5       n	U R0                  U   S   R3                  U	5        SSS5        U(       a  Xx4$ U$ ! , (       d  f       GN(= f! , (       d  f       N,= f)zA
Compute training loss and additionally compute token accuracies
r   r   rO   F	use_cacheT)return_outputsr   rW   r   )devicer*   rR   rX   z6Expected 'attention_mask' or 'position_ids' in inputs.Nentropy
num_tokensr   .rj   rY   r   mean_token_accuracyaux_loss)r   trainingr  compute_lossr  r  rT  r   rA  rZ   no_gradr,   r   rl   sizer   rP  r`   rk   meanr   acceleratorgather_for_metricsitemr
  r   r[   r  r   argmax)ra   r   inputsrO  r   moderO   r   r   rT  per_token_entropyrW   virtual_attention_maskrQ  num_tokens_in_batchlocal_num_tokensshift_logitsr   predictionsmaskcorrect_predictionstotal_tokenscorrect_tokens	total_sumaccuracyr   s                            r@   rV  SFTTrainer.compute_lossC  sL    **--w6 ! ${'.$CU / 

   T%7%7))G,<,<<HD yy)))$7$G!#v-%+,<%=N-2ZZ&++A.0G0GP^PePe.* &+YY0F/W]^%_N#ii(9(JKnN`N`NbbG#v-#jj):;G$%]^^**==gFKKMRRT ! MM$	*11':7?  6)&*&6&6&I&I&QaJbJfJfJh&i&m&m&o&t&t&v#6)#(<<~0F0K0KA0NW]^lWmWtWt#u &*&6&6&I&IJZ&[&_&_&a&f&f&h# !YZZ$$(;;$-1-E-E,FdL) yy)))!V+ $+>>#<#<#>L#).#9L#*>>#ssA+#>#I#I#KL#)#qr'?#=#=#?L  ,At/F/F/H!,KL +11b19 $t+ (3l'Bd&J##xxz!4!8!8!: "&!1!1!D!D^!T#//BB<P ),,.	HQTUN..09<BBD[^d#$9:AA(K((#//BB8LQQSXXZHMM$'
3::8DI !L #1:d:M !@ !s   ,C9SF/S
S
S&c                 r   > U R                      [        TU ]  " U0 UD6sS S S 5        $ ! , (       d  f       g = fN)r  r  training_step)ra   r   kwargsr   s      r@   ro  SFTTrainer.training_step  s*    227($9&9 322s   (
6logs
start_timec           	        > U R                   R                  (       a  SOSnU R                  U   R                  5        VVs0 s H  u  pEU[	        U5      [        U5      -  _M     nnnUS:X  a(  UR                  5        VVs0 s H  u  pESU 3U_M     nnnUR                  U5        [        TU ]!  X5        U R                  U   R                  5         g s  snnf s  snnf )Nr   r   eval_)
r   rU  r
  r<   rk   r_   updater  logclear)ra   rr  rs  r_  r?   valmetricsr   s          r@   rw  SFTTrainer.log  s    **--w6<@MM$<O<U<U<WX<W3C3s8++<WX 6>:A--/J/hcse}c)/GJGD%d!!# Y
 Ks   $CCc                   > U R                   R                  c*  [        U R                   R                  5      R                  nO(U R                   R                  R                  S5      S   nU R                  US9  [        TU ]!  X5        g )Nr   rj   )r  )	r   hub_model_idr   
output_dirnamern   create_model_cardr  _save_checkpoint)ra   r   trialr  r   s       r@   r  SFTTrainer._save_checkpoint  sj    99!!)dii22388J//55c:2>J*5 .r   r  tagsc                    U R                  5       (       d  g[        U R                  R                  S5      (       ac  [        R
                  R                  U R                  R                  R                  5      (       d!  U R                  R                  R                  nOSnUc  [        5       nO$[        U[        5      (       a  U1nO[        U5      n[        U R                  R                  S5      (       a  UR                  S5        S[        R                  ;   a  UR                  S5        UR                  U R                  5        [        UUU R                   U[#        U5      [%        5       (       a+  [&        R(                  b  [&        R(                  R*                  OS[-        5       SS9nUR/                  [        R
                  R1                  U R2                  R4                  S	5      5        g)
a  
Creates a draft of a model card using the information available to the `Trainer`.

Args:
    model_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the model.
    dataset_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the dataset used for training.
    tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
        Tags to be associated with the model card.
Nr   unsloth_versionunslothJOB_IDhf_jobsSFT)
base_modelr  r}  r!  r  	wandb_url	comet_urltrainer_namez	README.md)is_world_process_zeror  r   r   r   r   isdirr   setr8   r{   addenvironrv  r  r.   r}  r9   r   wandbrunurlr/   savejoinr   r~  )ra   r  r!  r  r  
model_cards         r@   r  SFTTrainer.create_model_card  sR   " ))++4::$$o66rww}}TZZM^M^MlMl?m?m**88JJ <5Dc""6Dt9D4::$$&788HHYrzz!HHYDOO$(!!**%d'9';';		@Ueiimm[_.0	

 	TYY%9%9;GHr   )
r   r
  rJ  r  r  r  rE   r  r   rF   )NNNNNNNN)NNNNNN)FNrn  )NNN)/rs   rt   ru   rv   rw   r  r   r{   r   Moduler   r	   r+   r   r   r   r   r:   r   r   r   r   r9   r   tuplerZ   optim	Optimizerlr_schedulerLambdaLRtyper   r}   r  r   r   rz   r  rK  rV  ro  floatrw  r  r  r~   __classcell__)r   s   @r@   r   r     s
   Xt J
 ?C04CGEIUY04FJ59jvaehl.2;?xS"))_45x uY(99:;x  -	x
  g&> ?@x uWd3<.@%@ABx #5)@.)P#QRx $H-x "(N+;T+A"BCx D12x (5;;#8#898EKKD\D\DeDe;ffgx #+5ekk6K6K1LdSVX[S[n1\+]"^x (0%,,9UW\WcWc9c0d'ex l+x "(D63;"78x xt	nw/0n   79KMces stn 	n
 n "(D63;"78n n 
w'	(n`	w];@:$S%Z( $huo $QU $ $/ %)&*,0	4ISM4I sm4I CcD()	4I 4Ir   r   )Pr  r   collectionsr   collections.abcr   dataclassesr   pathlibr   typingr   r   r	   r
   r   rZ   torch.nnr   r   
accelerater   r   datasetsr   r   r   r   r   r   r   r   r   r   r   r   r   transformers.data.data_collatorr   transformers.trainer_callbackr   transformers.trainer_utilsr   transformers.utilsr   
data_utilsr    r!   r"   r#   r$   r%   r&   modelsr'   r(   r)   
sft_configr+   utilsr,   r-   r.   r/   r0   r1   peftr2   r3   r  
get_loggerrs   r   r9   r4   r;   rB   r   r   r   rr   r   r@   <module>r     s    	 # # !  : :    , -    > 9 5 0   ] \ !  *			H	%)49A A> A@ n9&7 n9 n9b ~,= ~ ~B@I @Ir   