import itertools
import warnings
from collections.abc import Callable
from contextlib import contextmanager
from copy import deepcopy
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Literal, Optional, Union

import torch
import torch.nn as nn
from accelerate.utils import is_peft_model
from packaging import version
from transformers import AddedToken, AutoTokenizer, PreTrainedModel, PreTrainedTokenizer, TrainingArguments
from transformers.utils import is_peft_available

from .modeling_value_head import AutoModelForCausalLMWithValueHead, AutoModelForSeq2SeqLMWithValueHead


if is_peft_available():
    import peft
    from peft import PeftConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training

if TYPE_CHECKING:
    from accelerate import Accelerator
    from deepspeed.runtime.engine import DeepSpeedEngine
    from torch.nn import Module
    from torch.nn.parallel.distributed import DistributedDataParallel


SUPPORTED_ARCHITECTURES = (
    AutoModelForCausalLMWithValueHead,
    AutoModelForSeq2SeqLMWithValueHead,
)


@dataclass
class ChatMlSpecialTokens:
    """Dataclass for special tokens used in ChatML, including system, user, assistant, bos, eos, and pad tokens."""

    bos_token: str = "<|im_start|>"
    eos_token: str = "<|im_end|>"
    pad_token: str = "<|im_end|>"

    @property
    def system(self):
        return f"{self.bos_token}system"

    @property
    def user(self):
        return f"{self.bos_token}user"

    @property
    def assistant(self):
        return f"{self.bos_token}assistant"

    @property
    def chat_template(self):
        return (
            "{% for message in messages %}"
            f"{{{{'{self.bos_token}' + message['role'] + '\n' + message['content'] + '{self.eos_token}' + '\n'}}}}"
            "{% endfor %}"
            "{% if add_generation_prompt %}"
            f"{{{{ '{self.assistant}\n' }}}}"
            "{% endif %}"
        )


FORMAT_MAPPING = {"chatml": ChatMlSpecialTokens}
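# Illustrative rendering of the template above (comments only, nothing runs at import
# time): for messages = [{"role": "user", "content": "Hi"}] rendered with
# add_generation_prompt=True and the default tokens, the template produces:
#
#   <|im_start|>user
#   Hi<|im_end|>
#   <|im_start|>assistant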
        ;  a#  [	        SU S[
        R                  5        35      e[
        U   " 5       nUR                  Ul        UR                  Ul        UR                  Ul	        UR                  SUR                  UR                  /05        UR                  Ul        U R                  [        UR                  5      Ub  UOSS9  [        U SS5      bQ  UR                  U R                   l        UR"                  U R                   l        UR$                  U R                   l        [        U S	S5      bQ  UR"                  U R&                  l        UR$                  U R&                  l        UR                  U R&                  l        X4$ )
a  
Setup chat format by adding special tokens to the tokenizer, setting the correct format, and extending the
embedding layer of the model based on the new special tokens.

<Tip warning="true">

This function is deprecated and will be removed in version 0.26.0. Please use [`clone_chat_template`] instead.

</Tip>

If the model already has a chat template, this will throw an error. If you want to overwrite it, please set
`tokenizer.chat_template` to `None`.

Args:
    model (`~transformers.PreTrainedModel`): The model to be modified.
    tokenizer (`~transformers.PreTrainedTokenizer`): The tokenizer to be modified.
    format (`Optional[Literal["chatml"]]`): The format to be set. Defaults to "chatml".
    resize_to_multiple_of (`int` or `None`): Number to resize the embedding layer to. Defaults to None.

Returns:
    model (`~transformers.PreTrainedModel`):
        The modified model.
    tokenizer (`~transformers.PreTrainedTokenizer`):
        The modified tokenizer.
zThe `setup_chat_format` function is deprecated and will be removed in version 0.26.0. Please use `clone_chat_template` instead.NzcChat template is already added to the tokenizer. If you want to overwrite it, please set it to NonezFormat z" not available. Please use one of additional_special_tokensnew_num_tokenspad_to_multiple_ofconfiggeneration_config)warningswarnDeprecationWarningr4   
ValueErrorFORMAT_MAPPINGkeysr#   r$   r"   add_special_tokensresize_token_embeddingslenvocabgetattrpad_token_idrK   bos_token_ideos_token_idrL   )rA   rB   rC   rD   chat_formats        r*   setup_chat_formatr\   W   s   > MM	) *q
 	

 ^#76(*L^M`M`MbLcdee (*K &//I%//I%//I  "=@U@UWbWlWl?m!no)77I 
!! 9??+4I4U0[_ "  uh%1$-$:$:!$-$:$:!$-$:$:!u)40</8/E/E,/8/E/E,/8/E/E,r,   source_tokenizer_pathc                    [         R                  " U5      nUR                  5       Ul        UR                  R                  5        Vs/ s H   oUR                  UR                  ;  d  M  UPM"     nnUR                  U5        UR                  Ul	        UR                  U R                  l
def clone_chat_template(
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    source_tokenizer_path: str,
    resize_to_multiple_of: Optional[int] = 64,
) -> tuple[PreTrainedModel, PreTrainedTokenizer, list[int]]:
    """
    Clones a chat template from a source tokenizer to the target tokenizer and updates the model accordingly.

    This function:
    - Copies the chat template from a source tokenizer to the target tokenizer.
    - Adds any new tokens from the source tokenizer to the target tokenizer.
    - Sets and synchronizes the EOS token across the tokenizer and model.
    - Resizes the model's token embeddings to match the new vocabulary size, optionally rounding it up to a multiple
      of a specified value. In such cases, dummy tokens are added to the tokenizer to ensure the vocabulary size
      matches the embedding dimensions.

    Args:
        model (`PreTrainedModel`):
            Model to update.
        tokenizer (`PreTrainedTokenizer`):
            Tokenizer to update.
        source_tokenizer_path (`str`):
            Path or identifier of the pretrained tokenizer to clone from.
        resize_to_multiple_of (`int` or `None`, *optional*, defaults to `64`):
            The embedding layer will be resized to the new vocabulary size. If this is not `None`, it will round up
            the new vocabulary size to the nearest multiple of this value.

    Returns:
        model (`PreTrainedModel`):
            Updated model with resized token embeddings and EOS token configured.
        tokenizer (`~transformers.PreTrainedTokenizer`):
            Updated tokenizer with the chat template and special tokens applied.
        added_tokens (`list[int]`):
            List of tokens that were added to the tokenizer from the source tokenizer.

    Example:
    ```python
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from trl import clone_chat_template

    model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
    model, tokenizer, added_tokens = clone_chat_template(model, tokenizer, "Qwen/Qwen3-0.6B")
    ```
    """
    # Load the source tokenizer containing the chat template to clone
    tokenizer_source = AutoTokenizer.from_pretrained(source_tokenizer_path)

    # Copy the chat template from the source tokenizer
    tokenizer.chat_template = tokenizer_source.get_chat_template()

    # Ensure all special tokens from the source tokenizer are present in the target tokenizer
    added_tokens = [
        token for token in tokenizer_source.added_tokens_decoder.values() if token.content not in tokenizer.vocab
    ]
    tokenizer.add_tokens(added_tokens)

    # Set the EOS token from the source tokenizer (important for generation) and synchronize it with the model
    tokenizer.eos_token = tokenizer_source.eos_token
    model.config.eos_token_id = tokenizer.eos_token_id
    model.generation_config.eos_token_id = tokenizer.eos_token_id

    # Resize the model embeddings to include the new tokens, optionally rounding up to a multiple
    model.resize_token_embeddings(
        new_num_tokens=len(tokenizer.vocab),
        pad_to_multiple_of=resize_to_multiple_of if resize_to_multiple_of is not None else None,
    )

    # Resizing may grow the embedding layer beyond the vocabulary. Extend the tokenizer with dummy
    # tokens so that its vocabulary size matches the embedding dimensions.
    idx = 0
    while model.vocab_size > len(tokenizer.vocab):
        dummy_token = AddedToken(f"<extra_id_{idx}>")
        is_added = tokenizer.add_tokens(dummy_token)
        idx += 1
        if is_added == 1:  # only record tokens that were actually added
            added_tokens.append(dummy_token)

    if len(tokenizer.vocab) != model.vocab_size:
        raise RuntimeError(
            f"Vocabulary size mismatch after resizing: tokenizer vocab size is {len(tokenizer.vocab)}, but model "
            f"embedding size is {model.vocab_size}. This indicates an internal error in the token alignment process."
        )

    added_tokens = [token.content for token in added_tokens]
    added_tokens = tokenizer.convert_tokens_to_ids(added_tokens)
    return model, tokenizer, added_tokens
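# Worked example of the rounding performed above (illustrative numbers only), assuming
# resize_to_multiple_of=64: if the merged vocabulary has 151,657 entries, the embedding
# matrix is resized to ceil(151657 / 64) * 64 = 151,680 rows, and 23 `<extra_id_k>`
# dummy tokens are appended so the tokenizer length matches the embedding rows.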
                  SS9 H  nUR                  R                  5         M     UR                   H  nUR                  5         M     UR                   H  nUR                  5         M     / Ul        / Ul
        g)z:Removes the optimizer hooks from a DeepSpeed ZeRO-3 model.	optimizerNparameter_offload8The model optimizer is None, which is not yet supported.Trecurse)hasattrrr   rs   rh   iter_paramsmoduleds_active_sub_modulesclearforward_hooksremovebackward_hooks)rA   optimizer_offloadparamhooks       r*   remove_hooksr     s    5+&&"wu@S'T'T!OO==		$!OOUVV.55tD##))+ E "// 0!00 1 ')#')$r,   c                 f    [         R                  " U R                  US9U R                  5       5      $ )Nru   )	itertoolschainnamed_parametersds_external_parameters)
sub_modulerv   s     r*   get_all_parametersr     s*    ??:66w6GIjIjIlmmr,   c                 N    [        X5       VVs/ s H  u  p#UPM	     snn$ s  snnf N)r   )ry   rv   _r   s       r*   rx   rx     s#    "4V"EF"EhaE"EFFFs   !c                    SSK n[        U S5      (       d  gU R                  b2  [        U R                  S5      (       a  U R                  R                  nO%U R                  b  U R                  nO[	        S5      e[
        R                  " UR                  5      [
        R                  " S5      :  a  UR                  UR                  5        gUR                  UR                  5        g)z7Adds the optimizer hooks from a DeepSpeed ZeRO-3 model.r   Nrr   rs   rt   z0.16.4)	deepspeedrw   rr   rs   rh   r   parse__version___register_deepspeed_modulery   _register_hooks_recursively)rA   r   r   s      r*   	add_hooksr   !  s    5+&&"wu@S'T'T!OO==		$!OOUVV}}Y**+w}}X/FF445F5M5MN556G6N6NOr,   )r   r   acceleratorr   gather_deepspeed3_paramsc              #   :  #    UR                  U 5      nUR                  nU(       a  UR                  5         UR                  R                  b  UR                  R                  R
                  S:X  a{  U(       d  UR                  U 5      v   OdSSKnUR                  R                  U R                  5       5         [        U 5        UR                  U 5      v   [        U 5        SSS5        OUv   U(       a  UR                  5         gg! , (       d  f       N'= f7f)a  
Context manager to unwrap distributed or accelerated models for generation tasks.

Args:
    model (`Union[DistributedDataParallel, DeepSpeedEngine]`):
        Model to be unwrapped.
    accelerator (`~accelerate.Accelerator`):
        Accelerator instance managing the model.
    gather_deepspeed3_params (`bool`, *optional*, defaults to `True`):
        Whether to gather weights for DeepSpeed ZeRO Stage 3 models. If `False`, skips parameter gathering, which
        can be more memory-efficient but may lead to slower generation times.

Yields:
    Unwrapped model.

Example:
```python
with unwrap_model_for_generation(model, accelerator) as unwrapped_model:
    generated_outputs = unwrapped_model.generate(input_ids)
```
N   r   )unwrap_modelis_gradient_checkpointinggradient_checkpointing_disablestatedeepspeed_plugin
zero_stager   zeroGatheredParameters
parametersr   r   gradient_checkpointing_enable)rA   r   r   unwrapped_modelr   r   s         r*   unwrap_model_for_generationr   4  s     6 "..u5O / I I 668))5+:K:K:\:\:g:gkl:l'**5112253C3C3EFU#!..u55%  GF
  557 ! GFs   B8D:*D
$&D

DDc                    SSK nUR                  R                  n[        UR                  5      nUS   S   nU b{  [        U R                  SS5      (       a  [        U R                  R                  5      O[        U R                  SS5      nUb&  US:X  a   UR                  Xf-  SU-  S	U-  U-  S
.5        US:w  a  SUS   S'   UR                  XS9tpU R                  5         U $ )a  Prepares the model for DeepSpeed inference or evaluation by initializing it with the appropriate configuration.

Adapted from accelerate:
https://github.com/huggingface/accelerate/blob/739b135f8367becb67ffaada12fe76e3aa60fefd/src/accelerate/accelerator.py#L1473
r   Nzero_optimizationstagehidden_sizeshidden_sizer   
   g?)z$zero_optimization.reduce_bucket_sizez4zero_optimization.stage3_param_persistence_thresholdz-zero_optimization.stage3_prefetch_bucket_size)rA   rK   )r   r   r   r   deepspeed_configrW   rK   maxr   update
initializeeval)rA   r   r   r   config_kwargsr   r   r   s           r*   prepare_deepspeedr   c  s     "((99->>?M-.w7E u||^T:: ))*}d; 	
 "uz   <G<ULNQ\L\EH;EVYdEd z67)*73$$5$GIE	JJLLr,   c                 :   SSK Jn  SSKJn  [	        X5      (       d  [	        X5      (       d  UR
                  R                  R                  U 5        UR
                  R                  nUR                  =(       d    UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                   UR"                  UR$                  UR&                  UR(                  S.nU" U 40 UD6n U R+                  5         U $ )Nr   )
FSDPModule)FullyShardedDataParallel)sharding_strategycpu_offloadauto_wrap_policymixed_precisionsync_module_statesbackward_prefetchforward_prefetchuse_orig_paramsparam_init_fnignored_moduleslimit_all_gathers	device_id)torch.distributed.fsdpr   2torch.distributed.fsdp.fully_sharded_data_parallelr   
isinstancer   fsdp_pluginset_auto_wrap_policyr   reshard_after_forwardr   r   mixed_precision_policyr   r   r   r   r   r   r   devicer   )rA   r   r   FSDPr   kwargss         r*   prepare_fsdpr     s    1c u##z%'D'D%%::5A!''33!,!>!>!c+BcBc&22 + < <*AA"-"@"@!,!>!> + < <*::(66*::!,!>!>$++
 U%f%	JJLLr,   c            
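# Worked example of the ZeRO-3 overrides in `prepare_deepspeed` above, for a model with
# hidden_size = 4096 (illustrative numbers only):
#   reduce_bucket_size                 = 4096 * 4096       = 16,777,216
#   stage3_param_persistence_threshold = 10 * 4096         = 40,960
#   stage3_prefetch_bucket_size        = 0.9 * 4096 * 4096 = 15,099,494.4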
           \ rS rSrSrS\R                  S\R                  S\S\S\4
S jr	S\R                  S\R                  S	S
4S jr
S\R                  S\R                  S	S
4S jrSrg
)_ForwardRedirectioni  a9  Implements the `forward-redirection`.

Taken from Pytorch-lightning:
https://github.com/Lightning-AI/pytorch-lightning/blob/02311d03fb982560246eead7c08104481fac9579/src/lightning/pytorch/strategies/strategy.py#L602

A method call to a wrapped module gets rerouted through the wrapper's `forward` method instead.

wrapper_moduleoriginal_modulemethodargsr   c                    ^ ^^^^ TR                   mS[        S[        S[        4UUUU U4S jjnUTl         T" U0 UD6nT R                  TT5        U$ )aj  Reroutes a method call through the `wrapper_module`'s `forward` method.

Args:
    wrapper_module: The module that has `original_module` wrapped.
    original_module: The module that was wrapped inside `wrapper_module`.
    method: The method that should be called on the `original_module` after inputs get
        redirected through the `wrapper_module`'s `forward` method.
    *args: The positional arguments to the `method`. They will get passed to a patched
        `forward` method instead.
    **kwargs: The keyword arguments to the `method`. They will get passed to a patched
        `forward` method instead.

_args_kwargsrE   c                  J   > TTl         T" U 0 UD6nTR                  TT5        U$ r   )forwardon_after_inner_forward)r   r   outr   original_forwardr   r)   r   s      r*   wrapped_forward5_ForwardRedirection.__call__.<locals>.wrapped_forward  s1     '7O#%+7+C''HJr,   )r   r   on_after_outer_forward)	r)   r   r   r   r   r   r   wrapper_outputr   s	   ````    @r*   __call___ForwardRedirection.__call__  s_      +22	C 	C 	C 	 	 #2'88##NODr,   rE   Nc                     g r   r6   r)   r   r   s      r*   r   *_ForwardRedirection.on_after_inner_forward      r,   c                     g r   r6   r   s      r*   r   *_ForwardRedirection.on_after_outer_forward  r   r,   r6   )r7   r8   r9   r:   r;   nnr   r   r   r   r   r   r?   r6   r,   r*   r   r     s      ii :<)) MU ^a mp DRYY QSQZQZ _c RYY QSQZQZ _c r,   r   gradient_checkpointing_kwargsc                 L   [        U 5      (       a  U R                  R                  5         OU R                  5         U=(       d    0 nSU;  =(       d    US   nU(       aE  [        U S5      (       a  U R	                  5         U $ S nU R                  5       R                  U5        U $ )z-Enables gradient checkpointing for the model.use_reentrantenable_input_require_gradsc                 &    UR                  S5        g )NT)requires_grad_)ry   inputoutputs      r*   make_inputs_require_grad?enable_gradient_checkpointing.<locals>.make_inputs_require_grad  s    %%d+r,   )r   
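# Usage sketch for `_ForwardRedirection` (comments only; `wrapper` is a hypothetical
# FSDP/DeepSpeed-style wrapper whose `forward` delegates to `wrapper.module`). Calling
# through the wrapper keeps its parameter-gathering hooks in the loop:
#
#   redirect = _ForwardRedirection()
#   out = redirect(wrapper, wrapper.module, wrapper.module.generate, input_ids)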
def enable_gradient_checkpointing(
    model: PreTrainedModel, gradient_checkpointing_kwargs: Optional[dict]
) -> PreTrainedModel:
    """Enables gradient checkpointing for the model."""
    # For PEFT models, enable gradient checkpointing on the base model
    if is_peft_model(model):
        model.base_model.gradient_checkpointing_enable()
    else:
        model.gradient_checkpointing_enable()

    gradient_checkpointing_kwargs = gradient_checkpointing_kwargs or {}
    # use_reentrant defaults to True unless explicitly disabled
    use_reentrant = (
        "use_reentrant" not in gradient_checkpointing_kwargs or gradient_checkpointing_kwargs["use_reentrant"]
    )

    if use_reentrant:
        if hasattr(model, "enable_input_require_grads"):
            model.enable_input_require_grads()
        else:

            def make_inputs_require_grad(module, input, output):
                output.requires_grad_(True)

            model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

    return model


def peft_module_casting_to_bf16(model):
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.LayerNorm) or "norm" in name:
            module = module.to(torch.float32)
        elif any(x in name for x in ["lm_head", "embed_tokens", "wte", "wpe"]):
            if hasattr(module, "weight"):
                if module.weight.dtype == torch.float32:
                    module = module.to(torch.bfloat16)


def prepare_peft_model(
    model: PreTrainedModel, peft_config: Optional["PeftConfig"], args: TrainingArguments
) -> PreTrainedModel:
    """Prepares a model for PEFT training."""
    if not is_peft_available():
        raise ImportError("PEFT is required to use a peft model. Run `pip install peft`.")

    # If the model is already a PeftModel and a new config is provided, merge and unload it first
    if isinstance(model, PeftModel) and peft_config is not None:
        model = model.merge_and_unload()

    # Handle quantized models (QLoRA)
    is_qlora = getattr(model, "is_loaded_in_4bit", False) or getattr(model, "is_loaded_in_8bit", False)

    is_sharded_qlora = False
    if getattr(model, "is_loaded_in_4bit", False):
        # Check if the quantized model is sharded (FSDP/DeepSpeed): its 4-bit params then live on cpu/meta
        for _, param in model.named_parameters():
            if param.__class__.__name__ == "Params4bit":
                is_sharded_qlora = param.data.device.type in {"cpu", "meta"}
                break

    if is_qlora and not is_sharded_qlora:
        model = prepare_model_for_kbit_training(
            model,
            use_gradient_checkpointing=args.gradient_checkpointing,
            gradient_checkpointing_kwargs=args.gradient_checkpointing_kwargs or {},
        )
        # Checkpointing is now handled by prepare_model_for_kbit_training
        args.gradient_checkpointing = False
    elif args.gradient_checkpointing:
        model = enable_gradient_checkpointing(model, args.gradient_checkpointing_kwargs)

    if peft_config is not None:
        if (
            version.parse(peft.__version__) >= version.parse("0.12")  # autocast_adapter_dtype introduced in 0.12
            and getattr(model, "is_loaded_in_4bit", False)
            and is_sharded_qlora
        ):
            model = get_peft_model(model, peft_config, autocast_adapter_dtype=False)
        else:
            model = get_peft_model(model, peft_config)

    if args.bf16 and getattr(model, "is_loaded_in_4bit", False) and not is_sharded_qlora:
        peft_module_casting_to_bf16(model)

    return model
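# Usage sketch for `prepare_peft_model` (comments only; the LoRA settings are
# arbitrary examples and assume `peft` is installed):
#
#   from peft import LoraConfig
#   from transformers import TrainingArguments
#
#   peft_config = LoraConfig(r=16, lora_alpha=32, task_type="CAUSAL_LM")
#   args = TrainingArguments(output_dir="out", gradient_checkpointing=True)
#   model = prepare_peft_model(model, peft_config, args)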