
    oi                        / S Qr SSKrSSKrSSKrSSKrSSKJr  SSKJrJ	r	  SSK
Jr  SSKJr  SSKJr  SSKrSS	KJrJrJrJrJrJr  SSKrS
SKJr  \" \5      r\" \5      r\" \5      r \" \5      r!\" \5      r"\" \5      r#SSSSSS.r$S r%\S   RM                  \%5        S r'\S   RM                  \'5        S r(\S   RM                  \(5        S r)\S   RM                  \)5        S r*\S   RM                  \*5        S r+\S   RM                  \+5        S r,\S   RM                  \,5        S r-\S   RM                  \-5        S r.\S   RM                  \.5        S r/\S   RM                  \/5        S r0\S   RM                  \05        \S   r1\S   r2\S   r3\S    r4\S!   r5\ S   RM                  \Rl                  " \15      5        \ S   RM                  \Rl                  " \35      5        \ S   RM                  \Rl                  " \45      5        \ S   RM                  \25        \ S   RM                  \Rl                  " \55      5        \ S   RM                  \Rl                  " \5      5        S" r7\S   RM                  \75        S# r8\!S   RM                  \85        S$ r9\"S   RM                  \95        S% r:\#S&   RM                  \:5        g)')RL_EXTRA_ARGSRL_FUNCTIONSRL_PRE_ITEMSRL_CONFIG_CHANGESRL_METRICS_CHANGES    N)defaultdict)RL_REPLACEMENTSleft_pack_padding)Version)version)logger   )is_hipget_device_typeDEVICE_TYPEDEVICE_TYPE_TORCHDEVICE_COUNTALLOW_PREQUANTIZED_MODELS   )#_get_inference_mode_context_managerTF)epilogue_fusionmax_autotuneshape_paddingztrace.enabledztriton.cudagraphsc                 $    SU ;   a
  SU ;   a  SnU$ g)Nmodeltrain_dataseta  IGNORED_TOKENIZER_NAMES = os.environ.get('UNSLOTH_IGNORED_TOKENIZER_NAMES', '').split('\n')
from unsloth_zoo.tokenizer_utils import fix_untrained_tokens
from unsloth_zoo.training_utils  import fix_zero_training_loss
if 'tokenizer' not in locals(): tokenizer = processing_class
fix_untrained_tokens(model, tokenizer, train_dataset, IGNORED_TOKENIZER_NAMES, eps = 1e-16)
fix_zero_training_loss(model, tokenizer, train_dataset)
  )	call_args
extra_argsfix_tokenizers      X/home/james-whalen/.local/lib/python3.13/site-packages/unsloth/models/rl_replacements.py sft_trainer_fix_untrained_tokensr#   =   s'    )9 <H 	     sft_trainerc                 $    SU ;   a
  SU ;   a  SnU$ g)Nr   r   a  if hasattr(train_dataset, 'column_names'):
    column_names = set(train_dataset.column_names)
    check = ['chosen', 'rejected', 'prompt', 'chosen_input_ids', 'chosen_attention_mask',
             'chosen_labels', 'rejected_input_ids', 'rejected_attention_mask', 'rejected_labels',
             'prompt_input_ids', 'prompt_attention_mask']
    if all(x in column_names for x in check):
        train_dataset = train_dataset.remove_columns(['chosen', 'rejected', 'prompt'])
    del check, column_names
r   r   )r   r    fix_dpos      r"   dpo_trainer_fix_columnsr(   O   s%    )9 <, 	 r$   dpo_trainerc                 |   U S:w  a  U S:w  a  U$ [         R                  " SS 5      nUb  [        R                  " U5      R                  R                  5       nSR                  U5      n[        R                  " SU-   S-   U[        R                  [        R                  -  S9nU(       aS  [        R                  " U5      nUR                  S5      nSR                  S	 U 5       5      nUR                  S
S5      nU$ SnUR                  S5      nSR                  S U 5       5      nUR                  5       S-   n[        R                  " SU -   S-   U[        R                  [        R                  -  S9n[!        U5      S:w  a  US   nUR                  XfU-   5      nSn[        R"                  " SSU S3U5      nU$ )N_prepare_non_packed_dataloader_prepare_datasetsft_prepare_datasetz.*?z![\s]{0,}def _prepare_dataset\(.*?z.*?\)flags
c              3   ,   #    U  H
  nS U-   v   M     g7f)z    Nr   .0xs     r"   	<genexpr>.sft_trainer_prepare_dataset.<locals>.<genexpr>w   s      ?h1h   zdef sft_prepare_datasetzdef _prepare_datasetaI  if 'skip_prepare_dataset' in locals() and skip_prepare_dataset:
    return dataset
if 'tokenizer'          not in locals(): tokenizer = processing_class
if 'formatting_func'    not in locals(): raise RuntimeError('Unsloth: Please file a bug report - `formatting_func` does not exist!')
if 'dataset_text_field' not in locals() and 'args' in locals(): dataset_text_field = args.dataset_text_field
if 'dataset_text_field' not in locals(): raise RuntimeError('Unsloth: Please file a bug report - `dataset_text_field` does not exist!')
test_text = dataset[0][dataset_text_field] if (formatting_func is None and dataset_text_field is not None) else formatting_func(dataset[0])[0]
chat_template = getattr(tokenizer, 'chat_template', None)
chat_template = '' if chat_template is None else chat_template
has_bos_token_already = (test_text.startswith(tokenizer.bos_token) or tokenizer.bos_token in chat_template) if getattr(tokenizer, 'bos_token', None) is not None else False
if 'add_special_tokens' not in locals() and has_bos_token_already:
    from functools import partial
    tokenizer_call = tokenizer.__call__
    tokenizer.__call__ = partial(tokenizer_call, add_special_tokens = False)
    processing_class = tokenizer
else:
    tokenizer_call = None
    add_special_tokens = False if has_bos_token_already else locals().get('add_special_tokens', False)
c              3   ,   #    U  H
  nS U-   v   M     g7f)z        Nr   r2   s     r"   r5   r6      s     ;
17Q;
r7   def z\(.*?\).*?\:\nr   zCif tokenizer_call is not None: tokenizer.__call__ = tokenizer_call
z \n([ ]{4,})(return .*?[\s]{0,})$z\1z\1\2)r	   getinspect	signature
parameterskeysjoinrematch	MULTILINEDOTALL	getsourcesplitreplacerstripfindalllensub)function_namefunctionfast_sft_prepare_datasetparamsmatched
check_textreplacerreturn_states           r"   sft_trainer_prepare_datasetrS   c   s   99//.223H$O+""#;<GGLLNF#((069HDLL299,

 (()ABH~~d+Hyy ?h ??H'')+AH O	s , !!$'J;
;;J""$t+J zz-"33ryy(H
 8}A;##H.CD 	O  vv+l^4 H
 Or$   c                 N    U S:w  a  U$  SS jn[         R                  " U5      nU$ )Ncompute_lossc                 :    [        5       R                  UUUUS9nU$ )N)return_outputsnum_items_in_batch)superrU   )selfr   inputsrW   rX   outputss         r"   rU   .sft_trainer_compute_loss.<locals>.compute_loss   s.     '&&+!3	 ' 
 r$   FNr;   rD   rK   rL   rU   s      r"   sft_trainer_compute_lossra      s2    & KO	   .HOr$   c                 ^    U S:w  a  U$ UR                  SS5      nUR                  SS5      nU$ )N_prepare_inputszwith torch.inference_mode():a<  with torch.inference_mode(), torch.amp.autocast(device_type = 'cuda', dtype = ((torch.float16 if os.environ.get('ACCELERATE_MIXED_PRECISION', 'fp16') == 'fp16' else torch.bfloat16) if not torch.is_autocast_enabled('cuda') else nullcontext())if os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '0' else torch.float16):z)self.accelerator.unwrap_model(self.model)zDself.accelerator.unwrap_model(self.model, keep_fp32_wrapper = False))rF   rK   rL   s     r"   grpo_trainer__prepare_inputsre      sJ    )) &	VH 3NH Or$   grpo_trainerc                 F    U S:w  a  U$ [         R                  " SSU5      nU$ )N_generate_single_turnz:self\.llm\.collective_rpc\(\s*(['\"])reload_weights\1\s*\)r   )r@   rJ   rd   s     r"   "grpo_trainer__generate_single_turnri      s0    //
 vvE
H
 Or$   c                   ^ U S:w  a  U$ UR                  SS5      nSnSnUR                  X#5      n[        R                  " S[        R                  5      nSnUR	                  XQ5      u  p[        R                  " S[        R
                  [        R                  -  5      nS	nUR	                  XQ5      u  p[        R                  " S
U[        R
                  [        R                  -  S9n[        U5      S:w  ap  US   u  nm[        R                  " SSU5      n	U	R                  S5      n
[        U4S jU
 5       5      S:X  a&  [        T5      S:  a  ST S3nUR                  X5      nSnSnUR                  X5      nSnSnUR                  X5      nSnSnUR                  X5      nSU;  a  [        R                  " S[        R                  5      n[        UR                  U5      5      nUn[        U5       H  nUR                  S5      n[        R                  " SU5      nU(       a  UR                  S5      OSnU SU SU S U S!U SU SU S"3nUS UR!                  5        U-   UUR#                  5       S  -   nM     UnU$ )#N_generate_and_score_completionszHprompt_ids, skip_special_tokens=True, clean_up_tokenization_spaces=FalsezIprompt_ids, skip_special_tokens=False, clean_up_tokenization_spaces=Falsezobatch_size = self.args.per_device_train_batch_size if mode == "train" else self.args.per_device_eval_batch_sizea  
        max_left_pad = None
        batch_size = self.args.per_device_train_batch_size if mode == "train" else self.args.per_device_eval_batch_size
        try:
            # TRL 0.23.1 and below path
            if not has_images:
                # Left pad prompt before calculation old and ref hidden states
                left_pad_tokens_per_prompt = calculate_pad_tokens_in_prompt(prompt_completion_ids, logits_to_keep, self.processing_class.pad_token_id)
                max_left_pad = torch.max(left_pad_tokens_per_prompt).item()
        except:
            # TRL 0.24.0 and below path
            if images is None:
                # Left pad prompt before calculation old and ref hidden states
                left_pad_tokens_per_prompt = calculate_pad_tokens_in_prompt(prompt_completion_ids, logits_to_keep, self.processing_class.pad_token_id)
                max_left_pad = torch.max(left_pad_tokens_per_prompt).item()
        self.model.for_training()z^\s*if self\.args\.gradient_accumulation_steps % generate_every != 0 or \(\s*self\.use_vllm and self\.vllm_importance_sampling_correction\s*\):z        
            if self.args.gradient_accumulation_steps % generate_every != 0 or (
                self.use_vllm
            ):z=(^\s*)all_logprobs = \[.*?for output in outputs\.outputs\s*\]z\1from trl.scripts.vllm_serve import sanitize_logprob\n\1all_logprobs = [\n\1    [sanitize_logprob(next(iter(logprob.values()))) for logprob in output.logprobs]\n\1    for outputs in all_outputs\n\1    for output in outputs.outputs\n\1]zK\n(([ ]{8,})if self\.max_prompt_length is not None:.*?\2if self\.use_vllm:)r.   r   z\#[^\n]{1,}r   r0   c              3   \   >#    U  H!  n[         R                  " T S 3U5      SLv   M#     g7f)z[^\s]N)r@   rA   )r3   r4   spacings     r"   r5   ?grpo_trainer__generate_and_score_completions.<locals>.<genexpr>D  s(     LVWIU+Q/t;Vs   ),r      a'  if self.max_prompt_length is not None:
            # If max_prompt_length is set, we trim the prompt to keep only the last `max_prompt_length` tokens.
            # Then we decode those tokens back into text. We manually remove leading pad tokens from the decoded text,
            # because we can't use `skip_special_tokens=True` (some special tokens are still needed for generation).
            protected = [self.image_token_id, self.vision_start_token_id, self.vision_end_token_id]
            protected = [token for token in protected if token is not None]
            prompt_ids, prompt_mask = truncate_with_protected_tokens(
                prompt_ids, prompt_mask, self.max_prompt_length, protected
            )

            prompts_text = [re.sub(rf"^({re.escape(self.pad_token)})+", "", text) for text in prompts_text]

            # The chat template inserts a single image token into the prompt text. However, when this text is later
            # tokenized, the single image token string is expanded into multiple image token IDs, depending on the
            # image size. Since we're detokenizing here, we may see repeated image tokens in the decoded text. We
            # collapse them back into a single token string to match the original template.
            if self.image_token is not None:
                prompts_text = [
                    re.sub(rf"({re.escape(self.image_token)})+", self.image_token, text) for text in prompts_text
                ]
        # Generate completions using either vLLM or regular generation
        if self.use_vllm:z>if self.use_vllm and self.vllm_importance_sampling_correction:zHif False and self.use_vllm and self.vllm_importance_sampling_correction:zk        if "image_sizes" in prompt_inputs:
            output["image_sizes"] = prompt_inputs["image_sizes"]a          if "image_sizes" in prompt_inputs:
            output["image_sizes"] = prompt_inputs["image_sizes"]
        if max_left_pad is not None:
            output["max_left_pad"] = torch.tensor(prompt_ids.shape[0] * [max_left_pad]).unsqueeze(-1)        
        try:
            if self.use_vllm and getattr(self, "vllm_importance_sampling_correction", False):
                output["sampling_per_token_logps"] = sampling_per_token_logps
        except NameError:
            output["sampling_per_token_logps"] = NonezL        if images is not None:
            output["num_images"] = num_imagesa          if images is not None:
            output["num_images"] = num_images
        if max_left_pad is not None:
            output["max_left_pad"] = torch.tensor(prompt_ids.shape[0] * [max_left_pad]).unsqueeze(-1)        
        try:
            if self.use_vllm and getattr(self, "vllm_importance_sampling_correction", False):
                output["sampling_per_token_logps"] = sampling_per_token_logps
        except NameError:
            output["sampling_per_token_logps"] = Nonez	wake_up()z.*self\.llm\.generate\(.*\).*z(\s*)r   zif hasattr(self, 'llm'):
zZ    if getattr(self.llm.llm_engine.vllm_config.model_config, 'enable_sleep_mode', False):
z        self.llm.wake_up()
z

z=        self.llm.sleep(os.environ.get('VLLM_SLEEP_MODE', 1))
)rF   r@   compilerB   subnrC   rH   rI   rJ   rE   sumlistfinditerreversedgrouprA   startend)rK   rL   line_to_replacereplacement_linespattern_to_findreplacement_textnum_replacementsfoundreplace_partremoved_commentssplitsnew_replacementstring_to_findreplacement_stringpatternmatchespatchedrA   lineindent_matchindentwrappedrm   s                         @r"   ,grpo_trainer__generate_and_score_completionsr      s   99 RSH HO%" CHjj	 		O
 "1!5!56F!QHjj	 			BLL O	  "1!5!56F!QH JJ	!		BLL(	E 5zQ %ag66."lC!''-LVLLPQQG!"$WI . O,  ''FH
 VN 	S  CHDN9 CH1N9 CH(" **=r||Lw''12 g&E;;q>D88Hd3L.:\''*F (4(u(6&(4(u(XZ  o.87599;=;QQG '" Or$   c                 j   UR                  S5      nUS-  S:w  a  U$ US-  nSn[        R                  " U5      R                  5       n[        R                  " X2S-  5      nSU S3nSnUR                  XCR                  SS	5      5      n [        R                  " S
UR                  SS5      U5      nU$ )Nr9      r   a[  
        _chat_template_ = getattr(self.processing_class, "chat_template", None)
        if _chat_template_ is None: _chat_template_ = ""
        _supported_keys_ = set(("prompt", "chosen", "rejected", "completion", "messages", "label"))

        prompts_text = []
        for _example_ in __INPUTS__REPLACEMENT__:
            _tokenizer_kwargs_ = {}
            if type(_example_) is not dict:
                _example_ = {"prompt": _example_}
            _left_keys_ = _example_.keys() - _supported_keys_
            for k in _left_keys_:
                if k in _chat_template_:
                    v = _example_[k]
                    if type(v) is str:
                        _tokenizer_kwargs_[k] = v
            _x_ = maybe_apply_chat_template(_example_, self.processing_class, **_tokenizer_kwargs_)["prompt"]
            prompts_text.append(_x_)
     r0   zjprompts_text = [maybe_apply_chat_template(example, self.processing_class)["prompt"] for example in inputs]__INPUTS__REPLACEMENT__r[   zprompts_text = \[[\s]{0,}maybe_apply_chat_template\(\{[\"\']prompt[\"\'][\s]{0,}\:[\s]{0,}prompt[\s]{0,}\}[\s]{0,}\,[\s]{0,}self\.processing_class\)\[[\"\']prompt[\"\']\] for prompt in prompts[\s]{0,}\]prompts)findtextwrapdedentstripr   rF   r@   rJ   )rK   rL   spacesreplacementwhats        r"   *grpo_trainer_fix_maybe_apply_chat_templater     s    ]]6"FzQ
aKFK& //+.446K//+|<K{m2&KwD!!";XFH	 vv	 	5yA	H Or$   c                 H    U S:w  a  U$ S n[         R                  " U5      nU$ )N_move_model_to_vllmc                     g Nr   )rZ   argskwargss      r"   r   =grpo_trainer__move_model_to_vllm.<locals>._move_model_to_vllm  s    r$   r_   )rK   rL   r   s      r"    grpo_trainer__move_model_to_vllmr     s-    --   !45HOr$   c                 N    U S:w  a  U$  SS jn[         R                  " U5      nU$ )N_get_per_token_logpsc                      g r   )hasattrosenvironr:   torchfloat16bfloat16_autocast_dtypeampautocastr   logits)rZ   r   	input_idsattention_masklogits_to_keepcompute_efficientr   s          r"   r   ?grpo_trainer__get_per_token_logps.<locals>._get_per_token_logps  s
     r$   )Fr_   )rK   rL   r   s      r"   !grpo_trainer__get_per_token_logpsr     s5    .. UZT   !56HOr$   c                 R    U S:w  a  U$    SS jn[         R                  " U5      nU$ )N"_get_per_token_logps_and_entropiesc                    U(       a  g[        U S5      (       d  [        R                  R                  SS5      S:X  a  [        R
                  O[        R                  U l        [        R                  R                  SS5      S:X  a  [        R
                  U l        U	R                  SS 5      U	R                  S	S 5      pU	R                  S
S 5      U	R                  SS 5      pU R                  R                  USS9nU R                  R                  5       R                  nU R                  [        R
                  [        R                  4;   a  SOSnUR                  S   nUR                  S   nUR                  S   nUR                  S   nU R                  R                  c-  [!        UUUUUU R                  R"                  5      u  nnUU-  nOSU R                  R                  nU R                  R"                  c  [%        SUS-  5      nOU R                  R"                  n/ nU
c  ['        X$U R(                  R*                  5      n[        R$                  " U5      R-                  5       n[/        X R(                  R*                  5      nX R(                  R*                  :g  nUR1                  UR2                  5      nOSn[        R4                  " UUSS9nS nSS KnUR                  S   nUR9                  UU-  5      n/ n/ n/ n/ n / n!Sn"[;        SUU5       H  n#U#U-   n$UR=                  UU#U$ 5        UR=                  UU#U$ 5        Ub  U
b  UU#U$ n%U R=                  U%5        U%R?                  SS9RA                  5       R-                  5       n&U"n'U"U&-   n(UR=                  U
U'U( 5        Ub  U!R=                  UU'U( 5        OU!R=                  S 5        U(n"M  UR=                  S 5        U R=                  S 5        U!R=                  S 5        M     Ub3  [C        U[        RD                  5      (       d  U V)s/ s H  n)U)/PM     n*n)O	U" UU5      n*U RF                  n+[I        URJ                  SS5      n,U,c  Sn,[I        URJ                  SS5      n-U-c  Sn-[I        URJ                  SS5      n.U.c  Sn.[M        UUUU U!U*5      n/S[        R                  S'   [O        U5         U/ GH	  u  n0n1n2n3n4n5[        RP                  RS                  SU R                  S9   U
cH  U" U0U1U2U3U4U5S9RT                  n6U0S S 2UU-   * S 24   n7U6S S 2UU-   S-   * S 2S S 24   n6U6S S 2S S2S S 24   n6O3U" U0U1U2U3U4U5US-   S9RT                  n6U6S S 2S S2S S 24   n6U0S S 2U* S 24   n7[W        U6UU7U0R                  S   U-  U-U.U,U+S 9n8S S S 5        [        RX                  R[                  5         UR=                  W85        GM     [        R\                  " USS9n9S n:S S S 5        S[        R                  S'   W9R_                  5       W:4$ s  sn)f ! , (       d  f       N= f! , (       d  f       NI= f)!N)NNr   ACCELERATE_MIXED_PRECISIONfp16UNSLOTH_FORCE_FLOAT3201pixel_valuesimage_grid_thwpixel_attention_maskimage_sizesF)keep_fp32_wrapper       r   r   r   i   chunksdimc                 >    U c  S /U-  $ [         R                  " XSS9$ )Nr   r   )r   chunk)tensorr   s     r"   chunk_optionalsgrpo_trainer__get_per_token_logps_and_entropies.<locals>._get_per_token_logps_and_entropies.<locals>.chunk_optional~  s$    > 6F?*{{6!DDr$   r   final_logit_softcappinglogit_scalelogits_scalingUNSLOTH_RETURN_HIDDEN_STATEScuda)device_typedtype)r   r   r   r   r   r   )r   r   r   r   r   r   r   )r   logit_scale_multiplylogit_scale_dividelogit_softcappingtemperature)0r   r   r   r:   r   r   r   r   acceleratorunwrap_modelr   get_output_embeddingsweightshaper   unsloth_grpo_mini_batchautotune_batch_and_chunksunsloth_logit_chunk_multipliermaxcalculate_pad_tokens_in_promptprocessing_classpad_token_iditemr
   tor   r   mathceilrangeappendprodrr   
isinstanceTensorr   getattrconfigzipr   r   r   r   +chunked_hidden_states_selective_log_softmaxr   synchronizecatdetach);rZ   r   r   r   r   
batch_sizecompute_entropyr   r   r   r   r   r   r   unwrapped_modellm_headdtype_bytes
total_rowsseq_len
hidden_dim	vocab_dimB
multiplierall_logprobs_listleft_pad_tokens_per_promptmax_left_padattention_mask_chunksr   r   total_samplesinput_ids_chunkspixel_values_chunksimage_grid_thw_chunkspixel_attention_mask_chunkscurrent_pixel_idxrw   rx   
grid_slicebatch_pixel_countstart_pixel_idxend_pixel_idxsizeimage_sizes_chunksr   r   r   r   zipped_inputsinput_ids_chunkattention_mask_chunkpixel_values_chunkimage_grid_thw_chunkpixel_attention_mask_chunkimage_sizes_chunklogits_chunkcompletion_input_ids_chunklogprobs_chunklogprobs	entropiess;                                                              r"   r   [grpo_trainer__get_per_token_logps_and_entropies.<locals>._get_per_token_logps_and_entropies)  s    4!233 zz~~&BFKvU MM $
 ::>>"93?3F+0==D( 

>40

+T2 )
 

148

=$/ #.
 #..;;5 < O jj668??G **u}}enn.MMSU  #+Jooa(G q)Ja(Iyy008 9II<<!: !OII5599;;C!$Q4!8J!%!I!IJ "#-Kt/D/D/Q/Q.*  %yy)CDIIK-44AA	 "+.C.C.P.P!P!/!2!2>3G3G!H  %*KKRS$T!E
 %OOA.M=1#45J!$&!"$$&!*,' !q-<j( ''	%(<=%,,^E#-FG!-,2J!/c!:J)00<(2b(A(E(E(G(L(L(N%&7O$58I$IM'..$_]C ,73::0O 4::4@(5% (..t4)006/66t<? =B &z+u||/T/T9D%Etf"%E"%3K%C"**K '6OQR S ($%!#*5<<#J #+'($!(7G!K!)%&" %#%+"M :=BJJ564U; ##(&(.%++&,d6J6J ,  (/+:,;1E/A1E7Q.?, %f ) :I !^l%B#C#E E:6 ,8 !^l%BQ%F#G#I1 L,L ,83B3	+BL ,;,;1E/A1E7Q.?1?!1C, %f ) ,83B3	+BL9H !N?#3 3:6 *U(#6%4%:%:1%=
%J3G1C0A*5	*Md JJ**,%,,^<i #j !99%6a@ 	} <@ :=BJJ56??$i//w &FD  <;s,   8X46Y
BX94AY
9
YY


YNFFr_   )rK   rL   r   s      r"   /grpo_trainer__get_per_token_logps_and_entropiesr  $  s:    << !d0l   !CDHOr$   grpo_compute_lossgrpo_compute_loss_slowUnslothEfficientGRPOgrpo_accumulated_lossgrpo_update_SamplingParamsc                 N    U S:w  a  U$  SS jn[         R                  " U5      nU$ )NrU   c                 |  ^  U(       a  [        S5      eUS   US   peUS   US   pUR                  SS 5      UR                  SS 5      pUR                  SS 5      UR                  S	S 5      pUR                  S
S 5      nUR                  SS 5      nT R                  nT R                  R                  n[
        R                  " XW/SS9nUR                  u  nn[
        R                  " Xh/SS9nUR                  S5      nUnUn   S=U 4S jjnU" UUUUSS9nUR                  SS 5      nUS   nUR                  SS 5      nUS S 2U* S 24   n[        UR                  SS5      nUc  Sn[        UR                  SS5      nUc  Sn[        UR                  SS5      nUc  SnUR                  SS5      nUb  [        UUUUUT R                  U40 SU	_SU
_ST R                  R                  _ST R                  _ST R                   _ST R"                  _ST R                  R$                  _ST R                  R&                  _S T R                  R(                  _SU_S!U_S"U_S#U_S
U_S$U_S%U_SU_6u  n n!n"n#n$GO<[+        T R                  S5      (       a  [-        S>0 S&T _S'U_SU	_SU
_S(U_SU_SU_S)U_S*U_S+T R                  R.                  _ST R                  R                  _ST R                  _ST R                   _ST R"                  _ST R                  R$                  _ST R                  R&                  _S T R                  R(                  _SU_S!U_S"U_S#U_S,U_S
U_S$U_S%U_SU_6u  n n!n"n#n$OA[-        T UUUUUUT R                  R.                  T R                  R(                  UUUUS-9u  n n!n"S.T R0                  ;   a~  T R2                  R4                  (       a  S/OS.n%T R0                  U%   S0   R7                  U!R9                  5       5        T R0                  U%   S1   R7                  U"R9                  5       5        OXT R0                  S0   R7                  U!R9                  5       5        T R0                  S1   R7                  U"R9                  5       5        T R:                  (       Ga  W#Gb  [        T S2S5      (       Gay  U#R=                  5       S:  a  [
        R>                  " U#5      O([
        R@                  " S3T RB                  RD                  S49n&U#R=                  5       S:  a  [
        RF                  " U#5      O([
        R@                  " S3T RB                  RD                  S49n'T R0                  W%   S5   R7                  T R                  RI                  U&5      R?                  5       R9                  5       5        T R0                  U%   S6   R7                  T R                  RI                  U'5      RG                  5       R9                  5       5        W$R=                  5       S:  a  [
        RJ                  " U$5      O([
        R@                  " S3T RB                  RD                  S49n(U$R=                  5       S:  a  [
        R>                  " U$5      O([
        R@                  " S3T RB                  RD                  S49n)U$R=                  5       S:  a  [
        RF                  " U$5      O([
        R@                  " S3T RB                  RD                  S49n*T R0                  U%   S7   R7                  T R                  RI                  U(5      RM                  [O        S85      S99RK                  5       R9                  5       5        T R0                  U%   S:   R7                  T R                  RI                  U)5      RQ                  5       R9                  5       5        T R0                  U%   S;   R7                  T R                  RI                  U*5      RM                  [O        S<5      S99RG                  5       R9                  5       5        U $ )?Nz2The GRPOTrainer does not support returning outputs
prompt_idsprompt_maskcompletion_idscompletion_maskr   r   r   r   rX   sampling_per_token_logpsr   r   Fc           	         > [        TS5      (       a  TR                  XX#U5      $ TR                  U UUUUUU5      S   $ )Nr   r   )r   r   r   )r   r   r   r   r   r   r   rZ   s          r"   <lambda>Agrpo_trainer_compute_loss.<locals>.compute_loss.<locals>.<lambda>]  sd     t344 (,'@'@.BS( ( 88! 	(r$   T)r   ref_per_token_logps
advantagesold_per_token_logpsr   r   r   r   r  	loss_typeimportance_sampling_levelepsilon_lowepsilon_highmax_completion_lengthdeltar   r   r   r   #current_gradient_accumulation_stepsnum_processestrainerr   r   	old_logps	ref_logpsn_chunksr   )r9  r   r   r)  r/  r:  r;  r<  r   r   r   r   r   trainevalcompletion_lengthkl#vllm_importance_sampling_correctiong        )devicez&sampling/sampling_logp_difference/meanz%sampling/sampling_logp_difference/maxz&sampling/importance_sampling_ratio/mininf)nanz'sampling/importance_sampling_ratio/meanz&sampling/importance_sampling_ratio/maxz-infr  r   ))
ValueErrorr:   r7  r   r8  r   r   r   r  r   r   r   betar   r1  r2  r3  r4  r5  r6  r   r   r"  unsloth_num_chunks_metricscontrolshould_evaluater   r   use_vllmnumelmeanr   r   rB  r   gathermin
nan_to_numfloatnanmean)+rZ   r   r[   rW   rX   r&  r'  r(  r)  r   r   r   r   r*  r7  r8  r   bszqlenr   r   
_input_ids_logits_to_keepget_logps_funcper_token_logpsr;  r/  r:  r   r   r   r  lossr?  mean_klr6  flat_is_ratiomode
mean_delta	max_deltamin_importance_sampling_ratiomean_importance_sampling_ratiomax_importance_sampling_ratios+   `                                          r"   rU   /grpo_trainer_compute_loss.<locals>.compute_loss9  sU    QRR #)"6}8MK#$$% (
 JJ~t,JJ'. %
 JJ-t4JJ}d+ * $ZZ(<dC#)::.H$#O .2.V.V+((66IIz:!D	OO	TK#AK',,
 
( # % 	, )9nnRV
 JJ4d;	 L)
 JJ4d;	a.!112	 $ELL2KQO$ !&u||]AF'#$ $U\\3CQG%!"zz.!4&&##II $0 &4 !%		 3 3 150N0N #'"2"2 $(#4#4 -1II,K,K !IIOO  #'))"7"7!" $0#$ ):%& ,@'( *<)* *<+, ;^-. %2/0 0H1 CD#We]: tyy+..) "&$. (4 *8	
 *8 +: &0 %. %. $(99#?#? %)II$7$7 594R4R '+&6&6 (,'8'8 15		0O0O  !%		!" '+ii&;&;#$ (4%& ->'( 0D)* .@+, *8-. .@/0 ?b12 )634 4L5 G'%@ 4I" *%3&5!+ ) )#yy;;"&))"7"7(9+?);%340'  dmm#!\\996wDMM$ 34;;<M<R<R<TUMM$%,,W\\^<MM-.556G6L6L6NOMM$&&w||~6 MMM!CUKK ;;=1$ 

5!\\#

0A0AB  ;;=1$ 		% \\#

0A0AB 
 MM$ HIPP  ''
388:??A MM$ GHOO  ''	2668==? !&&(1, 		-(\\#

0A0AB * !&&(1, 

=)\\#

0A0AB + !&&(1, 		-(\\#

0A0AB *
 MM$ HIPP  ''(EF%,/	 MM$ IJQQ  ''(FGOOQVVX MM$ HIPP  ''(EF%-0	 r$   r^   r_   r`   s      r"   grpo_trainer_compute_lossrc  5  s4    & KOgR   .HOr$   c                 &    SU ;  a  gSU;  a  gSnU$ )Nz&divisible by the number of generationsr   num_generationsa  div = per_device_train_batch_size // num_generations
if div * num_generations != per_device_train_batch_size:
    print('Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.\nWe will change the batch size of ' + str(per_device_train_batch_size) + ' to the `num_generations` of ' + str(num_generations))
    per_device_train_batch_size = num_generations
r   )RLTrainer_sourceRLConfig_sourcecheck_batch_sizes      r"   grpo_trainer_fix_batch_sizeri  +  s+    /7GG/	>  r$   c                 ^    SU ;  a  gSU ;   nSU ;   nU(       d  SU ;   nOSnSU SU S	U S
3nU$ )Nreward_funcsr   zrewards/{reward_func_name}/meanzrewards/{reward_func_name}/stdzrewards/{reward_func_name}Fzif not isinstance(reward_funcs, list): _reward_funcs = [reward_funcs]
else: _reward_funcs = reward_funcs
for reward_func in _reward_funcs:
    try:
        reward_func_name = reward_func.__name__
        if zR:
            other_metrics.append(f'rewards/{reward_func_name}/mean')
        if zQ:
            other_metrics.append(f'rewards/{reward_func_name}/std')
        if zS:
            other_metrics.append(f'rewards/{reward_func_name}')
    except: pass
r   )rf  rg  use_meanuse_std
use_normallog_metricss         r"   grpo_trainer_metricsrp  @  sp    -- 14DDH.2BBG15EE

	
 Z  Y  \ "	  r$   c                     [         R                  R                  S5      c  g [        [	        S5      5      [        S5      :  a  g  SS KJs  Js  Jn   SS K	Js  Jn  [        R                  " U R                  5      n[         R"                  " U5      nUn[$        R&                  " SSU5      n[$        R&                  " SS	U5      nXC:X  a  [        R(                  " S
5        g 0 n[+        [-        USS5      U R.                  U5        US   nX`l        Xal        [        R                  " S5        g ! [         a9  n[        R                  " SU 35        [        R                  " S5         S nAg S nAff = f)Ntrlz0.26.0r   z'Unsloth: Failed to import trl openenv: uP   Unsloth: trl.experimental.openenv not available — skipping RL openenv patches.z+.*\.collective_rpc\("reload_weights"\).*\n?r   z\.wake_up\(tags=\[.*?\]\)z
.wake_up()z=Unsloth: Warning - regex did not match, patch may have failedz	<unsloth>execgenerate_rollout_completionsz9Unsloth: Patched trl openenv generate_rollout_completions)	importlibutil	find_specr   importlib_versiontrl.experimental.openenv.utilsexperimentalopenenvutilstrl.experimental.openenvImportErrorr   infor;   rD   rt  r   r   r@   rJ   warningrs  rp   __dict__)openenv_utilsr{  esrcoriginal_srclocal_nspatched_funcs          r"   openenv_vllm_reload_weightsr  `  s5    ~~&. '(78+<<>>22 

MFF
GC
//#
CL &&?S
IC &&-|S
ACVW Hk6	*M,B,BHM:;L 2>.+7(
KKKL=  =aSAB^	
 	s   D* *
E-4/E((E-r{  );__all__r   r@   r   r;   collectionsr   unsloth_zoo.rl_replacementsr	   r
   unsloth_zoo.utilsr   importlib.metadatar   rx  unsloth_zoo.logr   importlib.utilru  r   r   r   r   r   r   r   r   _utilsr   rs   r   r   r   r   r   RL_ADDITIONAL_FUNCTIONStorch_compile_optionsr#   r   r(   rS   ra   re   ri   r   r   r   r   r  r  r   r!  r"  r#  rD   rc  ri  rp  r  r   r$   r"   <module>r     s%   
 	   # J % ; "    7D!4 4 %  & %d+    m  # #$D E  m  # #$; <GT ]  " "#> ?
& ]  " "#; <( ^  # #$@ A ^  # #$F God ^  # #$P Q-` ^  # #$N O ^  # #$D E/d ^  # #$E F|~ ^  # #$S T#$78 ()AB &'=> '(?@ ,-IJ  ^  # #G$5$56G$H I ^  # #G$5$56J$K L ^  # #G$5$56K$L M ^  # #$: ; ^  # #G$5$56P$Q R ^  # #9:nb ^  # #$= >
" . ! ( ()D E: > " ) )*> ?1Mh 	 " ) )*E Fr$   