
    h                        S SK r S SKrS SKrS SKrS SKJrJr  S SKJr  S SK	J
r
  S SKJr  S SKJrJrJrJrJrJrJrJr  S SKJrJr  \
R2                  " \5      r\R8                  R;                  SS	5        \\" S
SS9S.r\ " S S\5      5       rS r SS\ RB                  4S jjr"\S:X  a(  \"" 5       r#\#RI                  SS9u  r%r&r'r(r)\ " \%\&\'\(5        gg)    N)	dataclassfield)Optional)logging)load_dataset)DatasetMixtureConfig
GRPOConfigGRPOTrainerModelConfigScriptArguments	TrlParserget_datasetget_peft_config)get_soft_overlong_punishmentthink_format_rewardTRACKIO_SPACE_IDztrl-trackioi      )max_completion_lensoft_punish_cache)r   r   c                   d    \ rS rSr% Sr\" SSS0S9r\\   \	S'   \" SSS0S9r
\\\      \	S	'   S
rg)GRPOScriptArguments;   a  
Script arguments for the GRPO training script.

Args:
    reward_model_name_or_path (`str` or `None`, *optional*, defaults to `None`):
        Reward model id of a pretrained model hosted inside a model repo on huggingface.co or local path to a
        directory containing model weights saved using [`~transformers.PreTrainedModel.save_pretrained`].
    reward_funcs (`list[str]` or `None`, *optional*, defaults to `None`):
        Reward functions to use. Supported values are:

            - `"think_format_reward"`
            - `"get_soft_overlong_punishment"` (used value are `max_completion_len=1280`, `soft_punish_cache=256`)
            - any dotted import path " (e.g., `'my_lib.rewards.custom_reward'`).
NhelpzReward model id of a pretrained model hosted inside a model repo on huggingface.co or local path to a directory containing model weights saved using `PreTrainedModel.save_pretrained`.)defaultmetadatareward_model_name_or_pathzReward functions to use. Supported values are: `think_format_reward`, `get_soft_overlong_punishment` (used value are `max_completion_len=1280`, `soft_punish_cache=256`), or any dotted import path (e.g., `'my_lib.rewards.custom_reward'`).reward_funcs )__name__
__module____qualname____firstlineno____doc__r   r   r   str__annotations__r   list__static_attributes__r       J/home/james-whalen/.local/lib/python3.13/site-packages/trl/scripts/grpo.pyr   r   ;   s]     05 p
0x}  ). O
)L(49% r(   r   c           
         / nU R                   (       a  UR                  U R                   5        U R                  (       a  U R                   H  nU[        ;   a  UR                  [        U   5        M'  SU;   a{  UR	                  SS5      u  pe[
        R                  R                  S[        R                  " 5       5        [        R                  " U5      n[        Xu5      nUR                  U5        M  [        SU S[        [        R                  5       5       S35      e   UR                   (       a'  U R"                  (       a  [$        R'                  S5        OUR                   (       a  U R"                  (       d  [)        U5      n	OWUR                   (       d;  U R"                  (       a*  [+        U R"                  U R,                  U R.                  S9n	O[        S	5      e[1        UR2                  UUW	U R4                     UR6                  S
:w  a  XR8                     OS [;        U5      S9n
U
R=                  5         U
R?                  UR@                  5        URB                  (       a  U
RC                  U R"                  S9  g g )N.   r   z Could not load reward function 'z'. Expected one of z or a valid import path.zBoth `datasets` and `dataset_name` are provided. The `datasets` argument will be used to load the dataset and `dataset_name` will be ignored.)name	streamingz5Either `datasets` or `dataset_name` must be provided.no)modelr   argstrain_dataseteval_datasetpeft_config)dataset_name)"r   appendr   reward_funcs_registryrsplitsyspathinsertosgetcwd	importlibimport_modulegetattr
ValueErrorr&   keysdatasetsr5   loggerwarningr   r   dataset_configdataset_streamingr
   model_name_or_pathdataset_train_spliteval_strategydataset_test_splitr   train
save_model
output_dirpush_to_hub)script_argstraining_args
model_argsdataset_argsr   	func_namemodule_pathmodulereward_funcdatasettrainers              r)   mainrZ   ]   s   L,,KAAB$11I11##$9)$DE	!)2)9)9#q)A&299;/"00=%f8##K0 6ykAT16689::RT  2  !9!9:	
 
		{'?'?l+""{'?'?$$;+E+EQ\QnQn
 PQQ ++!k==>@M@[@[_c@cW;;<im#J/G MMO }//0  )A)AB !r(   
subparsersc                 t    [         [        [        [        4nU b  U R	                  SSUS9nU$ [        U5      nU$ )NgrpozRun the GRPO training script)r   dataclass_types)r   r	   r   r   
add_parserr   )r[   r^   parsers      r)   make_parserra      sH    *JEYZO&&v4Rds&t M ?+Mr(   __main__T)return_remaining_strings)N)*argparser>   r<   r9   dataclassesr   r   typingr   
accelerater   rC   r   trlr   r	   r
   r   r   r   r   r   trl.rewardsr   r   
get_loggerr   rD   environ
setdefaultr7   r   rZ   _SubParsersActionra   r`   parse_args_and_configrP   rQ   rR   rS   _r   r(   r)   <module>rp      s   0   	 
 (   !	 	 	 J 
		H	% 

  (- 8 /$@TXlo$p  /  B5CpH66  z]F ?E>Z>Z!% ?[ ?;K
L! 	mZ> r(   