
    hv                        S r SSKrSSKrSSKJr  SSKJrJrJrJ	r	  SSK
JrJrJrJrJrJrJrJrJrJrJr  SSKJr  \R0                  R3                  SS5        \\\S	.r\S
:X  Ga  \" \\\45      r\R;                  5       u  rrr SS0\l!        \ RD                  S;   a  \ RD                  O\#" \\ RD                  5      r"\$" \ RJ                  \ RL                  \"\RN                  (       a  SOSS9r(\" \ 5      r)\)b  \" 5       \(S'   \)\(S'   \RT                  " \ RV                  4S\ RX                  0\(D6r-\RT                  " \ RV                  4S\ RX                  0\(D6r.\R^                  b+  \RT                  " \R^                  4S\ RX                  S.\(D6r0OSr0\Rb                  b  \\Rb                     r2\2" 5       r1OSr1\RT                  " \ RV                  S\ RX                  S9r3\3Rh                  c  \3Rj                  \3l4        \3Rl                  c  \\3l6        \" \Rn                  \Rp                  S9r9\" \-\.\0\1\\9\Rt                     \Rv                  S:w  a  \9\Rx                     OS\3S9r=\Rv                  S:w  a5  \	" \R|                  S\R~                  S9r@\" \=\@SS9rA\=R                  \A5        \=R                  5         \=R                  \R                  5        \R                  (       a  \=R                  \Rn                  S9  ggg)a  
Usage:

python examples/scripts/xpo.py     --model_name_or_path trl-lib/pythia-1b-deduped-tldr-sft      --reward_model_path trl-lib/pythia-1b-deduped-tldr-rm     --dataset_name trl-lib/tldr     --learning_rate 5.0e-7     --output_dir pythia-1b-tldr-xpo     --per_device_train_batch_size 4     --gradient_accumulation_steps 32     --num_train_epochs 3     --max_new_tokens 64     --warmup_ratio 0.1     --missing_eos_penalty 1.0     --push_to_hub
    N)load_dataset)AutoModelForCausalLM"AutoModelForSequenceClassificationAutoTokenizerGenerationConfig)HfPairwiseJudgeLogCompletionsCallbackModelConfigOpenAIPairwiseJudgePairRMJudgeScriptArguments	TrlParser	XPOConfig
XPOTrainerget_kbit_device_mapget_quantization_config)SIMPLE_CHAT_TEMPLATETRACKIO_SPACE_IDztrl-trackio)pair_rmopenaihf__main__use_reentrantT)autoNF)revisionattn_implementationdtype	use_cache
device_mapquantization_configtrust_remote_code   )
num_labelsr!   left)padding_sider!   )nameno)model	ref_modelreward_funcsjudgeargstrain_dataseteval_datasetprocessing_class)max_new_tokens	do_sampletemperature   )num_prompts)dataset_name)G__doc__ostorchdatasetsr   transformersr   r   r   r   trlr   r	   r
   r   r   r   r   r   r   r   r   trl.trainer.utilsr   environ
setdefaultJUDGES__name__parserparse_args_and_configscript_argstraining_args
model_argsgradient_checkpointing_kwargsr   getattrdictmodel_revisionr   gradient_checkpointingmodel_kwargsr    from_pretrainedmodel_name_or_pathr!   r(   r)   reward_model_pathreward_modelr+   	judge_cls	tokenizer	pad_token	eos_tokenchat_templater5   dataset_configdatasetdataset_train_spliteval_strategydataset_test_splittrainerr0   r2   generation_configcompletions_callbackadd_callbacktrain
save_model
output_dirpush_to_hub     N/home/james-whalen/.local/lib/python3.13/site-packages/examples/scripts/xpo.py<module>re      s7  .$ 
  ! r r    3 

  (- 8 !,?	W zK@AF-3-I-I-K*K
3BD2IM/ * 0 0N BJPUWaWgWgHhE**&::(??%T	L 2*=&%8%:\".A*+ 00%%9C9U9UYeE %44%%9C9U9UYeI &&29II++
(::
 	
 &=../	--%%FjNjNjI "'11	&"6	;33+:T:TUG!k==>@M@[@[_c@cW[;;<im"	G ""d*,(774UbUnUn
  6g?P^_`12MMO }//0  )A)AB !S rc   