
    :i              
       >   S r SSKJr  SSKrSSKJr  SSKJr  SSKJrJ	r	J
r
JrJrJrJrJr  SSKJrJrJrJrJrJrJ
r
JrJrJrJrJrJrJrJrJrJrJrJrJ r J!r!J"r"JrJ#r#  SSK r SSK7  SSK$J%r%J&r&  SS	K'J(r(  SSKrSSK)r*SS
K+J,r,  SSKJr  SSK-J.r.J/r0  SSK1J2r2  SSK3r3SSK4J5r5  S r6 SSSSSS.r7\Rp                  " SS\7S9S 5       r9S\R                  S\:S\:S\R                  4S jr;S\R                  S\R                  S\:S\:S\R                  4
S jr<S\R                  S\:S\R                  4S jr=\% " S  S!\5      5       r>  " S" S#\5      r? " S$ S%\?5      r@ \A" \S&5      (       a3  SSKr " S' S(\R                  5      rC \R                  " \C" S)5      5        gg)*z;
2025.10.10
2025.10.9
4.56.2
0.23.0
__UNSLOTH_VERSIONING__
    )TensorN)
functional)AnyListOptionalTupleUnionDictSetCallable)Acceleratorr   r   
DDPOConfigDDPOStableDiffusionPipelineDDPOTrainerr   PathPerPromptStatTrackerProjectConfigurationPyTorchModelHubMixinr	   defaultdictfuturesgenerate_model_cardget_comet_experiment_urlis_wandb_availableloggerloggingosset_seedtextwraptorchwarnings)*)	dataclassfield)Version)nullcontext)DataCollatorForSeq2SeqDataCollatorForLanguageModeling)ParallelMode)
MethodTypec                 F   ^  [         R                  " T 5      U 4S j5       nU$ )Nc                 8  > [        U S5      (       a5  [        U R                  S5      (       a  U R                  R                  5         T" U /UQ70 UD6n[        U S5      (       a5  [        U R                  S5      (       a  U R                  R                  5         U$ )Nmodelfor_trainingfor_inference)hasattrr,   r-   r.   )selfargskwargsoutputfs       ?/home/james-whalen/unsloth_compiled_cache/UnslothDDPOTrainer.pywrapper*prepare_for_training_mode.<locals>.wrapper0   sx     4!!gdjj.&I&IJJ##%4)$)&)4!!gdjj/&J&JJJ$$&    )	functoolswraps)r4   r6   s   ` r5   prepare_for_training_moder;   /   s%    __Q  Nr8   TF)epilogue_fusionmax_autotuneshape_paddingztrace.enabledztriton.cudagraphs)dynamic	fullgraphoptionsc                 d   [         R                  " U R                  SU R                  S   5      SSS9n[         R                  " UR                  S5      SSS9n/ n[	        X#5       H  u  pVUR                  [         R                  5      n[         R                  " USUR                  S5      S9R                  S5      n[         R                  " USS9nXx-
  n	UR                  U	5        M      [         R                  " U5      nUR                  U R                  S   U R                  S   45      nU$ )N   r   )chunksdim)rF   indexrF      )r   chunkreshapeshapeziptofloat32gather	unsqueezesqueeze	logsumexpappendconcat)
logitsrG   chunked_logitschunked_indexall_per_token_logpschunk_logitschunk_indexselected_logitslogsumexp_valuesper_token_logpss
             r5   chunked_selective_log_softmaxr_   E   s    [[FLL4D!EPQYZ[N[[r!2QaHM%(%G!#u}}5,,|2{G\G\]_G`aiijlm ??<rB)<""?3 &H 	,,':;-55v||AUV6XYr8   	input_idslogits_to_keeppad_token_idreturnc                 ~    XR                   S   :  a  [        S5      eU SS2SU* 24   nX2:H  nUR                  SS9nU$ )zr
Given prompt tensor, it returns all the left padded tokens in that sequence. so [pad, pad, pad, cat] = 3 tokens 
rI   z8logits_to_keep must be smaller than the sequence length.NrH   )rL   
ValueErrorsum)r`   ra   rb   prompt_sectionpadding_maskpad_token_countss         r5   calculate_pad_tokens_in_promptrj   W   sX     ++STTq"2N?"223N"2L#''A'.r8   completion_input_idsleft_pad_tokens_per_promptmax_left_padc                     U R                   u  pEU R                  nX!-
  n[        R                  " XVS9R	                  S5      nXR	                  S5      :  n	X:g  n
X-  nU$ )a)  
Given that we have a sequence, [p,p,p,c,c,c,pad,pad,pad]

Where p are extra prompt tokens we got from slicing the torch tensor, c is completion tokens
and pad are pad tokens, this function would make a completion mask that would 0 out the pad
and p tokens. so in this example [0,0,0,1,1,1,0,0,0]
devicer   rI   )rL   rp   r   arangerQ   )rk   rl   rm   rb   
batch_sizecompletion_lenrp   num_tokens_to_maskindices
shift_masknon_padding_mask
final_masks               r5    create_completion_attention_maskry   j   si     "6!;!;J!((F%Bll>9CCAFG88;;J,<.Jr8   tensorpad_idc                 l    X:g  n[         R                  " USSSS9n[         R                  " U SU5      nU$ )zD
Moves all padding tokens in each sequence of a batch to the right.
rI   T)rF   
descendingstable)r   argsortrP   )rz   r{   masksorted_indicespacked_tensors        r5   left_pack_paddingr      s8     D]]4Q4MNLLN;Mr8   c                      ^  \ rS rSr% Sr\" SSS0S9r\\   \	S'   \" SSS	0S9r
\\   \	S
'                                           SU 4S jjrSrU =r$ )UnslothDDPOConfig   aj  
    
Configuration class for the [`DDPOTrainer`].

Using [`~transformers.HfArgumentParser`] we can turn this class into
[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
command line.

Parameters:
    exp_name (`str`, *optional*, defaults to `os.path.basename(sys.argv[0])[: -len(".py")]`):
        Name of this experiment (by default is the file name without the extension name).
    run_name (`str`, *optional*, defaults to `""`):
        Name of this run.
    seed (`int`, *optional*, defaults to `0`):
        Random seed.
    log_with (`Literal["wandb", "tensorboard"]]` or `None`, *optional*, defaults to `None`):
        Log with either 'wandb' or 'tensorboard', check
        https://huggingface.co/docs/accelerate/usage_guides/tracking for more details.
    tracker_kwargs (`Dict`, *optional*, defaults to `{}`):
        Keyword arguments for the tracker (e.g. wandb_project).
    accelerator_kwargs (`Dict`, *optional*, defaults to `{}`):
        Keyword arguments for the accelerator.
    project_kwargs (`Dict`, *optional*, defaults to `{}`):
        Keyword arguments for the accelerator project config (e.g. `logging_dir`).
    tracker_project_name (`str`, *optional*, defaults to `"trl"`):
        Name of project to use for tracking.
    logdir (`str`, *optional*, defaults to `"logs"`):
        Top-level logging directory for checkpoint saving.
    num_epochs (`int`, *optional*, defaults to `100`):
        Number of epochs to train.
    save_freq (`int`, *optional*, defaults to `1`):
        Number of epochs between saving model checkpoints.
    num_checkpoint_limit (`int`, *optional*, defaults to `5`):
        Number of checkpoints to keep before overwriting old ones.
    mixed_precision (`str`, *optional*, defaults to `"fp16"`):
        Mixed precision training.
    allow_tf32 (`bool`, *optional*, defaults to `True`):
        Allow `tf32` on Ampere GPUs.
    resume_from (`str`, *optional*, defaults to `""`):
        Resume training from a checkpoint.
    sample_num_steps (`int`, *optional*, defaults to `50`):
        Number of sampler inference steps.
    sample_eta (`float`, *optional*, defaults to `1.0`):
        Eta parameter for the DDIM sampler.
    sample_guidance_scale (`float`, *optional*, defaults to `5.0`):
        Classifier-free guidance weight.
    sample_batch_size (`int`, *optional*, defaults to `1`):
        Batch size (per GPU) to use for sampling.
    sample_num_batches_per_epoch (`int`, *optional*, defaults to `2`):
        Number of batches to sample per epoch.
    train_batch_size (`int`, *optional*, defaults to `1`):
        Batch size (per GPU) to use for training.
    train_use_8bit_adam (`bool`, *optional*, defaults to `False`):
        Use 8bit Adam optimizer from bitsandbytes.
    train_learning_rate (`float`, *optional*, defaults to `3e-4`):
        Learning rate.
    train_adam_beta1 (`float`, *optional*, defaults to `0.9`):
        Adam beta1.
    train_adam_beta2 (`float`, *optional*, defaults to `0.999`):
        Adam beta2.
    train_adam_weight_decay (`float`, *optional*, defaults to `1e-4`):
        Adam weight decay.
    train_adam_epsilon (`float`, *optional*, defaults to `1e-8`):
        Adam epsilon.
    train_gradient_accumulation_steps (`int`, *optional*, defaults to `1`):
        Number of gradient accumulation steps.
    train_max_grad_norm (`float`, *optional*, defaults to `1.0`):
        Maximum gradient norm for gradient clipping.
    train_num_inner_epochs (`int`, *optional*, defaults to `1`):
        Number of inner epochs per outer epoch.
    train_cfg (`bool`, *optional*, defaults to `True`):
        Whether to use classifier-free guidance during training.
    train_adv_clip_max (`float`, *optional*, defaults to `5.0`):
        Clip advantages to the range.
    train_clip_range (`float`, *optional*, defaults to `1e-4`):
        PPO clip range.
    train_timestep_fraction (`float`, *optional*, defaults to `1.0`):
        Fraction of timesteps to train on.
    per_prompt_stat_tracking (`bool`, *optional*, defaults to `False`):
        Whether to track statistics for each prompt separately.
    per_prompt_stat_tracking_buffer_size (`int`, *optional*, defaults to `16`):
        Number of reward values to store in the buffer for each prompt.
    per_prompt_stat_tracking_min_count (`int`, *optional*, defaults to `16`):
        Minimum number of reward values to store in the buffer.
    async_reward_computation (`bool`, *optional*, defaults to `False`):
        Whether to compute rewards asynchronously.
    max_workers (`int`, *optional*, defaults to `2`):
        Maximum number of workers to use for async reward computation.
    negative_prompts (`str`, *optional*, defaults to `""`):
        Comma-separated list of prompts to use as negative examples.
    push_to_hub (`bool`, *optional*, defaults to `False`):
        Whether to push the final model checkpoint to the Hub.

    NhelpzvLLM SamplingParams)defaultmetadatavllm_sampling_paramsrC   z8Chunk size to reduce memory usage. -1 is most efficient.unsloth_num_chunksc)                   > [         S:  a  [        S[          S35        [         S:  a  [        S[          S35        [        T*U ]  " S,0 SU_SU_SU_S	U_S
U_SU_SU_SU_SU	_SU
_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_S U_S!U_S"U_S#U_S$U_S%U _S&U!_S'U"_S(U#_S)U$_S*U%_S+U&_U)D6  U'U l        U(U l        g )-NgHz>z Unsloth: Your learning rate of `zi` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!rI   za` is way too larger > 1! Consider decreasing it to 1e-1, otherwise gradient updates will explode!exp_namerun_nameseedlog_withtracker_project_namelogdir
num_epochs	save_freqnum_checkpoint_limitmixed_precision
allow_tf32resume_fromsample_num_steps
sample_etasample_guidance_scalesample_batch_sizesample_num_batches_per_epochtrain_batch_sizetrain_use_8bit_adamtrain_learning_ratetrain_adam_beta1train_adam_beta2train_adam_weight_decaytrain_adam_epsilon!train_gradient_accumulation_stepstrain_max_grad_normtrain_num_inner_epochs	train_cfgtrain_adv_clip_maxtrain_clip_rangetrain_timestep_fractionper_prompt_stat_tracking$per_prompt_stat_tracking_buffer_size"per_prompt_stat_tracking_min_countasync_reward_computationmax_workersnegative_promptspush_to_hub )learning_rateprintsuper__init__r   r   )+r0   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r2   	__class__s+                                             r5   r   UnslothDDPOConfig.__init__   s   Z 4)I-  YB  (C  "D1e&F}o  Vw  %x  y &	0&	0&	0 &	0  	&	0
 $8&	0 &	0 $&	0 "&	0 $8&	0 .&	0 $&	0 &&	0  0&	0 $&	0 %:&	0  !2!&	0" ,H#&	0$  0%&	0& #6'&	0( #6)&	0*  0+&	0,  0-&	0. '>/&	00 "41&	02 1R3&	04 #65&	06 &<7&	08 "9&	0: "4;&	0<  0=&	0> '>?&	0@ (@A&	0B 4XC&	0D 2TE&	0F (@G&	0H &I&	0J  0K&	0L &M&	0N %9!"4r8   )r   r   )(train_eden_coder_small iO  Ntrllogsd   rI      fp16Tr   2         ?      @rI      rI   Fg-C6
?g?g+?g{Gz?:0yE>r   r   rI   Tr   g-C6?r   F   r   Fr   r   FNrC   )__name__
__module____qualname____firstlineno____doc__r#   r   r   r   __annotations__r   intr   __static_attributes____classcell__r   s   @r5   r   r      s    ]| +012+(3-  */VW*#  ,$   #'(## "&",-!!" !"%#(/1-/#(#SX5 X5r8   r   c                     ^  \ rS rSrSrSS/r S%S\S\\R                  \
\   \
\   /\R                  4   S\/ \
\\4   4   S	\S
\\\\\/\4      4
S jjrS&S jrS\S\4S jrS rS\R                  S\S\R                  4S jrS rS rS rS rS rS\
\\4   4S jrS%S\\   4S jjrS rU 4S jr   S'S \\   S!\\   S"\ \\!\   S4   4S# jjr"S$r#U =r$$ )(_UnslothDDPOTraineriS  a/  
The DDPOTrainer uses Deep Diffusion Policy Optimization to optimise diffusion models. Note, this trainer is heavily
inspired by the work here: https://github.com/kvablack/ddpo-pytorch As of now only Stable Diffusion based pipelines
are supported

Args:
    config ([`DDPOConfig`]):
        Configuration object for DDPOTrainer. Check the documentation of [`PPOConfig`] for more details.
    reward_function (`Callable[[torch.Tensor, tuple[str], tuple[Any]], torch.Tensor]`):
        Reward function to be used.
    prompt_function (`Callable[[], tuple[str, Any]]`): Function to generate prompts to guide model
    sd_pipeline ([`DDPOStableDiffusionPipeline`]): Stable Diffusion pipeline to be used for training.
    image_samples_hook (`Optional[Callable[[Any, Any, Any], Any]]`): Hook to be called to log images.
r   ddpoNconfigreward_functionprompt_functionsd_pipelineimage_samples_hookc           
         [         R                  " S[        5        Uc  [        R                  " S5        X0l        X l        Xl        XPl        [        S0 U R                  R                  D6nU R                  R                  (       Ga  [        R                  R                  [        R                  R                  U R                  R                  5      5      U R                  l        S[        R                  R!                  U R                  R                  5      ;  a  [#        [%        S [        R&                  " U R                  R                  5      5      5      n[)        U5      S:X  a"  [+        SU R                  R                   35      e[-        U Vs/ s H   n[/        UR1                  S5      S   5      PM"     sn5      n	[        R                  R3                  U R                  R                  SU	S    35      U R                  l        U	S   S	-   Ul        [/        U R                  R6                  U R                  R8                  -  5      U l        [=        SU R                  R>                  U R                  R@                  UU R                  RB                  U R:                  -  S
.U R                  RD                  D6U l#        U RI                  5       u  pU
(       d  [+        U5      eUR>                  S L=(       a    UR>                  S:H  nU RF                  RJ                  (       ao  U RF                  RM                  U R                  RN                  U(       d  [Q        URS                  5       S9OURS                  5       U R                  RT                  S9  [        RV                  " SU 35        [Y        U R                  RZ                  SS9  X@l.        U R\                  R_                  S	U RF                  R`                  (       + SSSS9  U RF                  R@                  S:X  a  [b        Rd                  nO;U RF                  R@                  S:X  a  [b        Rf                  nO[b        Rh                  nU R\                  Rj                  Rm                  U RF                  Rn                  US9  U R\                  Rp                  Rm                  U RF                  Rn                  US9  U R\                  Rr                  Rm                  U RF                  Rn                  US9  U R\                  Ru                  5       nU RF                  Rw                  U Rx                  5        U RF                  R{                  U R|                  5        U R                  R~                  (       aL  [b        R                  R                  5       (       a)  S[b        R                  R                  R                  l?        U R                  [        U["        5      (       d  UR                  5       OU5      U lG        U R\                  Rq                  U R\                  R                  U R                  R                  c  S/OU R                  R                  SSSU R\                  R                  R                  S9R                  Rm                  U RF                  Rn                  5      5      S   U lL        UR                  (       a%  [        UR                  UR                  5      U lQ        U R\                  R                  =(       d    U RF                  R                  U lR        [        U R\                  S5      (       aq  U R\                  R                  (       aV  U RF                  R                  XR                  5      u  olG        [#        [%        S UR                  5       5      5      U lV        O2U RF                  R                  XR                  5      u  U lV        U lG        U R                  R                  (       a#  [        R                  " UR                  S9U l[        UR                  (       ax  [        RV                  " SUR                   35        U RF                  R                  UR                  5        [/        UR                  R1                  S5      S   5      S	-   U l]        g SU l]        g s  snf ) Nz@DDPOTrainer is deprecated and will be removed in version 0.23.0.z8No image_samples_hook provided; no images will be loggedcheckpoint_c                     SU ;   $ )Nr   r   )xs    r5   <lambda>._UnslothDDPOTrainer.__init__.<locals>.<lambda>  s	    -1"4r8   r   zNo checkpoints found in _rC   rI   )r   r   project_configgradient_accumulation_stepstensorboard)ddpo_trainer_config)r   init_kwargs
T)device_specificFTimestep)positiondisableleavedescdynamic_ncolsr   bf16)dtyper   pt
max_lengthreturn_tensorspadding
truncationr   use_lorac                     U R                   $ N)requires_grad)ps    r5   r   r     s    !//r8   )r   zResuming from r   )^r    warnDeprecationWarningr   warning	prompt_fn	reward_fnr   image_samples_callbackr   project_kwargsr   r   pathnormpath
expanduserbasenamelistfilterlistdirlenre   sortedr   splitjoin	iterationr   r   num_train_timestepsr   r   r   r   accelerator_kwargsaccelerator_config_checkis_main_processinit_trackersr   dictto_dicttracker_kwargsinfor   r   r   set_progress_bar_configis_local_main_processr   float16bfloat16rO   vaerN   rp   text_encoderunetget_trainable_layersregister_save_state_pre_hook_save_model_hookregister_load_state_pre_hook_load_model_hookr   cudais_availablebackendsmatmul_setup_optimizer
isinstance
parameters	optimizer	tokenizerr   model_max_lengthr`   neg_prompt_embedr   r   r   r   stat_trackerautocastr/   r   preparetrainable_layersr   r   ThreadPoolExecutorr   executor
load_statefirst_epoch)r0   r   r   r   r   r   accelerator_project_configcheckpointsr   checkpoint_numbersis_okaymessageis_using_tensorboardinference_dtyper1  r  s                   r5   r   _UnslothDDPOTrainer.__init__e  s    	N	
 %NNUV((&8#%9%WDKK<V<V%W";;"""&(gg&6&6rww7I7I$++JaJa7b&cDKK#BGG$4$4T[[5L5L$MM"4

4;;#:#:; {#q($'?@W@W?X%YZZ%+K,XKqSb1A-BK,X%Y"*,'',,KK++!"4R"8!9:+'
 8J"7MPQ7Q*4 $'t{{'C'CdkkFiFi'i#j & 	
[[)) KK775 )-(U(UX\XpXp(p	
 kk,,	
  --/W%%%d:_vR_?_++**00I]t0@Acicqcqcs KK66 +  	bM"!!48&00((>>> 	1 	
 ++v5#mmO--7#nnO#mmO 0 0 7 7O%%(()9)9)@)@(X  !1!1!8!8 P++@@B55d6K6KL55d6K6KL ;;!!ejj&=&=&?&?48ENN&&1..1;<Ld1S1S'')Yi
 !% 0 0 = =&&44<$++B^B^#$++55FF '  i4++223!
 ! ** 4;;99!D ((11NT5E5E5N5N4##Z00T5E5E5N5N#'#3#3#;#;<Lnn#] D.$(0I4??K\)]$^D!484D4D4L4LM]_m_m4n1D!4>;;//#666CUCUVDMKK.););(<=>''(:(:;"6#5#5#;#;C#@#DEID D] -Ys   'cc           	        ^  U(       d]  / nU HT  u  pEnT R                  XEU5      u  pxUR                  [        R                  " UT R                  R
                  S9U45        MV     O{T R                  R                  U 4S jU5      nU VVs/ s HL  u  px[        R                  " UR                  5       T R                  R
                  S9UR                  5       4PMN     nnn[        U6 $ s  snnf )Nro   c                 "   > TR                   " U 6 $ r   )r   )r   r0   s    r5   r   5_UnslothDDPOTrainer.compute_rewards.<locals>.<lambda>  s    $..!2Dr8   )
r   rT   r   	as_tensorr  rp   r3  mapresultrM   )	r0   prompt_image_pairsis_asyncrewardsimagespromptsprompt_metadatarewardreward_metadatas	   `        r5   compute_rewards#_UnslothDDPOTrainer.compute_rewards  s    G4F0*.../*Z't7G7G7N7NO' 5G mm''(DFXYG 07/6+F 9I9I9P9PQSbSiSiSkl/6  
 G}s   AC)epochglobal_stepc                 
   U R                  U R                  R                  U R                  R                  S9u  p4US   R	                  5        VVs0 s H,  oU[
        R                  " U Vs/ s H  ofU   PM	     sn5      _M.     nnnU R                  X@R                  R                  S9u  px[        U5       H  u  pU
R                  Xy   X   /5        M     U R                  b)  U R                  XBU R                  R                  S   5        [
        R                  " U5      nU R                  R                  U5      R                  5       R!                  5       nU R                  R#                  UUUR%                  5       UR'                  5       S.US9  U R                  R(                  (       az  U R                  R                  US   5      R                  5       R!                  5       nU R*                  R,                  R/                  USS	9nU R0                  R3                  X5      nO&XwR%                  5       -
  UR'                  5       S
-   -  n[
        R4                  " U5      R7                  U R                  R8                  S5      U R                  R:                     R=                  U R                  R>                  5      US'   US	 US   R@                  u  p[C        U R                  RD                  5       GH  n[
        RF                  " XR                  R>                  S9nURI                  5        VVs0 s H  u  nnUUU   _M     nnn[
        RJ                  " [C        U5       Vs/ s H+  n[
        RF                  " XR                  R>                  S9PM-     sn5      nS H=  nUU   [
        RL                  " XR                  R>                  S9SS2S4   U4   UU'   M?     UR	                  5       nURO                  5       nU Vs/ s H8  nUR6                  " SU R                  RP                  /UR@                  SS Q76 PM:     nn[S        U6 nU Vs/ s H  n[U        [S        UU5      5      PM     nnU R*                  RV                  RY                  5         U R[                  UXU5      nU R                  R\                  (       a  GM  [_        S5      e   US:w  aQ  XR                  R`                  -  S:X  a5  U R                  Rb                  (       a  U R                  Re                  5         U$ s  snf s  snnf s  snnf s  snf s  snf s  snf )a  
Perform a single step of training.

Args:
    epoch (int): The current epoch.
    global_step (int): The current global step.

Side Effects:
    - Model weights are updated
    - Logs the statistics to the accelerator trackers.
    - If `self.image_samples_callback` is not None, it will be called with the prompt_image_pairs, global_step,
      and the accelerator tracker.

Returns:
    global_step (int): The updated global step.

)
iterationsrr   r   )rE  N)rJ  rN  reward_mean
reward_stdstep
prompt_idsT)skip_special_tokensr   rC   
advantages	timestepsro   )rY  latentsnext_latents	log_probsrI   zsOptimization step should have been performed by this point. Please check calculated gradient accumulation settings.)3_generate_samplesr   r   r   keysr   catrL  r   	enumerateextendr   r  trackersrP   cpunumpylogmeanstdr   r   r+  batch_decoder.  updaterA  rK   num_processesprocess_indexrN   rp   rL   ranger   randpermitemsstackrq   valuesr   rM   r  r  train_train_batched_samplessync_gradientsre   r   r  
save_state)r0   rN  rO  samplesprompt_image_dataksrF  rewards_metadatai
image_datarV  rH  rX  total_batch_sizenum_timestepsinner_epochpermvr   permskeyoriginal_keysoriginal_valuesreshaped_valuestransposed_values
row_valuessamples_batcheds                               r5   rU  _UnslothDDPOTrainer.step  s   $ &*%;%;{{??{{44 &< &
" CJ!*//BSTBSQeiiw 7w!1w 788BST$($8$8(L(L %9 %
! ''89MAwz+;+>?@ : &&2''(9HXHXHaHabcHde))G$""))'2668>>@!&||~%kkm	  	 	
 ;;//))001FGKKMSSUJ&&00==j^b=cG**11'CJ!LLN2w{{}t7KLJ OOJ'WT%%33R89I9I9W9WYR  ''( 	 L!*1+*>*D*D' !C!CDK>>"2;K;K;R;RSD.5mmo>odaq!D'zoG> KKX]^nXopXoST6F6F6M6MNXopE M&s|LL!1:J:J:Q:QRSTVZSZ[  M $LLNM%nn.Obqrbq]^qyyT[[-I-IXAGGTUTVKXbqOr !$_ 5VghVg
tCz$BCVgOh!!'')55k5WfgK##222  J ? EF A:%++"7"771<AQAQAaAa'')c !8T\ ?
 q s
 is0   T:+T59T:9U ,2U
?UU5T:c           	      N   U R                  5          U R                  R                  (       a  U R                  R	                  [
        R                  " U/S-  5      [
        R                  " U/S-  5      U5      R                  nUR                  S5      u  pXR                  R                  X-
  -  -   nO'U R                  R	                  UUU5      R                  nU R                  R                  UUUU R                  R                  US9n
U
R                  nSSS5        [
        R                  " UU R                  R                  * U R                  R                  5      n[
        R                  " WU-
  5      nU R!                  XPR                  R"                  U5      nS[
        R$                  " X-
  S-  5      -  n[
        R$                  " [
        R&                  " US-
  5      U R                  R"                  :  R)                  5       5      nXU4$ ! , (       d  f       GN= f)a  
Calculate the loss for a batch of an unpacked sample

Args:
    latents (torch.Tensor):
        The latents sampled from the diffusion model, shape: [batch_size, num_channels_latents, height, width]
    timesteps (torch.Tensor):
        The timesteps sampled from the diffusion model, shape: [batch_size]
    next_latents (torch.Tensor):
        The next latents sampled from the diffusion model, shape: [batch_size, num_channels_latents, height,
        width]
    log_probs (torch.Tensor):
        The log probabilities of the latents, shape: [batch_size]
    advantages (torch.Tensor):
        The advantages of the latents, shape: [batch_size]
    embeds (torch.Tensor):
        The embeddings of the prompts, shape: [2*batch_size or batch_size, ...] Note: the "or" is because if
        train_cfg is True, the expectation is that negative prompts are concatenated to the embeds

Returns:
    loss (torch.Tensor), approx_kl (torch.Tensor), clipfrac (torch.Tensor) (all of these are of shape (1,))
r   )etaprev_sampleNg      ?r   )r/  r   r   r   r  r   r_  samplerJ   r   scheduler_stepr   r\  clampr   explossr   rf  absfloat)r0   rZ  rY  r[  r\  rX  embeds
noise_prednoise_pred_uncondnoise_pred_textscheduler_step_outputlog_probratior  	approx_klclipfracs                   r5   calculate_loss"_UnslothDDPOTrainer.calculate_lossv  s   . ]]_{{$$!--22IIwi!m,IIykAo. &	 
 6@5E5Ea5H2!.1R1R#72 
 "--22 &	  %)$4$4$C$CKK**( %D %! -66H7 : [[[[+++KK**

 		(Y./yy[[%A%A5I%**h&:q%@AA	::uyy58T8TT[[]^((W _s   DH
H$rX  
clip_ranger  c                     U* U-  nU* [         R                  " USU-
  SU-   5      -  n[         R                  " [         R                  " XE5      5      $ )Nr   )r   r  rf  maximum)r0   rX  r  r  unclipped_lossclipped_losss         r5   r  _UnslothDDPOTrainer.loss  sT     %u,"{U[[**&
 

 zz%--EFFr8   c                    U R                   R                  (       a  SS KnUR                  R                  nO[
        R                  R                  nU" UU R                   R                  U R                   R                  U R                   R                  4U R                   R                  U R                   R                  S9$ )Nr   )lrbetasweight_decayeps)r   r   bitsandbytesoptim	AdamW8bitr   AdamWr   r   r   r   r   )r0   trainable_layers_parametersr  optimizer_clss       r5   r'  $_UnslothDDPOTrainer._setup_optimizer  s    ;;**(..88M!KK--M'{{..;;//1M1MN<<..
 	
r8   c                 \    U R                   R                  XU5        UR                  5         g r   )r   save_checkpointpop)r0   modelsweights
output_dirs       r5   r   $_UnslothDDPOTrainer._save_model_hook  s!    ((*Er8   c                 Z    U R                   R                  X5        UR                  5         g r   )r   load_checkpointr  )r0   r  	input_dirs      r5   r"  $_UnslothDDPOTrainer._load_model_hook  s    ((;

r8   c                    / n/ nU R                   R                  R                  5         U R                  R	                  USS5      n[        U5       GH  n[        [        U5       Vs/ s H  o`R                  5       PM     sn6 u  pxU R                   R                  USSSU R                   R                  R                  S9R                  R                  U R                  R                  5      n	U R                   R                  U	5      S   n
U R                  5          U R                  U
UU R                   R"                  U R                   R$                  U R                   R&                  SS9nUR(                  nUR*                  nUR,                  nSSS5        [.        R0                  " WSS	9n[.        R0                  " WSS	9nU R                   R2                  R4                  R	                  US5      nUR7                  U	U
UUSS2SS
24   USS2SS24   UUS.5        UR7                  WXx/5        GM     X44$ s  snf ! , (       d  f       N= f)z
Generate samples from the model

Args:
    iterations (int): Number of iterations to generate samples for
    batch_size (int): Batch size to use for sampling

Returns:
    samples (list[dict[str, torch.Tensor]]), prompt_image_pairs (list[list[Any]])
rI   r   r   Tr   r   )prompt_embedsnegative_prompt_embedsnum_inference_stepsguidance_scaler  output_typeNrH   rC   )rV  r  rY  rZ  r[  r\  r  )r   r  evalr-  repeatrl  rM   r   r+  r,  r`   rN   r  rp   r  r/  r   r   r   r   rG  rZ  r\  r   ro  	schedulerrY  rT   )r0   rQ  rr   ru  rD  sample_neg_prompt_embedsr   rH  rI  rV  r  	sd_outputrG  rZ  r\  rY  s                   r5   r]  %_UnslothDDPOTrainer._generate_samples  s    ""$#'#8#8#?#?
Aq#Q z"A'*uZGX,YGX!^^-=GX,Y'Z$G))33#$++55FF 4  i4++223  !,,99*EaHM ,,"/+C(,(D(D#';;#D#D.. $ - 	 #))#++%//	 ! kk'q1GI15I((22<<CCJPQRINN",%2!*&q#2#v$+AqrEN!*.F
 %%vw&HIS #V **U -Z !s   'H9
A5H>>
I	c                    [        [        5      n[        U5       GH  u  pgU R                  R                  (       a  [
        R                  " US   US   /5      nOUS   n[        U R                  5       GH^  n	U R                  R                  U R                  R                  5         U R                  US   SS2U	4   US   SS2U	4   US   SS2U	4   US   SS2U	4   US   U5      u  pnUS	   R                  U5        US
   R                  U5        US   R                  U
5        U R                  R                  U
5        U R                  R                   (       as  U R                  R#                  [%        U R&                  [        5      (       d  U R&                  R)                  5       OU R&                  U R                  R*                  5        U R,                  R/                  5         U R,                  R1                  5         SSS5        U R                  R                   (       d  GM  UR3                  5        VVs0 s H0  u  pU[
        R4                  " [
        R6                  " U5      5      _M2     nnnU R                  R9                  USS9nUR;                  X!S.5        U R                  R=                  XSS9  US-  n[        [        5      nGMa     GM     U$ ! , (       d  f       N= fs  snnf )a  
Train on a batch of samples. Main training segment

Args:
    inner_epoch (int): The current inner epoch
    epoch (int): The current epoch
    global_step (int): The current global step
    batched_samples (list[dict[str, torch.Tensor]]): The batched samples to train on

Side Effects:
    - Model weights are updated
    - Logs the statistics to the accelerator trackers.

Returns:
    global_step (int): The updated global step
r  r  rZ  NrY  r[  r\  rX  r  r  r  rf  )	reduction)rN  r~  rT  rI   )r   r  r`  r   r   r   r_  rl  r  r  
accumulater   r  r  rT   backwardrs  clip_grad_norm_r(  r1  r)  r   r*  rU  	zero_gradrn  rf  ro  reduceri  re  )r0   r~  rN  rO  batched_samplesr  _ir  r  jr  r  r  rw  r  s                  r5   rr  *_UnslothDDPOTrainer._train_batched_samples  sx   " 4 #O4JB{{$$F+C$Df_F]#^_04334%%001A1A1F1FG040C0Cy)!Q$/{+AqD1~.q!t4{+AqD1|,1-DX %,,Y7$++H5L''-$$--d3''66((88#-d.C.CT#J#J !11<<>!%!6!6 KK;;	 NN'')NN,,.- H2 ##222FJjjlSldaAuzz%++a.99lDS++22462JDKK% LM$$(((@1$K&t,DC 5 5R C HG6 Ts   (EK?7K0

K-rc   c                    U R                   R                  U R                  R                  -  U R                   R                  -  nU R                   R
                  U R                  R                  -  U R                   R                  -  nU R                   R                  U R                   R
                  :  d3  SSU R                   R                   SU R                   R
                   S34$ U R                   R                  U R                   R
                  -  S:X  d3  SSU R                   R                   SU R                   R
                   S34$ X-  S:X  d  SSU SU S34$ g	)
NFzSample batch size (z9) must be greater than or equal to the train batch size ()r   z-) must be divisible by the train batch size (zNumber of samples per epoch (z3) must be divisible by the total train batch size ()Tr   )r   r   r  rj  r   r   r   )r0   samples_per_epochtotal_train_batch_sizes      r5   r  !_UnslothDDPOTrainer._config_check[  s   KK))D,<,<,J,JJT[[MuMuu 	 KK((,,-kk;;< 	 {{,,0L0LL%dkk&C&C%DD}  C  J  J  [  [  ~\  \]  ^  {{,,t{{/K/KKqP%dkk&C&C%DDqrvr}r}  sO  sO  rP  PQ  R  !9Q>/0A/BBu  wM  vN  NO  P  r8   epochsc                     SnUc  U R                   R                  n[        U R                  U5       H  nU R	                  X25      nM     g)z.
Train the model for a given number of epochs
r   N)r   r   rl  r5  rU  )r0   r  rO  rN  s       r5   rq  _UnslothDDPOTrainer.trainv  sB     >[[++F4++V4E))E7K 5r8   c                 Z    U R                   R                  U5        U R                  5         g r   )r   save_pretrainedcreate_model_card)r0   save_directorys     r5   _save_pretrained$_UnslothDDPOTrainer._save_pretrained  s"    ((8 r8   c                   > U R                   R                  c*  [        U R                   R                  5      R                  nO(U R                   R                  R                  S5      S   nU R                  US9  [        TU ]!  X5        g )N/rC   )
model_name)	r1   hub_model_idr   r  namer
  r  r   _save_checkpoint)r0   r,   trialr  r   s       r5   r  $_UnslothDDPOTrainer._save_checkpoint  sj    99!!)dii22388J//55c:2>J*5 .r8   r  dataset_nametagsc                    U R                  5       (       d  g[        U R                  R                  S5      (       ac  [        R
                  R                  U R                  R                  R                  5      (       d!  U R                  R                  R                  nOSnUc  [        5       nO$[        U[        5      (       a  U1nO[        U5      n[        U R                  R                  S5      (       a  UR                  S5        S[        R                  ;   a  UR                  S5        UR                  U R                  5        [        R                   " S5      n[#        UUU R$                  UU['        5       (       a+  [(        R*                  b  [(        R*                  R,                  OS[/        5       SUS	S
S9nUR1                  [        R
                  R3                  U R4                  R6                  S5      5        g)a  
Creates a draft of a model card using the information available to the `Trainer`.

Args:
    model_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the model.
    dataset_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the dataset used for training.
    tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
        Tags to be associated with the model card.
N_name_or_pathunsloth_versionunslothJOB_IDhf_jobsa          @inproceedings{black2024training,
            title        = {{Training Diffusion Models with Reinforcement Learning}},
            author       = {Kevin Black and Michael Janner and Yilun Du and Ilya Kostrikov and Sergey Levine},
            year         = 2024,
            booktitle    = {The Twelfth International Conference on Learning Representations, {ICLR} 2024, Vienna, Austria, May 7-11, 2024},
            publisher    = {OpenReview.net},
            url          = {https://openreview.net/forum?id=YCWjhGrJFD},
        }DDPOz5Training Diffusion Models with Reinforcement Learningz
2305.13301)
base_modelr  r  r  r  	wandb_url	comet_urltrainer_nametrainer_citationpaper_titlepaper_idz	README.md)is_world_process_zeror/   r,   r   r   r  isdirr  setr(  straddenvironri  
_tag_namesr   dedentr   r  r   wandbrunurlr   saver  r1   r  )r0   r  r  r  r  citation
model_cards          r5   r  %_UnslothDDPOTrainer.create_model_card  sn   " ))++4::$$o66rww}}TZZM^M^MlMl?m?m**88JJ <5Dc""6Dt9D4::$$&788HHYrzz!HHYDOO$ ?? $  )!!**%'9';';		@Ueiimm[_.0%O!

 	TYY%9%9;GHr8   )r  r/  r   r3  r5  r   r-  r  r*  r   r   r   r.  r1  r   )F)NNN)%r   r   r   r   r   r  r   r   r   r   tupler  r   r   r   r   rL  r   rU  r  r  r  r'  r   r"  r]  rr  boolr  rq  r  r  r	   r  r  r   r   r   s   @r5   r   r   S  s    J HLP!P! "5<<sU3Z"H%,,"VWP! ""eCHo"56	P!
 1P! %XsCos.B%CDP!d(i# iC iVB)HGLLG G ||	G
 <+|;zuT3Y/ 68HSM 8!
/ %)&*,0	BISMBI smBI CcD()	BI BIr8   r   c                   2   ^  \ rS rSrSr SU 4S jjrSrU =r$ )UnslothDDPOTraineri  a9  
    
The DDPOTrainer uses Deep Diffusion Policy Optimization to optimise diffusion models. Note, this trainer is heavily
inspired by the work here: https://github.com/kvablack/ddpo-pytorch As of now only Stable Diffusion based pipelines
are supported

Args:
    config ([`DDPOConfig`]):
        Configuration object for DDPOTrainer. Check the documentation of [`PPOConfig`] for more details.
    reward_function (`Callable[[torch.Tensor, tuple[str], tuple[Any]], torch.Tensor]`):
        Reward function to be used.
    prompt_function (`Callable[[], tuple[str, Any]]`): Function to generate prompts to guide model
    sd_pipeline ([`DDPOStableDiffusionPipeline`]): Stable Diffusion pipeline to be used for training.
    image_samples_hook (`Optional[Callable[[Any, Any, Any], Any]]`): Hook to be called to log images.

    c           	        > Wc
  [        5       n/ nSSKJn	  U	" SU5        [        USS 5      [        R
                  :X  a(  UR                  S:  a  [        USS5      S:w  a  SUl        S[        5       ;   a)  [        [        S5      (       a  [        R                  5         [        T
U ]4  " SUUUUUS	.UD6  S[        5       ;   a+  [        [        S
5      (       a  [        R                  5         g g g )Nr   )PatchRLStatisticsddpo_trainerparallel_moderI   _n_gpur,   r-   )r   r   r   r   r   r.   r   )r   unsloth_zoo.logging_utilsr  getattrr(   NOT_DISTRIBUTEDn_gpur  localsr/   r,   r-   r   r   r.   )r0   r   r   r   r   r   r2   r1   other_metricsr  r   s             r5   r   UnslothDDPOTrainer.__init__  s     < 1 3?.-8 4$/<3O3OOTXT^T^abTbtXq)Q.fh75.#A#A  	>--%!3	>
 7=	> fh75/#B#B! $Cr8   r   r   )r   r   r   r   r   r   r   r   r   s   @r5   r  r    s    , "" "r8   r  	addFilterc                        \ rS rSrS rS rSrg)HideLoggingMessagei  c                     Xl         g r   text)r0   r  s     r5   r   HideLoggingMessage.__init__  s    d)r8   c                 <    U R                   UR                  5       ;  $ r   )r  
getMessage)r0   r   s     r5   r  HideLoggingMessage.filter  s    alln)DEr8   r  N)r   r   r   r   r   r  r   r   r8   r5   r  r    s    2Er8   r  z`use_cache=True`)Er   r   r   torch.nnnnr   Ftypingr   r   r   r   r	   r
   r   r   trl.trainer.ddpo_trainerr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    dataclassesr"   r#   packaging.versionr$   rd  np
contextlibr%   transformersr&   r'   +TransformersDataCollatorForLanguageModelingtransformers.training_argsr(   r9   typesr)   r;   torch_compile_optionscompiler_   r   rj   ry   r   r   r   r  r/   Filterr  r  r   r8   r5   <module>r2     s  0    $ I I I d  d  d  d  d  d  d 
  ( %   " $  3      4;PR S"||  \\	&,, %  	
 \\6ell C ELL  @5
 @5 @5D |	I. |	Iz.", ."`  6;FW^^ F 	
'(:;<  r8   