import os
import sys
from dataclasses import dataclass, field
from typing import Optional

from transformers import is_bitsandbytes_available

from ..core import flatten_dict


@dataclass
class DDPOConfig:
    """
Configuration class for the [`DDPOTrainer`].

Using [`~transformers.HfArgumentParser`] we can turn this class into
[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
command line.

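Example (a minimal sketch; the argument values below are illustrative):

    ```python
    >>> from transformers import HfArgumentParser
    >>> from trl import DDPOConfig

    >>> # Construct the config directly, overriding a couple of defaults.
    >>> config = DDPOConfig(num_epochs=50, sample_batch_size=4)

    >>> # Or parse the same fields from command-line arguments.
    >>> parser = HfArgumentParser(DDPOConfig)
    >>> (config,) = parser.parse_args_into_dataclasses(["--num_epochs", "50"])
    ```
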
Parameters:
    exp_name (`str`, *optional*, defaults to `os.path.basename(sys.argv[0])[: -len(".py")]`):
        Name of this experiment (by default is the file name without the extension name).
    run_name (`str`, *optional*, defaults to `""`):
        Name of this run.
    seed (`int`, *optional*, defaults to `0`):
        Random seed.
    log_with (`Literal["wandb", "tensorboard"]` or `None`, *optional*, defaults to `None`):
        Log with either 'wandb' or 'tensorboard', check
        https://huggingface.co/docs/accelerate/usage_guides/tracking for more details.
    tracker_kwargs (`Dict`, *optional*, defaults to `{}`):
        Keyword arguments for the tracker (e.g. wandb_project).
    accelerator_kwargs (`Dict`, *optional*, defaults to `{}`):
        Keyword arguments for the accelerator.
    project_kwargs (`Dict`, *optional*, defaults to `{}`):
        Keyword arguments for the accelerator project config (e.g. `logging_dir`).
    tracker_project_name (`str`, *optional*, defaults to `"trl"`):
        Name of project to use for tracking.
    logdir (`str`, *optional*, defaults to `"logs"`):
        Top-level logging directory for checkpoint saving.
    num_epochs (`int`, *optional*, defaults to `100`):
        Number of epochs to train.
    save_freq (`int`, *optional*, defaults to `1`):
        Number of epochs between saving model checkpoints.
    num_checkpoint_limit (`int`, *optional*, defaults to `5`):
        Number of checkpoints to keep before overwriting old ones.
    mixed_precision (`str`, *optional*, defaults to `"fp16"`):
        Mixed precision training.
    allow_tf32 (`bool`, *optional*, defaults to `True`):
        Allow `tf32` on Ampere GPUs.
    resume_from (`str`, *optional*, defaults to `""`):
        Resume training from a checkpoint.
    sample_num_steps (`int`, *optional*, defaults to `50`):
        Number of sampler inference steps.
    sample_eta (`float`, *optional*, defaults to `1.0`):
        Eta parameter for the DDIM sampler.
    sample_guidance_scale (`float`, *optional*, defaults to `5.0`):
        Classifier-free guidance weight.
    sample_batch_size (`int`, *optional*, defaults to `1`):
        Batch size (per GPU) to use for sampling.
    sample_num_batches_per_epoch (`int`, *optional*, defaults to `2`):
        Number of batches to sample per epoch.
    train_batch_size (`int`, *optional*, defaults to `1`):
        Batch size (per GPU) to use for training.
    train_use_8bit_adam (`bool`, *optional*, defaults to `False`):
        Use 8bit Adam optimizer from bitsandbytes.
    train_learning_rate (`float`, *optional*, defaults to `3e-4`):
        Learning rate.
    train_adam_beta1 (`float`, *optional*, defaults to `0.9`):
        Adam beta1.
    train_adam_beta2 (`float`, *optional*, defaults to `0.999`):
        Adam beta2.
    train_adam_weight_decay (`float`, *optional*, defaults to `1e-4`):
        Adam weight decay.
    train_adam_epsilon (`float`, *optional*, defaults to `1e-8`):
        Adam epsilon.
    train_gradient_accumulation_steps (`int`, *optional*, defaults to `1`):
        Number of gradient accumulation steps.
    train_max_grad_norm (`float`, *optional*, defaults to `1.0`):
        Maximum gradient norm for gradient clipping.
    train_num_inner_epochs (`int`, *optional*, defaults to `1`):
        Number of inner epochs per outer epoch.
    train_cfg (`bool`, *optional*, defaults to `True`):
        Whether to use classifier-free guidance during training.
    train_adv_clip_max (`float`, *optional*, defaults to `5.0`):
        Clip advantages to the range `[-train_adv_clip_max, train_adv_clip_max]`.
    train_clip_range (`float`, *optional*, defaults to `1e-4`):
        PPO clip range.
    train_timestep_fraction (`float`, *optional*, defaults to `1.0`):
        Fraction of timesteps to train on.
    per_prompt_stat_tracking (`bool`, *optional*, defaults to `False`):
        Whether to track statistics for each prompt separately.
    per_prompt_stat_tracking_buffer_size (`int`, *optional*, defaults to `16`):
        Number of reward values to store in the buffer for each prompt.
    per_prompt_stat_tracking_min_count (`int`, *optional*, defaults to `16`):
        Minimum number of reward values to store in the buffer.
    async_reward_computation (`bool`, *optional*, defaults to `False`):
        Whether to compute rewards asynchronously.
    max_workers (`int`, *optional*, defaults to `2`):
        Maximum number of workers to use for async reward computation.
    negative_prompts (`str`, *optional*, defaults to `""`):
        Comma-separated list of prompts to use as negative examples.
    push_to_hub (`bool`, *optional*, defaults to `False`):
        Whether to push the final model checkpoint to the Hub.
    """

    exp_name: str = field(
        default=os.path.basename(sys.argv[0])[: -len(".py")],
        metadata={"help": "Name of this experiment (by default is the file name without the extension name)."},
    )
    run_name: str = field(default="", metadata={"help": "Name of this run."})
    seed: int = field(default=0, metadata={"help": "Random seed."})
    log_with: Optional[str] = field(
        default=None,
        metadata={
            "help": "Log with either 'wandb' or 'tensorboard'.",
            "choices": ["wandb", "tensorboard"],
        },
    )
    tracker_kwargs: dict = field(
        default_factory=dict,
        metadata={"help": "Keyword arguments for the tracker (e.g. wandb_project)."},
    )
    accelerator_kwargs: dict = field(
        default_factory=dict,
        metadata={"help": "Keyword arguments for the accelerator."},
    )
    project_kwargs: dict = field(
        default_factory=dict,
        metadata={"help": "Keyword arguments for the accelerator project config (e.g. `logging_dir`)."},
    )
    tracker_project_name: str = field(default="trl", metadata={"help": "Name of project to use for tracking."})
    logdir: str = field(default="logs", metadata={"help": "Top-level logging directory for checkpoint saving."})
    num_epochs: int = field(default=100, metadata={"help": "Number of epochs to train."})
    save_freq: int = field(default=1, metadata={"help": "Number of epochs between saving model checkpoints."})
    num_checkpoint_limit: int = field(
        default=5, metadata={"help": "Number of checkpoints to keep before overwriting old ones."}
    )
    mixed_precision: str = field(default="fp16", metadata={"help": "Mixed precision training."})
    allow_tf32: bool = field(default=True, metadata={"help": "Allow `tf32` on Ampere GPUs."})
    resume_from: str = field(default="", metadata={"help": "Resume training from a checkpoint."})
    sample_num_steps: int = field(default=50, metadata={"help": "Number of sampler inference steps."})
    sample_eta: float = field(default=1.0, metadata={"help": "Eta parameter for the DDIM sampler."})
    sample_guidance_scale: float = field(default=5.0, metadata={"help": "Classifier-free guidance weight."})
    sample_batch_size: int = field(default=1, metadata={"help": "Batch size (per GPU) to use for sampling."})
    sample_num_batches_per_epoch: int = field(default=2, metadata={"help": "Number of batches to sample per epoch."})
    train_batch_size: int = field(default=1, metadata={"help": "Batch size (per GPU) to use for training."})
    train_use_8bit_adam: bool = field(default=False, metadata={"help": "Use 8bit Adam optimizer from bitsandbytes."})
    train_learning_rate: float = field(default=3e-4, metadata={"help": "Learning rate."})
    train_adam_beta1: float = field(default=0.9, metadata={"help": "Adam beta1."})
    train_adam_beta2: float = field(default=0.999, metadata={"help": "Adam beta2."})
    train_adam_weight_decay: float = field(default=1e-4, metadata={"help": "Adam weight decay."})
    train_adam_epsilon: float = field(default=1e-8, metadata={"help": "Adam epsilon."})
    train_gradient_accumulation_steps: int = field(
        default=1, metadata={"help": "Number of gradient accumulation steps."}
    )
    train_max_grad_norm: float = field(
        default=1.0, metadata={"help": "Maximum gradient norm for gradient clipping."}
    )
    train_num_inner_epochs: int = field(default=1, metadata={"help": "Number of inner epochs per outer epoch."})
    train_cfg: bool = field(
        default=True, metadata={"help": "Whether to use classifier-free guidance during training."}
    )
    train_adv_clip_max: float = field(
        default=5.0,
        metadata={"help": "Clip advantages to the range `[-train_adv_clip_max, train_adv_clip_max]`."},
    )
    train_clip_range: float = field(default=1e-4, metadata={"help": "PPO clip range."})
    train_timestep_fraction: float = field(default=1.0, metadata={"help": "Fraction of timesteps to train on."})
    per_prompt_stat_tracking: bool = field(
        default=False, metadata={"help": "Whether to track statistics for each prompt separately."}
    )
    per_prompt_stat_tracking_buffer_size: int = field(
        default=16, metadata={"help": "Number of reward values to store in the buffer for each prompt."}
    )
    per_prompt_stat_tracking_min_count: int = field(
        default=16, metadata={"help": "Minimum number of reward values to store in the buffer."}
    )
    async_reward_computation: bool = field(
        default=False, metadata={"help": "Whether to compute rewards asynchronously."}
    )
    max_workers: int = field(
        default=2, metadata={"help": "Maximum number of workers to use for async reward computation."}
    )
    negative_prompts: str = field(
        default="", metadata={"help": "Comma-separated list of prompts to use as negative examples."}
    )
    push_to_hub: bool = field(
        default=False, metadata={"help": "Whether to push the final model checkpoint to the Hub."}
    )

    def to_dict(self):
        # Flatten the config into a plain dictionary, e.g. for logging to a tracker.
        output_dict = {}
        for key, value in self.__dict__.items():
            output_dict[key] = value
        return flatten_dict(output_dict)

    def __post_init__(self):
        # 8bit Adam requires bitsandbytes; fail early with a clear message if it is missing.
        if self.train_use_8bit_adam and not is_bitsandbytes_available():
            raise ImportError(
                "You need to install bitsandbytes to use 8bit Adam. You can install it with `pip install bitsandbytes`."
            )