
    h                     X    S SK r S SKJrJr  S SKJrJr  SSKJr  \ " S S\5      5       r	g)    N)	dataclassfield)LiteralOptional   )OnPolicyConfigc                      \ rS rSr% Sr\" \R                  R                  \	5      SS SS0S9r
\\S'   \" S	SS
0S9r\\S'   \" SSS0S9r\\   \S'   \" SSS0S9r\\   \S'   \" SSS0S9r\\S'   \" SSS0S9r\\S'   \" SSS0S9r\\S'   \" SSS0S9r\S   \S'   \" SSS0S9r\\S'   \" S SS!0S9r\\S"'   \" SSS#0S9r\\S$'   \" S%SS&0S9r\\S''   \" S(SS)0S9r\\S*'   \" S+SS,0S9r\\S-'   S.rg)/	PPOConfig   aL
  
Configuration class for the [`PPOTrainer`].

This class includes only the parameters that are specific to PPO training. For a full list of training arguments,
please refer to the [`~transformers.TrainingArguments`] and [`OnPolicyConfig`] documentation. Note that default
values in this class may differ from those in [`~transformers.TrainingArguments`].

Using [`~transformers.HfArgumentParser`] we can turn this class into
[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
command line.

Parameters:
    exp_name (`str`, *optional*, defaults to `os.path.basename(__file__)[:-3]`):
        Name of this experiment.
    reward_model_path (`str`, *optional*, defaults to `"EleutherAI/pythia-160m"`):
        Path to the reward model.
    model_adapter_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the train target PEFT adapter, when using LoRA with multiple adapters.
    ref_adapter_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the reference PEFT adapter, when using LoRA with multiple adapters.
    num_ppo_epochs (`int`, *optional*, defaults to `4`):
        Number of epochs to train.
    whiten_rewards (`bool`, *optional*, defaults to `False`):
        Whether to whiten the rewards.
    kl_coef (`float`, *optional*, defaults to `0.05`):
        KL coefficient.
    kl_estimator (`Literal["k1", "k3"]`, *optional*, defaults to `"k1"`):
        Which estimator for KL-Divergence to use from [Approximating KL
        Divergence](http://joschu.net/blog/kl-approx.html). Defaults to "k1", a straightforward, unbiased
        estimator. Can be set to "k3", an unbiased estimator with lower variance which "appears to be a strictly
        better estimator". Cannot be set to "k2", as it is used for logging purposes.
    cliprange (`float`, *optional*, defaults to `0.2`):
        Clip range.
    vf_coef (`float`, *optional*, defaults to `0.1`):
        Value function coefficient.
    cliprange_value (`float`, *optional*, defaults to `0.2`):
        Clip range for the value function.
    gamma (`float`, *optional*, defaults to `1.0`):
        Discount factor.
    lam (`float`, *optional*, defaults to `0.95`):
        Lambda value for GAE.
    ds3_gather_for_generation (`bool`, *optional*, defaults to `True`):
        This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation,
        improving generation speed. However, disabling this option allows training models that exceed the VRAM
        capacity of a single GPU, albeit at the cost of slower generation.
NhelpzName of this experiment.)defaultmetadataexp_namezEleutherAI/pythia-160mzPath to the reward model.reward_model_pathzNName of the train target PEFT adapter, when using LoRA with multiple adapters.model_adapter_namezKName of the reference PEFT adapter, when using LoRA with multiple adapters.ref_adapter_name   zNumber of epochs to train.num_ppo_epochsFzWhether to whiten the rewards.whiten_rewardsg?zKL coefficient.kl_coefk1aW  Which estimator for KL-Divergence to use from Approximating KL Divergence (http://joschu.net/blog/kl-approx.html). Defaults to 'k1', a straightforward, unbiased estimator. Can be set to 'k3', an unbiased estimator with lower variance which 'appears to be a strictly better estimator'. Cannot be set to 'k2', as it is used for logging purposes.)r   k3kl_estimatorg?zClip range.	cliprangeg?zValue function coefficient.vf_coefz"Clip range for the value function.cliprange_valueg      ?zDiscount factor.gammagffffff?zLambda value for GAE.lamTa  This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation, improving generation speed. However, disabling this option allows training models that exceed the VRAM capacity of a single GPU, albeit at the cost of slower generation.ds3_gather_for_generation ) __name__
__module____qualname____firstlineno____doc__r   ospathbasename__file__r   str__annotations__r   r   r   r   r   intr   boolr   floatr   r   r   r   r   r   r   r    __static_attributes__r!       P/home/james-whalen/.local/lib/python3.13/site-packages/trl/trainer/ppo_config.pyr
   r
      s   -^   *3B/45Hc  #(56s  ).jk)  ',gh'hsm   67NC  !:;ND  +,GU  ). U
)L'*%  -(Iu  78GU  #>?OU  ,-E5  12C  ', a
't r1   r
   )
r'   dataclassesr   r   typingr   r   trainer.utilsr   r
   r!   r1   r2   <module>r6      s1    
 ( $ * p p pr1   