
    h7S                     X    S SK r S SKJrJr  S SKJrJr  S SKJr  \ " S S\5      5       r	g)    N)	dataclassfield)AnyOptional)TrainingArgumentsc                   z  ^  \ rS rSr% Sr\" SSS0S9r\\S'   \" SSS	0S9r	\\S
'   \" SSS0S9r
\\S'   \" SSS0S9r\\   \S'   \" SSS0S9r\\   \S'   \" SSS0S9r\\   \S'   \" SSS0S9r\\S'   \" SSS0S9r\\S'   \" SSS0S9r\\S'   \" SSS0S9r\\S '   \" SSS!0S9r\\   \S"'   \" SSS#0S9r\\   \S$'   \" SSS%0S9r\\S&'   \" SSS'0S9r\\   \S('   \" S)SS*0S9r\\S+'   \" SSS,0S9r\\   \S-'   \" SSS.0S9r\\   \S/'   \" S0 SS10S29r\\   \S3'   \" S4S5S4S6/S7.S9r \\S8'   \" SSS90S9r!\\S:'   \" S)SS;0S9r"\\S<'   \" S=SS>0S9r#\\S?'   \" SSS@0S9r$\\   \SA'   \" SBSSC0S9r%\\   \SD'   \" SESSF0S9r&\\SG'   \" SSSH0S9r'\\   \SI'   \" SJSSK0S9r(\\SL'   \" SMSSN0S9r)\\SO'   \" SPSSQ0S9r*\\SR'   \" SSSST0S9r+\\SU'   \" SSSV0S9r,\\SW'   \" SSSX0S9r-\\\\.4      \SY'   \" SSSZ0S9r/\\\      \S['   \" SSS\0S9r0\\   \S]'   \" SSS^0S9r1\\   \S_'   U 4S` jr2Sar3U =r4$ )bOnlineDPOConfig   u"  
Configuration class for the [`OnlineDPOTrainer`].

This class includes only the parameters that are specific to Online DPO training. For a full list of training
arguments, please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this
class may differ from those in [`~transformers.TrainingArguments`].

Using [`~transformers.HfArgumentParser`] we can turn this class into
[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
command line.

Parameters:
    reward_model_path (`str` or `None`, *optional*, defaults to `None`):
        Path to the reward model. Either `judge` or `reward_model_path` must be set, but not both.
    judge (`str` or `None`, *optional*, defaults to `None`):
        Name of the judge to use. Either `judge` or `reward_model_path` must be set, but not both.
    max_new_tokens (`int`, *optional*, defaults to `64`):
        Maximum number of tokens to generate per completion.
    max_length (`int`, *optional*, defaults to `256`):
        Maximum total length of the sequence (prompt + completion) used to compute log probabilities. If the
        sequence exceeds this limit, the leftmost tokens will be truncated to preserve as much of the completion as
        possible.
    temperature (`float`, *optional*, defaults to `0.9`):
        Temperature for sampling. The higher the temperature, the more random the completions.
    missing_eos_penalty (`float` or `None`, *optional*, defaults to `None`):
        Penalty applied to the score when the model fails to generate an EOS token. This is useful to encourage to
        generate completions shorter than the maximum length (`max_new_tokens`). The penalty must be a positive
        value. This parameter only works when using `reward_funcs` and not when using `judge`.
    beta (`float` or `list[float]`, *optional*, defaults to `0.1`):
        Parameter controlling the deviation from the reference model. Higher β means less deviation from the
        reference model. For the IPO loss (`loss_type="ipo"`), β is the regularization parameter denoted by τ in
        the [paper](https://huggingface.co/papers/2310.12036). If a list of floats is provided then the β is
        selected for each new epoch and the last β is used for the rest of the epochs.
    loss_type (`str`, *optional*, defaults to `"sigmoid"`):
        Type of loss to use. Possible values are:

            - `"sigmoid"`: sigmoid loss from the original [DPO](https://huggingface.co/papers/2305.18290) paper.
            - `"ipo"`: IPO loss from the [IPO](https://huggingface.co/papers/2310.12036) paper.

    dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
        Number of processes to use for processing the dataset.
    disable_dropout (`bool`, *optional*, defaults to `True`):
        Whether to disable dropout in the model and reference model.

    > Parameters that control generation

    top_p (`float`, *optional*, defaults to `1.0`):
        Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. Set to
        `1.0` to consider all tokens.
    top_k (`int` or `None`, *optional*, defaults to `None`):
        Number of highest probability vocabulary tokens to keep for top-k-filtering. If `None`, top-k-filtering is
        disabled and all tokens are considered.
    min_p (`float` or `None`, *optional*, defaults to `None`):
        Minimum token probability, which will be scaled by the probability of the most likely token. It must be a
        value between `0.0` and `1.0`. Typical values are in the `0.01-0.2` range.
    repetition_penalty (`float`, *optional*, defaults to `1.0`):
        Float that penalizes new tokens based on whether they appear in the prompt and the generated text so far.
        Values > `1.0` encourage the model to use new tokens, while values < `1.0` encourage the model to repeat
        tokens.
    use_transformers_paged (`bool`, *optional*, defaults to `False`):
        Whether to use the `transformers` paged implementation for generation. If set to `True`, the `transformers`
        paged implementation will be used for generation instead of the default padded implementation. This
        parameter is only effective when `use_vllm` is set to `False`.
    cache_implementation (`str` or `None`, *optional*, defaults to `None`):
        Implementation of the cache method for faster generation when `use_vllm` is set to `False`.
    generation_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`):
        Additional keyword arguments to pass to `GenerationConfig` (if using transformers) or `SamplingParams` (if
        using vLLM) when sampling completions. This can be used to further customize the generation behavior, such
        as setting `supress_tokens`, `num_beams`, etc. If it contains keys that conflict with the other generation
        parameters (like `min_p`, `top_p`, etc.), they will override them.

    > Parameters that control generation acceleration powered by vLLM

    use_vllm (`bool`, *optional*, defaults to `False`):
        Whether to use vLLM for generating completions. If set to `True`, the trainer will use vLLM for generation
        instead of the default model.generate(). Requires `vllm` to be installed.
    vllm_model_impl (`str`, *optional*, defaults to `"vllm"`):
        Model implementation to use for vLLM. Must be one of `"transformers"` or `"vllm"`. `"transformers"`: Use
        the `transformers` backend for model implementation. `"vllm"`: Use the `vllm` library for model
        implementation.
    vllm_mode (`str`, *optional*, defaults to `"server"`):
        Mode to use for vLLM integration when `use_vllm` is set to `True`. Must be one of `"server"` or
        `"colocate"`.

        - `"server"`: The trainer will send generation requests to a separate vLLM server. Make sure a TRL vLLM
          server is running (start with `trl vllm-serve`).
        - `"colocate"`: vLLM will run in the same process and share the training GPUs. This avoids the need for a
          separate server but may cause resource contention with training.
    vllm_guided_decoding_regex (`str` or `None`, *optional*, defaults to `None`):
        Regex for vLLM guided decoding. If `None` (default), guided decoding is disabled.

    > Parameters that control the vLLM server (only used when `vllm_mode` is `"server"`)

    vllm_server_base_url (`str` or `None`, *optional*, defaults to `None`):
        Base URL for the vLLM server (e.g., `"http://localhost:8000"`). If provided, `vllm_server_host` and
        `vllm_server_port` are ignored.
    vllm_server_host (`str`, *optional*, defaults to `"0.0.0.0"`):
        Host of the vLLM server to connect to. Ignored if `vllm_server_base_url` is provided.
    vllm_server_port (`int`, *optional*, defaults to `8000`):
        Port of the vLLM server to connect to. Ignored if `vllm_server_base_url` is provided.
    vllm_server_timeout (`float`, *optional*, defaults to `240.0`):
        Total timeout duration in seconds to wait for the vLLM server to be up. If the server is not up after the
        timeout, a `ConnectionError` is raised.

    > Parameters that control colocated vLLM execution (only used when `vllm_mode` is `"colocate"`)

    vllm_gpu_memory_utilization (`float`, *optional*, defaults to `0.55`):
        Control the GPU memory utilization for vLLM. This setting only applies when `vllm_mode` is set to
        `"colocate"`. If you are using `vllm_mode="server"`, this parameter must be passed separately when
        launching the vLLM server via the `--vllm_gpu_memory_utilization` flag.
    vllm_tensor_parallel_size (`int`, *optional*, defaults to `1`):
        Control the tensor parallel size for vLLM. This setting only applies when `vllm_mode` is set to
        `"colocate"`. If you are using `vllm_mode="server"`, this parameter must be passed separately when
        launching the vLLM server via the `--vllm_tensor_parallel_size` flag.

    > Other parameters

    ds3_gather_for_generation (`bool`, *optional*, defaults to `True`):
        This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation,
        improving generation speed. However, disabling this option allows training models that exceed the VRAM
        capacity of a single GPU, albeit at the cost of slower generation. Disabling this option is not compatible
        with vLLM generation.
    model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`):
        Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
        string.
gƠ>helpz$The initial learning rate for AdamW.)defaultmetadatalearning_rate
   zLog every X updates steps. Should be an integer or a float in range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps.logging_stepsTzZIf True, use gradient checkpointing to save memory at the expense of slower backward pass.gradient_checkpointingNzWhether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA architecture or Intel XPU or using CPU (use_cpu) or Ascend NPU. If not set, it defaults to `True` if `fp16` is not set.bf16zZPath to the reward model. Either `judge` or `reward_model_path` must be set, but not both.reward_model_pathzZName of the judge to use. Either `judge` or `reward_model_path` must be set, but not both.judge@   z4Maximum number of tokens to generate per completion.max_new_tokensi   zMaximum total length of the sequence (prompt + completion) used to compute log probabilities. If the sequence exceeds this limit, the leftmost tokens will be truncated to preserve as much of the completion as possible.
max_lengthg?zVTemperature for sampling. The higher the temperature, the more random the completions.temperatureg      ?zFloat that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. Set to 1.0 to consider all tokens.top_pzNumber of highest probability vocabulary tokens to keep for top-k-filtering. If `None`, top-k-filtering is disabled and all tokens are considered.top_kzMinimum token probability, which will be scaled by the probability of the most likely token. It must be a value between 0.0 and 1.0. Typical values are in the 0.01-0.2 range.min_pzFloat that penalizes new tokens based on whether they appear in the prompt and the generated text so far. Values > 1.0 encourage the model to use new tokens, while values < 1.0 encourage the model to repeat tokens.repetition_penaltya  Additional keyword arguments to pass to `GenerationConfig` (if using transformers) or `SamplingParams` (if using vLLM) when sampling completions. This can be used to further customize the generation behavior, such as setting `supress_tokens`, `num_beams`, etc. If it contains keys that conflict with the other generation parameters (like `min_p`, `top_p`, etc.), they will override them.generation_kwargsFa  Whether to use the `transformers` paged implementation for generation. If set to `True`, the `transformers` paged implementation will be used for generation instead of the default padded implementation. This parameter is only effective when `use_vllm` is set to `False`.use_transformers_pagedzWImplementation of the cache method for faster generation when use_vllm is set to False.cache_implementationzPenalty applied to the score when the model fails to generate an EOS token. This is useful to encourage to generate completions shorter than the maximum length (`max_new_tokens`). The penalty must be a positive value.missing_eos_penaltyc                      S/$ )Ng? r"       W/home/james-whalen/.local/lib/python3.13/site-packages/trl/trainer/online_dpo_config.py<lambda>OnlineDPOConfig.<lambda>	  s    r#   u  Parameter controlling the deviation from the reference model. Higher β means less deviation from the reference model. For the IPO loss (`loss_type='ipo'`), β is the regularization parameter denoted by τ in the [paper](https://huggingface.co/papers/2310.12036). If a list of floats is provided then the β is selected for each new epoch and the last β is used for the rest of the epochs.)default_factoryr   betasigmoidzType of loss to use.ipo)r   choices	loss_typez(Whether to disable dropout in the model.disable_dropoutzcWhether to use vLLM for generating completions. Requires vLLM to be installed (`pip install vllm`).use_vllmvllmzModel implementation to use for vLLM. Must be one of `transformers` or `vllm`. `transformers`: Use the `transformers` backend for model implementation. `vllm`: Use the `vllm` library for model implementation.vllm_model_implzQRegex for vLLM guided decoding. If `None` (default), guided decoding is disabled.vllm_guided_decoding_regexg?a  Control the GPU memory utilization for vLLM. This setting only applies when `vllm_mode` is set to `'colocate'`. If you are using `vllm_mode='server'`, this parameter must be passed separately when launching the vLLM server via the `--vllm_gpu_memory_utilization` flag.vllm_gpu_memory_utilizationservera  Mode to use for vLLM integration when `use_vllm` is set to `True`. Must be one of `'server'` or `'colocate'`. `'server'`: The trainer will send generation requests to a separate vLLM server. Make sure a TRL vLLM server is running (start with `trl vllm-serve`). `'colocate'`: vLLM will run in the same process and share the training GPUs. This avoids the need for a separate server but may cause resource contention with training.	vllm_modezBase URL for the vLLM server (e.g., 'http://localhost:8000'). If provided, `vllm_server_host` and `vllm_server_port` are ignored.vllm_server_base_urlz0.0.0.0zSHost of the vLLM server to connect to. Ignored if vllm_server_base_url is provided.vllm_server_hosti@  zSPort of the vLLM server to connect to. Ignored if vllm_server_base_url is provided.vllm_server_portg      n@zTotal timeout duration in seconds to wait for the vLLM server to be up. If the server is not up after the timeout, a `ConnectionError` is raised.vllm_server_timeout   a  Control the tensor parallel size for vLLM. This setting only applies when `vllm_mode` is set to `'colocate'`. If you are using `vllm_mode='server'`, this parameter must be passed separately when launching the vLLM server via the `--vllm_tensor_parallel_size` flag.vllm_tensor_parallel_sizeaS  This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation, improving generation speed. However, disabling this option allows training models that exceed the VRAM capacity of a single GPU, albeit at the cost of slower generation. Disabling this option is not compatible with vLLM generation.ds3_gather_for_generationzoKeyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a string.model_init_kwargszWeights for combining multiple reward functions. Must match the number of reward functions. If None, all reward functions are equally weighted.reward_weightsz6Number of processes to use for processing the dataset.dataset_num_proczuThis parameter is deprecated and will be removed in version 0.25.0. Please use `vllm_gpu_memory_utilization` instead.gpu_memory_utilizationc                   > U R                   c  U R                  (       + OU R                   U l         [        TU ]  5         U R                  b  [
        R                  " S5        U R                  b'  [
        R                  " S5        U R                  U l        [        U R                  S5      (       a/  [        U R                  5      S:X  a  U R                  S   U l
        g g g )NzThe parameter `dataset_num_proc` is deprecated and will be removed in version 0.25.0. Since OnlineDPO does not involve dataset preparation, you can safely remove it.zThe parameter `gpu_memory_utilization` is deprecated and will be removed in version 0.25.0. Please use `vllm_gpu_memory_utilization` instead.__len__r9   r   )r   fp16super__post_init__r>   warningswarnr?   r2   hasattrr(   len)self	__class__s    r$   rD   OnlineDPOConfig.__post_init__  s    '+yy'8Odii	  ,MMb &&2MMD 04/J/JD,499i((S^q-@		!DI .A(r#   )r(   r   r2   )5__name__
__module____qualname____firstlineno____doc__r   r   float__annotations__r   r   boolr   r   r   strr   r   intr   r   r   r   r   r   r   dictr   r   r    r(   listr,   r-   r.   r0   r1   r2   r4   r5   r6   r7   r8   r:   r;   r<   r   r=   r>   r?   rD   __static_attributes____classcell__)rJ   s   @r$   r	   r	      s    }@ !@AM5  ! D
M5  $)p
$D  ! !
D(4.  (-p
(x}  !p
E8C=   PQNC   &
J  rsK   1
E5  ! I
E8C=  # ]
E8E?  !&  
!  ). t
)x~  $) b
$D  +0st+(3-  ,1  
,%  % a
D$u+  *!5)
Is  "DEOT   $
Hd  ! $
OS  16mn1  49 V
4%   (
	Is 	 +0 2
+(3-  "opc  "opc  "' @
"  &+ T
&s  ', 6
't  38 
3xS#X/  -2 B
-NHT%[)  ',RS'hsm  /4 5
/HUO % %r#   r	   )
rE   dataclassesr   r   typingr   r   transformersr   r	   r"   r#   r$   <module>r]      s2     (   * %' % %r#   