
    h)                     P    S SK JrJr  S SKJrJr  S SKJr  \ " S S\5      5       rg)    )	dataclassfield)AnyOptional)TrainingArgumentsc                     ^  \ rS rSr% Sr\R                  S/-   r\" SSS0S9r\	\
S'   \" S	SS
0S9r\	\
S'   \" SSS0S9r\\
S'   \" SSS0S9r\\   \
S'   \" SSS0S9r\\   \
S'   \" SSS0S9r\\   \
S'   \" SSS0S9r\\   \
S'   \" SSS0S9r\	\
S'   \" SSS0S9r\	\
S'   \" S S!/ S"QS#.S9r\\
S$'   \" SSS%0S9r\\
S&'   \" S'SS(0S9r\	\
S)'   \" S*SS+0S9r\	\
S,'   \" SSS-0S9r\	\
S.'   \" S/SS00S9r\\
S1'   \" SSS20S9r\\   \
S3'   \" S4S5S4S6/S#.S9r\\
S7'   \" S8SS90S9r\\
S:'   \" SSS;0S9r \\   \
S<'   \" SSS=0S9r!\\"\\#4      \
S'   \" SSS>0S9r$\\   \
S?'   U 4S@ jr%SAr&U =r'$ )B	CPOConfig   u  
Configuration class for the [`CPOTrainer`].

This class includes only the parameters that are specific to CPO training. For a full list of training arguments,
please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may
differ from those in [`~transformers.TrainingArguments`].

Using [`~transformers.HfArgumentParser`] we can turn this class into
[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
command line.

Parameters:
    max_length (`int` or `None`, *optional*, defaults to `1024`):
        Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want
        to use the default data collator.
    max_prompt_length (`int` or `None`, *optional*, defaults to `512`):
        Maximum length of the prompt. This argument is required if you want to use the default data collator.
    max_completion_length (`int` or `None`, *optional*, defaults to `None`):
        Maximum length of the completion. This argument is required if you want to use the default data collator
        and your model is an encoder-decoder.
    beta (`float`, *optional*, defaults to `0.1`):
        Parameter controlling the deviation from the reference model. Higher β means less deviation from the
        reference model. For the IPO loss (`loss_type="ipo"`), β is the regularization parameter denoted by τ in
        the [paper](https://huggingface.co/papers/2310.12036).
    label_smoothing (`float`, *optional*, defaults to `0.0`):
        Label smoothing factor. This argument is required if you want to use the default data collator.
    loss_type (`str`, *optional*, defaults to `"sigmoid"`):
        Type of loss to use. Possible values are:

            - `"sigmoid"`: sigmoid loss from the original [DPO](https://huggingface.co/papers/2305.18290) paper.
            - `"hinge"`: hinge loss on the normalized likelihood from the
              [SLiC](https://huggingface.co/papers/2305.10425) paper.
            - `"ipo"`: IPO loss from the [IPO](https://huggingface.co/papers/2310.12036) paper.
            - `"simpo"`: SimPO loss from the [SimPO](https://huggingface.co/papers/2405.14734) paper.
            - `"alphapo"`: AlphaPO loss from the [AlphaPO](https://huggingface.co/papers/2501.03884) paper. This
              automatically sets `loss_type="simpo"` and `cpo_alpha=0.0`.

    disable_dropout (`bool`, *optional*, defaults to `True`):
        Whether to disable dropout in the model.
    cpo_alpha (`float`, *optional*, defaults to `1.0`):
        Weight of the BC regularizer in CPO training.
    simpo_gamma (`float`, *optional*, defaults to `0.5`):
        Target reward margin for the SimPO loss, used only when the `loss_type="simpo"`.
    alpha (`float`, *optional*, defaults to `0.0`):
        Alpha parameter that controls reward function shape across all loss types. When alpha=0 (default), uses
        standard log probability rewards. When `alpha != 0`, applies AlphaPO transformation: `r = (1 - p^(-alpha))
        / alpha` from the [AlphaPO paper](https://huggingface.co/papers/2501.03884). This parameter works with all
        loss types.
    label_pad_token_id (`int`, *optional*, defaults to `-100`):
        Label pad token id. This argument is required if you want to use the default data collator.
    padding_value (`int` or `None`, *optional*, defaults to `None`):
        Padding value to use. If `None`, the padding value of the tokenizer is used.
    truncation_mode (`str`,*optional*,  defaults to `"keep_end"`):
        Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`.
        This argument is required if you want to use the default data collator.
    generate_during_eval (`bool`, *optional*, defaults to `False`):
        If `True`, generates and logs completions from the model to W&B or Comet during evaluation.
    is_encoder_decoder (`bool` or `None`, *optional*, defaults to `None`):
        When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument,
        you need to specify if the model returned by the callable is an encoder-decoder model.
    model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`):
        Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
        string.
    dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
        Number of processes to use for processing the dataset.
model_init_kwargsgư>helpz$The initial learning rate for AdamW.)defaultmetadatalearning_rate
   zLog every X updates steps. Should be an integer or a float in range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps.logging_stepsTzZIf True, use gradient checkpointing to save memory at the expense of slower backward pass.gradient_checkpointingNzWhether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA architecture or Intel XPU or using CPU (use_cpu) or Ascend NPU. If not set, it defaults to `True` if `fp16` is not set.bf16i   zCMaximum length of the sequences (prompt + completion) in the batch.
max_lengthi   zMaximum length of the prompt. This argument is required if you want to use the default data collator and your model is an encoder-decoder.max_prompt_lengthzMaximum length of the completion. This argument is required if you want to use the default data collator and your model is an encoder-decoder.max_completion_lengthg?uv   Parameter controlling the deviation from the reference model. Higher β means less deviation from the reference model.beta        zLabel smoothing factor.label_smoothingsigmoidzType of loss to use.)r   hingeiposimpoalphapo)r   choices	loss_typez(Whether to disable dropout in the model.disable_dropoutg      ?z-Weight of the BC regularizer in CPO training.	cpo_alphag      ?zPTarget reward margin for the SimPO loss, used only when the `loss_type='simpo'`.simpo_gammaa  Alpha parameter that controls reward function shape across all loss types. When alpha=0 (default), uses standard log probability rewards. When `alpha != 0`, applies AlphaPO transformation: `r = (1 - p^(-alpha)) / alpha` from the AlphaPO paper. This parameter works with all loss types.alphaizLabel pad token id.label_pad_token_idzLPadding value to use. If `None`, the padding value of the tokenizer is used.padding_valuekeep_endz3Truncation mode to use when the prompt is too long.
keep_starttruncation_modeFzRIf `True`, generates and logs completions from the model to W&B during evaluation.generate_during_evalz.Whether the model is an encoder-decoder model.is_encoder_decoderzoKeyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a string.z6Number of processes to use for processing the dataset.dataset_num_procc                    > U R                   c  U R                  (       + OU R                   U l         U R                  S:X  a  SU l        SU l        [        TU ]  5         g )Nr   r   r   )r   fp16r    r"   super__post_init__)self	__class__s    P/home/james-whalen/.local/lib/python3.13/site-packages/trl/trainer/cpo_config.pyr0   CPOConfig.__post_init__   sF    '+yy'8Odii	 >>Y&$DN DN    )r   r"   r    )(__name__
__module____qualname____firstlineno____doc__r   _VALID_DICT_FIELDSr   r   float__annotations__r   r   boolr   r   r   intr   r   r   r   r    strr!   r"   r#   r$   r%   r&   r)   r*   r+   r   dictr   r,   r0   __static_attributes____classcell__)r2   s   @r3   r	   r	      s   AF +==AT@UU !@AM5  ! D
M5  $)p
$D  ! !
D(4.  !&_`!J  (- =
(x}  ,1 =
,8C=   #
D%  #34OU  *F
Is  "DEOT  IJIu  lmK   o
E5  $/0  $)hi$M8C=  !I"L1
OS  "'no"$  */JK*  38 
3xS#X/  ',RS'hsm 
   r5   r	   N)	dataclassesr   r   typingr   r   transformersr   r	    r5   r3   <module>rH      s/    )   * D ! D  D r5   