
"""
This module contains the implementation of the LoRA-FA optimizer.
"""

from __future__ import annotations

import math
from collections.abc import Iterable
from typing import Callable

import torch
import torch.nn as nn
from accelerate.utils.imports import is_bf16_available
from torch import autocast
from torch.optim import Optimizer

from ..peft_model import PeftModel
from ..utils.other import infer_device


class LoraFAOptimizer(Optimizer):
    """
    Implements the LoRA-FA optimizer designed specifically for training Low-Rank Adaptation (LoRA) parameters
    efficiently. Note that LoraFAOptimizer is based on adamw-hf in transformers, with only the LoRA part modified.
    Without LoRA it will fall back to adamw-hf.

    Args:
        params (Iterable[nn.parameter.Parameter]): Parameters to optimize.
        lr (float, optional): Learning rate (default: 1e-3).
        betas (Tuple[float, float], optional):
            Coefficients for computing running averages of gradient and squared gradient (default: (0.9, 0.999)).
        eps (float, optional): Term added to denominator to improve numerical stability (default: 1e-6).
        weight_decay (float, optional): Weight decay (L2 penalty) (default: 0.0).
        correct_bias (bool, optional): Whether to apply bias correction as in original Adam (default: True).

    Args in sub-function step:
        closure (Callable, optional): A closure that reevaluates the model and returns the loss.

    Reference:
        - LoRA-FA: https://huggingface.co/papers/2308.03303
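
    Example (a minimal sketch; `peft_model` and `batch` are placeholders for a LoRA-wrapped model and a batch of
    inputs, and the hyper-parameters are illustrative. `create_lorafa_optimizer` below builds the parameter groups,
    including the extra `names` and `scaling_factor` entries that `step` relies on):

    ```python
    optimizer = create_lorafa_optimizer(model=peft_model, r=8, lora_alpha=16, lr=3e-4)

    loss = peft_model(**batch).loss  # placeholder forward pass (transformers-style output with a .loss attribute)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    ```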
c                  > US:  a  [        SU S35      eSUS   s=::  a  S:  d  O  [        SUS    S35      eSUS   s=::  a  S:  d  O  [        SUS    S35      eSU::  d  [        S	U S35      eUUUUUS
.n[        TU ]	  X5        g )N        zInvalid learning rate: z - should be >= 0.0r         ?zInvalid beta parameter: z - should be in [0.0, 1.0)   zInvalid epsilon value: )lrbetasepsweight_decaycorrect_bias)
ValueErrorsuper__init__)	selfparamsr   r   r   r   r   defaults	__class__s	           P/home/james-whalen/.local/lib/python3.13/site-packages/peft/optimizers/lorafa.pyr   LoraFAOptimizer.__init__9   s     86rd:MNOOeAh$$7azA[\]]eAh$$7azA[\]]cz6se;NOPP((
 	*    c           
     l	   SnUb  U" 5       nU R                    GH  nUS   n/ n/ n[        US   US   5       GH`  u  pxSU;  a  UR                  c  M  UR                  n	SU;   a[  UR                  U5        UR                  U5        [	        U5      S:X  a  USUR                  S5       S-   n
O[	        U5      S:X  a  M  OUn
U R                  W
   n[	        U5      S:X  a  [	        U5      S:X  a>  SUS	'   [        R                  " US   5      US
'   [        R                  " US   5      US'   O7SUS	'   [        R                  " U5      US'   [        R                  " U5      US'   [	        U5      S:X  Ga'  US   nUS   nUR                  nSnXR                  -  n[        R                  R                  UU[        R                  " UR                  S   5      R                  UR                  5      -  -   5      n[!        5       n[#        5       (       a0  [%        U[        R&                  S9   SUS-  -  UU-  -  nSSS5        OSUS-  -  UU-  -  nWR(                  UR                  R(                  :w  a%  UR                  UR                  R(                  5      nUS
   US   nnUS   u  nnUS	==   S-  ss'   UR+                  U5      R-                  USU-
  S9  UR+                  U5      R/                  UUSU-
  S9  UR1                  5       R-                  US   5      nUS   nUS   (       a2  SUUS	   -  -
  nSUUS	   -  -
  nU[2        R0                  " U5      -  U-  nUR5                  UUU* S9  US   S:  a  UR-                  XS   * US   -  S9  / n/ nGMc  US   US   nnUS   u  nnUS	==   S-  ss'   UR+                  U5      R-                  U	SU-
  S9  UR+                  U5      R/                  XSU-
  S9  UR1                  5       R-                  US   5      nUS   nUS   (       a2  SUUS	   -  -
  nSUUS	   -  -
  nU[2        R0                  " U5      -  U-  nUR5                  UUU* S9  US   S:  d  GMG  UR-                  XsS   * US   -  S9  GMc     GM     U$ ! , (       d  f       GNc= f)z
Performs a single optimization step.

Arguments:
    closure (`Callable`, *optional*): A closure that reevaluates the model and returns the loss.
Nscaling_factorr   nameslorar	   r   r   step	exp_avg_Bexp_avg_sq_Bexp_avg
exp_avg_sqg:0yE>)device_typedtyper   r   )alpha)valuer   r   r   r   r   )param_groupszipgradappendlenfindstatetorch
zeros_likeTlinalgpinveyeshapetodevicer   r   r   bfloat16r,   mul_add_addcmul_sqrtmathaddcdiv_)r   closurelossgroupr#   
param_list	name_listpnr1   namer5   ABgrad_B_orindeltaAA_TAA_T_invr+   grad_Br'   r(   beta1beta2denom_B	step_sizebias_correction1bias_correction2r)   r*   denoms                                  r   r&   LoraFAOptimizer.stepS   s    9D&&E"#34NJIE(OU7^<?qvv~vvQ;%%a($$Q':!+ !1166&>2V;ZA-  . D 

4(u:?:!+()f-2-=-=jm-Lk*050@0@A0On-()f+0+;+;A+>i(.3.>.>q.Al+ z?a'"1A"1A"#&&K !E ss7D$||00		!''RS*@U@X@XYZYaYa@b8b1bcH"..K(**%+U^^T&'.!*;&;h@V%WF UT #$na&7"7K(<R!S||qvv||3!'166<<!8.3K.@%BW|I#(>LE5&MQ&MNN5)..vcEk.K %%e,55ffCRWK5X*//166uU|DG %dI^,+.%-1G+G(+.%-1G+G($-		:J0K$KN^$^	JJy')JD^,s2q+n8M)MO!#J "I +0	*:E,<OZG#(>LE5&MQ&M LL',,T#+,GOOE*33DcEk3R&OO-225<@E %dI^,+.%-1G+G(+.%-1G+G($-		:J0K$KN^$^	JJwiZJ@ ^,s2q+n8M)MO_ =	 'j u UTs   *R$$
R3 )gMbP?g?g+?gư>r   T)r   z Iterable[nn.parameter.Parameter]r   floatr   ztuple[float, float]r   r_   r   r_   r   bool)N)rF   r   )__name__
__module____qualname____firstlineno____doc__r   r6   no_gradr&   __static_attributes____classcell__)r   s   @r   r   r   #   sy    0 %1!!+0+ + #	+
 + + + +4 ]]_@ @r!   r   c                F   U R                  5        H  u  pgSU;   d  M  UR                  S5        M      U(       a  U[        R                  " U5      -  OX!-  nU R	                  5       UU R                  5        VV	s/ s H  u  piUPM	     sn	nUSUS./n
[        U
5      $ s  sn	nf )a  
    Helper function to instantiate a lorafa optimizer specifically configured for a given model using the LoRA method.

    This function will:
    - Disable gradient updates for the "lora_A" parameters (these are typically frozen during LoRA training).
    - Compute the scaling factor based on the provided `lora_alpha` and rank `r` for proper gradient projection.
    - Create and configure parameter groups for the optimizer, including the specified learning rate, weight decay,
      and additional optimizer options.

    For hyper-params, LoRA-FA uses the same hyper-params as AdamW, except for the LoRA hyper-params (r, lora_alpha,
    use_rslora). One can use the same hyper-params, such as lr and weight_decay, as one would use for AdamW in LoRA
    tuning.

    Args:
        model (PeftModel): The model containing LoRA-adapted parameters.
        r (int): Rank of the LoRA decomposition.
        lora_alpha (int): Scaling factor for LoRA parameterization.
        lr (float): Learning rate for optimizer updates.
        weight_decay (float): Weight decay for AdamW.
        use_rslora (bool):
            Whether to use rslora. With rslora, the LoRA scaling factor becomes lora_alpha / math.sqrt(r) instead of
            lora_alpha / r.

    Returns:
        Optimizer: Configured lorafa optimizer instance ready for training.
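
    Example (a minimal sketch; the base checkpoint, LoRA configuration, and hyper-parameters below are illustrative
    placeholders):

    ```python
    from transformers import AutoModelForCausalLM

    from peft import LoraConfig, get_peft_model

    base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")  # placeholder base model
    peft_model = get_peft_model(base_model, LoraConfig(r=8, lora_alpha=16))
    optimizer = create_lorafa_optimizer(model=peft_model, r=8, lora_alpha=16, lr=3e-4)
    ```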
    """
    for name, param in model.named_parameters():
        # lora_A is kept frozen in LoRA-FA; only lora_B (and any non-LoRA trainable params) receive updates.
        if "lora_A" in name:
            param.requires_grad_(False)

    lora_scaling = lora_alpha / math.sqrt(r) if use_rslora else lora_alpha / r

    param_groups = [
        {
            "params": model.parameters(),
            "names": [name for name, _ in model.named_parameters()],
            "scaling_factor": lora_scaling,
            "lr": lr,
            "weight_decay": weight_decay,
        }
    ]
    return LoraFAOptimizer(param_groups)