
    cCi	%                         S SK JrJr  SSKJrJrJrJr  SSKJ	r	  SSK
Jr  \" 5       (       a  S SKr\(       a  SSKJr  \R                  " \5      r " S	 S
\	5      rg)    )TYPE_CHECKINGOptional   )is_accelerate_availableis_torch_availableis_torch_xpu_availablelogging   )HfQuantizer)get_module_from_nameN)PreTrainedModelc                      ^  \ rS rSrSrSrSrS/rU 4S jrS r	SS	 jr
S
SSSS\SS4S jrS
SS\S\4S jr SS
SS\\\      4S jjrS S jrS\\   S\S\\   4S jrS rSS jr\S\4S j5       rS rSrU =r$ )!FineGrainedFP8HfQuantizer   zz
FP8 quantization implementation supporting both standard and MoE models.
Supports both e4m3fn formats based on platform.
TF
acceleratec                 4   > [         TU ]  " U40 UD6  Xl        g N)super__init__quantization_config)selfr   kwargs	__class__s      k/home/james-whalen/.local/lib/python3.13/site-packages/transformers/quantizers/quantizer_finegrained_fp8.pyr   "FineGrainedFP8HfQuantizer.__init__   s    ,77#6     c                 <   [        5       (       d  [        S5      e[        5       (       d  [        S5      eUR                  SS5      (       d  UR                  SS5      (       a  [	        S5      e[
        R                  R                  5       (       d  [        5       (       d  [        S5      e[
        R                  R                  5       (       aF  [
        R                  R                  5       nUu  pEUS:  d  US:X  a  US	:  a  [	        S
U SU S35      eUR                  S5      nUc  [        R                  S5        g Ub\  U R                  (       dJ  [        U[        5      (       a4  SUR!                  5       ;   d  SUR!                  5       ;   a  [	        S5      eg g g g )NzxUsing fp8 quantization requires torch >= 2.1.0Please install the latest version of torch ( pip install --upgrade torch )zMLoading an FP8 quantized model requires accelerate (`pip install accelerate`)from_tfF	from_flaxzConverting into FP8 weights from tf/flax weights is currently not supported, please make sure the weights are in PyTorch format.zANo GPU or XPU found. A GPU or XPU is needed for FP8 quantization.   	   ziFP8 quantized models is only supported on GPUs with compute capability >= 8.9 (e.g 4090/H100), actual = `.`
device_mapzYou have loaded an FP8 model on CPU and have a CUDA or XPU device available, make sure to set your model on a GPU or XPU device in order to run your model. To remove this warning, pass device_map = 'cuda' or 'xpu'. cpudiskzYou are attempting to load an FP8 model with a device_map that contains a cpu/disk device.This is not supported when the model is quantized on the fly. Please use a quantized checkpoint or remove the cpu/disk device from the device_map.)r   ImportErrorr   get
ValueErrortorchcudais_availabler   RuntimeErrorget_device_capabilityloggerwarning_oncepre_quantized
isinstancedictvalues)r   argsr   compute_capabilitymajorminorr$   s          r   validate_environment.FineGrainedFP8HfQuantizer.validate_environment   s   !##] 
 '((mnn::i''6::k5+I+IF 
 

''))-C-E-Ebcc::""$$!&!A!A!C-LE	uzeai ##('5'4 
 ZZ-
6
 #&&z400j//11Vz?P?P?R5R k  6S 1 ' $r   returnc                 V    Uc%  [         R                  S5        [        R                  nU$ )NzKSetting dtype to torch.float32 as no dtype was specified in from_pretrained)r/   infor*   float32)r   dtypes     r   update_dtype&FineGrainedFP8HfQuantizer.update_dtypeN   s"    =KKefMMEr   modelr   param_valueztorch.Tensor
param_nametarget_deviceztorch.devicec                    SSK Jn  SSKJn  [	        X5      u  p[        X5      (       aX  U R                  (       d  U	S:X  a0  U	S:X  a)  UR                  [        R                  :w  a  [        S5      eOU	S:X  a  [        S5      eUR                  U5      n[        R                  " [        R                  5      R                  n
[        R                  " [        R                  5      R                  nU R                  R                   u  pUR"                  S	S  u  pX-  S
:w  d  X-  S
:w  a  [        SU SU SU SU S3	5      eUR"                  nUR%                  SX-  XU-  U5      R'                  S
SSSS5      n[        R(                  " [        R*                  " U5      SS9nUU-  nUR"                  nUR-                  S5      R-                  S5      n[        R.                  " UU-  XS9R                  [        R                  5      nUR'                  S
SSSS5      nUR%                  U5      nUR%                  U5      R1                  5       R3                  5       nU" XU5        U" XR5                  SS5      S
   S-   U5        g )Nr   	FP8Linear)_load_parameter_into_modelbiasweightz6Expect quantized weights but got an unquantized weightweight_scale_invz;Expect unquantized weights but got a quantized weight_scaler   zMatrix dimensions (z, z$) must be divisible by block sizes ()r
         )rO   rM   )dim)minmaxr"   z.weight_scale_inv)integrations.finegrained_fp8rH   modeling_utilsrI   r   r2   r1   r?   r*   float8_e4m3fnr)   tofinforS   rT   r   weight_block_sizeshapereshapepermuteamaxabs	unsqueezeclampsqueeze
reciprocalrsplit)r   rB   rC   rD   rE   r   rH   rI   moduletensor_namefp8_minfp8_maxblock_size_mblock_size_nrowscolsparam_value_orig_shapemax_absscalescale_orig_shapequantized_params                        r   create_quantized_param0FineGrainedFP8HfQuantizer.create_quantized_paramT   sT    	=? 35Ef((!![F%:(*{/@/@EDWDW/W$%]^^"44$%bcc!nn]3 ++e11266++e11266%)%=%=%O%O" &&rs+
!#t':a'?%dV2dV3WXdWeeghtguuvw  "-!2!2!))$lL4H,

'!Q1a
  	
 **UYY{3B'! ;;#--b1  ++kE&9wTWWX]XkXkl)11!Q1a@)112HI ./779DDF 	#5oF"5*;*;C*CA*FI\*\^cdr   c                 |    SSK Jn  [        X5      u  pV[        XT5      (       a  U R                  (       d  US:X  a  ggg)Nr   rG   rJ   FT)rU   rH   r   r2   r1   )r   rB   rD   r   rH   re   rf   s          r   param_needs_quantization2FineGrainedFP8HfQuantizer.param_needs_quantization   s6    <25Ef((!![F%:r   keep_in_fp32_modulesc                     SSK Jn  U R                  XR                  R                  U5      U l        U" UU R                  U R                  S9nU R                  UR
                  l        g )Nr   )replace_with_fp8_linearmodules_to_not_convertr   )rU   ry   get_modules_to_not_convertr   r{   config)r   rB   rw   r   ry   s        r   $_process_model_before_weight_loading>FineGrainedFP8HfQuantizer._process_model_before_weight_loading   sb     	K&*&E&E++BBDX'
# (#'#>#> $ 8 8
 ,0+C+C(r   c                     U$ r    )r   rB   r   s      r   #_process_model_after_weight_loading=FineGrainedFP8HfQuantizer._process_model_after_weight_loading   s    r   missing_keysprefixc                 \   SSK Jn  / nUR                  5        Hr  u  pg[        Xt5      (       d  M  U HU  nXh;   d  Xc SU 3;   d  M  UR	                  S5      (       a  M,  UR	                  S5      (       a  MD  UR                  U5        MW     Mt     U V	s/ s H  oU;  d  M
  U	PM     sn	$ s  sn	f )Nr   rG   r"   z.weightz.bias)integrationsrH   named_modulesr2   endswithappend)
r   rB   r   r   rH   not_missing_keysnamere   missingks
             r   update_missing_keys-FineGrainedFP8HfQuantizer.update_missing_keys   s    ,!//1LD&,,+GDhay4I,I ' 0 0 ; ; ' 0 0 9 9(//8  , 2 (E<a4D+D<EEEs   	B) B)c                     SUR                   R                  ;   a8  0 SS_SS_SS_SS_SS_SS_S	S
_SS
_SS_SS_SS_SS_SS_SS
_SS
_SS_nX!l        U$ )NQwen3z layers.*.self_attn.q_proj.weightlocal_colwisez*layers.*.self_attn.q_proj.weight_scale_invz layers.*.self_attn.k_proj.weightz*layers.*.self_attn.k_proj.weight_scale_invz layers.*.self_attn.v_proj.weightz*layers.*.self_attn.v_proj.weight_scale_invz layers.*.self_attn.o_proj.weightlocal_rowwisez*layers.*.self_attn.o_proj.weight_scale_invzlayers.*.self_attngatherzlayers.*.mlp.gate_proj.weightz'layers.*.mlp.gate_proj.weight_scale_invzlayers.*.mlp.up_proj.weightz%layers.*.mlp.up_proj.weight_scale_invzlayers.*.mlp.down_proj.weightz'layers.*.mlp.down_proj.weight_scale_invzlayers.*.mlp)r   __name__base_model_tp_plan)r   r}   	text_plans      r   update_tp_plan(FineGrainedFP8HfQuantizer.update_tp_plan   s    f&&///2O<o 3O =o	
 3O =o 3O =o %h 0 :? . 8 0 :?  !I& )2%r   c                     g)NTr   )r   safe_serializations     r   is_serializable)FineGrainedFP8HfQuantizer.is_serializable   s    r   c                     g)NFr   r   s    r   is_trainable&FineGrainedFP8HfQuantizer.is_trainable   s    r   c                     g)Nr   r   r   s    r   get_accelerator_warm_up_factor8FineGrainedFP8HfQuantizer.get_accelerator_warm_up_factor   s    r   rz   )r?   torch.dtyper;   r   r   )rB   r   )r   
__module____qualname____firstlineno____doc__ requires_parameters_quantizationrequires_calibrationrequired_packagesr   r9   r@   strrr   boolru   r   listr~   r   r   r   r   propertyr   r   __static_attributes____classcell__)r   s   @r   r   r      s    
 (,$ %7-^;e ;e $;e 	;e
 &;ez	.? 	S 	_c 	 59D D 'tCy1D(FtCy F# FRVWZR[ F2 d   r   r   )typingr   r   utilsr   r   r   r	   baser   quantizers_utilsr   r*   rV   r   
get_loggerr   r/   r   r   r   r   <module>r      sF    * ` `  2 0			H	%S Sr   