
    cCi'5                         S SK JrJr  SSKJr  \(       a  SSKJr  SSKJrJ	r	J
r
Jr  SSKJr  \
" 5       (       a  S SKr\R                  " \5      r " S	 S
\5      rg)    )TYPE_CHECKINGOptional   )HfQuantizer   )PreTrainedModel)is_accelerate_availableis_fbgemm_gpu_availableis_torch_availablelogging)get_module_from_nameNc                      ^  \ rS rSrSrSrSrSS/rU 4S jrS r	SS
 jr
SSS\S	\4S jrSSSSS\SS4S jrSS jr S SSS\\\      4S jjrS\\   S\S	\\   4S jrS rS S jr\S	\4S j5       rSrU =r$ )!FbgemmFp8HfQuantizer!   z'
FP8 quantization using fbgemm kernels
TFz
fbgemm-gpu
acceleratec                 4   > [         TU ]  " U40 UD6  Xl        g N)super__init__quantization_config)selfr   kwargs	__class__s      f/home/james-whalen/.local/lib/python3.13/site-packages/transformers/quantizers/quantizer_fbgemm_fp8.pyr   FbgemmFp8HfQuantizer.__init__+   s    ,77#6     c                 v   [        5       (       d  [        S5      e[        5       (       d  [        S5      e[        S5      (       d  [        S5      e[        R
                  R                  5       (       d  [        S5      e[        R
                  R                  5       nUu  pEUS:  a  [        S5      eUR                  S5      nUc  [        R                  S	5        g Ub\  U R                  (       dJ  [        U[        5      (       a4  S
UR!                  5       ;   d  SUR!                  5       ;   a  [        S5      eg g g g )NzUsing fbgemm fp8 quantization requires torch >= 2.1.0Please install the latest version of torch ( pip install --upgrade torch )zUsing fbgemm fp8 quantization requires fbgemm-gpu libraryPlease install the latest version of fbgemm-gpu library by following : https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-librariesz0.32.2z`Loading an FP8 quantized model requires accelerate > 0.32.1 (`pip install --upgrade accelerate`)z=Using FP8 quantized models with fbgemm kernels requires a GPU	   zXFP8 quantized models is only supported on GPUs with compute capability >= 9.0 (e.g H100)
device_mapzYou have loaded an FP8 model on CPU and have a CUDA device available, make sure to set your model on a GPU device in order to run your model. To remove this warning, pass device_map = 'cuda'. cpudiskzYou are attempting to load an FP8 model with a device_map that contains a CPU or disk device.This is not supported when the model is quantized on the fly. Please use a quantized checkpoint or remove the CPU or disk device from the device_map.)r   ImportErrorr
   r	   torchcudais_availableRuntimeErrorget_device_capability
ValueErrorgetloggerwarning_oncepre_quantized
isinstancedictvalues)r   argsr   compute_capabilitymajorminorr   s          r   validate_environment)FbgemmFp8HfQuantizer.validate_environment/   s<   !##]  '((F 
 'x00r  zz&&((^__"ZZ==?)19j  ZZ-
| #&&z400j//11Vz?P?P?R5R n  6S 1 ' $r   returnc                     Uc(  [         R                  n[        R                  SU5        U$ U[         R                  :X  a  [        S5      eU$ )NzOverriding dtype=%s with `dtype=torch.bloat16` due to requirements of `fbgemm-gpu` to enable model loading in fp8. Pass your own dtype to specify the dtype of the remaining non-linear layers or pass dtype=torch.bfloat16 to remove this warning.zYYou cannot use FP8 with dtype=torch.float16.We recommend you passing dtype=torch.bfloat16)r#   bfloat16r*   infofloat16r(   )r   dtypes     r   update_dtype!FbgemmFp8HfQuantizer.update_dtype\   sS    =NNEKK@  	 emm#k  r   modelr   
param_namec                     SSK JnJn  [        X5      u  pg[	        Xd5      (       a  U R
                  (       d  US:X  a  gg[	        Xe5      (       a  U R
                  (       d  US:X  a  ggg)Nr   FbgemmFp8LinearFbgemmFp8Llama4TextExpertsbiasFT)integrationsrB   rC   r   r-   r,   )r   r>   r?   r   rB   rC   moduletensor_names           r   param_needs_quantization-FbgemmFp8HfQuantizer.param_needs_quantizationl   sW    N25Ef..!![F%:f99!![F%:r   param_valueztorch.Tensortarget_deviceztorch.devicec                 "   SSK JnJn  [        X5      u  p[	        X5      (       aX  U R
                  (       d  U	S:X  a0  U	S:X  a)  UR                  [        R                  :w  a  [        S5      eOU	S:X  a  [        S5      e[	        X5      (       a.  U R
                  (       d  U	S:X  d  U	S:X  d  U	S	:X  a  [        S5      e[	        X5      (       Ga  U	S
:X  a  UR                  SS5      n
U
R                  nU
R                  SUS   5      n[        R                  R                  R                  U5      u  pUR                  U5      nUR                  SS5      nUR                  US   SUS   5      nOU	S:X  a  UR                  SS5      n
U
R                  nU
R                  SUS   5      n[        R                  R                  R                  U5      u  pUR                  U5      nUR                  SS5      nUR                  US   US   S5      n[        R                   R#                  WR%                  U5      5      UR&                  U	 S3'   O[        R                  R                  R                  U5      u  nn[        R                   R#                  UR)                  UR                  S   S5      R%                  U5      5      UR&                  U	 S3'   [        R                   R#                  WR%                  U5      5      UR&                  U	'   Ag )Nr   rA   rD   weightz6Expect quantized weights but got an unquantized weightweight_scalez;Expect unquantized weights but got a quantized weight_scalegate_up_proj_scaledown_proj_scalegate_up_projr   r   	down_proj_scale)rE   rB   rC   r   r-   r,   r;   r#   float8_e4m3fnr(   	transposeshapereshapeopsfbgemmquantize_fp8_per_rownn	Parameterto_parametersview)r   r>   rJ   r?   rK   r   rB   rC   rF   rG   transposed_paramoriginal_shapeflattened_paramnew_value_flatweight_scale_flat	new_valuerN   s                    r   create_quantized_param+FbgemmFp8HfQuantizer.create_quantized_param}   s    	O25E f..!![F%:(*{/@/@EDWDW/W$%]^^.0$%bccf99&&+*?"66+IZ:Z$%bccf99n, $/#8#8A#>  "2!7!7"2":":2~b?Q"R 5:II4D4D4Y4YZi4j1 +22>B	%//15	0889JA~^_O`a+ $/#8#8A#>  "2!7!7"2":":2~b?Q"R 5:II4D4D4Y4YZi4j1 +22>B	%//15	0889JN[\L]_`a9>9K9KLOO\iLj9kF+f56&+ii&6&6&K&KK&X#I|9>9K9K!!,"4"4Q"7;>>}M:F+f56 +0((*<*<Y\\-=X*Y;'r   c                     U$ r    )r   r>   r   s      r   #_process_model_after_weight_loading8FbgemmFp8HfQuantizer._process_model_after_weight_loading   s    r   keep_in_fp32_modulesc           	         SSK Jn  UR                  nU R                  XR                  R
                  U5      U l        UR                  nU" UU R
                  U R                  U R                  UUS9nU R                  UR                  l        g )Nr   )replace_with_fbgemm_fp8_linear)modules_to_not_convertr   r,   configtp_plan)rE   ro   _tp_planget_modules_to_not_convertr   rp   rq   r,   )r   r>   rm   r   ro   rr   rq   s          r   $_process_model_before_weight_loading9FbgemmFp8HfQuantizer._process_model_before_weight_loading   s     	B..&*&E&E++BBDX'
# .#'#>#> $ 8 8,,
 ,0+C+C(r   missing_keysprefixc                 d   SSK JnJn  / nUR                  5        Ht  u  px[	        XU45      (       d  M  U HU  n	Xy;   d  Xs SU	 3;   d  M  U	R                  S5      (       a  M,  U	R                  S5      (       a  MD  UR                  U	5        MW     Mv     U V
s/ s H  oU;  d  M
  U
PM     sn
$ s  sn
f )Nr   rA   .z.weightz.bias)rE   rB   rC   named_modulesr-   endswithappend)r   r>   rw   rx   rB   rC   not_missing_keysnamerF   missingks              r   update_missing_keys(FbgemmFp8HfQuantizer.update_missing_keys   s    N!//1LD&4N"OPP+GDhay4I,I ' 0 0 ; ; ' 0 0 9 9(//8  , 2 (E<a4D+D<EEEs   	B-$B-c                    SUR                   R                  ;   ap  0 SS_SS_SS_SS_SS_SS_S	S
_SS_SS_SS_SS_SS_SS_SS_SS_SS
_SS_SSSSSS
SSSS.	EnUR                  5       b  X!R                  5       l        U$ X!l        U$ U$ )NLlama4z layers.*.self_attn.q_proj.weightlocal_colwisez&layers.*.self_attn.q_proj.weight_scalez layers.*.self_attn.k_proj.weightz&layers.*.self_attn.k_proj.weight_scalez layers.*.self_attn.v_proj.weightz&layers.*.self_attn.v_proj.weight_scalez layers.*.self_attn.o_proj.weightlocal_rowwisezlayers.*.self_attngatherzlayers.*.input_layernorm.weightsequence_parallelz(layers.*.post_attention_layernorm.weightznorm.weightz4layers.*.feed_forward.shared_expert.gate_proj.weightz:layers.*.feed_forward.shared_expert.gate_proj.weight_scalez2layers.*.feed_forward.shared_expert.up_proj.weightz8layers.*.feed_forward.shared_expert.up_proj.weight_scalez4layers.*.feed_forward.shared_expert.down_proj.weightzlayers.*.feed_forward.expertslocallocal_packed_rowwise)	zlayers.*.feed_forwardz0layers.*.feed_forward.experts.*.gate_proj.weightz6layers.*.feed_forward.experts.*.gate_proj.weight_scalez.layers.*.feed_forward.experts.*.up_proj.weightz4layers.*.feed_forward.experts.*.up_proj.weight_scalez0layers.*.feed_forward.experts.*.down_proj.weightz*layers.*.feed_forward.experts.gate_up_projz0layers.*.feed_forward.experts.gate_up_proj_scalez'layers.*.feed_forward.experts.down_proj)r   __name__get_text_configbase_model_tp_plan)r   rq   	text_plans      r   update_tp_plan#FbgemmFp8HfQuantizer.update_tp_plan   sQ   v''000$ 3O	$
 9/$ 3O$ 9/$ 3O$ 9/$ 3O$ %h$ 23F$ ;<O$ 2$& G'$( Mo)$* Eo+$, KO-$. G/$0 01$2 *2DSJYBQHWDS ?UDZ;JG$IJ %%'3>G&&(; M -6)Mr   c                     g)NTrj   )r   safe_serializations     r   is_serializable$FbgemmFp8HfQuantizer.is_serializable  s    r   c                     g)NFrj   )r   s    r   is_trainable!FbgemmFp8HfQuantizer.is_trainable   s    r   )rp   r   )r;   torch.dtyper6   r   )r>   r   r   )r   
__module____qualname____firstlineno____doc__ requires_parameters_quantizationrequires_calibrationrequired_packagesr   r4   r<   strboolrH   rg   rk   r   listru   r   r   r   propertyr   __static_attributes____classcell__)r   s   @r   r   r   !   s     (,$ %|47+Z .? S _c "D D $D 	D
 &DL 59D D 'tCy1D2FtCy F# FRVWZR[ F-^ d  r   r   )typingr   r   baser   modeling_utilsr   utilsr	   r
   r   r   quantizers_utilsr   r#   
get_loggerr   r*   r   rj   r   r   <module>r      sL    +  0 a a 2  
		H	%A; Ar   