
    hJ                     b   S SK r S SKJr  S SKJr  S SKrS SKJr  S SKJr  S SKJ	r
  S SKJr  SPS jr\ " S S	5      5       r\" S
SSSSS9\" SSSS9\" SSSS9\" SSSS9\" SSSSSSSS9\" SSSSSS9\" SSSSSSS 9\" S!S!S"S9\" S#S#S$S9\" SSSSSSS%S&S'9\" S(SSSSSS%S&S)S*9	\" S(SSSSSS%S&S)S*9	\" S(S+S,SS
S-S%S&S)S*9	\" S(S.S/SS0SS%S&S)S)S19
S2.rS3r " S4 S5\R$                  5      rS S6KJr   " S7 S8\R$                  5      r " S9 S:\R$                  5      r " S; S<\R$                  5      r " S= S>\R$                  5      r " S? S@\R$                  5      r " SA SB\R$                  5      rSC\R                  4SD jrSE\R:                  S34SF\SG\SH\SI\R>                  SJ\ SK\4SL jjr!SM\SN\SK\4SO jr"g)Q    N)	dataclass)Optional)Tensor)
functional)find_multiplec                 8   U R                  5       S:  a  [        SU R                  5        35      e[        R                  " SU R                  S   U R
                  S9R                  [        R                  5      nU R                  SU R                  S   5      U4$ )N   z,Expected input to be of dim 1 or 2, but got r   device)	dim
ValueErrortorcharangeshaper   toint32view)inpsmax_new_tokens	input_poss      U/home/james-whalen/.local/lib/python3.13/site-packages/torchao/_models/llama/model.pyprepare_inputs_for_modelr      su    xxzA~G
|TUU Q

2t{{CFFu{{SIIIb$**R.)955    c                       \ rS rSr% Sr\\S'   Sr\\S'   Sr\\S'   Sr	\\S'   S	r
\\S
'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   S r\S\4S j5       rSrg)	ModelArgs   i   
block_size }  
vocab_size    n_layern_head   r   Nintermediate_sizer
   n_local_heads@   head_dim'  	rope_baseh㈵>norm_epsFuse_scaled_ropetie_word_embeddingsc                     U R                   S:X  a  U R                  U l         U R                  c1  SU R                  -  n[	        SU-  S-  5      n[        US5      U l        U R                  U R                  -  U l        g )Nr
      r	         )r&   r#   r%   r   intr   r(   )self
hidden_dimn_hiddens      r   __post_init__ModelArgs.__post_init__,   sk    #!%D!!)TXXJ1z>A-.H%28S%AD"DKK/r   namec                 |   U[         ;   a  U " S0 [         U   D6$ [          Vs/ s H3  nU[        U5      R                  5       ;   d  U[        U5      ;   d  M1  UPM5     snn[        U5      S:  a9  UR	                  [        SS9  [        US   5      [        US   5      :w  d   U5       eU " S0 [         US      D6$ s  snf )N   T)keyreverser    )transformer_configsstrupperlensort)clsr9   configs      r   	from_nameModelArgs.from_name5   s    &&3,T233 .
-T**fD	.A -
 v;?KKCK.vay>S^3 3 4(344
s   0B9B9)r(   r%   r&   )__name__
__module____qualname____firstlineno__r   r3   __annotations__r    r"   r#   r   r%   r&   r(   r*   floatr,   r-   boolr.   r7   classmethodr@   rF   __static_attributes__r>   r   r   r   r      s    JJGSFCCO!s!M3HcIuHe!OT! %%0 5S 5 5r   r   i @  r   r!   r$   i@B )r   r    r"   r   r*   )r"   r#   r   (   i   <   4   i   0   r'          i V  )r"   r#   r   r    r&   r%   r*   P   i p  )r"   r#   r   r&   r%   i 8  )r"   r#   r&   r   r%   r       i      i   i  i  )r   r"   r#   r&   r   r%   r    r*   i   T)	r   r"   r#   r&   r   r%   r    r*   r-   ~      i         i   )
r   r"   r#   r&   r   r%   r    r*   r-   r.   )zCodeLlama-7b-Python-hf7B13B30B34B70Bz
Mistral-7B
stories15Mstories110Mz
Llama-3-8BzLlama-3.1-8BzLlama-3.1-70BzLlama-3.1-405BzLlama-3.2-3BFc                   H   ^  \ rS rSr\R
                  4U 4S jjrS rSrU =r	$ )KVCache   c                    > [         TU ]  5         XX$4nU R                  S[        R                  " XeS95        U R                  S[        R                  " XeS95        g )Nk_cachedtypev_cache)super__init__register_bufferr   zeros)r4   max_batch_sizemax_seq_lengthn_headsr(   rk   cache_shape	__class__s          r   rn   KVCache.__init__   sL     	%IYK(MNYK(MNr   c                    UR                   S   UR                   S   :X  d   e[        (       as  [        R                  R                  R                  U R                  S S U/U5      n[        R                  R                  R                  U R                  S S U/U5      nXE4$ U R                  nU R                  nX$S S 2S S 2U4'   X5S S 2S S 2U4'   XE4$ )Nr   r	   )r   use_index_put_for_kv_cacher   opsaten
index_put_ri   rl   )r4   r   k_valv_valk_outv_outs         r   updateKVCache.update   s    q!U[[^333%%IINN--tT95uE IINN--tT95uE | LLELLE%*!Q	/"%*!Q	/"|r   r>   )
rH   rI   rJ   rK   r   bfloat16rn   r   rP   __classcell__ru   s   @r   rf   rf      s    GL~~O r   rf   )%_quantize_activation_per_token_absmaxc                   X   ^  \ rS rSr\R
                  4U 4S jjrS r\S 5       r	Sr
U =r$ )AffineQuantizedKVCache   c                   > [         TU ]  5         XX$4nXUS4nU R                  S[        R                  " U[        R
                  S95        U R                  S[        R                  " U[        R
                  S95        U R                  S[        R                  " XuS95        U R                  S[        R                  " XuS95        g )Nr;   ri   rj   rl   k_cache_scalev_cache_scale)rm   rn   ro   r   rp   int8ones)	r4   rq   rr   rs   r(   scale_dtypert   scale_shaperu   s	           r   rn   AffineQuantizedKVCache.__init__   s     	%I%BYKuzz(RSYKuzz(RSUZZG	
 	UZZG	
r   c                    [        U5      u  pEX@R                  S S 2S S 2U4'   UR                  S5      U R                  S S 2S S 2U4'   U R                  U R                  -  nX&S S 2S S 2U4'   [        U5      u  pxXpR                  S S 2S S 2U4'   UR                  S5      U R
                  S S 2S S 2U4'   U R                  U R
                  -  n	X9S S 2S S 2U4'   Xi4$ )Nr
   )r   ri   	unsqueezer   rl   r   )
r4   r   r|   r}   q_k_valk_scaler~   q_v_valv_scaler   s
             r   r   AffineQuantizedKVCache.update   s    @G(/Q9_%.5.?.?.C1a?+t111!&aIo@G(/Q9_%.5.?.?.C1a?+t111!&aIo|r   c                 x    UR                   R                  nUu  p4pVUR                   R                  nU " X5XFU5      $ N)ri   r   rk   )rD   kv_cachert   rq   rs   rr   r(   r   s           r   
from_float!AffineQuantizedKVCache.from_float   s=    &&,,<G9&&,,>7kRRr   r>   )rH   rI   rJ   rK   r   r   rn   r   rO   r   rP   r   r   s   @r   r   r      s,     NN
(  S Sr   r   c                      ^  \ rS rSrS\SS4U 4S jjr    SS\4S jjrS rSS	\	S
\
\	   S\	4S jjr\S\4S j5       rSrU =r$ )Transformer   rE   returnNc                   >^ [         TU ]  5         TU l        [        R                  " TR
                  TR                  5      U l        [        R                  " U4S j[        TR                  5       5       5      U l        [        TR                  TR                  S9U l        [        R                  " TR                  TR
                  SS9U l        S U l        S U l        SU l        SU l        g )Nc              3   :   >#    U  H  n[        T5      v   M     g 7fr   )TransformerBlock).0_rE   s     r   	<genexpr>'Transformer.__init__.<locals>.<genexpr>   s      $
.CV$$.Cs   )epsFbiasr
   )rm   rn   rE   nn	Embeddingr    r   tok_embeddings
ModuleListranger"   layersRMSNormr,   normLinearoutput	freqs_cis
mask_cacherq   rr   r4   rE   ru   s    `r   rn   Transformer.__init__   s     ll6+<+<fjjImm $
.3FNN.C$
 
 FJJFOO<	ii

F,=,=EJ+/,0  r   trainingc           	         U R                   U:  a  U R                  U:  a  g U R                  R                  U R                  R                  -  n[        US5      nX l         Xl        S n[        U R                  S5      (       a   U R                  R                  R                  n[        U R                  S5      (       a!  U R                  R                  R                  nO;[        U R                  S5      (       a   U R                  R                  R                  nXPl        U R                  (       dR  [        R                  " [        R                  " U R                   U R                   [        R                   S95      U l        O`Ub  US:  d   S5       e[        R$                  " SSSU R                   [        R                   S9U l        SU R"                  S S 2S S 2S S 2S U24'   U(       d  U R&                   H  n	U(       a  [        R(                  " S5         [+        UUU R                  R,                  UU5      U	R.                  l        S S S 5        [2        R5                  U	R.                  R0                  5      U	R.                  l        M  [+        UUU R                  R,                  UU5      U	R.                  l        M     [7        U R                  R8                  U R                  R                  U R                  R                  -  U R                  R:                  UU R                  R<                  S	9U l        g ! , (       d  f       N= f)
NrV   weightscalesscales_and_zerosrj   r;   zLneed to set prompt_length>1 to use non quadratic causal mask in setup_cachesmeta)
use_scaled) rr   rq   rE   r   r#   r   hasattrr   r   rk   r   r   linear_causal_maskr   trilr   rN   causal_maskrp   r   r   rf   r&   	attentionr   r   r   precompute_freqs_cisr   r*   r-   r   )
r4   rq   rr   r   kv_cache_quantizationr   prompt_lengthr(   rk   bs
             r   setup_cachesTransformer.setup_caches  sr    >1##~5;;??dkk&8&88&~q9,,4;;))KK&&,,E4;;))KK&&,,ET[["455KK0066E"4&&$zz

4..0C0C5::V D !,1B ^B  %{{1a,,EJJ D 9:DQ1n}n45[[(f-/6** KK55$!0, . ,B+L+L,,,AKK( ,3&&11 ,AKK( !* .KK""KKOOt{{111KK!!{{22
' .-s   3L<<
M
	c                 <    SU l         SU l        SU l        SU l        g)ztReset caches.

The caches used by training stage and inference stage may be different, reset them before switching.
r
   N)rq   rr   r   r   )r4   s    r   reset_cachesTransformer.reset_cachesL  s"    
 ! +/,0r   idxr   c           	         U R                   c   S5       eUc  SnU R                   SUR                  S    nOU R                  (       d  U R                  SSU4   nO[	        U5      S:  a  U R                  (       au  [
        R                  " [
        R                  " [	        U5      U R                  [
        R                  UR                  S95      R                  S5      R                  S5      nOSU R                  SSSU4'   U R                  nU R                   U   nU R                  U5      n[        U R                  5       H  u  pgU" XRXC5      nM     U R                  U5      nU R!                  U5      nU$ )a
  Forward pass of the model.

Args:
    idx  (`torch.LongTensor` of shape `(batch_size, seq_length)`):
        Indices of input sequence tokens in the vocabulary.
    input_pos (`torch.LongTensor` of shape `(batch_size, seq_length)`, *optional*):
        Indices of positions of each input sequence tokens in the position embeddings.
        This argument is optional for training mode but required for
        inference mode(when model.setup_caches(training=False) is used).

Returns:
    Tensor: The output logits tensor.
Nz Caches must be initialized firstr;   rk   r   r   )r   r   r   r   rB   r   r   r   rr   rN   r   r   r   	enumerater   r   r   )	r4   r   r   maskr   xilayerlogitss	            r   forwardTransformer.forwardV  sH    ~~)M+MM)D~16I**''dI(=>I"t'>'> JJ

	N //"'**#,#3#3	 Yq\Yq\  89  Aq)!34''y1I$!$++.HAaI4A /IIaLQr   r9   c                 8    U " [         R                  U5      5      $ r   )r   rF   )rD   r9   s     r   rF   Transformer.from_name  s    9&&t,--r   )r   rE   r   r   r   r   rq   rr   r   r   r   )FNFNr   )rH   rI   rJ   rK   r   rn   rN   r   r   r   r   r   rO   r@   rF   rP   r   r   s   @r   r   r      sv    !y !T !( " F
 	F
P106 0hv.> 0& 0d .S . .r   r   c            
       ^   ^  \ rS rSrS\SS4U 4S jjrS\S\\   S\S	\\   S\4
S
 jrSr	U =r
$ )r   i  rE   r   Nc                    > [         TU ]  5         [        U5      U l        [	        U5      U l        [        UR                  UR                  5      U l	        [        UR                  UR                  5      U l
        g r   )rm   rn   	Attentionr   FeedForwardfeed_forwardr   r   r,   ffn_normattention_normr   s     r   rn   TransformerBlock.__init__  sT    "6*'/

FOO<%fjj&//Br   r   r   r   r   c                     XR                  U R                  U5      X4U5      -   nXPR                  U R                  U5      5      -   nU$ r   r   r   r   r   )r4   r   r   r   r   houts          r   r   TransformerBlock.forward  sE     t2215y	RR##DMM!$455
r   r   )rH   rI   rJ   rK   r   rn   r   r   r   rP   r   r   s   @r   r   r     s]    Cy CT C		 F#	 		
 v	 
	 	r   r   c                   f   ^  \ rS rSrS\4U 4S jjrS r SS\S\S\\   S\\   S	\4
S
 jjr	Sr
U =r$ )r   i  rE   c                 &  > [         TU ]  5         UR                  UR                  -  S:X  d   eUR                  SUR                  -  -   UR
                  -  n[        R                  " UR                  USS9U l        [        R                  " UR                  UR                  SS9U l	        S U l
        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        U R                  U R                  5        g )Nr   r	   Fr   )rm   rn   r   r#   r&   r(   r   r   wqkvwor   "_register_load_state_dict_pre_hook	load_hook)r4   rE   total_head_dimru   s      r   rn   Attention.__init__  s    zzFMM)Q... --!f.B.B*BBfooUIIfjj.uE	))FJJ

?mm#11:://?r   c                     US-   U;   aZ  UR                  US-   5      nUR                  US-   5      nUR                  US-   5      n[        R                  " XEU/5      XS-   '   g g )Nz	wq.weightz	wk.weightz	wv.weightzwqkv.weight)popr   cat)r4   
state_dictprefixargswqwkwvs          r   r   Attention.load_hook  sg    K:- 45B 45B 45B16BB<1HJ-.	 .r   r   r   r   r   r   c                    UR                   u  pVnU R                  U R                  -  nU R                  U5      R	                  U R
                  X/SS9u  pnU	R                  XVU R                  U R                  5      n	U
R                  XVU R                  U R                  5      n
UR                  XVU R                  U R                  5      n[        X5      n	[        X5      n
[        S XU45      u  pnU R                  b  U R                  R                  XJU5      u  pU
R                  U R                  U R                  -  SS9n
UR                  U R                  U R                  -  SS9nUb  [        R                  " XXSS9nO[        R                  " XUSSS9nUR                  SS	5      R!                  5       R                  XVU R
                  5      nU R#                  U5      nU$ )
Nr
   r   c                 &    U R                  SS5      $ )Nr;   r	   )	transpose)r   s    r   <lambda>#Attention.forward.<locals>.<lambda>  s    Aq 1r   r;   g        )	attn_mask	dropout_pT)r  	is_causalr	   )r   r&   r(   r   splitr   r   r#   apply_rotary_embmapr   r   repeat_interleaveFscaled_dot_product_attentionr   
contiguousr   )r4   r   r   r   r   bszseqlenr   kv_sizeqkvys                r   r   Attention.forward  s    Q$$t}}4))A,$$dhh%Ar$JaFF3T]];FF3 2 2DMMBFF3 2 2DMMBQ*Q*1A!9=a==$==''	a8DAt/A/A AqIt/A/A AqI..qQRUVA..qQ#QUVAKK1((*//TXXFGGAJr   )r   r(   r   r#   r&   r   r   r   )rH   rI   rJ   rK   r   rn   r   r   r   r   rP   r   r   s   @r   r   r     s`    @y @ I '+"" " v	"
 F#" 
" "r   r   c                   F   ^  \ rS rSrS\SS4U 4S jjrS\S\4S jrSrU =r	$ )	r   i  rE   r   Nc                 <  > [         TU ]  5         [        R                  " UR                  UR
                  SS9U l        [        R                  " UR                  UR
                  SS9U l        [        R                  " UR
                  UR                  SS9U l        g )NFr   )	rm   rn   r   r   r   r%   w1w3w2r   s     r   rn   FeedForward.__init__  sh    ))FJJ(@(@uM))FJJ(@(@uM))F44fjjuMr   r   c                     U R                  [        R                  " U R                  U5      5      U R	                  U5      -  5      $ r   )r  r  silur  r  r4   r   s     r   r   FeedForward.forward  s/    wwqvvdggaj)DGGAJ677r   )r  r  r  )
rH   rI   rJ   rK   r   rn   r   r   rP   r   r   s   @r   r   r     s1    Ny NT N8 8F 8 8r   r   c                   P   ^  \ rS rSrS
S\S\4U 4S jjjrS rS\S\4S jr	S	r
U =r$ )r   i  r   r   c                    > [         TU ]  5         X l        [        R                  " [
        R                  " U5      5      U l        g r   )rm   rn   r   r   	Parameterr   r   r   )r4   r   r   ru   s      r   rn   RMSNorm.__init__  s+    ll5::c?3r   c           	      z    U[         R                  " [         R                  " X-  SSS9U R                  -   5      -  $ )Nr
   T)r   keepdim)r   rsqrtmeanr   r  s     r   _normRMSNorm._norm  s.    5;;uzz!%RFQRRRr   r   r   c                 z    U R                  UR                  5       5      R                  U5      nX R                  -  $ r   )r$  rM   type_asr   )r4   r   r   s      r   r   RMSNorm.forward  s.    AGGI&..q1##r   )r   r   )r+   )rH   rI   rJ   rK   r3   rM   rn   r$  r   r   rP   r   r   s   @r   r   r     s9    4C 4e 4 4
S$ $F $ $r   r   freqsc                 ~   SnSnSnSnXB-  nXC-  n/ nU  H}  nS[         R                  -  U-  n	X:  a  UR                  U5        M1  X:  a  UR                  X-  5        MK  XV:w  d   eXI-  U-
  X2-
  -  n
UR                  SU
-
  U-  U-  X-  -   5        M     [        R                  " XpR
                  U R                  S9$ )NrV   r;   r0   rU   r	   r   )mathpiappendr   tensorrk   r   )r)  scale_factorlow_freq_factorhigh_freq_factorold_context_lenlow_freq_wavelenhigh_freq_wavelen	new_freqsfreqwavelensmooths              r   apply_scalingr9    s    LOO&8':Idgg+$&T"'T01#888%//A 2F a&jD0<?&-OP  <<	U\\JJr   r)   seq_lenn_elembaserk   r   r   c                    SU[         R                  " SUS5      S US-   R                  5       U-  -  -  n[         R                  " XR                  S9nU(       a  [	        U5      n[         R
                  " Xe5      n[         R                  " [         R                  " U5      U5      n[         R                  " UR                  UR                  /SS9nUR                  US9$ )Ng      ?r   r	   r   r
   r   rj   )r   r   rM   r   r9  outerpolar	ones_likestackrealimagr   )	r:  r;  r<  rk   r   r)  tr   caches	            r   r   r     s     a+Ov{<BBDvMNE 	W\\2Ae$KK!EEOOE2E:IKK8bAE88%8  r   r   r   c                    U R                  5       R                  " / U R                  S S QSPSP76 nUR                  SUR	                  S5      SUR	                  S5      S5      n[
        R                  " US   US   -  US   US   -  -
  US   US   -  US   US   -  -   /S5      nUR                  S5      nUR                  U 5      $ )Nr
   r	   r;   r1   ).r   ).r;   )	rM   reshaper   r   sizer   rA  flattenr'  )r   r   xshapedx_out2s       r   r  r  #  s    ggi5"5r515Gq',,q/1gll1oqII[[FOi//'&/IfDU2UUFOi//'&/IfDU2UU	
 	F ^^AF>>!r   )r;   )#r+  dataclassesr   typingr   r   torch.nnr   r   r   r  torchao.utilsr   r   r   dictr?   rx   Modulerf   torchao.quantization.utilsr   r   r   r   r   r   r   r9  r   r3   rk   rN   r   r  r>   r   r   <module>rS     s    !     $ '6 *5 *5 *5\ #UBDG r"$
/240240 24qE  q4237	 
 
 
  QT p # bii : M*SRYY *SZW.")) W.tryy (:		 :z8")) 8$bii $K K8 !!! ! ;;	!
 ! !& 6 f r   