
    h5              3          S SK r S SKrS SKJr  S SKJr  S SKJrJr  S SKrS SK	rS SK
rS SKrS SKJrJrJr  S SKJr  S SKJr  S\R(                  R*                  l        \R.                  R0                  R3                  S	5         " S
 S5      rS rS r\R0                  R;                  5       (       a  SO"\R<                  R;                  5       (       a  SOSr\" \ 5      RB                  RB                  RE                  5       r#\ RH                  RK                  \&" \#5      5        S SK'J(r(J)r)  S SK*J+r+  S r,SS\-S\\.   4S jjr/SS\-S\\.   4S jjr0S\(S\Rb                  S\Rb                  S\Rb                  4S jq2S\(S\Rb                  S\Rb                  S\\Rb                  \Rb                  4   4S jq3S 4S\(S\Rb                  S\Rb                  S \.4S! jjr4S" r5\Rl                  " 5       S# SSSSSSSS$.S\(S%\Rb                  S&\.S'\.S(\7S)\7S*\\.   S+\7S,\\R0                  Rp                     S-\\R0                  Rp                     S.\\R0                  Rp                     S/\\R0                  Rp                     S\Rb                  4S0 jj5       r9S	\4S1 jr:S2 r;S3u  r<r=SS4SSS5S6S7S8S9\" S:5      SSSSSSSS	SSS\\R|                  SSS4S;\\.   S%\&S<\\&   S(\7S=\.S&\.S'\.S\.S\-S>\S?\\&   S@\\-   SA\\&   S)\7S*\\.   S+\7SB\7SC\7SD\7SE\\   SF\\   SG\\   SH\\   SI\7SS42SJ jjr?\@SK:X  Ga/  S SKArA\AR                  " SLSM9rC\CR                  SN\.SSOSP9  \CR                  SQ\&S4SRSP9  \CR                  SS\&STSU9  \CR                  SVSWSXSY9  \CR                  SZ\.S5S[SP9  \CR                  S\\.S8S]SP9  \CR                  S^\.S7S_SP9  \CR                  S`\.S8SaSP9  \CR                  Sb\-S9ScSP9  \CR                  Sd\\" Se5      SfSP9  \CR                  SgSh\&SiSU9  \CR                  Sj\-SSkSP9  \CR                  SlSm\&SnSU9  \CR                  SoSWSpSY9  \CR                  Sq\.SSrSP9  \CR                  SsSWStSY9  \CR                  SuSWSvSY9  \CR                  SwSWSxSY9  \CR                  SySWSzSY9  \CR                  S{\SS|SP9  \CR                  S}\SS~SP9  \CR                  S\&\SSP9  \CR                  SS \R|                  SSP9  \CR                  S\SSSP9  \CR                  S\SSSP9  \CR                  SSWSSY9  \CR                  5       rF\G" \F5        \?" \FR                  \FR                  \FR                  \FR                  \FR                  \FR                  \FR                  \FR                  \FR                  \FR                  \FR                  \FR                  \FR                  \FR                  \FR                  \FR                  \FR                  \FR                  \FR                  \FR                  \FR                  \FR                  \FR                  \FR                  \FR                  \FR                  5        gg)    N)datetime)Path)OptionalTuple)get_arch_namewrite_json_result_localwrite_json_result_ossci)MappingType)get_model_size_in_bytesFTc                   &    \ rS rSrS rS rS rSrg)	HostEvent   c                     S U l         g N
event_timeselfs    X/home/james-whalen/.local/lib/python3.13/site-packages/torchao/_models/llama/generate.py__init__HostEvent.__init__   s	        c                 8    [         R                  " 5       U l        g r   )timeperf_counterr   r   s    r   recordHostEvent.record!   s    ++-r   c                 |    U R                   c  [        S5      e[        UR                   U R                   -
  5      S-  $ )NzEvent not recorded!  )r   
ValueErrorabs)r   other_events     r   elapsed_timeHostEvent.elapsed_time$   s7    ??"233;))DOO;<tCCr   r   N)__name__
__module____qualname____firstlineno__r   r   r#   __static_attributes__ r   r   r   r      s    .Dr   r   c                     SU ;   a  [         R                  R                  SS9$ SU ;   a  [         R                  R                  SS9$ SU ;   d  SU ;   a
  [	        5       $ [        SU  S35        g )	NcudaT)enable_timingxpucpumpsdevice= is not yet suppported)torchr,   Eventr.   r   printdevices    r   device_timerr8   +   se    zzd33	&yyT22
6/u{x567r   c                     SU ;   a   [         R                  R                  U 5        g SU ;   a   [         R                  R                  U 5        g SU ;   d  SU ;   a  g [	        SU  S35        g )Nr,   r.   r/   r0   r1   r2   )r3   r,   synchronizer.   r5   r6   s    r   device_syncr;   6   sW    

v&	&		f%
6/ux567r   r,   r.   r/   )Transformerprepare_inputs_for_model)get_tokenizerc                     [         R                  " U 5      R                  S5      n[         R                  " X-  SSS9R	                  [         R
                  S9$ )N   T)dimkeepdim)dtype)r3   
empty_likeexponential_argmaxtoint)
probs_sortqs     r   multinomial_sample_one_no_syncrL   Q   sF     	$11!4A<<
B=@@uyy@QQr   temperaturetop_kc           	      f   U [        US5      -  n Ubv  [        R                  " U [        X R	                  S5      5      5      u  p4UR                  SS5      R                  S5      n[        R                  " X:  [        S5      * U 5      n [        R                  R                  R                  U SS9nU$ )Ngh㈵>rA   InfrB   )maxr3   topkminsizeselect	unsqueezewherefloatnn
functionalsoftmax)logitsrM   rN   v_pivotprobss          r   logits_to_probsrb   X   s    c+t,,Fzz&#e[[_"=>R **2.V^eEl]FCHH''B'7ELr   c                 D    [        U S S 2S4   X5      n[        U5      nXC4$ )NrA   )rb   rL   )r]   rM   rN   ra   idx_nexts        r   samplere   c   s(    F1b5M;>E-e4H?r   modelx	input_posreturnc                 0    U " X5      n[        U40 UD6S   $ )Nr   )re   rf   rg   rh   sampling_kwargsr]   s        r   prefillrm   i   s#     1 F&,O,Q//r   c                 T    UR                   S   S:X  d   eU " X5      n[        U40 UD6$ )NrA   r@   )shapere   rk   s        r   decode_one_tokenrp   q   s5     ??2!###1 F&,O,,r   c                     U $ r   r*   )r_   s    r   <lambda>rr          qr   	cur_tokennum_new_tokensc                    / / pv[        U5       H  n[        R                  R                  R	                  [        R                  R                  R
                  R                  5         [        XU40 UD6u  pU	R                  5       U
R                  5       pUS-  nUR                  U	R                  5       5        U" US   5        UR                  U
5        U	nS S S 5        M     Xg4$ ! , (       d  f       M  = f)Nr@   rA   )
ranger3   rZ   	attentionsdpa_kernel
SDPBackendMATHrp   cloneappend)rf   rt   rh   ru   callbackrl   
new_tokens	new_probsi
next_token	next_probs              r   decode_n_tokensr   z   s     	>"XX++EHH,>,>,I,I,N,NO$4)%/>%!J %/$4$4$6	8I	NIj..01Z^$Y'"I PO #    POs   'A1C''
C6	c                     U " X5      $ r   r*   )rf   rg   rh   s      r   model_forwardr      s    r   c                     U $ r   r*   rg   s    r   rr   rr      rs   r   )r~   kv_cache_quantization
cache_sizelinear_causal_maskprefill_start_eventprefill_end_eventdecode_start_eventdecode_end_eventpromptmax_new_tokens
batch_sizeinteractiver   r   r   r   r   r   r   c       	   	         UR                   nUR                  S5      nU(       d"  [        X-   U R                  R                  5      OSn[        SU SU 35        UU-
  n[        U5      u  nnUR                  US5      n[        R                  " UUUR                  US9nUUSS2SU24'   [        R                   " U5         Uc  UnUU:  d   S5       eU R                  UUUUUS	9  SSS5        U	b  U	R                  5         [        XR                  US5      U40 UD6R                  5       nUR!                  5       USS2U4'   U
b  U
R                  5         Ub  UR                  5         [        R"                  " U/U[        R$                  S
9n['        U UR                  US5      UUS-
  4SU0UD6u  nn[        R(                  " USS2SUS-   24   /UQ7SS9nUb  UR                  5         U$ ! , (       d  f       GN= f)zh
Takes a conditioning sequence (prompt) as input and continues to generate as many tokens as requested.
rA   i^  zmax_seq_length=z, prompt_length=r@   rD   r7   NzBneed cache_size to be greater than max_new_tokens + size-of-prompt)max_batch_sizemax_seq_lengthr   r   prompt_lengthr7   rD   r~   rQ   )r7   rU   rT   config
block_sizer5   r=   repeatr3   emptyrD   setup_cachesr   rm   viewr|   squeezetensorrI   r   cat)rf   r   r   r   r   r~   r   r   r   r   r   r   r   rl   r7   Tr   r   rh   seqr   generated_tokensr_   s                          r   generater      s)   . ]]FBA ALA 7 78QT  
ON++;A3
?@!#J 18FI]]:q)F ++j.V
TCC2A2J 
f	'J^+ 	
P	
+ 	%%"71 	 	
 
 &""${{:r*I9Heg  ""$C1I$  " %!!#aSuyyAI)
B'Q	
  a ))SGa!eG_8'78b
AC#!JQ 
	s   &G00
G?c                     U R                  U5      nU(       a  U R                  5       /U-   n[        R                  " U[        R                  US9$ )Nr   )encodebos_idr3   r   rI   )	tokenizerstringbosr7   tokenss        r   encode_tokensr      sB    f%F
""$%.<<eii??r   c                 `   [         R                  " [        U 5      SSSS9nSU;   a  S[        U 5      ;   a  US   n[         R                  " S5         [        R
                  " U 5      nS S S 5        WR                  USS9  UR                  XS9nUR                  5       $ ! , (       d  f       N== f)	NTr/   )mmapweights_onlymap_locationrf   storiesmeta)assignr   )	r3   loadstrr7   r<   	from_nameload_state_dictrH   eval)checkpoint_pathr7   	precision
checkpointrf   s        r   _load_modelr      s    O4dJ *c/.B!B(
	f	%%o6 
	*T2HHFH4E::< 
	s   B
B-)z[INST]z[/INST]zHello, my name is   d   r@      g?z?checkpoints/meta-Transformer/Transformer-2-7b-chat-hf/model.pthprefill_sizedemo_summarize_promptnum_samplesr   quantizationmin_sqnrsparsitysavecompilecompile_prefillprofilememory_profilewrite_resultoutput_json_pathoutput_json_localc                 0  ^^^^^^^^^ U b@  U S:  a:  Uc  S[        U 5      S-
  -  nO%[        US5       nUR                  5       nSSS5        [        R                  R
                  R                  5         U	R                  5       (       d   U	5       eU	R                  S-  nUR                  5       (       d   [        U5      5       e[        SU 35        S[        U	5      ;   n[        S	5        [        R                  " 5       n[        U	UU5      n[        US
9  [        S[        R                  " 5       U-
  S S35        [        UU	5      m[        TUSUS9nUb:  [        TSSUS9n USU U R!                  S5      -
   n["        R$                  " UU 4SS9nUR!                  S5      n!["        R&                  " S5        S mU4S jn"S n#U
(       GaX  SSKJn$Jn%Jn&Jn'Jn(Jn)Jn*Jn+Jn,Jn-Jn.J n/J!n0  SSK"J#n1J$n2  SU
;   a  SSK%J&n3  U3" U5        U
RO                  S5      (       a  SSK(n4SSK)n5SSK*n6U6RW                  S5        SU5RY                  U4R[                  5       5      R\                   S3n7U
R_                  S5      n8[        U8S    5      n9U8S   S!:X  a  SO[        U8S   5      n:U8S"   S#:X  a  S$OS%n;U0" UU(" U9U:U;S&95        U6Ra                  U75        [        S'5        [c        U[        TUSUS9UUSUUS(9  U6Re                  U75        S)U
;   a  U0" UU-" 5       5        S*U
;   aT  U(       a+  S+U;   a%  SS,K3J4n<  U0" UU," U<" 5       S-9TS.9  U0" UU," 5       U"S.9  O"S/U
;   a  U0" UU," SS095        OU0" UU," 5       5        S1U
;   aG  Sn=S2U
;   a  Sn=[        U
R_                  S5      S    5      n:U:S3;   d
   S4U: 35       eU0" UU*" U:U=S S595        OS6U
;   a  SS7K3J5n>  [        U
Rm                  S65      5      n?U?S8:X  d  U?S9:X  d   eU?S8:X  a1  U0" UU)" [n        Rp                  [n        Rp                  U>" 5       S:95        O7U?S9:X  a1  U0" UU+" S[n        Rp                  [n        Rp                  U>" 5       S;95        S<U
;   a\  S=U
;   a8  SS>K3J9n@  U0" UU+" S?[n        Rp                  [n        Rp                  U@" 5       S;95        OS+U;   a  SS@K3J:nA  U0" UU*" UA" 5       S SA9U#S.9  SBU
;   a  U0" UU'" S"S5      5        GOSCU
;   a  U0" UU-" SDSE9SF S.9  GOU
RO                  SG5      (       a  SSHK;J<nB  SSIK=J>mJ?nCJ@nD  U
R_                  S5      S    nE[        U
R_                  S5      S   5      n:[        ["        UE["        R                  5      nEUR                  U5      nUD" US SJUEU:SK9  UB" UR                  U5      TSJWFUSL9R                  SM/S SN9  U4SO jnGS2U
;   n=U0" UUC" UEU:U=SP9UG5        GOSQU
;   a  S2U
;   a  Sn=OSn=U
R_                  S5      n8[        U8S    5      n?U?S :  a  U?S9::  d   SR5       e["        R                  ["        R                  ["        R                  ["        R                  ["        R                  ["        R                  ["        R                  ["        R                  SS.nHUHU?   nI[        U8S   5      n:U0" UU." UIU:U=ST95        GO-SUU
;   a  U["        R                  :X  d   SV5       eSSWK"JMnJJNnK  SSXKOJPnL  U
R_                  S5      n8[        ["        SYU8S     35      nM[        U8S   5      n:U:S:  a  WK" U:5      OWJ" S5      nN[        U8S"   R                  5       SZ:H  5      nOU0" UWL" WMUNUO(       a  [n        R                  O[n        Rp                  S[S\95        GO]S]U
;   a  U0" UU&" 5       5        GOGS^U
;   ak  U(       a  S+U;   a  U0" UU$" 5       TS.9  GO%[        U
R_                  S5      S_   5      nNUNS`:X  a  U2" 5       nNOWNSa:X  a  U1" 5       nNOU2" 5       nNU0" UU%" WNSb95        GOScU
;   Ga  SSdK;JTnP  SSeKUJVnF  SSfKWJXnQ  SJnRUP" TURUFSUR                  R                  SgS
9R                  SM/S 5      R                  5       S   R                  S   nSUF" US5      nS["        R                  " Sg5         UR                  S URSh9  SSS5        SiU
:X  a8  WQ" US[        R                  R                  R                  R                  WSWRSj9nGOuSkU
:X  a8  WQ" US[        R                  R                  R                  R                  WSWRSj9nGO7SlU
:X  a7  WQ" US[        R                  R                  R                  R                  WSWRSj9nOSmU
:X  a  [        R                  R                  R                  R                  [        R                  R                  R                  R                  -   [        R                  R                  R                  R                  -   nT[        R
                  R                  5       (       a1  WT[        R                  R                  R                  R                  -  nTWQ" USWTWSWRSj9nO	WQ" USWSWRSn9n[        So5        [c        U[        TUSUS9UUSUUS(9  [        Sp5        UR                  5         GOBSqU
;   GaT  SSdK;JTnP  SSeKUJVnF  SJnRUP" TURUFSUR                  R                  SgS
9R                  SM/S 5      R                  5       S   R                  S   nSUF" US5      nS["        R                  " Sg5         UR                  S URSh9  SSS5        SrU
:X  a$  U/" US[        R                  R                  WSUSs9nGOmStU
:X  a$  U/" US[        R                  R                  WSUSs9nGOCSuU
:X  a$  U/" US[        R                  R                  WSUSs9nGOSvU
:X  a#  U/" US[        R                  R                  WSUSs9nOSwU
:X  am  SSK(n4SSK)n5SSxKhJinU  UURa                  SU5RY                  U4R[                  5       5      R\                   S35        U/" US[        R                  R                  WSUSs9nO}SyU
:X  an   SSK(n4SSK)n5SSxKhJinU  UURa                  SU5RY                  U4R[                  5       5      R\                   S35        U/" US[        R                  R                  WSUSs9nO	U/" USWSUSz9n[c        U[        TUSUS9UUSUUS(9  UR                  5         OS{U
;   a3  SS|KlJmnV  UR                  U5        U0" UUV" ["        R                  SDS}95        OU(       a  SS~KnJonWJpnX  S+U;   a  WX" UR                  U5      WW" 5       TS.9  SU;   av  SSKnJqmJrnY  UR_                  S5      u  nZmm[        T5      [        T5      smmWX" UUUU4S jTS.9  [        U5        UX" UTR                  TS.9  [        U5        UX" UUY" TS9TS.9  [        USS9S-  n[U(       a  [        U	R                  5       5      n\[        U	R                  5      R_                  S5      S   n]["        R                  " UR                  5       W4R                  R                  U\U]SU
 S3-   5      5        U(       aF  [        S5        ["        R                  " [        SSS9q}U(       a  ["        R                  " [        SSS9q~U(       ap  USg:X  a,  ["        R                  GR                   GR                  SSSS9  O>US:X  a-  ["        GR                  GR                   GR                  SSSS9  O[        S5        / / / / S.n^U(       a  S_OSn_G[        U_U5       GHl  n`U`S:X  aL  USg:X  a   ["        R                  GR	                  5         O&US:X  a   ["        GR                  GR	                  5         [        US
9  W`S:  aI  U(       aB  G[        S5      nU(       a#  G[         SUGR                  5        SG[         3n[        TUSUS9nU(       a,  W`S:  a&  U c#  / mTGR                  S5      S   mSmUUUU4S jnaO,Ub&  W`S:  a   / mTGR                  S5      S   mUUU4S jnaOS na[        GR                  " 5       nG[        U5      G[        U5      ncnbG[        U5      G[        U5      nendSSKnfW`US -
  :w  d  U(       d  WfGR                  5       ngOK["        GR                  GR                  GR!                  5         ["        GR                  GR#                  5       ngWg   [c        UUUUUWaUUUUUWbWcWdWeS9nhSSS5        W`S:  a*  [        S[        GR                  " 5       U-
  S S35        GM  G[%        WgS5      (       a  WgGR'                  U S35        [        US
9  [        GR                  " 5       U-
  niU(       dq  Ucn  U ck  WhS   GR)                  5       njTGR+                  5       Uj;  a  WjO#WjSUjGR-                  TGR+                  5       5       nk[        TGR/                  Uk5      5        O[        S5        WhR!                  S_5      U!-
  nlUlWi-  nmW^S   GR1                  Um5        U^S   GR1                  Ui5        WdGR3                  We5      S-  nnUlUn-  noU^S   GR1                  Uo5        WbGR3                  Wc5      S-  npU^S   GR1                  Up5        [        SW`S -    SUiS SUmS S3SUpS SUoS S35        [        SW[Um-  S S35        U(       d  GM  W`S:X  d  GM  USg:X  a+  ["        R                  GR                   GR5                  5       nqO=US:X  a,  ["        GR                  GR                   GR5                  5       nqO[        S5        [        U S3S5       nSSKJnr  Ur" WqU5        SSS5        [        SU S3S5          O   [        S5        ["        GR:                  " ["        GR<                  " W^S   5      5      GR?                  5       ns["        GR:                  " ["        GR<                  " U^S   5      5      GR?                  5       nt["        GR:                  " ["        GR<                  " U^S   5      5      GR?                  5       nuW[Us-  nv["        R                  GRA                  5       S-  nw[        SUsS 35        [        SUuS S35        [        SUtS S35        USg:X  a#  ["        R                  GRA                  5       S-  nwO)US:X  a#  ["        GR                  GRA                  5       S-  nw[        SWsS 35        US :  a  [        SUWs-  S 35        [        SWvS S35        [        SWwS S35        [        SW[S S35        U(       Ga  SG[B        GRD                  " 5       GRG                  S5       SWsS SWuS SWtS SWvS SWwS SW[S S3nxUxSU
 SU SU	R                  R                   SU SU SU SU SU S3-  nxUxS-  nxUxU
(       a  SU
 S3OS-  nxUxU(       a  SU S3OS-  nxUxSU	 S3-  nxUxSU S3-  nxUxSU S3-  nxUxU(       a  SOS-  nxUxU(       a  SOS-  nxUxU (       a  SU  3OS-  nxUxU(       a  SU S3OS-  nxUxU(       a  SU S3OS-  nxUxU(       a  SOS-  nxUxSU S3-  nxUxSU S3-  nxUxSU S3-  nxUxSU S3-  nxUxSU S3-  nxUxU(       a  SU 3OS-  nxUxU(       a  SOS-  nxUxU(       a  SOS-  nx[        US5      nUGRI                  Ux5        UGRK                  5         U(       ap  / SQnyU	R                  R                  nzG[M        5       n{U
=(       d    SnIWzUIUUUW{SWvS/	n|UzUIUUUU{SWsS/	n}U(       a  G[N        OG[P        n~U~" UWyW|5        U~" UUyW}5        gg! , (       d  f       GNg= f! , (       d  f       GN*= f! , (       d  f       GN= f!    GNK= f! , (       d  f       GN= f! , (       d  f       GNX= f)zNGenerates text samples based on a pre-trained Transformer model and tokenizer.Nr   zprompt    rztokenizer.modelzUsing device=chatzLoading model ...r6   zTime to load model: z.02fz secondsT)r   r7   z
 <END_TEXT>FrQ   i  c                 b    [        U [        R                  R                  5      =(       a    SU;   $ )Nfeed_forward
isinstancer3   rZ   Linearmodfqns     r   ffn_onlymain.<locals>.ffn_onlyI  s!    #uxx/INc4IIr   c                 t   > [        U [        R                  R                  5      =(       a    T" X5      (       + $ r   r   )r   r   r   s     r   not_ffn_onlymain.<locals>.not_ffn_onlyL  s%    #uxx/J8J4JJr   c                 z    [        U [        R                  R                  5      =(       a    SU;   =(       d    SU;   $ )Nr   rx   r   r   s     r   ffn_or_attn_onlymain.<locals>.ffn_or_attn_onlyO  s0    #uxx/ 
c!7[C%7	
r   )3Float8DynamicActivationFloat8SemiSparseWeightConfig)Float8DynamicActivationFloat8WeightConfigFloat8WeightOnlyConfigFPXWeightOnlyConfigGemliteUIntXWeightOnlyConfig%Int4DynamicActivationInt4WeightConfigInt4WeightOnlyConfig%Int8DynamicActivationInt4WeightConfig%Int8DynamicActivationInt8WeightConfigInt8WeightOnlyConfigUIntXWeightOnlyConfig	autoquant	quantize_)PerRow	PerTensor	spinquant)apply_spinquantgemliterR   z/tmp/z_gemlite.json-r@   None   dqdynamicweight_only)	bit_width
group_sizemodezrunning gemlite warmup)r   rM   rN   int8woint8dqsemi)SemiSparseLayout)layout)	filter_fnint8dq_prefill_wo_decode)weight_only_decodeint4wohqq)    @         z=int4wo group_size needs to be one of [32,64,128,256] but got )r   use_hqqversionzint4dq-)CutlassInt4PackedLayout      )mapping_typeact_mapping_typer  )r   r  r  r  marlinqqq)MarlinQQQLayoutr  )MarlinSparseLayout)r  r  fp6zembed-int8wor
  )r   c                 J    [        U [        R                  R                  5      $ r   )r   r3   rZ   	Embedding)rg   argss     r   rr   main.<locals>.<lambda>  s    :a9K9K+Lr   awq)TransformerEvalWrapper)AWQObservedLinear	awq_uintxinsert_awq_observer_r  )quant_dtyper   )rf   r   r   input_prep_funcr7   wikitext)taskslimitc                    > [        U T5      $ r   )r   )mr   r  s     r   rr   r    s    
1>O0Pr   )r"  r   r  uintxznbits must be 1 to 8)r@   r   r   r  r         r  )r  #int8_dynamic_activation_intx_weightzJint8_dynamic_activation_intx_weight requires using precision=torch.float32)PerAxisPerGroup)%Int8DynamicActivationIntxWeightConfigrI   trueopaque_torchao_auto)weight_dtypeweight_granularityweight_mapping_typeintx_packing_formatfloat8wofloat8dqrA   r   row)granularityautoquant_v2)LMEvalInputRecorder)r=   )r:  r,   )r   r   zautoquant_v2-int4)manualqtensor_class_listexample_inputr   zautoquant_v2-float8zautoquant_v2-fpzautoquant_v2-all)r<  r>  r   zrunning generatezrunning finalize autoquantr   zautoquant-int4)r<  r=  r>  r   zautoquant-float8zautoquant-fpzautoquant-sparsezautoquant-gemlite-int4)GemLiteLinearTritonzautoquant-all)r<  r>  r   codebook)codebook_weight_only)rD   scale_block_size)semi_sparse_weight	sparsify_bsr)SupermaskLinearblock_sparse_weightc                 &   > TR                  U TTS9$ )N)sparsity_level	blocksize)from_linear)rg   rF  rJ  rI  s    r   rr   r  6  s    /55#1' 6 r   )rJ  )ignore_embeddingsg    eA.z.ptzCompiling Modelzreduce-overhead)r   	fullgraph)rN  r   i )trace_alloc_max_entriestrace_alloc_record_contextr.   z2Memory profiling only works on CUDA or XPU devices)tokens_per_secr   decode_tokens_per_secprefill_timezWhat is your prompt?  c                 h  > T(       a  g TR                  TR                  T/U R                  S5      R                  5       -   5      SS  5        U R	                  5       TR                  5       :X  a  Sm[        T5      S:X  d  T(       a+  [        SR                  T5      SSS9  TR                  5         g g )Nr   r@   Tr   endflush)
r}   decoder   tolistitemeos_idlenr5   joinclear)rg   bufferdone_generating	period_idr   s    r   r~   main.<locals>.callback  s    "i..	{QYYq\=P=P=R/RSTUTVWX668y//11&*Ov;!#"''&/r>LLN (7r   c                   > TR                  TR                  T/U R                  S5      R                  5       -   5      SS  5        [	        T5      S:X  a+  [        SR                  T5      SSS9  TR                  5         g g )Nr   r@   r  rV  TrW  )r}   rZ  r   r[  r^  r5   r_  r`  )rg   ra  rc  r   s    r   r~   rd    si    i..	{QYYq\=P=P=R/RSTUTVWXv;!#"''&/r>LLN $r   c                     U $ r   r*   r   s    r   rr   r    s    r   )r   r~   rM   rN   r   r   r   r   r   r   r   zCompilation time: z.2fexport_chrome_tracez.json
rQ  r   r   rR  rS  zSample z | overall time z.04fz s z tokens/secz| prefill time z
 s decode zBandwidth achieved: z GB/sz.picklewb)dumpz
memory profile z4.pickle saved, to convert that to a usable file, usez_python pytorch/torch/cuda/_memory_viz.py trace_plot <pickle file> -o <desired output name>.htmlz
==========zAverage overall tokens/sec: zAverage decode tokens/sec: z szAverage TTFT: zAverage tokens/sec: z%Average tokens/sec including batches zAverage Bandwidth: zPeak Memory Usage: z GBzModel Size: z%Y%m%d%H%M%Sz, tok/s=z6.2fz, tok/s_decode=z, ttft=z5.4fz, mem/s=z7.2fz GB/s, peak_mem=z5.2fz GB, model_size=z GB zquant: z
, sparse: z, mod: z, kv_quant: z, compile: z, compile_prefill: z	, dtype: z
, device: zrepro: python generate.py z--quantization rV  z--sparsity z--checkpoint_path z	--device z--precision z
--compile z--compile_prefill z--prefill_size z
--profile z--interactive z--num_samples z--max_new_tokens z--batch_size z--top_k z--temperature z--cache_size z--kv_cache_quantization z--linear_causal_mask a)	namerD   r   r   r7   archmetricactualtargetnoquantzmem/sztok/s)rI   openreadtorchaor   utils"recommended_inductor_config_setteris_fileparentr   r5   r   r   r;   r>   r   rU   r3   r   manual_seedtorchao.quantizationr   r   r   r   r   r   r   r   r   r   r   r   r    torchao.quantization.granularityr   r   torchao.prototype.spinquantr   
startswithospwdr   set_autotunegetpwuidgetuidpw_gecossplitload_configr   cache_configtorchao.dtypesr  r  removeprefixr
   	SYMMETRICr  r  torchao._models._evalr  torchao.prototype.awqr  r   r!  getattruint8rH   run_evaluint1uint2uint3uint4uint5uint6uint7float32r-  r.  torchao.quantization.quant_apir/  boollower
ASYMMETRICr;  torchao._models.llama.modelr=   +torchao.prototype.quantization.autoquant_v2r:  r   
vocab_sizerecord_inputsget_recorded_inputsvaluesr7   r   	prototype!DEFAULT_INT4_AUTOQUANT_CLASS_LISTOTHER_AUTOQUANT_CLASS_LIST"DEFAULT_FLOAT_AUTOQUANT_CLASS_LISTDEFAULT_AUTOQUANT_CLASS_LISTis_sm_89finalize_autoquant#DEFAULT_SPARSE_AUTOQUANT_CLASS_LISTgemlite.corer?  !GEMLITE_INT4_AUTOQUANT_CLASS_LISTALL_AUTOQUANT_CLASS_LIST'torchao.prototype.quantization.codebookrA  torchao.sparsityrC  rD  rF  rG  rY   	to_linearr   cwdrl  r   
state_dictpathr_  r   rp   rm   r,   memory_record_memory_historyr.   rw   reset_peak_memory_statsinputB_INSTstripE_INSTr   r   r8   
contextlibnullcontextprofiler_utils_init_for_cuda_graphsr   hasattrrg  r[  r]  indexrZ  r}   r#   	_snapshotpicklerj  meanr   r\  max_memory_reservedr   todaystrftimewritecloser   r   r	   )r   r   r   r   r   r   r   rN   rM   r   r   r   r   r   r   r   r   r   r   r   r   r7   r   r   r   r   ftokenizer_pathis_chatt0rf   encodedend_tagr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r~  r  r   config_file_quant_argsr   r   r   r  r  r  nbitsr  r  r  r   r!  r"  r=   is_observed_linear_NBITS_TO_DTYPErD   r-  r.  r/  r2  r9  is_asymmetricr;  r:  calibration_seq_lengthinputsall_qtensor_classesr?  rA  rC  rD  rG  r_   
model_size
output_dirfilenameaggregate_metricsstartr   r~   r   r   r   r   r  profyttok_listr   tokens_generated
tokens_secdecode_timedecode_tokens_secrS  snapshotrj  	tokpersecttftdecode_tokpersec	bandwidthmem
result_txtheadersrl  rm  memory_resultperformance_resultwrite_json_resultr  rF  rJ  ra  rb  r   rc  rI  r   s                                                                                                                                  @@@@@@@@@r   mainr    s	   @ L1$4 (#l"3a"78F+S1Q 2 AAC""$$5o5$$++.??N!!##8S%88#	M&
"#O,,G	
	B;Ev	 r!1$ 7x
@Ano>IIv4GG(	?fU:L7<<?:;))Wg.A6LLOM	dJK

 	
 	
 	
 	
 	G,&CE"""9--  '!#,,ryy{";"D"D!E]SK&,,S1KKN+I!,Q6!9s;q>?RJ +A$ 69MD,'JT ,*+iT&I!'   -|#e134|#Fh.;9AQAST&
 9;*
 ,|;9TR
 %!F!HI|#G$\//4Q78J "   PPZ|\  $
GUVW ,&>11)<=EA:!++z9%0%:%:)4)>)>68 !9#'%0%:%:)4)>)>68	 |#$:9#&%0%:%:)4)>)>.0	 8#=(0B0DaP.
 L e0A67|+$3L
 $$U++D  ',,S1!4K\//4Q78J!%ekkBKHHV$E q#;: #hhv&#" 8 h!l   "Q|+G +
G # $$&,,S1KA'EA:%1*D.DD,;;;;;;;;;;;;;;;;	O $E*E[^,Je25*gVW2lB- \- K
 ',,S1K"5CA/?*@AL[^,J2<q.(:.gajK Q!5!5!76!ABM5!-'2$ )4(>(>$..(=
 <'e356<'Fh.GI& ","4"4S"9""=>(*"++K E)"((K"++K=+V |+ALP%("#*,LL++! L %$&q*    .f5Ff%""#$5K #  &
 #l2$'.'8'8'E'E'R'R't't"(5 ',6$'.'8'8'E'E'R'R'm'm"(5 #l2$'.'8'8'E'E'R'R'u'u"(5 $|3%%22??\\''44AAccd''44AAdde $
 ==))++'7+<+<+I+I+V+V+q+qq'$':"(5 %"(5	 $%iT&I!' ./$$&L(AL%("#*,LL++! L %$&q*    .f5Ff%""#$5K #  &
  </!'.';';']']"(% $|3!'.';';'V'V"(%  </!'.';';'^'^"(% $|3!'.';';'_'_"(% *\9<#//CLL5>>?}M "'.';';']']"(% !L0
@'33RYY[ 9 B BC=Q "'.';';'T'T"(% "$fx iT&I!' $$&<'THHV+%++PRS
 
BXehhv&(:(<QHM ,4>>#+>(A~y(-n(=s9~%NI
 # %L))"
 %L *Y?8 )$G#MJ,,./
++,2237:

GGLLX!L>0E%EF	

   =="
 mmGtTJGVJJ44fQU 5  u_II33fQU 4  FG !#	 BqE5+&6

2245		1136"6k23F"81V\\^$4AfX>#Iv4OG16l&:F!((-a0I#O	# 	# #.16F!((-a0I# # #H    /
    - 	aw))+DNN!!779>>))+D'!'&;%#5$7"3#5!1A $ q5&t'8'8':R'?&DHMN4.//$$y%676""$4<AUt{{}H ##%X5 By/?/?/A BC 
 )""6*+$K66":5%)
*+22:>&!((+(556FG$N,{:1299:KL*778IJTQ.)00>a!eW,QtHC
47HTl40
;LT:RR]^	
 	$Z*%<T$B%HI>a1f ::,,6685 99++557JK(0$71'Xq! 8 #N#33ghq { '| 
, 

5<<(9:J(KLMRRTI::ell#4^#DEFKKMDzz&'>?@
df  Y&I
**
(
(
*S
0C	(3
89	'(8'>b
AB	N4+R
()jj,,.4	5ii++-3	 3
01A~5j96LS5QRS		$/u
56	Dz
-.	LD)
-.(..*33NCDHYW[L\\kl|  ~B  lC  CJ  KO  PT  JU  U]  ^g  hl  ]m  m}  ~A  BF  }G  GW  Xb  cg  Wh  hl  m
~ZzI_I_IdIdHeeq  sH  rI  IT  U\  T]  ]p  q@  pA  AJ  KT  JU  U_  `f  _g  gh  i  	i
22
<~Q7RO
8H:Q/C
*?*;1==
	&++
YKq11
gl25
o*2E
,~6BN

7)1-b@

>"2!4BN
+&2=
{m155
).)9;;
j\33
q))
{m155
jj\2bH
4I0rQ
1C-K
s#	
		

 %%**)	

 

 (9#>U 	 	*G]C*G5GHS y 21D
 &%r &%~R Tx 87s\   A^%A^7A_	A
A_ DA_#NA_5^%
A^4^7
A__	
A__A_ _#
A_2	_5
A`	__main__zYour CLI description.)descriptionz--prefill_sizezWhether to run in ttft mode)typedefaulthelpz--promptzInput prompt.z--demo_summarize_promptzRead prompt from text file)r  r  z--interactive
store_truez%Whether to launch in interactive mode)actionr  z--num_sampleszNumber of samples.z--max_new_tokenszMaximum number of new tokens.z--batch_sizezBatch size to benchmark withz--top_kzTop-k for sampling.z--temperaturezTemperature for sampling.z--checkpoint_pathz<../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pthzModel checkpoint path.z-qz--quantizationa  Which quantization techniques to apply: int8dq, int8wo, fp6, int4wo-<groupsize>, int4wo-<groupsize>-hqq, autoquant, autoquant-int4, autoquant-gemlite-int4, autoquant-float8, autoquant-sparse, autoquant-all, uintx-<nbits>-<groupsize>, uintx-<nbits>-<groupsize>-hqq, sparse-marlin, spinquant, embed-int8wo, marlin_qqq, gemlite-<pack_bitwidth>-<nbits>-<groupsize>, float8dq, int4dq-<nbits>, fbgemm-int4-<group_size>z
--min_sqnr)zNmin sqnr for quantizing v.s. not quantizing a layer, used in autoquant optionsz-sz
--sparsityz3Which sparsity techniques to apply: semi-structuredz--kv_cache_quantizationz Whether to quantize the KV cachez--cache_sizezeForce size of cache to be a certain number of tokens, if not set, will use max_new_tokens+prompt_sizez--linear_causal_maskztWhether to use the memory efficient, but slightly less fast, linear causal mask (important for long context lengths)z--savez$Whether to save the quantized model.z	--compilezWhether to compile the model.z--compile_prefillzPWhether to compile the prefill (improves prefill perf, but higher compile times)z	--profilezProfile path.z--memory_profilezfilename for memory profile.z--devicezDevice to usez--precisionc                 F    [        [        U R                  S5      S   5      $ )NrM  rA   )r  r3   r  r   s    r   rr   rr     s    wuaggcl2&67r   zdtype precision to usez--write_resultzPath where to write the resultz--output_json_pathz1Path where to write the json result for dashboardz--output_json_localznWhether to output json result for local machine or for CI machine, local option will fill in some dummy fields)g      ?N)bsysr   r   pathlibr   typingr   r   r3   torch._dynamo.configtorch._inductor.configrt  torchao._models.utilsr   r   r	   %torchao.quantization.quant_primitivesr
   torchao.utilsr   sparseSparseSemiStructuredTensor_FORCE_CUTLASSbackendsr,   enable_cudnn_sdpr   r8   r;   is_availabler.   default_device__file__rx  resolvewdr  r}   r   r  r<   r=   torchao._models.llama.tokenizerr>   rL   rY   rI   rb   re   Tensorrm   rp   r   r   no_gradr  r4   r   r   r   r  r  bfloat16r  r%   argparseArgumentParserparseradd_argument
parse_argsr  r5   r   r   r   r   r   r   r   rN   rM   r   r   r   r   r   r   r   r   r   r   r   r   r7   r   r   r   r   r*   r   r   <module>r     s       "     
 > 19> ' ' 6    $ $T *D D88 zz    yy 
	  
(^!!))+ B  M 9R Xc]  HSM 00 <<049LL0
\\0-- <<-49LL-
5<<%&- !!||! ||! 	!2  "' $$6:485937QQLLQ Q 	Q Q  Q Q Q "%**"2"23Q  

 0 01Q !!1!12Q uzz//0Q  \\!Q Qh *.n @ % #'%+/ I #' $""' $$!"%)nn#''+#9JI3-JIJI $C=JI 	JI
 JI JI JI JI JI JI 3-JI uoJI smJI   !JI" #JI$ %JI& 'JI( )JI* +JI, d^-JI. TN/JI4 4.5JI6 tn7JI8 9JI: 
;JIZ z$$1HIF
sD7T   &9   !2N   4  
 c1CWX
c8W   S!2P   	S?TU
eS7R   ST%	   J  	 
	   C	   !/  
 t	    D  
 ,R   L/N   _  
 $?S
t:X   n?   7%	   tT8X   @	   }   D	$K""

""		5i r   