
    ΅i                        % S SK Jr  S SKrS SKrS SKJrJrJr  SSKJ	r	  S SK
JrJr  S SKJr  S SKJr  S S	KJr  S S
KJr  S SKJr  S SKJr  S SKJr  S SKrSS/r\" S5      r\" S5      r\R<                  " \5      r \RB                  RD                  r"S r#0 r$\%\\4   \&S'   S r'S<S\\\\4   /\\\4   4   4S jjr(\(" \"RR                  5      SS.S\*4S jj5       r+\(" \"RX                  5      S=S\*4S jj5       r-\(" \"R\                  5      S=S\*4S jj5       r/\(" \"R`                  5      S=S\*4S jj5       r1\(" \"Rd                  5           S>S\*4S jj5       r3 S<S\4\*   S\4\*   S\4\*   S \5S\*4
S! jjr6\(" \"Rn                  \"Rp                  \"Rr                  \"Rt                  \"Rv                  /5      SS.S\*4S" jj5       r<\(" \"Rz                  5      S\*4S# j5       r>S$ r?\(" \"R                  \"R                  \"R                  /5      SS.S\*4S% jj5       rCS& rDSS'.S\\E\E\*S(4   \E\*S(4   \E\*S(4   \E\*S(4   S-  4      4S) jjrFSS'.S\\E\E\*S(4   \E\*S(4   \E\*S(4   \E\*S(4   S-  4      4S* jjrG\(" \"R                  S+S,9SS.S\*4S- jj5       rI\(" \"R                  S+S,9S\*4S. j5       rKS/ rL\(" \"R                  \"R                  \"R                  /5      SS.S\*4S0 jj5       rP\(" \"R                  S+S,9S\*4S1 j5       rR\(" \"R                  S+S,9S\*4S2 j5       rT0 \"RR                  \+_\"RX                  \-_\"R\                  \/_\"R`                  \1_\"Rd                  \3_\"Rn                  \<_\"Rp                  \<_\"Rr                  \<_\"Rv                  \<_\"Rt                  \<_\"Rz                  \>_\"R                  \C_\"R                  \C_\"R                  \C_\"R                  \P_\"R                  \P_\"R                  \P_\"R                  \I\"R                  \K\"R                  \R\"R                  \T0Er$S3 rU/ S4QrVS5 rWS6 rXS\Y4S7 jrZS8 r[ " S9 S5      r\ " S: S;\5      r]g)?    )NoneTypeN)tree_maptree_flattentree_unflatten   )ModuleTracker)AnyTypeVar)Callable)Iterator)	ParamSpec)defaultdict)TorchDispatchModeprodwrapsFlopCounterModeregister_flop_formula_T_Pc                 \    [        U [        R                  5      (       a  U R                  $ U $ N)
isinstancetorchTensorshape)is    R/home/james-whalen/.local/lib/python3.13/site-packages/torch/utils/flop_counter.py	get_shaper       s!    !U\\""wwH    flop_registryc                 8   ^  [        T 5      S S.U 4S jj5       nU$ )N)out_valc                 B   > [        [        XU 45      u  pnT" USU0UD6$ )N	out_shape)r   r    )r$   argskwargsr&   fs       r   nfshape_wrapper.<locals>.nf#   s.    "*9tW6M"Ni$6)6v66r!   r   r)   r*   s   ` r   shape_wrapperr-   "   s#    
1X 7 7 Ir!   returnc                    ^ ^^  SSK Jm  S[        [        [        4   S[        [        [        4   4UUU 4S jjnU$ ! [         a    [        R	                  S5        [
        m NYf = f)Nr   JITFunctionz@triton not found; flop counting will not work for triton kernelsflop_formular.   c                    >^  T(       d  [        T 5      m SUU 4S jjn[        R                  R                  R	                  UT5        T $ )Nc                    > [        U [        R                  R                  T45      (       d  [	        SU  S[        U 5       35      eU [        ;   a  [        SU  35      eT[        U '   g )Nz|register_flop_formula(targets): expected each target to be OpOverloadPacket (i.e. torch.ops.mylib.foo), or JitFunction, got z which is of type zduplicate registrations for )r   r   _opsOpOverloadPacket
ValueErrortyper"   RuntimeError)targetr1   r2   s    r   register=register_flop_formula.<locals>.register_fun.<locals>.register4   sp    v

(C(C['QRR #H$6tF|nFG G &"%A&#JKK$0M&!r!   )r.   N)r-   r   utils_pytree	tree_map_)r2   r;   r1   get_rawtargetss   ` r   register_fun+register_flop_formula.<locals>.register_fun0   s<    (6L	1 	1 	%%h8r!   )	triton.runtime.jitr1   ImportErrorlogwarningr   r   r   r   )rA   r@   rB   r1   s   `` @r   r   r   )   s`    2
8BF#3 R8H  & /  VWs   = %A%$A%)r&   c                R    U u  pVUu  pxXg:w  a  [        SU SU 35      eXX-  S-  U-  $ )zCount flops for matmul.z3matmul: inner dimensions must match (k == k2), got  and    AssertionError)	a_shapeb_shaper&   r'   r(   mkk2ns	            r   mm_floprS   E   sE    
 DAEBwRSTRUUZ[]Z^_``519q=r!   c                     [        X5      $ )zCount flops for addmm.rS   
self_shaperM   rN   r&   r(   s        r   
addmm_floprX   Q   s     7$$r!   c                     U u  pEnUu  pxn	XG:w  a  [        SU SU 35      eXh:w  a  [        SU SU 35      eXE-  U	-  S-  U-  n
U
$ )z"Count flops for the bmm operation.z0bmm: batch dimensions must match (b == b2), got rI   z0bmm: inner dimensions must match (k == k2), got rJ   rK   )rM   rN   r&   r(   brO   rP   b2rQ   rR   flops              r   bmm_flopr]   V   ss    
 GA!IBAwOPQsRWXZW[\]]wOPQsRWXZW[\]]519q=1DKr!   c                     [        X5      $ )z&Count flops for the baddbmm operation.)r]   rV   s        r   baddbmm_flopr_   e   s    
 G%%r!   c	                     [        X5      $ )zCount flops for _scaled_mm.rU   )
rM   rN   scale_a_shapescale_b_shape
bias_shapescale_result_shape	out_dtypeuse_fast_accumr&   r(   s
             r   _scaled_mm_floprg   l   s     7$$r!   x_shapew_shaper&   
transposedc                 |    U S   nU(       a  U OUSS nUtpgn [        U5      [        U5      -  U-  U-  U-  S-  n	U	$ )a  Count flops for convolution.

Note only multiplication is
counted. Computation for bias are ignored.
Flops for a transposed convolution are calculated as
flops = (x_shape[2:] * prod(w_shape) * batch_size).
Args:
    x_shape (list(int)): The input shape before convolution.
    w_shape (list(int)): The filter shape.
    out_shape (list(int)): The output shape after convolution.
    transposed (bool): is the convolution transposed
Returns:
    int: the number of flops
r   rJ   Nr   )
rh   ri   r&   rj   
batch_size
conv_shapec_outc_infilter_sizer\   s
             r   conv_flop_countrq   }   s[    ( J''Y;J 'E+ 
d;//*<uDtKaODKr!   c                    [        XXvS9$ )zCount flops for convolution.rj   )rq   )
rh   ri   _bias_stride_padding	_dilationrj   r&   r'   r(   s
             r   	conv_floprx      s     7YNNr!   c                 0   S nSn U
S   (       a"  [        US   5      nU[        XX(       + 5      -  nU
S   (       aY  [        US   5      nU(       a#  U[        U" U 5      U" U5      U" U5      SS9-  nU$ U[        U" U5      U" U 5      U" U5      SS9-  nU$ )Nc                 4    U S   U S   /[        U SS  5      -   $ )Nr   r   rJ   )list)r   s    r   tconv_backward_flop.<locals>.t   s$    a%(#d59o55r!   r   r   Frs   )r    rq   )grad_out_shaperh   ri   rt   ru   rv   rw   rj   _output_padding_groupsoutput_maskr&   r|   
flop_countgrad_input_shapegrad_weight_shapes                   r   conv_backward_flopr      s    6JDL 1~$Yq\2on?OQ_``
1~%il3/!N*;QwZK\I]joppJ
  /!G*a6GK\I]joppJr!   c                     U u  p4pVUu  pxpUu  ppX7s=:X  a  U:X  a!  O  OXHs=:X  a  U:X  a  O  OXj:X  a
  X:X  a  Xj:X  d  [        S5      eSnU[        X4-  XV4X4-  Xi45      -  nU[        X4-  XY4X4-  X45      -  nU$ )zR
Count flops for self-attention.

NB: We can assume that value_shape == key_shape
z8sdpa_flop_count: query/key/value shapes are incompatibler   rL   r]   )query_shape	key_shapevalue_shaperZ   hs_qd_q_b2_h2s_k_d2_b3_h3_s3d_vtotal_flopss                   r   sdpa_flop_countr     s     !NA#"Cc$Cc?s?!/c/3:]`]gWXXK8QUC-s/@AAK8QUC-s/@AAKr!   c                    [        XU5      $ )Count flops for self-attention.r   )r   r   r   r&   r'   r(   s         r   	sdpa_flopr   )  s     ;;??r!   c                     SSK Jn  SSKJn  [	        XU45      (       d8  U R
                  R                  S:w  a  U R                  5       R                  5       $ U/U R                  S5      S-
  -  $ )z
If the offsets tensor is fake, then we don't know the actual lengths.
In that case, we can just assume the worst case; each batch has max length.
r   )
FakeTensor)FunctionalTensormetar   )
torch._subclasses.fake_tensorr   #torch._subclasses.functional_tensorr   r   devicer8   difftolistsize)offsetsmax_lenr   r   s       r   _offsets_to_lengthsr   2  s\    
 9Dg,<=>>7>>CVCVZ`C`||~$$&&9Q!+,,r!   )grad_out.c              #     #    UGb+  [        UR                  5      S:w  a  [        S5      e[        UR                  5      S:w  a  [        S5      eUb%  UR                  U R                  :w  a  [        S5      eU R                  u  pn
UR                  u  pnUR                  u  pnUc  [        S5      eUc  [        S5      eUR                  UR                  :w  a  [        S5      e[        XF5      n[        XW5      n[	        UUS	S
9 H'  u  nnSU	UU
4nSUUU4nSUUU4nUb  UOSnUUUU4v   M)     gU R                  UR                  UR                  Ub  UR                  OS4v   g7f)a'  
Given inputs to a flash_attention_(forward|backward) kernel, this will handle behavior for
NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
each batch element.

In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
N   z7sdpa_flop_count: expected key.shape to be 3-dimensionalz9sdpa_flop_count: expected value.shape to be 3-dimensionalzDsdpa_flop_count: grad_out.shape must match query.shape when providedz+sdpa_flop_count: cum_seq_q must not be Nonez+sdpa_flop_count: cum_seq_k must not be NonezAsdpa_flop_count: cum_seq_q and cum_seq_k must have the same shapeTstrictr   lenr   rL   r   zip)querykeyvaluer   	cum_seq_q	cum_seq_kmax_qmax_k_h_qr   h_kd_kh_vr   seq_q_lengthsseq_k_lengths	seq_q_len	seq_k_lennew_query_shapenew_key_shapenew_value_shapenew_grad_out_shapes                          r   %_unpack_flash_attention_nested_shapesr   >  sp    $  syy>Q !Z[[u{{q  !\]]HNNekk$A !ghhkkiikk !NOO !NOO??ioo- !dee+I=+I=&)-t&T"Y	 #y#6OY4M #y#6O4<4Hd!=/CUUU 'U 	
++syy%++AUx~~[_
__s   E&E(c              #     #    UGb.  [        UR                  5      S:w  a  [        S5      e[        UR                  5      S:w  a  [        S5      eUb%  UR                  U R                  :w  a  [        S5      eU R                  u    pn
UR                  u    pnUR                  u    pnUc  [        S5      eUc  [        S5      eUR                  UR                  :w  a  [        S5      e[        XF5      n[        XW5      n[	        UUS	S
9 H'  u  nnSU	UU
4nSUUU4nSUUU4nUb  UOSnUUUU4v   M)     gU R                  UR                  UR                  Ub  UR                  OS4v   g7f)a+  
Given inputs to a efficient_attention_(forward|backward) kernel, this will handle behavior for
NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
each batch element.

In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
N   zQ_unpack_efficient_attention_nested_shapes: expected key.shape to be 4-dimensionalzS_unpack_efficient_attention_nested_shapes: expected value.shape to be 4-dimensionalz^_unpack_efficient_attention_nested_shapes: grad_out.shape must match query.shape when providedzH_unpack_efficient_attention_nested_shapes: cu_seqlens_q must not be NonezH_unpack_efficient_attention_nested_shapes: cu_seqlens_k must not be Noneza_unpack_efficient_attention_nested_shapes: cu_seqlens_q and cu_seqlens_k must have the same shapeTr   r   r   )r   r   r   r   cu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_kr   r   r   r   r   r   r   	seqlens_q	seqlens_klen_qlen_kr   r   r   r   s                          r   )_unpack_efficient_attention_nested_shapesr   r  s    $  syy>Q !tuuu{{q  !vwwHNNekk$A   "B  C  C131313 !kll !kll!3!33  "Z [ ['C	'C		9TBLE5 #uc2OUC0M #uc2O4<4Hd!=/CUUU C 	
++syy%++AUx~~[_
__s   E)E+T)r@   c          
      D    [        U UUUUUUS9n
[        S U
 5       5      $ )r   )r   r   r   r   r   r   r   c              3   @   #    U  H  u  pp4[        XU5      v   M     g 7fr   r   .0r   r   r   r   s        r   	<genexpr>0_flash_attention_forward_flop.<locals>.<genexpr>  &      6;2KK 	<<6;   r   sum)r   r   r   r   r   r   r   r&   r'   r(   sizess              r   _flash_attention_forward_flopr     s?    " 2E  6;  r!   c           
      D    [        U UUUUUUS9n
[        S U
 5       5      $ )r   )r   r   r   r   r   r   r   c              3   @   #    U  H  u  pp4[        XU5      v   M     g 7fr   r   r   s        r   r   4_efficient_attention_forward_flop.<locals>.<genexpr>  r   r   r   r   )r   r   r   biasr   r   r   r   r'   r(   r   s              r   !_efficient_attention_forward_flopr     s?    " 6!!!!E  6;  r!   c                    SnUu  pVpxUu  ppUu  pnnU u  nnnnXYs=:X  a  Us=:X  a  U:X  a  O  OXjs=:X  a  Us=:X  a  U:X  a  O  OX:X  d  [        S5      eUU:X  a  X:X  a  UU:X  d  [        S5      eSnU[        XV-  Xx4XV-  X45      -  nU[        XV-  UU4XV-  UU45      -  nU[        XV-  X4XV-  UU45      -  nU[        XV-  X{4XV-  X45      -  nU[        XV-  X4XV-  X{45      -  nU$ )Nr   zFsdpa_backward_flop_count: batch/heads/dimension mismatch among tensorszJsdpa_backward_flop_count: grad_out/value/key/query shapes are incompatibler   )r~   r   r   r   r   rZ   r   r   r   r   r   r   r   r   r   r   r   _b4_h4_s4_d4s                        r   sdpa_backward_flop_countr     s2   K NA#"Cc$Cc3'Cc3!s!c!)?S)?C)?szeff#:SZsczijjK 8QUC-s/@AAK 8QUC-sC/@AAK8QUC-sC/@AAK 8QUC-s/@AAK8QUC-s/@AAKr!   c                    [        XX#5      $ )z(Count flops for self-attention backward.r   )r~   r   r   r   r&   r'   r(   s          r   sdpa_backward_flopr     s    
 $NXXr!   c
                 F    [        UUUU UUUU	S9n[        S U 5       5      $ )N)r   r   r   r   r   r   r   r   c              3   @   #    U  H  u  pp4[        XAX#5      v   M     g 7fr   r   r   r   r   r   r~   s        r   r   1_flash_attention_backward_flop.<locals>.<genexpr>(  &      CI?KK 	!iUUCIr   r   )r   r   r   r   out	logsumexpr   r   r   r   r'   r(   shapess                r   _flash_attention_backward_flopr     sB    " 3	F  CI  r!   c
                 F    [        UUUU UUUU	S9n[        S U 5       5      $ )N)r   r   r   r   r   r   r   r   c              3   @   #    U  H  u  pp4[        XAX#5      v   M     g 7fr   r   r   s        r   r   5_efficient_attention_backward_flop.<locals>.<genexpr>I  r   r   r   )r   r   r   r   r   r   r   r   r   r   r'   r(   r   s                r   "_efficient_attention_backward_flopr   .  sB    " 7!!!!	F  CI  r!   c                 6    [        U [        5      (       d  U 4$ U $ r   )r   tuple)xs    r   normalize_tupler   g  s    atHr!   ) KMBTc                     [        S[        [        [        5      S-
  [        [	        U 5      5      S-
  S-  5      5      n[        U   $ )Nr   r   rJ   r   )maxminr   suffixesstr)numberindexs     r   get_suffix_strr	  p  s=     3s8}q(3s6{+;a+?A*EFGEE?r!   c                 X    [         R                  U5      nU SU-  -  S nU[         U   -   $ )Ni  z.3f)r  r  )r  suffixr  r   s       r   convert_num_with_suffixr  w  s2    NN6"E%c*E8E?""r!   c                     US:X  a  gX-  S $ )Nr   0%z.2% )numdenoms     r   convert_to_percent_strr  ~  s    zk#r!   c                 0   ^  [        T 5      U 4S j5       nU$ )Nc                 >   > [        U 5      u  pT" U6 n[        X25      $ r   )r   r   )r'   	flat_argsspecr   r)   s       r   r*   )_pytreeify_preserve_structure.<locals>.nf  s#    &t,	mc((r!   r   r,   s   ` r   _pytreeify_preserve_structurer    s     
1X) )
 Ir!   c                     ^  \ rS rSrSr    SS\R                  R                  \\R                  R                     -  S-  S\	S\
S\\\4   S-  SS4
U 4S	 jjjrS\	4S
 jrS\\\\\	4   4   4S jrSS jrS rS rS rSrU =r$ )r   i  a  
``FlopCounterMode`` is a context manager that counts the number of flops within its context.

It does this using a ``TorchDispatchMode``.

It also supports hierarchical output by passing a module (or list of
modules) to FlopCounterMode on construction. If you do not need hierarchical
output, you do not need to use it with a module.

Example usage

.. code-block:: python

    mod = ...
    with FlopCounterMode(mod) as flop_counter:
        mod.sum().backward()

Nmodsdepthdisplaycustom_mappingr.   c                 n  > [         TU ]  5         [        S 5      U l        X l        X0l        S U l        Uc  0 nUb  [        R                  " SSS9  0 [        EUR                  5        VVs0 s H%  u  pVU[        USS5      (       a  UO
[        U5      _M'     snnEU l	        [        5       U l        g s  snnf )Nc                       [        [        5      $ r   )r   intr  r!   r   <lambda>*FlopCounterMode.__init__.<locals>.<lambda>  s
    +VYJZr!   z<mods argument is not needed anymore, you can stop passing itrJ   )
stacklevel_get_rawF)super__init__r   flop_countsr  r  modewarningswarnr"   itemsgetattrr-   r   mod_tracker)selfr  r  r  r  rP   v	__class__s          r   r&  FlopCounterMode.__init__  s     	6ABZ6[
-1	!NMMXefg

WeWkWkWmnWmtqqwq*e44!-:JJWmn
 )? os   +,B1c                 N    [        U R                  S   R                  5       5      $ )NGlobal)r   r'  valuesr.  s    r   get_total_flopsFlopCounterMode.get_total_flops  s!    4##H-44677r!   c                     U R                   R                  5        VVs0 s H  u  pU[        U5      _M     snn$ s  snnf )zReturn the flop counts as a dictionary of dictionaries.

The outer
dictionary is keyed by module name, and the inner dictionary is keyed by
operation name.

Returns:
    Dict[str, Dict[Any, int]]: The flop counts as a dictionary.
)r'  r+  dict)r.  rP   r/  s      r   get_flop_countsFlopCounterMode.get_flop_counts  s7     (,'7'7'='='?@'?tq47
'?@@@s   :c                 (  ^ ^
^^ Uc  T R                   nUc  SnSS KnSUl        / SQn/ nT R                  5       m
[	        T
5      mSmU
UUU 4S jn[        T R                  R                  5       5       HB  nUS:X  a  M  UR                  S5      S	-   nXq:  a  M&  U" XgS	-
  5      nUR                  U5        MD     ST R                  ;   a'  T(       d   U H  n	S
U	S   -   U	S'   M     U" SS5      U-   n[        U5      S:X  a  / SQ/nUR                  XCSS9$ )Ni?B r   T)ModuleFLOPz% TotalFc           	        > [        T
R                  U    R                  5       5      nT	UT:  -  m	SU-  n/ nUR                  X0-   [	        UT5      [        UT5      /5        T
R                  U    R                  5        H<  u  pVUR                  US-   [        U5      -   [	        UT5      [        UT5      /5        M>     U$ )N z - )r   r'  r4  appendr  r  r+  r  )mod_namer  r   paddingr4  rP   r/  global_flopsglobal_suffixis_global_subsumedr.  s          r   process_mod.FlopCounterMode.get_table.<locals>.process_mod  s     d..x8??ABK+"==EkGFMM"']C&{LA 
 ((288:eOc!f,+A}=*1l;  ; Mr!   r3  .r   r@  )r3  0r  )leftrightrL  )headerscolalign)r  tabulatePRESERVE_WHITESPACEr6  r	  sortedr'  keyscountextendr   )r.  r  rO  headerr4  rG  mod	mod_depth
cur_valuesr   rD  rE  rF  s   `         @@@r   	get_tableFlopCounterMode.get_table  s%   =JJE=E 	'+$.++-&|4"	 	, $**//12Ch		#*I $Sa-8JMM*% 3 t'''0Bq>a   !1-6Fv;!+,F  B\ ]]r!   c                     U R                   R                  5         U R                  R                  5         [	        U 5      U l        U R
                  R                  5         U $ r   )r'  clearr-  	__enter___FlopCounterModer(  r5  s    r   r]  FlopCounterMode.__enter__  sG     ""$$T*			r!   c                    U R                   c  [        S5      eU R                   R                  " U6 nS U l         U R                  R                  5         U R                  (       a$  [        U R                  U R                  5      5        U$ )Nz<Internal error: FlopCounter.__exit__ called but mode is None)r(  rL   __exit__r-  r  printrY  r  )r.  r'   rZ   s      r   ra  FlopCounterMode.__exit__  sf    99 !_``II%	!!#<<$..,-r!   c                     XR                   ;   a[  U R                   U   nU" U0 UDSU0D6n[        U R                  R                  5       H  nU R                  U   U==   U-  ss'   M     U$ )Nr$   )r"   setr-  parentsr'  )r.  func_packetr   r'   r(   flop_count_funcr   pars           r   _count_flopsFlopCounterMode._count_flops  sm    ,,,"00=O($F&F#FJ4++334  %k2j@2 5
r!   )r  r  r'  r"   r-  r(  )NrJ   TNr   )__name__
__module____qualname____firstlineno____doc__r   nnr=  r{   r   boolr9  r	   r&  r6  r  r:  rY  r]  ra  rj  __static_attributes____classcell__)r0  s   @r   r   r     s    * DH 48+((//D$99D@+ + 	+
 !cNT1+
 >B+ +*8 8
Ac4S>&9!: 
A<^~ r!   c                   @    \ rS rSrSrS\SS4S jrS rS rSS	 jr	S
r
g)r^  i  Tcounterr.   Nc                     Xl         g r   rv  )r.  rv  s     r   r&  _FlopCounterMode.__init__   s    r!   c                    SSK nUR                  U R                  R                  5      nU    U" U6 nSSS5        UR                  U R                  R                  5      nX@R                  l        WU4$ ! , (       d  f       NG= f)a]  Execute a branch function and capture its FLOP counts without
affecting self.counter.flop_counts

Args:
    branch_fn: The branch function to execute
    operands: Arguments to pass to the branch function

Returns:
    Tuple of (result, flop_counts) where result is the branch output
    and flop_counts is a copy of the FLOP counts after execution
r   N)copyrv  r'  )r.  	branch_fnoperandsr{  checkpointed_flop_countsresultr'  s          r   $_execute_with_isolated_flop_counting5_FlopCounterMode._execute_with_isolated_flop_counting#  sg     	#'99T\\-E-E#F )F ii 8 89#; {""	 Ts   A33
Bc                    U[         R                  R                  R                  [         R                  R                  R                  1;   nU(       au  SSKJn  SSKJn  U" US   5      n[        X5      (       d1  [        US5      (       a  UR                  nOO[        X5      (       d  M1  U R                  R                  US X45      $ U[         R                  R                  R                  L GaL  Uu  ppU R                  X5      u  pU[         L a  [         $ U R                  X5      u  nnU[         L a  [         $ [#        UR%                  5       5      [#        UR%                  5       5      -  n0 nU H  nUU   nUU   n0 n[#        UR%                  5       5      [#        UR%                  5       5      -  nU H6  nUR'                  US5      nUR'                  US5      n[)        UU5      UU'   M8     UUU'   M     UR+                  5        H.  u  nnU R                  R,                  U   R/                  U5        M0     U$ [         $ )Nr   )
get_kernelr0   
kernel_idxfn)r   opshigher_ordertriton_kernel_wrapper_mutation triton_kernel_wrapper_functional*torch._higher_order_ops.triton_kernel_wrapr  rD   r1   r   hasattrr  rv  rj  condr  NotImplementedre  rR  getr  r+  r'  update)r.  functypesr'   r(   	is_tritonr  r1   kernel_namepredtrue_branchfalse_branchr}  true_outtrue_flop_counts	false_outfalse_flop_countsall_mod_keysmerged_flop_counts	outer_keytrue_func_countsfalse_func_countsmerged_func_countsall_func_keysfunc_keytrue_val	false_val
inner_dicts                               r   _handle_higher_order_ops)_FlopCounterMode._handle_higher_order_ops7  s    UYY33RR"YY33TTV V	M6$VL%9:K ::;--"-..K	 !::
 <<,,[$MMUYY++000
 9=5D|)-)R)R*&H >)%%+/+T+T,(I( N*%% /4467#>O>T>T>V:WWL!#)	#3I#> $5i$@!%'" #$4$9$9$; <sCTCYCYC[?\ \ -H/33Ha@H 1 5 5h BI36x3K&x0 !.
 1C"9- * *<)A)A)C%	:((3:::F *D
 O!!r!   c                 d   U(       a  UO0 nU[         R                  R                  R                  R                  [         R                  R                  R
                  R                  [         R                  R                  R
                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                   R                  [         R                  R                  R"                  R                  [         R                  R$                  R&                  R                  1;   a  [(        $ [+        U[         R,                  R.                  5      (       a  U R1                  XX45      $ XR2                  R4                  ;  ac  U[         R                  R$                  R6                  R                  La2  U    UR8                  " U0 UD6nU[(        La  UsS S S 5        $  S S S 5        U" U0 UD6nU R2                  R;                  UR<                  XcU5      $ ! , (       d  f       N== fr   )r   r  atensym_is_contiguousdefaultis_contiguousmemory_formatis_strides_like_formatis_non_overlapping_and_denser   sym_sizestride
sym_stridestorage_offsetsym_storage_offsetnumel	sym_numeldimprimlayoutr  r   r5   HigherOrderOperatorr  rv  r"   r   	decomposerj  _overloadpacket)r.  r  r  r'   r(   rr   s          r   __torch_dispatch__#_FlopCounterMode.__torch_dispatch__t  s9   !r EIINN44<<IINN0088IINN00>>IINN99AAIINN??GGIINN''//IINN++33IINN))11IINN--55IINN1199IINN55==IINN((00IINN,,44IINN&&..IINN))113 3  "!dEJJ::;;00dKK ||111d%))..BWBWB_B_6_NND3F3N* *  D#F#||(()=)=s&QQ s   N!!
N/rx  )r  N)rl  rm  rn  ro  supports_higher_order_operatorsr   r&  r  r  r  rs  r  r!   r   r^  r^    s,    &*# D #(;"z"Rr!   r^  )Fr   )NNNFN)^r  r   loggingr   torch.utils._pytreer   r   r   module_trackerr   typingr	   r
   collections.abcr   r   typing_extensionsr   collectionsr   torch.utils._python_dispatchr   mathr   	functoolsr   r)  __all__r   r   	getLoggerrl  rF   r  r  r    r"   r9  __annotations__r-   r   mmr   rS   addmmrX   bmmr]   baddbmmr_   
_scaled_mmrg   r{   rr  rq   convolution_convolutioncudnn_convolution_slow_conv2d_forwardconvolution_overrideablerx   convolution_backwardr   r   '_scaled_dot_product_efficient_attention#_scaled_dot_product_flash_attention#_scaled_dot_product_cudnn_attentionr   r   r   r   r   _flash_attention_forwardr   _efficient_attention_forwardr   r   0_scaled_dot_product_efficient_attention_backward,_scaled_dot_product_flash_attention_backward,_scaled_dot_product_cudnn_attention_backwardr   _flash_attention_backwardr   _efficient_attention_backwardr   r   r  r	  r  r  r  r  r   r^  r  r!   r   <module>r     s      F F )  $ $ ' # :   5
6T]t_! yy~~
 !#tCH~ "XxB?O>PRZ[]_a[aRb>b5c 8 tww/3 	# 	  	 tzz"%# % #% txx C  ! t||$&C & %& t' % 	% (%( 	$#Y$#Y$ Cy$ 	$
 	$L (())..1155	7 8
 cg Oux O8
O t001e e 2eN& DD@@@@B C EI @WZ @C@	-" 1` eE#s(OU38_eCHouSRUXY]G]]^_1`r 4` eE#s(OU38_eCHouSRUXY]G]]^_4`n t44dC  	 D> t88$G 	 H>: MMIIIIK L ^b Yps YLY t55tD 	 E@ t994H 	 I@GGWJJ
 	HHh 	LL,	
 	OO_ 	i 	y 	I 	!!9 	y 	1 	00) 	,,i 	,,i 	99;M  	557I!" 	557I#$ 	!!#@%%'H""$B&&(J+0 $# #  
N N`yR( yRr!   