
    oib                    \   S SK JrJrJrJrJrJrJrJr  S SK	r	S SK
r
S SKJr  S SKJs  Jr  S SKrSSKJrJrJrJrJr  S SKJr  SSKJr  \" \" S5      5      r\\" S	5      :  rSS
KJrJrJ r J!r!J"r"J#r#J$r$J%r%  SSK&J'r'  \
RP                  RR                  r*\" SSS9S1S j5       r+ \" SSS9S 5       r, S r- \R\                  " \-5         " S S\R^                  5      r0  " S S\R^                  5      r1 SSK2J3r3  \3S:X  a)  \
Rh                  Rj                  Rm                  S 5      S   r7O(\
RP                  Rj                  Rm                  S 5      S   r7\7S-  S-  S-  S::  a  SOSr8\" SSSS\8\8SSS\S9
r9\" SSSS\8SSSS\S9
r:\" SS\9S9S 5       r; \" SSS9S 5       r< \" SS\:S9S 5       r=  " S  S!\R^                  5      r> S" r? \R\                  " \?5        S# r@ \R\                  " \@5        S$ rA \R\                  " \A5         S S%KBJCrCJDrDJErEJFrFJGrGJHrHJIrIJJrJJKrKJLrL  \K" \FR                  5      rN     S2S& jrO  S S'KPJQrQ   S S(KPJRrR  \RrSS S*KTJUrU   " S+ S,\S5      rV " S- S.\S5      rWS/ rX \R\                  " \X5        g!    NR= f!   S S)KPJSrS   ND= f! \Y a  rZ\"" S0\Z5         SrZCZgSrZCZff = f)3    )AnyListOptionalTupleUnionDictSetCallableN   )TEMPORARY_PATCHEStorch_compile_torch_compileget_torch_compile_optionsUNSLOTH_ENABLE_LOGGING)version   )Versiontransformersz4.56.0.dev0)patch_functionpatch_function_past_key_valuesdedentKWARGS_TYPEraise_errorloggerCacheprocess_return)dtype_from_configTdynamic	fullgraphc                 v   U SS S S24   R                  [        R                  5      nUb  UR                  US9nU SSS S24   R                  [        R                  5      nUb  UR                  U* US9nU[        R                  " X-  5      -  nXeS-   -  nUR                  Uc  U R
                  5      $ U5      $ )N.r   )maxr   minr"   )totorchfloat32clampsigmoiddtype)aalphalimitr*   a_gelua_linearout_geluouts           _/home/james-whalen/.local/lib/python3.13/site-packages/unsloth_zoo/temporary_patches/gpt_oss.pyswiglu_torch_forwardr3   0   s    sCaCx[^^EMM*F%(add|u}}-H>>uf%>8en55H
l
#C66U]!''6666    c                    U SS S S24   R                  [        R                  5      U SSS S24   R                  [        R                  5      pTUbW  XB:*  nUR                  5       U:*  n[        R                  " XdU5      n[        R                  " XuUR                  5       U-  5      n	O[        R                  " U[        S9=pgXEp[        R                  " X-  5      n
XU-  U
-  SU
-
  -  -   U	S-   -  nX-  n[        R                  " XkS5      n[        R                  " X|S5      n[        R                  " U 5      nXsUSS S S24'   USSS S24'   X=R                  UR                  5      -  $ )N.r   r   r*           )r%   r&   r'   abswheresign	ones_likeboolr)   
empty_liker*   )pre_actr,   r-   g1glmask_gmask_l   ḡ   l̄   σdgdlgrads                 r2   swiglu_torch_backwardrJ   >   sG   3!8.QTT	0B0E0Eemm0TqE!kk&U+kk&QVVX%56//!488S==%Bb AF++a
8B
(B
++f"
%B
++f"
%BG$D&(#DccNDaddO!!!r4   c                    ^^	^
^^^^^^^^^^  SS K m SS KnS nX!R                  R
                  l        S UR                  R
                  R                  l        [        UR                  R
                  R                  S5      (       a)  U4S jUR                  R
                  R                  l
         S UR                  R
                  R                  l         SS
K JmJn  TR                  TR                  TR                  sm	m
mUR                  m SS KnS m[#        UR$                  R&                  STSS9   " U4S jS[(        R*                  R,                  5      m  " U	U
UUU4S jS[.        R0                  5      n[#        UR$                  R&                  SU5         TR2                  R2                  m[(        R4                  R7                  T5      mU4S jn[#        UR$                  R&                  SU5         TR                  R8                  TR                  R:                  TR                  R<                  smmm SSKJ m  UUUUU4S jn [#        UR$                  R&                  SUSS9   SSKJ!m      S!U4S jjn[#        UR$                  R&                  S U5        g ! [         a  n [        SU 5      s S n A $ S n A ff = f! [         a  n [        SU 5      s S n A $ S n A ff = f! [         a  n [        S	U 5      s S n A $ S n A ff = f! [         a  n [        SU 5      s S n A $ S n A ff = f! [         a  n [        SU 5      s S n A $ S n A ff = f! [         a  n [        SU 5      s S n A $ S n A ff = f! [         a  n [        SU 5      s S n A $ S n A ff = f! [         a  n [        SU 5      s S n A $ S n A ff = f! [         a  n [        SU 5      s S n A $ S n A ff = f)"Nr   zPlease install triton_kernelsc                      gNT rN   r4   r2   is_kernels_available+patch_gpt_oss.<locals>.is_kernels_available_   s    4r4   c                      grM   rN   argskwargss     r2   <lambda>patch_gpt_oss.<locals>.<lambda>a       hlr4   z<transformers.quantizers.quantizer_mxfp4.is_kernels_available_lazy_import_kernelsc                     > T$ NrN   )rS   rT   triton_kernelss     r2   rU   rV   f   s    p~r4   c                      grM   rN   rR   s     r2   rU   rV   i   rW   r4   z8transformers.quantizers.quantizer_mxfp4.Mxfp4HfQuantizer)
matmul_ogsswiglur[   ztransformers.integrations.mxfp4c                     SSK JnJn  UR                  UR                  UR
                  pnUR                  n	UR                  R                  n
U	R                  SS9u  pU" U" XS9U40 UD6n U" U" U5      U
5      nX4$ )Nr   )tensortensor_detailsr   )mx_axisr6   )	r[   r`   ra   FP4convert_layoutwrap_torch_tensorlayoutStridedLayout"make_default_matmul_mxfp4_w_layout)ww_scalerS   rT   r`   ra   rc   rd   re   rf   rg   value_layoutvalue_layout_optss                r2   swizzle_mxfp4$patch_gpt_oss.<locals>.swizzle_mxfp4}   s    9JJ!!$$ /
  &&&--;;*0*S*S\]*S*^',Q:L^L]^ !!27!;]Kzr4   rm   relaxedmatch_levelc                   D   > \ rS rSr\U 4S j5       r \U 4S j5       rSrg)2patch_gpt_oss.<locals>.Mxfp4GptOssExperts_Training   c                   > T	" UR                  [        R                  5      UR                  UR                  UUS UR
                  S S S9	n[        UUR                  UR                  5      nT	" UUR                  UR                  US UUR                  UR                  S S9	nU R                  UUR                  UR                  UR                  UR                  UR                  5        X l        X@l        XPl        X0l        U$ )N)gather_indxscatter_indxprecision_configgammasfused_activation)r%   r&   bfloat16gate_up_projgate_up_proj_biasgate_up_proj_precision_configr3   r,   r-   	down_projdown_proj_biasdown_proj_precision_config	gate_scalsave_for_backwardsrc_indxdst_indx
self_class
gather_idxscatter_idxrouting_data)
ctxhidden_statesr   r   r   r   pre_activationswiglu_outputr1   r]   s
            r2   forward:patch_gpt_oss.<locals>.Mxfp4GptOssExperts_Training.forward   s    (  0'',,&!!+!I!I!%
N 1    M
 $$)) (!+!F!F#--!%
C !!&&####$$$$  *N)N*O+Jr4   c                    > [        S5      e)NzBackwards pass using MXFP4 is still under construction!
Instead, use `unsloth/gpt-oss-20b-BF16` for bfloat16 training which will work for LoRA.
Or, use `load_in_4bit = True` which allows finetuning.)NotImplementedErrorsaved_tensorsr   r-   r,   index_selectmul_	unsqueezer   dataswapaxes	transpose
contiguousr   r   rJ   r|   r   r&   
zeros_like
index_add_)r   
grad_tokenr>   gamma
gather_src
gather_dstscatter_srcscatter_dstr   r-   r,   grad_expWd_Tr?   Wu_Tdx_expdx_tokenr]   s                    r2   backward;patch_gpt_oss.<locals>.Mxfp4GptOssExperts_Training.backward   s    %I r4   rN   N)__name__
__module____qualname____firstlineno__staticmethodr   r   __static_attributes__)r]   s   r2   Mxfp4GptOssExperts_Trainingrs      s2    	/	 
/	` 			7 
	7: 	r4   r   c                   r   >^  \ rS rSrU 4S jrS\R                  S\R                  4UUUUU4S jjrSrU =r	$ ))patch_gpt_oss.<locals>.Mxfp4GptOssExperts   c           
        > [         TU ]  5         UR                  U l        UR                  U l        UR
                  U l        [        R                  " [        R                  " U R                  SU R                  -  U R
                  S-  S[        R                  S9SS9U l        [        R                  " [        R                  " U R                  SU R                  -  U R
                  S-  [        R                  S9SS9U l        [        R                  " [        R                  " U R                  SU R                  -  [        R                  S9SS9U l        [        R                  " [        R                  " U R                  U R
                  U R                  S-  S4[        R                  S9SS9U l        [        R                  " [        R                  " U R                  U R
                  U R                  S-  [        R                  S9SS9U l        [        R                  " [        R                  " U R                  U R
                  [        R                  S9SS9U l        SU l        ['        USS	5      U l        S U l        S U l        g )
Nr          r6   Frequires_gradZd;?swiglu_limit      @)super__init__num_local_expertsnum_expertsintermediate_sizehidden_sizenn	Parameterr&   zerosuint8gate_up_proj_blocksgate_up_proj_scalesr'   r}   down_proj_blocksdown_proj_scalesr   r,   getattrr-   r~   r   selfconfig	__class__s     r2   r   2patch_gpt_oss.<locals>.Mxfp4GptOssExperts.__init__   s   G%77D%+%=%=D"%11D')||D,,a$2H2H.H$JZJZ^`J`bdlqlwlwx#(D$ (*||D,,a$2H2H.H$JZJZ^`J`hmhshst#(D$ &(\\D,,a$2H2H.HPUP]P]^ns&D" %'LLT--t/?/?AWAW[]A]_abjojujuv#%D! %'LLD,,d.>.>@V@VZ\@\didodop#%D! #%,,D,,d.>.>emmTdi#D DJ =DJ15D..2D+r4   r   returnc                 P  > [        UR                  5         [        U S5      (       d-  T" T" STS5      U R                  U R                  4S5      U l        UR                  (       d  T
" UR                  [        R                  5      U R                  U R                  UUU R                  S U R
                  S9nT
" UU R                  U R                  UUU R                  U(       a  UR                   OS S9nOT	R#                  UU UUU5      nS S S 5        U$ ! , (       d  f       W$ = f)Nactr^   )r,   r-   r   )rv   rx   ry   rz   )rw   rx   ry   )torch_cuda_devicedevicehasattrr,   r-   r   r   r%   r&   r{   r|   r}   r~   r   r   r   r   apply)r   r   r   r   r   intermediate_cache1intermediate_cache3FnSpecsFusedActivationr   r]   	swiglu_fns          r2   r   1patch_gpt_oss.<locals>.Mxfp4GptOssExperts.forward  s   "=#7#78tU++.wxL^/_bfblblnrnxnxay{|}DH$22*4%((8))..$$.)-)K)K#)-	+' +5+++$%0)-)H)H9E|554+' +F*K*K%$"#+'1 9> '&? 98> '&s   C5D
D%)r   r,   r   r   r   r   r}   r   r~   r   r   r   r-   r   
r   r   r   r   r   r&   Tensorr   r   __classcell__)r   r   r   r   r]   r   s   @r2   Mxfp4GptOssExpertsr      s2    !	3F 	'  	'afamam  	'  	'B 	r4   r   ztriton_kernels.routing.routingc                   > UR                   S   nUR                  SU R                  R                  5      n[        R
                  R                  XR                  R                  U R                  R                  5      n[        UR                  5         T" X0R                  R                  5      u  pEnS S S 5        U R                  UWWW5      nUR                  USU R                  R                  5      nXs4$ ! , (       d  f       NL= f)Nr   )shapereshaperouter
hidden_dimr   
functionallinearweightbiasr   r   top_kexperts)	r   r   
batch_sizerouter_logitsr   r   r   
routed_outroutings	           r2   mlp_forward"patch_gpt_oss.<locals>.mlp_forward<  s    "((+
%--b$++2H2HI,,]KK<N<NPTP[P[P`P`a}3344;M;;K\K\4]1Lk 5 \\-z;W
''
B8N8NO
(( 54s    C::
Dr   ztriton_kernels.matmul_ogs)shard_and_distribute_modulezEtransformers.integrations.tensor_parallel.shard_and_distribute_modulec                   > UR                  SS 5      nUR                  SS 5      nUR                  SS 5      nUR                  SS 5      n	UR                  SS 5      n
UR                  SS 5      nS GH=  nX;   d  M  Ub  T" XbXqXX5        O;[        XR                  SS	5      S	   [        R                  R                  US
S95        U S3nU S3n[        X5      n[        X5      nUR                  R                  S:w  d  M  UR                  R                  S:w  d  M  UR                  S5      nUS:X  a!  UR                  UU R                  S-  S5      nO UR                  USU R                  S-  5      n[        USU5      S:X  a  SnUR                  U5      nUR                  U5      n[        R                  R                  U5         T" UR                  SS5      UR                  SS5      5      u  nnS S S 5        US:X  a6  [        R                  " UU R                   U R                  S-  /5      Wl        O2[        R                  " UU R                  U R                   /5      Wl        [        XU5        [        U U S3T" WT" T" 5       S9S95        [%        X5        [%        X5        AGM@     g ! , (       d  f       N= f)Nmodelempty_paramcasting_dtypeto_contiguousrankdevice_mesh)r|   r   .r   Fr   _blocks_scalesmetar   r|   r   r   typecpucuda_precision_config)rhs_data)weight_scaleflex_ctx)getsetattrrsplitr&   r   r   r   r   r   sizeviewr   r%   r   r   Sizer   r   delattr)module
param_nameparam_valuetarget_devicerS   rT   r   r   r   r   r   r   projblocks_attrscales_attrblocksscaleslocal_expertstriton_weight_tensorr  FlexCtx
InFlexDataPrecisionConfigr   rm   s                       r2   load_and_swizzle_mxfp4-patch_gpt_oss.<locals>.load_and_swizzle_mxfp4W  s   

7D)jj5

?D9

?D9zz&$'jj51D!*/K]cg F$5$5c1$=a$@%((BTBTU`puBTBvw!%g.!%g. 5 5==%%/FMM4F4F&4P$*KKNM~-!']F<T<TWX<XZ\!]!']B@X@X\]@]!^}fmDM(.#YY}5F#YY}5F**=9=J",,R4f6F6Fr26N>:,l : ~-5:ZZ*F,>,>@X@X[\@\]6,2 6;ZZ*F,D,DfFXFXY6,2
 F*>?& 12'\G]g]iLjk F0F0k 24 :9s   	-J33
K	r  )_replace_with_mxfp4_linearz:transformers.integrations.mxfp4._replace_with_mxfp4_linearc                   > UR                   (       a  U $ Uc  S/OUnUR                  b  UR                  UR                  5        [        [	        U5      5      nT" U UUUUS9u  pU(       d  [
        R                  " S5        U $ )Nlm_headr   zYou are loading your model using mixed-precision FP4 quantization but no linear modules were found in your model. Please double check your model architecture, or submit an issue on github if you think this is a bug.)
dequantizemodules_to_not_convertextendlistsetr   warning_once)r   r"  current_key_namequantization_configr   has_been_replacedr  s         r2   replace_with_mxfp4_linear0patch_gpt_oss.<locals>.replace_with_mxfp4_linear  s     ))%<0F0N)Tj55A"))*=*T*TU!%c*@&A!B#="$
  ! r4   r*  )NNNN)"r[   	Exceptionr   'transformers.quantizers.quantizer_mxfp4
quantizersquantizer_mxfp4rO   Mxfp4HfQuantizeris_trainabler   rX   r]   r^   r   r   r   transformers.integrations.mxfp4r   integrationsmxfp4r&   autogradFunctionr   Moduler   compilerdisabler  r  r  )transformers.integrations.tensor_parallelr   r  )er   rO   r^   r   r   r  r*  r  r   r   r  r   r  r  r]   r   r   r   rm   r[   s           @@@@@@@@@@@@@r2   patch_gpt_ossr<  W   sy   ?^6/G[//DPl//@@M |&&66GGI_``X~//@@UZPl//@@M	05&&!! 	-*
 $$	A.4 <,,22O]bklRenn&=&= Rf 	E ERYY EL <,,224HJ\]@ ((00..((1
) <,,22M;O;%%55%%--%%00 	-*gY= =| 	<,,224LNdt}~\N  $ 6 <,,224OQjk}
  ?:A>>?  ^Y[\]]^  ZUWXYYZ  0+Q//0
  A<a@@A~  @;Q??@,  ;6::;
  gbdeffgL  \WYZ[[\s  J, AK 4&K2 9L L8 %5M AM> N! 8O ,
K6KKK
K/K*$K/*K/2
L<LLL
L5L0*L50L58
MMMM
M;%M60M;6M;>
NNNN!
O+N<6O<O
O$OO$O$c                   j   ^  \ rS rSrU 4S jr  SS\R                  S\R                  4S jjrSrU =r	$ )GptOssExpertsi  c                   > [         TU ]  5         UR                  U l        UR                  U l        UR
                  U l        SU l        [        USS5      U l	        [        U5      U l        [        R                  " [        U R                  5       Vs/ s H:  n[        R                  " U R                  SU R                  -  U R                  S9PM<     sn5      U l        [        R                  " [        U R                  5       Vs/ s H7  n[        R                  " U R                  U R                  U R                  S9PM9     sn5      U l        g s  snf s  snf )Nr   r   r   r   r6   )r   r   r   r   r   r   
expert_dimr,   r   r-   r   r*   r   
ModuleListrangeLineargate_up_projs
down_projs)r   r   _r   s      r2   r   GptOssExperts.__init__  s   !33!-- 22
V^S9
&v.
]]4++,,
, IId&&DOO(;4::N,,
  --4++,)
, IIdoot'7'7tzzJ,)
 	,
)
s   AE>Er   r   c                 @   UR                   S   nUR                  SU R                  5      nUR                   S   nU R                  (       Ga7  [        R
                  " U[        R                  UR                  S9n[        U5       H  n[        R                  " 5          [        R                  " X':H  5      u  pS S S 5        UW   n
U R                  U   " U
5      n[        XR                  U R                  5      nU R                  U   " U5      nXXS 4   R!                  [        R                  5      -  nUR#                  SX5        M     UR%                  USU R                  5      nUR!                  UR&                  5      $ UR)                  S5      R+                  USS5      n[-        U R                  5       VVs/ s H  u  nnU" UU   5      PM     nnn[        R.                  " USS9n[        XR                  U R                  UR&                  S9n[-        U R                  5       VVs/ s H  u  nnU" UU   5      PM     nnn[        R.                  " USS9nUR1                  SS5      R)                  S5      nUR!                  [        R                  5      UR!                  [        R                  5      -  R3                  SS9nUR%                  USU R                  5      R!                  UR&                  5      $ ! , (       d  f       GNP= fs  snnf s  snnf )Nr   r   r   r*   r   dimr6   )r   r   r   trainingr&   r   r'   r   rB  no_gradr9   rD  r3   r,   r-   rE  r%   r   r
  r*   r   expand	enumeratestackr   sum)r   r   router_indicesrouting_weightsr   r   next_states
expert_idx	token_idxrF  current_stategate_upgated_outputr1   weighted_outputX_repr;  up_lgate_up_listfuseddown_lout_listoutsrwmixeds                            r2   r   GptOssExperts.forward  s    #((+
%--b$2B2BC%++A.===**=VcVjVjkK $K0
]]_#(;;~/K#LLI % !.i 8,,Z8G3GZZT ooj1,?"%	t8S(T(W(WX]XeXe(f"f&&q)E 1 &**:r4;K;KLK>>-"5"566!++A.55k2rJE:CDDVDV:WX:Wwq$DqN:WLXkk,A6G(**djjRWR]R]^E ;DDOO:TU:TYQuQx(:THU;;xQ/D **1a0::2>BWWU]]+beeEMM.BBGGAGNE::j"d.>.>?BB=CVCVWW? %_$ Y Vs   L>L/L
L	)r,   rE  r*   r@  rD  r   r-   r   NNr   r   s   @r2   r>  r>    s8    , 	1X||1X
 
1X 1Xr4   r>  c                   B   ^  \ rS rSrU 4S jr\" SSS9S 5       rSrU =r$ )GptOssTopKRouteri  c                    > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        [        R                  " U R                  U R
                  [        U5      S9U l        g )Nr6   )r   r   num_experts_per_tokr   r   r   r   r   r   rC  r   r   r   s     r2   r   GptOssTopKRouter.__init__  s[    //
!33 ,,ii1A1AIZ[aIbcr4   Tr   c                 F   UR                  SU R                  5      nU R                  UR                  U R                  R                  R
                  5      5      n[        R                  " X R                  SS9u  p4UR
                  [        R                  :X  a  [        R                  OUR
                  n[        R                  R                  R                  US[        R                  S9R                  U5      n[        R                  " X%S9R                  SXC5      nXd4$ Nr   rJ  r   rK  r*   r6   )r   r   r   r%   r   r*   r&   topkr   float16r'   r   r   softmaxr   scatter_r   r   r   router_top_valuerR  r*   router_scoress          r2   r   GptOssTopKRouter.forward  s    %--b$//BM$4$4T[[5G5G5M5M$NO+0::mZZUW+X(!.!4!4!E=K^K^ 88..667GQV[VcVc6dgghmn((FOOPQSat,,r4   )r   r   r   r   )	r   r   r   r   r   r   r   r   r   rf  s   @r2   rh  rh    s$    d Tt4- 5-r4   rh  )DEVICE_TYPExpur   i   (   F
epilogue_fusionmax_autotuneshape_padding
cudagraphscoordinate_descent_tuningcombo_kernelsmemory_planningmulti_kerneluse_block_ptrloggingr   r    optionsc           
      $   U R                  U5      u  p#UnU R                  nUR                  S   nUR                  SUR                  5      nUR                  S   nUR                  S5      R                  USS5      n[        UR                  5       V	V
s/ s H  u  pU
" X   5      PM     nn	n
[        R                  " USS9nUR                  [        R                  :w  a  [        R                  OUR                  n[        XR                  UR                   US9nUR#                  U5      n[%        UR&                  R(                  [*        5      (       a0  UR&                  R(                  S:w  a  UR&                  R(                  OSn[        R,                  " USS	9   [        UR.                  5       V	Vs/ s H  u  n	nU" X   R#                  U5      5      PM!     nn	nS
S
S
5        [        R                  " WSS9nUR#                  U5      R1                  SS5      R                  S5      nUU-  R3                  SS9nUR5                  USUR                  5      R#                  UR                  5      $ s  sn
n	f s  snn	f ! , (       d  f       N= f)z=Torch compile for forward inference path only with CUDAGraphsr   r   r   rJ  r6   mpsr   Fdevice_typeenabledN)r   r   r   r   r   r   rN  rO  rD  r&   rP  r*   r{   r'   r3   r,   r-   r%   
isinstancer   r   strautocastrE  r   rQ  r
  )r   r   ru  rR  rS  moer   r   r[  r;  r\  r]  rX  r*   r^  r  r_  r`  ra  rb  rc  s                        r2   moe_forward_inferencer  9  s    %)KK$>!M#O
,,C$$Q'J!))"coo>M!''*K##A&--k2rBE 3<C<M<M2NO2NwqDN2NLOkk,a0G*00ENNBEMMH[H[E ))SYYNE HHUOE'1%,,2C2CS'I'IellN_N_chNh%,,##nsK	K	?@I#..@YZ@Y91fF58;;u-.@YZ 
@;;xQ'D			E	"	,	,Q	2	<	<R	@BBYOOO"E::j"coo699-:M:MNN P [ 
@	?s$   I5J7&I;J;J
Jc                 h   UR                  SU R                  5      n[        R                  " UR	                  U R
                  R                  5      U R
                  U R                  5      n[        R                  " X R                  SS9u  p4UR                  [        R                  :X  a  [        R                  OUR                  n[        R                  R                  R                  US[        R                  S9R	                  U5      n[        R                   " X%S9R#                  SXC5      nXd4$ rm  )r   r   Fr   r%   r   r*   r   r&   ro  r   rp  r'   r   r   rq  r   rr  rs  s          r2   moe_router_forwardr  X  s    !))"doo>MHH]--dkk.?.?@$++tyyYM',zz-QS'T$*00EMMAEMM}GZGZExx**223CRWR_R_2`ccdij$$]BKKA~pM((r4   c                 
   [        U R                  U5      u  p#UnU R                  nUR                  S   nUR	                  SUR
                  5      nUR                  S   nUR                  US5      nUR                  USUR
                  5      n[        R                  " XR                  5      UR                  SS S S 24   -   nUSS S S24   USSS S24   pU	R                  S UR                  S9n	U
R                  UR                  * UR                  S9n
U	[        R                  " U	R                  [        R                   5      UR"                  -  5      R                  U	R$                  5      -  n[        R                  " U
S-   U-  UR&                  5      nXR(                  SS S S 24   -   nUR                  XvSUR
                  5      nXR+                  SS5      R                  XvS5      S   -  nUR-                  SS9nU$ )	Nr   r   r   .r   r#   ).NrJ  )r  r   r   r   r   r   repeatr
  r&   bmmr|   r}   r(   r-   r)   r%   r'   r,   r*   r   r   r   rQ  )r   r   ru  rR  rS  r  r   r   rX  gateupglurT  s                r2   moe_forward_inference_bf16r  d  s   $6t{{M$R!M#O
,,C$$Q'J!))"coo>M!''*K!((a8M!&&{BHMii'7'783;P;PQTVZ\]Q];^^GsCaCx '#qt!t)"4"::$CII:.D	syyjcii	0B
twwu}}5		ABEEdjjQ
QC))b1f^cmm<K 2 23a< @@K"";BPK 9 9!Q ? D D[^` abk llK//a/(Kr4   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )	GptOssMLPi}  c                 b   > [         TU ]  5         [        U5      U l        [	        U5      U l        g rZ   )r   r   rh  r   r>  r   r   s     r2   r   GptOssMLP.__init__~  s&    &v.$V,r4   c                     UR                   u  p#nUS:X  a  U R                  (       d  [        X5      S 4$ U R                  U5      u  pVU R	                  XUS9nXu4$ )Nr   )rR  rS  )r   rL  r  r   r   )r   r   bszqlenhdru  rR  r   s           r2   r   GptOssMLP.forward  s[    %++219T]](=tCC(,M(B%\\-`m\n
((r4   )r   r   )r   r   r   r   r   r   r   r   rf  s   @r2   r  r  }  s    -
) )r4   r  c                     S[         R                  R                  SS5      ;  a  g S[         R                  R                  SS5      ;  a  g  SS Kn [         R                  R                  SS5      S	:X  a5    SS
[        R                  S[        R                  4S jjn U[        l	         [        U R                  R                  R                  l        [        U R                  R                  R                  l        [        U R                  R                  R                  l        g ! [         a  n[        SU5      s S nA$ S nAff = f)Ngpt_ossUNSLOTH_MODEL_NAME _load_in_4bit_r   z,transformers.models.gpt_oss.modeling_gpt_ossUNSLOTH_FORCE_FLOAT3201r   r   c           
      V	   UR                   S   nUR                  SU R                  5      nUR                   S   nU R                  (       Ga  [        R
                  " U[        R                  UR                  S9n[        U5       GH  n[        R                  " 5          [        R                  " X':H  5      u  pS S S 5        UW   n
U R                  U   " U
5      nU R                  U   n[        XR                  U R                  [        R                  S9nUR!                  [        R                  5      n[#        UR                  R$                  [&        5      (       a0  UR                  R$                  S:w  a  UR                  R$                  OSn[        R(                  " USS	9   U" U5      nS S S 5        WR!                  [        R                  5      X8US 4   R!                  [        R                  5      -  nUR+                  SUU5        GM     UR-                  USU R                  5      nUR!                  [        R                  5      $ UR/                  S5      R1                  USS5      n[3        U R                  5       VVs/ s H  u  nnU" UU   5      PM     nnn[        R4                  " USS
9nUR6                  [        R8                  :w  a  [        R                  OUR6                  n[        XR                  U R                  US9n[#        UR                  R$                  [&        5      (       a0  UR                  R$                  S:w  a  UR                  R$                  OSn[        R(                  " USS	9   [3        U R                  5       VVs/ s H   u  nnU" UU   R!                  U5      5      PM"     nnnS S S 5        [        R4                  " WSS
9nUR;                  SS5      R/                  S5      nUR!                  U5      UR!                  U5      -  R=                  SS
9nUR-                  USU R                  5      R!                  UR6                  5      $ ! , (       d  f       GN= f! , (       d  f       GN= fs  snnf s  snnf ! , (       d  f       N= f)Nr   r   r   rI  r6   r  r   Fr  rJ  )r   r   r   rL  r&   r   r'   r   rB  rM  r9   rD  rE  r3   r,   r-   r%   r  r   r  r  r   r
  r   rN  rO  rP  r*   r{   r   rQ  )r   r   rR  rS  r   r   rT  rU  rV  rF  rW  rX  r   rY  r  r1   rZ  r[  r;  r\  r]  r*   r^  r_  r`  ra  rb  rc  s                               r2   r   )patch_gpt_oss_linearized.<locals>.forward  s    ',,Q/J)11"d6F6FGM)//2K}}}#..}EMMZgZnZno #("4J',{{>3O'P	 ) %2)$<M"00<]KG $
 ;I#7TZZafanan#oL $0??5==#AL>HI\I\IaIacf>g>glxll  mE  mE  IN  mN,"5"5":":  TYKKO'5 P&)ffU]]&;oYceiNi>j>m>mnsn{n{>|&|O**1iI) #5* *..z2t?O?OP"~~emm44%//299+r2N>GHZHZ>[\>[71dU1X>[\++l:)6)<)<)NTaTgTg,Wjj$**V[\ 4>ell>O>OQT3U3UZ_ZfZfZkZkotZtell//z^^UK *34??)C )CIAv uQx{{512)C    L
 {{83$..q!4>>rB"%%,6;;;Bzz*b$2B2BCFF}GZGZ[[[ )  PO  ]  LKs<   Q*'	Q<
RR'RR*
Q9	<
R	R
R(re  )osenvironr  ,transformers.models.gpt_oss.modeling_gpt_ossr,  r   r&   r   r>  r   modelsr  modeling_gpt_ossrh  r  )r   r;  r   s      r2   patch_gpt_oss_linearizedr    s   

';R@@&rzz~~.BBGGN;
 
zz~~-s3s: ""	@	 <<@	
 \\@	B 	 'ANL00>DTL00A=FL00:
[  NI1MMNs   D& &
E0E;EEc                  H
  ^^	^
^^^^^ [         R                  R                  SS5      S:X  a  g S[         R                  R                  SS5      ;  a  g  SSKJmJn JnJn  Tc   e  S
S K
nUR                  R                  R                  R                    S
SK
Jm
  S[         R"                  R$                  l        S[         R(                  S[*        S[         R(                  4S jm[         R,                  R.                  R0                  m	[,        R.                  R2                  m[         R4                  m S$S[,        R6                  S[         R(                  S[         R(                  S[         R(                  S[8        [         R(                     S[:        S[:        4UU	UU4S jjjn  S$S[,        R6                  S[         R(                  S[         R(                  S[         R(                  S[8        [         R(                     S[:        S[:        4UU	UU4S jjjm [=        T
5      m
 Um  S%S[         R(                  S[>        [         R(                  [         R(                  4   S[8        [         R(                     S[8        [@           S[8        [         RB                     S[D        S[>        [         R(                  [         R(                  4   4U
UU4S jjjm / n  S%S[         R(                  S[>        [         R(                  [         R(                  4   S[8        [         R(                     S[8        [@           S[8        [         RB                     S[D        S[>        [         R(                  [8        [         R(                     [8        [>        [         R(                        4   4U4S  jjjnURG                  U5          S%S[         R(                  S[>        [         R(                  [         R(                  4   S[8        [         R(                     S![8        [@           S[8        [         RB                     S[D        S[>        [         R(                  [8        [         R(                     [8        [>        [         R(                        4   4U4S" jjjnURG                  U5        [I        UR                  R                  R                  R                  S#U5        S[         R                  S'   g ! [         a  n[        S	U5      s S nA$ S nAff = f! [         a  n[        SU5      s S nA$ S nAff = f)&NUNSLOTH_ENABLE_FLEX_ATTENTIONr  r  r  r  r  r   )flex_attention_with_sinkis_flex_attention_decoding!flex_attention_with_sink_decodingflex_attention_add_sinksr  r   apply_rotary_pos_embz<transformers.models.gpt_oss.modeling_gpt_oss.GptOssAttention   r   n_repr   c                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r   N)r   rN  r   )r   r  batchnum_key_value_headsslenhead_dims         r2   	repeat_kv(patch_GptOssAttention.<locals>.repeat_kv  s`    
 6C5H5H2DA:  %aD!Q&67>>u[`hpq$$U%,GXXr4   r  querykeyvalueattention_maskscalingdropoutc           
      l  > T" X R                   5      nT" X0R                   5      n	UR                  u  ppUR                  u  ppUR                  XXS-   45      nT" XR                  SS5      US S 2S S 2S S 2S U24   S9nUU-  nUb#  US S 2S S 2S S 2S UR                  S   24   nUU-  nU R                  R                  SSS5      US S 2S S 2S S 2S4'   T" US[        R                  S9US S & UnUSS S24   nT" UX`R                  S	S
9nT" UXS9nUR                  SS5      R                  5       nUS 4$ )Nr   r      r1   r  r   rn  .TprL  inplace)
num_key_value_groupsr   	new_emptyr   sinksr   r&   r'   rL  r   )r  r  r  r  r  r  r  rT   
key_statesvalue_statesr  n_headsr  rF  kvlencombined_logitsattn_weightscausal_maskprobsscoresattn_output	F_dropout	F_softmaxmatmulr  s                        r2   inplace_eager_attention_forward>patch_GptOssAttention.<locals>.inplace_eager_attention_forward  sf    s$?$?@
 (C(CD!&d!+!1!1e$..d!G/LMe%9%9!Q%?WXYZ[\]c^c]cWcGde%(Aq2HJ4D4DR4H2H)HIKK'L (.||';';Ar1'E1a$
 'BemmTsCRCx 7__VZ[\<E!++Aq1<<>D  r4   c                 P  > T" X R                   5      nT" X0R                   5      n	T" XR                  SS5      5      n
X-  n
Ub"  US S 2S S 2S S 2S UR                  S   24   nX-  n
U R                  R	                  SSSS5      R                  UR                  S   SUR                  S   S5      n[        R                  " X/SS9nT" US[        R                  S9US S & UnUS	S S24   nT" XU R                  S
S9n
T" XUS9nUR                  SS5      R                  5       nUS 4$ )Nr   r  r  r   r   r   rJ  rn  .Tr  r  )r  r   r   r  r   rN  r&   catr'   rL  r   )r  r  r  r  r  r  r  rT   r  r  r  r  r  r  r  r  r  r  r  r  r  s                    r2   eager_attention_forward6patch_GptOssAttention.<locals>.eager_attention_forward-  s:    s$?$?@
 (C(CDe%9%9!Q%?@%(Aq2HJ4D4DR4H2H)HIK'L$$QAq188QU[[Y[_^`a))\$9rB
 'BemmTsCRCx V__VZ[\uE!++Aq1<<>D  r4   position_embeddingspast_key_valuecache_positionrT   c                   > UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      nUu  pT" XX5      u  pUb#  SU0nUR                  XU R                  U5      u  pU R                  (       a  T" U U	U
U5      nS nOPT" U U	U
UU4U R                  (       d  SOU R                  U R                  U R                  U R                  S.UD6u  nnUR                  " / UQSP76 R                  5       nU R!                  U5      nUU4$ )Nr   r   r   r  r7   )r  r  sliding_windows_aux)r   r  q_projr
  r   k_projv_projupdate	layer_idxrL  attention_dropoutr  r  r  r   r   o_proj)r   r   r  r  r  r  rT   input_shapehidden_shapequery_statesr  r  cossincache_kwargsr  r  r  r  r  s                    r2   forward_function/patch_GptOssAttention.<locals>.forward_functionU  s    $))#2.88b8$--8{{=166|DNNqRST{{=166|DNNqRST
{{=166|DNNqRST&#7RU#[ %,n=L'5'<'<ZW[WeWegs't$J4 ==2	K  L )@) $(==d6L6L#22jj) )%K "));;;;FFHkk+.L((r4   c                    > T" XX#XE40 UD6$ rZ   rN   )r   r   r  r  r  r  rT   r  s          r2   r   &patch_GptOssAttention.<locals>.forward  s)      5HZh  D  }C  D  	Dr4   past_key_valuesc                    > T" XX#XE40 UD6$ rZ   rN   )r   r   r  r  r  r  rT   r  s          r2   r   r    s)      5HZi  E  ~D  E  	Er4   r   )r7   re  )%r  r  r  flex_attentionr  r  r  r  r,  r   r  r  r  r  GptOssAttentionr  r&   _dynamor   cache_size_limitr   intr   r   rq  r  r  r7  r   floatr   tupler   
LongTensorr   appendr   )r  r  r  r;  r   r  	functionsr   r  r  r  r  r  r  r  r  s           @@@@@@@@r2   patch_GptOssAttentionr
    s   	zz~~5s;sBF

';R@@&	:	
 	
 (333^;##44DDU -0EMM)	Y 	Yc 	Yell 	Y ##++I%%I\\F $!		$!||$! \\$! ||	$!
 !.$! $! $! $!J 	 !		!||! \\! ||	!
 !.! ! ! !> 	()=> #B +/59E)||E) #5<<#=>E) !.	E)
 !E) !!1!12E) E) 
u||U\\)	*E) E)L 	I +/59	D||	D #5<<#=>	D !.		D
 !	D !!1!12	D 	D 
u||Xell3XeELL>Q5RR	S	D 	D W ,059	E||	E #5<<#=>	E !.		E
 "%	E !!1!12	E 	E 
u||Xell3XeELL>Q5RR	S	E 	E W"<#6#6#>#>#O#O#_#_ajluv25BJJ./Q  :5q99:  ^Y[\]]^s<   S '4T 
S>(S93S>9S>
T!TT!T!c                    ^	^
^^^^^^^^^ [         R                  R                  SS5      S:X  a  g S[         R                  R                  SS5      ;  a  g  SS Kn U R                  R
                  R                  R                    SSKJm
  SS	KJ	m   SSKJm	  S[        R                  R                  l        SS Kn SS Kn S n ['        U R(                  S['        U R(                  SS 5      5      m['        U R(                  S['        U R(                  SS 5      5      mTc  [        S5      $ Tc  [        S5      $ [+        U R(                  S5      (       Gdw  [-        U R(                  R.                  SSS9U R(                  l        [-        U R(                  R2                  SSS9U R(                  l        U" T5      U R(                  l        U" T5      U R(                  l        U R(                  R.                  U R                  R
                  R                  l        U R(                  R2                  U R                  R
                  R                  l        U" U R(                  R6                  5      U R(                  l        U" U R8                  R:                  R6                  5      U R8                  R:                  l        SU R(                  l         SSKJ nJ!nJ"m  [G        T5      m SSK$J%m    S8S[        RL                  S[N        [        RL                  [        RL                  4   S[P        [        RL                     S[P        [R           S [P        [        RT                     S![V        4U4S" jjjm U4S# jm S$ m [-        SSS%S&9      S9S[        RL                  S[P        [        RL                     S'[P        [        RT                     S[P        [R           S([P        [X           S [P        [        RT                     S[P        [N        [        RL                  [        RL                  4      4UU4S) jjj5       n [[        SSSSSSSSS[\        S*9
n[-        S SUS+9S,[        RL                  S-[        RL                  S.[        RL                  4UU4S/ jj5       n U4S0 jm        S:S1[P        [        RT                     S[P        [        RL                     S'[P        [        RT                     S[P        [R           S2[P        [        R^                     S([P        [X           S [P        [        RT                     S![V        S3T
4U	U
UUUUU4S4 jjjn[a        U R                  R
                  R                  R                  S5US6S79  g ! [         a  n[        S
U5      s S nA$ S nAff = f! [         a  n[        S
U5        S m	 S nAGN$S nAff = f!   S m GN= f);Nr  r  r  r  r  r  r   )MoeModelOutputWithPastr  z8transformers.models.gpt_oss.modeling_gpt_oss.GptOssModel)DynamicCachec                      g rZ   rN   rR   s     r2   rU   #patch_GptOssModel.<locals>.<lambda>  s    tr4   r  c                    ^  U 4S jnU$ )Nc                     > US   R                   (       aU  SU;   a  US   $ U  HC  n[        U5      [        R                  L d  M!  UR                  [        R
                  :X  d  MA  Us  $    g T" U 0 UD6$ )Ninput_embedsr  )r   r   r&   r   r*   int32)rS   rT   argfs      r2   return_attention_mask>patch_GptOssModel.<locals>.wrap.<locals>.return_attention_mask  sj    n%33#v-!"233CCyELL0SYY%++5M"
    $)&))r4   rN   )r  r  s   ` r2   wrappatch_GptOssModel.<locals>.wrap  s    
	 %$r4   _old_create_causal_maskcreate_causal_mask&_old_create_sliding_window_causal_mask!create_sliding_window_causal_maskz-transformers.masking_utils.create_causal_maskz<transformers.masking_utils.create_sliding_window_causal_mask__patched_causal_mask__FT)r    r   r   )r  r  r  )r   r   r  r  r  r  rT   c                   > UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      nUu  pT" XX5      u  pUb#  SU0nUR                  XU R                  U5      u  pXX4$ )Nr   r   r   r  )	r   r  r  r
  r   r  r  r  r  )r   r   r  r  r  r  rT   r  r  r  r  r  r  r  r  r  s                  r2   pre_attention_decoding1patch_GptOssModel.<locals>.pre_attention_decoding  s     $))#2.88b8$--8{{=166|DNNqRST{{=166|DNNqRST
{{=166|DNNqRST&#7RU#[ &,n=L'6'='=jX\XfXfht'u$JBBr4   c                    > T" XU5      nUR                   " / UQSP76 R                  5       nU R                  U5      nU$ )Nr   )r   r   r  )	self_attnr  	logsumexpr  r  s       r2   post_attention_decoding2patch_GptOssModel.<locals>.post_attention_decoding  sF    .yyQ!));;;;FFH&&{3r4   c                    UR                   nUR                  [        R                  5      nUR	                  5       R                  SSS9nX0R                  -  nU[        R                  " U5      -  nXR                  R                  UR                  5      R                  [        R                  5      -  nUR                  U5      $ )Nr   T)keepdim)
r*   r%   r&   r'   squaremeanvariance_epsilonrsqrt_r   r   )r   r   input_dtypevariances       r2   rms_layernorm_forward0patch_GptOssModel.<locals>.rms_layernorm_forward'  s    #))%((7 '')..r4.@)))h//(<(<=@@OO,,r4   zreduce-overhead)r   r    modeposition_ids	use_cachec                 f   > T" U R                   U5      nT" U R                  UUUUUUUS9u  ppXX4$ )N)r   r   r  r2  r  r3  r  r  )input_layernormr#  )r   r   r  r2  r  r3  r  r  r  r  r  r  r   r/  s               r2   pre_forward&patch_GptOssModel.<locals>.pre_forward2  sO     .d.B.BMR>T')%+) 3	?
;, BBr4   rz  r  residualr  r$  c                    > T" U R                   X#U5      nXQ-  nUR                  5       nT" U R                  U5      nXQ4$ rZ   )r#  clonepost_attention_layernorm)r   r8  r  r$  r  r   r%  r/  s         r2   post_forward'patch_GptOssModel.<locals>.post_forwardW  sK     0Xcd! !&&(-d.K.K][&&r4   c                   > UR                  5       n	T" U R                  U5      nU R                  " SUUUUUUUS.UD6u  pXR                  UR                  5      -  nUR                  5       n	T" U R
                  U5      nX4$ )N)r   r  r2  r  r3  r  r  rN   )r:  r5  r#  r%   r   r;  )r   r   r  r2  r  r3  r  r  rT   r8  rF  r/  s              r2   inference_forward,patch_GptOssModel.<locals>.inference_forwardh  s     !&&(-d.B.BMR>> 	
')%+) 3	
 	
 	]%9%9:: !&&(-d.K.K][&&r4   	input_idsinputs_embedsr   c                 J  > US L US L-  (       a  [        S5      eU(       a  Uc  T" U R                  S9nUcX  U R                  R                  R                  n	U R                  UR                  U	SS95      R                  UR                  5      nU R                  (       d  UR                  S5        UcD  Ub  UR                  5       OSn
[        R                  " XUR                  S   -   UR                  S9nUc  UR                  S5      nUnU R                  X5      n [        R                  R                  US5        [        R                  R!                  US5        [        R                  R                  US	5        U R                  (       d8  [#        U[$        5      (       d#  U R                  UUUUS
.nT" S0 UD6T" S0 UD6S.nUR                  u  pnU R                  (       Gd-  US:X  Ga&  [#        U[$        5      (       Ga  [        R&                  R)                  5         U R*                   H  nT" UUUUR,                     UUUUU40 UD6u  nn[/        UR0                  R2                  S5      (       a  [5        UR0                  U5      nOiUR0                  R2                  R6                  R8                  S:X  a%  Tc  [;        S5      eT" UR0                  U5      u  nnO[=        UR0                  U5      nUU-  nM      T" U R>                  U5      nO[U R*                   H9  n[#        U[$        5      (       a  UUR,                     OUnU" U4UUUUUUS.UD6nM;      U R?                  U5      nUR                  UR@                  5      n[C        TUUS.5      $ !    GN!= f)Nz:You must specify exactly one of input_ids or inputs_embedsr   T)non_blockingFr   r   )r   r   )r   r  r  r  r  )full_attentionsliding_attentionrD  r   z#Unsloth: MXFP4 forward is not found)r  r2  r  r3  r  r  )last_hidden_stater  rN   )"
ValueErrorr   embed_tokensr   r   r%   rL  requires_grad_get_seq_lengthr&   aranger   r   
rotary_embr  mark_staticmark_dynamicr  dictr8  cudagraph_mark_step_beginlayersattention_typer   mlpr   r  r   r   RuntimeErrorr  normr*   r   )r   rA  r  r2  r  rB  r3  r  rT   embed_devicepast_seen_tokensr   r  mask_kwargsr  r  r  decoder_layerr8  rF  maskr  r  r  r  r?  r   r/  s                        r2   r   "patch_GptOssModel.<locals>.forward  sz    -t";<YZZ0*$++>O ,,33::L --ill<X\l.]^aabkbrbrsM}}((/!CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L%"oomJ	MM%%}a8MM&&}a8MM%%}a8
 }}Z%E%E++ -"0"0#2K #5"C{"C%F%U%UN &++2}}}z.$/O/O NN446!%*;!!"=#?#?@ #"'
+ 
+'x =,,44oFF$9-:K:K]$[M"&&..88AAEYY"**+PQQ'2=3D3Dm'T$M1$>}?P?PR_$`M)) "-* 1$))]KM!%GQR`bfGgGg~m&B&BCm{ -!	!#'!-$3'#1(;	! 	! "-  IIm4M%(()<)<=4"/ /7
  		s   !A N N"r   ro   rp   re  )NNNFNN)NNNNNNN)1r  r  r  r  r  r  r  GptOssModelr  r  r,  r   r  r&   r  r   r  transformers.masking_utilstransformers.generation.utilsr   masking_utilsr   r   r  r  r  r  create_masks_for_generate
generationutilsr  r   r  r  r  r   r2  r   r   r  r   r   r  r   r<   r   r   FloatTensorr   )r   r;  r  r  r  r6  fused_torch_compile_optionsr<  r   r  r  r  r  r  r  r?  r   r%  r   r/  s            @@@@@@@@@@@r2   patch_GptOssModelrf    s   	zz~~5s;sBF

';R@@&Z;##44@@WU4M
 -0EMM) &(% 	 ""!**,@$G
 )0""0**,OQUV)%
 !JKK(0YZZ<--/HII=KLLfLfLyLy  HM  Y]  >^"":LZ[g[u[u  \X  \X  fk  w{  M|""I8<=O8P""5GKLmGn""DJVJdJdJwJw##44GYeYsYs  ZV  ZV##44V?CLD^D^DxDx?y""<BF|G^G^GdGdG~G~B%%?=A"": 
 ))=>? ,059C||C #5<<#=>C !.	C
 "%C !!1!12C C C( 	
 	- 	 d>OP 2637+/$)59KOC||C !.C u//0	C
 "%C D>C !!1!12C &eELL%,,,F&GHC C QC. 	";$)(# d@[\',,' \\' <<	' ]' 	'< 	 151537+/59$(59hE,,-h !.h u//0	h
 "%h   1 12h D>h !!1!12h h 
 h hR <&&..??KKYX_oxyc	  ZUWXYYZ  4NPQR34rsB   :U: V !W :
VVVV
W'V<<WW)
AuthorConversationDeveloperContentHarmonyEncodingNameMessageRoleSystemContentToolDescriptionload_harmony_encodingReasoningEffortc           	      \    [           US;   d   eU=S:X  a    [        R                  nO-=S:X  a    [        R                  nOS:X  a  [        R
                  n/ nSS KnUR                  R                  5       R                  S5      n	[        R                  " [        R                  [         R                  " 5       R                  U5      R                  W5      R!                  U	5      R#                  S5      R%                  / S	Q5      5      n
UR'                  U
5        [(        R                  " 5       nUb  UR+                  U5      nUbX  / nU H?  nUS
   nUS   nUS   nUS   n[,        R                  " XU5      nUR'                  U5        MA     UR/                  U5      n Uc  Ub6  [        R                  " [        R0                  U5      nUR'                  U5        U  GH  nUS   S:X  a9  UR'                  [        R                  " [        R2                  US   5      5        MF  US   S:X  a  SU;   a:  [        R                  " [        R4                  US   5      nUR7                  S5      nOSU;   aj  [        R                  " [        R4                  US   S   S   5      nUR7                  S5      R9                  SUS   S   S    35      R;                  S5      nO9[        R                  " [        R4                  US   5      nUR7                  S5      nUR'                  U5        GML  US   S:X  d  GMX  [        R<                  " [>        R                  " [        R@                  SUS    35      US   5      R9                  S5      R7                  S5      nUR'                  U5        GM      [B        RD                  " U5      nU(       a%  [F        RI                  U[        R4                  5      nO[F        RK                  U5      n[F        RM                  U5      nUU4$ !   [        S5      e= f)Nz>Please install openai_harmony via `pip install openai_harmony`)lowmediumhighrr  rs  rt  r   z%Y-%m-%dz2024-06)analysis
commentaryfinalfunctionnamedescription
parametersroleusercontent	assistantthinkingru  
tool_calls	argumentsrv  z
functions.jsonrw  tool)'rm  ImportErrorrp  LOWMEDIUMHIGHdatetimetodaystrftimerk  from_role_and_contentrl  SYSTEMnewwith_model_identitywith_reasoning_effortwith_conversation_start_datewith_knowledge_cutoffwith_required_channelsr  ri  with_instructionsrn  with_function_tools	DEVELOPERUSER	ASSISTANTwith_channelwith_recipientwith_content_typefrom_author_and_contentrg  TOOLrh  from_messagesencoding"render_conversation_for_completionrender_conversationdecode)messagesreasoning_effortadd_generation_promptr  developer_instructionsmodel_identityharmony_reasoningconvosr  r  systemdev	new_toolsrx  ry  rz  r{  r  messagexharmony_input_idsharmony_decoded_texts                         r2   !encode_conversations_with_harmonyr    s   \ 8888
?+>+>(?+A+A(?+?+?(F ##%..z:E**4;;  0""#45))%0""9-##$GHF MM& 


 C)1F1FG]1^3	"H
+HF#D"=1K!,/J"&&t*EDT" # %%i0)Z-C++DNNC@c6?f$MM--dii9KL V_+W$11$..')BTUNN:.(11$..',BWXYBZ[fBghNN<0$nz',2G2J62R1S%TU''/  11$..')BTUNN7+MM!V_&//JJtyyJwv6G*HII& !.-ll<.H  MM!/ 0 	 ''/F$GGPTP^P^_$88@#??+<=!222Y\Z[[s   P P+)layer_type_validation)PreTrainedConfig)PretrainedConfig)rope_config_validationc                      ^  \ rS rSrSrSrS/S/4SS/S/4S/S/4S.rS	S	S	S
SSSSSSSS.rSSSSSSSSSSSSSSSSSSSSS.S S!S"SS#S$4S%\S&\S'\S(\S)\S*\S+\S,\S-\S.\	S/\
S0\	S1\	S2\	S3\	4U 4S4 jjjrS5rU =r$ )6Old_GptOssConfigio  
This will yield a configuration to that of the BERT
[google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased) architecture.

r  rA  rB  r   r  rI  rR  rV  colwiserowwiselocal_rowwisegather	ep_routergrouped_gemmzlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.self_attn.sinkszlayers.*.mlp.expertszlayers.*.mlp.routerz!layers.*.mlp.experts.gate_up_projz&layers.*.mlp.experts.gate_up_proj_biaszlayers.*.mlp.experts.down_projz#layers.*.mlp.experts.down_proj_bias$       @  @          OAFsilu{Gz?   h㈵>yarn      @@      ?	rope_typefactor	beta_fast	beta_slowtruncater7      ?TNnum_hidden_layersr   
vocab_sizer   r   r  num_attention_headsr  r  
rope_theta
hidden_actinitializer_rangerms_norm_epsr  router_aux_loss_coefc                   > X0l         X@l        XPl        Xl        Xpl        X l        Xl        UU l        Uc  UnXl        Xl	        Xl
        Xl        Xl        UU l        UU l        Ub  UOU R                  U R                  -  U l        UU l        U R                   cC  [#        U R                  5       Vs/ s H  n[%        US-   S-  5      (       a  SOSPM     snU l        ['        U R                   5        U R                  b,  SU R                  ;   a  U R                  S   U R                  S'   [)        U 5        SU l        Xl        UU l        UU l        UU l        [4        TU ]l  " S	SU0UD6  g s  snf )
Nr   r   rF  rE  r   r  Ttie_word_embeddingsrN   )r  r   r   r  r  r   r  rj  r  r  r  r  r  rope_scalingr  r  layer_typesrB  r<   r  r  attention_biasmax_position_embeddingsr  output_router_logitsr3  r   r   r   r  r   r  r   r   r  r  r  r  r  r  r  r  r  r  r  r  rj  r  r  r3  r  rT   ir   s                            r2   r   Old_GptOssConfig.__init__  sz   4 )O*%6"%6"':$%6""0':D$"*&9#':$(O%6" ,(O ,D%6D"(0(<H$BRBRVZVnVnBnDM*D'\abfbxbx\y$\yWX4Q!+<+<'BRR\y$  "$"2"23   ,4;L;L1L151B1B61J!!+."4("&D+B((<D%(<D%&DNG $7!$s   2$E,r  r  r  r  r   r  r   r  r  r  rj  r  r  r   r  r  r  r  r  r  r3  r  r   r   r   r   __doc__
model_typebase_model_pp_planbase_model_tp_planr  r  r  r   r   r   rf  s   @r2   r  r  o  sn   	 
)]_,=>')9:_<MN%&(9:
 *3)2)2)2(7$,#.1?6D.<3A
  &(%($#%)')'("% ( %$'+$*"&'-D_bpuv'* !*-!&/C	"C	  #C	 	C	
 C	  #C	 C	 "%C	 "%C	  C	 C	 C	  %C	   !C	$  %%C	( #()C	 C	r4   r  c                      ^  \ rS rSrSrSrS/S/4SS/S/4S/S/4S.rS	S	S	S
SSSSSSSS.rSSSSSSSSSSSSSSSSSSSSS.S S!S"SS#S$4S%\S&\S'\S(\S)\S*\S+\S,\S-\S.\	S/\
S0\	S1\	S2\	S3\	4U 4S4 jjjrS5rU =r$ )6GptOssConfigi  r  r  rA  rB  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  Fr  r  r  r  r  r  r  r  r7   r  r  TNr  r   r  r   r   r  r  r  r  r  r  r  r  r  r  c                 &  > X0l         X@l        XPl        Xl        Xpl        X l        Xl        UU l        Uc  UnXl        Xl	        Xl
        Xl        Xl        UU l        UU l        Ub  UOU R                  U R                  -  U l        UU l        U R                   cC  [#        U R                  5       Vs/ s H  n[%        US-   S-  5      (       a  SOSPM     snU l        ['        U R                   5        SU l        Xl        UU l        UU l        UU l        U R                  b,  SU R                  ;   a  U R                  S   U R                  S'   [3        U 5        SU l        Xl        UU l        UU l        UU l        [4        TU ]l  " S	SU0UD6  g s  snf )
Nr   r   rF  rE  Tr   r  r  rN   )r  r   r   r  r  r   r  rj  r  r  r  r  r  r  r  r  r  rB  r<   r  r  r  r  r  r3  r  r   r   r  s                            r2   r   GptOssConfig.__init__  s   4 )O*%6"%6"':$%6""0':D$"*&9#':$(O%6" ,(O ,D%6D"(0(<H$BRBRVZVnVnBnDM*D'\abfbxbx\y$\yWX4Q!+<+<'BRR\y$  "$"2"23"&D+B((<D%(<D%&DN   ,4;L;L1L151B1B61J!!+."4("&D+B((<D%(<D%&DNG $7+$s   2$Fr  r  rf  s   @r2   r  r    sn   	 
)]_,=>')9:_<MN%&(9:
 *3)2)2)2(7$,#.1?6D.<3A
  &(%($#%)')'("% ( %$'+$*"&'-D_bpuv'* !*-!&/H	"H	  #H	 	H	
 H	  #H	 H	 "%H	 "%H	  H	 H	 H	  %H	   !H	$  %%H	( #()H	 H	r4   r  c                  ~    SS K n U R                  R                  R                  R                     [        [        R                  " U R                  R                  R                  R                  5      5      n[        [        R                  " [        5      5      nUR                  SS5      nX2:X  aF  [        R                  " S5        [        U R                  R                  R                  S[        5        g g ! [
         a  n[        SU5      s S nA$ S nAff = f! [
         a  n[        SU5      s S nA$ S nAff = f)Nr   z1transformers.models.gpt_oss.configuration_gpt_ossr  r  zIUnsloth: Updating GPT OSS Config to fix missing `max_position_embeddings`)1transformers.models.gpt_oss.configuration_gpt_ossr  r  configuration_gpt_ossr  r,  r   r   inspect	getsourcer  replacer   infor   )r   r;  current_class	new_classs       r2   patch_gpt_oss_configr  4  s    	WD''==JJ	W"7#4#4\5H5H5P5P5f5f5s5s#tuMw001ABCI!))*<nMI)gh|22::PPR`bno *  	WRTUVV	W  	WRTUVV	Ws;   .C9 CD 9
DDDD
D<&D71D<7D<z>transformers.models.gpt_oss.configuration_gpt_oss.GptOssConfigrZ   )rs  TNNz:You are ChatGPT, a large language model trained by OpenAI.)[typingr   r   r   r   r   r   r	   r
   r  r&   torch.nnr   torch.nn.functionalr   r  r  commonr   r   r   r   r   importlib.metadatar   importlib_versionrc  r   transformers_versionhas_static_cacher   r   r   r   r   r   r   r   hf_utilsr   r   r   r   r3   rJ   r<  r  r7  r>  rh  r  rw  rx  memorymem_get_infodevice_memoryuse_combo_kernelsre  $no_combo_fused_torch_compile_optionsr  r  r  r  r  r
  rf  openai_harmonyrg  rh  ri  rj  rk  rl  rm  rn  ro  rp  HARMONY_GPT_OSSr  r   transformers.configuration_utilsr  r  r   transformers.modeling_rope_utilsr  r  r  r  r,  r;  rN   r4   r2   <module>r     se  " J I I 	       < 0@A '7=+AA 	 	 	 )JJ%%  40
7 1
7 40" 1"* alD     'EXBII EXL -ryy -"  &%II$$11!4R8MJJ%%2215b9M*4/4T9R?ET 7 1%$  (A 1$( $ $D<WXO YO8 40) 1)  $D<`a b* )		 ) Rf    1 2S6h    . /yzt	    * +	   %%8%H%HIH
   !QV3n bUFFE+ H^+ ^@c' cJW  	12		FFEv  UPRSTTUs<   *I> <J J 2J >JJJ J+	J&&J+