
from .llama import *
import os
from ._utils import __version__
from unsloth_zoo.utils import Version
from unsloth_zoo.hf_utils import dtype_from_config, _get_dtype
from .utils.packing import get_packed_info_from_kwargs
from .utils.attention_dispatch import (
    AttentionConfig,
    AttentionContext,
    run_attention,
    select_attention_backend,
    SDPA,
)
from .llama import (
    LlamaRotaryEmbedding,
    LlamaLinearScalingRotaryEmbedding,
    _LlamaModel_fast_forward_inference,
)

try:
    from transformers.models.falcon_h1.modeling_falcon_h1 import (
        FalconH1Attention,
        FalconH1DecoderLayer,
        FalconH1Model,
        FalconH1ForCausalLM,
        FalconHybridMambaAttentionDynamicCache,
    )
except:
    from packaging.version import Version
    transformers_version = Version(transformers_version)
    if not transformers_version >= Version("4.53.0"):
        raise ImportError(
            f"Unsloth: Your transformers version of {transformers_version} does not support FalconH1.\n"
            f"The minimum required version is 4.53.0.\n"
            f'Try `pip install --upgrade "transformers>=4.53.0"`\n'
            f"to obtain the latest transformers build, then restart this session."
        )
pass

try:
    from transformers.modeling_attn_mask_utils import (
        _prepare_4d_causal_attention_mask_for_sdpa,
    )
    from transformers.utils import is_torchdynamo_compiling
except ImportError:
    raise ImportError(
        "Unsloth: Could not import FalconH1Attention from "
        "transformers.models.falcon_h1.modeling_falcon_h1."
    )
pass
                    [        U S5      (       a  U ?U ?U ?U ?U ?U ?U ?UR                  5       u  pnU R                  R                  nU R                  nU R                  R                  nU R                  nUU-  U:X  d   eU R                  X5      u  nnnUR                  XUU5      nUR                  XUU5      nUR                  XUU5      R!                  SS5      n[#        XR$                  5      nUU R                  R&                  -  nUR!                  SS5      nUR!                  SS5      nUR(                  S   nUb  UUS   R(                  S   -  nU	(       a  UU	S   R(                  S   ::  a  U	u  nnOEU R*                  nUR-                  UUS9  UR/                  UUR$                  R0                  5      u  nnUb  UOUR3                  S5      n[5        UUUUU5      u  nnUb2  [6        R9                  US   U/SS9n[6        R9                  US   U/SS9nU(       a  UU4OS nS	nUS L =(       a    US L=(       a    US L =(       a    US	:H  nUb  [:        O
[=        U5      n[?        UUUS
UU4S.SS S
S.Uc  0 OSU0S9n[A        UUUUUURB                  UUUS9	n [E        UU UUUS9n!U!RG                  XUU-  5      n"U RI                  U U"5      n"S n#U"U#U4$ )Npaged_attentionr   r	   r   )seq_lenr   )dim)r)   T)causalwindow_sizeg        )	dropout_psoftmax_scaler*   	attn_mask)backend
n_kv_headsn_groupsflash_dense_kwargsflash_varlen_kwargssdpa_kwargs)	bszq_len
kv_seq_lenn_headshead_dimrequires_gradseq_infor   r   )configcontextQKV)%hasattrpaged_attention_Kpaged_attention_Vr%   temp_QAtemp_KVRH_Q	attentionsizer<   num_attention_headsnum_key_value_groupsnum_key_value_headsr9   	apply_qkvview	transposer
   devicekey_multipliershape
rotary_embextend_rope_embedding
get_cachedindexgetfast_rope_embeddingtorchcatr   r   r   r   r:   r   reshapeapply_o)$selfr   r   r   r   r   r   r    r!   r"   argskwargsr5   r6   _r8   r1   r0   r9   r>   r?   r@   r;   r7   cossinrR   rope_position_idswindow
use_varlenr/   attention_configr=   Aattn_outputattn_weightss$                                       R/home/james-whalen/.local/lib/python3.13/site-packages/unsloth/models/falcon_h1.pyFalconH1Attention_fast_forwardrj   L   sL    t&''"" LLIN!&&(MCkk--G((H00J}}H G+++nnT1GAq!	s7H-A	s:x0A	s:x0::1a@A*63G3GHH 	
DKK&&&A	AqA	AqAJ!nQ'--b11
 z-@-C-I-I!-LL&S__
((j(A((QXX^^DS %0fjj6P  q!S#/@ADAq!II~a(!,AI6II~a(!,AI6(aVdN F$ 	D 	d"	 h	  *0H0T  '&
3

 !

 +2bn8U %33'!
G 	/7qVWXA))C((:;K,,t[1KLn44    c           
      z   UnUR                  5       u  pxn	Uu  pUR                  nU R                  R                  nU R                  nU R                  R
                  nU R                  nU R                  R                  nUU-  nU
R                  S   nUS-   nUR                  nU(       Ga  [        R                  [        U-   S-   SXU4UUS9U l        U R                  SS2S4   U l        U R                  SS2S4   U l        U
R!                  SSSS5      U R                  SU& UR!                  SSSS5      U R                  SU& [        R                  SUSU4UUS9U l        [        R                  SUSUU-  4UUS9U l        [        R                  X}SU4UUS9U l        UU:w  a  [        R                  SUU4UUS9U l        O U R"                  S   SS2SS2SU24   U l        [        R                  X}S[        U-   4UUS9U l        S[-        U R                  5      -  U l        US-  U l        OUU R                  R                  S   :  a  U R                  R3                  U R                  R                  S   [        -   SUUU45        U R                  SS2S4   U l        U R                  SS2S4   U l        U R*                  R3                  X}SU R*                  R                  S	   [        -   45        [5        U R6                  X`R"                  S   S
9n[5        U R8                  X`R$                  S   S
9nUU R                  R:                  -  n[5        U R<                  X`R$                  S   S
9nUR?                  USUU5      nUR?                  USUU5      nUR?                  USUU5      RA                  SS5      nURA                  SS5      nURA                  SS5      nU RB                  RE                  UUS-   5        U RB                  RG                  UUR                  RH                  5      u  nnUU   RK                  S5      nUU   RK                  S5      nU R0                  nU R&                  nUSS2SS2SS2US24   USS2SS2SS2SU24'   USS2SS2SS2SU24   USS2SS2SS2US24'   USS2SS2SS2SU24   RM                  5         UU-  nURO                  UU5        USS2SU2SS2SS24   nUSS2SS2SS2US24   USS2SS2SS2SU24'   USS2SS2SS2SU24   USS2SS2SS2US24'   USS2SS2SS2SU24   RM                  5         UU-  nURO                  UU5        UR!                  SSSS5      U R                  U'   UR!                  SSSS5      U R                  U'   U R                  SU R!                  SSSS5      nU R                  SU R!                  SSSS5      n[Q        U R                  SS5      nUb.  UU:  a(  SU-
  nUSS2SS2US2SS24   n USS2SS2US2SS24   n!OUUn!n U R                  u    nn"nUS:X  d  [R        (       dt  US:w  an  U SS2SS2SSS2SS24   RU                  XUU"U5      n U!SS2SS2SSS2SS24   RU                  XUU"U5      n!U RW                  X}U"U5      n U!RW                  X}U"U5      n!US:X  ak  UU R.                  -  n[Y        UU RA                  SS5      U R*                  SS2SS2SS2SU"24   S
9n#[[        U#S	[        R\                  S9U#SS& [Y        U#U!US
9n#O'[R        (       a  [_        UU U!USSS9n#O[_        UU U!USS9n#U#RA                  SS5      n#U#RW                  USU5      n#[5        U R`                  U#U R(                  S
9n#U#UU44$ )a  
        https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L406
        Fast inference using KV cache.
        QK^T can be computed in 4 chunks

        [Q, q] @ [K, k].T where q, k are the new tokens.
        [QK^T, Qk^T]
        [qK^T, qk^T]

        Since the attention mask wipes Qk^T, we just get
        [QK^T,    0]
        [qK^T, qk^T]

        Since softmax is row-wise, we get
        softmax([QK^T,    0])
        softmax([qK^T, qk^T])

        We then multiply by   [V]
                              [v]
        softmax([QK^T,    0]) [softmax(QK^T)V] *
        softmax([qK^T, qk^T]) [softmax([qK^T, qk^T]) @ [V, v]]

        But notice * [softmax(QK^T)V] is just the last attention.
        We just need to compute the last final row.

        This means we can pass in a row of Q, but we need to
        remember K and V, which are called the KV cache.
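
        A concrete one-step illustration (numbers are illustrative, not from
        the code): with cached keys/values (K, V) and a new token's query q,
        the new row only needs s = q @ [K, k].T and softmax(s) @ [V; v].
        No earlier row is ever recomputed.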
    """
    Xn = hidden_states
    bsz, _, hd = hidden_states.size()
    K1, V1 = past_key_value
    dtype = Xn.dtype

    n_heads     = self.config.num_attention_heads
    n_groups    = self.num_key_value_groups
    n_kv_heads  = self.config.num_key_value_heads
    head_dim    = self.head_dim
    hidden_size = self.config.hidden_size
    attention_size = n_heads*head_dim
    seq_len = K1.shape[-2]
    kv_seq_len = seq_len + 1
    device = hidden_states.device

    # Prefill phase: allocate the paged KV cache and temporary buffers once
    if do_prefill:
        self.paged_attention = torch.empty(
            (KV_CACHE_INCREMENT + seq_len + 1, 2, bsz, n_kv_heads, head_dim),
            dtype = dtype, device = device,
        )
        self.paged_attention_K = self.paged_attention[:, 0]
        self.paged_attention_V = self.paged_attention[:, 1]
        self.paged_attention_K[:seq_len] = K1.permute(2, 0, 1, 3)
        self.paged_attention_V[:seq_len] = V1.permute(2, 0, 1, 3)
        self.temp_QA = torch.empty((2, bsz, 1, attention_size),      dtype = dtype, device = device)
        self.temp_KV = torch.empty((2, bsz, 1, n_kv_heads*head_dim), dtype = dtype, device = device)
        self.RH_Q    = torch.empty((bsz, n_heads, 1, head_dim),      dtype = dtype, device = device)
        # Some models have attention_size != hidden_size, so keep a separate output buffer
        if attention_size != hidden_size:
            self.temp_O = torch.empty((1, bsz, hidden_size), dtype = dtype, device = device)
        else:
            self.temp_O = self.temp_QA[1][:, :, :hidden_size]
        pass
        self.attention = torch.empty((bsz, n_heads, 1, KV_CACHE_INCREMENT + seq_len), dtype = dtype, device = device)
        self.scalar = 1.0 / math_sqrt(self.head_dim)
        self.half_head_dim = head_dim // 2
    elif kv_seq_len >= self.paged_attention.shape[0]:
        # Grow the KV cache in fixed increments
        self.paged_attention.resize_((self.paged_attention.shape[0] + KV_CACHE_INCREMENT, 2, bsz, n_kv_heads, head_dim))
        self.paged_attention_K = self.paged_attention[:, 0]
        self.paged_attention_V = self.paged_attention[:, 1]
        self.attention.resize_((bsz, n_heads, 1, self.attention.shape[-1] + KV_CACHE_INCREMENT))
    pass

    Qn = fast_linear_forward(self.q_proj, Xn, out = self.temp_QA[0])
    Kn = fast_linear_forward(self.k_proj, Xn, out = self.temp_KV[0])
    Kn *= self.config.key_multiplier
    Vn = fast_linear_forward(self.v_proj, Xn, out = self.temp_KV[1])
    Qn = Qn.view(bsz, 1, n_heads,    head_dim).transpose(1, 2)
    Kn = Kn.view(bsz, 1, n_kv_heads, head_dim).transpose(1, 2)
    Vn = Vn.view(bsz, 1, n_kv_heads, head_dim).transpose(1, 2)

    # Update rotary embeddings for the new position
    self.rotary_emb.extend_rope_embedding(Vn, seq_len + 1)
    cos, sin = self.rotary_emb.get_cached(kv_seq_len, device.index)
    cos = cos[position_ids].unsqueeze(1)
    sin = sin[position_ids].unsqueeze(1)
    h = self.half_head_dim

    # Rotate half for RoPE, in place
    RH_Q = self.RH_Q
    RH_Q[:, :, :, :h] = Qn[:, :, :, h:]
    RH_Q[:, :, :, h:] = Qn[:, :, :, :h]
    RH_Q[:, :, :, :h].neg_()
    Qn *= cos
    Qn.addcmul_(RH_Q, sin)

    RH_K = RH_Q[:, :n_kv_heads, :, :]  # Reuse the Q buffer for K
    RH_K[:, :, :, :h] = Kn[:, :, :, h:]
    RH_K[:, :, :, h:] = Kn[:, :, :, :h]
    RH_K[:, :, :, :h].neg_()
    Kn *= cos
    Kn.addcmul_(RH_K, sin)

    # Append the new K, V to the cache
    self.paged_attention_K[seq_len] = Kn.permute(2, 0, 1, 3)
    self.paged_attention_V[seq_len] = Vn.permute(2, 0, 1, 3)
    Kn = self.paged_attention_K[:kv_seq_len].permute(1, 2, 0, 3)
    Vn = self.paged_attention_V[:kv_seq_len].permute(1, 2, 0, 3)

    # Handle sliding windows
    sliding_window = getattr(self.config, "sliding_window", None)
    if sliding_window is not None and kv_seq_len > sliding_window:
        slicing_tokens = 1 - sliding_window
        Knn = Kn[:, :, slicing_tokens:, :]
        Vnn = Vn[:, :, slicing_tokens:, :]
    else:
        Knn, Vnn = Kn, Vn
    pass

    # Grouped query attention
    _, _, cached_len, _ = Knn.shape
    if bsz == 1 or not SDPA_HAS_GQA and n_groups != 1:
        Knn = Knn[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, cached_len, head_dim)
        Vnn = Vnn[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, cached_len, head_dim)
        Knn = Knn.reshape(bsz, n_heads, cached_len, head_dim)
        Vnn = Vnn.reshape(bsz, n_heads, cached_len, head_dim)
    pass

    # Attention
    if bsz == 1:
        Qn *= self.scalar  # (Q * scalar) @ K stops overflows better than (Q @ K) * scalar
        A = torch_matmul(Qn, Knn.transpose(2, 3), out = self.attention[:, :, :, :cached_len])
        # if attention_mask is not None: A += attention_mask # Must add attention_mask for batched
        A[:] = torch_nn_functional_softmax(A, dim = -1, dtype = torch.float32)  # .to(A.dtype)
        A = torch_matmul(A, Vnn, out = Qn)
    else:
        if SDPA_HAS_GQA:
            A = scaled_dot_product_attention(Qn, Knn, Vnn, attn_mask = attention_mask, is_causal = False, enable_gqa = n_groups != 1)
        else:
            A = scaled_dot_product_attention(Qn, Knn, Vnn, attn_mask = attention_mask, is_causal = False)
    pass
    A = A.transpose(1, 2)
    A = A.reshape(bsz, 1, attention_size)
    A = fast_linear_forward(self.o_proj, A, out = self.temp_O)
    return A, (Kn, Vn)
pass
AqA			#q.)ADKK$++>Ar2h;rk   mamba_attention_maskcache_positionc                    U	(       a  [        U S5      (       a  Un[        U R                  U5      nU R                  " SUUUUUUU	U
US.	UD6u  nnnXR                  -  nU R                  UUUUS9nUU R                  -  nUU-   nX-  nUn[        U R                  U5      n[        U R                  U5      nX-  nOUn[        U R                  U5      nU R                  UUUUS9nUU R                  -  nU R                  " SUUUUUUU	U
US.	UD6u  nnnXR                  -  nUU-   nX-   nUn[        U R                  U5      nU R                  U5      nX-   nU4nU(       a  UU4-  nU	(       a  UU4-  nU$ )a  
    Args:
        hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
        attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
            `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under
            returned tensors for more detail.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
            (see `past_key_values`).
        past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
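
    Note: FalconH1 decoder blocks are hybrid. The attention branch and the
    Mamba (SSM) branch read the same normed input, are scaled by
    `attn_out_multiplier` and `ssm_out_multiplier` respectively, and are
    summed before the residual add -- they run in parallel, not sequentially.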
    """
    if use_cache and hasattr(self, "_flag_for_generation"):
        # Fast inference path
        residual = hidden_states
        hidden_states = fast_rms_layernorm_inference(self.input_layernorm, hidden_states)
        attention_hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states       = hidden_states,
            causal_mask         = causal_mask,
            attention_mask      = attention_mask,
            position_ids        = position_ids,
            past_key_value      = past_key_value,
            output_attentions   = output_attentions,
            use_cache           = use_cache,
            cache_position      = cache_position,
            position_embeddings = position_embeddings,
            **kwargs,
        )
        attention_hidden_states = attention_hidden_states * self.attn_out_multiplier

        mamba_hidden_states = self.mamba(
            hidden_states  = hidden_states,
            cache_params   = past_key_value,
            cache_position = cache_position,
            attention_mask = mamba_attention_mask,
        )
        mamba_hidden_states = mamba_hidden_states * self.ssm_out_multiplier

        hidden_states = mamba_hidden_states + attention_hidden_states
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = fast_rms_layernorm_inference(self.pre_ff_layernorm, hidden_states)
        hidden_states = fast_swiglu_inference(self.feed_forward, hidden_states)
        hidden_states = residual + hidden_states
    else:
        residual = hidden_states
        hidden_states = fast_rms_layernorm(self.input_layernorm, hidden_states)

        mamba_hidden_states = self.mamba(
            hidden_states  = hidden_states,
            cache_params   = past_key_value,
            cache_position = cache_position,
            attention_mask = mamba_attention_mask,
        )
        mamba_hidden_states = mamba_hidden_states * self.ssm_out_multiplier

        attention_hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states       = hidden_states,
            causal_mask         = causal_mask,
            attention_mask      = attention_mask,
            position_ids        = position_ids,
            past_key_value      = past_key_value,
            output_attentions   = output_attentions,
            use_cache           = use_cache,
            cache_position      = cache_position,
            position_embeddings = position_embeddings,
            **kwargs,
        )
        attention_hidden_states = attention_hidden_states * self.attn_out_multiplier

        hidden_states = mamba_hidden_states + attention_hidden_states
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = fast_rms_layernorm(self.pre_ff_layernorm, hidden_states)
        hidden_states = self.feed_forward(hidden_states)
        hidden_states = residual + hidden_states
    pass

    outputs = (hidden_states,)
    if output_attentions: outputs += (self_attn_weights,)
    if use_cache: outputs += (present_key_value,)
    return outputs
pass


def _FalconH1_fast_forward_inference(
    attention_fast_forward_inference = FalconH1Attention_fast_forward_inference,
    mlp_fast_forward_inference       = fast_swiglu_inference,
):
    def FalconH1Model_fast_forward_inference_custom(
        self,
        input_ids,
        past_key_values = None,
        position_ids    = None,
        attention_mask  = None,
    ):
        input_ids = input_ids[:, :self.max_seq_length]
        bsz, q_len = input_ids.shape
        hd       = self.config.hidden_size
        mlp_size = self.config.intermediate_size
        mlp_multipliers = self.config.mlp_multipliers

        X = self.model.embed_tokens(input_ids)
        X = X * self.config.embedding_multiplier
        X = X.to(_get_dtype(dtype_from_config(self.config)))
        bsz, q_len, hd = X.shape
        assert q_len == 1

        # Temporary buffers for layernorms, residuals and the MLP
        residual = torch.empty((bsz, q_len, hd), dtype = torch.float32, device = "cuda:0")
        _XX = torch.empty((2, bsz, q_len, hd), dtype = torch.float32, device = "cuda:0")
        XX, XX2 = _XX[0], _XX[1]
        variance = torch.empty((bsz, q_len, 1), dtype = torch.float32, device = "cuda:0")
        temp_mlp = torch.empty((2, bsz, 1, mlp_size), dtype = X.dtype, device = "cuda:0")
        temp_gate, temp_up = temp_mlp[0], temp_mlp[1]

        seq_len = past_key_values[0][0].shape[-2]
        if bsz != 1:
            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
                attention_mask,
                (bsz, q_len),
                X,
                seq_len,
                sliding_window = getattr(self.config, "sliding_window", None),
            )
        else:
            attention_mask = None
        pass

        next_decoder_cache = []
        for idx, decoder_layer in enumerate(self.model.layers):
            residual.copy_(X)  # residual = X
            X = fast_rms_layernorm_inference(
                decoder_layer.input_layernorm, X,
                XX = XX, XX2 = XX2, variance = variance,
            )
            attention_hidden_states, present_key_value = attention_fast_forward_inference(
                decoder_layer.self_attn,
                hidden_states  = X,
                past_key_value = past_key_values[idx],
                position_ids   = position_ids,
                attention_mask = attention_mask,
                do_prefill     = not hasattr(decoder_layer.self_attn, "paged_attention"),
            )
            attention_hidden_states = attention_hidden_states * decoder_layer.attn_out_multiplier

            mamba_hidden_states = decoder_layer.mamba(
                hidden_states  = X,
                cache_params   = past_key_values,
                cache_position = None,
                attention_mask = None,
            )
            mamba_hidden_states = mamba_hidden_states * decoder_layer.ssm_out_multiplier

            X = mamba_hidden_states + attention_hidden_states
            X += residual

            residual.copy_(X)
            X = fast_rms_layernorm_inference(
                decoder_layer.pre_ff_layernorm, X,
                XX = XX, XX2 = XX2, variance = variance,
            )
            X = mlp_fast_forward_inference(
                decoder_layer.feed_forward, X,
                temp_gate       = temp_gate,
                temp_up         = temp_up,
                gate_multiplier = mlp_multipliers[0],
                down_multiplier = mlp_multipliers[1],
            )
            X += residual

            next_decoder_cache.append(present_key_value)
        pass
        X = fast_rms_layernorm_inference(
            self.model.final_layernorm, X,
            XX = XX, XX2 = XX2, variance = variance,
        )

        return BaseModelOutputWithPast(
            last_hidden_state = X,
            past_key_values   = next_decoder_cache,
            hidden_states     = [],
            attentions        = [],
        )
    pass
    return FalconH1Model_fast_forward_inference_custom
pass


def _fast_prepare_inputs_for_generation(
    self,
    input_ids,
    past_key_values = None,
    attention_mask  = None,
    inputs_embeds   = None,
    cache_position  = None,
    position_ids    = None,
    use_cache       = True,
    **kwargs,
):
    empty_past_kv = past_key_values is None

    # If we have cache, slice `input_ids` through `cache_position`
    # to keep only the unprocessed tokens
    if not empty_past_kv:
        if inputs_embeds is not None and not is_torchdynamo_compiling() or cache_position[-1] >= input_ids.shape[1]:
            input_ids = input_ids[:, -cache_position.shape[0]:]
        elif input_ids.shape[1] != cache_position.shape[0]:
            input_ids = input_ids[:, cache_position]
    pass

    if attention_mask is not None and position_ids is None:
        # Create position_ids on the fly for batch generation
        position_ids = attention_mask.long().cumsum(-1) - 1
        position_ids.masked_fill_(attention_mask == 0, 1)
        if not empty_past_kv:
            position_ids = position_ids[:, -input_ids.shape[1]:]
    pass

    # If `inputs_embeds` are passed, only use them in the first generation step
    if inputs_embeds is not None and empty_past_kv:
        model_inputs = {"inputs_embeds": inputs_embeds}
    else:
        model_inputs = {"input_ids": input_ids.contiguous()}

    model_inputs.update({
        "position_ids":    position_ids,
        "cache_position":  cache_position,
        "past_key_values": past_key_values,
        "use_cache":       use_cache,
        "attention_mask":  attention_mask,
        "logits_to_keep":  self.config.num_logits_to_keep,
    })
    return model_inputs
pass


def fix_prepare_inputs_for_generation(module):
    # Fix prepare_inputs_for_generation
    if hasattr(module, "prepare_inputs_for_generation"):
        module.prepare_inputs_for_generation = _fast_prepare_inputs_for_generation
    pass
pass


class FastFalconH1Model(FastLlamaModel):

    @staticmethod
    def pre_patch():
        init_name, function = patch_linear_scaling(
            model_name         = "falcon_h1",
            rope_module        = LlamaRotaryEmbedding,
            scaled_rope_module = LlamaLinearScalingRotaryEmbedding,
            attention_module   = FalconH1Attention,
        )
        if init_name is not None:
            exec(function, globals())
            FalconH1Attention.__init__ = eval(init_name)
        pass
        FalconH1Attention   .forward = FalconH1Attention_fast_forward
        FalconH1DecoderLayer.forward = FalconH1DecoderLayer_fast_forward
        FalconH1Model       .forward = LlamaModel_fast_forward
        FalconH1ForCausalLM .forward = CausalLM_fast_forward(_FalconH1_fast_forward_inference())
        PeftModelForCausalLM.forward = PeftModel_fast_forward
        fix_prepare_inputs_for_generation(FalconH1ForCausalLM)

        # Inference can be CUDAGraphed, but we must retain the rotary embeddings
        import transformers.models.falcon_h1.modeling_falcon_h1
        transformers.models.falcon_h1.modeling_falcon_h1.FalconH1RotaryEmbedding = LlamaRotaryEmbedding
        return
    pass

    @staticmethod
    def from_pretrained(
        model_name        = "Qwen/FalconH1-7B",
        max_seq_length    = 2048,
        dtype             = None,
        load_in_4bit      = True,
        token             = None,
        device_map        = "sequential",
        rope_scaling      = None,
        fix_tokenizer     = True,
        model_patcher     = None,
        tokenizer_name    = None,
        trust_remote_code = False,
        **kwargs,
    ):
        return FastLlamaModel.from_pretrained(
            model_name        = model_name,
            max_seq_length    = max_seq_length,
            dtype             = dtype,
            load_in_4bit      = load_in_4bit,
            token             = token,
            device_map        = device_map,
            rope_scaling      = rope_scaling,
            fix_tokenizer     = fix_tokenizer,
            model_patcher     = FastFalconH1Model,
            tokenizer_name    = tokenizer_name,
            trust_remote_code = trust_remote_code,
            **kwargs,
        )
    pass
pass