
    oiE                     p   S SK 7  SSKrS SKJr  SSKJr  SSKJr  SSKJ	r	J
r
  SS	KJrJrJrJr  S S
K JrJr  SSKJrJrJrJr   SSKJrJr  SSKJrJr          S%S\R6                  S\\   S\\R6                     S\\R<                     S\\\R6                        S\ S\ S\\R<                     S\\\R6                  \R6                  4      S\\R6                  \\R6                     \\\R6                        4   4S jjr!             S&S\R<                  S\\   S\\R6                     S\\R<                     S\\"\RF                        S\\RF                     S\\R<                     S\\    S\\    S\\    S\\    S\\$   S \\$   S\%\\&4   4S! jjr'S" r( " S# S$\)5      r*g!   \r\r GNh= f)'   )*    N)__version__)
_get_dtype)dtype_from_config   )get_packed_info_from_kwargsmask_packed_sequence_boundaries)AttentionConfigAttentionContextrun_attentionselect_attention_backend)LlamaRotaryEmbedding!LlamaLinearScalingRotaryEmbedding)MistralAttentionMistralDecoderLayerMistralModelMistralForCausalLM)MistralSdpaAttentionMistralFlashAttention2)Versionr   hidden_statescausal_maskattention_maskposition_idspast_key_valueoutput_attentions	use_cachepadding_maskposition_embeddingsreturnc
                 ~   [        U S5      (       a  U ?U ?U ?U ?U ?U ?U ?UR                  5       u  pnU R                  R                  nU R                  nU R                  R                  nU R                  nUU-  U:X  d   eU R                  X5      u  nnnUR                  XUU5      R!                  SS5      nUR                  XUU5      R!                  SS5      nUR                  XUU5      R!                  SS5      n[#        UUR$                  5      nUR&                  S   nUb  UUS   R&                  S   -  nU R(                  R+                  UUS9  U R(                  R-                  UUR$                  R.                  5      u  nnUb  UOUR1                  S5      n[3        UUUUU5      u  nnUb2  [4        R7                  US   U/SS9n[4        R7                  US   U/SS9nU(       a  UU4OS n[9        U R                  S	S 5      nUb  US
:X  a  UOUnUU::  a  SOUU4nUS L=(       a    US L =(       a    US:H  n[;        U5      n[=        UUUSUS.SS[9        U SS 5      S.S9n [?        UUUUUUR@                  UUUS9	n![C        U U!UUUS9n"U"RE                  XUU-  5      n#U RG                  U U#5      n#S n$U#U$U4$ )Npaged_attentionr   r   r   )seq_lenr   )dimsliding_windownull)r)   T)causalwindow_size        softmax_scale)	dropout_pr*   r-   )backend
n_kv_headsn_groupsflash_dense_kwargsflash_varlen_kwargs)	bszq_len
kv_seq_lenn_headshead_dimrequires_gradseq_infor   r   )configcontextQKV)$hasattrpaged_attention_Kpaged_attention_Vr#   temp_QAtemp_KVRH_Q	attentionsizer;   num_attention_headsnum_key_value_groupsnum_key_value_headsr8   	apply_qkvview	transposer	   deviceshape
rotary_embextend_rope_embedding
get_cachedindexgetfast_rope_embeddingtorchcatgetattrr   r   r   r9   r   reshapeapply_o)%selfr   r   r   r   r   r   r   r   r    argskwargsr4   r5   _r7   r1   r0   r8   r=   r>   r?   r:   r6   cossinrope_position_idssw_cfgswr+   
use_varlenr/   attention_configr<   Aattn_outputattn_weightss%                                        P/home/james-whalen/.local/lib/python3.13/site-packages/unsloth/models/mistral.pyMistralAttention_fast_forwardrj   5   s    t&''"" LLIN!&&(MCkk--G((H00J}}H G+++nnT1GAq!	s7H-771=A	s:x0::1a@A	s:x0::1a@A*6188<HJ!nQ'--b11
 	OO))!z)B))*ahhnnEHC %0fjj6P  q!S#/@ADAq!II~a(!,AI6II~a(!,AI6(aVdN T[["2D9F&F*:B)R/(r2hK 	S4!7SK8<S  'z2G&(,[I$T?DA

 %33'!
G 	/7qVWXA))C((:;K,,t[1KLn44    	input_idspast_key_valuesinputs_embedslabelsoutput_hidden_statesreturn_dictnum_logits_to_keeplogits_to_keepc                    UGcC  UGc?  UR                   u  nn[        U R                  SS 5      n[        (       a  Ub  US:X  d  US::  a   [        R
                  R                  5       nGOUU::  a   [        R
                  R                  5       nGO[        R
                  R                  R                  U/U-  5      R                  US9nGO}Ub  US:X  d  US::  d  UU::  aC  [        R                  [        R                  UU4[        R                  * UR                  S9SS9nO[        R                  UUR                  S9R!                  SS5      n[        R                  UUR                  S9R!                  SS5      nUU:*  nUU-
  U:  n[        R#                  UU-  S	[        R                  * 5      nUc   US S S S 2S S 24   R%                  USUU5      nOEUR'                  5       S
:X  a!  US S 2S S S S 24   nUR%                  USUU5      nUUS S S S 2S S 24   -   nUR)                  [+        [-        U R                  5      5      S9nU	b  U	OU R                  R.                  n	U
b  U
OU R                  R0                  n
Ub  UOU R                  R2                  nUS L U R4                  l        Ub  [9        U UUUUS9nOU R4                  " S UUUUUUUU	U
US.
UD6nUS   nUR                   u  nnnU R:                  R<                  nUR                  nUR)                  U5      nUb  UR)                  U5      n[>        R@                  RC                  SS5      S:X  aK  [E        X5      nUS:w  a  US S 2U* S 2S S 24   n[G        S UURH                  URJ                  URL                  S9$ US:X  ae  US:X  a_  [        RO                  UURQ                  5       R)                  URR                  5      5      nURU                  S5      RU                  S5      nGOQUS:w  a9  U R;                  US S 2U* S 2S S 24   R)                  URR                  5      5      nGO[>        R@                  RC                  SS5      S:H  nUU-  S::  a	  U(       d  SnU(       d  Ub  URC                  SS 5      nUc  URC                  SS 5      n[        U R                  SS5      n [W        S UUS US U[        U SS 5      S SU S9n!U(       d  W4USS  -   n"U!b  U!4U"-   $ U"$ [G        U![X        URH                  URJ                  URL                  S9n"U"$  U R;                  UR)                  URR                  5      5      nUR)                  [+        [-        U R                  5      5      5      nS n!Uby  Un#[        R[                  U5      n$USSS 24   U$SS S24'   SU$S'   []        U$URC                  S5      5        URC                  SS 5      nUc  URC                  SS 5      n[_        U#U$US9n!U(       d  U4USS  -   n"U!b  U!4U"-   $ U"$ [G        U!UURH                  URJ                  URL                  S9$ )!Nr'   r(   r   )r+   )rN   r   )diagonalr)   r,   r   )dtype)r   r   )
rl   r   r   r   rm   rn   r   r   rp   rq   UNSLOTH_RETURN_HIDDEN_STATES01)losslogitsrm   r   
attentionsUNSLOTH_RETURN_LOGITSi   Fnum_items_in_batchn_itemsfinal_logit_softcappingaccelerator_scalerT)trainerr   lm_head_weightlm_head_biasro   maskr   scaling	target_gbtorch_compilelogit_softcapping.i).r)   packed_seq_lengths)r{   ro   r    )0rO   rX   r;   HAS_XFORMERSxformers	attn_biasLowerTriangularMaskBlockDiagonalCausalMaskfrom_seqlensmake_local_attentionrV   triufullinfrN   arangerL   whereexpandr&   tor   r   r   rp   use_return_dictmodel_has_no_labels!LlamaModel_fast_forward_inferencelm_headweightosenvironrT   maxCausalLMOutputWithPastrm   r   r|   mvravelrv   	unsqueezeunsloth_fused_ce_lossEMPTY_LOGITS
empty_liker
   fast_cross_entropy_loss)%r[   rl   r   r   r   rm   rn   ro   r   r   rp   rq   rr   rs   r\   r]   r4   r5   r'   causal_mask_values	q_indices	k_indicescausal_bool_maskwindow_bool_maskoutputsr   hdr   lm_head_devicer{   RETURN_LOGITSr   r   rz   outputshift_logitsshift_labelss%                                        ri   MistralForCausalLM_fast_forwardr      s   $ 6__
U .>E< &!V+!Q&&00DDF.(&00DDF&00HHUUGcM&&^&D  &!V+!Q&N* &+ZZJJu~		zIDTDTJU  &0 &" "LL9I9ILJOOPRTUV	!LL9I9ILJOOPQSUV	#,	#9 $-	$9^#K %*[[$'77uyyj&"
 %!3D$14D!E!L!LE5" "%%'1,%3AtT14D%EN%3%:%:35%%PN!/2DT4QRTUEU2V!V+.."#4T[[#AB / N ( 	[[**   + 	[[--  #.DKK4O4O 
 !'$DJJ"3'+
 ** 
!%+'-)! 1#7%
 
 AJM"((NCll!!G^^N "$$^4M>* 
zz~~4c:cA !3D")!.@-@-A1*DEM%"%55#11 ++
 	
 axEQJ'=#6#6#8#;#;GMM#JK!!!$..q1	q	 !00114588G
 

'>DK;$}!M!3jj!5t<G **Y5 '5NPQ R ) -!(#!!$(<dC  $$5D  WQR[0+/+;w'GG+%")"9"9 ' 5 5$//F Mm..w}}=>YYz"3DKK"@ABFD ''/!'QRS#2#X $W'JJ+,	
 **148?jjD1G&!!
 WQR[(#'#3w??!!11--'' rk   c                 r    U R                  SS5      n U R                  SS5      n U R                  SS5      n U $ )NzL(self.head_dim * self.config.num_attention_heads) != self.config.hidden_sizeFalsezJself.head_dim = self.config.hidden_size // self.config.num_attention_headszself.head_dim = config.head_dimzUself.o_proj = nn.Linear(self.config.hidden_size, self.config.hidden_size, bias=False)zmself.o_proj = nn.Linear(self.config.num_attention_heads * self.head_dim, self.config.hidden_size, bias=False))replace)functions    ri   patch_mistral_nemo_attentionr     sO    VH T)H _wH Ork   c                   N    \ rS rSr\S 5       r\           SS j5       rSrg)FastMistralModeli  c                     [        S[        [        [        S9u  pUb6  U b3  [	        U5      n[        U[        5       5        [        U 5      [        l        [        [        l
        [        [        l
        [        [        l
        [        [        l
        [        [         l
        ["        [$        l
        [&        [(        l
        [+        [$        5        SS Kn[        UR.                  R0                  R2                  l        g )Nmistral)
model_namerope_modulescaled_rope_moduleattention_moduler   )patch_linear_scalingr   r   r   r   execglobalseval__init__rj   forwardr   r   LlamaDecoderLayer_fast_forwardr   LlamaModel_fast_forwardr   r   r   PeftModel_fast_forwardPeftModelForCausalLM!fix_prepare_inputs_for_generation,transformers.models.mistral.modeling_mistralmodelsr   modeling_mistralMistralRotaryEmbedding)	init_namer   transformerss      ri   	pre_patchFastMistralModel.pre_patch  s    2".!B/	
	 I$93H=H79%(,Y%#@ 'D$)F&&D#6%D"'=$)*<= 	< ! 	##44K 	rk   Nc                 N    [         R                  " SU UUUUUUU[        U	U
S.UD6$ )N)r   max_seq_lengthrv   load_in_4bittoken
device_maprope_scalingfix_tokenizermodel_patchertokenizer_nametrust_remote_coder   )FastLlamaModelfrom_pretrainedr   )r   r   rv   r   r   r   r   r   r   r   r   r]   s               ri   r    FastMistralModel.from_pretrained  sG     -- 
#+'#'),+ 1
 
 	
rk   r   )zunsloth/mistral-7b-bnb-4bitNNTN
sequentialNTNNF)__name__
__module____qualname____firstlineno__staticmethodr   r   __static_attributes__r   rk   ri   r   r     sJ       D 2!!
 
rk   r   )NNNNFFNN)NNNNNNNNNNNr   r   )+llamar   _utilsr   unsloth_zoo.utilsr   unsloth_zoo.hf_utilsr   utils.packingr	   r
   utils.attention_dispatchr   r   r   r   r   r   r   r   r   r   r   r   r   r   rV   TensorOptionalr   
LongTensorTupleboolrj   ListFloatTensorintUnionr   r   r   r   r   r   rk   ri   <module>r      s    	  ( 2  . 2 6:-1/348#/3GK]5<<]5 12]5 U\\*	]5
 5++,]5 U5<<01]5 ]5 ]5 5++,]5 "%ell(B"CD]5 5<<%,,/%:M1NNO]5D #'59-1/39=15)- $(,+/"&()$%ll 12l U\\*	l
 5++,l d5#4#456l E--.l U%%&l ~l  ~l #4.l $l !l SMl" 5(()#l` @
~ @
M.+-s   F, ,F5