
    cCi                    t   S r SSKrSSKJrJrJr  SSKrSSKJs  J	r
  SSKJr  SSKJr  SSKJr  SSKJr  SS	KJrJr  SS
KJrJr  SSKJrJr  SSKJr  SSKJr  SSKJrJ r J!r!J"r"  SSK#J$r$  SSK%J&r&J'r'  SSK(J)r)  \" 5       (       a  SSKJ*r*  \'" 5       (       a  SSK+J,r,J-r-  SSK.J/r/  OSu  r/r-r,\&" 5       (       a	  SSK0J1r1J2r2  OSu  r2r1\3" \/\-\1\2\,45      r4\"Rj                  " \65      r7   S@S\\Rp                  \9\Rp                     S4   S\\:   S\\Rp                     S\\Rp                  \:4   4S jjr; " S S\Rx                  5      r=S \Rp                  S!\:S\Rp                  4S" jr> " S# S$5      r? " S% S&\Rx                  5      r@ " S' S(\@5      rA " S) S*\@5      rB\@\A\BS+.rC " S, S-\Rx                  5      rD " S. S/\Rx                  5      rE " S0 S1\Rx                  5      rF " S2 S3\5      rG " S4 S5\5      rH\  " S6 S7\5      5       rI\G\HS8.rJ\  " S9 S:\I5      5       rK " S; S<\I\5      rL " S= S>\\I5      rM/ S?QrNg)AzPyTorch Jamba model.    N)AnyOptionalUnion)nn   )ACT2FN)GenerationMixin)AttentionMaskConverter)!flash_attn_supports_top_left_maskis_flash_attn_available) GenericForSequenceClassificationGradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)PreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg)is_causal_conv1d_availableis_mamba_ssm_available   )JambaConfig)_flash_attention_forward)mamba_inner_fnselective_scan_fn)selective_state_update)NNN)causal_conv1d_fncausal_conv1d_updateNNrouter_logitsnum_expertsattention_maskreturnc                    U b  [        U [        5      (       d  g[        U [        5      (       aC  U S   R                  n[        R                  " U  Vs/ s H  oUR                  U5      PM     snSS9n[        R                  R                  R                  WSS9n[        R                  " XrSS9u  p[        R                  R                  R                  X5      n
Uc:  [        R                  " U
R                  5       SS9n[        R                  " USS9nGO"UR                  u  pUR                  S   X-  -  nUSSS2SS2SS4   R                  XXU45      R                  SX!5      R                  W5      n[        R                   " U
R                  5       U-  SS9[        R                   " USS9-  nUSSS2SS2S4   R                  XXR                  S   45      R                  SUR                  S   5      R                  U5      n[        R                   " UU-  SS9[        R                   " USS9-  nUR                  R"                  b  UR                  R"                  OSnUR                  S   [%        U5      -  n[        R                   " USS2UUUR                  S   -   24   UR'                  S5      -  5      nUU-  $ s  snf )a|  
Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
experts is too unbalanced.

Args:
    router_logits:
        Logits from the `router`, should be a tuple of model.config.num_hidden_layers tensors of
        shape [batch_size X sequence_length, num_experts].
    num_experts:
        Number of experts
    top_k:
        The number of experts to route per-token, can be also interpreted as the `top-k` routing
        parameter.
    attention_mask (`torch.Tensor`, *optional*):
        The attention_mask used in forward function
        shape [batch_size X sequence_length] if not None.

Returns:
    The auxiliary loss.
Nr   dimr   )
isinstancetupledevicetorchcattor   
functionalsoftmaxtopkone_hotmeanfloatshapeexpandreshapesumindexint	unsqueeze)r#   r$   top_kr%   compute_devicelayer_routerconcatenated_router_logitsrouting_weights_selected_expertsexpert_masktokens_per_expertrouter_prob_per_expert
batch_sizesequence_lengthnum_hidden_layersexpert_attention_mask router_per_expert_attention_maskdevice_indexrankoverall_losss                        b/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/jamba/modeling_jamba.pyload_balancing_loss_funcrQ   F   s   : J}e$D$D-''&q)00%*YYANO__^,OUV&
" hh))112LRT1UO**_DA((%%--.>LK!JJ{'8'8':B "'O!C&4&:&:#
6<<Q?JD`a 4AtT12V&OKXYWR,R	 	 "IIk&7&7&9<Q&QWXY\a\e\e!q]
 
 4At+,V&OEZEZ[\E]^_WR..q12R	 	) "'?=]+]cd!ehmhqhq,!i
 "
 4C3I3I3O3O3[?))//abL  #c,&77D99!TD?+@+@+C$CCCDG]GgGghiGjjL +%%e Ps   K	c                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )JambaRMSNorm   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z+
JambaRMSNorm is equivalent to T5LayerNorm
N)super__init__r   	Parameterr.   onesweightvariance_epsilon)selfhidden_sizeeps	__class__s      rP   rW   JambaRMSNorm.__init__   s/     	ll5::k#:; #    c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )N   r*   T)keepdim)	dtyper0   r.   float32powr5   rsqrtr[   rZ   )r\   hidden_statesinput_dtypevariances       rP   forwardJambaRMSNorm.forward   sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::ra   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)r,   rZ   r7   r[   )r\   s    rP   
extra_reprJambaRMSNorm.extra_repr   s*    ))*+6$2G2G1HIIra   )r[   rZ   )gư>)	__name__
__module____qualname____firstlineno__rW   rl   ro   __static_attributes____classcell__r_   s   @rP   rS   rS      s    $;J Jra   rS   ri   n_repc                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r   N)r7   r8   r9   )ri   rx   batchnum_key_value_headsslenhead_dims         rP   	repeat_kvr~      s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTra   c                   
   \ rS rSrSrSr\R                  S4S jr SS\R                  S\R                  S\
S	\\\\4      S
\\R                  \R                  4   4
S jjrS\R"                  4S jrSS\\
   S
\
4S jjrSrg) HybridMambaAttentionDynamicCache   a|  
A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
(which has a constant shape regardless of seq_len).

This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
FNc                    X0l         UR                  U l        SU l        UR                  UR                  -  nUR
                  nUR                  n/ U l        / U l        / U l	        [        UR                  5       H  nU R                  U   S:X  aV  U =R                  [        R                  " X%XtUS9/-  sl        U =R                  [        R                  " X%XdUS9/-  sl        Ml  U =R                  [        R                  " / /U-  US9/-  sl        U =R                  [        R                  " / /U-  US9/-  sl        U R                  R                  U5        M     [        UR                  5       V	s/ s H  n	[        R                  " / /U-  US9PM     sn	U l        [        UR                  5       V	s/ s H  n	[        R                  " / /U-  US9PM     sn	U l        g s  sn	f s  sn	f )NFmambar-   re   r-   )re   layers_block_typehas_previous_statemamba_expandr]   mamba_d_statemamba_d_convconv_states
ssm_statestransformer_layersrangerJ   r.   zerostensorappend	key_cachevalue_cache)
r\   configrH   re   r-   intermediate_sizessm_state_sizeconv_kernel_sizeirC   s
             rP   rW   )HybridMambaAttentionDynamicCache.__init__   s   
!'!9!9"'"//&2D2DD--!.."$v//0A%%a(G3  KK
?Ofkl%   KK
~dij$    U\\2$2CF%S$TT ELL"
1B6$R#SS''..q1 1 SXX^XpXpRqrRqQ%,,tj'8HRqrTYZ`ZrZrTstTsqELL"
):6JTst sts   6#G$8#G)
key_statesvalue_states	layer_idxcache_kwargsr&   c                 |   U R                   U   R                  S   S:X  a  XR                   U'   X R                  U'   Ob[        R                  " U R                   U   U/SS9U R                   U'   [        R                  " U R                  U   U/SS9U R                  U'   U R                   U   U R                  U   4$ )Nr*   r   rc   r(   )r   r7   r   r.   r/   )r\   r   r   r   r   s        rP   update'HybridMambaAttentionDynamicCache.update   s     >>)$**2.!3(2NN9%*6Y'(-		4>>)3Lj2Y_`(aDNN9%*/))T5E5Ei5PR^4_ef*gDY'~~i($*:*:9*EEEra   beam_idxc                    [        [        U R                  5      5       GHT  nU R                  U   R                  nU R                  U   R	                  SUR                  U5      5      U R                  U'   U R                  U   R                  nU R                  U   R	                  SUR                  U5      5      U R                  U'   U R                  U   R                  nU R                  U   R	                  SUR                  U5      5      U R                  U'   U R                  U   R                  nU R                  U   R	                  SUR                  U5      5      U R                  U'   GMW     g)zDReorders the cache for beam search, given the selected beam indices.r   N)	r   lenr   r-   index_selectr0   r   r   r   )r\   r   r   r-   s       rP   reorder_cache.HybridMambaAttentionDynamicCache.reorder_cache   s=   s4>>23I^^I.55F(,y(A(N(NqRZR]R]^dRe(fDNN9%%%i077F*.*:*:9*E*R*RSTV^VaVabhVi*jDY'%%i077F*.*:*:9*E*R*RSTV^VaVabhVi*jDY'__Y/66F)-)C)P)PQRT\T_T_`fTg)hDOOI& 4ra   c                     XR                   ;  a  U R                   S   OUn[        U R                  5      U::  a  gU R                  U   R                  S   $ )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   )r   r   r   r7   )r\   r   s     rP   get_seq_length/HybridMambaAttentionDynamicCache.get_seq_length  sP     3<CZCZ2ZD++A.`i	t~~)+~~i(..r22ra   )r   re   r   r   r   r   r   r   N)r   )rq   rr   rs   rt   __doc__is_compileabler.   float16rW   Tensorr<   r   dictstrr   r,   r   
LongTensorr   r   ru    ra   rP   r   r      s     N16t u> 26FLLF llF 	F
 tCH~.F 
u||U\\)	*F"ie&6&6 i3 3c 3 3ra   r   c                   T  ^  \ rS rSrSrSS\S\\   4U 4S jjjr\	" SSSS	9      SS
\
R                  S\\
R                     S\\
R                     S\\   S\S\S\\
R                     S\\
R                  \\
R                     \\\
R                        4   4S jj5       rSrU =r$ )JambaAttentioni  z
Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
and "Generating Long Sequences with Sparse Transformers".
r   r   c                 "  > [         TU ]  5         Xl        X l        Uc-  [        R                  SU R                  R                   S35        UR                  U l        UR                  U l
        U R                  U R                  -  U l        UR                  U l        U R                  U R                  -  U l        SU l        UR                  U l        U R                  U R                  -  U R                  :w  a&  [!        SU R                   SU R                   S35      e["        R$                  " U R                  U R                  U R                  -  SS9U l        ["        R$                  " U R                  U R                  U R                  -  SS9U l        ["        R$                  " U R                  U R                  U R                  -  SS9U l        ["        R$                  " U R                  U R                  -  U R                  SS9U l        g )	NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.Tz?hidden_size must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).Fbias)rV   rW   r   r   loggerwarning_oncer_   rq   r]   num_attention_heads	num_headsr}   r{   num_key_value_groups	is_causalattention_dropout
ValueErrorr   Linearq_projk_projv_projo_proj)r\   r   r   r_   s      rP   rW   JambaAttention.__init__  s   " !8!8 9 :, , "--33((DNN:#)#=#= $(NNd6N6N$N!!'!9!9MMDNN*t/?/??QRVRbRbQc$T^^$4B8  ii 0 0$..4==2PW\]ii 0 0$2J2JT]]2Zafgii 0 0$2J2JT]]2Zafgii >@P@PW\]ra   past_key_valuepast_key_values4.58new_nameversionri   r%   position_idsoutput_attentions	use_cachecache_positionr&   c                    UR                  5       u  pn
U R                  U5      nU R                  U5      nU R                  U5      nUR	                  XU R
                  U R                  5      R                  SS5      nUR	                  XU R                  U R                  5      R                  SS5      nUR	                  XU R                  U R                  5      R                  SS5      nUb  UR                  XU R                  5      u  p[        XR                  5      n[        XR                  5      n[        R                  " XR                  SS5      5      [        R                   " U R                  5      -  nUb"  US S 2S S 2S S 2S UR"                  S   24   nX-   n[$        R&                  R)                  US[        R*                  S9R-                  UR.                  5      n[$        R&                  R1                  XR2                  U R4                  S9n[        R                  " X5      nUR                  5       XR
                  XR                  4:w  a5  [7        SXR
                  XR                  4 S	UR                  5        35      eUR                  SS5      R9                  5       nUR;                  XU R<                  5      nU R?                  U5      nU(       d  S nUX4$ )
Nr   rc   r   r   r*   r)   re   )ptrainingz `attn_output` should be of size z	, but is ) sizer   r   r   viewr   r}   	transposer{   r   r   r~   r   r.   matmulmathsqrtr7   r   r1   r2   rf   r0   re   dropoutr   r   r   
contiguousr9   r]   r   )r\   ri   r%   r   r   r   r   r   bszq_lenrC   query_statesr   r   attn_weightscausal_maskattn_outputs                    rP   rl   JambaAttention.forward5  s    &**,A{{=1[[/
{{=1#((T^^T]]S]]^_abc__S1I1I4==Yccdeghi
#((T5M5Mt}}]gghiklm&'6'='=jX\XfXf'g$J z+D+DE
 /H/HI||L2F2Fq!2LMPTPYPYZ^ZgZgPhh%(Aq2HJ4D4DR4H2H)HIK'5L }},,\r,WZZ[g[m[mn}},,\=S=S^b^k^k,lll<>#~~umm!LL2CP]P]3^2_ `$$&') 
 "++Aq1<<>!))#d6F6FGkk+. LL99ra   )r   r   r}   r]   r   r   r   r   r   r{   r   r   r   r   NNNFFN)rq   rr   rs   rt   r   r   r   r<   rW   r   r.   r   r   r   boolr,   rl   ru   rv   rw   s   @rP   r   r     s    
^{ ^x} ^ ^: %0A6R 2637FJ"'594:||4: !.4: u//0	4:
 ""BC4:  4: 4: !!1!124: 
u||Xell3XeELL>Q5RR	S4: S4:ra   r   c                      ^  \ rS rSrSrU 4S jr      SS\R                  S\\R                     S\\R                     S\\
   S\S	\S
\\R                     4S jjrSrU =r$ )JambaFlashAttention2in  a6  
Jamba flash attention module. This module inherits from `JambaAttention` as the weights of the module stays
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
c                 D   > [         TU ]  " U0 UD6  [        5       U l        g r   )rV   rW   r   _flash_attn_uses_top_left_mask)r\   argskwargsr_   s      rP   rW   JambaFlashAttention2.__init__u  s#    $)&)
 /P.Q+ra   ri   r%   r   r   r   r   r   c                 p   UR                  5       u  pnU R                  U5      nU R                  U5      nU R                  U5      nUR	                  XU R
                  U R                  5      nUR	                  XU R                  U R                  5      R                  SS5      nUR	                  XU R                  U R                  5      R                  SS5      nUb  UR                  XU R                  5      u  p[        XR                  5      n[        XR                  5      nU R                  (       d  SOU R                  nUR                  nUR                   R"                  S:w  a  UR                   R"                  OSnU[$        R&                  :X  a  [$        R(                  " 5       (       aA  [+        [$        S5      (       a  [$        R,                  " U5      O[$        R.                  " 5       nOR[+        U R0                  S5      (       a  U R0                  R2                  nO U R                  R4                  R                  n[6        R9                  SU S	35        UR;                  U5      nUR;                  U5      nUR;                  U5      nUR                  SS5      nUR                  SS5      n[=        UUUUU
U[?        U R0                  S
S 5      U R@                  U RB                  S9	nURE                  XU RF                  5      RI                  5       nU RK                  U5      nU(       d  S nUWU4$ )Nr   rc           mpscpuget_autocast_dtype_pre_quantization_dtypezThe input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in .sliding_window)r   r   r   use_top_left_mask)&r   r   r   r   r   r   r}   r{   r   r   r   r~   r   r   r   re   r-   typer.   rf   is_autocast_enabledhasattrr   get_autocast_gpu_dtyper   r   rZ   r   r   r0   r   getattrr   r   r9   r]   r   r   )r\   ri   r%   r   r   r   r   r   r   r   r   rC   r   r   r   dropout_raterj   device_typetarget_dtyper   r   s                        rP   rl   JambaFlashAttention2.forward}  s    &**,A{{=1[[/
{{=1
 $((T^^T]]S__S1I1I4==Yccdeghi
#((T5M5Mt}}]gghiklm&'6'='=jX\XfXf'g$J z+D+DE
 /H/HI"&--sT5K5K
 #((2>2E2E2J2Je2Sl))..Y^%--'((** u&:;; ,,[9557  &?@@#{{BB#{{1177 >$ (??<8L#|4J'??<8L  ))!Q/
#--a3. "4;;0@$Gnn"AA

 "))#d6F6FGRRTkk+. LL/99ra   )r   r   )rq   rr   rs   rt   r   rW   r.   r   r   r   r   r   rl   ru   rv   rw   s   @rP   r   r   n  s    R 2637FJ"'59R:||R: !.R: u//0	R:
 ""BCR:  R: R: !!1!12R: R:ra   r   c                   8  ^  \ rS rSrSr\" SSSS9      SS\R                  S\\R                     S	\\R                     S\\
   S
\S\S\\R                     S\\R                  \\R                     \\\R                        4   4U 4S jjj5       rSrU =r$ )JambaSdpaAttentioni  z
Jamba attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
`JambaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
SDPA API.
r   r   r   r   ri   r%   r   r   r   r   r&   c           	      (  > U(       a'  [         R                  S5        [        TU ]  UUUUUUS9$ UR	                  5       u  pn
U R                  U5      nU R                  U5      nU R                  U5      nUR                  XU R                  U R                  5      R                  SS5      nUR                  XU R                  U R                  5      R                  SS5      nUR                  XU R                  U R                  5      R                  SS5      nUb  UR                  XU R                  5      u  p[        XR                   5      n[        XR                   5      nUnUb  US S 2S S 2S S 2S UR"                  S   24   nUR$                  R&                  S:X  a3  Ub0  UR)                  5       nUR)                  5       nUR)                  5       nU R*                  =(       a    US L =(       a    U	S:  n[,        R.                  R0                  R3                  UUUUU R4                  (       a  U R6                  OSUS9nUR                  SS5      R)                  5       nUR                  XU R8                  5      nU R;                  U5      nUS U4$ )	Na  JambaModel is using JambaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.)ri   r%   r   r   r   r   r   rc   r   cudar   )	attn_mask	dropout_pr   )r   r   rV   rl   r   r   r   r   r   r   r}   r   r{   r   r   r~   r   r7   r-   r   r   r   r.   r   r1   scaled_dot_product_attentionr   r   r]   r   )r\   ri   r%   r   r   r   r   r   r   r   rC   r   r   r   r   r   r   r_   s                    rP   rl   JambaSdpaAttention.forward  sk    [ 7?+-) /"3# #   &**,A{{=1[[/
{{=1#((T^^T]]S]]^_abc__S1I1I4==Yccdeghi
#((T5M5Mt}}]gghiklm&'6'='=jX\XfXf'g$Jz+D+DE
 /H/HI$%%aA/E1A1A"1E/E&EFK ##v-.2L'224L#..0J'224L
 NNH{d':Huqy	hh))FF!04d,,3 G 
 "++Aq1<<>!&&s43C3CDkk+.D/11ra   r   r   )rq   rr   rs   rt   r   r   r.   r   r   r   r   r   r,   rl   ru   rv   rw   s   @rP   r   r     s     %0A6R 2637FJ"'59G2||G2 !.G2 u//0	G2
 ""BCG2  G2 G2 !!1!12G2 
u||Xell3XeELL>Q5RR	SG2 SG2ra   r   )eagerflash_attention_2sdpac                      ^  \ rS rSrSrS\4U 4S jjr  SS\R                  S\	\
   S\	\R                     4S jjrSS\	\
   S\	\R                     4S	 jjr  SS\	\
   S\	\R                     4S
 jjrSrU =r$ )JambaMambaMixeri.  uo  
Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
and is why Mamba is called **selective** state spaces)
r   c           	        > [         TU ]  5         Xl        X l        UR                  U l        UR
                  U l        UR                  U l        UR                  UR                  -  U l
        UR                  U l        UR                  U l        UR                  U l        ["        R$                  " U R                  U R                  U R                  U R                  U R                  U R                  S-
  S9U l        UR(                  U l        [,        UR(                     U l        UR0                  U l        ["        R4                  " U R                  U R                  S-  U R                   S9U l        ["        R4                  " U R                  U R                  U R                  S-  -   SS9U l        ["        R4                  " U R                  U R                  SS9U l        [<        R>                  " SU R                  S-   5      S S S 24   nURA                  U R                  S5      RC                  5       n["        RD                  " [<        RF                  " U5      5      U l$        ["        RD                  " [<        RJ                  " U R                  5      5      U l&        ["        R4                  " U R                  U R                  U R                   S9U l'        [Q        U R                  URR                  S9U l*        [Q        U R                  URR                  S9U l+        [Q        U R                  URR                  S9U l,        [Z        (       d  [\        R_                  S	5        g g )
Nr   )in_channelsout_channelsr   kernel_sizegroupspaddingrc   r   FTr*   r^   aq  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d. If you want to use the naive implementation, set `use_mamba_kernels=False` in the model config)0rV   rW   r   r   r]   r   r   r   r   r   r   mamba_dt_ranktime_step_rankmamba_conv_biasuse_conv_biasmamba_proj_biasuse_biasr   Conv1dconv1d
hidden_act
activationr   actuse_mamba_kernelsuse_fast_kernelsr   in_projx_projdt_projr.   aranger8   r   rX   logA_logrY   Dout_projrS   rms_norm_epsdt_layernormb_layernormc_layernormis_fast_path_availabler   r   )r\   r   r   Ar_   s       rP   rW   JambaMambaMixer.__init__6  s   "!--$22 & 3 3!'!4!4v7I7I!I$22#33..ii..//##--))))A-
 !++&++, & 8 8 yy!1!143I3IA3MTXTaTabii 6 68K8KdNaNadeNe8elqryy!4!4d6L6LSWX LLD//!34T1W=HHT++R0;;=\\%))A,/
ejj)?)?@A		$"8"8$:J:JQUQ^Q^_()<)<&BUBUV'(;(;ATATU'(;(;ATATU%%^ &ra   ri   cache_paramsr%   c                 l
   UR                   u  pEnUS L=(       a|    UR                  =(       ai    US:H  =(       a]    UR                  U R                     R                   S   UR                  U R                     R                   S   s=:H  =(       a    U:H  Os  nU R                  U5      R                  SS5      nUR                  SSS9u  pUb  XR                  S5      -  nU R                  R                  R                  U R                  R                  R                  S5      U R                  R                  R                  S5      5      n
U(       ae  [        UR                  S5      UR                  U R                     U
U R                  R                  U R                   5      nUR                  S5      nOUbc  ["        R$                  R'                  XR(                  UR                   S   -
  S45      nUR                  U R                     R+                  U5        [-        XU R                  R                  U R                   S9nUb  XR                  S5      -  nU R/                  UR                  SS5      5      n[0        R2                  " XR4                  U R6                  U R6                  /SS9u  pnU R9                  U5      nU R;                  U5      nU R=                  U5      nU R>                  R                  R@                  n[0        RB                  " 5          [0        RD                  " U R>                  R                  R@                  5      U R>                  R                  l         S S S 5        U R?                  U5      R                  SS5      n[0        RB                  " 5          UU R>                  R                  l         S S S 5        [0        RF                  " U RH                  RK                  5       5      * nUb  URK                  5       OS nU(       aZ  [M        UR                  U R                     US   US   UUS S 2S4   US S 2S4   U RN                  U	S   USS	9
R                  S5      nO{[Q        UUUUR                  SS5      UR                  SS5      U RN                  RK                  5       U	USSS
9
u  nnUb+  Ub(  UR                  U R                     R+                  U5        U RS                  UR                  SS5      5      nU$ ! , (       d  f       GN= f! , (       d  f       GNd= f)Nr   r   rc   r(   r*   )r  ).r   T)dt_softplus)delta_softplusreturn_last_state)*r7   r   r   r   r   r   r   chunkr=   r  rZ   r   r   r!   squeezer   r  r   r1   padr   copy_r    r!  r.   splitr  r   r)  r*  r+  r"  datano_grad
zeros_likeexpr%  r6   r   r&  r   r'  )r\   ri   r/  r%   rH   seq_lenrC   use_precomputed_statesprojected_statesgateconv_weightsr   ssm_parameters	time_stepBCtime_proj_biasdiscrete_time_stepr-  scan_outputs	ssm_statecontextualized_statess                         rP   cuda_kernels_forward$JambaMambaMixer.cuda_kernels_forwardj  sg    "/!4!4
Q$ //1 ((8>>qA&&t~~6<<Q? 	 	  <<6@@AF /44QA4>%),D,DQ,GGM {{))..t{{/A/A/F/Fq/I4;;K]K]KbKbcdKef!0%%b)((8  M *33B7M' mm//@U@UXeXkXklnXo@oqr?st((8>>{K,]$++JZJZgkgvgvwM%),D,DQ,GGM ]%<%<Q%BC++00$2E2EtGZGZ[ac
	a %%i0	QQ **//]]_%*%5%5dll6G6G6L6L%MDLL" !\\)4>>q!D]]_%3DLL"  YYtzz'')**3A3M--/SW!1''7f%"6*!Q$!Q$V  im  '8"Aq!Aq!#"&'#L) $)A''7==iH !%l.D.DQ.J K$$S _ _s   AT+T$
T!$
T3c           	         UR                   u  pEnUR                  nU R                  U5      R                  SS5      nUR	                  SSS9u  pUb  XR                  S5      -  n	[        U[        5      nU(       GaB  UR                  U R                     R                   S   U:X  Ga  U R                  (       a(  UR                  U R                     R                  5       nOUR                  U R                     nUR                  U	R                  5      nUR                  (       Ga  US:X  Ga  UR                  U R                     R                   S   U:X  a  UR                  U R                     n[         R"                  " USSS9nU	S S 2S S 2S4   US S 2S S 2S4'   XR                  U R                  '   [         R$                  " XR&                  R(                  S S 2SS S 24   -  SS9n	U R*                  (       a  XR&                  R,                  -  n	U R/                  U	5      R                  U5      R                  S5      n	O[0        R2                  R5                  U	U R6                  U	R                   S   -
  S45      nXR                  U R                  '   U R/                  U R'                  U	5      SS U24   5      n	O][         R8                  " X@R:                  U R<                  4U	R                  US9nU R/                  U R'                  U	5      SS U24   5      n	Ub  XR                  S5      -  n	U R?                  U	R                  SS5      5      n[         R@                  " XRB                  U R<                  U R<                  /SS9u  nnnU RE                  U5      nU RG                  U5      nU RI                  U5      nU RK                  U5      n[0        R2                  RM                  U5      R                  SS5      n[         RN                  " U RP                  RS                  5       5      * n[         RN                  " US S S 2S S S 24   US S 2S S 2S S 2S 4   -  5      nUS S 2S S 2S S 2S 4   US S 2S S S 2S S 24   RS                  5       -  nUU	S S 2S S 2S S 2S 4   RS                  5       -  n/ n[U        U5       H  nUS S 2S S 2US S 24   U-  US S 2S S 2US S 24   -   n[         RV                  " UR                  U5      US S 2US S 24   R                  S5      5      nURY                  US S 2S S 2S4   5        M     [         RZ                  " USS9nUXR\                  S S S 2S 4   -  -   nUU R/                  U
5      -  nU(       a  XR                  U R                  '   U R_                  UR                  SS5      5      nU$ )	Nr   rc   r(   r   r*   )shiftsdims.r   )0r7   re   r   r   r4  r=   r+   r   r   r   r   cloner0   r-   r   r   r.   rollr:   r  rZ   r  r   r  r   r1   r6  r   r   r   r   r!  r8  r  r)  r*  r+  r"  softplusr<  r%  r6   r   r   r   stackr&  r'  )r\   input_statesr/  r%   rH   r=  rC   re   r?  ri   r@  r   rI  
conv_staterB  rC  rD  rE  rG  r-  
discrete_A
discrete_BdeltaB_urH  r   scan_outputrJ  s                              rP   slow_forwardJambaMambaMixer.slow_forward  s   !-!3!3
Q""<<5??1E.44QA4>%),D,DQ,GGM|-MN	00@FFqIZW}}(33DNNCIIK	(33DNNC	!]%9%9:I...7a< ,,T^^<BB1ES)55dnnE
"ZZ
2BG
'4Q1W'=
1a8$;E((8 %		*{{7I7I!QPQ'7R*RXZ [%%![[%5%55M $ 7 : :5 A K KB O]]..!**]-@-@-DDaH
 <F((8 $])CC'M)R S33T5H5HI$++5I !HHT[[%?XgX%NOM%),D,DQ,GGM ]%<%<Q%BC++00$2E2EtGZGZ[ac
	1a %%i0	QQ!\\)4]]334FGQQRSUVW YYtzz'')**YYqq$!125G1aQU5VVW
'1a61dAq=9I9O9O9QQ
aAtm < B B DDwA"1aA:.:XaAqj=QQI,,y||E':AaAgJ<P<PQS<TUKAq!G 45   kk,B7!]VVD!TM5J%JK"TXXd^36?##DNN3 !%k.C.CAq.I J$$ra   c                     U R                   (       aV  [        (       a.  SU R                  R                  R                  R
                  ;  a  [        S5      eU R                  XU5      $ U R                  XU5      $ )Nr  zsFast Mamba kernels are not available. Make sure to they are installed and that the mamba module is on a CUDA device)	r  r,  r!  rZ   r-   r   r   rK  rZ  )r\   ri   r/  r%   s       rP   rl   JambaMambaMixer.forward*  sk       ))V4;;;M;M;T;T;Y;Y-Y  J  ,,].YY  nMMra   )r%  r&  r  r  r*  r+  r   r  r   r)  r"  r]   r   r   r   r'  r   r  r  r  r  r!  r"   )rq   rr   rs   rt   r   r   rW   r.   r   r   r   r   rK  rZ  rl   ru   rv   rw   s   @rP   r  r  .  s    2{ 2n DH59	h%||h% ?@h% !!1!12	h%VR%x@`7a R%  {C  DI  DT  DT  {U R%p DH59	N ?@N !!1!12	N Nra   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )JambaMLPi:  c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l	        [        UR                     U l        g NFr   )rV   rW   r   r]   r   r   r   	gate_projup_proj	down_projr   r  act_fnr\   r   r_   s     rP   rW   JambaMLP.__init__;  s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV../ra   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ r   )rd  re  rb  rc  )r\   xrd  s      rP   rl   JambaMLP.forwardE  s6    NN4;;t~~a/@#ADLLQRO#ST	ra   )re  r   rd  rb  r]   r   rc  )rq   rr   rs   rt   rW   rl   ru   rv   rw   s   @rP   r_  r_  :  s    0 ra   r_  c                      ^  \ rS rSrSrS\4U 4S jjrS\R                  S\	\R                  \R                  4   4S jr
SrU =r$ )	JambaSparseMoeBlockiK  a  
This implementation is
strictly equivalent to standard MoE with full capacity (no
dropped tokens). It's faster since it formulates MoE operations
in terms of block-sparse operations to accommodate imbalanced
assignments of tokens to experts, whereas standard MoE either
(1) drop tokens at the cost of reduced performance or (2) set
capacity factor to number of experts and thus waste computation
and memory on padding.
r   c                   > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR                  U l        [        R                  " U R                  U R                  SS9U l        [        R                  " [        U R                  5       Vs/ s H  n[        U5      PM     sn5      U l        g s  snf ra  )rV   rW   r]   
hidden_dimr   ffn_dimr$   num_experts_per_tokr>   r   r   router
ModuleListr   r_  experts)r\   r   rC   r_   s      rP   rW   JambaSparseMoeBlock.__init__W  s     ,,//!--//
ii1A1AN}}dFVFV@W%X@W1hv&6@W%XY%Xs   *Cri   r&   c                    UR                   u  p#nUR                  SU5      nU R                  U5      n[        R                  " US[
        R                  S9n[
        R                  " X`R                  SS9u  pgUR                  UR                  5      n[
        R                  " X#-  U4UR                  UR                  S9n[
        R                  R                  R                  XpR                   S9R#                  SSS5      n	[%        U R                   5       H  n
U R&                  U
   n[
        R(                  " X   5      u  pUR                   S   S:X  a  MA  US	U4   R+                  SU5      nU" U5      XmUS	4   -  nUR-                  SXR                  UR                  5      5        M     UR+                  X#U5      nX4$ )
 r*   r   r   r(   )re   r-   )num_classesrc   r   N)r7   r   rq  Fr2   r.   r6   r3   r>   r0   re   r   r-   r   r1   r4   r$   permuter   rs  wherer9   
index_add_)r\   ri   rH   rI   rn  r#   rB   rD   final_hidden_statesrE   
expert_idxexpert_layeridxtop_xcurrent_statecurrent_hidden_statess                   rP   rl   JambaSparseMoeBlock.forwarda  s   2?2E2E/
Z%**2z:M2))MqL,1JJ

XZ,[)),,]-@-@A#kk):6m>Q>QZgZnZn
 hh))112BP`P`1aiijkmnpqr   0 01J<<
3L[%<=JC{{1~"
 *$+6>>r:NM$0$?/Y\^bRbBc$c!  **1e5M5MmNaNa5bc 2  299*Wab"11ra   )rs  ro  rn  r$   rq  r>   )rq   rr   rs   rt   r   r   rW   r.   r   r,   rl   ru   rv   rw   s   @rP   rl  rl  K  sD    	Z{ Z&2U\\ &2eELL%,,<V6W &2 &2ra   rl  c                   Z  ^  \ rS rSrS\S\4U 4S jjr\" SSSS9       SS	\R                  S
\
\R                     S\
\R                     S\
\   S\
\   S\
\   S\
\   S\
\R                     S\\R                  \
\\R                  \R                  4      4   4S jj5       rSrU =r$ )JambaAttentionDecoderLayeri  r   r   c                 F  > [         TU ]  5         UR                  U   n[        UR                     " X5      U l        US:  a  [        O[        nU" U5      U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        g )Nr   r  )rV   rW   layers_num_expertsJAMBA_ATTENTION_CLASSES_attn_implementation	self_attnrl  r_  feed_forwardrS   r]   r(  input_layernormpre_ff_layernormr\   r   r   r$   ffn_layer_classr_   s        rP   rW   #JambaAttentionDecoderLayer.__init__  s    //	:01L1LMf`1<q-h+F3+F,>,>FDWDWX ,V-?-?VEXEX Yra   r   r   r   r   ri   r%   r   r   output_router_logitsr   r   r&   c	           
      <   Un	U R                  U5      nU R                  UUUUUUUS9u  pnX-   nUn	U R                  U5      nU R                  U5      n[	        U[
        5      (       a  Uu  pOUSpX-   nU4nU(       a  X4-  nU(       a  X4-  nU(       a  X4-  nU$ )c  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
        `(batch, sequence_length)` where padding elements are indicated by 0.
    past_key_values (`HybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_router_logits (`bool`, *optional*):
        Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
        should not be returned during inference.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
        Indices depicting the position of the input sequence tokens in the sequence.
)ri   r%   r   r   r   r   r   N)r  r  r  r  r+   r,   )r\   ri   r%   r   r   r   r  r   r   residualself_attn_weightspresent_key_value
ff_outputsr#   outputss                  rP   rl   "JambaAttentionDecoderLayer.forward  s    > !,,];>Bnn')%+/) ?M ?
;*; !0 !--m<&&}5
j%((+5(M=+5t= 0 "++G++G''Gra   )r  r  r  r  NNNFFFNrq   rr   rs   rt   r   r<   rW   r   r.   r   r   r   r   r   r,   FloatTensorrl   ru   rv   rw   s   @rP   r  r    s   Z{ Zs Z %0A6R 2637FJ,1/4$)59D||D !.D u//0	D
 ""BCD $D>D 'tnD D>D !!1!12D 
u  (51B1BEDUDU1U+V"WW	XD SDra   r  c                   Z  ^  \ rS rSrS\S\4U 4S jjr\" SSSS9       SS	\R                  S
\
\R                     S\
\R                     S\
\   S\
\   S\
\   S\
\   S\
\R                     S\\R                  \
\\R                  \R                  4      4   4S jj5       rSrU =r$ )JambaMambaDecoderLayeri  r   r   c                 &  > [         TU ]  5         UR                  U   n[        XS9U l        US:  a  [
        O[        nU" U5      U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        g )N)r   r   r   r  )rV   rW   r  r  r   rl  r_  r  rS   r]   r(  r  r  r  s        rP   rW   JambaMambaDecoderLayer.__init__  s{    //	:$FH
1<q-h+F3+F,>,>FDWDWX ,V-?-?VEXEX Yra   r   r   r   r   ri   r%   r   r   r  r   r   r&   c	                 2   Un	U R                  U5      nU R                  UUUS9nSn
X-   nUn	U R                  U5      nU R                  U5      n[	        U[
        5      (       a  Uu  pOUSpX-   nU4nU(       a  X4-  nU(       a  X4-  nU(       a  X4-  nU$ )r  )ri   r/  r%   N)r  r   r  r  r+   r,   )r\   ri   r%   r   r   r   r  r   r   r  r  r  r#   r  s                 rP   rl   JambaMambaDecoderLayer.forward  s    > !,,];

'() # 

 ! !0 !--m<&&}5
j%((+5(M=+5t= 0 "++G))G''Gra   )r  r  r   r  r  r  rw   s   @rP   r  r    s   Z{ Zs Z %0A6R 2637FJ,1/4$)59A||A !.A u//0	A
 ""BCA $D>A 'tnA D>A !!1!12A 
u  (51B1BEDUDU1U+V"WW	XA SAra   r  c                   F    \ rS rSr% \\S'   SrSrSS/rSr	Sr
SrSrS rS	rg
)JambaPreTrainedModeli-  r   modelTr  r  r   c                 J   U R                   R                  n[        U[        R                  [        R
                  45      (       aW  UR                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        R                  5      (       ad  UR                  R                  R                  SUS9  UR                  b2  UR                  R                  UR                     R                  5         g g [        U[        5      (       a&  UR                  R                  R                  S5        g [        U[        5      (       a  [         R"                  " SUR$                  S-   5      S S S 24   nUR'                  UR(                  S5      R+                  5       nUR,                  R                  R/                  [         R0                  " U5      5        UR2                  R                  R                  S5        g g )Nr   )r5   stdg      ?r   r*   )r   initializer_ranger+   r   r   r  rZ   r9  normal_r   zero_	Embeddingpadding_idxrS   fill_r  r.   r#  r   r8   r   r   r%  r7  r$  r&  )r\   moduler  r-  s       rP   _init_weights"JambaPreTrainedModel._init_weights9  s}   kk++fryy"))455MM&&CS&9{{&  &&( '--MM&&CS&9!!-""6#5#56<<> .--MM$$S)00Q 5 5 9:47CA1126AACALL##EIIaL1HHMM$	 1ra   r   N)rq   rr   rs   rt   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_is_statefulr  ru   r   ra   rP   r  r  -  s;    &*#57OP"3NL%ra   r  )	attentionr   c                   L  ^  \ rS rSrSrS\4U 4S jjr\\          SS\	\
R                     S\	\
R                     S\	\
R                     S\	\   S	\	\
R                     S
\	\   S\	\   S\	\   S\	\   S\	\
R                     S\\   S\4S jj5       5       rS rS rSrU =r$ )
JambaModeliP  z
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`JambaDecoderLayer`]

Args:
    config: JambaConfig
r   c                 <  > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        / n[        UR                  5       H.  n[        UR                  U      nUR                  U" XS95        M0     [
        R                  " U5      U l        UR                   U l        [#        UR                  UR$                  S9U l        SU l        U R+                  5         g )N)r   r  F)rV   rW   pad_token_idr  
vocab_sizer   r  r]   embed_tokensr   rJ   ALL_DECODER_LAYER_TYPESr   r   rr  layersr  rS   r(  final_layernormgradient_checkpointing	post_init)r\   r   decoder_layersr   layer_classr_   s        rP   rW   JambaModel.__init__Y  s     !.. ++LL):):F<N<NPTP`P`av//0A1&2J2J12MNK!!+f"BC 1 mmN3$*$?$?!+F,>,>FDWDWX&+#ra   	input_idsr%   r   r   inputs_embedsr   r   output_hidden_statesr  r   r   r&   c                    Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  UOU R                   R                  nUb  UOU R                   R                  nUS L US L-  (       a  [        S5      eU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUc  U R                  U5      nUnU(       a  Uc  [        R                  S5        U
c,  [        R                  " UR                  S   UR                  S9n
Uc  U
R                  S5      nU R!                  X%U
5      nU R#                  X*5      nU(       a  SOS nU(       a  SOS nU	(       a  SOS nU R$                   Hj  n['        U[(        5      (       a  UOUnU(       a  X4-  nU" UUUUUU	UU
S	9nUS   nU(       a  US   b	  UUS   4-  nU	(       d  MY  US
   c  Ma  UUS
   4-  nMl     U R+                  U5      nU(       a  X4-  nU(       a  UR,                  (       d  SUl        U(       d  S OUn[/        UUUUUS9$ )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzJamba requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. None was provided, so no cache will be returned.r   r   r   r   )r%   r   r   r   r  r   r   r*   T)last_hidden_stater   ri   
attentionsr#   )r   r   r  r  r   r   r  r   r   r   r  r.   r#  r7   r-   r=   _update_causal_mask_update_mamba_maskr  r+   r  r  r   r   )r\   r  r%   r   r   r  r   r   r  r  r   r   ri   r   
mamba_maskall_hidden_statesall_self_attnsall_router_logitsdecoder_layer
layer_masklayer_outputs
next_caches                         rP   rl   JambaModel.forwardl  sh     2C1N-TXT_T_TqTq$8$D $++JjJj 	 %9$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M%0:
 !"\\-*=*=a*@I]I]^N)33A6L..~n],,^L
"6BD0d"6BD![[M'1-AW'X'X^iJ#!%55!))) /"3%9#-	M *!,M  #/"}Q'7&99N## $0%-*;)==%9 )< ,,];  !11?#E#E15O.!*T
%+&+%+
 	
ra   c                    U R                   R                  S:X  a  Ub  SU;   a  U$ g UR                  UR                  pT[        R
                  " U5      R                  nUR                  S   nUS   S-   n[        R                  " Xx4XdUS9n	US:w  a  [        R                  " U	SS9n	U	[        R                  " XS9UR                  SS5      :  -  n	U	S S S S 2S S 24   R                  UR                  S   SSS5      n	Ub  U	R                  5       n	UR                  5       S	:X  ac  UR                  S   n
U	S
S U
24   R                  S5      US S 2S S S S 24   R                  S5      -  nU	S
S U
24   R!                  X5      U	S
S U
24'   U R                   R                  S:X  a3  Ub0  UR                  R"                  S;   a  [$        R&                  " X5      n	U	$ )Nr  r   r   r*   )
fill_valuere   r-   )diagonalr   r   rc   .r	  )r  xpunpu)r   r  re   r-   r.   finfominr7   fulltriur#  r9   r8   rP  r)   eqmasked_fillr   r
   _unmask_unattended)r\   r%   input_tensorr   re   r-   	min_dtyperI   target_lengthr   mask_lengthpadding_masks               rP   r  JambaModel._update_causal_mask  s   ;;++/BB)c^.C%%$**L,?,?vKK&**	&,,Q/&r*Q.jj/!Aimsta**[1=Ku||MANDZDZ[]_`Daaa!$a"23::<;M;Ma;PRSUWY[\%%++-K!!#q(,2226*3+<=@@EWXZ^`dfgWgHhHkHkloHpp1<S,;,=N1O1[1[\h1tC+-. KK,,6*%%**.DD
 1CCK[Kra   c                 b    UnUS   S:  d!  Ub   [         R                  " US:H  5      (       a  SnU$ )zV
No need for zeroing states when
    1. Cached forward
    2. Attending to all inputs
r   Nr   )r.   all)r\   r%   r   r  s       rP   r  JambaModel._update_mamba_mask  s:     $
!q ^%?EIIn`aNaDbDbJra   )r  r  r  r  r  r  r  )
NNNNNNNNNN)rq   rr   rs   rt   r   r   rW   r   r   r   r.   r   r   r   r  r   r   r   r   rl   r  r  ru   rv   rw   s   @rP   r  r  P  s6   { &  151537FJ59$(,0/3/359e
E,,-e
 !.e
 u//0	e

 ""BCe
   1 12e
 D>e
 $D>e
 'tne
 'tne
 !!1!12e
 +,e
 
 e
  e
N!F	 	ra   r  c                     ^  \ rS rSrS/rS\4U 4S jjr\\            SS\	\
R                     S\	\
R                     S\	\
R                     S\	\   S	\	\
R                     S
\	\
R                     S\	\   S\	\   S\	\   S\	\   S\	\
R                     S\\\
R                  4   S\\   S\4S jj5       5       r       SS jrSrU =r$ )JambaForCausalLMi  zlm_head.weightr   c                 J  > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        UR                  U l	        UR                  U l
        UR                  U l        U R                  5         g ra  )rV   rW   r  r  r  r   r   r]   lm_headrouter_aux_loss_coefr$   rp  r  rf  s     rP   rW   JambaForCausalLM.__init__  s}     '
 ++yy!3!3V5F5FUS$*$?$?!!--#)#=#= ra   r  r%   r   r   r  labelsr   r   r  r  r   logits_to_keepr   r&   c                    Ub  UOU R                   R                  nU
b  U
OU R                   R                  n
U	b  U	OU R                   R                  n	U R	                  UUUUUUUU	U
US9
nUR
                  n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nSnUb  U R                  " UX`R                  40 UD6nSnU
(       aZ  [        UR                  U R                  U R                  U5      nUb+  UU R                   UR#                  UR$                  5      -  -  n['        UUUUR(                  UR*                  UR,                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, JambaForCausalLM

>>> model = JambaForCausalLM.from_pretrained("ai21labs/Jamba-v0.1")
>>> tokenizer = AutoTokenizer.from_pretrained("ai21labs/Jamba-v0.1")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```N)
r  r%   r   r   r  r   r   r  r  r   )lossaux_losslogitsr   ri   r  r#   )r   r   r  r  r  r  r+   r<   slicer  loss_functionr  rQ   r#   r$   rp  r  r0   r-   r   r   ri   r  )r\   r  r%   r   r   r  r  r   r   r  r  r   r  r   r  ri   slice_indicesr  r  r  s                       rP   rl   JambaForCausalLM.forward  s   R 2C1N-TXT_T_TqTq$8$D $++JjJj 	
 %9$D $++JjJj 	
 +/**)%+'/!5!5) +5 +
  118B>SV8W8W~ot4]kmA}a,?@A%%ffooPPD/%%  ((	H !11HKK4LLL(#33!//))!//
 	
ra   c	           
         US L n
U
(       d]  Uc  US   UR                   S   :  a  US S 2UR                   S   * S 24   nOaUR                   S   UR                   S   :w  a	  US S 2U4   nO7[        U R                  UR                   S   U R                  U R                  S9nUbZ  UcW  UR                  5       R                  S5      S-
  nUR                  US:H  S5        U
(       d  US S 2UR                   S   * S 24   nUb  U
(       a  SU0nOSUR                  5       0nUR                  UUUUUU R                  R                  US.5        U	R                  5        H  u  pX;  d  M  XU'   M     U$ )Nr*   r   r   r   r  r  )r   r   r   r%   r  r  r   )r7   r   r   re   r-   longcumsummasked_fill_r   r   num_logits_to_keepitems)r\   r  r   r%   r  r  r   r   r   r   empty_past_kvmodel_inputskeyvalues                 rP   prepare_inputs_for_generation.JambaForCausalLM.prepare_inputs_for_generationp  s    (4/ )!"%);;%a.*>*>q*A)A)C&CD	#~';';A'>>%a&78	>Y__Q/DKKO %,*>)..077;a?L%%n&91= +A	0B/B/D,DE $+];L')=)=)?@L ,#2&"0(<"&++"@"@"0
	
 !,,.JC&$)S! ) ra   )r  r  r$   rp  r  r  )NNNNNNNNNNNr   )NNNFNNT)rq   rr   rs   rt   _tied_weights_keysr   rW   r   r   r   r.   r   r   r   r  r   r   r<   r   r   r   rl   r  ru   rv   rw   s   @rP   r  r    s   *+	{ 	  151537FJ59-1$(,0/3/35934Y
E,,-Y
 !.Y
 u//0	Y

 ""BCY
   1 12Y
 ))*Y
 D>Y
 $D>Y
 'tnY
 'tnY
 !!1!12Y
 c5<</0Y
 +,Y
 
#Y
  Y
| "@ @ra   r  c                       \ rS rSrSrg)JambaForSequenceClassificationi  r   N)rq   rr   rs   rt   ru   r   ra   rP   r  r    s    ^ara   r  )r  r  r  r  )Nrc   N)Or   r   typingr   r   r   r.   torch.nn.functionalr   r1   rx  activationsr   
generationr	   modeling_attn_mask_utilsr
   modeling_flash_attention_utilsr   r   modeling_layersr   r   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.import_utilsr   r   configuration_jambar   r   &mamba_ssm.ops.selective_scan_interfacer   r   +mamba_ssm.ops.triton.selective_state_updater   causal_conv1dr    r!   r  r,  
get_loggerrq   r   r   r,   r<   rQ   ModulerS   r~   r   r   r   r   r  r  r_  rl  r  r  r  r  r  r  r  __all__r   ra   rP   <module>r!     s  (   ' '     ! ) > h R - & R R 0 T , J XR@P=-~DD-7**.0@BVXfg 
 
		H	% "&
-1	U&uU\\':D@AU&#U& U\\*	U&
 5<<U&rJ299 J*	UU\\ 	U# 	U%,, 	UN3 N3dX:RYY X:xa:> a:JP2 P2h - HNbii HNXryy "<2")) <2~P!; PfM7 M` %? % %< )CMcd  p% p phk+_ k\ b%EG[ a gra   