
    cCiÜ                        S r SSKrSSKJr  SSKJrJrJr  SSKrSSKJ	r	  SSK
Jr  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJr  SSKJrJrJr  SSKJrJrJrJr  SSKJ r   \RB                  " \"5      r#\" 5       (       a  SSK$J%r%  OSr%\" 5       (       a  SSK&J'r'J(r(  SSK)J*r*  OSu  r*r(r'Sq+S r, " S S5      r- " S S\	R\                  5      r/ " S S\	R\                  5      r0 " S S\5      r1\ " S S\5      5       r2\\" S S!9 " S" S#\5      5       5       r3\\" S$S!9 " S% S&\5      5       5       r4\ " S' S(\25      5       r5\" S)S!9 " S* S+\2\5      5       r6/ S,Qr7g)-zPyTorch MAMBA model.    N)	dataclass)AnyOptionalUnion)nn)CrossEntropyLoss   )ACT2FN)PretrainedConfig)GenerationMixin)GradientCheckpointingLayer)PreTrainedModel)ModelOutputauto_docstringlogging)is_causal_conv1d_availableis_kernels_availableis_mamba_ssm_availableis_mambapy_available   )MambaConfig)pscan)mamba_inner_fnselective_scan_fn)selective_state_updateNNNc                      [         b  [         $ [        5       (       a,  SSKJn   U " S5      nUR                  UR
                  4q [         $ [        5       (       a  SSKJnJn  X24q [         $ Sq [         $ )Nr   )
get_kernelzkernels-community/causal-conv1d)causal_conv1d_fncausal_conv1d_update)NN)_causal_conv1d_cacher   kernelsr   r    r   r   causal_conv1d)r   _causal_conv1d_kernelr   r    s       b/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/mamba/modeling_mamba.py_lazy_load_causal_conv1dr&   <   ss    '##& *+L M 5 J JLaLrLrs   
$	%	%H 4G    ,    c                      \ rS rSrSrSr\R                  S4S\S\	S\R                  S\\R                  \S4   4S	 jjrS
\	S\R                  S\R                   S\R                  4S jrS
\	S\R                  4S jrS rSrg)
MambaCacheO   a  
Cache for mamba model which does not have attention mechanism and key value states.

Arguments:
    config (`PretrainedConfig):
        The configuration file defining the shape-related attributes required to initialize the static cache.
    max_batch_size (`int`):
        The maximum batch size with which the model will be used. Note that a new instance must be instantiated if
        a smaller batch size is used.
    dtype (`torch.dtype`, *optional*, defaults to `torch.float16`):
        The default `dtype` to use when initializing the layer.
    device (`torch.device` or `str`, *optional*):
        The device on which the cache should be initialized. Should be the same as the layer.

Example:

    ```python
    >>> import torch
    >>> from transformers import AutoTokenizer, MambaForCausalLM, MambaCache

    >>> model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
    >>> tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")

    >>> inputs = tokenizer(text="My name is Mamba", return_tensors="pt")

    >>> # Prepare a cache class and pass it to model's forward
    >>> cache_params = MambaCache(config=model.config, max_batch_size=1, device=model.device, dtype=model.dtype)
    >>> cache_position = torch.arange(len(inputs["input_ids"][0]), device=model.device)  # sequence length
    >>> outputs = model(**inputs, cache_params=cache_params, cache_position=cache_position, use_cache=True)
    >>> outputs.cache_params
    ```
TNconfigmax_batch_sizedtypedevicec           	         X l         X0l        UR                  U l        UR                  U l        UR
                  U l        / U l        / U l        Ub  [        R                  " U5      OS n[        UR                  5       H  n[        R                  " U R                   U R                  U R                  UU R                  S9n[        R                  " U R                   U R                  U R                  UU R                  S9n[        R                  R                  U5        [        R                  R                  U5        U R                  R!                  U5        U R                  R!                  U5        M     g )Nr.   r-   )r,   _dtypeintermediate_size
state_sizessm_state_sizeconv_kernelconv_kernel_sizeconv_states
ssm_statestorchr.   rangenum_hidden_layerszeros_dynamomark_static_addressappend)selfr+   r,   r-   r.   _
conv_state	ssm_states           r%   __init__MambaCache.__init__t   s&    -!'!9!9$// & 2 2/1.0)/);f%v//0A',{{##&&%%kk(J ',kk##&&##kk'I MM--j9MM--i8##J/OO""9-' 1r'   	layer_idxnew_conv_statecache_positionreturnc                    U R                   U   R                  UR                  :w  a5  U R                   U   R                  UR                  5      U R                   U'   U R                   U   nUR                  SU R                  S-
  5      nUR                  SSS9nUR                  UR                  UR                  S9US S 2S S 2U4'   U R                   U   R                  5         U R                   U==   U-  ss'   U R                   U   $ )Nr   r   )shiftsdimsr0   )r7   r.   toclampr6   rollr-   zero_)r@   rF   rG   rH   rB   s        r%   update_conv_stateMambaCache.update_conv_state   s    
 I&--1F1FF*.*:*:9*E*H*HI^I^*_DY'%%i0
'--a1F1F1JK__BR_8
+9+<+<JDUDU]g]m]m+<+n
1a'(#))+#z1#	**r'   new_ssm_statec                     U R                   U   R                  5         U R                   U==   UR                  U R                   U   R                  5      -  ss'   U R                   U   $ N)r8   rQ   rN   r.   )r@   rF   rT   s      r%   update_ssm_stateMambaCache.update_ssm_state   sT    	"((*	"m&6&6ty7Q7X7X&YY"y))r'   c                     [        [        U R                  5      5       H=  nU R                  U   R                  5         U R                  U   R                  5         M?     g rV   )r:   lenr7   rQ   r8   )r@   rF   s     r%   resetMambaCache.reset   sH    s4#3#345IY'--/OOI&,,. 6r'   )r1   r6   r7   r2   r,   r4   r8   )__name__
__module____qualname____firstlineno____doc__is_compileabler9   float16r   intr-   r   r.   strrD   Tensor
LongTensorrR   rW   r[   __static_attributes__ r'   r%   r)   r)   O   s    B N #]]15#. #. #. {{	#.
 ellC-.#.J++.3ll+LQL\L\+	+"*# *ell *
/r'   r)   c            
       ^  ^  \ rS rSrSrS\S\4U 4S jjrS r   SS\	R                  S\\   S	\\	R                     S
\\	R                     4S jjrSS\\   S	\\	R                     S
\\	R                     4S jjr   SS\\   S	\\	R                     S
\\	R                     4S jjrSrU =r$ )
MambaMixer   uo  
Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
and is why Mamba is called **selective** state spaces)
r+   rF   c           	        > [         TU ]  5         Xl        UR                  U l        UR                  U l        UR                  U l        UR                  U l        [        UR                  5      U l
        X l        UR                  U l        [        R                  " U R                  U R                  UR                  UR                  U R                  UR                  S-
  S9U l        UR                   U l        [$        UR                      U l        UR(                  U l        [        R*                  " U R                  U R                  S-  UR,                  S9U l        [        R*                  " U R                  U R                  U R
                  S-  -   SS9U l        [        R*                  " U R                  U R                  SS9U l        [4        R6                  " SU R
                  S-   [4        R8                  S9S S S 24   nUR;                  U R                  S5      R=                  5       n[        R>                  " [4        R@                  " U5      5      U l!        [        R>                  " [4        RD                  " U R                  5      5      U l#        [        R*                  " U R                  U R                  UR,                  S9U l$        UR,                  U l        U RK                  5         g )	Nr   )in_channelsout_channelsbiaskernel_sizegroupspadding   rp   FTr-   rK   )&superrD   r+   hidden_sizer3   r4   r5   r6   r2   rd   time_step_rankrF   use_conv_biasr   Conv1dconv1d
hidden_act
activationr
   actuse_mambapyLinearuse_biasin_projx_projdt_projr9   arangefloat32expand
contiguous	ParameterlogA_logonesDout_projwarn_slow_implementation)r@   r+   rF   A	__class__s       r%   rD   MambaMixer.__init__   s-   !--$// & 2 2!'!9!9!&"7"78"#11ii..//%%**))&&*
 !++&++,!-- yy!1!143I3IA3MTZTcTcdii 6 68K8KdNaNadeNe8elqryy!4!4d6L6LSWX LLD//!35==I$PQ'RHHT++R0;;=\\%))A,/
ejj)?)?@A		$"8"8$:J:JQWQ`Q`a%%'r'   c                    [        5       u  p[        [        [        X![        45      nU(       dW  U R
                  (       a0  [        5       (       a  [        R                  S5        g [        S5      e[        R                  S5        g g )Na  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the mamba.py backend. To install follow https://github.com/state-spaces/mamba/#installation for mamba-ssm and install the kernels library using `pip install kernels` or https://github.com/Dao-AILab/causal-conv1d for causal-conv1dzuse_mambapy is set to True but the mambapy package is not installed. To install it follow https://github.com/alxndrTL/mamba.py.a  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the sequential implementation of Mamba, as use_mambapy is set to False. To install follow https://github.com/state-spaces/mamba/#installation for mamba-ssm and install the kernels library using `pip install kernels` or https://github.com/Dao-AILab/causal-conv1d for causal-conv1d. For the mamba.py backend, follow https://github.com/alxndrTL/mamba.py.)
r&   allr   r   r   r   r   loggerwarning_onceImportError)r@   r    r   is_fast_path_availables       r%   r   #MambaMixer.warn_slow_implementation   s    1I1K.!$#%68H`no"
 &'))''S & Z  ##W &r'   hidden_statescache_paramsrH   attention_maskc                 	   U R                  U5      R                  SS5      nU R                  (       Ga.  UGc*  [        UU R                  R
                  U R                  (       a  U R                  R                  OS U R                  R
                  U R                  R
                  U R                  R
                  U R                  (       a$  U R                  R                  R                  5       OS [        R                  " U R                  R                  5       5      * S S U R                   R                  5       U R                  R                  R                  5       SS9nU$ [#        5       u  pxUR%                  SSS9u  pUb  XR'                  S5      -  nU R                  R
                  R)                  U R                  R
                  R+                  S5      U R                  R
                  R+                  S5      5      n
Ubk  US   S:  ab  U" UR-                  S5      UR.                  U R0                     U
U R                  R                  U R2                  5      nUR'                  S5      nOUbW  [4        R6                  R9                  XR:                  UR<                  S   -
  S45      nUR?                  U R0                  X5        U" XU R                  R                  U R2                  S9nUb  XR'                  S5      -  nU R                  UR                  SS5      5      n[        R@                  " XRB                  U RD                  U RD                  /SS9u  pnU R                  R
                  UR                  SS5      -  n[        R                  " U R                  R                  5       5      * n[G        U R                  S	5      (       a$  U R                  R                  R                  5       OS nUbc  US   S:  aZ  [I        URJ                  U R0                     US
   US
   UUS S 2S4   US S 2S4   U R                   U	S
   USS9
R'                  S5      nOo[M        UUUUR                  SS5      UR                  SS5      U R                   R                  5       U	USSS9
u  nnUb  Ub  URO                  U R0                  U5        U R                  UR                  SS5      5      nU$ )Nr   rt   T)
delta_biasdelta_softplusdimr   rK   )r~   rp   ).r   )dt_softplus)r   return_last_state)(r   	transposetrainingr   r|   weightrz   rp   r   r   r   r   floatr9   expr   r   r&   chunk	unsqueezeviewsizesqueezer7   rF   r~   r   
functionalpadr6   shaperR   splitry   r4   hasattrr   r8   r   rW   )r@   r   r   rH   r   projected_statescontextualized_statesr    r   gateconv_weightsr7   ssm_parameters	time_stepBCdiscrete_time_stepr   time_proj_biasscan_outputsrC   s                        r%   cuda_kernels_forwardMambaMixer.cuda_kernels_forward  s8     <<6@@AF===\1$2 ""$($6$6  D""##$$.2mm""((*4::++-..<<,,224#%!r %$Q 6N5O2 "2"8"8"8"BM) -0H0H0K K  ;;--224;;3E3E3J3J13Mt{{OaOaOfOfghOijL'N1,=,A 4!))"- ,,T^^< KK$$OO! !. 7 7 ;+"$--"3"3%(=(=@S@STV@W(WYZ'[#K !224>>;_ 0!1A1Adoo! ) -0H0H0K K "[[)@)@A)FGN#kk!4!4d6I6I4K^K^ _egOI! "&!4!4y7J7J1a7P!P4::++-..A:A$,,PV:W:WT\\..446]aN'N1,=,A5 ++DNN;!&)&v.adGadGFFL" $  )B-  +<!&KK1%KK1%FFLLN"#'&*+'i (\-E 11$..)L %)MM,2H2HA2N$O!$$r'   c           	      ^   UR                   u  pVnUR                  nU R                  U5      R                  SS5      n	U	R	                  SSS9u  pUb  XR                  S5      -  n
UGb  UR                  U R                     R                  5       nUR                  U
R                  5      nUR                   S   U R                  :X  a  [        R                  R                  U
U R                  U
R                   S   -
  S45      nUR                  U R                  X5        U R!                  U R#                  U
5      SS U24   5      n
GO6UR                  U R                  X5      nUR                  U R"                  R$                  R                  5      n[&        R(                  " XR"                  R$                  S S 2SS S 24   -  SS9n
U R*                  (       a  XR"                  R,                  -  n
U R!                  U
5      R                  U5      R                  S5      n
O][&        R.                  " XPR0                  U R2                  4U
R                  US9nU R!                  U R#                  U
5      SS U24   5      n
Ub  XR                  S5      -  n
U R5                  U
R                  SS5      5      n[&        R6                  " XR8                  U R2                  U R2                  /SS9u  nnnU R;                  U5      n[        R                  R=                  U5      R                  SS5      n[&        R>                  " U R@                  RC                  5       5      * n[&        R>                  " US S S 2S S S 24   US S 2S S 2S S 2S 4   -  5      nUS S 2S S 2S S 2S 4   US S 2S S S 2S S 24   RC                  5       -  nUU
S S 2S S 2S S 2S 4   RC                  5       -  nU RD                  (       a  U RF                  (       a  Uc  [I        UR                  SS5      UR                  SS5      5      nUUR                  S5      -  RK                  S5      R                  SS5      nUXRL                  S S S 2S 4   -  -   nUU R!                  U5      -  nO/ n[O        U5       H  nUS S 2S S 2US S 24   U-  US S 2S S 2US S 24   -   n[&        RP                  " UR                  U5      US S 2US S 24   R                  S5      5      nURS                  US S 2S S 2S4   5        M     [&        RT                  " USS9nUXRL                  S S S 2S 4   -  -   nUU R!                  U5      -  nUb(  UR                  U R                     RW                  U5        U RY                  UR                  SS5      5      nU$ )	Nr   rt   r   r   rK   .r0   r	   )-r   r-   r   r   r   r   r8   rF   clonerN   r.   r6   r   r   r   rR   r   r|   r   r9   sumrz   rp   r<   r2   r4   r   r   ry   r   softplusr   r   r   r   r   r   r   r   r:   matmulr?   stackcopy_r   )r@   input_statesr   rH   r   
batch_sizeseq_lenrA   r-   r   r   r   rC   rB   r   r   r   r   r   r   
discrete_A
discrete_BdeltaB_uhsscan_outputr   ir   s                               r%   slow_forwardMambaMixer.slow_forwardh  s   !-!3!3
Q""<<5??1E.44QA4>%),D,DQ,GGM #$//?EEGI!]%9%9:I ##A&$*?*??]]..!**]-@-@-DDaH

 ..t~~zZ $])CC'M)R S);;DNNMj
']]4;;+=+=+D+DE
 %		*{{7I7I!QPQ'7R*RXZ [%%![[%5%55M $ 7 : :5 A K KB O33T5H5HI$++5I !HHT[[%?XgX%NOM%),D,DQ,GGM ]%<%<Q%BC++00$2E2EtGZGZ[ac
	1a "\\)4]]334FGQQRSUVW YYtzz'')**YYqq$!125G1aQU5VVW
'1a61dAq=9I9O9O9QQ
aAtm < B B DD ,2Fz++Aq183E3Ea3KLBB/88;EEaKK%tQ}8M(MMK%6KL7^&q!Qz2Y>!QPQST*AUU	#ll9<<+>!Q'
@T@TUW@XY##K1a$89 $  ++l;K%a9N)NOK&$7K'''7==iH !%k.C.CAq.I J$$r'   c                 H   [        5       u  pV[        [        [        Xe[        45      nU(       ac  SU R
                  R                  R                  R                  ;   a5  [        R                  R                  5       (       d  U R                  XX45      $ U R                  XX45      $ )Ncuda)r&   r   r   r   r   r   r   r.   typer9   r=   is_compilingr   r   )r@   r   r   rH   r   r    r   r   s           r%   forwardMambaMixer.forward  s     2J1K.!$#%68H`no"
 "f0B0B0I0I0N0N&NW\WdWdWqWqWsWs,,].ii  n]]r'   )r   r   r   r~   r+   r|   r6   r   rx   r   r2   rF   r   r4   ry   r   rz   r   r   r   )r]   r^   r_   r`   ra   r   rd   rD   r   r9   rf   r   r)   rg   r   r   r   rh   __classcell__r   s   @r%   rk   rk      s!   )({ )(s )(V6 .25959d%||d% z*d% !!1!12	d%
 !!1!12d%NO%x
7K O%aijojzjza{ O%  S[  \a  \l  \l  Sm O%j .25959^ z*^ !!1!12	^
 !!1!12^ ^r'   rk   c                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )MambaRMSNormi  c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z<
MambaRMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm
N)rw   rD   r   r   r9   r   r   variance_epsilon)r@   rx   epsr   s      r%   rD   MambaRMSNorm.__init__  s/     	ll5::k#:; #r'   c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )Nrt   rK   T)keepdim)	r-   rN   r9   r   powmeanrsqrtr   r   )r@   r   input_dtypevariances       r%   r   MambaRMSNorm.forward  sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r'   c                 R    U R                   R                  S    SU R                   3$ )Nr   z, eps=)r   r   r   r@   s    r%   
extra_reprMambaRMSNorm.extra_repr  s*    ++##A&'vd.C.C-DEEr'   )r   r   )gư>)	r]   r^   r_   r`   rD   r   r   rh   r   r   s   @r%   r   r     s    $;F Fr'   r   c                      ^  \ rS rSrU 4S jr   SS\\   S\\R                     S\\R                     4S jjr	Sr
U =r$ )	
MambaBlocki  c                    > [         TU ]  5         Xl        X l        UR                  U l        [        UR                  UR                  S9U l        [        XS9U l
        g )Nr   rF   )rw   rD   r+   rF   residual_in_fp32r   rx   layer_norm_epsilonnormrk   mixer)r@   r+   rF   r   s      r%   rD   MambaBlock.__init__  sL    " & 7 7 !3!39R9RS	<
r'   r   rH   r   c                 
   UnU R                  UR                  U R                   R                  R                  S95      nU R                  (       a  UR                  [
        R                  5      nU R                  XX4S9nXQ-   nU$ )Nrv   r   rH   r   )r   rN   r   r-   r   r9   r   r   )r@   r   r   rH   r   residuals         r%   r   MambaBlock.forward  sx     !		-"2"29I9I9O9O"2"PQ  {{5==1H

^ # 
 !0r'   )r+   rF   r   r   r   r   )r]   r^   r_   r`   rD   r   r)   r9   rg   r   rh   r   r   s   @r%   r   r     sV    = .25959 z* !!1!12	
 !!1!12 r'   r   c                   :    \ rS rSr% \\S'   SrSS/rSrSr	S r
Srg	)
MambaPreTrainedModeli  r+   backboner   rk   Tc                 .   U R                   R                  n[        U[        5      (       Ga.  [        R
                  " SUR                  S-   [        R                  S9SSS24   nUR                  UR                  S5      R                  5       nUR                  R                  [        R                  " U5      5        UR                  R                  R!                  S5        U R                   R"                  S-  U R                   R$                  -  nU R                   R&                  S:X  a5  [(        R*                  R-                  UR.                  R0                  U5        OPU R                   R&                  S:X  a6  [(        R*                  R3                  UR.                  R0                  U* U5        [        R4                  " [        R6                  " U R                   R                  5      [8        R                  " U R                   R:                  5      [8        R                  " U R                   R<                  5      -
  -  [8        R                  " U R                   R<                  5      -   5      R?                  U R                   R@                  S	9nU[        R                  " [        RB                  " U* 5      * 5      -   nUR.                  RD                  R                  U5        S
UR.                  RD                  l#        [(        R*                  RI                  URJ                  R0                  [8        RL                  " S5      S9  URJ                  RD                  bY  [O        URJ                  RD                  SS5      (       d3  [(        R*                  RQ                  URJ                  RD                  5        [(        R*                  RI                  URR                  R0                  [8        RL                  " S5      S9  U R                   RT                  (       aC  URR                  R0                  nU[8        RL                  " U R                   RV                  5      -  n[        U[(        RX                  5      (       a  [O        UR0                  SS5      (       d(  [(        R*                  R[                  UR0                  US9  URD                  bG  [O        URD                  SS5      (       d*  [(        R*                  RQ                  URD                  5        ggg[        U[\        5      (       a&  UR0                  R                  R!                  S5        g[        U[(        R^                  5      (       a)  [(        R*                  R[                  UR0                  US9  gg)zInitialize the weights.r   rv   NrK   g      ?g      constantrandom)minT   )a
_no_reinitF)std)0r+   initializer_range
isinstancerk   r9   r   r4   r   r   r2   r   r   r   r   r   datafill_ry   time_step_scaletime_step_init_schemer   init	constant_r   r   uniform_r   randmathtime_step_maxtime_step_minrO   time_step_floorexpm1rp   r  kaiming_uniform_r|   sqrtgetattrzeros_r   rescale_prenorm_residualr;   r   normal_r   	Embedding)r@   moduler  r   dt_init_stddtinv_dtps           r%   _init_weights"MambaPreTrainedModel._init_weights  s   kk++fj)) Q 5 5 9OPTVWPWXA1126AACALLuyy|,HHMM$++44d:T[[=X=XXK{{00J>!!&.."7"7E22h>  !6!6kR

4;;88988DKK556$++B[B[9\\^((4;;4456 e33e4	  %))U[["%5$566FNN%%f--1FNN*GG$$V]]%9%9TYYq\$J}}!!-v}}11<GGGGNN6==#5#56GG$$V__%;%;tyy|$L{{33 OO**TYYt{{<<==fbii((6==,>>37{{&v{{L%@@GGNN6;;/ A ' --MM$$S)--GGOOFMMsO3 .r'   ri   N)r]   r^   r_   r`   r   __annotations__base_model_prefix_no_split_modulessupports_gradient_checkpointing_is_statefulr!  rh   ri   r'   r%   r   r     s)    "%|4&*#L84r'   r   z,
    Class for the MAMBA model outputs.
    )custom_introc                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\   \	S'   Sr\\\R                        \	S'   Srg)MambaOutputi=  a%  
cache_params (`MambaCache`):
    The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
    avoid providing the old `input_ids`.

    Includes both the State space model state matrices after the selective scan, and the Convolutional states
Nlast_hidden_stater   r   ri   )r]   r^   r_   r`   ra   r+  r   r9   FloatTensorr#  r   r)   r   tuplerh   ri   r'   r%   r*  r*  =  sH     6:x 1 129)-L(:&-8<M8E%"3"345<r'   r*  zK
    Base class for causal language model (or autoregressive) outputs.
    c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\   \	S'   Sr\\\R                        \	S'   Srg)	MambaCausalLMOutputiQ  az  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
cache_params (`MambaCache`):
    The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
    avoid providing the old `input_ids`.

    Includes both the State space model state matrices after the selective scan, and the Convolutional states
Nlosslogitsr   r   ri   )r]   r^   r_   r`   ra   r0  r   r9   r,  r#  r1  r   r)   r   r-  rh   ri   r'   r%   r/  r/  Q  s\    
 )-D(5$$
%,*.FHU&&'.)-L(:&-8<M8E%"3"345<r'   r/  c                     ^  \ rS rSrU 4S jrS rS rS r\        SS\	\
R                     S\	\
R                     S\	\   S	\	\   S
\	\   S\	\   S\	\
R                     S\	\
R                     S\\\4   4S jj5       rSrU =r$ )
MambaModelij  c           
        > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[        XS9PM     sn5      U l        SU l        [        UR
                  UR                  S9U l        U R!                  U R"                  5        U R%                  5         g s  snf )Nr   Fr   )rw   rD   r   r  
vocab_sizerx   
embeddings
ModuleListr:   r;   r   layersgradient_checkpointingr   r   norm_f"_register_load_state_dict_pre_hook	load_hook	post_init)r@   r+   idxr   s      r%   rD   MambaModel.__init__l  s     ,,v'8'8&:L:LMmmRWX^XpXpRq$rRq3Z%FRq$rs&+#"6#5#56;T;TU//? %ss   (Cc                 l    U H.  nSU;   d  M  UR                  U5      XR                  SS5      '     g    g )Nz
embedding.zembeddings.)popreplace)r@   
state_dictprefixargsks        r%   r<  MambaModel.load_hookx  s4    Aq EO^^TUEV
99\=AB r'   c                     U R                   $ rV   r6  r   s    r%   get_input_embeddingsMambaModel.get_input_embeddings~  s    r'   c                     Xl         g rV   rI  r@   new_embeddingss     r%   set_input_embeddingsMambaModel.set_input_embeddings  s    (r'   	input_idsinputs_embedsr   	use_cacheoutput_hidden_statesreturn_dictrH   r   rI   c	                    Ub  UOU R                   R                  nUb  UO(U R                  (       d  U R                   R                  OSnUb  UOU R                   R                  nUSL USL-  (       a  [        S5      eUc  U R                  U5      nU R                  (       a  U R                  (       a	  U(       a  SnU(       a  Ucn  [        U R                   UR                  S5      UR                  UR                  S9n[        R                  " SU R                   R                  UR                  S9nOUc  [        S5      eOSnUn	U(       a  SOSn
U R                   H  nU" U	UUUS	9n	U(       d  M  X4-   n
M     U R!                  U	5      n	U(       a  X4-   n
U(       d  [#        S
 XU
4 5       5      $ [%        U	U(       a  UU
S9$ SU
S9$ )ay  
cache_params (`MambaCache`, *optional*):
    If passed along, the model uses the previous state in all the blocks (which will give the output for the
    `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
use_cache (`bool`, *optional*):
    If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
NFz:You must specify exactly one of input_ids or inputs_embedsr   r0   r.   zYou have to specify the `cache_position` manually when `use_cache=True` and `cache_params` is passed, you don't have to pass a `cache_params` if you are in prefilling stage because in that case it will be initialized for you automaticallyri   r   c              3   .   #    U  H  oc  M  Uv   M     g 7frV   ri   ).0vs     r%   	<genexpr>%MambaModel.forward.<locals>.<genexpr>  s     f$Tq$Ts   	)r+  r   r   )r+   rT  r   rS  use_return_dict
ValueErrorr6  r9  r)   r   r.   r-   r9   r   r5   r8  r:  r-  r*  )r@   rQ  rR  r   rS  rT  rU  rH   r   r   all_hidden_statesmixer_blocks               r%   r   MambaModel.forward  s   ( %9$D $++JjJj 	 "+!6IZ^ZgZgT[[=R=Rmr	%0%<k$++B]B]-t";<YZZ  OOI6M&&4==YI#)KK!3!3A!6}?S?S[h[n[n  "'a1H1HQ^QeQe!f' !; 	 (  L%"6BD;;K')--	M $#$58H$H! ' M2 14D Df]BS$Tfff+)2+
 	
8<+
 	
r'   )r6  r9  r8  r:  )NNNNNNNN)r]   r^   r_   r`   rD   r<  rJ  rO  r   r   r9   rg   r)   boolr   r-  r*  r   rh   r   r   s   @r%   r3  r3  j  s    
)  1548-1$(/3&*5959L
E,,-L
   0 01L
 z*	L

 D>L
 'tnL
 d^L
 !!1!12L
 !!1!12L
 
uk!	"L
 L
r'   r3  z
    The MAMBA Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c                     ^  \ rS rSrS/rU 4S jrS rS r SS\S\	\
\4   S\S	\	\
\4   4S
 jjr     SS\\   S\\R"                     S\\R"                     4S jjr\         SS\\R"                     S\\R"                     S\\R(                     S\\   S\\R"                     S\\   S\\   S\\   S\\R,                     S	\\\4   4S jj5       rSrU =r$ )MambaForCausalLMi  zlm_head.weightc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g )NFru   )
rw   rD   r3  r   r   r   rx   r5  lm_headr=  )r@   r+   r   s     r%   rD   MambaForCausalLM.__init__  sF     "6*yy!3!3V5F5FUSr'   c                 6    U R                   R                  5       $ rV   )r   rJ  r   s    r%   rJ  %MambaForCausalLM.get_input_embeddings  s    }}1133r'   c                 8    U R                   R                  U5      $ rV   )r   rO  rM  s     r%   rO  %MambaForCausalLM.set_input_embeddings  s    }}11.AAr'   outputsmodel_kwargsnum_new_tokensrI   c                    UR                  SS 5      US'   UR                  SS5      (       a  SU;   a  US   b  US   SS  U-   US'   SU;   a<  US   n[        R                  " XUR                  UR                  S   S45      /SS	9US'   U$ )
Nr   rS  TrH   rK   r   r   r   r   )getr9   catnew_onesr   )r@   rl  rm  rn  kwargsr   s         r%   #_update_model_kwargs_for_generation4MambaForCausalLM._update_model_kwargs_for_generation  s     (/{{>4'H^$[$// L0-.:-9:J-KBC-PSa-aL)*|+)*:;N-2YY!8!8.:N:Nq:QST9U!VW]_.L)* r'   r   rH   r   c                 z   SUR                  5       0nU(       a  Uc  [        R                  " SU R                  R                  R
                  UR                  S9nUb  SU0nUR                  S5      n	OUR                  S5      n	[        U R                  R                  XR                  U R                  S9nU(       a4  US   S:  a+  US S 2S4   R                  S5      R                  5       US'   S nU(       d  Ub  SU0nUR                  UUUUS.5        UR                  5        H  u  pX;  d  M  XU
'   M     U$ )NrQ  r   rW  rR  r0   rK   )r   rS  rH   r   )r   r9   r   r   r+   r5   r.   r   r)   r-   r   updateitems)r@   rQ  rR  rS  r   rH   r   rs  model_inputsr,   keyvalues               r%   prepare_inputs_for_generation.MambaForCausalLM.prepare_inputs_for_generation  s3    $Y%9%9%;<-
 #\\!T]]-A-A-M-MV_VfVfgN( /?!.!3!3A!6!*!2%dmm&:&:NS^S^fjfpfpqL*Q.(1!R%(8(B(B2(F(Q(Q(SL%!N]6+];L ,&"0"0		
 !,,.JC&$)S! ) r'   rQ  rR  labelsrT  rU  rS  c
                    Ub  UOU R                   R                  nU R                  UUUUUUU	US9nUS   nU R                  UR	                  U R                  R
                  R                  5      5      R                  5       nSnUb  UR	                  UR                  5      nUSSS2SS24   R                  5       nUSSS24   R                  5       n[        5       nU" UR                  SUR                  S5      5      UR                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
cache_params (`MambaCache`, *optional*):
    If passed along, the model uses the previous state in all the blocks (which will give the output for the
    `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
use_cache (`bool`, *optional*):
    If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
N)r   rR  rT  rU  rS  rH   r   r   .rK   r   )r0  r1  r   r   )r+   r]  r   rf  rN   r   r-   r   r.   r   r   r   r   r/  r   r   )r@   rQ  r   rR  r   r~  rT  rU  rS  rH   rs  mamba_outputsr   r1  r0  shift_logitsshift_labelsloss_fctoutputs                      r%   r   MambaForCausalLM.forward-  s_   2 &1%<k$++B]B]%'!5#)) & 	
 &a(m..t||/B/B/H/HIJPPRYYv}}-F!#ssA+.99;L!#qr'?557L')HL--b,2C2CB2GH,J[J[\^J_`DYqr!22F)-)9TGf$EvE"&33'55	
 	
r'   )r   rf  )r   )NNNNN)	NNNNNNNNN)r]   r^   r_   r`   _tied_weights_keysrD   rJ  rO  r   dictre   r   rd   rt  r   r)   r9   rg   r|  r   r,  rb  rf   r   r-  r/  r   rh   r   r   s   @r%   rd  rd    s    ++4B YZ"26sCx.RU	c3h, -15959.
 z*. !!1!12. !!1!12.`  155959-1-1/3&*$(15<
E,,-<
 !!1!12<
   1 12	<

 z*<
 ))*<
 'tn<
 d^<
 D><
 !.<
 
u))	*<
 <
r'   rd  )rd  r3  r   r)   )8ra   r  dataclassesr   typingr   r   r   r9   r   torch.nnr   activationsr
   configuration_utilsr   
generationr   modeling_layersr   modeling_utilsr   utilsr   r   r   utils.import_utilsr   r   r   r   configuration_mambar   
get_loggerr]   r   mambapy.pscanr   &mamba_ssm.ops.selective_scan_interfacer   r   +mamba_ssm.ops.triton.selective_state_updater   r!   r&   r)   Modulerk   r   r   r   r*  r/  r3  rd  __all__ri   r'   r%   <module>r     s     ! ' '   % ! 3 ) 9 - 
  - 
		H	%#EXR@P=-~  &d/ d/NQ^ Q^hF299 F(+ 8 ?4? ?4 ?4D 
=+ = = 
=+ = =& f
% f
 f
R P
+_ P
P
f Sr'   