
    cCi-B                        S SK JrJr  S SKrS SKJr  SSKJr  SSKJrJ	r	  SSK
Jr  SSKJrJrJr  SS	KJr  S
SKJr  S
SKJrJrJr  S
SKJrJrJrJrJrJrJr  SSK J!r!  \RD                  " \#5      r$ " S S\5      r% " S S\5      r& " S S\5      r' " S S\5      r( " S S\5      r) " S S\5      r* " S S\5      r+ " S S\5      r,/ S Qr-g)!    )OptionalUnionN)nn   )Cache)BaseModelOutputWithPastMoeModelOutputWithPast)Unpack)auto_docstringcan_return_tuplelogging)deprecate_kwarg   )BambaConfig)
BambaMixerBambaRMSNormGated HybridMambaAttentionDynamicCache)GraniteFlashAttentionKwargsGraniteMoeSharedAttentionGraniteMoeSharedDecoderLayerGraniteMoeSharedForCausalLMGraniteMoeSharedMLPGraniteMoeSharedModelGraniteMoeSharedPreTrainedModel   )GraniteMoeHybridConfigc                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )GraniteMoeHybridAttention+   config	layer_idxc                 $   > [         TU ]  X5        g Nsuper__init__selfr    r!   	__class__s      w/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/granitemoehybrid/modular_granitemoehybrid.pyr&   "GraniteMoeHybridAttention.__init__,   s    +     	__name__
__module____qualname____firstlineno__r   intr&   __static_attributes____classcell__r)   s   @r*   r   r   +   s    ,5 ,# , ,r,   r   c                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )GraniteMoeHybridMambaLayer0   r    r!   c                 8   > [         TU ]  [        U5      U5        g r#   )r%   r&   r   r'   s      r*   r&   #GraniteMoeHybridMambaLayer.__init__1   s    V,i8r,   r-   r.   r6   s   @r*   r8   r8   0   s    95 9# 9 9r,   r8   c                   ,   ^  \ rS rSrSU 4S jjrSrU =r$ )GraniteMoeHybridRMSNormGated5   c                 $   > [         TU ]  X5        g r#   r$   )r(   hidden_sizeepsr)   s      r*   r&   %GraniteMoeHybridRMSNormGated.__init__6   s    *r,   r-   )gư>)r/   r0   r1   r2   r&   r4   r5   r6   s   @r*   r=   r=   5   s    + +r,   r=   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )GraniteMoeHybridMLP:   r    c                 $   > [         TU ]  U5        g r#   r$   r(   r    r)   s     r*   r&   GraniteMoeHybridMLP.__init__;   s     r,   r-   )r/   r0   r1   r2   r   r&   r4   r5   r6   s   @r*   rD   rD   :   s    !5 ! !r,   rD   c                     ^  \ rS rSrS\S\4U 4S jjr\" SSSS9       SS	\R                  S
\
\R                     S\
\   S\
\   S\
\   S\
\R                     S\
\   S\
\\R                  \R                  4      S\\   S\\R"                  \
\\R"                  \R"                  4      4   4S jj5       rSrU =r$ )GraniteMoeHybridDecoderLayer?   r    r!   c                   > [         TU ]  X5        [        U5      U l        S U l        S U l        UR                  U   S:X  a  [        X5      U l        O[        X5      U l        UR                  U   U l	        [        USS5      S:  U l        g )Nmambanum_local_expertsr   )r%   r&   rD   
shared_mlp	self_attnrM   layers_block_typer8   r   
layer_typegetattrhas_expertsr'   s      r*   r&   %GraniteMoeHybridDecoderLayer.__init__@   s    +-f5
##I.'93FFDJ6vIDN 229= #6+>BQFr,   past_key_valuepast_key_valuesz4.58)new_nameversionhidden_statesattention_maskoutput_attentions	use_cachecache_positionoutput_router_logitsposition_embeddingskwargsreturnc	                    Un
U R                  U5      nU R                  b  U R                  " SUUUUS.U	D6nSnOU R                  " SUUUUUUUS.U	D6u  pXU R                  -  -   nUn
U R	                  U5      nU R
                  (       a'  U R                  U5      u  pXR                  U5      -   nOU R                  U5      nSnXU R                  -  -   nU4nU(       a  X4-  nU(       a  X4-  nU$ )ap  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`, *optional*):
        attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
        query_sequence_length, key_sequence_length)` if default attention is used.
    past_key_values (`Cache`, *optional*): cached past key and value projection states
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
        Indices depicting the position of the input sequence tokens in the sequence
    output_router_logits (`bool`, *optional*):
        Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
        should not be returned during inference.
    position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
        Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
        with `head_dim` being the embedding dimension of each attention head.
    kwargs (`dict`, *optional*):
        Arbitrary kwargs.Can be used to provide `GraniteFlashAttentionKwargs` for
        padding-free training and/or improve torch.compile performance.
N)rZ   r^   cache_paramsr[   )rZ   r[   rW   r\   r]   r^   r`   r-   )input_layernormrM   rP   residual_multiplierpost_attention_layernormrT   block_sparse_moerO   )r(   rZ   r[   rW   r\   r]   r^   r_   r`   ra   residualself_attn_weightsmoe_hidden_statesrouter_logitsoutputss                  r*   forward$GraniteMoeHybridDecoderLayer.forwardP   s3   L !,,];::! JJ +-,-	
 M !%/3~~ 	0+- /"3#-$7	0 	0,M !43K3K#KK !55mD/3/D/D]/S,-0NNM OOM:M M 43K3K#KK "++G''Gr,   )rT   rR   rM   rP   rO   )NNFFNFN)r/   r0   r1   r2   r   r3   r&   r   torchTensorr   r   bool
LongTensortupler
   r   FloatTensorrn   r4   r5   r6   s   @r*   rJ   rJ   ?   s/   G5 G# G  %0A6R 26+/,1$)59/4KOU||U !.U "%	U
 $D>U D>U !!1!12U 'tnU &eELL%,,,F&GHU 45U 
u  (51B1BEDUDU1U+V"WW	XU SUr,   rJ   c                   >   ^  \ rS rSr% \\S'   S/rSrU 4S jrSr	U =r
$ )GraniteMoeHybridPreTrainedModel   r    rJ   Tc                   > [         TU ]  U5        [        U[        5      (       a  UR                  R
                  R                  S5        [        R                  " [        R                  " SUR                  S-   5      5      UR                  l        UR                  R
                  R                  S5        g [        U[        5      (       a&  UR                  R
                  R                  S5        g g )Ng      ?r   )r%   _init_weights
isinstancer8   dt_biasdatafill_rp   logarange	num_headsA_logDr=   weight)r(   moduler)   s     r*   rz   -GraniteMoeHybridPreTrainedModel._init_weights   s    f%f899NN%%c* %		%,,q&:J:JQ:N*O PFLLHHMM$ <==MM$$S) >r,   r-   )r/   r0   r1   r2   r   __annotations___no_split_modules_is_statefulrz   r4   r5   r6   s   @r*   rw   rw      s!    ""78L* *r,   rw   c                   |  ^  \ rS rSrS\4U 4S jjr\\           SS\\	R                     S\\	R                     S\\	R                     S\\\\\	R                     4      S\\	R                     S	\\   S
\\   S\\   S\\   S\\   S\\	R                     S\\   S\\\4   4S jj5       5       rS rSrU =r$ )GraniteMoeHybridModel   r    c           	         > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ s H  n[        X5      PM     sn5      U l        g s  snf r#   )r%   r&   r   
ModuleListrangenum_hidden_layersrJ   layersr'   s      r*   r&   GraniteMoeHybridModel.__init__   sI     mmNSTZTlTlNmnNm)&<Nmn
ns   A	input_idsr[   position_idsrW   inputs_embedsr]   r\   output_hidden_statesr_   return_dictr^   ra   rb   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU
b  U
OU R                   R                  n
US L US L-  (       a  [        S5      eU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUc  U R                  U5      nXPR                  -  nU(       a  Uc  [        R                  S5        UcD  Ub  UR                  5       OSn[        R                  " XUR                  S   -   UR                   S9nUc  UR#                  S5      nU R%                  X%XU5      nU R'                  X+5      nUnS nU R(                  b  U R)                  UU5      nU(       a  SOS nU(       a  SOS nU	(       a  SOS nU R*                   Hj  nUR,                  S	:X  a  UOUnU(       a  UU4-  nU" U4UUUUUU	US
.UD6nUS   nU(       a  US   b	  UUS   4-  nU	(       d  MY  US   c  Ma  UUS   4-  nMl     U R/                  U5      nU(       a  UU4-  nU(       a  UR0                  (       d  SUl        [3        UUUUUS9$ )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzGraniteMoeHybrid requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. Because one was not provided, no cache will be returned.r   r   devicer-   rM   )r[   rW   r\   r]   r^   r_   r`   T)last_hidden_staterW   rZ   
attentionsrl   )r    r\   r   r]   use_return_dict
ValueErrorgradient_checkpointingtrainingloggerwarning_onceembed_tokensembedding_multiplierget_seq_lengthrp   r   shaper   	unsqueeze_update_causal_mask_update_mamba_mask
rotary_embr   rR   normhas_previous_stater	   )r(   r   r[   r   rW   r   r]   r\   r   r_   r   r^   ra   past_seen_tokenscausal_mask
mamba_maskrZ   r`   all_hidden_statesall_self_attnsall_router_logitsdecoder_layer
layer_masklayer_outputss                           r*   rn   GraniteMoeHybridModel.forward   s   " 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<YZZ&&4==Yj I  --i8M%(A(AA 0K
 !CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L..>L]
 ,,^L
 &"??&"&//-"N #7BD0d"6BD![[M'4'?'?7'JP[J#!m%55!)
) /"3#-%9$7
 
M *!,M  #/"}Q'7&99N## $0%-*;)==%; )> 		-0  -!11?#E#E15O.%+++%+
 	
r,   c                 b    UnUS   S:  d!  Ub   [         R                  " US:H  5      (       a  SnU$ )zV
No need for zeroing states when
    1. Cached forward
    2. Attending to all inputs
r   Nr   )rp   all)r(   r[   r^   r   s       r*   r   (GraniteMoeHybridModel._update_mamba_mask6  s:     $
!q ^%?EIIn`aNaDbDbJr,   )r   )NNNNNNNNNNN)r/   r0   r1   r2   r   r&   r   r   r   rp   rs   rq   r   r   listru   rr   r
   r   rt   r   rn   r   r4   r5   r6   s   @r*   r   r      s]   
5 
  151537KO59$(,0/3/3&*59s
E,,-s
 !.s
 u//0	s

 "%tE4E4E/F(F"GHs
   1 12s
 D>s
 $D>s
 'tns
 'tns
 d^s
 !!1!12s
 45s
 
u--	.s
  s
j	 	r,   r   c                   L   ^  \ rS rSrS/rS\4U 4S jjr      SS jrSrU =r	$ )GraniteMoeHybridForCausalLMiB  zlm_head.weightr    c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r#   )r%   r&   r   model	post_initrG   s     r*   r&   $GraniteMoeHybridForCausalLM.__init__E  s&     *62
r,   c                    US L n	U	(       d]  Uc  US   UR                   S   :  a  US S 2UR                   S   * S 24   nOhUR                   S   UR                   S   :w  a	  US S 2U4   nO>U(       a7  [        U R                  UR                   S   U R                  U R                  S9nUbZ  UcW  UR                  5       R                  S5      S-
  nUR                  US:H  S5        U	(       d  US S 2UR                   S   * S 24   nUb  U	(       a  SU0n
OSUR                  5       0n
U
R                  UUUUUS.5        UR                  5        H  u  pX;  d  M  XU'   M     U
$ )Nr   r   r   r   r   r   )r   rW   r]   r[   r^   )r   r   r    dtyper   longcumsummasked_fill_
contiguousupdateitems)r(   r   rW   r[   r   r^   r   r]   ra   empty_past_kvmodel_inputskeyvalues                r*   prepare_inputs_for_generation9GraniteMoeHybridForCausalLM.prepare_inputs_for_generationK  sz    (4/ )!"%);;%a.*>*>q*A)A)C&CD	#~';';A'>>%a&78	>Y__Q/DKKO %,*>)..077;a?L%%n&91= +A	0B/B/D,DE $+];L')=)=)?@L ,#2&"0"0	
 !,,.JC&$)S! ) r,   )r   )NNNNNT)
r/   r0   r1   r2   _tied_weights_keysr   r&   r   r4   r5   r6   s   @r*   r   r   B  s7    *+5  = =r,   r   )r   r   rw   ).typingr   r   rp   r   cache_utilsr   modeling_outputsr   r	   processing_utilsr
   utilsr   r   r   utils.deprecationr   bamba.configuration_bambar   bamba.modeling_bambar   r   r   *granitemoeshared.modeling_granitemoesharedr   r   r   r   r   r   r   configuration_granitemoehybridr   
get_loggerr/   r   r   r8   r=   rD   rJ   rw   r   r   __all__r-   r,   r*   <module>r      s     #     O & > > 0 3 b b   C 
		H	%, 9 ,
9 9
+#4 +
!- !
g#? gT*&E *G1 GTF"= FR fr,   