
    cCi                     J   S SK JrJr  S SKrS SKJr  SSKJr  SSKJr  SSK	J
r
  SSKJr  SS	KJr  S
SKJrJrJrJr  SSKJr  \R,                  " \5      r " S S\SS9r " S S\R4                  5      r " S S\5      r " S S\5      r " S S\5      r " S S\5      r/ SQr g)    )Optional	TypedDictN)nn   )ACT2FN)Cache)Unpack)logging)deprecate_kwarg   )GraniteMoeDecoderLayerGraniteMoeForCausalLMGraniteMoeModelGraniteMoePreTrainedModel   )GraniteMoeSharedConfigc                       \ rS rSr% Sr\R                  \S'   \R                  \S'   \\S'   \\S'   \R                  \S'   Sr
g	)
GraniteFlashAttentionKwargs&   aR  
Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
Use cases include padding-free training and fewer `torch.compile` graph breaks.

Attributes:
    cu_seq_lens_q (`torch.LongTensor`)
        Gets cumulative sequence length for query state.
    cu_seq_lens_k (`torch.LongTensor`)
        Gets cumulative sequence length for key state.
    max_length_q (`int`):
        Maximum sequence length for query state.
    max_length_k (`int`):
        Maximum sequence length for key state.
    seq_idx (`torch.IntTensor):
        Index of each packed sequence.
cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kseq_idx N)__name__
__module____qualname____firstlineno____doc__torch
LongTensor__annotations__int	IntTensor__static_attributes__r       w/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/granitemoeshared/modular_granitemoeshared.pyr   r   &   s7    " ######__r'   r   F)totalc                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
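

# Illustrative sketch (an assumption, not part of the original module): how a caller
# might build `GraniteFlashAttentionKwargs` for a padding-free (packed) batch. The
# per-sequence lengths below are made up; the dtypes follow the annotations above.
#
#     seq_lens = torch.tensor([3, 5, 2])  # lengths of the sequences packed into one row
#     cu_seq_lens = torch.cat([torch.zeros(1, dtype=torch.long), seq_lens.cumsum(0)])  # tensor([0, 3, 8, 10])
#     flash_kwargs = GraniteFlashAttentionKwargs(
#         cu_seq_lens_q=cu_seq_lens,
#         cu_seq_lens_k=cu_seq_lens,
#         max_length_q=int(seq_lens.max()),
#         max_length_k=int(seq_lens.max()),
#         seq_idx=torch.repeat_interleave(torch.arange(len(seq_lens), dtype=torch.int32), seq_lens),
#     )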
U =r$ )	GraniteMoeSharedMLP?   zj
MLP layer for shared experts

Args:
    config:
        Configuration object with model hyperparameters.
configc                 X  > [         TU ]  5         UR                  U l        UR                  U l        [
        UR                     U l        [        R                  " U R                  U R                  S-  SS9U l
        [        R                  " U R                  U R                  SS9U l        g )Nr   F)bias)super__init__hidden_size
input_sizeshared_intermediate_sizer   
hidden_act
activationr   Linearinput_linearoutput_linearselfr-   	__class__s     r(   r1   GraniteMoeSharedMLP.__init__H   s     ,,!:: !2!23IIdoot7G7G!7KRWXYYt'7'7uUr'   hidden_statesreturnc                     U R                  U5      nUR                  SSS9nU R                  US   5      US   -  nU R                  U5      nU$ )Nr   )dimr   r   )r8   chunkr6   r9   )r;   r>   chunked_hidden_statess      r(   forwardGraniteMoeSharedMLP.forwardQ   s^    ))-8 - 3 3A2 3 >(=a(@ADYZ[D\\**=9r'   )r6   r2   r8   r3   r9   )r   r   r   r   r    r   r1   r!   TensorrE   r&   __classcell__r<   s   @r(   r+   r+   ?   s7    V5 VU\\ ell  r'   r+   c                     ^  \ rS rSrS\S\4U 4S jjr\" SSSS9        SS	\R                  S
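

# Illustrative sketch (an assumption, not part of the original module): the shared-experts
# MLP is a GLU-style block. `input_linear` computes the gate and up projections in a single
# matmul of width `2 * shared_intermediate_size`; `forward` splits them with `chunk` and
# returns `output_linear(activation(gate) * up)`. Shapes, for any `GraniteMoeSharedConfig`
# instance `config` with `shared_intermediate_size > 0`:
#
#     mlp = GraniteMoeSharedMLP(config)
#     x = torch.randn(2, 16, config.hidden_size)  # (batch, seq_len, hidden_size)
#     y = mlp(x)                                  # same shape as x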


class GraniteMoeSharedDecoderLayer(GraniteMoeDecoderLayer):
    def __init__(self, config: GraniteMoeSharedConfig, layer_idx: int):
        super().__init__(config, layer_idx)
        self.shared_mlp = None if config.shared_intermediate_size == 0 else GraniteMoeSharedMLP(config)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        output_router_logits: Optional[bool] = False,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs: Unpack[GraniteFlashAttentionKwargs],
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router
                loss and should not be returned during inference.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs. Can be used to provide `GraniteFlashAttentionKwargs` for
                padding-free training and/or to improve `torch.compile` performance.
        """
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + hidden_states * self.residual_multiplier

        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        moe_hidden_states, router_logits = self.block_sparse_moe(hidden_states)

        # Combine the sparse MoE output with the shared-experts MLP, when one is configured
        if self.shared_mlp is None:
            hidden_states = moe_hidden_states
        else:
            hidden_states = moe_hidden_states + self.shared_mlp(hidden_states)
        del moe_hidden_states

        hidden_states = residual + hidden_states * self.residual_multiplier

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if output_router_logits:
            outputs += (router_logits,)

        return outputs


class GraniteMoeSharedPreTrainedModel(GraniteMoePreTrainedModel):
    config: GraniteMoeSharedConfig
    _no_split_modules = ["GraniteMoeSharedDecoderLayer"]
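

# Illustrative sketch (an assumption, not part of the original module): the decoder layer
# returns a plain tuple whose tail depends on the output flags, so callers unpack it
# positionally. `hidden_states`, `cos` and `sin` stand in for tensors produced by the
# parent model (token embeddings and rotary position embeddings).
#
#     layer = GraniteMoeSharedDecoderLayer(config, layer_idx=0)
#     outputs = layer(
#         hidden_states,
#         position_embeddings=(cos, sin),
#         output_attentions=True,
#         output_router_logits=True,
#     )
#     hidden_states, attn_weights, router_logits = outputs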
                  5       Vs/ s H  n[        X5      PM     sn5      U l        g s  snf N)r0   r1   r   
ModuleListrangenum_hidden_layersrK   layersrP   s      r(   r1   GraniteMoeSharedModel.__init__   sI     mmNSTZTlTlNmnNm)&<Nmn
ns   A)ry   )r   r   r   r   r   r1   r&   rH   rI   s   @r(   rr   rr      s    
5 
 
r'   rr   c                   6   ^  \ rS rSrS/rS\4U 4S jjrSrU =r$ )GraniteMoeSharedForCausalLM   zlm_head.weightr-   c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g ru   )r0   r1   rr   model	post_initr:   s     r(   r1   $GraniteMoeSharedForCausalLM.__init__   s&     *62
r'   )r   )	r   r   r   r   _tied_weights_keysr   r1   r&   rH   rI   s   @r(   r|   r|      s    *+5  r'   r|   )r|   rr   rn   )!typingr   r   r!   r   activationsr   cache_utilsr   processing_utilsr	   utilsr
   utils.deprecationr   granitemoe.modeling_granitemoer   r   r   r   configuration_granitemoesharedr   
get_loggerr   loggerr   Moduler+   rK   rn   rr   r|   __all__r   r'   r(   <module>r      s     '   !   &  0  C 
		H	%)5 2")) 4U#9 Up9&? 9

O 
"7  fr'   