
    cCil.                     >   S SK JrJr  S SKrS SKJr  SSKJrJr  SSKJ	r	  SSK
JrJr  SSKJr  SS	KJrJr  SS
KJr  SSKJrJrJrJrJr  SSKJr  \R8                  " \5      r " S S\5      r " S S\5      r  " S S\5      r! " S S\5      r" " S S\5      r#/ SQr$g)    )OptionalUnionN)nn   )CacheDynamicCache)create_causal_mask)BaseModelOutputWithPastCausalLMOutputWithPast)Unpack)TransformersKwargslogging)deprecate_kwarg   )LlamaAttentionLlamaDecoderLayerLlamaForCausalLM
LlamaModelLlamaPreTrainedModel   )GraniteConfigc                   B   ^  \ rS rSrSrSS\S\\   4U 4S jjjrSr	U =r
$ )GraniteAttention(   z=Multi-headed attention from 'Attention Is All You Need' paperconfig	layer_idxc                 F   > [         TU ]  X5        UR                  U l        g N)super__init__attention_multiplierscalingselfr   r   	__class__s      e/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/granite/modular_granite.pyr    GraniteAttention.__init__+   s    +22    )r"   r   )__name__
__module____qualname____firstlineno____doc__r   r   intr    __static_attributes____classcell__r%   s   @r&   r   r   (   s"    G3} 3# 3 3r(   r   c                     ^  \ rS rSrS\S\4U 4S jjr\" SSSS9       SS	\R                  S
\
\R                     S\
\R                     S\
\   S\
\   S\
\   S\
\R                     S\
\\R                  \R                  4      S\\R                  \
\\R                  \R                  4      4   4S jj5       rSrU =r$ )GraniteDecoderLayer0   r   r   c                 b   > [         TU ]  X5        UR                  U l        [        XS9U l        g )N)r   r   )r   r    residual_multiplierr   	self_attnr#   s      r&   r    GraniteDecoderLayer.__init__1   s*    +#)#=#= )Mr(   past_key_valuepast_key_valuesz4.58)new_nameversionhidden_statesattention_maskposition_idsoutput_attentions	use_cachecache_positionposition_embeddingsreturnc	                    Un
U R                  U5      nU R                  " SUUUUUUUUS.U	D6u  pXU R                  -  -   nUn
U R                  U5      nU R	                  U5      nXU R                  -  -   nU4nU(       a  X4-  nU$ )at  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`, *optional*):
        attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
        query_sequence_length, key_sequence_length)` if default attention is used.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    past_key_values (`Cache`, *optional*): cached past key and value projection states
    cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
        Indices depicting the position of the input sequence tokens in the sequence
    position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
        Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
        with `head_dim` being the embedding dimension of each attention head.
    kwargs (`dict`, *optional*):
        Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
        into the model
)r=   r>   r?   r:   r@   rA   rB   rC    )input_layernormr7   r6   post_attention_layernormmlp)r$   r=   r>   r?   r:   r@   rA   rB   rC   kwargsresidualself_attn_weightsoutputss                r&   forwardGraniteDecoderLayer.forward6   s    F !,,]; ,0>> 
,
')%+/) 3
,
 
,
( !43K3K#KK !55mD/ 43K3K#KK "++Gr(   )r6   r7   )NNNFFNN)r)   r*   r+   r,   r   r.   r    r   torchTensorr   
LongTensorr   booltupleFloatTensorrN   r/   r0   r1   s   @r&   r3   r3   0   s   N} N N
 %0A6R 2637+/,1$)59KO?||? !.? u//0	?
 "%? $D>? D>? !!1!12? &eELL%,,,F&GH? 
u  (51B1BEDUDU1U+V"WW	X? S?r(   r3   c                       \ rS rSrSrg)GranitePreTrainedModely   rF   N)r)   r*   r+   r,   r/   rF   r(   r&   rW   rW   y   s    r(   rW   c                     ^  \ rS rSrS\4U 4S jjr         SS\\R                     S\\R                     S\\R                     S\\
   S\\R                     S	\\   S
\\   S\\   S\\R                     S\\   S\4S jjrSrU =r$ )GraniteModel}   r   c           	         > [         TU ]  U5        UR                  U l        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        g s  snf r   )	r   r    embedding_multiplierr   
ModuleListrangenum_hidden_layersr3   layersr#   s      r&   r    GraniteModel.__init__~   sV     $*$?$?!mmEJ6KcKcEdeEd	 3Ede
es   	A+	input_idsr>   r?   r:   inputs_embedsrA   r@   output_hidden_statesrB   rJ   rD   c
                 ,   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUS L US L-  (       a  [	        S5      eU R
                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUc  U R                  U5      nXPR                  -  nU(       a  Uc  [        U R                   S9nU	cD  Ub  UR                  5       OSn[        R                  " XUR                  S   -   UR                   S9n	Uc  U	R#                  S5      n[%        U R                   UUU	UUS9nUnU R'                  X5      nU(       a  S	OS nU(       a  S	OS nU R(                  S U R                   R*                    H7  nU(       a  X4-  nU" U4UUUUUU	US
.U
D6nUS   nU(       d  M.  UUS   4-  nM9     U R-                  U5      nU(       a  X4-  n[/        UU(       a  UOS UUS9$ )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.F)r   r   r   )device)r   input_embedsr>   rB   r:   r?   rF   )r>   r?   r:   r@   rA   rB   rC   )last_hidden_stater:   r=   
attentions)r   r@   re   rA   
ValueErrorgradient_checkpointingtrainingloggerwarning_onceembed_tokensr]   r   get_seq_lengthrP   arangeshaperg   	unsqueezer	   
rotary_embra   r`   normr
   )r$   rc   r>   r?   r:   rd   rA   r@   re   rB   rJ   past_seen_tokenscausal_maskr=   rC   all_hidden_statesall_self_attnsdecoder_layerlayer_outputss                      r&   rN   GraniteModel.forward   sB    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M%(A(AA0*$++>O!CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L(;;&))+%
 & #oomJ #7BD0d![[)H4;;+H+HIM#!%55!)
*) /"3#-$7
 
M *!,M  =#3"55' J* 		-0  !11&+/8Od+%	
 	
r(   )r]   ra   )	NNNNNNNNN)r)   r*   r+   r,   r   r    r   rP   rR   rQ   r   rU   rS   r   r   r
   rN   r/   r0   r1   s   @r&   rZ   rZ   }   s    
} 
 151537+/59$(,0/359_
E,,-_
 !._
 u//0	_

 "%_
   1 12_
 D>_
 $D>_
 'tn_
 !!1!12_
 +,_
 
!_
 _
r(   rZ   c                   h   \ rS rSr           SS\\R                     S\\R                     S\\R                     S\\\	\
\R                     4      S\\R                     S\\R                     S	\\   S
\\   S\\   S\\R                     S\\\R                  4   S\\   S\4S jjrSrg)GraniteForCausalLM   Nrc   r>   r?   r:   rd   labelsrA   r@   re   rB   logits_to_keeprJ   rD   c                     Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	U R                  " SUUUUUUUU	U
S.	UD6nUR                  n[        U[        5      (       a  [        U* S 5      OUnU R                  US S 2US S 24   5      nUU R                   R                  -  nS nUb)  U R                  " SUX`R                   R                  S.UD6n[        UUUR                  UR                  UR                  S9$ )N)	rc   r>   r?   r:   rd   rA   r@   re   rB   )logitsr   
vocab_size)lossr   r:   r=   rj   rF   )r   r@   re   modelri   
isinstancer.   slicelm_headlogits_scalingloss_functionr   r   r:   r=   rj   )r$   rc   r>   r?   r:   rd   r   rA   r@   re   rB   r   rJ   rM   r=   slice_indicesr   r   s                     r&   rN   GraniteForCausalLM.forward   s+    2C1N-TXT_T_TqTq$8$D $++JjJj 	
 ,0:: ,
)%+'/!5),
 ,
  118B>SV8W8W~ot4]kmA}a,?@A$++444%%pVF{{OeOepiopD%#33!//))
 	
r(   rF   )NNNNNNNNNNr   )r)   r*   r+   r,   r   rP   rR   rQ   r   r   listrU   rS   r.   r   r   r   rN   r/   rF   r(   r&   r   r      s)    151537KO59-1$(,0/359342
E,,-2
 !.2
 u//0	2

 "%tE4E4E/F(F"GH2
   1 122
 ))*2
 D>2
 $D>2
 'tn2
 !!1!122
 c5<</02
 +,2
 
 2
 2
r(   r   )r   rZ   rW   )%typingr   r   rP   r   cache_utilsr   r   masking_utilsr	   modeling_outputsr
   r   processing_utilsr   utilsr   r   utils.deprecationr   llama.modeling_llamar   r   r   r   r   configuration_graniter   
get_loggerr)   rn   r   r3   rW   rZ   r   __all__rF   r(   r&   <module>r      s     #   . / O & 0 0  1 
		H	%3~ 3F+ FR	1 	g
: g
T3
) 3
l Kr(   