
    bCimP                        S SK JrJr  S SKrS SKJr  SSKJrJr  SSK	J
r
Jr  SSKJrJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJrJr  SSKJr  SSKJrJ r J!r!J"r"J#r#J$r$J%r%J&r&  SSK'J(r(  \RR                  " \*5      r+ " S S\
5      r, " S S\$5      r- " S S\"5      r. " S S\5      r/ " S S\ 5      r0 " S S\#5      r1 " S S\(5      r2 " S S \!5      r3/ S!Qr4g)"    )CallableOptionalN   )CacheDynamicCache)PretrainedConfiglayer_type_validation)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)BaseModelOutputWithPast)rope_config_validation)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargslogging)deprecate_kwarg   )CohereAttentionCohereDecoderLayerCohereForCausalLMCohereLayerNormCoherePreTrainedModelCohereRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)Gemma2Modelc                      ^  \ rS rSrSrSrS/rSSSSSSSS.rS/S	/4S
S/S
/4S
/S
/4S.r                      SU 4S jjr	Sr
U =r$ )Cohere2Config/   a  
This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate an Cohere
model according to the specified arguments, defining the model architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information. Instantiating a configuration
with the defaults will yield a similar configuration to that of the [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) model.


Args:
    vocab_size (`int`, *optional*, defaults to 256000):
        Vocabulary size of the Cohere model. Defines the number of different tokens that can be represented by the
        `inputs_ids` passed when calling [`CohereModel`]
    hidden_size (`int`, *optional*, defaults to 8192):
        Dimension of the hidden representations.
    intermediate_size (`int`, *optional*, defaults to 22528):
        Dimension of the MLP representations.
    logit_scale (`float`, *optional*, defaults to 0.0625):
        The scaling factor for the output logits.
    num_hidden_layers (`int`, *optional*, defaults to 40):
        Number of hidden layers in the Transformer decoder.
    num_attention_heads (`int`, *optional*, defaults to 64):
        Number of attention heads for each attention layer in the Transformer decoder.
    num_key_value_heads (`int`, *optional*):
        This is the number of key_value heads that should be used to implement Grouped Query Attention. If
        `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
        `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
        converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
        by meanpooling all the original heads within that group. For more details, check out [this
        paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
        `num_attention_heads`.
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the decoder.
    max_position_embeddings (`int`, *optional*, defaults to 8192):
        The maximum sequence length that this model might ever be used with.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    layer_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the layer normalization.
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models). Only
        relevant if `config.is_decoder=True`.
    pad_token_id (`int`, *optional*, defaults to 0):
        Padding token id.
    bos_token_id (`int`, *optional*, defaults to 5):
        Beginning of stream token id.
    eos_token_id (`int`, *optional*, defaults to 255001):
        End of stream token id.
    tie_word_embeddings (`bool`, *optional*, defaults to `True`):
        Whether to tie weight embeddings
    rope_theta (`float`, *optional*, defaults to 10000.0):
        The base period of the RoPE embeddings.
    rope_scaling (`Dict`, *optional*):
        Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
        and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
        accordingly.
        Expected contents:
            `rope_type` (`str`):
                The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                'llama3'], with 'default' being the original RoPE implementation.
            `factor` (`float`, *optional*):
                Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                original maximum pre-trained length.
            `original_max_position_embeddings` (`int`, *optional*):
                Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                pretraining.
            `attention_factor` (`float`, *optional*):
                Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                computation. If unspecified, it defaults to value recommended by the implementation, using the
                `factor` field to infer the suggested value.
            `beta_fast` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                ramp function. If unspecified, it defaults to 32.
            `beta_slow` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                ramp function. If unspecified, it defaults to 1.
            `short_factor` (`list[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2
            `long_factor` (`list[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2
            `low_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
            `high_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
    attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
        Whether to use a bias in the query, key, value and output projection layers during self-attention.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    sliding_window (`int`, *optional*, defaults to 4096):
        Size of the sliding window attention context.
    layer_types (`list`, *optional*):
        Attention pattern for each layer.

```python
>>> from transformers import Cohere2Model, Cohere2Config

>>> # Initializing a Cohere Nextmodel configuration
>>> configuration = Cohere2Config()

>>> # Initializing a model from the Cohere2 configuration
>>> model = Cohere2Model(configuration) # doctest: +SKIP

>>> # Accessing the model configuration
>>> configuration = model.config # doctest: +SKIP
```
cohere2past_key_valuescolwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormc                   > Xl         Xl        X l        X@l        X0l        XPl        X`l        Uc  UnXpl        Xl        Xl	        Xl
        Xl        UU l        UU l        UU l        UU l        UU l        UU l        X&-  U l        ['        U 5        [(        TU ]T  " SUUUUS.UD6  UR-                  SS5      U l        U R"                  c_  [1        U SS5      U l        [3        U R
                  5       Vs/ s H'  n[5        US-   U R.                  -  5      (       a  SOSPM)     snU l        [7        U R"                  U R
                  5        g s  snf )N)pad_token_idbos_token_ideos_token_idtie_word_embeddingssliding_window_pattern      sliding_attentionfull_attention )
vocab_sizemax_position_embeddingshidden_sizelogit_scaleintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_heads
hidden_actinitializer_rangelayer_norm_eps	use_cache
rope_thetarope_scalingattention_biasattention_dropoutsliding_windowlayer_typeshead_dimr   super__init__get_sliding_window_patterngetattrrangeboolr	   )selfr7   r9   r;   r:   r<   r=   r>   r?   r8   r@   rA   rB   r-   r.   r/   r0   rC   rD   rE   rF   rG   rH   kwargsi	__class__s                            e/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/cohere2/modular_cohere2.pyrK   Cohere2Config.__init__   sb   4 %'>$&&!2!2#6  &"5#6 $!2,"$(,!2,&#: 	t$ 	
%%% 3		

 	
 (.zz2JA'N$#+249QST+UD( t556 6A (,QUd6R6R,R'S'S#Yii6 D 	d..0F0FG	 s   /.E)rM   rE   rF   rI   r?   r9   r@   r;   rA   rH   r:   r8   r=   r<   r>   rD   rC   rG   rB   r7   )i      i X  g      ?(   @   NsilurW   g{Gz?gh㈵>Tr      i Tg     @NF        i   N)__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planrK   __static_attributes____classcell__rT   s   @rU   r   r   /   s    n` J#4"5%.%.%.%."+ )"+ &(9:#%568IJ!"_$56   $ /IH IH    r   c                       \ rS rSrSrg)Cohere2RotaryEmbedding   r6   Nr]   r^   r_   r`   rf   r6   ri   rU   rk   rk          ri   rk   c                       \ rS rSrSrg)Cohere2LayerNormi  r6   Nrm   r6   ri   rU   rp   rp     rn   ri   rp   c                   X   \ rS rSrSrSS\S\\   4S jjr\	" SSS	S
9  SS\
R                  S\\
R                  \
R                  4   S\\
R                     S\\   S\\
R                     S\\   S\\
R                  \\
R                     \\\
R                        4   4S jj5       rSrg)Cohere2Attentioni  z=Multi-headed attention from 'Attention Is All You Need' paperNconfig	layer_idxc                    [         R                  R                  U 5        Xl        X l        [        USUR                  UR                  -  5      U l        UR                  UR                  -  U l
        U R                  S-  U l        UR                  U l        SU l        UR                  U   S:X  a  UR                  OS U l        [         R                   " UR                  UR                  U R                  -  UR"                  S9U l        [         R                   " UR                  UR                  U R                  -  UR"                  S9U l        [         R                   " UR                  UR                  U R                  -  UR"                  S9U l        [         R                   " UR                  U R                  -  UR                  UR"                  S9U l        g )NrI   g      Tr4   )bias)nnModulerK   rs   rt   rN   r9   r=   rI   r>   num_key_value_groupsscalingrF   	is_causalrH   rG   LinearrE   q_projk_projv_projo_proj)rQ   rs   rt   s      rU   rK   Cohere2Attention.__init__  ss   
		4 "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!97=7I7I)7TXk7kf33quii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
ri   past_key_valuer"   4.58new_nameversionr'   position_embeddingsr(   cache_positionrR   returnc                 d   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      nUu  pU R                  b  [        XX5      u  pUb$  XUS.nUR                  XU R                  U5      u  p[        nU R                  R                  S:w  a  [        U R                  R                     nU" U U	U
UU4U R                  (       d  SOU R                   U R"                  U R                  S.UD6u  nnUR$                  " / UQSP76 R'                  5       nU R)                  U5      nUU4$ )Nr3   r   )sincosr   eagerr\   )dropoutrz   rG   )shaperI   r}   view	transposer~   r   rG   r   updatert   r   rs   _attn_implementationr   trainingrF   rz   reshape
contiguousr   )rQ   r'   r   r(   r"   r   rR   input_shapehidden_shapequery_states
key_statesvalue_statesr   r   cache_kwargsattention_interfaceattn_outputattn_weightss                     rU   forwardCohere2Attention.forward   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&*';LVY'_$L&#&nUL'6'='=jX\XfXfht'u$J(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$2H2HLL..
%
 
%
!\ "));;;;FFHkk+.L((ri   )rF   rs   rI   r{   r~   rt   ry   r   r}   rz   rG   r   N)NN)r]   r^   r_   r`   ra   r   r   intrK   r   torchTensortupler   
LongTensorr   r   r   rf   r6   ri   rU   rr   rr     s    G
} 
# 
0 %0A6R ,059*)||*) #5<<#=>*) !.	*)
 "%*) !!1!12*) -.*) 
u||Xell3XeELL>Q5RR	S*) S*)ri   rr   c                   b  ^  \ rS rSrS\S\4U 4S jjr\" SSSS9    SS	\R                  S
\
\R                  \R                  4   S\\R                     S\\   S\\   S\\R                     S\\   S\
\R"                  \\
\R"                  \R"                  4      4   4S jj5       rSrU =r$ )Cohere2DecoderLayeriN  rs   rt   c                 L   > [         TU ]  X5        UR                  U   U l        g r   )rJ   rK   rH   attention_type)rQ   rs   rt   rT   s      rU   rK   Cohere2DecoderLayer.__init__O  s#    +$00;ri   r   r"   r   r   r'   r   r(   rB   r   rR   r   c           
          UnU R                  U5      nU R                  " SUUUUUUS.UD6u  pU R                  U5      nX-   U-   nU$ )N)r'   r   r(   r"   rB   r   r6   )input_layernorm	self_attnmlp)rQ   r'   r   r(   r"   rB   r   rR   residualhidden_states_attention_hidden_states_mlps               rU   r   Cohere2DecoderLayer.forwardS  sq     !,,];%)^^ &
' 3)+)&
 &
" !HH]3 :=NNri   )r   )NNFN)r]   r^   r_   r`   r   r   rK   r   r   r   r   r   r   rP   r   r   r   FloatTensorr   rf   rg   rh   s   @rU   r   r   N  s    <} < < %0A6R
 26+/$)59|| #5<<#=> !.	
 "% D> !!1!12 -. 
u  (51B1BEDUDU1U+V"WW	X Sri   r   c                        \ rS rSr% \\S'   Srg)Cohere2PreTrainedModelio  rs   r6   N)r]   r^   r_   r`   r   __annotations__rf   r6   ri   rU   r   r   o  s    ri   r   c                     ^  \ rS rSrS\4U 4S jjr       SS\\R                     S\\R                     S\\R                     S\\
   S\\R                     S	\\   S
\\R                     S\\   S\4S jjrSrU =r$ )Cohere2Modelis  rs   c                    > [         TU ]  U5        [        UR                  UR                  S9U l        [        US9U l        g )N)r9   epsrs   )rJ   rK   rp   r9   rA   r+   rk   
rotary_emb)rQ   rs   rT   s     rU   rK   Cohere2Model.__init__t  s6     $&2D2D6K`K`a	0?ri   r%   r(   position_idsr"   r&   rB   r   rR   r   c           
         US L US L-  (       a  [        S5      eUc  U R                  U5      nU(       a'  Uc$  U R                  (       d  [        U R                  S9nUcD  Ub  UR                  5       OSn	[        R                  " XUR                  S   -   UR                  S9nUc  UR                  S5      n[        U=n
[        5      (       d*  U R                  UUUUUS.n[        S
0 UD6[        S
0 UD6S.n
UnU R                  X5      nU R                    H  nU" U4UXR"                     UUUS.UD6nM      U R%                  U5      n['        UUS	9$ )Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r3   )device)rs   input_embedsr(   r   r"   r   )r5   r4   )r   r(   r"   rB   r   )last_hidden_stater"   r6   )
ValueErrorr)   r   r   rs   get_seq_lengthr   aranger   r   	unsqueeze
isinstancedictr
   r   r   r*   r   r+   r   )rQ   r%   r(   r   r"   r&   rB   r   rR   past_seen_tokenscausal_mask_mappingmask_kwargsr'   r   decoder_layers                  rU   r   Cohere2Model.forwardy  s~    -t";<YZZ  --i8M0*$++>O!CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L?-FF++ -"0"0#2 ,K #5"C{"C%F%U%U#
 &"oomJ![[M)$723O3OP /#- M ) 		-0&++
 	
ri   )r+   r   )NNNNNNN)r]   r^   r_   r`   r   rK   r   r   r   r   r   r   rP   r   r   r   r   rf   rg   rh   s   @rU   r   r   s  s    @} @ 151537+/59$(59<
E,,-<
 !.<
 u//0	<

 "%<
   1 12<
 D><
 !!1!12<
 +,<
 
!<
 <
ri   r   c                       \ rS rSrSrg)Cohere2ForCausalLMi  r6   Nrm   r6   ri   rU   r   r     rn   ri   r   )r   r   r   r   )5typingr   r   r   torch.nnrw   cache_utilsr   r   configuration_utilsr   r	   masking_utilsr
   r   modeling_flash_attention_utilsr   modeling_outputsr   modeling_rope_utilsr   modeling_utilsr   processing_utilsr   utilsr   r   utils.deprecationr   cohere.modeling_coherer   r   r   r   r   r   r   r   gemma2.modeling_gemma2r   
get_loggerr]   loggerr   rk   rp   rr   r   r   r   r   __all__r6   ri   rU   <module>r      s     &   . J R B 7 9 5 & 0 0	 	 	 1 
		H	%KH$ KH\	2 		 	F) F)R, B2 B
; B
J	* 	 \ri   