
    cCiX.                        S SK Jr  S SKrS SKJr  SSKJrJr  SSKJr  SSK	J
r
Jr  SSKJr  SS	KJr  SS
KJrJr  SSKJr  SSKJr  SSKJrJrJrJrJrJrJrJrJ r J!r!J"r"   " S S\\5      r# " S S\5      r$ " S S\5      r% " S S\!5      r& " S S\5      r' " S S\ 5      r( " S S\"5      r) " S S\5      r* " S S \5      r+ " S! S"\5      r, " S# S$\5      r- " S% S&\5      r./ S'Qr/g)(    )OptionalN)nn   )CacheDynamicCache)PretrainedConfig)create_causal_mask!create_sliding_window_causal_mask)BaseModelOutputWithPast)Unpack)TransformersKwargsauto_docstring)check_model_inputs   )MistralConfig)Qwen2AttentionQwen2DecoderLayerQwen2ForCausalLMQwen2ForQuestionAnsweringQwen2ForSequenceClassificationQwen2ForTokenClassificationQwen2MLP
Qwen2ModelQwen2PreTrainedModelQwen2RMSNormQwen2RotaryEmbeddingc                   N    \ rS rSrSrSr                    SS jrSrg)MinistralConfig   a  
This is the configuration class to store the configuration of a [`MinistralModel`]. It is used to instantiate an
Ministral model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the Ministral-8B-Instruct-2410.

[mistralai/Ministral-8B-Instruct-2410](https://huggingface.co/mistralai/Ministral-8B-Instruct-2410)
[mistralai/Ministral-8B-Instruct-2410](https://huggingface.co/mistralai/Ministral-8B-Instruct-2410)

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.


Args:
    vocab_size (`int`, *optional*, defaults to 32000):
        Vocabulary size of the Ministral model. Defines the number of different tokens that can be represented by the
        `inputs_ids` passed when calling [`MinistralModel`]
    hidden_size (`int`, *optional*, defaults to 4096):
        Dimension of the hidden representations.
    intermediate_size (`int`, *optional*, defaults to 14336):
        Dimension of the MLP representations.
    num_hidden_layers (`int`, *optional*, defaults to 32):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 32):
        Number of attention heads for each attention layer in the Transformer encoder.
    num_key_value_heads (`int`, *optional*, defaults to 8):
        This is the number of key_value heads that should be used to implement Grouped Query Attention. If
        `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
        `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
        converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
        by meanpooling all the original heads within that group. For more details, check out [this
        paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `8`.
    head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):
        The attention head dimension.
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the decoder.
    max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
        The maximum sequence length that this model might ever be used with. Ministral's sliding window attention
        allows sequence of up to 4096*32 tokens.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    rms_norm_eps (`float`, *optional*, defaults to 1e-06):
        The epsilon used by the rms normalization layers.
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models). Only
        relevant if `config.is_decoder=True`.
    pad_token_id (`int`, *optional*):
        The id of the padding token.
    bos_token_id (`int`, *optional*, defaults to 1):
        The id of the "beginning-of-sequence" token.
    eos_token_id (`int`, *optional*, defaults to 2):
        The id of the "end-of-sequence" token.
    tie_word_embeddings (`bool`, *optional*, defaults to `False`):
        Whether the model's input and output word embeddings should be tied.
    rope_theta (`float`, *optional*, defaults to 10000.0):
        The base period of the RoPE embeddings.
    sliding_window (`int`, *optional*, defaults to 4096):
        Sliding window attention window size. If not specified, will default to `4096`.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    layer_types (`list`, *optional*):
        Attention pattern for each layer.

```python
>>> from transformers import MinistralModel, MinistralConfig

>>> # Initializing a Ministral 8B style configuration
>>> configuration = MinistralConfig()

>>> # Initializing a model from the Ministral 8B style configuration
>>> model = MinistralModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```	ministralNc                 ^   [         R                  " U 4UUUUS.UD6  Xl        Xl        X l        X0l        X@l        XPl        UU l        Xpl	        Uc  UnX`l
        Xl        Xl        Xl        Xl        UU l        UU l        UU l        U R"                  c  U R                  b  SOS/U-  U l        g g )N)pad_token_idbos_token_ideos_token_idtie_word_embeddingssliding_attentionfull_attention)r   __init__
vocab_sizemax_position_embeddingshidden_sizeintermediate_sizenum_hidden_layersnum_attention_headssliding_windowhead_dimnum_key_value_heads
hidden_actinitializer_rangerms_norm_eps	use_cache
rope_thetaattention_dropoutlayer_types)selfr)   r+   r,   r-   r.   r1   r0   r2   r*   r3   r4   r5   r"   r#   r$   r%   r6   r/   r7   r8   kwargss                         i/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/ministral/modular_ministral.pyr(   MinistralConfig.__init__k   s    0 	!!	
%%% 3	
 	
 %'>$&!2!2#6 ,  &"5#6 $!2("$!2&#'+':':'F#L\ ! "D $    )r7   r0   r2   r+   r3   r,   r8   r*   r.   r-   r1   r4   r6   r/   r5   r)   )i }     i 8      r?      Nsilui   g{Gz?gư>TN   r   Fg     @r>   g        N)__name__
__module____qualname____firstlineno____doc__
model_typer(   __static_attributes__ r=   r;   r   r      sS    IV J  )!+9"r=   r   c                       \ rS rSrSrg)MinistralMLP   rJ   NrC   rD   rE   rF   rI   rJ   r=   r;   rL   rL          r=   rL   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )MinistralAttention   	layer_idxc                   > [         TU ]  X5        [        R                  " UR                  UR
                  U R                  -  SS9U l        [        R                  " UR                  UR                  U R                  -  SS9U l	        [        R                  " UR                  UR                  U R                  -  SS9U l
        g )NF)bias)superr(   r   Linearr+   r.   r0   q_projr1   k_projv_proj)r9   configrS   	__class__s      r;   r(   MinistralAttention.__init__   s    +ii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejkr=   )rY   rX   rZ   )rC   rD   rE   rF   intr(   rI   __classcell__r\   s   @r;   rQ   rQ      s    l# l lr=   rQ   c                       \ rS rSrSrg)MinistralRMSNorm   rJ   NrN   rJ   r=   r;   rb   rb      rO   r=   rb   c                       \ rS rSrSrg)MinistralDecoderLayer   rJ   NrN   rJ   r=   r;   re   re      rO   r=   re   c                       \ rS rSrSrg)MinistralPreTrainedModel   rJ   NrN   rJ   r=   r;   rh   rh      rO   r=   rh   c                       \ rS rSrSrg)MinistralRotaryEmbedding   rJ   NrN   rJ   r=   r;   rk   rk      rO   r=   rk   c                   "  ^  \ rS rSrS\4U 4S jjr\" 5       \       SS\\	R                     S\\	R                     S\\	R                     S\\   S\\	R                     S	\\   S
\\	R                     S\\   S\4S jj5       5       rSrU =r$ )MinistralModel   r[   c                 (   > [         TU ]  U5        U ?g )N)rV   r(   has_sliding_layers)r9   r[   r\   s     r;   r(   MinistralModel.__init__   s     #r=   	input_idsattention_maskposition_idspast_key_valuesinputs_embedsr5   cache_positionr:   returnc                    US L US L-  (       a  [        S5      eUc  U R                  U5      nU(       a  Uc  [        U R                  S9nUcD  Ub  UR	                  5       OSn	[
        R                  " XUR                  S   -   UR                  S9nUc  UR                  S5      n[        U=n
[        5      (       d*  U R                  UUUUUS.n[        S
0 UD6[        S
0 UD6S.n
UnU R                  X5      nU R                  S U R                  R                     H  nU" U4XR"                     UUUUUS.UD6nM!     U R%                  U5      n['        UU(       a  US	9$ S S	9$ )Nz:You must specify exactly one of input_ids or inputs_embeds)r[   r   rB   )device)r[   input_embedsrt   rx   rv   ru   )r'   r&   )rt   ru   rv   r5   rx   position_embeddings)last_hidden_staterv   rJ   )
ValueErrorembed_tokensr   r[   get_seq_lengthtorcharangeshaper{   	unsqueeze
isinstancedictr	   r
   
rotary_emblayersr-   attention_typenormr   )r9   rs   rt   ru   rv   rw   r5   rx   r:   past_seen_tokenscausal_mask_mappingmask_kwargshidden_statesr}   decoder_layers                  r;   forwardMinistralModel.forward   s    -t";<YZZ  --i8M0*$++>O!CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L ?-FF ++ -"0"0#2 ,K #5"C{"C%F%U%U#
 & #oomJ![[)H4;;+H+HIM)	23O3OP) /#-$7	 	M J 		-0&+/8O
 	
>B
 	
r=   rJ   )NNNNNNN)rC   rD   rE   rF   r   r(   r   r   r   r   
LongTensorTensorr   FloatTensorboolr   r   r   r   rI   r_   r`   s   @r;   rn   rn      s    $ $  151537+/59$(59C
E,,-C
 !.C
 u//0	C

 "%C
   1 12C
 D>C
 !!1!12C
 +,C
 
!C
  C
r=   rn   c                       \ rS rSrSrg)MinistralForCausalLMi  rJ   NrN   rJ   r=   r;   r   r     rO   r=   r   c                       \ rS rSrSrg)"MinistralForSequenceClassificationi  rJ   NrN   rJ   r=   r;   r   r     rO   r=   r   c                       \ rS rSrSrg)MinistralForTokenClassificationi  rJ   NrN   rJ   r=   r;   r   r     rO   r=   r   c                       \ rS rSrSrg)MinistralForQuestionAnsweringi  rJ   NrN   rJ   r=   r;   r   r     rO   r=   r   )r   rh   rn   r   r   r   r   )0typingr   r   r   cache_utilsr   r   configuration_utilsr   masking_utilsr	   r
   modeling_outputsr   processing_utilsr   utilsr   r   utils.genericr   mistral.configuration_mistralr   qwen2.modeling_qwen2r   r   r   r   r   r   r   r   r   r   r   r   rL   rQ   rb   re   rh   rk   rn   r   r   r   r   __all__rJ   r=   r;   <module>r      s       . 3 R 7 & 7 / 9   G"m%5 G"T	8 	l l	| 		- 		3 		3 	J
Z J
Z	+ 		)G 		&A 		$= 	r=   