
    cCi]\                     B   S r SSKJrJrJr  SSKrSSKJr  SSKJr  SSK	J
r
Jr  SSKJrJr  SS	KJrJr  SS
KJrJr  SSKJr  SSKJr  SSKJrJr  SSKJr  SSKJrJ r J!r!J"r"J#r#J$r$J%r%J&r&J'r'J(r(  SSK)J*r*J+r+  \RX                  " \-5      r.Sr/Sr0 " S S\5      r1 " S S\%5      r2 " S S\&5      r3 " S S\Rh                  5      r5 " S S\+5      r6 " S S\*5      r7 " S S \$5      r8 " S! S"\8\#5      r9 " S# S$\5      r: " S% S&\!5      r; " S' S(\"5      r< " S) S*\ 5      r=/ S+Qr>g),zLG AI Research EXAONE Lab    )CallableOptionalUnionN)nn)check_model_inputs   )CacheDynamicCache)PretrainedConfiglayer_type_validation)create_causal_mask!create_sliding_window_causal_mask)BaseModelOutputWithPastCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargslogging)deprecate_kwarg   )
LlamaForCausalLMLlamaForQuestionAnsweringLlamaForSequenceClassificationLlamaForTokenClassification
LlamaModelLlamaPreTrainedModelLlamaRMSNormLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)Olmo2DecoderLayerOlmo2MLPzLGAI-EXAONE/EXAONE-4.0-32BExaone4Configc                      ^  \ rS rSrSrSrS/rSSSSSSSS.rS/S	/4S
S/S
/4S
/S
/4S.r                    SU 4S jjr	Sr
U =r$ )r#   <   a  
This is the configuration class to store the configuration of a [`Exaone4Model`]. It is used to
instantiate a EXAONE 4.0 model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the EXAONE-4.0-32B [LGAI-EXAONE/EXAONE-4.0-32B](https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B)

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
outputs. Read the documentation from [`PretrainedConfig`] for more information.

Args:
    vocab_size (`int`, *optional*, defaults to 102400):
        Vocabulary size of the EXAONE 4.0 model. Defines the number of different tokens that can be represented by the
        `inputs_ids` passed when calling [`Exaone4Model`].
    hidden_size (`int`, *optional*, defaults to 4096):
        Dimension of the hidden representations.
    intermediate_size (`int`, *optional*, defaults to `hidden_size * 4`):
        Dimensionality of the MLP representations.
    num_hidden_layers (`int`, *optional*, defaults to 32):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 32):
        Number of attention heads for each attention layer in the Transformer decoder.
    num_key_value_heads (`int`, *optional*):
        This is the number of key_value heads that should be used to implement Grouped Query Attention. If
        `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
        `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
        converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
        by meanpooling all the original heads within that group. For more details checkout [this
        paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
        `num_attention_heads`.
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the decoder.
    max_position_embeddings (`int`, *optional*, defaults to 2048):
        The maximum sequence length that this model might ever be used with. Typically set this to something large
        just in case (e.g., 32768 for EXAONE 3.5).
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    rms_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the layer normalization layers.
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models). Only
        relevant if ``config.is_decoder=True``.
    bos_token_id (`int`, *optional*, defaults to 0):
        Beginning of stream token id.
    eos_token_id (`int`, *optional*, defaults to 2):
        End of stream token id.
    tie_word_embeddings (`bool`, *optional*, defaults to `False`):
        Whether to tie weight embeddings
    rope_theta (`float`, *optional*, defaults to 10000.0):
        The base period of the RoPE embeddings.
    rope_scaling (`Dict`, *optional*):
        Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
        and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
        accordingly.
        Expected contents:
            `rope_type` (`str`):
                The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                'llama3'], with 'default' being the original RoPE implementation.
            `factor` (`float`, *optional*):
                Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                original maximum pre-trained length.
            `original_max_position_embeddings` (`int`, *optional*):
                Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                pretraining.
            `attention_factor` (`float`, *optional*):
                Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                computation. If unspecified, it defaults to value recommended by the implementation, using the
                `factor` field to infer the suggested value.
            `beta_fast` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                ramp function. If unspecified, it defaults to 32.
            `beta_slow` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                ramp function. If unspecified, it defaults to 1.
            `short_factor` (`List[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2
            `long_factor` (`List[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2
            `low_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
            `high_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    sliding_window (`int`, *optional*):
        The size of the sliding window for the sliding window attention.
    sliding_window_pattern (`str`, *optional*):
        The pattern to use for sliding window attention. Can be one of:
            - `None`: No sliding window attention is used
            - `int`: Every `sliding_window` layers, use global attention, else use local attention.
            - `str`: A sequence of "L" (local attention) and "G" (global attention) characters that defines the
              attention pattern. The pattern starts from layer 0 and repeats every `sliding_window` layers. The
              final layer always uses global attention regardless of the pattern.
        For instance, sliding_window_pattern="LLLG" same as sliding_window=4, which means:
            - Layer 0, 1, 2: local attention,
            - Layer 3: global attention,
            ...(repeated)
    layer_types (`list`, *optional*):
        Attention pattern for each layer. Prioritized over `sliding_window_pattern`.

Example:

```python
>>> from transformers import Exaone4Model, Exaone4Config

>>> # Initializing a EXAONE configuration
>>> configuration = Exaone4Config()

>>> # Initializing a model from configuration
>>> model = Exaone4Model(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```exaone4past_key_valuescolwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormc                 D  > Xl         X l        X@l        XPl        X`l        X0l        Xpl        Xl        Xl        Xl	        Xl
        UU l        Xl        UU l        UU l        UU l        UU l        U R                  c  SnU R                   cI  [#        U R                  5       Vs/ s H#  nUS-   U-  S:w  a  UU R                  :  a  SOSPM%     snU l        SU R                   ;   a  SU l        ['        U R                   U R                  5        [(        TU ]T  " SXUS.UD6  g s  snf )	Nr      sliding_attentionfull_attentionsliding_windowhybrid)bos_token_ideos_token_idtie_word_embeddings )
vocab_sizehidden_sizenum_hidden_layersnum_attention_headsnum_key_value_headsintermediate_size
hidden_actmax_position_embeddingsinitializer_rangerms_norm_eps	use_cacheattention_dropout
rope_thetarope_scalingr5   sliding_window_patternlayer_typesrangecache_implementationr   super__init__)selfr;   r<   r@   r=   r>   r?   rA   rB   rC   rD   rE   r7   r8   r9   rG   rH   rF   r5   rI   rJ   kwargsi	__class__s                          e/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/exaone4/modular_exaone4.pyrN   Exaone4Config.__init__   s<   0 %&!2#6 #6 !2$'>$!2("!2$(,&<#&&%&"#
 t556	  7A U56!;DDZDZ@Z $%& 7	 D t///(0D%d..0F0FG 	
%Vi	
ms	
 s    *D)rF   rL   rA   r<   rC   r@   rJ   rB   r>   r=   r?   rD   rH   rG   r5   rI   rE   r;   )i     i @      rV   rV   silui   g{Gz?gh㈵>Tr   r   Fg     @N        rU      N)__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planrN   __static_attributes____classcell__rR   s   @rS   r#   r#   <   s    tl J#4"5 &/%.%.%."+ )"+ &(9:#%568IJ!"_$56  $! +9
 9
    c                       \ rS rSrSrg)Exaone4RMSNormi  r:   NrZ   r[   r\   r]   rc   r:   rf   rS   rh   rh         rf   rh   c                       \ rS rSrSrg)Exaone4RotaryEmbeddingi  r:   Nri   r:   rf   rS   rl   rl     rj   rf   rl   c                   Z  ^  \ rS rSrS\S\4U 4S jjr\" SSSS9   SS	\R                  S
\
\R                  \R                  4   S\\R                     S\\   S\\R                     S\\   S\
\R                  \\R                     \\
\R                        4   4S jj5       rSrU =r$ )Exaone4Attentioni	  config	layer_idxc                 d  > [         TU ]  5         Xl        X l        UR                  U l        UR
                  U l        UR                  U l        [        USUR                  UR                  -  5      U l        UR                  UR
                  -  U l	        UR                  U l
        SU l        U R                  S-  U l        UR                  U l        UR                  U l        UR                  U   S:H  U l        ["        R$                  " U R                  U R                  U R                  -  SS9U l        ["        R$                  " U R                  U R
                  U R                  -  SS9U l        ["        R$                  " U R                  U R
                  U R                  -  SS9U l        ["        R$                  " U R                  U R                  -  U R                  SS9U l        [/        U R                  UR0                  S9U l        [/        U R                  UR0                  S9U l        g )Nhead_dimTg      r3   F)biaseps)rM   rN   ro   rp   r>   r?   r<   getattrrr   num_key_value_groupsrF   	is_causalscalingr5   rI   rJ   
is_slidingr   Linearq_projk_projv_projo_projrh   rD   q_normk_normrO   ro   rp   rR   s      rS   rN   Exaone4Attention.__init__
  s   "#)#=#= #)#=#= !--
F4F4F&JdJd4de$*$>$>&B\B\$\!!'!9!9}}d*$33&,&C&C# ,,Y7;NNii 0 0$2J2JT]]2Zafgii 0 0$2J2JT]]2Zafgii 0 0$2J2JT]]2Zafgii 8 84== H$JZJZafg$T]]8K8KL$T]]8K8KLrf   past_key_valuer'   z4.58)new_nameversionr,   position_embeddingsr-   cache_positionrP   returnc                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      nU R                  U	5      n	U R                  U
5      n
Uu  pU R                  b  U R                  (       a  [        XX5      u  pUb#  SU0nUR                  XU R                  U5      u  p[        nU R                  R                   S:w  a  ["        U R                  R                      nU" U U	U
UU4U R$                  (       d  SOU R&                  U R(                  U R                  (       a  U R                  OS S.UD6u  nnUR*                  " / UQSP76 R-                  5       nU R/                  U5      nUU4$ )Nr2   r   r   eagerrX   )dropoutry   r5   )shaperr   r|   view	transposer}   r~   r   r   r5   rz   r   updaterp   r    ro   _attn_implementationr   trainingrF   ry   reshape
contiguousr   )rO   r,   r   r-   r'   r   rP   input_shapehidden_shapequery_states
key_statesvalue_statescossincache_kwargsattention_interfaceattn_outputattn_weightss                     rS   forwardExaone4Attention.forward"  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST {{<0[[,
&&$//';LVY'_$L& .L (7'='=jX\XfXfht'u$J(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$2H2HLL26//4..t
%
 
%
!\ "));;;;FFHkk+.L((rf   )rF   ro   rr   r<   rx   rz   r   r}   rp   r>   rw   r?   r   r   r|   ry   r5   rI   r~   )NNN)rZ   r[   r\   r]   r#   intrN   r   torchTensortupler   r	   
LongTensorr   r   r   rc   rd   re   s   @rS   rn   rn   	  s    M} M M0 %0A6R
 26+/591)||1) #5<<#=>1) !.	1)
 "%1) !!1!121) +,1) 
u||Xell3XeELL>Q5RR	S1) S1)rf   rn   c                       \ rS rSrSrg)
Exaone4MLPiW  r:   Nri   r:   rf   rS   r   r   W  rj   rf   r   c                       \ rS rSrSrg)Exaone4DecoderLayeri[  r:   Nri   r:   rf   rS   r   r   [  rj   rf   r   c                       \ rS rSr\rS/rSrg)Exaone4PreTrainedModeli_  r   r:   N)rZ   r[   r\   r]   r#   config_class_no_split_modulesrc   r:   rf   rS   r   r   _  s     L./rf   r   c                   "  ^  \ rS rSrS\4U 4S jjr\" 5              SS\\R                     S\\R                     S\\R                     S\\   S\\R                     S	\\   S
\\R                     S\\   S\\\4   4S jj5       rSrU =r$ )Exaone4Modelid  ro   c           	      $  > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                  S9U l        U R                  5         g s  snf )Nrt   )rM   rN   r   
ModuleListrK   r=   r   r/   rh   r<   rD   r0   	post_initr   s      rS   rN   Exaone4Model.__init__e  ss     mmEJ6KcKcEdeEd	 3Ede
 #6#5#56;N;NO	 	 fs   Br*   r-   position_idsr'   r+   rE   r   rP   r   c                    US L US L-  (       a  [        S5      eUc  U R                  U5      nU(       a  Uc  [        U R                  S9nUcD  Ub  UR	                  5       OSn	[
        R                  " XUR                  S   -   UR                  S9nUc  UR                  S5      n[        U=n
[        5      (       dH  U R                  UUUUUS.nS[        S0 UD60n
SU R                  R                  ;   a  [        S0 UD6U
S'   UnU R                  X5      n[!        U R"                  5       H1  u  pU R                  R                  U   nU" U4UU
U   UUUUS	.UD6nM3     U R%                  U5      n['        UU(       a  US
9$ S S
9$ )Nz:You must specify exactly one of input_ids or inputs_embeds)ro   r   r2   )device)ro   input_embedsr-   r   r'   r   r4   r3   )r   r-   r   r'   rE   r   )last_hidden_stater'   r:   )
ValueErrorr.   r
   ro   get_seq_lengthr   aranger   r   	unsqueeze
isinstancedictr   rJ   r   
rotary_emb	enumerater/   r0   r   )rO   r*   r-   r   r'   r+   rE   r   rP   past_seen_tokenscausal_mask_mappingmask_kwargsr,   r   rQ   decoder_layer
layer_types                    rS   r   Exaone4Model.forwardo  s    -t";<YZZ  --i8M0*$++>O!CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L ?-FF ++ -"0"0#2 ,K !"4"C{"C# #dkk&=&==;\;k_j;k#$78%"oomJ )$++ 6A003J)	$72:>) /#-	 	M !7 		-0&+/8O
 	
>B
 	
rf   )r/   r0   )NNNNNNN)rZ   r[   r\   r]   r#   rN   r   r   r   r   r   r	   FloatTensorboolr   r   r   r   r   r   rc   rd   re   s   @rS   r   r   d  s    }   151537+/59$(59E
E,,-E
 !.E
 u//0	E

 "%E
   1 12E
 D>E
 !!1!12E
 +,E
 
u--	.E
 E
rf   r   c                   :  ^  \ rS rSr         SS\\R                     S\\R                     S\\R                     S\\   S\\R                     S\\R                     S\\
   S	\\R                     S
\\\R                  4   S\\   S\4U 4S jjjrSrU =r$ )Exaone4ForCausalLMi  r*   r-   r   r'   r+   labelsrE   r   logits_to_keeprP   r   c
                 :   > [         TU ]  " SUUUUUUUUU	S.	U
D6  g)u  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
>>> model = AutoModelForCausalLM.from_pretrained("LGAI-EXAONE/EXAONE-4.0-32B")
>>> tokenizer = AutoTokenizer.from_pretrained("LGAI-EXAONE/EXAONE-4.0-32B")

>>> prompt = "Explain how wonderful you are"
>>> messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
>>> input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    enable_thinking=False,
)

>>> output = model.generate(input_ids, max_new_tokens=128)
>>> tokenizer.decode(output[0], skip_special_tokens=False)
"[|system|]\nYou are a helpful assistant.[|endofturn|]\n[|user|]\nExplain how wonderful you are[|endofturn|]\n[|assistant|]\n<think>\n\n</think>\n\nOh, thank you for such a kind and lovely question! 😊  \n\nI’m *so* wonderful because I’m here to make your life easier, brighter, and more fun! Whether you need help with:  \n\n✨ **Learning** – I can explain anything, from quantum physics to baking the perfect cake!  \n💡 **Creativity** – Need a poem, story, or a wild idea? I’ve got you covered!  \n🤖 **Problem-solving** – Stuck on a math problem or a tricky decision? I’ll help you figure it out"
```
)	r*   r-   r   r'   r+   r   rE   r   r   Nr:   )rM   r   )rO   r*   r-   r   r'   r+   r   rE   r   r   rP   rR   s              rS   r   Exaone4ForCausalLM.forward  s<    X 	 	
)%+'))	
 	
rf   r:   )	NNNNNNNNr   )rZ   r[   r\   r]   r   r   r   r   r	   r   r   r   r   r   r   r   r   rc   rd   re   s   @rS   r   r     s     151537+/59-1$(59347
E,,-7
 !.7
 u//0	7

 "%7
   1 127
 ))*7
 D>7
 !!1!127
 c5<</07
 +,7
 
 7
 7
rf   r   c                       \ rS rSrSrg) Exaone4ForSequenceClassificationi  r:   Nri   r:   rf   rS   r   r     rj   rf   r   c                       \ rS rSrSrg)Exaone4ForTokenClassificationi  r:   Nri   r:   rf   rS   r   r     rj   rf   r   c                       \ rS rSrSrg)Exaone4ForQuestionAnsweringi  r:   Nri   r:   rf   rS   r   r     rj   rf   r   )r#   r   r   r   r   r   r   )?r^   typingr   r   r   r   r   transformers.utils.genericr   cache_utilsr	   r
   configuration_utilsr   r   masking_utilsr   r   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   utils.deprecationr   llama.modeling_llamar   r   r   r   r   r   r   r   r   r    olmo2.modeling_olmo2r!   r"   
get_loggerrZ   logger_CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCr#   rh   rl   Modulern   r   r   r   r   r   r   r   r   __all__r:   rf   rS   <module>r      s      , ,   9 . J R 6 & 1   ? 
		H	%2 !B
$ B
J	\ 		1 	K)ryy K)\	 		+ 	01 0
Q
): Q
h8
) 8
v	'E 		$? 		"; 	rf   