
    cCiIQ                        S SK JrJrJr  S SKrS SKrS SKJr  SSKJ	r	J
r
  SSKJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJr  SSKJrJr  SSKJrJrJrJrJ r J!r!J"r"  SSK#J$r$  \(       a  SSK%J&r&  SS0r'Sr(\RR                  " \*5      r+ " S S\5      r, " S S\$\5      r- " S S\R\                  5      r/ " S S\5      r0 " S S\"5      r1 " S S\!5      r2 " S  S!\ 5      r3 " S" S#\5      r4 " S$ S%\5      r5 " S& S'\5      r6/ S(Qr7g))    )TYPE_CHECKINGAnyOptionalN)nn   )CacheDynamicCache)PretrainedConfig)create_causal_mask)BaseModelOutputWithPast)PreTrainedModel)Unpack)
AddedTokenPreTrainedTokenizer)TransformersKwargslogging   )LlamaForCausalLMLlamaForSequenceClassificationLlamaForTokenClassificationLlamaMLP
LlamaModelLlamaPreTrainedModelLlamaRotaryEmbedding)LlamaTokenizer)	TextInput
vocab_fileztokenizer.modelu   ▁c                      ^  \ rS rSrSrSrS/rSSSSSSSS.rS/S	/4S
S/S
/4S
/S
/4S.r                    SU 4S jjr	Sr
U =r$ )GemmaConfig5   a  
This is the configuration class to store the configuration of a [`GemmaModel`]. It is used to instantiate an Gemma
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Gemma-7B.
e.g. [google/gemma-7b](https://huggingface.co/google/gemma-7b)
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
    vocab_size (`int`, *optional*, defaults to 256000):
        Vocabulary size of the Gemma model. Defines the number of different tokens that can be represented by the
        `inputs_ids` passed when calling [`GemmaModel`]
    hidden_size (`int`, *optional*, defaults to 3072):
        Dimension of the hidden representations.
    intermediate_size (`int`, *optional*, defaults to 24576):
        Dimension of the MLP representations.
    num_hidden_layers (`int`, *optional*, defaults to 28):
        Number of hidden layers in the Transformer decoder.
    num_attention_heads (`int`, *optional*, defaults to 16):
        Number of attention heads for each attention layer in the Transformer decoder.
    num_key_value_heads (`int`, *optional*, defaults to 16):
        This is the number of key_value heads that should be used to implement Grouped Query Attention. If
        `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
        `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
        converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
        by meanpooling all the original heads within that group. For more details, check out [this
        paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
        `num_attention_heads`.
    head_dim (`int`, *optional*, defaults to 256):
        The attention head dimension.
    hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
        The legacy activation function. It is overwritten by the `hidden_activation`.
    hidden_activation (`str` or `function`, *optional*):
        The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
        if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
    max_position_embeddings (`int`, *optional*, defaults to 8192):
        The maximum sequence length that this model might ever be used with.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    rms_norm_eps (`float`, *optional*, defaults to 1e-06):
        The epsilon used by the rms normalization layers.
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models). Only
        relevant if `config.is_decoder=True`.
    pad_token_id (`int`, *optional*, defaults to 0):
        Padding token id.
    eos_token_id (`int`, *optional*, defaults to 1):
        End of stream token id.
    bos_token_id (`int`, *optional*, defaults to 2):
        Beginning of stream token id.
    tie_word_embeddings (`bool`, *optional*, defaults to `True`):
        Whether to tie weight embeddings
    rope_theta (`float`, *optional*, defaults to 10000.0):
        The base period of the RoPE embeddings.
    attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
        Whether to use a bias in the query, key, value and output projection layers during self-attention.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
```python
>>> from transformers import GemmaModel, GemmaConfig
>>> # Initializing a Gemma gemma-7b style configuration
>>> configuration = GemmaConfig()
>>> # Initializing a model from the gemma-7b style configuration
>>> model = GemmaModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```gemmapast_key_valuescolwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormc                    > Xl         Xl        X l        X0l        X@l        XPl        Xpl        X`l        Xl        Xl	        Xl
        Xl        Xl        UU l        UU l        UU l        [         TU ]D  " SUUUUS.UD6  g )N)pad_token_idbos_token_ideos_token_idtie_word_embeddings )
vocab_sizemax_position_embeddingshidden_sizeintermediate_sizenum_hidden_layersnum_attention_headshead_dimnum_key_value_heads
hidden_acthidden_activationinitializer_rangerms_norm_eps	use_cache
rope_thetaattention_biasattention_dropoutsuper__init__)selfr2   r4   r5   r6   r7   r9   r8   r:   r;   r3   r<   r=   r>   r-   r/   r.   r0   r?   r@   rA   kwargs	__class__s                         a/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/gemma/modular_gemma.pyrC   GemmaConfig.__init__   s    0 %'>$&!2!2#6  #6 $!2!2("$,!2 	
%%% 3		

 	
    )r@   rA   r8   r:   r;   r4   r<   r5   r3   r7   r6   r9   r=   r?   r>   r2   )i  i   i `        rK      gelu_pytorch_tanhNi    g{Gz?ư>Tr      r   Tg     @Fg        )__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planrC   __static_attributes____classcell__rF   s   @rG   r   r   5   s    AF J#4"5%.%.%.%."+ )"+ &(9:#%568IJ!"_$56 & $ +/
 /
rI   r   c            	           \ rS rSrSr          SS\\\\4      4S jjr	S r
S rSS	S
\\   4S jrS r  SS\\   S\S\S
\4S jjrS rSrg)GemmaTokenizer   aa
  
Construct a Gemma tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as there is
no padding token in the original model.

Args:
    vocab_file (`str`):
        Path to the vocabulary file.
    unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
        The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
        token instead.
    bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<bos>"`):
        The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
    eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<eos>"`):
        The end of sequence token.
    pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<pad>"`):
        A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
        attention mechanisms or loss computation.
    sp_model_kwargs (`dict[str, Any]`, `Optional`, *optional*):
        Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
        SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
        to set:

        - `enable_sampling`: Enable subword regularization.
        - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

          - `nbest_size = {0,1}`: No sampling is performed.
          - `nbest_size > 1`: samples from the nbest_size results.
          - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
            using forward-filtering-and-backward-sampling algorithm.

        - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
          BPE-dropout.

    add_bos_token (`bool`, *optional*, defaults to `True`):
        Whether or not to add an `bos_token` at the start of sequences.
    add_eos_token (`bool`, *optional*, defaults to `False`):
        Whether or not to add an `eos_token` at the end of sequences.
    clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
        Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
        extra spaces.
    use_default_system_prompt (`bool`, *optional*, defaults to `False`):
        Whether or not the default system prompt for Gemma should be used.
    spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
        Whether or not to add spaces between special tokens.
Nsp_model_kwargsc                     Uc  0 OUU l         [        U[        5      (       a  [        USSS9OUn[        U[        5      (       a  [        USSS9OUn[        U[        5      (       a  [        USSS9OUn[        U[        5      (       a  [        USSS9OUnXl        Xpl        Xl        Xl        [        R                  " S0 U R                   D6U l
        U R                  R                  U5        [        R                  " U 4UUUUUUUU	U
US.
UD6  g )NFT)
normalizedspecial)
	bos_token	eos_token	unk_token	pad_tokenadd_bos_tokenadd_eos_tokenr_   clean_up_tokenization_spacesuse_default_system_promptspaces_between_special_tokensr1   )r_   
isinstancestrr   r   rg   rh   rj   spmSentencePieceProcessorsp_modelLoadr   rC   )rD   r   re   rc   rd   rf   r_   rg   rh   ri   rj   rk   rE   s                rG   rC   GemmaTokenizer.__init__   s    &5%<r/MWXacfMgMgJyUDImv	MWXacfMgMgJyUDImv	MWXacfMgMgJyUDImv	MWXacfMgMgJyUDImv	$**)B&22JT5I5IJ:&$$	
''+)E&?*G	
 	
rI   c                     [        S5      eNzNot needed for GemmaAttributeErrorrD   s    rG   get_spm_processor GemmaTokenizer.get_spm_processor      344rI   c                     [        S5      ert   ru   rw   s    rG   unk_token_lengthGemmaTokenizer.unk_token_length  rz   rI   textr   returnc                 0    [         R                  " X40 UD6$ )zE
Args:
    text: TextInput
Simply calls PreTrainedTokenizer's method
)r   tokenizerD   r~   rE   s      rG   r   GemmaTokenizer.tokenize  s     #++DA&AArI   c                 >    U R                   R                  U[        S9$ )zf
Args:
    text: TextInput
Returns a tokenized string. The Gemma tokenizer never adds a prefix space.
)out_type)rp   encoderm   r   s      rG   	_tokenizeGemmaTokenizer._tokenize$  s     }}##D3#77rI   	token_idsskip_special_tokensrk   c                 (   / n/ nU H  nU(       a  XpR                   ;   a  M  XpR                  ;   a]  U(       a*  UR                  U R                  R	                  U5      5        UR                  U R                  U   R
                  5        / nM  UR                  U5        M     U(       a*  UR                  U R                  R	                  U5      5        U(       a  SR                  U5      nOSR                  U5      nUR                  [        S5      $ )N  )	all_special_ids_added_tokens_decoderappendrp   decodecontentjoinreplaceSPIECE_UNDERLINE)rD   r   r   rk   rE   	sub_textscurrent_sub_textidss           rG   _decodeGemmaTokenizer._decode,  s     	C"s.B.B'B000#$$T]]%9%9:J%KL  !;!;C!@!H!HI#%  '',  T]]112BCD(+I	*I  !1377rI   c                     / nSnU HG  nX@R                   ;   a$  X0R                  R                  U5      U-   -  n/ nM6  UR                  U5        MI     X0R                  R                  U5      -  nU$ )z:Converts a sequence of tokens (string) in a single string.r   )_added_tokens_encoderrp   r   r   )rD   tokenscurrent_sub_tokens
out_stringtokens        rG   convert_tokens_to_string'GemmaTokenizer.convert_tokens_to_stringI  st    
E222mm223EFNN
%'""))%0  	mm**+=>>
rI   )rg   rh   rp   r_   rj   r   )
z<unk>z<bos>z<eos>z<pad>NTFFFF)FF)rP   rQ   rR   rS   rT   r   dictrm   r   rC   rx   r|   listr   r   intboolr   r   rY   r1   rI   rG   r]   r]      s    ,b 48%*"'&+)
 "$sCx.1)
V55B[ BtCy B8 %*.3	898 "8 (,	8 
8:rI   r]   c                   J   ^  \ rS rSrS	S\S\4U 4S jjjrS rS rS r	Sr
U =r$ )
GemmaRMSNormiX  dimepsc                    > [         TU ]  5         X l        [        R                  " [
        R                  " U5      5      U l        g )N)rB   rC   r   r   	Parametertorchzerosweight)rD   r   r   rF   s      rG   rC   GemmaRMSNorm.__init__Y  s,    ll5;;s#34rI   c                     U[         R                  " UR                  S5      R                  SSS9U R                  -   5      -  $ )Nr   T)keepdim)r   rsqrtpowmeanr   )rD   xs     rG   _normGemmaRMSNorm._norm^  s4    5;;quuQx}}R}>IJJJrI   c                     U R                  UR                  5       5      nUSU R                  R                  5       -   -  nUR                  U5      $ )Ng      ?)r   floatr   type_as)rD   r   outputs      rG   forwardGemmaRMSNorm.forwarda  sC    AGGI& 3!2!2!445~~a  rI   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler   shaper   rw   s    rG   
extra_reprGemmaRMSNorm.extra_reprh  s'    ))*+6$((<<rI   )r   r   )rN   )rP   rQ   rR   rS   r   r   rC   r   r   r   rY   rZ   r[   s   @rG   r   r   X  s0    5C 5e 5 5
K!= =rI   r   c                   (   ^  \ rS rSrU 4S jrSrU =r$ )GemmaMLPil  c                 >  > [         TU ]  U5        [        R                  " U R                  U R
                  SS9U l        [        R                  " U R                  U R
                  SS9U l        [        R                  " U R
                  U R                  SS9U l        g )NF)bias)	rB   rC   r   Linearr4   r5   	gate_projup_proj	down_proj)rD   configrF   s     rG   rC   GemmaMLP.__init__m  ss     4#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXrI   )r   r   r   )rP   rQ   rR   rS   rC   rY   rZ   r[   s   @rG   r   r   l  s    Y YrI   r   c                       \ rS rSrSrg)GemmaRotaryEmbeddingit  r1   NrP   rQ   rR   rS   rY   r1   rI   rG   r   r   t      rI   r   c                       \ rS rSrS rSrg)GemmaPreTrainedModelix  c                     [         R                  " X5        SUR                  R                  ;   a%  UR                  R
                  R                  5         g g )NRMSNorm)r   _init_weightsrF   rP   r   datazero_)rD   modules     rG   r   "GemmaPreTrainedModel._init_weightsy  sA    %%d3 ((111MM$$& 2rI   r1   N)rP   rQ   rR   rS   r   rY   r1   rI   rG   r   r   x  s    'rI   r   c                       \ rS rSr       SS\\R                     S\\R                     S\\R                     S\\   S\\R                     S\\
   S	\\R                     S
\\   S\4S jjrSrg)
GemmaModeli  Nr%   r(   position_idsr"   r&   r>   cache_positionrE   r   c                    US L US L-  (       a  [        S5      eUc  U R                  U5      nU(       a  Uc  [        U R                  S9nUcD  Ub  UR	                  5       OSn	[
        R                  " XUR                  S   -   UR                  S9nUc  UR                  S5      n[        U R                  UUUUUS9n
UnU R                  X5      n[
        R                  " U R                  R                  S-  UR                  S9nX-  nU R                  S U R                  R                     H  nU" U4U
UUUUUS	.UD6nM     U R#                  U5      n[%        UU(       a  US
9$ S S
9$ )Nz:You must specify exactly one of input_ids or inputs_embeds)r   r   rO   )device)r   input_embedsr(   r   r"   r   g      ?)dtype)r(   r   r"   r>   r   position_embeddings)last_hidden_stater"   )
ValueErrorr)   r	   r   get_seq_lengthr   aranger   r   	unsqueezer   
rotary_embtensorr4   r   r*   r6   r+   r   )rD   r%   r(   r   r"   r&   r>   r   rE   past_seen_tokenscausal_maskr'   r   
normalizerdecoder_layers                  rG   r   GemmaModel.forward  s    -t";<YZZ  --i8M0*$++>O!CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L(;;&))+%
 & #oomJ
 \\$++"9"93">mFYFYZ
%2![[)H4;;+H+HIM)	*) /#-$7	 	M J 		-0&+/8O
 	
>B
 	
rI   r1   )NNNNNNN)rP   rQ   rR   rS   r   r   
LongTensorTensorr   FloatTensorr   r   r   r   r   rY   r1   rI   rG   r   r     s     151537+/59$(59A
E,,-A
 !.A
 u//0	A

 "%A
   1 12A
 D>A
 !!1!12A
 +,A
 
!A
 A
rI   r   c                   (   ^  \ rS rSrU 4S jrSrU =r$ )GemmaForCausalLMi  c                  8   > [        5       R                  " S0 U D6$ )a  
Example:

```python
>>> from transformers import AutoTokenizer, GemmaForCausalLM

>>> model = GemmaForCausalLM.from_pretrained("google/gemma-7b")
>>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b")

>>> prompt = "What is your favorite condiment?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"What is your favorite condiment?"
```r1   )rB   r   )super_kwargsrF   s    rG   r   GemmaForCausalLM.forward  s    $ w...rI   r1   )rP   rQ   rR   rS   r   rY   rZ   r[   s   @rG   r   r     s    / /rI   r   c                       \ rS rSrSrg)GemmaForSequenceClassificationi  r1   Nr   r1   rI   rG   r   r     r   rI   r   c                       \ rS rSrSrg)GemmaForTokenClassificationi  r1   Nr   r1   rI   rG   r   r     r   rI   r   )r   r]   r   r   r   r   r   )8typingr   r   r   sentencepiecern   r   r   cache_utilsr   r	   configuration_utilsr
   masking_utilsr   modeling_outputsr   modeling_utilsr   processing_utilsr   tokenization_utilsr   r   utilsr   r   llama.modeling_llamar   r   r   r   r   r   r   llama.tokenization_llamar   tokenization_utils_baser   VOCAB_FILES_NAMESr   
get_loggerrP   loggerr   r]   Moduler   r   r   r   r   r   r   r   __all__r1   rI   rG   <module>r     s    0 /    . 3 / 7 - & A 0   6 4!#45   
		H	%D
" D
NY^%8 Yx=299 =(Yx Y	/ 	'/ 'B
 B
J/' /,	%C 		"= 	rI   