
    cCi                         S SK r S SKJr  S SKJr  S SKJr  SSKJr  SSK	J
r
Jr  \
" 5       (       a  SS	KJr  OSr\R                  " \5      rS
SS.r " S S\5      rS/rg)    N)copyfile)Optional)
processors   )PreTrainedTokenizerFast)is_sentencepiece_availablelogging   )GemmaTokenizerztokenizer.modelztokenizer.json)
vocab_filetokenizer_filec                      ^  \ rS rSrSr\r\rSr	SS/r
         SU 4S jjrS r\S 5       r\S	 5       r\R                   S
 5       r\R                   S 5       rSS\S\\   S\\   4S jjrSS jrSrU =r$ )GemmaTokenizerFast"   u 
  
Construct a Gemma tokenizer fast. Based on byte-level Byte-Pair-Encoding.

This uses notably ByteFallback and no prefix space. Normalization is applied to replace  `" "` with `"▁"`

```python
>>> from transformers import GemmaTokenizerFast

>>> tokenizer = GemmaTokenizerFast.from_pretrained("hf-internal-testing/dummy-gemma")
>>> tokenizer.encode("Hello this is a test")
[2, 4521, 736, 603, 476, 2121]
```

If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or
call `tokenizer.update_post_processor()` to make sure that the post-processing is correctly done (otherwise the
values of the first token and final token of an encoded sequence will not be correct). For more details, checkout
[post-processors] (https://huggingface.co/docs/tokenizers/api/post-processors) documentation.


This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.

Args:
    vocab_file (`str`, *optional*):
        [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that
        contains the vocabulary necessary to instantiate a tokenizer.
    tokenizer_file (`str`, *optional*):
        [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
        contains everything needed to load the tokenizer.
    clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
        Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
        extra spaces.
    unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
        The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
        token instead.
    bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<bos>"`):
        The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
    eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<eos>"`):
        The end of sequence token.
    pad_token (`str`, *optional*, defaults to `"<pad>"`):
        The padding token
    add_bos_token (`bool`, *optional*, defaults to `True`):
        Whether or not to add an `bos_token` at the start of sequences.
    add_eos_token (`bool`, *optional*, defaults to `False`):
        Whether or not to add an `eos_token` at the end of sequences.
left	input_idsattention_maskc
                 ~   > [         TU ]  " SUUUUUUUUU	S.	U
D6  Xl        Xl        U R	                  5         Xl        g )N)	r   r   clean_up_tokenization_spaces	unk_token	bos_token	eos_token	pad_tokenadd_bos_tokenadd_eos_token )super__init___add_bos_token_add_eos_tokenupdate_post_processorr   )selfr   r   r   r   r   r   r   r   r   kwargs	__class__s              k/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/gemma/tokenization_gemma_fast.pyr   GemmaTokenizerFast.__init__W   s[     	 	
!))E''	
 	
 ,+""$$    c                    U R                   nU R                  nUc  U R                  (       a  [        S5      eU R                  nU R
                  nUc  U R                  (       a  [        S5      eU R                  (       a  US-   OS SU R                  (       a  SU-   S-   OS 3nU U R                  (       a  SU-   S	-   OS S
U R                  (       a  SU-   S	-   OS 3n/ nU R                  (       a  UR                  X45        U R                  (       a  UR                  X445        [        R                  " XVUS9U R                  l        g)zU
Updates the underlying post processor with the current `bos_token` and `eos_token`.
Nz)add_bos_token = True but bos_token = Nonez)add_eos_token = True but eos_token = Nonez:0  z$A:0 z:0z:1z $B:1)singlepairspecial_tokens)r   bos_token_idr   
ValueErrorr   eos_token_idr   appendr   TemplateProcessing
_tokenizerpost_processor)r"   bosr.   eosr0   r+   r,   r-   s           r%   r!   (GemmaTokenizerFast.update_post_processorv   s6    nn((;4--HIInn((;4--HII%)%7%7S5[R@[_[m[mcCiRVFVsuDvw0B0B39t+K5gkgygyRUX[R[^bRb  @B  QC  D!!3"56!!3"56)3)F)F^*
&r'   c                     U R                   $ N)r    r"   s    r%   r    GemmaTokenizerFast.add_eos_token       """r'   c                     U R                   $ r9   )r   r:   s    r%   r    GemmaTokenizerFast.add_bos_token   r<   r'   c                 0    Xl         U R                  5         g r9   )r    r!   r"   values     r%   r   r;          #""$r'   c                 0    Xl         U R                  5         g r9   )r   r!   r@   s     r%   r   r>      rB   r'   save_directoryfilename_prefixreturnc                    U R                   (       d  [        S5      e[        R                  R	                  U5      (       d  [
        R                  SU S35        g [        R                  R                  X(       a  US-   OS[        S   -   5      n[        R                  R                  U R                  5      [        R                  R                  U5      :w  a  [        U R                  U5        U4$ )NzhYour fast tokenizer does not have the necessary information to save the vocabulary for a slow tokenizer.zVocabulary path (z) should be a directory-r)   r   )can_save_slow_tokenizerr/   ospathisdirloggererrorjoinVOCAB_FILES_NAMESabspathr   r   )r"   rD   rE   out_vocab_files       r%   save_vocabulary"GemmaTokenizerFast.save_vocabulary   s    ++ 
 ww}}^,,LL,^,<<STUo_s22QbcoQpp
 77??4??+rww~/NNT__n5  r'   c                     U R                   (       a  U R                  /O/ nU R                  (       a  U R                  /O/ nX1-   U-   nUb
  XS-   U-   U-   nU$ r9   )r   r.   r   r0   )r"   token_ids_0token_ids_1r.   r0   outputs         r%    build_inputs_with_special_tokens3GemmaTokenizerFast.build_inputs_with_special_tokens   s\    .2.@.@))*b.2.@.@))*b+l:"*[8<GFr'   )r   r    r   )	NNFz<unk>z<bos>z<eos>z<pad>TFr9   )__name__
__module____qualname____firstlineno____doc__rP   vocab_files_namesr   slow_tokenizer_classpadding_sidemodel_input_namesr   r!   propertyr   r   setterstrr   tuplerS   rY   __static_attributes____classcell__)r$   s   @r%   r   r   "   s    -^ *)L$&67 %*%>
4 # # # # % % % %
!c !HSM !]bcf]g !(	 	r'   r   )rJ   shutilr   typingr   
tokenizersr   tokenization_utils_fastr   utilsr   r	   tokenization_gemmar   
get_loggerr[   rM   rP   r   __all__r   r'   r%   <module>rr      sb    
   ! > 8 2N			H	%#4HXY ^0 ^B  
 r'   