
    cCi@'                         S r SSKrSSKrSSKrSSKJr  SSKJr  SSKJ	r	  SSK
JrJr  \" 5       (       a  SS	KJr  OSr\R                  " \5      rS
SS.r " S S\	5      rS/rg)z Tokenization class for model T5.    N)copyfile)Optional   )PreTrainedTokenizerFast)is_sentencepiece_availablelogging   )T5Tokenizerzspiece.modelztokenizer.json)
vocab_filetokenizer_filec                     ^  \ rS rSr% Sr\rSS/r\r	/ r
\\   \S'           SU 4S jjr\S 5       rSS\S	\\   S
\\   4S jjr SS\\   S\\\      S
\\   4S jjr SS\\   S\\\      S
\\   4S jjrS rS rSrU =r$ )T5TokenizerFast)   a  
Construct a "fast" T5 tokenizer (backed by HuggingFace's *tokenizers* library). Based on
[Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).

This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.

Args:
    vocab_file (`str`):
        [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
        contains the vocabulary necessary to instantiate a tokenizer.
    eos_token (`str`, *optional*, defaults to `"</s>"`):
        The end of sequence token.

        <Tip>

        When building a sequence using special tokens, this is not the token that is used for the end of sequence.
        The token used is the `sep_token`.

        </Tip>

    unk_token (`str`, *optional*, defaults to `"<unk>"`):
        The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
        token instead.
    pad_token (`str`, *optional*, defaults to `"<pad>"`):
        The token used for padding, for example when batching sequences of different lengths.
    extra_ids (`int`, *optional*, defaults to 100):
        Add a number of extra ids added to the vocabulary for use as sentinels. These tokens are accessible as
        "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. These tokens can be retrieved by
        calling get_sentinel_tokens method and token ids can be by calling get_sentinel_token_ids method
    additional_special_tokens (`list[str]`, *optional*):
        Additional special tokens used by the tokenizer.
    add_prefix_space (`bool`, *optional*):
        Whether or not the tokenizer should automatically add a prefix space
    from_slow (`book`, *optional*, defaults to `False`):
        Whether or not the tokenizer should be converted from a slow one. If `add_prefix_space` is set, this will be set to `True`.
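
    Example (an illustrative sketch, assuming the public `t5-small` checkpoint is available; exact token ids depend
    on the vocabulary in use):

    ```python
    >>> from transformers import T5TokenizerFast

    >>> tokenizer = T5TokenizerFast.from_pretrained("t5-small")
    >>> ids = tokenizer("Translate English to German: Hello.").input_ids
    >>> ids[-1] == tokenizer.eos_token_id  # `</s>` is appended automatically
    True
    >>> all(token.startswith("<extra_id_") for token in tokenizer.get_sentinel_tokens())  # sentinels from `extra_ids`
    True
    ```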
	input_idsattention_maskprefix_tokensc	                   > Ub~  U V
s/ s H  n
S[        U
5      ;   d  M  U
PM     nn
[        U5      S:  a$  U[        U5       Vs/ s H	  nSU S3PM     sn-  nOIUS:  a!  U[        U5      :w  a  [        SU SU S35      eO![        U5       Vs/ s H	  nSU S3PM     nnUnUb  [        R                  S5        S	U	S
'   [        TU ]  " SUUUUUUUUS.U	D6  Xl        X`l	        g s  sn
f s  snf s  snf )Nz
<extra_id_r	   >r   zBoth extra_ids (z!) and additional_special_tokens (zk) are provided to T5Tokenizer. In this case the additional_special_tokens must include the extra_ids tokenszXYou set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizersT	from_slow)r   r   	eos_token	unk_token	pad_token	extra_idsadditional_special_tokensadd_prefix_space )
strlenrange
ValueErrorloggerwarning_oncesuper__init__r   
_extra_ids)selfr   r   r   r   r   r   r   r   kwargsxextra_tokensi	__class__s                e/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/t5/tokenization_t5_fast.pyr$   T5TokenizerFast.__init__V   sA    %0'@['@!LTWXYTZDZA'@L[< 1$)yIY-ZIYA
1#Q.?IY-ZZ)Q9L0A#A &yk1RSlRm n   8=Y7GH7G!j1-7GLH(4%'j #'F; 
	
!)&?-
	
 
	
 %#A \-Z Is   C'C'C,C1c                     U [         R                  ;   aH  [         R                  U    nUb  X!:w  a  U$ Uc(  [        R                  " SU SU  SU SU S3	[        5        U$ )NzGThis tokenizer was incorrectly instantiated with a model max length of z which will be corrected in Transformers v5.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on z( automatically truncating your input to zM when padding/encoding.
- If you want to encode/pad to sequences longer than z you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
- To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value.)r   max_model_input_sizeswarningswarnFutureWarning)pretrained_model_name_or_pathmax_model_lengthinit_max_model_lengthdeprecated_max_model_lengths       r,   !_eventually_correct_t5_max_length1T5TokenizerFast._eventually_correct_t5_max_length   s    (O,Q,QQ*9*O*OPm*n'$05J5^,,&.34 5 66 734 5$$?#@ Agg "      save_directoryfilename_prefixreturnc                    U R                   (       d  [        S5      e[        R                  R	                  U5      (       d  [
        R                  SU S35        g [        R                  R                  X(       a  US-   OS[        S   -   5      n[        R                  R                  U R                  5      [        R                  R                  U5      :w  a.  [        U R                  U5        [
        R                  SU 35        U4$ )NzhYour fast tokenizer does not have the necessary information to save the vocabulary for a slow tokenizer.zVocabulary path (z) should be a directory- r   zCopy vocab file to )can_save_slow_tokenizerr    ospathisdirr!   errorjoinVOCAB_FILES_NAMESabspathr   r   info)r&   r:   r;   out_vocab_files       r,   save_vocabularyT5TokenizerFast.save_vocabulary   s    ++ 
 ww}}^,,LL,^,<<STUo_s22QbcoQpp
 77??4??+rww~/NNT__n5KK-n-=>?  r9   token_ids_0token_ids_1c                     XR                   /-   nUc  U R                  U-   $ X R                   /-   nU R                  U-   U-   $ )a"  
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`list[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
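
        Example (an illustrative sketch, assuming the public `t5-small` checkpoint; ids depend on the vocabulary):

        ```python
        >>> from transformers import T5TokenizerFast

        >>> tok = T5TokenizerFast.from_pretrained("t5-small")
        >>> a = tok.encode("hi", add_special_tokens=False)
        >>> tok.build_inputs_with_special_tokens(a) == a + [tok.eos_token_id]  # single sequence: `X </s>`
        True
        >>> b = tok.encode("there", add_special_tokens=False)
        >>> tok.build_inputs_with_special_tokens(a, b) == a + [tok.eos_token_id] + b + [tok.eos_token_id]
        True
        ```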
        """
        token_ids_0 = token_ids_0 + [self.eos_token_id]
        if token_ids_1 is None:
            return self.prefix_tokens + token_ids_0
        else:
            token_ids_1 = token_ids_1 + [self.eos_token_id]
            return self.prefix_tokens + token_ids_0 + token_ids_1

    def create_token_type_ids_from_sequences(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
    ) -> list[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of zeros.
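
        Example (an illustrative sketch, assuming the public `t5-small` checkpoint; the mask is all zeros regardless
        of the input ids):

        ```python
        >>> from transformers import T5TokenizerFast

        >>> tok = T5TokenizerFast.from_pretrained("t5-small")
        >>> tok.create_token_type_ids_from_sequences([5, 6])  # one zero per token, including the appended `</s>`
        [0, 0, 0]
        ```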
r   )rO   r   )r&   rL   rM   eoss       r,   $create_token_type_ids_from_sequences4T5TokenizerFast.create_token_type_ids_from_sequences   sL        !{()QC//;${2S89QC??r9   c                 T    [        [        [        S U R                  5      5      5      $ )Nc                 F    [        [        R                  " SU 5      5      S L$ )Nz<extra_id_\d+>)boolresearch)r(   s    r,   <lambda>5T5TokenizerFast.get_sentinel_tokens.<locals>.<lambda>   s    bii0A1&E!Fd!Rr9   )listsetfilterr   )r&   s    r,   get_sentinel_tokens#T5TokenizerFast.get_sentinel_tokens   s&    RTXTrTrst
 	
r9   c                 j    U R                  5        Vs/ s H  oR                  U5      PM     sn$ s  snf N)r`   convert_tokens_to_ids)r&   tokens     r,   get_sentinel_token_ids&T5TokenizerFast.get_sentinel_token_ids   s.    ?C?W?W?YZ?Ye**51?YZZZs   0)r%   r   )NNz</s>z<unk>z<pad>d   NNrc   )__name__
__module____qualname____firstlineno____doc__rF   vocab_files_namesmodel_input_namesr
   slow_tokenizer_classr   r]   int__annotations__r$   staticmethodr7   r   r   tuplerJ   rP   rT   r`   rf   __static_attributes____classcell__)r+   s   @r,   r   r   )   s   $L *$&67&!M49! "&.$`    *!c !HSM !]bcf]g !* JNB9B3;DI3FB	cB6 JN@9@3;DI3F@	c@,

[ [r9   r   )rm   rA   rY   r0   shutilr   typingr   tokenization_utils_fastr   utilsr   r   tokenization_t5r
   
get_loggerri   r!   rF   r   __all__r   r9   r,   <module>r~      sl    ' 	 	    > 8 ,K 
		H	%#1EUV }[- }[@ 
r9   