
    cCi7                         S SK Jr  SSKJr  SSKJr  SSKJr  \R                  " \	5      r
SSS	S
.r " S S\5      rS/rg)    )Optional   )PreTrainedTokenizerFast)logging   )HerbertTokenizerz
vocab.jsonz
merges.txtztokenizer.json)
vocab_filemerges_filetokenizer_filec            
          ^  \ rS rSrSr\r\r        SU 4S jjr	 SS\
\   S\\
\      S\
\   4S jjr SS\
\   S\\
\      S\S\
\   4U 4S	 jjjrSS
\S\\   S\\   4S jjrSrU =r$ )HerbertTokenizerFast   a=  
    Construct a "Fast" BPE tokenizer for HerBERT (backed by HuggingFace's *tokenizers* library).

    Peculiarities:

    - uses BERT's pre-tokenizer: BertPreTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of
      a punctuation character will be treated separately.

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the methods. Users should refer to
    the superclass for more information regarding methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    slow_tokenizer_class = HerbertTokenizer
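
    # Illustrative note (an assumption drawn from the docstring above, not part
    # of the original source): the BERT pre-tokenizer isolates every punctuation
    # mark before BPE runs, so a string like "U.S.A." is pre-tokenized into
    # "U", ".", "S", ".", "A", "." rather than kept as one chunk.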
c	           
      8   > [         T
U ]  " UU4UUUUUUS.U	D6  g )N)r   	cls_token	unk_token	pad_token
mask_token	sep_token)super__init__)selfr	   r
   r   r   r   r   r   r   kwargs	__class__s             o/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/herbert/tokenization_herbert_fast.pyr   HerbertTokenizerFast.__init__2   s;     	
	
 *!
	
 
	
    token_ids_0token_ids_1returnc                 d    U R                   /nU R                  /nUc  X1-   U-   $ X1-   U-   U-   U-   $ )a>  
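
    # Construction sketch (hypothetical file names, for orientation only):
    # passing tokenizer_file alone is enough for the fast backend, while
    # vocab_file/merges_file allow conversion from the slow tokenizer:
    #
    #   tok = HerbertTokenizerFast(tokenizer_file="tokenizer.json")
    #   tok = HerbertTokenizerFast(vocab_file="vocab.json", merges_file="merges.txt")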

    def build_inputs_with_special_tokens(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
    ) -> list[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A HerBERT sequence, like a BERT sequence, has the following format:

        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        if token_ids_1 is None:
            return cls + token_ids_0 + sep

        return cls + token_ids_0 + sep + token_ids_1 + sep
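
    # Worked example (the ids are placeholders, not real vocabulary entries):
    # with cls_token_id == 0 and sep_token_id == 2,
    #   build_inputs_with_special_tokens([5, 6])      -> [0, 5, 6, 2]
    #   build_inputs_with_special_tokens([5, 6], [7]) -> [0, 5, 6, 2, 7, 2]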

    def get_special_tokens_mask(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
    ) -> list[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
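
    # Worked example: the mask lines up position-for-position with the sequence
    # built by build_inputs_with_special_tokens([5, 6], [7]):
    #   tokens: <s>  5   6   </s>  7   </s>
    #   mask:    1   0   0    1    0    1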

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)
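

# Minimal usage sketch (assumes the public "allegro/herbert-base-cased"
# checkpoint; illustrative, not part of the original module):
#
#   from transformers import HerbertTokenizerFast
#
#   tokenizer = HerbertTokenizerFast.from_pretrained("allegro/herbert-base-cased")
#   ids = tokenizer("Kto czyta, nie błądzi.")["input_ids"]
#   # ids begins with tokenizer.cls_token_id and ends with tokenizer.sep_token_id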


__all__ = ["HerbertTokenizerFast"]