
    cCi+"                         S r SSKrSSKJr  SSKJr  SSKJr  SSKJ	r	  SSK
JrJr  \" 5       (       a  S	S
KJr  OSr\R                  " \5      rSSS.rSr " S S\	5      rS/rg)z'Tokenization classes for RemBERT model.    N)copyfile)Optional   )
AddedToken)PreTrainedTokenizerFast)is_sentencepiece_availablelogging   )RemBertTokenizerzsentencepiece.modelztokenizer.json)
vocab_filetokenizer_fileu   ▁c            
          ^  \ rS rSrSr\r\r            SU 4S jjr	 SS\
\   S\\
\      S\
\   4S jjr SS\
\   S\\
\      S\S\
\   4S	 jjrSS
\S\\   S\\   4S jjrSrU =r$ )RemBertTokenizerFast&   a#  
Construct a "fast" RemBert tokenizer (backed by HuggingFace's *tokenizers* library). Based on
[Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This
tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods

Args:
    vocab_file (`str`):
        [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
        contains the vocabulary necessary to instantiate a tokenizer.
    do_lower_case (`bool`, *optional*, defaults to `True`):
        Whether or not to lowercase the input when tokenizing.
    remove_space (`bool`, *optional*, defaults to `True`):
        Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
    keep_accents (`bool`, *optional*, defaults to `False`):
        Whether or not to keep accents when tokenizing.
    bos_token (`str`, *optional*, defaults to `"[CLS]"`):
        The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

        <Tip>

        When building a sequence using special tokens, this is not the token that is used for the beginning of
        sequence. The token used is the `cls_token`.

        </Tip>

    eos_token (`str`, *optional*, defaults to `"[SEP]"`):
        The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token
        that is used for the end of sequence. The token used is the `sep_token`.
    unk_token (`str`, *optional*, defaults to `"<unk>"`):
        The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
        token instead.
    sep_token (`str`, *optional*, defaults to `"[SEP]"`):
        The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
        sequence classification or for a text and a question for question answering. It is also used as the last
        token of a sequence built with special tokens.
    pad_token (`str`, *optional*, defaults to `"<pad>"`):
        The token used for padding, for example when batching sequences of different lengths.
    cls_token (`str`, *optional*, defaults to `"[CLS]"`):
        The classifier token which is used when doing sequence classification (classification of the whole sequence
        instead of per-token classification). It is the first token of the sequence when built with special tokens.
    mask_token (`str`, *optional*, defaults to `"[MASK]"`):
        The token used for masking values. This is the token used when training this model with masked language
        modeling. This is the token which the model will try to predict.
c                    > [        U[        5      (       a  [        USSS9OUn[        TU ]  " U4UUUUUUUU	U
UUS.UD6  X0l        X@l        XPl        Xl        g )NTF)lstriprstrip)r   do_lower_caseremove_spacekeep_accents	bos_token	eos_token	unk_token	sep_token	pad_token	cls_token
mask_token)	
isinstancestrr   super__init__r   r   r   r   )selfr   r   r   r   r   r   r   r   r   r   r   r   kwargs	__class__s                 o/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/rembert/tokenization_rembert_fast.pyr!   RemBertTokenizerFast.__init__X   s    " KUU_adJeJeZ
4Fku
	
)'%%!	
 	
  +(($    token_ids_0token_ids_1returnc                 d    U R                   /nU R                  /nUc  XA-   U-   $ XA-   U-   U-   U-   $ )aL  
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A RemBERT sequence has the following format:

- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`

Args:
    token_ids_0 (`List[int]`):
        List of IDs to which the special tokens will be added
    token_ids_1 (`List[int]`, *optional*, defaults to `None`):
        Optional second list of IDs for sequence pairs.

Returns:
    `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
)sep_token_idcls_token_id)r"   r(   r)   sepclss        r%    build_inputs_with_special_tokens5RemBertTokenizerFast.build_inputs_with_special_tokens   sL    &   !  !$s** 3&4s::r'   already_has_special_tokensc                     U(       a>  Ub  [        S5      eU Vs/ s H"  oDU R                  U R                  4;   a  SOSPM$     sn$ Ub+  S/S/[        U5      -  -   S/-   S/[        U5      -  -   S/-   $ S/S/[        U5      -  -   S/-   $ s  snf )ax  
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.

Args:
    token_ids_0 (`List[int]`):
        List of ids.
    token_ids_1 (`List[int]`, *optional*, defaults to `None`):
        Optional second list of IDs for sequence pairs.
    already_has_special_tokens (`bool`, *optional*, defaults to `False`):
        Set to True if the token list is already formatted with special tokens for the model

Returns:
    `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
zYou should not supply a second sequence if the provided sequence of ids is already formatted with special tokens for the model.r
   r   )
ValueErrorr,   r-   len)r"   r(   r)   r2   xs        r%   get_special_tokens_mask,RemBertTokenizerFast.get_special_tokens_mask   s    & && R  VaaU`PQt00$2C2CDDA!KU`aa"31#K 001QC7A3[AQ;QRVWUXXXsqcC,,-33	 bs   )Bsave_directoryfilename_prefixc                    [         R                  R                  U5      (       d  [        R	                  SU S35        g [         R                  R                  X(       a  US-   OS[        S   -   5      n[         R                  R                  U R                  5      [         R                  R                  U5      :w  a  [        U R                  U5        U4$ )NzVocabulary path (z) should be a directory- r   )
ospathisdirloggererrorjoinVOCAB_FILES_NAMESabspathr   r   )r"   r9   r:   out_vocab_files       r%   save_vocabulary$RemBertTokenizerFast.save_vocabulary   s    ww}}^,,LL,^,<<STUo_s22QbcoQpp
 77??4??+rww~/NNT__n5  r'   )r   r   r   r   )NNTTF[CLS][SEP]z<unk>rJ   z<pad>rI   z[MASK])N)NF)__name__
__module____qualname____firstlineno____doc__rD   vocab_files_namesr   slow_tokenizer_classr!   listintr   r0   boolr7   r   tuplerG   __static_attributes____classcell__)r$   s   @r%   r   r   &   s    ,\ *+ &%R JN;9;3;DI3F;	c;4 sx4943;DI3F4ko4	c4>!c !HSM !]bcf]g ! !r'   r   )rO   r>   shutilr   typingr   tokenization_utilsr   tokenization_utils_fastr   utilsr   r	   tokenization_rembertr   
get_loggerrK   rA   rD   SPIECE_UNDERLINEr   __all__ r'   r%   <module>rb      sn    . 	   , > 8 6			H	%#8L\]   ]!2 ]!@ "
"r'   