
    cCi.                         S r SSKrSSKJr  SSKJr  SSKJr  SSKJ	r	J
r
  \	" 5       (       a  SS	KJr  OSr\
R                  " \5      rS
SS.r " S S\5      rS/rg)zTokenization classes for XGLM.    N)copyfile)Optional   )PreTrainedTokenizerFast)is_sentencepiece_availablelogging   )XGLMTokenizerzsentencepiece.bpe.modelztokenizer.json)
vocab_filetokenizer_filec                      ^  \ rS rSrSr\rSS/r\r	        SU 4S jjr
 SS\\   S\\\      S\\   4S	 jjr SS\\   S\\\      S\\   4S
 jjrSS\S\\   S\\   4S jjrSrU =r$ )XGLMTokenizerFast$   a  
Construct a "fast" XGLM tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from [`RobertaTokenizer`]
and [`XLNetTokenizer`]. Based on
[BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).

This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.

Args:
    vocab_file (`str`):
        Path to the vocabulary file.
    bos_token (`str`, *optional*, defaults to `"<s>"`):
        The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

        <Tip>

        When building a sequence using special tokens, this is not the token that is used for the beginning of
        sequence. The token used is the `cls_token`.

        </Tip>

    eos_token (`str`, *optional*, defaults to `"</s>"`):
        The end of sequence token.

        <Tip>

        When building a sequence using special tokens, this is not the token that is used for the end of sequence.
        The token used is the `sep_token`.

        </Tip>

    sep_token (`str`, *optional*, defaults to `"</s>"`):
        The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
        sequence classification or for a text and a question for question answering. It is also used as the last
        token of a sequence built with special tokens.
    cls_token (`str`, *optional*, defaults to `"<s>"`):
        The classifier token which is used when doing sequence classification (classification of the whole sequence
        instead of per-token classification). It is the first token of the sequence when built with special tokens.
    unk_token (`str`, *optional*, defaults to `"<unk>"`):
        The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
        token instead.
    pad_token (`str`, *optional*, defaults to `"<pad>"`):
        The token used for padding, for example when batching sequences of different lengths.
    additional_special_tokens (`list[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
        Additional special tokens used by the tokenizer.
	input_idsattention_maskc	                 D  > SU l         [        U R                   5       V
s/ s H	  n
SU
 S3PM     nn
U	R                  S/ 5      =(       d    / U	S'   U	S==   U Vs/ s H  oU	S   ;  d  M  UPM     sn-  ss'   [        TU ]  " U4UUUUUUUS.U	D6  Xl        g s  sn
f s  snf )N   z<madeupword>additional_special_tokens)r   	bos_token	eos_token	sep_token	cls_token	unk_token	pad_token)num_madeup_wordsrangegetsuper__init__r   )selfr   r   r   r   r   r   r   r   kwargsimadeup_wordsword	__class__s                i/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/xglm/tokenization_xglm_fast.pyr    XGLMTokenizerFast.__init__X   s     !"49$:O:O4PQ4Pq+aS*4PQ.4jj9TVX.Y._]_*+*+)0
)T@[9\-\D\0
 	
+ 	
	
)
	
 
	
 %' R0
s   BB
*B
token_ids_0token_ids_1returnc                 \    Uc  U R                   /U-   $ U R                   /nX1-   U-   U-   U-   $ )a;  
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. An XLM-RoBERTa sequence has the following format:

- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s></s> B </s>`

Args:
    token_ids_0 (`list[int]`):
        List of IDs to which the special tokens will be added.
    token_ids_1 (`list[int]`, *optional*):
        Optional second list of IDs for sequence pairs.

Returns:
    `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
)sep_token_idr!   r)   r*   seps       r'    build_inputs_with_special_tokens2XGLMTokenizerFast.build_inputs_with_special_tokens{   sD    ( %%&44  ! 3&,{::    c                 x    U R                   /nUc  [        X1-   5      S/-  $ [        X1-   U-   U-   U-   5      S/-  $ )a  
Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
not make use of token type ids, therefore a list of zeros is returned.

Args:
    token_ids_0 (`list[int]`):
        List of IDs.
    token_ids_1 (`list[int]`, *optional*):
        Optional second list of IDs for sequence pairs.

Returns:
    `list[int]`: List of zeros.

r   )r-   lenr.   s       r'   $create_token_type_ids_from_sequences6XGLMTokenizerFast.create_token_type_ids_from_sequences   sQ    $   !s()QC//3$s*S0;>?1#EEr2   save_directoryfilename_prefixc                    U R                   (       d  [        S5      e[        R                  R	                  U5      (       d  [
        R                  SU S35        g [        R                  R                  X(       a  US-   OS[        S   -   5      n[        R                  R                  U R                  5      [        R                  R                  U5      :w  a  [        U R                  U5        U4$ )NzhYour fast tokenizer does not have the necessary information to save the vocabulary for a slow tokenizer.zVocabulary path (z) should be a directory.- r   )can_save_slow_tokenizer
ValueErrorospathisdirloggererrorjoinVOCAB_FILES_NAMESabspathr   r   )r!   r7   r8   out_vocab_files       r'   save_vocabulary!XGLMTokenizerFast.save_vocabulary   s    ++ 
 ww}}^,,LL,^,<<TUVo_s22QbcoQpp
 77??4??+rww~/NNT__n5  r2   )r   r   )NN<s></s>rJ   rI   z<unk>z<pad>)N)__name__
__module____qualname____firstlineno____doc__rD   vocab_files_namesmodel_input_namesr
   slow_tokenizer_classr    listintr   r0   r5   strtuplerG   __static_attributes____classcell__)r&   s   @r'   r   r   $   s    -^ *$&67( !%H JN;9;3;DI3F;	c;4 JNF9F3;DI3FF	cF0!c !HSM !]bcf]g ! !r2   r   )rO   r>   shutilr   typingr   tokenization_utils_fastr   utilsr   r   tokenization_xglmr
   
get_loggerrK   rA   rD   r   __all__ r2   r'   <module>ra      sd    % 	   > 8 0M 
		H	%#<P`a Y!/ Y!x 
r2   