
    cCiZ                         S r SSKJr  SSKJr  SSKJr  SSKJr  SSK	J
r
  \R                  " \5      rS	S
SS.rSS0r " S S\5      rS/rg)zTokenization classes for Qwen2.    )Optional   )
AddedToken)PreTrainedTokenizerFast)logging   )Qwen2Tokenizerz
vocab.jsonz
merges.txtztokenizer.json)
vocab_filemerges_filetokenizer_filezqwen/qwen-tokenizeri   c                   t   ^  \ rS rSrSr\rSS/r\r	       SU 4S jjr
SS\S\\   S\\   4S	 jjrS
rU =r$ )Qwen2TokenizerFast%   a  
Construct a "fast" Qwen2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
Byte-Pair-Encoding.

Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will
be encoded differently whether it is at the beginning of the sentence (without space) or not:

```python
>>> from transformers import Qwen2TokenizerFast

>>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer")
>>> tokenizer("Hello world")["input_ids"]
[9707, 1879]

>>> tokenizer(" Hello world")["input_ids"]
[21927, 1879]
```
This is expected.

This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.

Args:
    vocab_file (`str`, *optional*):
        Path to the vocabulary file.
    merges_file (`str`, *optional*):
        Path to the merges file.
    tokenizer_file (`str`, *optional*):
        Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
        contains everything needed to load the tokenizer.
    unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
        The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
        token instead. Not applicable to this tokenizer.
    bos_token (`str`, *optional*):
        The beginning of sequence token. Not applicable for this tokenizer.
    eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
        The end of sequence token.
    pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
        The token used for padding, for example when batching sequences of different lengths.
	input_idsattention_maskc                 V  > [        U[        5      (       a  [        USSSSS9OUn[        U[        5      (       a  [        USSSSS9OUn[        U[        5      (       a  [        USSSSS9OUn[        U[        5      (       a  [        USSSSS9OUn[        T	U ]  " SUUUUUUUS.UD6  g )NFT)lstriprstripspecial
normalized)r
   r   r   	unk_token	bos_token	eos_token	pad_token )
isinstancestrr   super__init__)
selfr
   r   r   r   r   r   r   kwargs	__class__s
            k/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/qwen2/tokenization_qwen2_fast.pyr   Qwen2TokenizerFast.__init__S   s    $ )S)) yudW\] 	 )S)) yudW\] 	 )S)) yudW\] 	 )S)) yudW\] 	 	 		
!#)		
 		
    save_directoryfilename_prefixreturnc                 ^    U R                   R                  R                  XS9n[        U5      $ )N)name)
_tokenizermodelsavetuple)r    r&   r'   filess       r#   save_vocabulary"Qwen2TokenizerFast.save_vocabulary   s)    %%**>*PU|r%   r   )NNN<|endoftext|>Nr2   r2   )N)__name__
__module____qualname____firstlineno____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr	   slow_tokenizer_classr   r   r   r.   r0   __static_attributes____classcell__)r"   s   @r#   r   r   %   sj    'R *$&67) !!!.
bc HSM ]bcf]g  r%   r   N)r7   typingr   tokenization_utilsr   tokenization_utils_fastr   utilsr   tokenization_qwen2r	   
get_loggerr3   loggerr8   MAX_MODEL_INPUT_SIZESr   __all__r   r%   r#   <module>rG      se    &  , >  . 
		H	% &  /6 a0 aH  
 r%   