
    cCi	                         S r SSKJr  SSKJr  SSKJr  SSKJr  \R                  " \
5      rSS	S
S.r " S S\5      rS/rg)z)Fast Tokenization classes for OpenAI GPT.    )Optional   )PreTrainedTokenizerFast)logging   )OpenAIGPTTokenizerz
vocab.jsonz
merges.txtztokenizer.json)
vocab_filemerges_filetokenizer_filec                   v   ^  \ rS rSrSr\rSS/r\r	SU 4S jjr
\S 5       rSS\S\\   S	\\   4S
 jjrSrU =r$ )OpenAIGPTTokenizerFast   a  
Construct a "fast" GPT Tokenizer (backed by HuggingFace's *tokenizers* library). Based on Byte-Pair-Encoding with
the following peculiarities:

- lower case all inputs
- uses BERT's BasicTokenizer for pre-BPE tokenization

This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.

Args:
    vocab_file (`str`):
        Path to the vocabulary file.
    merges_file (`str`):
        Path to the merges file.
    unk_token (`str`, *optional*, defaults to `"<unk>"`):
        The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
        token instead.
	input_idsattention_maskc                 ,   > [         TU ]  " X4X4S.UD6  g )N)r   	unk_token)super__init__)selfr	   r
   r   r   kwargs	__class__s         m/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/openai/tokenization_openai_fast.pyr   OpenAIGPTTokenizerFast.__init__6   s    oohno    c                     g)NT )r   s    r   do_lower_case$OpenAIGPTTokenizerFast.do_lower_case9   s    r   save_directoryfilename_prefixreturnc                 ^    U R                   R                  R                  XS9n[        U5      $ )N)name)
_tokenizermodelsavetuple)r   r   r    filess       r   save_vocabulary&OpenAIGPTTokenizerFast.save_vocabulary=   s)    %%**>*PU|r   r   )NNNz<unk>)N)__name__
__module____qualname____firstlineno____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr   slow_tokenizer_classr   propertyr   strr   r'   r)   __static_attributes____classcell__)r   s   @r   r   r      sf    ( *$&67-p  c HSM ]bcf]g  r   r   N)r/   typingr   tokenization_utils_fastr   utilsr   tokenization_openair   
get_loggerr+   loggerr0   r   __all__r   r   r   <module>r?      sN    0  >  3 
		H	%#/`pq "4 "J $
$r   