
    h                     ^    S SK r S SKJr   " S S5      r " S S\5      r " S S\5      rS	 rg)
    N)Pathc                   2    \ rS rSrS rS rS rS rS rSr	g)	TokenizerInterface   c                     Xl         g N
model_path)selfr
   s     Y/home/james-whalen/.local/lib/python3.13/site-packages/torchao/_models/llama/tokenizer.py__init__TokenizerInterface.__init__   s    $    c                     [        S5      eNz/This method should be overridden by subclasses.NotImplementedErrorr   texts     r   encodeTokenizerInterface.encode       !"STTr   c                     [        S5      er   r   r   tokenss     r   decodeTokenizerInterface.decode   r   r   c                     [        S5      er   r   r   s    r   bos_idTokenizerInterface.bos_id   r   r   c                     [        S5      er   r   r   s    r   eos_idTokenizerInterface.eos_id   r   r   r	   N)
__name__
__module____qualname____firstlineno__r   r   r   r    r#   __static_attributes__ r   r   r   r      s    %UUUUr   r   c                   @   ^  \ rS rSrU 4S jrS rS rS rS rSr	U =r
$ )SentencePieceWrapper   c                    > SS K n[        TU ]	  U5        UR                  [	        U5      5      U l        U R                  5       U l        U R                  5       U l	        g )Nr   )
sentencepiecesuperr   SentencePieceProcessorstr	processorr    bos_token_idr#   eos_token_id)r   r
   spm	__class__s      r   r   SentencePieceWrapper.__init__   sE    #$33C
OD KKM KKMr   c                 8    U R                   R                  U5      $ r   )r3   EncodeAsIdsr   s     r   r   SentencePieceWrapper.encode&   s    ~~))$//r   c                 8    U R                   R                  U5      $ r   )r3   	DecodeIdsr   s     r   r   SentencePieceWrapper.decode)   s    ~~''//r   c                 6    U R                   R                  5       $ r   )r3   r    r   s    r   r    SentencePieceWrapper.bos_id,       ~~$$&&r   c                 6    U R                   R                  5       $ r   )r3   r#   r   s    r   r#   SentencePieceWrapper.eos_id/   rA   r   )r4   r5   r3   )r%   r&   r'   r(   r   r   r   r    r#   r)   __classcell__r7   s   @r   r,   r,      s!    *00'' 'r   r,   c                   b   ^  \ rS rSr% Sr\\\4   \S'   Sr	Sr
U 4S jrS rS rS	 rS
 rSrU =r$ )TiktokenWrapper3   zE
Tokenizing and encoding/decoding text using the Tiktoken tokenizer.
special_tokens   zs(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+c                   > SS K nSS Kn[        TU ]  U5        [        R
                  R                  U5      (       d   [        U5      5       eUR                  R                  [        U5      5      n[        U5      n/ SQ[        SU R                  S-
  5       Vs/ s H	  nSU S3PM     sn-   n[        U5       VVs0 s H
  u  pWXtU-   _M     snnU l        UR                  [!        U5      R"                  U R$                  UU R                  S9U l        U R                  S   U l        U R                  S   U l        U R-                  5       U l        U R1                  5       U l        g s  snf s  snnf )	Nr   )
<|begin_of_text|><|end_of_text|>z<|reserved_special_token_0|>z<|reserved_special_token_1|>z<|reserved_special_token_2|>z<|reserved_special_token_3|>z<|start_header_id|>z<|end_header_id|>z<|reserved_special_token_4|>z
<|eot_id|>   z<|reserved_special_token_z|>)namepat_strmergeable_ranksrI   rL   rM   )tiktokentiktoken.loadr0   r   ospathisfiler2   loadload_tiktoken_bpelenrangenum_reserved_special_tokens	enumeraterI   Encodingr   rO   rP   model_bos_id_eos_idr    r4   r#   r5   )	r   r
   rR   rQ   num_base_tokensirI   tokenr7   s	           r   r   TiktokenWrapper.__init__>   sU   $ww~~j)):3z?:)"--99#j/Jo.
 1d>>BC
C (s"-C

  8A7P
7P81EQ&&7P
 &&j!&&LL+..	 ' 

 !//0CD //0AB KKM KKM#

s   E(=E-c                 8    U R                   R                  U5      $ r   )r^   r   r   s     r   r   TiktokenWrapper.encoded   s    zz  &&r   c                 8    U R                   R                  U5      $ r   )r^   r   r   s     r   r   TiktokenWrapper.decodeg   s    zz  ((r   c                     U R                   $ r   )r_   r   s    r   r    TiktokenWrapper.bos_idj       ||r   c                     U R                   $ r   )r`   r   s    r   r#   TiktokenWrapper.eos_idm   rk   r   )r_   r`   r4   r5   r^   rI   )r%   r&   r'   r(   __doc__dictr2   int__annotations__r[   rP   r   r   r   r    r#   r)   rD   rE   s   @r   rG   rG   3   sE     cN""% EG$*L') r   rG   c                 L    S[        U5      ;   a  [        U 5      $ [        U 5      $ )a#  
Factory function to get the appropriate tokenizer based on the model name.

Args:
- tokenizer_model_path (str): The file path to the tokenizer model.
- model_name (str): The name of the model, used to determine the tokenizer type.
Returns:
- TokenizerInterface: An instance of a tokenizer.
zLlama-3)r2   rG   r,   )tokenizer_model_path
model_names     r   get_tokenizerru   q   s(     C
O#344#$899r   )rT   pathlibr   r   r,   rG   ru   r*   r   r   <module>rw      s9    
 U U"'- ',;( ;|:r   