
    :iXV              	       l   S SK r S SKrS SKrS SKrS SKJr  S SKJr  S SKJ	r	  S SK
Jr  S SKrS SKJrJr  S SKJrJrJrJr  S SKJr  \R.                  " \5      rS	\\	-  S
\4S jr " S S\5      r " S S\5      r " S S\5      r " S S\5      r  " S S\5      r! SS\"\   S\#S-  S
\$\%\#4   4S jjr&g)    N)cached_property)groupby)Path)	TypedDict)AudioConfigAudioSpectrogramConfig)SpecialTokenPolicySpecialTokens	TokenizerTokenizerVersion)ImageConfigpathreturnc                     [        U [        5      (       a  [        U 5      n U R                  5       =(       a%    SU R                  ;   =(       a    U R
                  S:H  $ )z3Check if the given path is a tekken tokenizer file.tekken.json)
isinstancestrr   is_filenamesuffix)r   s    a/home/james-whalen/.local/lib/python3.13/site-packages/mistral_common/tokens/tokenizers/tekken.py	is_tekkenr      s@    $Dz<<>Nh$))3Nw8NN    c                   >    \ rS rSr% Sr\\S'   \\S'   \S-  \S'   Srg)	TokenInfo    zToken information in the JSON file.

Attributes:
    rank: The rank of the token.
    token_bytes: The token in bytes, base64 encoded.
    token_str: The token in string format.
ranktoken_bytesN	token_str )	__name__
__module____qualname____firstlineno____doc__int__annotations__r   __static_attributes__r!   r   r   r   r       s     ITzr   r   c                   8    \ rS rSr% Sr\\S'   \\S'   \\S'   Sr	g)SpecialTokenInfo.   zSpecial token information in the JSON file.

Attributes:
    rank: The rank of the token.
    token_str: The token in string format.
    is_control: Whether the token is a control token.
r   r    
is_controlr!   N)
r"   r#   r$   r%   r&   r'   r(   r   boolr)   r!   r   r   r+   r+   .   s     INr   r+   c                   L    \ rS rSr% Sr\\S'   \\S'   \\S'   \\S'   \\S'   Srg	)
TekkenConfig<   a<  Tekken configuration in the JSON file.

Attributes:
    pattern: The pattern of the tokenizer.
    num_vocab_tokens: The number of vocabulary tokens.
    default_vocab_size: The default vocabulary size.
    default_num_special_tokens: The default number of special tokens.
    version: The version of the tokenizer.
patternnum_vocab_tokensdefault_vocab_sizedefault_num_special_tokensversionr!   N)	r"   r#   r$   r%   r&   r   r(   r'   r)   r!   r   r   r0   r0   <   s%     L ##Lr   r0   c                   r    \ rS rSr% Sr\\   \S'   \\   S-  \S'   \	\S'   \
\S'   \\S'   \\S	'   \\S
'   Srg)	ModelDataN   a  The data of the tekken tokenizer model.

Attributes:
    vocab: The vocabulary of the tokenizer.
    config: The configuration of the tokenizer.
    version: The version of the tokenizer.
    type: The type of the tokenizer.
    image: The image configuration of the tokenizer.
vocabNspecial_tokensconfigr6   typeimageaudior!   )r"   r#   r$   r%   r&   listr   r(   r+   r0   r'   r   r   r   r)   r!   r   r   r8   r8   N   s@     	?)*T11L
Ir   r8   c                      \ rS rSrSr\" S\R                  SS9\" S\R                  SS9\" S\R                  SS9\" S\R                  SS9\" S	\R                  SS9\" S
\R                  SS9\" S\R                  SS9\" S\R                  SS9\" S\R                  SS9\" S\R                   SS9\" S\R"                  SS9\" S\R$                  SS9\" S\R&                  SS9\" S\R(                  SS9\" S\R*                  SS9\" S\R,                  SS9\" S\R.                  SS9\" S\R0                  SS9\" S\R2                  SS9\" S\R4                  SS94rSrSSSSS.S\\   S\\   S\S \ S!\ S"\!S#\S$\\"-  S-  S%\#S-  S&\$S-  4S' jjr%\&S(\"4S) j5       r'\(S*\)S    S+\\"-  S(S 4S, j5       r*\&S(\#S-  4S- j5       r+\+RX                  S.\#S(S4S/ j5       r+\&S(\$S-  4S0 j5       r-\-RX                  S.\$S(S4S1 j5       r-\&S(\ 4S2 j5       r.\&S(\ 4S3 j5       r/\&S(\!4S4 j5       r0\&S(\14S5 j5       r2\2RX                  S6\1S(S4S7 j5       r2\3S(\ 4S8 j5       r4\3S(\ 4S9 j5       r5\3S(\ 4S: j5       r6\3S(\ 4S; j5       r7S(\\   4S< jr8S=\S>\9S?\9S(\\    4S@ jr:SA\\    SB\1S(\\   4SC jr;SD\ S(\94SE jr<S=\S(\ 4SF jr=SMSA\\    SB\1S-  S(\4SG jjr>SA\\    S(\4SH jr?SA\\    S(\4SI jr@SD\ S(\4SJ jrASMSD\ SB\1S-  S(\B4SK jjrCSLrDg)N
Tekkenizerb   zTekken tokenizer.

This tokenizer is based on the [tiktoken](https://github.com/openai/tiktoken) library. It fastens the tokenization
for multiple languages.
r   Tr   r    r-                           	   
                              z<SPECIAL_{id}>
tekkenizerN)r   _pathimage_configaudio_configr:   r;   r2   
vocab_sizenum_special_tokensr6   r   rY   rZ   r[   c          
         U[        U5      U-   ::  d   U[        U5      U45       eX@l        [        [        U Vs/ s H  oS   PM	     sn5      5      n[        U5      U:X  d
   SU 35       e[        U5      U::  d   e[        [        U5      U5       Vs/ s H$  n[	        XR
                  R                  US9SS9PM&     nnU(       a'  [        R                  SUS   S    SUS	   S    35        X.-   n[        [        U Vs/ s H  oS   PM	     sn5      5      [        U5      s=:X  a  U:X  d   U5       e   U5       eXE-
  n[        R                  S
U SU S35        [        XS9U l
        [        [        U5      5      [        U R                  R                  5       5      :X  d   UU R                  45       e[        R                  " UUU R                  0 S9U l        X`l        Xl        Xl        X l        U Vs0 s H  oS   US   _M     snU l        [        U5       Vs/ s H  oR)                  U5      PM     snU l        [,        R.                  U l        Ub  [3        U5      U l        gSU l        gs  snf s  snf s  snf s  snf s  snf )a  Initialize the tekken tokenizer.

Args:
    vocab: The vocabulary of the tokenizer.
    special_tokens: The special tokens of the tokenizer.
    pattern: The pattern of the tokenizer.
    vocab_size: The vocabulary size of the tokenizer.
    num_special_tokens: The number of special tokens of the tokenizer.
    version: The version of the tokenizer.
    name: The name of the tokenizer.
    image_config: The image configuration of the tokenizer.
r    zSpecial tokens must be unique: )idTrD   zAdding special tokens r   z, ..., zNon special vocabulary size is z with z special tokens.)	max_vocab)r   pat_strmergeable_ranksr;   r   N)len_vocab_sizesetranger+   SPECIAL_TOKEN_TEMPLATEformatloggerinfo_reload_mergeable_ranks_tekken_token2id_nospecialvaluestiktokenEncoding_model_version_image_config_audio_config_all_special_tokens_special_tokens_reverse_vocabid_to_piece_vocabr	   IGNORE_special_token_policyr   
_file_path)selfr:   r;   r2   r\   r]   r6   r   rY   rZ   r[   tnum_defined_special_tokensispecial_fillerinner_vocab_sizes                   r   __init__Tekkenizer.__init__   s   4 SZ*<<< 	
J?
 	
<
 & &)n-Unnn-U)V%W">"&@@tDcdrcsBtt@>"&8888 3~.0BC
C !/J/J/Q/QUV/Q/WdhiC 	 
 KK():;)G(HP^_aPbcnPoOpq (83?1+?@ASEXn\nn 	
	
n 	
	
n &: 	56F5GvN`Maaqrs*A%*d'5)*+s43R3R3Y3Y3[/\\ 	
++_
 	
\ '' ;;	
  ))#1 Q_-`Q_Anai.GQ_-`*49*4EF4Eq''*4EF%7%>%>").):$u+W .V

 @2 .aFs   I3+I8I=9J Jr   c                 J    U R                   c  [        S5      eU R                   $ )zThe path to the tokenizer file.z)The tokenizer was not loaded from a file.)r{   
ValueErrorr|   s    r   	file_pathTekkenizer.file_path   s$     ??"HIIr   clsr   c                    [        U[        5      (       a  [        U5      nUR                  5       (       d   U5       e[	        USSS9 n[
        R                  " U5      nSSS5        WS   R                  S5      nU[        R                  ;  a+  [        SU SU S	[        [        R                  5       35      eUc   e[        U5      nUR                  S
S5      nUc8  U[        S5      :  a  [        SU S35      e[        [        R                  5      nOU Vs/ s H  oPM     nnXsS
'   UR                  S5      =n	(       a:  U[        S5      :  a  [        SU SUR                   S35      e[        S0 U	D6US'   O&UR                  S5      =n
(       a  [        S0 U
D6US'   UR                  S5      =n(       a,  UR!                  S5      n[#        S0 UD6n[%        SSU0UD6US'   UnU " US   UUS   S   US   S   US   S   UUR&                  R)                  SS5      UR                  S5      UR                  S5      US9
$ ! , (       d  f       GN= fs  snf )z|Load the tekken tokenizer from a file.

Args:
    path: The path to the tokenizer file.

Returns:
    The tekken tokenizer.
rutf-8)encodingNr<   r6   zUnknown version: z in z+. Make sure to use a valid version string: r;   v7zSpecial tokens not found in zL. Please update your tokenizer file and include all special tokens you need.
multimodalv11z-The image config has to be called 'image' in z for tokenizers of version .r>   r?   audio_encoding_configencoding_configr:   r2   r4   r5   r    )
r:   r;   r2   r\   r]   r6   r   rZ   r[   rY   r!   )r   r   r   existsopenjsonloadgetr   __members__r   r@   rB   DEPRECATED_SPECIAL_TOKENSvaluer   popr   r   r   replace)r   r   funtyped_version_strr6   special_tokens_dictsr;   tokenmmr>   r?   r   
model_datas                 r   	from_fileTekkenizer.from_file   s    dC  :D{{}}"d"}$g.!iilG / x(,,Y7/;;;#L>dV <<<@AQA]A]<^;_a 
 '''"<0>EkkJZ\`>a')$// 24& 9a a 
 "&j&J&J!K1EF1Ee1ENF$2 !\**2*)%00 CD6Idelererdsstu   +0R0GGkk'**U**3U3GGKK((5(#ii(?@O4GGO*T?TeTGG '
W%)x(3!(+,@A)(34PQ""7B/#0#0
 	
] /.0 Gs   II
Ic                     U R                   $ )z)The image configuration of the tokenizer.)rs   r   s    r   r>   Tekkenizer.image"  s     !!!r   r   c                     [        S5      e)Nz!Can only set Image config at initr   r|   r   s     r   r>   r   '      <==r   c                     U R                   $ )zvThe audio configuration of the tokenizer.

Returns:
     The audio configuration object if it exists, otherwise None.
)rt   r   s    r   r?   Tekkenizer.audio+  s     !!!r   c                     [        S5      e)Nz!Can only set Audio config at initr   r   s     r   r?   r   4  r   r   c                 ,    [        U R                  5      $ )z.The number of special tokens of the tokenizer.)rd   ru   r   s    r   r]   Tekkenizer.num_special_tokens8  s     4++,,r   c                     U R                   $ )z!Vocabulary size of the tokenizer.)re   r   s    r   n_wordsTekkenizer.n_words=  s     r   c                     U R                   $ )zThe version of the tokenizer.)rr   r   s    r   r6   Tekkenizer.versionB  s     }}r   c                     U R                   $ )z'The policy for handling special tokens.)rz   r   s    r   special_token_policyTekkenizer.special_token_policyG  s     )))r   policyc                     [        U[        5      (       d  [        S[        U5       S35      e[        R
                  " S[        5        Xl        g)z+Set the policy for handling special tokens.z!Expected SpecialTokenPolicy, got r   zThe attributed `special_token_policy` is deprecated and will be removed in 1.10.0. Please pass a special token policy explicitly to the relevant methods.N)r   r	   r   r=   warningswarnFutureWarningrz   )r|   r   s     r   r   r   L  sJ     &"455@faPQQY 	
 &,"r   c                 $    U R                  S5      $ )z#The beginning of sentence token id.z<s>get_control_tokenr   s    r   bos_idTekkenizer.bos_id\  s     %%e,,r   c                 $    U R                  S5      $ )zThe end of sentence token id.z</s>r   r   s    r   eos_idTekkenizer.eos_ida  s     %%f--r   c                 $    U R                  S5      $ )zThe padding token id.z<pad>r   r   s    r   pad_idTekkenizer.pad_idf       %%g..r   c                 $    U R                  S5      $ )zThe unknown token id.z<unk>r   r   s    r   unk_idTekkenizer.unk_idk  r   r   c                     U R                   $ )a  All tokens in the vocabulary as strings.

Note:
   This will collapse all tokens for which we have a decoding error into
   the <?> string. This is bad and results in things like len(set(vocab)) != len(vocab)).

Returns:
    The vocabulary of the tokenizer.
)rx   r   s    r   r:   Tekkenizer.vocabp  s     {{r   sboseosc                     U R                   R                  U5      nU Vs/ s H  oUU R                  -   PM     nnU(       a  U R                  /UQnU(       a  / UQU R                  PnU$ s  snf )zEncode a string into a list of token ids.

Args:
    s: The string to encode.
    bos: Whether to add the beginning of sentence token.
    eos: Whether to add the end of sentence token.

Returns:
    The list of token ids.
)rq   encoder]   r   r   )r|   r   r   r   tokensr}   s         r   r   Tekkenizer.encode  si     !KK..q17=>v!d---v>kk+F+F+v+t{{+F ?s   A)r   r   c           
        ^  / n[        UU 4S j5       H  u  pEU(       au  U[        R                  :X  a  [        S[	        U5       S35      eU[        R
                  :X  a  UR                  U 4S jU 5       5        Mi  U[        R                  :X  a  M  M  UR                  T R                  R                  U Vs/ s H  ofT R                  -
  PM     sn5      5        M     U$ s  snf )Nc                 "   > U TR                   :  $ Nr]   )r}   r|   s    r   <lambda>(Tekkenizer._decode_all.<locals>.<lambda>  s    1t?V?V;Vr   z/Decoding `tokens` that contain special tokens (a  ) is not allowed. 
Either make sure `tokens` do not include any special tokens or, if you want to decode `tokens` that includes special tokens, change the tokenizer's special token policy to IGNORE or KEEP: 
```
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.tokens.tokenizers.tekken import SpecialTokenPolicy

tokenizer = MistralTokenizer.v3(is_tekken=True)
tekken = tokenizer.instruct_tokenizer.tokenizer
tekken.special_token_policy = SpecialTokenPolicy.IGNORE  # or SpecialTokenPolicy.KEEP
```c              3   H   >#    U  H  nTR                   U   S    v   M     g7f)r    N)ru   ).0r}   r|   s     r   	<genexpr>)Tekkenizer._decode_all.<locals>.<genexpr>  s$     "[UZPQ4#;#;A#>{#KUZs   ")r   r	   RAISEr   r@   KEEPextendry   appendrq   decoder]   )r|   r   r   decoded
is_specialgroupr}   s   `      r   _decode_allTekkenizer._decode_all  s    !(1V!WJ'+=+C+CC$I$u+ 	W  	   *-?-D-DDNN"[UZ"[[)-?-F-FF G
 t{{11X]2^X]STt7N7N3NX]2^_`- "X.  3_s   9C"token_idc                 D    SXR                   -
  s=:*  =(       a    S:  $ s  $ )z$Check if a token id is a byte token.r      r   r|   r   s     r   is_byteTekkenizer.is_byte  s$    H666<<<<<<r   c                 Z    XR                   ;   a  U R                   U   $ [        SU 35      e)z$Get the token id of a control token.zUnknown control token )rv   r   )r|   r   s     r   r   Tekkenizer.get_control_token  s2    22255a885aS9::r   c                    Ub-  [        U[        5      (       d  [        S[        U5       S35      eUc5  [        R
                  " SU R                   S3[        5        U R                  nSR                  U R                  XS95      $ )a  Decode a list of token ids into a string.

Args:
    tokens: The list of token ids to decode.
    special_token_policy: The policy for handling special tokens.
        Use the tokenizer's [attribute][mistral_common.tokens.tokenizers.tekken.Tekkenizer.special_token_policy]
        if `None`. Passing `None` is deprecated and will be changed
        to `SpecialTokenPolicy.IGNORE` in `mistral_common=1.10.0`.

Returns:
    The decoded string.
zFExpected `special_token_policy` to be None or SpecialTokenPolicy, got r   ,Using the tokenizer's special token policy () is deprecated. It will be removed in 1.10.0. Please pass a special token policy explicitly. Future default will be SpecialTokenPolicy.IGNORE.r   r   )
r   r	   r   r=   r   r   rz   r   joinr   )r|   r   r   s      r   r   Tekkenizer.decode  s      +J?SUg4h4hXY]^rYsXttuv   'MMB4C]C]B^ _H H
  $(#=#= wwt'''Z[[r   c                 Z    [         R                  " S[        5        U R                  U5      $ )z[DEPRECATED] Converts a list of token ids into a string, keeping special tokens.

Use `decode` with `special_token_policy=SpecialTokenPolicy.KEEP` instead.

This is a convenient method for debugging.
z`to_string` is deprecated and will be removed in 1.10.0. Use `decode` with `special_token_policy=SpecialTokenPolicy.KEEP` instead.)r   r   r   
_to_stringr|   r   s     r   	to_stringTekkenizer.to_string  s,     	\ 	
 v&&r   c                 >    U R                  U[        R                  S9$ )Nr   r   r	   r   r   s     r   r   Tekkenizer._to_string  s    {{68J8O8O{PPr   c                 @    U R                  U/[        R                  S9$ )z0Convert a token id to its string representation.r   r   r   s     r   rw   Tekkenizer.id_to_piece  s    {{H:<N<S<S{TTr   c                    Uc5  [         R                  " SU R                   S3[        5        U R                  nXR                  :  az  U[
        R                  :X  a!  U R                  U   S   R                  S5      $ U[
        R                  :X  a  [        U S35      eU[
        R                  :X  a  g[        SU 35      eU R                  R                  XR                  -
  5      $ )a  Convert a token id to its byte representation.

Args:
    token_id: The token id to convert.
    special_token_policy: The policy for handling special tokens.
        Use the tokenizer's [attribute][mistral_common.tokens.tokenizers.tekken.Tekkenizer.special_token_policy]
        if `None`. Passing `None` is deprecated and will be changed
        to `SpecialTokenPolicy.IGNORE` in `mistral_common=1.10.0`.

Returns:
    The byte representation of the token.
r   r   r    r   z is a special tokenr   zUnknown special token policy )r   r   rz   r   r]   r	   r   ru   r   r   r   ry   rq   decode_single_token_bytes)r|   r   r   s      r   id_to_byte_pieceTekkenizer.id_to_byte_piece  s      'MMB4C]C]B^ _H H
  $(#=#= ---#'9'>'>>//9+FMMgVV%);)A)AA H:-@!ABB%);)B)BB #@AU@V!WXX{{44X@W@W5WXXr   )ru   rt   r{   rs   rq   rz   rv   rm   rr   rx   re   r   )Er"   r#   r$   r%   r&   r+   r
   unkr   r   
begin_instend_instbegin_tools	end_toolsbegin_tool_resultsend_tool_results
tool_callsimgpad	img_breakimg_endprefixmiddler   begin_system
end_systembegin_tool_contentr   rh   r@   r   r   r'   r   r   r   r   r   propertyr   classmethodr=   r   r>   setterr?   r]   r   r6   r	   r   r   r   r   r   r   r:   r.   r   r   r   r   r   r   r   rw   bytesr  r)   r!   r   r   rB   rB   b   s4    	a=+<+<Na=+<+<Na=+<+<Na=+C+CPTUa=+A+AdSa=+D+DQUVa=+B+BtTa=+K+KX\]a=+I+IVZ[a=+C+CPTUbM,=,=$ObM,=,=$ObM,C,CPTUbM,A,AdSbM,@,@TRbM,@,@TRbM,@,@TRbM,F,FSWXbM,D,DQUVbM,L,LY]^)!. . !#'+/+/MEIME -.ME 	ME
 ME  ME "ME ME TzD ME "D(ME "D(ME^ 4   E
tL) E
t E
 E
 E
N "{T) " " \\>; >4 > > "{T) " " \\>; >4 > > -C - -       )   *&8 * *   ,+= ,$ , !, - - - . . . / / / / / /tCy " $ T d3i &$s) CU Z^_bZc 8= = =;3 ;3 ;\T#Y \>PSW>W \cf \@'S	 'c ' Qc Qs QUC UC U#Y #YDVY]D] #Yin #Y #Yr   rB   r:   ra   c                 P   UbW  [        U 5      U:  d   [        U 5      U45       e[        U 5      U:  a'  U SU n [        R                  S[        U 5       S35        0 n[        U 5       Hf  u  p4UR	                  5       1 Sk:X  d   eUS   U:X  d   e[
        R                  " US   5      nUS:  d  U[        U/5      :X  d   X545       eUS   X%'   Mh     [        U5      [        U 5      :X  d   e[        UR                  5       5      [        [        [        U5      5      5      :X  d   eU$ )zAReload our tokenizer JSON file and convert it to Tiktoken format.Nz(Cutting non special vocabulary to first z tokens.>   r   r    r   r   r   r   )rd   rj   rk   	enumeratekeysbase64	b64decoder  rf   rn   rg   )r:   ra   ranksr   xmerges         r   rl   rl     s   
 5zY&?UY(??&u:	!*9%EKKB3u:,hWX !E% vvx????yA~~  =!12Cx5E1#J.:
:.y ! u:U###u||~#eCJ&7"8888Lr   r   )'r   r   loggingr   	functoolsr   	itertoolsr   pathlibr   typingr   ro   &mistral_common.tokens.tokenizers.audior   r   %mistral_common.tokens.tokenizers.baser	   r
   r   r   &mistral_common.tokens.tokenizers.imager   	getLoggerr"   rj   r   r.   r   r   r+   r0   r8   rB   r@   r'   dictr  rl   r!   r   r   <module>r/     s        %     V  ?			8	$OC$J O4 O	 y 9 $	 (sY sYp !	?Tz 
%*r   