
    :i5              	          S SK r S SKJrJr  S SKJr  S SKJr  S SKJ	r	J
r
  S SKrS SKJrJr  S SKJr  S SKJr  S S	KJr  S S
KJr  S SKJrJr  S SKJr  S SKJr  S SKJ r   S SK!J"r"  S SK#J$r$   " S S\%\5      r& " S S\%\5      r' " S S\(\5      r) " S S\%\5      r* " S S\5      r+ " S S\5      r,\
" S\S9r-\
" S\S9r.\
" S \+S9r/ " S! S"\	\-\.\/\4   5      r0g)#    N)ABCabstractmethod)Enum)Path)GenericTypeVar)
ConfigDictField)Audio)MistralBase)
FIMRequest)UserContentChunk)AssistantMessageTypeUserMessage)InstructRequest)Tool)TranscriptionRequest)AudioEncoder)ImageEncoderc                        \ rS rSrSrSrSrSrg)UserMessagePosition   zWhere to encode available toolsfirstlast N)__name__
__module____qualname____firstlineno____doc__r   r   __static_attributes__r       _/home/james-whalen/.local/lib/python3.13/site-packages/mistral_common/tokens/tokenizers/base.pyr   r      s    )EDr"   r   c                       \ rS rSrSrSrSrSrSrSr	Sr
S	rS
rSrSrSrSrSrSrSrSrSrSrSrSrSrSrSrSrSrSrSrSr g) SpecialTokens    a  Enum of special tokens used in the tokenizer.

Attributes:
    unk: The unknown token.
    bos: The beginning of string token.
    eos: The end of string token.
    begin_inst: The beginning of instruction token.
    end_inst: The end of instruction token.
    begin_tools: The beginning of tools token.
    end_tools: The end of tools token.
    begin_tool_results: The beginning of tool results token.
    end_tool_results: The end of tool results token.
    tool_calls: The tool calls token.
    img: The image token.
    pad: The pad token.
    img_break: The image break token.
    img_end: The image end token.
    prefix: The prefix token for FIM.
    middle: The middle token for FIM.
    suffix: The suffix token for FIM.
    begin_system: The beginning of system prompt token.
    end_system: The end of system prompt token.
    begin_tool_content: The beginning of tool content token.
    args: The args token.
    call_id: The call id token.
    audio: The audio token.
    begin_audio: The beginning of audio token.
    transcribe: The transcribe token.
    begin_think: The beginning of think token.
    end_think: The end of think token.

Examples:
    >>> unk = SpecialTokens.unk
z<unk>z<s>z</s>z[INST]z[/INST]z[AVAILABLE_TOOLS]z[/AVAILABLE_TOOLS]z[TOOL_RESULTS]z[/TOOL_RESULTS]z[TOOL_CALLS]z[IMG]z<pad>z[IMG_BREAK]z	[IMG_END]z[PREFIX]z[MIDDLE]z[SUFFIX]z[SYSTEM_PROMPT]z[/SYSTEM_PROMPT]z[TOOL_CONTENT]z[ARGS]z	[CALL_ID]z[AUDIO]z[BEGIN_AUDIO]z[TRANSCRIBE]z[THINK]z[/THINK]r   N)!r   r   r   r   r    unkboseos
begin_instend_instbegin_tools	end_toolsbegin_tool_resultsend_tool_results
tool_callsimgpad	img_breakimg_endprefixmiddlesuffixbegin_system
end_systembegin_tool_contentargscall_idaudiobegin_audio
transcribebegin_think	end_thinkr!   r   r"   r#   r%   r%       s    !F C
C
CJH%K$I)(J
C
CIGFFF$L#J)DGE!KJKIr"   r%   c                   $    \ rS rSrSrSrSrSrSrg)SpecialTokenPolicya   zWhat to do with special tokens when encoding/decoding.

Attributes:
    IGNORE: Ignore special tokens.
    KEEP: Keep special tokens.
    RAISE: Raise an error if special tokens are found.
r         r   N)	r   r   r   r   r    IGNOREKEEPRAISEr!   r   r"   r#   rC   rC   a   s     FDEr"   rC   c                       \ rS rSrSrS\SS 4S jr\S\4S j5       r	SSS\
4S	 jrSSS\
4S
 jrSSS\
4S jrSSS\
4S jrSrSrSrSrSrSrSrg)TokenizerVersiono   a  Enum of tokenizer versions.

Allow to distinguish between different versions of the tokenizer and maintain backward compatibility.

Attributes:
    v1: The first version of the tokenizer.
    v2: The second version of the tokenizer that includes special control tokens [INST], [\INST].
    v3: The third version of the tokenizer that includes improved function calling.
    v7: The seventh version of the tokenizer that includes improved system prompt and function calling.
    v11: The eleventh version of the tokenizer that includes improved function calling.
    v13: The thirteenth version of the tokenizer that includes no call id tokenization and better prompt caching.

Examples:
    >>> version = TokenizerVersion.v1
valuereturnc                     [         R                  " SU5      (       d  [        SU S35      e[        R	                  X5      nXl        U$ )Nz^v\d+$zInvalid version format: z#. Must be 'v' followed by a number.)rematch
ValueErrorstr__new___value_)clsrM   objs      r#   rT   TokenizerVersion.__new__   sA    xx	5))7w>abcckk#%
r"   c                 2    [        U R                  SS  5      $ )NrE   )intrM   selfs    r#   _version_numTokenizerVersion._version_num   s    4::ab>""r"   otherzstr | TokenizerVersionc                 t    [        U[        5      (       a  [        U5      nU R                  UR                  :  $ N
isinstancerS   rK   r]   r\   r_   s     r#   __lt__TokenizerVersion.__lt__   s0    eS!!$U+E  5#5#555r"   c                 v    [        U[        5      (       a$  [        U5      nU R                  UR                  :*  $ g ra   rb   rd   s     r#   __le__TokenizerVersion.__le__   5    eS!!$U+E$$(:(::: "r"   c                 v    [        U[        5      (       a$  [        U5      nU R                  UR                  :  $ g ra   rb   rd   s     r#   __gt__TokenizerVersion.__gt__   s5    eS!!$U+E$$u'9'999 "r"   c                 v    [        U[        5      (       a$  [        U5      nU R                  UR                  :  $ g ra   rb   rd   s     r#   __ge__TokenizerVersion.__ge__   rj   r"   v1v2v3v7v11v13r   N)r   r   r   r   r    rS   rT   propertyrZ   r]   boolre   rh   rl   ro   rq   rr   rs   rt   ru   rv   r!   r   r"   r#   rK   rK   o   s     C $6  #c # #64 6 6
;4 ; ;
:4 : :
;4 ; ;
 
B	B	B	B
C
Cr"   rK   c                       \ rS rSr% Sr\" SS9r\\   \	S'   Sr
\S-  \	S'   Sr\\   S-  \	S'   \" \S	9r\\R                      \	S
'   \" \S	9r\\   \	S'   Srg)	Tokenized   an  A tokenized [`InstructRequest`][mistral_common.tokens.instruct.request].

Attributes:
    tokens: The token ids.
    text: The text representation of the tokens.
    prefix_ids: The prefix ids for FIM.
    images: The loaded images associated with the tokens.

Examples:
    >>> tokenized = Tokenized(tokens=[1, 2, 3], text="Hello world", prefix_ids=[1], images=[])
T)arbitrary_types_allowedtokensNtext
prefix_ids)default_factoryimagesaudiosr   )r   r   r   r   r    r	   model_configlistrZ   __annotations__r~   rS   r   r
   r   npndarrayr   r   r!   r   r"   r#   rz   rz      si    
 d;LID#*#'JS	D '$T:FD:5FDK5r"   rz   c            
          \ rS rSr\\S\4S j5       5       r\S\\	   4S j5       r
\S\S\	4S j5       r\\S\4S j5       5       r\\S\4S j5       5       r\\S\4S	 j5       5       r\\S\4S
 j5       5       r\S\	S\S\S\\   4S j5       r\SS\\   S\S-  S\	4S jj5       r\S\	S\4S j5       r\\S\4S j5       5       r\S\\   S\	4S j5       r\S\\   S\	4S j5       r\\S\4S j5       5       rSrg)	Tokenizer   rN   c                     g)z!Vocabulary size of the tokenizer.Nr   r[   s    r#   n_wordsTokenizer.n_words       r"   c                     g)z(All tokens in the vocabulary as strings.Nr   r[   s    r#   vocabTokenizer.vocab   r   r"   token_idc                     g)z$Convert a token id to the token str.Nr   )r\   r   s     r#   id_to_pieceTokenizer.id_to_piece   r   r"   c                     g)z$id of the Beginning of String token.Nr   r[   s    r#   bos_idTokenizer.bos_id   r   r"   c                     g)zid of the End of String token.Nr   r[   s    r#   eos_idTokenizer.eos_id   r   r"   c                     g)zid of the Pad token.Nr   r[   s    r#   pad_idTokenizer.pad_id   r   r"   c                     g)zid of the Unk token.Nr   r[   s    r#   unk_idTokenizer.unk_id   r   r"   sr(   r)   c                     g)z(Convert a string to a list of token ids.Nr   )r\   r   r(   r)   s       r#   encodeTokenizer.encode   r   r"   Nr}   special_token_policyc                     g)am  Decode the token ids to a string.

Args:
    tokens: The token ids to decode.
    special_token_policy: The policy to use for special tokens.
        Passing `None` will default to `self._special_token_policy` for
        [Tekkenizer][mistral_common.tokens.tokenizers.tekken.Tekkenizer] and `SpecialTokenPolicy.IGNORE`
        for [SentencePieceTokenizer][mistral_common.tokens.tokenizers.sentencepiece.SentencePieceTokenizer].
        Note that passing `None` will be deprecated and `special_token_policy` will default to
        `SpecialTokenPolicy.IGNORE` in `mistral_common=1.10.0`.

Returns:
    The decoded string.
Nr   r\   r}   r   s      r#   decodeTokenizer.decode   r   r"   c                     g)zGet the id of a control token.Nr   )r\   r   s     r#   get_control_tokenTokenizer.get_control_token   r   r"   c                     g)z!Get the version of the tokenizer.Nr   r[   s    r#   versionTokenizer.version   r   r"   c                     g)z[DEPRECATED] Converts a list of token ids into a string, keeping special tokens.

Use `decode` with `special_token_policy=SpecialTokenPolicy.KEEP` instead.

This is a convenient method for debugging.
Nr   r\   r}   s     r#   	to_stringTokenizer.to_string   s     	r"   c                     g ra   r   r   s     r#   
_to_stringTokenizer._to_string      47r"   c                     g)zThe file path of the tokenizer.Nr   r[   s    r#   	file_pathTokenizer.file_path	  s     	r"   r   ra   )r   r   r   r   rw   r   rZ   r   r   rS   r   r   r   r   r   r   rx   r   rC   r   r   rK   r   r   r   r   r   r!   r   r"   r#   r   r      s   1 1  1 8tCy 8 8 4C 4C 4 4 4 4  4 . .  . $ $  $ $ $  $ 7 7$ 7T 7d3i 7 7 T#Y >PSW>W cf    .3 .3 . . 1) 1  1 S	 c   7c7s7 74   r"   r   InstructRequestType)boundFIMRequestTypeTokenizedTypec                       \ rS rSr% Sr\\S'   \S-  \S'   \S-  \S'   S\S\S-  S\S-  SS4S jr	\
S	\S\4S
 j5       r\
S	\S\4S j5       r\
SS\\   S\S-  S\4S jj5       r\
S	\S\4S j5       r\
  SS\S\\   S-  S\S\S\S-  S\S\\\   \\R8                     \\   4   4S jj5       r\
  SS\\\   -  S\S\S-  S\S\\\   \\R8                     \\   4   4
S jj5       r \
S\\   S\4S j5       r!Sr"g)InstructTokenizeri  zBase class for instruct tokenizers.

Attributes:
    tokenizer: The tokenizer to use.
    image_encoder: The image encoder to use if any.
	tokenizerNimage_encoderaudio_encoderrN   c                     g)zInitialize the instruct tokenizer.

Args:
    tokenizer: The tokenizer to use.
    image_encoder: The image encoder to use if any.
    audio_encoder: The audio encoder to use if any.
Nr   )r\   r   r   r   s       r#   __init__InstructTokenizer.__init__!  r   r"   requestc                     g)zInstruct request to Tokenized object

Args:
    request: The instruct request to encode.

Returns:
    The tokenized instruct request.
Nr   r\   r   s     r#   encode_instruct!InstructTokenizer.encode_instruct,  r   r"   c                     g)a  
Encodes an audio transcription request into a tokenized format.

This method processes a transcription request containing audio data,
encodes the user message, and returns the tokenized output.

Args:
    request: The transcription request object containing
        the audio data to be encoded.

Returns:
    Tokenized: The tokenized representation of the audio data, including processed audio and tokens
Nr   r   s     r#   encode_transcription&InstructTokenizer.encode_transcription7  s     	r"   r}   r   c                     g)ag  Convert token ids to string

Args:
    tokens: The token ids to decode.
    special_token_policy: The policy to use for special tokens.
        Passing `None` will default to `self._special_token_policy` for
        [Tekkenizer][mistral_common.tokens.tokenizers.tekken.Tekkenizer] and `SpecialTokenPolicy.IGNORE`
        for [SentencePieceTokenizer][mistral_common.tokens.tokenizers.sentencepiece.SentencePieceTokenizer].
        Note that passing `None` will be deprecated and `special_token_policy` will default to
        `SpecialTokenPolicy.IGNORE` in `mistral_common=1.10.0`.

Returns:
    The decoded string.
Nr   r   s      r#   r   InstructTokenizer.decodeH  r   r"   c                     g)zxFIM request to Tokenized object

Args:
    request: The FIM request to encode.

Returns:
    The tokenized FIM request.
Nr   r   s     r#   
encode_fimInstructTokenizer.encode_fimY  r   r"   messageavailable_toolsis_lastis_firstsystem_promptforce_img_firstc                     g)ag  Encode a user message.

Args:
    message: The user message to encode.
    available_tools: The available tools.
    is_last: Whether the message is the last one.
    is_first: Whether the message is the first one.
    system_prompt: The system prompt.
    force_img_first: Whether to force the image to be first.

Returns:
    The encoded tokens and images.
Nr   )r\   r   r   r   r   r   r   s          r#   encode_user_message%InstructTokenizer.encode_user_messaged  s    . 	r"   contentc                     g)a	  Encode a user content.

Args:
    content: The user content to encode.
    is_last: Whether the content is the last one.
    system_prompt: The system prompt.
    force_img_first: Whether to force the image to be first.

Returns:
    The encoded tokens and images.
Nr   )r\   r   r   r   r   s        r#   encode_user_content%InstructTokenizer.encode_user_content}  s    & 	r"   c                     g ra   r   r   s     r#   r   InstructTokenizer._to_string  r   r"   r   ra   )NF)#r   r   r   r   r    r   r   r   r   r   r   r   r   r   r   r   r   rZ   rC   rS   r   r   r   r   r   rx   tupler   r   r   r   r   r   r   r!   r   r"   r#   r   r     s    $&&$&&	"	3?$3F	WcfjWj			 ': }   ,@ ]    T#Y >PSW>W cf    . ]    %) % dd* 	
  Tz  
tCy$rzz*DK7	8 0 
 %) %t,--  Tz	
  
tCy$rzz*DK7	8 ( 7c7s7 7r"   r   )1rP   abcr   r   enumr   pathlibr   typingr   r   numpyr   pydanticr	   r
   mistral_common.audior   mistral_common.baser   #mistral_common.protocol.fim.requestr   &mistral_common.protocol.instruct.chunkr   )mistral_common.protocol.instruct.messagesr   r   (mistral_common.protocol.instruct.requestr   +mistral_common.protocol.instruct.tool_callsr   -mistral_common.protocol.transcription.requestr   &mistral_common.tokens.tokenizers.audior   &mistral_common.tokens.tokenizers.imager   rS   r   r%   rZ   rC   rK   rz   r   r   r   r   r   r   r"   r#   <module>r      s    	 #   #  & & + : C E < N ? ?#t >C >Bd 5sD 5p6 6*Q Qh 3?K )<y9~8 3^]Th hi ~8r"   