
    :i#              
          S SK r S SKJr  S SKJrJrJrJr  S SKr	S SK
Jr  S SKJrJrJr  S SKJr  S SKJrJrJrJrJrJrJrJr  S SKJrJrJrJrJ r J!r!  S S	K"J#r#  S S
K$J%r%J&r&  S SK'J(r(  S SK)J*r*  S SK+J,r,J-r-J.r.J/r/J0r0J1r1J2r2J3r3J4r4  S SK5J6r6  S SK7J8r8   " S S\.\\-\,\2\4   5      r9 " S S\9\\-\,\2\4   5      r: " S S\:\\-\,\2\4   5      r; " S S\;\\-\,\2\4   5      r< " S S\<5      r= " S S\=5      r> " S S\>5      r?g)    N)abstractmethod)AnyGenericSequenceoverload)Audio) InvalidAssistantMessageException InvalidMessageStructureExceptionTokenizerException)
FIMRequest)
AudioChunkAudioURLChunkContentChunk
ImageChunkImageURLChunk	TextChunk
ThinkChunkUserContentChunk)UATSAssistantMessageAssistantMessageTypeSystemMessageToolMessageUserMessage)InstructRequest)ToolToolCall)TranscriptionRequest)AudioEncoder)	FIMRequestTypeInstructRequestTypeInstructTokenizerSpecialTokenPolicySpecialTokens	TokenizedTokenizedType	TokenizerUserMessagePosition)ImageEncoder)
Tekkenizerc            
         ^  \ rS rSrSr  S S\S\S-  S\S-  4U 4S jjjr\	S\S-  4S	 j5       r
S\\   4S
 jr\S\S\\\4   4S j5       r\S\S\S\\   4S j5       r\S\S\S\S\\   4S j5       r\S\S\\   4S j5       rS\\\   S-     S\\   S\S\SS4
S jr\S\\   SS4S j5       rS\\\4   S\4S jr S!S\\   S\!S-  S\"4S jjr#S\\   S\"4S jr$Sr%U =r&$ )"InstructTokenizerBase3   zBase instruct tokenizer.N	tokenizerimage_encoderaudio_encoderc                 J   > Xl         X l        X0l        [        TU ]  XU5        g)zInitialize the instruct tokenizer.

Args:
    tokenizer: The tokenizer to use.
    image_encoder: The image encoder to use if any.
    audio_encoder: The audio encoder to use.
N)r.   r/   r0   super__init__selfr.   r/   r0   	__class__s       c/home/james-whalen/.local/lib/python3.13/site-packages/mistral_common/tokens/tokenizers/instruct.pyr3   InstructTokenizerBase.__init__8   s%     #**=A    returnc                     U R                   $ N)r/   r5   s    r7   
mm_encoder InstructTokenizerBase.mm_encoderJ   s    
 !!!r9   c                 0    U R                   R                  /$ )zReturn the start tokens.)r.   bos_idr=   s    r7   startInstructTokenizerBase.startQ   s    %%&&r9   requestc                     SnSn[        [        U R                  5      5       H&  u  p4[        U[        5      (       d  M  US:X  a  UnUnM(     X!4$ )zFind the first and last user message in the request.

Args:
    request: The request to search for user messages.

Returns:
    The index of the first and last user message.
)list	enumeratemessages
isinstancer   )rD   last_user_idxfirst_user_idximsgs        r7   find_first_last_user*InstructTokenizerBase.find_first_last_userU   sT     9W%5%567FA#{++!R'%&N !	 8
 ,,r9   messageis_before_last_user_messagec                     [        S5      e)zuEncode a tool message.

Raises:
    NotImplementedError: The tool message is not implemented for the base tokenizer.
zTool message not implementedNotImplementedErrorr5   rQ   rR   s      r7   encode_tool_message)InstructTokenizerBase.encode_tool_messageh   s     ""@AAr9   continue_messagec                     [        S5      e)zEncode an assistant message.

Raises:
    NotImplementedError: The assistant message is not implemented for the base tokenizer.
z!Assistant message not implementedrT   )r5   rQ   rR   rY   s       r7   encode_assistant_message.InstructTokenizerBase.encode_assistant_messageq   s     ""EFFr9   chunkc                     [        S5      e)zsEncode a think chunk.

Raises:
    NotImplementedError: The think chunk is not implemented for the base tokenizer.
zThink chunk not implementedrT   r5   r]   s     r7   encode_think"InstructTokenizerBase.encode_think|   s     ""?@@r9   	tokenizedrI   
max_tokenslast_user_message_indexc                     g r<    )r5   rb   rI   rc   rd   s        r7   _truncate_for_max_tokens.InstructTokenizerBase._truncate_for_max_tokens   s     	r9   c                     g r<   rf   clsrI   s     r7   validate_messages'InstructTokenizerBase.validate_messages   s     	r9   c           
         / n/ nSn/ nU R                  UR                  5        U R                  U5      u  pg[        UR                  5       GH  u  pUR                  (       a<  U[        UR                  5      S-
  :X  a   [        U	[        5      (       d  [        S5      e[        U	[        5      (       aR  U R                  U	UR                  X:H  X:H  UR                  SS9u  pnUR                  U5        UR                  U5        O[        U	[        5      (       a  U R                  XU:  5      n
O[        U	[        5      (       a`  UR                  =(       a    U[        UR                  5      S-
  :H  nU R!                  XU:  US9n
U[        UR                  5      S-
  :X  a  U
nO>[        U	["        5      (       a  U R%                  U	5      n
O['        S[)        U	5       35      eUR+                  U
5        GM     UR,                  b(  U R/                  UUR                  UR,                  U5        U R1                  5       nU H  nUc  M  UR                  U5        M     [3        UU R5                  U[6        R8                  S9UUUS	9$ )
ziEncode an instruct request.

Args:
    request: The request to encode.

Returns:
    The encoded tokens.
N   z?Cannot continue final message if it is not an assistant messageT)system_promptforce_img_first)rY   zUnknown message type special_token_policy)tokenstext
prefix_idsimagesaudios)rl   rI   rO   rH   continue_final_messagelenrJ   r   r
   r   encode_user_messageavailable_toolsrp   extendr   rW   r[   r   encode_system_messager   typeappendtruncate_at_max_tokensrg   rB   r%   decoder#   KEEP)r5   rD   rw   rx   rv   tokens_listrL   rK   msg_idxrN   
new_tokens
new_images
new_audiosrY   rt   toks                   r7   encode_instruct%InstructTokenizerBase.encode_instruct   sl    $& '+
.0 	w//0 )-(A(A'(J%%g&6&67LG..G$4$4 5 99"3(8996U  #{++595M5M++,-")"7"7$( 6N 62

 j)j)C--!55c];RS
C!122#*#A#A#lwRUV]VfVfRgjkRkGk !::=0CS ; 
 c'"2"23a77!+JC//!77<
(+@c)LMMz*G 8J ))5))  ..	 Cc"  V:L:Q:QR!
 	
r9   rt   rs   c                 4    U R                   R                  XS9$ )ac  Decode tokens to a string.

Args:
    tokens: The tokens to decode.
    special_token_policy: The policy to use for special tokens.
        Passing `None` will default to `self._special_token_policy` for
        [Tekkenizer][mistral_common.tokens.tokenizers.tekken.Tekkenizer] and `SpecialTokenPolicy.IGNORE`
        for [SentencePieceTokenizer][mistral_common.tokens.tokenizers.sentencepiece.SentencePieceTokenizer].
        Note that passing `None` will be deprecated and `special_token_policy` will default to
        `SpecialTokenPolicy.IGNORE` in `mistral_common=1.10.0`.

Returns:
    The decoded string.
rr   )r.   r   )r5   rt   rs   s      r7   r   InstructTokenizerBase.decode   s     ~~$$V$WWr9   c                 8    U R                   R                  U5      $ r<   )r.   
_to_string)r5   rt   s     r7   r    InstructTokenizerBase._to_string   s    ~~((00r9   )r0   r/   r.   NNr<   )'__name__
__module____qualname____firstlineno____doc__r'   r)   r   r3   propertyr>   rG   intrB   staticmethodr   tuplerO   r   r   boolrW   r   r[   r   r`   rg   classmethodr   rl   r   r%   r   r#   strr   r   __static_attributes____classcell__r6   s   @r7   r,   r,   3   s    $
 .2-1	BB $d*B $d*	B B$ "L4/ " "'tCy ' -o -%S/ - -$ B; BUY B^bcf^g B B G+GJNGbfG	cG G A* Ac A AS	D() +, 	
 "% 
 d   O
 !5t!;<O
 
O
bXT#Y X>PSW>W Xcf X"1c 1s 1 1r9   r,   c                      \ rS rSrSr  SS\S\\   S-  S\S\S\	S-  S	\S
\
\\   \\R                     \\   4   4S jjrS\S
\\   4S jr  SS\	\\   -  S\S\	S-  S	\S
\
\\   \\R                     \\   4   4
S jjrS\S\S
\\   4S jrS\S\S\S
\\   4S jrS\S
\\   4S jrS\S
\4S jrS\S
\4S jrSrg)InstructTokenizerV1   zjInstruct tokenizer V1.

This tokenizer has basic for messages. It does not support tools or image inputs.
NrQ   r|   is_lastis_firstrp   rq   r:   c                    [        UR                  [        5      (       d   S5       eU R                  b   S5       eSnU(       a  U(       a  US-   UR                  -   nOUR                  nSU S3nU R	                  USSS	9u  pnXU4$ )
a"  Encode a user message.

Args:
    message: The message to encode.
    available_tools: Not used.
    is_last: Not used.
    is_first: Whether the message is the first one.
    system_prompt: The system prompt.
    force_img_first: Not used.

Returns:
    The encoded tokens and empty list.
"Message content must be normalizedNz(InstructTokenizerV1 cannot encode images 

z[INST] z [/INST]F)contentr   rp   )rJ   r   r   r/   encode_user_content)r5   rQ   r|   r   r   rp   rq   r   message_txtcurr_tokensimageaudios               r7   r{   'InstructTokenizerV1.encode_user_message  s    , '//3//U1UU/!!)U+UU)#f,w>GooGy1$($<$<[Z_os$<$t!E5((r9   c                 F    [        SU R                  R                   35      e)Nz,System message encoding not implemented for )rU   r6   r   r5   rQ   s     r7   r~   )InstructTokenizerV1.encode_system_message%  s#    !$PQUQ_Q_QhQhPi"jkkr9   r   c                     [        U[        5      (       d   eU(       a  U(       a  US-   U-   nU R                  R                  USSS9nU/ / 4$ )zEncode a user content.

Args:
    content: The content to encode.
    is_last: Whether the message is the last one.
    system_prompt: The system prompt.
    force_img_first: Not used.

Returns:
    The encoded tokens and empty list.
r   Fboseos)rJ   r   r.   encode)r5   r   r   rp   rq   rt   s         r7   r   'InstructTokenizerV1.encode_user_content(  sR    $ '3''''}#f,w6G&&wEu&Er2~r9   rR   c                     [        S5      e)znEncode a tool message.

Raises:
    TokenizerException: The tool message is not implemented for this version.
&Tools not implemented for tokenizer V1r   rV   s      r7   rW   'InstructTokenizerV1.encode_tool_messageB  s     !!IJJr9   rY   c                 b   [        U[        5      (       d   U5       eUR                  b$  [        UR                  5      S:  a  [	        S5      eU(       a  UR
                  (       a  [        S5      eUR                  (       aL  [        UR                  [        5      (       d   S5       eU R                  R                  UR                  SSS9nO$[	        UR                   SUR                   35      eUR
                  (       d,  U(       d%  UR                  U R                  R                  5        U$ )  Encode an assistant message.

Args:
    message: The message to encode.
    is_before_last_user_message: Not used.
    continue_message: Whether to continue the message generation.
        Only use this if the assistant message is the last message.

Returns:
    The encoded tokens.
r   r   U`continue_message` is only supported for assistant messages that have `prefix=False`.z4Message content must be a string for tokenizer < V13Fr   z // )rJ   r   
tool_callsrz   r   prefixr	   r   r   r.   r   r   eos_idr5   rQ   rR   rY   r   s        r7   r[   ,InstructTokenizerV1.encode_assistant_messageJ  s     '#344=g=4)c'2D2D.E.I$%MNN2g  __goos33k5kk3..//UPU/VK$'8W=O=O<P%QRR~~&6t~~445r9   r]   c                     [        S5      e)zlEncode a think chunk.

Raises:
    TokenizerException: The think chunk is not implemented for this version.
z*Think not implemented for tokenizer < V13.r   r_   s     r7   r`    InstructTokenizerV1.encode_thinkh  s     !!MNNr9   rD   c                 F    [        SU R                  R                   35      e)zkEncode a FIM request.

Raises:
   TokenizerException: The FIM request is not implemented for this version.
zFIM not available for r   r.   versionr5   rD   s     r7   
encode_fimInstructTokenizerV1.encode_fimp  s"     !#9$..:P:P9Q!RSSr9   c                 F    [        SU R                  R                   35      e)Nz Transcription not available for r   r   s     r7   encode_transcription(InstructTokenizerV1.encode_transcriptionx  s      #CDNNDZDZC[!\]]r9   rf   NF) r   r   r   r   r   r   rG   r   r   r   r   r   npndarrayr   r{   r   r~   r   r   r   rW   r   r[   r   r`   r   r%   r   r   r   r   rf   r9   r7   r   r      s    %) %!)!) dd*!) 	!)
 !) Tz!) !) 
tCy$rzz*DK7	8!)Fl] ltCy l %) %t,--  Tz	
  
tCy$rzz*DK7	84K; KUY K^bcf^g K+JNbf	c<O* Oc OT* T T^,@ ^Y ^r9   r   c                     ^  \ rS rSrSr\R                  r  S!S\S\	S-  S\
S-  4U 4S jjjr  S"S\S	\\   S-  S
\S\S\S-  S\S\\\   \\R(                     \\   4   4S jjrS\S\4S jrS\S\\\4   4S jrS\S\S\\   4S jrS\S\\\4   4S jrS\S\\   4S jr S\S\\   4S jr!S\S\S\S\\   4S jr"S\S\\   4S jr#S\$S\%4S jr&S r'U =r($ )#InstructTokenizerV2i|  zXInstruct tokenizer V2.

This tokenizer adds supports to images, tools and FIM requests.
Nr.   r/   r0   c                   > [         TU ]  XU5        U R                  R                  [        R
                  R                  5      U l        U R                  R                  [        R                  R                  5      U l	        U R                  R                  [        R                  R                  5      U l        U R                  R                  [        R                  R                  5      U l        U R                  R                  [        R                  R                  5      U l        U R                  R                  [        R                   R                  5      U l        U R                  R                  [        R$                  R                  5      U l        U R                  R                  [        R(                  R                  5      U l        U R                  R                  [        R,                  R                  5      U l        U R                  R                  [        R0                  R                  5      U l        gInitialize the tokenizer.

Args:
    tokenizer: The tokenizer to use.
    image_encoder: The image encoder to use.
    audio_encoder: The audio encoder to use.
N)r2   r3   r.   get_control_tokenr$   
begin_instvalue
BEGIN_INSTend_instEND_INSTbegin_toolsBEGIN_AVAILABLE_TOOLS	end_toolsEND_AVAILABLE_TOOLSbegin_tool_resultsBEGIN_TOOL_RESULTSend_tool_resultsEND_TOOL_RESULTSr   
TOOL_CALLSr   BOSr   PREFIXsuffixSUFFIXr4   s       r7   r3   InstructTokenizerV2.__init__  sl    	=A..::=;S;S;Y;YZ889O9O9U9UV%)^^%E%EmF_F_FeFe%f"#'>>#C#CMD[D[DaDa#b "&.."B"B=CcCcCiCi"j $ @ @A_A_AeAe f..::=;S;S;Y;YZ>>33M4E4E4K4KLnn66}7K7K7Q7QRnn66}7K7K7Q7QRr9   rQ   r|   r   r   rp   rq   r:   c                 .   SnXt=(       a    U R                   [        R                  :H  -  nXs=(       a    U R                   [        R                  :H  -  n/ nU(       ao  U(       ah  U V	s/ s H  oR	                  5       PM     n
n	U R
                  R                  [        R                  " U
SS9SSS9nU R                  /UQU R                  PnU R                  UR                  UUUS9u  pn/ UQU R                  PnU R                  /nX-   U-   nUX4$ s  sn	f )aa  Encode a user message.

Args:
    message: The message to encode.
    available_tools: The list of available tools if any.
    is_last: Whether the message is the last one.
    is_first: Not used.
    system_prompt: The system prompt.
    force_img_first: Whether to force the image to be first.

Returns:
    The encoded tokens and the list of images.
Fensure_asciir   )r   r   rp   rq   )&_user_message_position_to_encode_toolsr(   firstlast
model_dumpr.   r   jsondumpsr   r   r   r   r   r   )r5   rQ   r|   r   r   rp   rq   do_encode_toolstools_tokenstooltoolstools_json_tokensrt   r   r   prefix_tokenssuffix_tokensr   s                     r7   r{   'InstructTokenizerV2.encode_user_message  s&   ,  r)T)TXkXqXq)qrp(S(SWjWoWo(op"$3BC?4__&?EC $ 5 5djjUZ6[aflq 5 r**" ((L  $77OO'+	  8  
u 9,88#,}<E(() Ds   'Dr   c                 h     [         R                  " U5      $ ! [         R                   a    Us $ f = fr<   )r   loadsJSONDecodeError)r5   r   s     r7   _parse_json_content'InstructTokenizerV2._parse_json_content  s0    	::g&&## 	N	s    11tool_messagec                 R    UR                   U R                  UR                  5      S.$ )z8Bit of a hack due to the way tool results are tokenized.)namer   )r  r  r   r5   r  s     r7   _prepare_tool_result(InstructTokenizerV2._prepare_tool_result  s+     !%%//0D0DE
 	
r9   rR   c                     U(       a  / $ [         R                  " U R                  U5      /SS9nU R                  /U R                  R                  USSS9QU R                  PnU$ )zEncode a tool message.

Args:
    message: The message to encode.
    is_before_last_user_message: Whether the message is before the last user message. If true, the message is
        not encoded.

Returns:
    The encoded tokens.
Fr   r   r   r   r  r   r.   r   r   r5   rQ   rR   tool_result_strr   s        r7   rW   'InstructTokenizerV2.encode_tool_message  ss     'I **d&?&?&H%IX]^##
^^""?5"I
 !!

 r9   	tool_callc                 z    UR                   R                  U R                  UR                   R                  5      S.$ )z:Bit of a hack due to the way function calls are tokenized.r  	arguments)functionr  r  r  )r5   r  s     r7   _prepare_function_call*InstructTokenizerV2._prepare_function_call  s7     &&++11)2D2D2N2NO
 	
r9   c                     UR                   (       d
   SU 35       e[        UR                   [        5      (       d   S5       eU R                  R	                  UR                   R                  S5      SSS9$ )Nz)Assistant message must have content. Got 3Message content must be a string for tokenizer < V7 Fr   )r   rJ   r   r.   r   rstripr   s     r7   (_encode_normal_content_assistant_message<InstructTokenizerV2._encode_normal_content_assistant_message  sd    U"KG9 UU'//3//f1ff/~~$$W__%;%;C%@eQV$WWr9   c                     UR                   (       d
   SU 35       e/ nUR                    H#  nUR                  U R                  U5      5        M%     [        R                  " USS9nU R
                  /U R                  R                  USSS9QnU$ )N,Assistant message must have tool calls. Got Fr   r   )r   r   r  r   r   r   r.   r   )r5   rQ   prepared_tool_callsr  tool_call_strr   s         r7   '_encode_tool_calls_in_assistant_message;InstructTokenizerV2._encode_tool_calls_in_assistant_message  s    !![%QRYQZ#[[!  ++I&&t'B'B9'MN ,

#6UKOO
^^""=e"G
 r9   rY   c                 @   UR                   (       a  UR                  (       a  [        SU 35      eU(       a  UR                  (       a  [	        S5      eUR                   (       a  U(       a  / $ U R                  U5      nOaUR                  (       a8  [        UR                  [        5      (       d   S5       eU R                  U5      nO[        SUR                   35      eUR                  (       d,  U(       d%  UR                  U R                  R                  5        U$ )a  Encode an assistant message.

Args:
    message: The message to encode.
    is_before_last_user_message: Whether the message is before the last user message. If has tools and true, the
        message is not encoded.
    continue_message: Whether to continue the message generation.
        Only use this if the assistant message is the last message.

Returns:
    The encoded tokens.
zICannot have tool calls and content defined in the same assistant message r   r  Invalid assistant message: )r   r   
ValueErrorr   r	   r  rJ   r   r  r   r   r.   r   r   s        r7   r[   ,InstructTokenizerV2.encode_assistant_message  s     '//hiphqrss2g  *	FFwOK__goos33j5jj3GGPK$'B7??BS%TUU~~&6t~~445r9   ru   c                 D    U R                   R                  SU-   SSS9SS $ )z;Remove prefix space in the case of SentencePieceTokenizers.u   ☺Fr      N)r.   r   )r5   ru   s     r7   _encode_infilling%InstructTokenizerV2._encode_infilling2  s+     ~~$$UT\u%$HLLr9   rD   c                 B   U R                   R                  UR                  SSS9nUR                  (       a  U R	                  UR                  5      O/ nU R
                  U R                  /UQU R                  PUQn[        X@R                  U[        R                  S9S9$ )zcEncode a FIM request.

Args:
    request: The request to encode.

Returns:
    The encoded tokens.
Fr   rr   )rt   ru   )r.   r   promptr   r'  r   r   r   r%   r   r#   r   )r5   rD   r   r   rt   s        r7   r   InstructTokenizerV2.encode_fim7  s     --gnn%U-SBI....w~~>VXHHKK
 
 KK	

 
 [[VhVmVm[-noor9   )
r   r   r   r   r   r   r   r   r   r   r   r   ))r   r   r   r   r   r(   r   r   r'   r)   r   r3   r   rG   r   r   r   r   r   r   r   r   r{   r   r  r   dictr  rW   r   r  r   r  r  r[   r'  r   r%   r   r   r   r   s   @r7   r   r   |  s   
 .A-E-E*
 .2-1	SS $d*S $d*	S S> %) %0)0) dd*0) 	0)
 0) Tz0) 0) 
tCy$rzz*DK7	80)d3 3 
 
c3h 
; UY ^bcf^g 0
 
T#s(^ 
X@T XY]^aYb X

?S 
X\]`Xa 
"+"JN"bf"	c"HMc Md3i M
p* p p pr9   r   c                     ^  \ rS rSrSr  SS\S\S-  S\S-  4U 4S jjjrS\	S	\
\\4   4S
 jrS\S	\
\\4   4S jrS\S\S	\\   4S jrS\S\S\S	\\   4U 4S jjr\S\\-  \-  S	\\\   SS4   4S j5       r\S\\-  S	\\\   \R<                  S4   4S j5       r\S\\ -  S	\\\   S\!4   4S j5       rS\\"-  S	\\\   \R<                  S-  \!S-  4   4S jrS\#\"   S	\\\   \\R<                     \\!   4   4S jr$  SS\\\%   -  S\S\S-  S\S	\\\   \\R<                     \\!   4   4
U 4S jjjr&Sr'U =r($ ) InstructTokenizerV3iL  zpInstruct tokenizer V3.

The only difference with V2 tokenizer is that it encodes the tool messages differently.
Nr.   r/   r0   c                 "   > [         TU ]  XUS9  g)r   )r/   r0   N)r2   r3   r4   s       r7   r3   InstructTokenizerV3.__init__T  s     	}]r9   r  r:   c                     UR                   R                  U R                  UR                   R                  5      S.nUR                  (       a  UR                  S:w  a  UR                  US'   U$ )Nr  nullid)r  r  r  r  r3  )r5   r  function_calls      r7   r  *InstructTokenizerV3._prepare_function_callc  sZ    &&++11)2D2D2N2NO

 <<ILLF2"+,,M$r9   r  c                 z    UR                   c   S5       eU R                  UR                  5      UR                   S.$ )Nz7Tool message has to have the tool call id defined in v3)r   call_id)tool_call_idr  r   r  s     r7   r  (InstructTokenizerV3._prepare_tool_resultn  sC    ((4o6oo4 //0D0DE#00
 	
r9   rQ   rR   c                     [         R                  " U R                  U5      SS9nU R                  /U R                  R                  USSS9QU R                  PnU$ )a  Encode a tool message.

Note:
    Same as [V2][mistral_common.tokens.tokenizers.instruct.InstructTokenizerV2.encode_tool_message] but tools
    are not wrapped in a list and the history is also tokenized.

Args:
    message: The message to encode.
    is_before_last_user_message: Whether the message is before the last user message. If true, the message is
        not encoded.

Returns:
    The encoded tokens.
Fr   r   r
  r  s        r7   rW   'InstructTokenizerV3.encode_tool_messagev  sf     **T%>%>w%GV[\##
^^""?5"I
 !!

 r9   rY   c                 &   > [         TU ]  USU5      $ )a  Encode an assistant message.

Note:
    Same as [V2][mistral_common.tokens.tokenizers.instruct.InstructTokenizerV2.encode_assistant_message] but
    always encode the tool history.
    continue_message: Whether to continue the message generation.
        Only use this if the assistant message is the last message.

Args:
    message: The message to encode.
    is_before_last_user_message: Not used.

Returns:
    The encoded tokens.
F)r2   r[   )r5   rQ   rR   rY   r6   s       r7   r[   ,InstructTokenizerV3.encode_assistant_message  s    $ w/@PQQr9   r]   c                     g r<   rf   r_   s     r7   _encode_content_chunk)InstructTokenizerV3._encode_content_chunk  s    jmr9   c                     g r<   rf   r_   s     r7   r?  r@    s    nqr9   c                     g r<   rf   r_   s     r7   r?  r@    s    ilr9   c                    [        U[        5      (       a  U R                  R                  USSS9S S 4$ [        U[        5      (       a(  U R                  R                  UR
                  SSS9S S 4$ [        U[        5      (       a  U R                  U5      S S 4$ [        U[        [        45      (       a>  U R                  c   S5       eU R                  U5      nUR                  UR                  S 4$ [        U[        [        45      (       a>  U R                  c   S5       eU R                  U5      nUR                  S UR                   4$ [#        SU 35      e)NFr   z+Make sure to define a image encoder at initz+Make sure to define a audio encoder at initzUnknown chunk type: )rJ   r   r.   r   r   ru   r   r`   r   r   r/   rt   r   r   r   r0   r   r#  )r5   r]   img_encodingaudio_encodings       r7   r?  r@    sE   eS!!>>((Eu(EtTQQy))>>((E(JDRVVVz**$$U+T477
M:;;%%1`3``1--e4L&&(:(:D@@
M:;;%%1`3``1!//6N!(($0D0DDD3E7;<<r9   r   c                     / n/ n/ nU HR  nU R                  U5      u  pgnUR                  U5        Ub  UR                  U5        Uc  MA  UR                  U5        MT     X#U4$ r<   )r?  r}   r   )	r5   r   rt   rw   r   r]   chunk_tokensmaybe_imagemaybe_audios	            r7   _encode_content_chunks*InstructTokenizerV3._encode_content_chunks  sp     #%E595O5OPU5V2L{MM,'&k*&[)  u$$r9   r   rp   rq   c                   > [        U[        5      (       a  [        TU ]  XU5      $ / n/ n/ n[	        U5      S:H  =(       a    [        US   [
        [        45      nU(       a  U(       a
  US   US   /nSn	U GH  n
SnU	(       a2  U(       a+  U(       a$  Sn	US-   nXPR                  R                  USSS9-  n[        U
[        [        45      (       aA  U(       a   S	[        U
5       S
35       eU R                  U
5      u  pnUR                  U5        OU[        U
[
        [        45      (       a&  U R                  U
5      u  pnUR                  U5        OU R                  U
5      S   nUR                  U5        GM     XVU4$ )  Encode a user content.

Args:
    content: The content to encode.
    is_last: Whether the message is the last one.
    system_prompt: The system prompt.
    force_img_first: Whether to force the image to be first.

Returns:
    The encoded tokens and the images.
r&  ro   r   Tr   Fr   r   zEIt is not possible that `content` is non-empty when chunk is of type .)rJ   r   r2   r   rz   r   r   r.   r   r   r   r   r?  r   r}   )r5   r   r   rp   rq   rt   rw   r   has_one_img_one_text_firstfirst_chunkr]   content_strrG  _chunk_audiochunk_imager6   s                   r7   r   'InstructTokenizerV3.encode_user_content  ss   $ gs##7.wOO#%%(\Q%6%n:gajS]_lRm;n"9qz71:.GEKw=#+f4..//E/RR%*m!<==& [\`af\g[hhij 04/I/I%/P,[)EJ#>??/3/I/I%/P,1k*#99%@CMM,'% ( u$$r9   rf   r   r   ))r   r   r   r   r   r'   r)   r   r3   r   r,  r   r   r  r   r  r   rG   r   rW   r   r[   r   r   r   r   r?  r   r   r   r   r   r   r   r   r   rJ  r   r   r   r   r   s   @r7   r.  r.  L  sN    .2-1	^^ $d*^ $d*	^ ^	 	T#s(^ 	
 
c3h 
; UY ^bcf^g .R+RJNRbfR	cR( m3?Z+GmERVWZR[]acgRgLhm mq:+Eq%PTUXPY[][e[egkPkJlq ql:+El%PTUXPY[_afPfJgl l=3+= =%S	SUS]S]`dSdfknrfrHrBs =,%-%	tCy$rzz*DK7	8%( %) %3%t,--3% 3% Tz	3%
 3% 
tCy$rzz*DK7	83% 3%r9   r.  c                     ^  \ rS rSrSr  S!S\S\S-  S\S-  SS4U 4S jjjrS	\	\	\
   S-     S
\	\   S\
S\
SS4
S jrS\S\	\
   4S jr  S"S\\	\   -  S\S\S-  S\S\\	\
   \	\R(                     \	\   4   4
U 4S jjjr  S"S\S\	\   S-  S\S\S\S-  S\S\\	\
   \	\R(                     \	\   4   4U 4S jjjrS\S\4S jr\S
\	\   SS4S j5       r\ S
\	\   S\4S j5       r!S\"S\S\	\
   4S jr#S\S\S\S\	\
   4S jr$S r%U =r&$ )#InstructTokenizerV7i  a  Instruct tokenizer V7.

The difference with V3 tokenizer is that it encodes the system prompts differently:
- in V7 the system prompts are treated as separate SystemMessages
- they are no longer prepended to the last user message
- they are printed between special tokens

Nr.   r/   r0   r:   c                   > [         TU ]  XU5        U R                  R                  [        R
                  R                  5      U l        U R                  R                  [        R                  R                  5      U l	        U R                  R                  [        R                  R                  5      U l        SU l        Ub9  U R                  R                  [        R                  R                  5      U l        ggr   )r2   r3   r.   r   r$   begin_systemr   BEGIN_SYSTEM
end_system
END_SYSTEMbegin_tool_contentBEGIN_TOOL_CONTENT
TRANSCRIBE
transcriber4   s       r7   r3   InstructTokenizerV7.__init__  s     	=A NN<<]=W=W=]=]^..::=;S;S;Y;YZ"&.."B"B=CcCcCiCi"j$"nn>>}?W?W?]?]^DO %r9   tokenized_messagesrI   rc   rd   c                   ^^^^ [        S T 5       5      U-
  mS[        SS 4UUUU4S jjnSnTS:  a  U[        T5      :  a  U" U5        US-  n[        TUS-
     [        5      (       a]  U[        T5      :  aN  [        TU   [        5      (       d6  U" U5        US-  nU[        T5      :  a  [        TU   [        5      (       d  M6  TS:  a  U[        T5      :  a  M  TS:  a  [        S5      eg )Nc              3   @   #    U  H  oc  M  [        U5      v   M     g 7fr<   )rz   ).0ts     r7   	<genexpr>?InstructTokenizerV7._truncate_for_max_tokens.<locals>.<genexpr>0  s     J&8fc!ff&8s   idxr:   c                    > [        TU    [        5      (       a  g U T:X  a  g TU    nUc   eT[        U5      -  mS TU '   g r<   )rJ   r   rz   )ri  r   rd   rI   to_droprb  s     r7   drop:InstructTokenizerV7._truncate_for_max_tokens.<locals>.drop2  sP    (3-77--$S)C?"?s3xG&*s#r9   r   ro   z+Input couldn't fit in truncate_at_max_token)sumr   rz   rJ   r   r   )r5   rb  rI   rc   rd   rl  current_idxrk  s    `` `  @r7   rg   ,InstructTokenizerV7._truncate_for_max_tokens$  s     J&8JJZW	+c 	+d 	+ 	+ kkCM91K(;?3[AA "CM1*XkEZ\g:h:h%1$K "CM1*XkEZ\g:h:h kkCM9 Q;$%RSS r9   rQ   c                     U R                   /n[        UR                  =n[        5      (       a
  [	        US9/nX R                  U5      S   -  nUR                  U R                  5        U$ )zfEncode a system message.

Args:
    message: The message to encode.

Returns:
    The encoded tokens.
)ru   r   )rZ  rJ   r   r   r   rJ  r   r\  )r5   rQ   rt   r   s       r7   r~   )InstructTokenizerV7.encode_system_messageM  sb     ##$0g#66 g./G--g6q99doo&r9   r   r   rp   rq   c                   > Ub   S5       e[        U[        5      (       a  [        T	U ]  XU5      $ [	        U5      S:H  =(       a    [        US   [
        [        45      nU(       a  U(       a
  US   US   /nU R                  U5      u  pgnXgU4$ )rM  ?in Tokenizer V7 we don't encode system prompts in user messagesr&  ro   r   )rJ   r   r2   r   rz   r   r   rJ  )
r5   r   r   rp   rq   rO  rt   rw   r   r6   s
            r7   r   'InstructTokenizerV7.encode_user_content^  s    $ $g&gg$gs##7.wOO%(\Q%6%n:gajS]_lRm;n"9qz71:.G $ ; ;G Du$$r9   r|   r   c           	      J   > Ub   S5       e[         T
U ]  UUUUSUS9u  pxn	XxU	4$ )at  Encode a user message.

Args:
    message: The message to encode.
    available_tools: The list of available tools if any.
    is_last: Whether the message is the last one.
    is_first: Whether the message is the first one.
    system_prompt: Not used.
    force_img_first: Whether to force the image to be first.

Returns:
    The encoded tokens and the list of images.
Nrt  )r   r   rp   rq   )r2   r{   )r5   rQ   r|   r   r   rp   rq   rt   rw   r   r6   s             r7   r{   'InstructTokenizerV7.encode_user_message}  sO    , $g&gg$ % ;+ !< !
 u$$r9   rD   c                    U R                   c   U R                  R                   S35       eU R                  5       nU R	                  [        [        UR                  S9/S9/ SSSS9u  p4n/ UQUQnUR                  b,  SUR                   3nX0R                  R                  USSS	9-  nUR                  U R                   5        [        X0R                  R                  U5      US
9$ )a  
Encodes an audio transcription request into a tokenized format.

This method processes a transcription request containing audio data,
encodes the user message, and returns the tokenized output.

Args:
    request: The transcription request object containing
        the audio data to be encoded.

Returns:
    Tokenized: The tokenized representation of the audio data, including processed audio and tokens
Nz! needs to have a TRANSCRIBE token)input_audio)r   T)r|   r   r   rp   zlang:Fr   )rt   ru   rx   )r_  r6   r   rB   r{   r   r   r   languager.   r   r   r%   r   )r5   rD   r   rt   rR  r   language_strings          r7   r   (InstructTokenizerV7.encode_transcription  s     *it~~/F/F.GGh,ii*33!F GH 4 
5 $6#F#' %g&6&6%78Onn++OE+RRFdoo&^^-F-Fv-NW\]]r9   c                 v    U R                  U5      (       a#  [        S U 5       5      (       a  [        S5      eg g )Nc              3   B   #    U  H  n[        U[        5      v   M     g 7fr<   )rJ   r   re  rQ   s     r7   rg  8InstructTokenizerV7.validate_messages.<locals>.<genexpr>  s     NX':g}55X   z9System messages are not yet allowed when audio is present)
_has_audioanyr#  rj   s     r7   rl   %InstructTokenizerV7.validate_messages  s9    >>(##NXNNN !\]] O $r9   c                 &    [        S U  5       5      $ )Nc              3      #    U  HY  n[        U[        5      =(       a=    [        UR                  [        5      =(       a    [	        S  UR                   5       5      v   M[     g7f)c              3   B   #    U  H  n[        U[        5      v   M     g 7fr<   )rJ   r   )re  r]   s     r7   rg  ;InstructTokenizerV7._has_audio.<locals>.<genexpr>.<genexpr>  s     OeJuj11r  N)rJ   r   r   rG   r  r  s     r7   rg  1InstructTokenizerV7._has_audio.<locals>.<genexpr>  sT      
 $ w, P7??D1POwOOP $s   A!A#)r  )rI   s    r7   r  InstructTokenizerV7._has_audio  s     
 $	
 
 	
r9   rR   c                 ^   UR                   c   e[        UR                  [        5      (       d   S5       eU R                  R                  UR                   SSS9nU R                  R                  UR                  SSS9nU R                  /UQU R                  Pn/ UQUQU R                  PnU$ )a@  Encode a tool message.

Note:
    Same as [V3][mistral_common.tokens.tokenizers.instruct.InstructTokenizerV3.encode_tool_message]
    but tools are not wrapped in a list and history is also tokenized

Args:
    message: The message to encode.
    is_before_last_user_message: Not used.

Returns:
    The encoded tokens.
r   Fr   )	r8  rJ   r   r   r.   r   r   r^  r   )r5   rQ   rR   tool_call_id_tokensrt   r   r   s          r7   rW   'InstructTokenizerV7.encode_tool_message  s     ##///'//3//U1UU/"nn33G4H4HeY^3_&&wEu&M ##
 
 ##




 !!

 r9   rY   c                 t   UR                   (       d  UR                  (       d  [        SU 35      eU(       a  UR                  (       a  [	        S5      e/ nUR                   (       ap  [        UR                   [        5      (       a  U R                  U5      nO?[        UR                   [        5      (       a   X@R                  UR                   5      S   -  nUR                  (       a  X@R                  U5      -  nUR                  (       d,  U(       d%  UR                  U R                  R                  5        U$ )r   r"  r   r   )r   r   r   r   r	   rJ   r   r  rG   rJ  r  r   r.   r   r   s        r7   r[   ,InstructTokenizerV7.encode_assistant_message  s     w'9'9$'B7)%LMM2g  ??'//3//"KKGTGOOT22::7??KANNGGPPK~~&6t~~445r9   )rZ  r^  r\  r_  r   r   )'r   r   r   r   r   r'   r)   r   r3   rG   r   r   rg   r   r~   r   r   r   r   r   r   r   r   r   r   r{   r   r%   r   r   r   rl   r   r  r   rW   r[   r   r   r   s   @r7   rW  rW    s_    .2-1	__ $d*_ $d*	_
 
_ _.'T cT!12'T +,'T 	'T
 "%'T 
'TR] tCy * %) %%t,--% % Tz	%
 % 
tCy$rzz*DK7	8% %J %) %!%!% dd*!% 	!%
 !% Tz!% !% 
tCy$rzz*DK7	8!% !%F^,@ ^Y ^B ^d ^ ^ ^
 
T$Z 
D 
 
; UY ^bcf^g > + JN bf 	c   r9   rW  c            	       l   ^  \ rS rSrSr  SS\S\S-  S\S-  SS4U 4S jjjrS	\	S\
\   4S
 jrSrU =r$ )InstructTokenizerV11i  zInstruct tokenizer V11.

The difference with V7 tokenizer is that it encodes tool calls differently:
Tool call results are encoded as :
- [begin tool call] call_name_tokens [call id] call_id_tokens [args] content tokens
Nr.   r/   r0   r:   c                   > [         TU ]  XU5        U R                  R                  [        R
                  R                  5      U l        U R                  R                  [        R                  R                  5      U l	        g r<   )
r2   r3   r.   r   r$   argsr   ARGSr7  CALL_IDr4   s       r7   r3   InstructTokenizerV11.__init__  sX     	=ANN44]5G5G5M5MN	~~778M8M8S8STr9   rQ   c           
         UR                   (       d
   SU 35       e/ nUR                    H  nU R                  U5      n/ nSU;   a+  U R                  /U R                  R	                  US   SSS9QnUU R
                  /U R                  R	                  US   SSS9QUQU R                  PU R                  R	                  [        R                  " US   SS9SSS9Q-  nM     U$ )Nr  r3  Fr   r  r  r   )	r   r  r  r.   r   r   r  r   r   )r5   rQ   r   r  preparedidss         r7   r  <InstructTokenizerV11._encode_tool_calls_in_assistant_message$  s   !![%QRYQZ#[[! ++I229=HCx||bdnn&;&;HTNPU[`&;&ab&&x'7U&N  			
 &&tzz(;2GV['\bgmr&s K , r9   )r  r  r   )r   r   r   r   r   r'   r)   r   r3   r   rG   r   r  r   r   r   s   @r7   r  r    sp     .2-1	UU $d*U $d*	U
 
U U?S X\]`Xa  r9   r  c            	          ^  \ rS rSrSr\R                  r  SS\S\	S-  S\
S-  SS4U 4S jjjrS	\S\\   4S
 jrS	\S\S\\   4S jrS\S\\   4S jrSrU =r$ )InstructTokenizerV13i8  zInstruct tokenizer V13.

The difference with V11 tokenizer is that it encodes tool calls differently:
    - available tools are tokenized at the first user message.
    - call id is no longer tokenized for tool calls or results.
Nr.   r/   r0   r:   c                   > [         TU ]  XU5        [        U[        5      (       d   S[	        U5       35       e[
        R                  R                  UR                  ;   a  [
        R                  R                  UR                  ;   a]  UR                  [
        R                  R                  5      U l        UR                  [
        R                  R                  5      U l        g S U l        S U l        g )Nz$Tokenizer must be a Tekkenizer. Got )r2   r3   rJ   r*   r   r$   begin_thinkr   _special_tokens_reverse_vocab	end_thinkr   BEGIN_THINK	END_THINKr4   s       r7   r3   InstructTokenizerV13.__init__B  s     	=A)Z00j4XY]^gYhXi2jj0%%++y/V/VV''--1X1XX+4+F+F}G`G`GfGf+gD)2)D)D]E\E\EbEb)cDN#D!DNr9   rQ   c           
         UR                   (       d
   SU 35       e/ nUR                    H  nUR                  (       a  UR                  S:w  d   eU R                  U5      nUU R                  /U R                  R                  US   SSS9QU R                  PU R                  R                  [        R                  " US   SS9SSS9Q-  nM     U$ )Nr  r2  r  Fr   r  r   )	r   r3  r  r   r.   r   r  r   r   )r5   rQ   r   r  r  s        r7   r  <InstructTokenizerV13._encode_tool_calls_in_assistant_messageT  s    !![%QRYQZ#[[! ++I<<ILLF$:::229=H&&x'7U&N 		 &&tzz(;2GV['\bgmr&s	 K	 , r9   rR   c                     UR                   c   S5       eU R                  R                  UR                  SSS9nU R                  /UQU R
                  PnU$ )zEncode a tool message.

Args:
    message: The message to encode.
    is_before_last_user_message: Not used.
Returns:
    The encoded tokens.
z2Tool call id must be provided for tokenizer >= v13Fr   )r8  r.   r   r   r   r   )r5   rQ   rR   rt   r   s        r7   rW   (InstructTokenizerV13.encode_tool_messagec  sk     ##/e1ee/&&wEu&M##

 !!

 r9   r]   c                    U R                   c   S5       eU R                  c   S5       eU R                  R                  UR                  SSS9nU R                   /UQnUR
                  (       a  UR                  U R                  5        U$ )zjEncode a thinking chunk.

Args:
    chunk: The thinking chunk to encode.
Returns:
    The encoded tokens.
z2think tokens are not available for this tokenizer.Fr   )r  r  r.   r   thinkingclosedr   )r5   r]   rt   think_tokenss       r7   r`   !InstructTokenizerV13.encode_thinkv  s     +a-aa+~~)_+__)&&u~~5e&L((262<</r9   )r  r  r   )r   r   r   r   r   r(   r   r   r'   r)   r   r3   r   rG   r   r  r   r   rW   r   r`   r   r   r   s   @r7   r  r  8  s     .A-F-F*
 .2-1	"" $d*" $d*	"
 
" "$?S X\]`Xa ; UY ^bcf^g &* c  r9   r  )@r   abcr   typingr   r   r   r   numpyr   mistral_common.audior   mistral_common.exceptionsr	   r
   r   #mistral_common.protocol.fim.requestr   &mistral_common.protocol.instruct.chunkr   r   r   r   r   r   r   r   )mistral_common.protocol.instruct.messagesr   r   r   r   r   r   (mistral_common.protocol.instruct.requestr   +mistral_common.protocol.instruct.tool_callsr   r   -mistral_common.protocol.transcription.requestr   &mistral_common.tokens.tokenizers.audior   %mistral_common.tokens.tokenizers.baser    r!   r"   r#   r$   r%   r&   r'   r(   &mistral_common.tokens.tokenizers.imager)   'mistral_common.tokens.tokenizers.tekkenr*   r,   r   r   r.  rW  r  r  rf   r9   r7   <module>r     s     3 3  & 
 ;	 	 	  E F N ?
 
 
 @ >D1w2NMSgghD1N^7#6Wk#kl^DMp!4nmUi!ijMp`t%!4nmUi!ijt%nL- L^#. #LL/ Lr9   