
    :iG                        % S SK r S SKJr  S SKJrJrJr  S SKJr  S SK	J
r
  S SKJrJrJrJrJr  S SKJrJr  S SKJr  S S	KJrJrJrJrJr  S S
KJr  S SKJrJ r J!r!  S SK"J#r#J$r$J%r%J&r&J'r'J(r(J)r)  S SK*J+r+J,r,J-r-  S SK.J/r/J0r0J1r1J2r2J3r3J4r4  S SK5J6r6J7r7J8r8  S SK9J:r:J;r;  S SK<J=r=  S\+S\:\6-  S\,4S jr>S\S\:S\ 4S jr? " S S\\\\\\(4   5      r@0 SS _S\@R                  _SS _S\@R                  _S \@R                  _S!\@R                  _S"\@R                  _S#S$ _S%\@R                  _S&\@R                  _S'\@R                  _S(\@R                  _S)S* _S+\@R                  _S,\@R                  _S-S. _S/\@R                  _\@R                  \@R                  \@R                  \@R                  \@R                  \@R                  \@R                  \@R                  S0 S1 S2 S3.ErE\F\G\/ \@4   4   \HS4'   g)5    N)Path)AnyCallableGeneric)TokenizerException)
FIMRequest)UATSAssistantMessageTypeSystemMessageTypeToolMessageTypeUserMessageType)InstructRequestNormalizer normalizer_for_tokenizer_version)ChatCompletionRequest)MistralRequestValidatorMistralRequestValidatorV3MistralRequestValidatorV5MistralRequestValidatorV13ValidationMode)TranscriptionRequest)AudioConfigAudioEncoderSpecialAudioIDs)InstructRequestInstructRequestTypeInstructTokenizerSpecialTokenPolicySpecialTokensTokenizedTypeTokenizerVersion)ImageConfigImageEncoderSpecialImageIDs)InstructTokenizerV1InstructTokenizerV2InstructTokenizerV3InstructTokenizerV7InstructTokenizerV11InstructTokenizerV13)SentencePieceTokenizerget_image_configis_sentencepiece)
Tekkenizer	is_tekken)download_tokenizer_from_hf_hubimage_config	tokenizerreturnc                    [        UR                  [        R                  R                  5      UR                  [        R
                  R                  5      UR                  [        R                  R                  5      S9n[        X5      $ )zLoad a image encoder from a config and a tokenizer.

Args:
    image_config: The image config.
    tokenizer: The tokenizer.

Returns:
    The image encoder.
)img	img_breakimg_end)r#   get_control_tokenr   r4   valuer5   r6   r"   )r0   r1   special_idss      b/home/james-whalen/.local/lib/python3.13/site-packages/mistral_common/tokens/tokenizers/mistral.pyload_image_encoderr;   :   si     "''(9(9(?(?@--m.E.E.K.KL++M,A,A,G,GHK
 22    audio_configc                     [        UR                  [        R                  R                  5      UR                  [        R
                  R                  5      S9n[        X5      $ )zLoad a audio encoder from a config and a tokenizer.

Args:
    audio_config: The audio config.
    tokenizer: The tokenizer.

Returns:
    The audio encoder.
)audiobegin_audio)r   r7   r   r?   r8   r@   r   )r=   r1   r9   s      r:   load_audio_encoderrA   L   sO     "))-*=*=*C*CD//0I0I0O0OPK 22r<   c                   R   \ rS rSrSrS\\\\\	4   S\
\\	\\4   S\\\	\\\4   4S jrS\\\\S4   4   4S	 jr\S\4S
 j5       r\S)S j5       r\S)S j5       r\S*S\S\SS 4S jj5       r\S+S\SS 4S jj5       r\S+S\S\SS 4S jj5       r\SSSS\ RB                  4S\S\\-  S-  S\S-  S\S\S\ SS 4S jj5       r"\\ RB                  4S\\-  S\ SS 4S jj5       r# S,S\$\%   S \&S-  S\4S! jjr'S\(S\4S" jr)S\S\4S# jr*S,S$\+\&   S%\,S-  S\4S& jjr-S$\+\&   S\4S' jr.S(r/g)-MistralTokenizer]   aG  Mistral tokenizer.

This class is a wrapper around a [InstructTokenizer][mistral_common.tokens.tokenizers.base.InstructTokenizer],
a [MistralRequestValidator][mistral_common.protocol.instruct.validator.MistralRequestValidator] and a
[InstructRequestNormalizer][mistral_common.protocol.instruct.normalize.InstructRequestNormalizer].

It provides a convenient interface to tokenize, validate ad normalize Mistral requests.

Attributes:
    instruct_tokenizer: The instruct tokenizer to use. See
        [InstructTokenizer][mistral_common.tokens.tokenizers.instruct.InstructTokenizer].
instruct_tokenizer	validatorrequest_normalizerc                 *    X l         X0l        UU l        g)zInitializes a `MistralTokenizer`.

Args:
    instruct_tokenizer: The instruct tokenizer to use.
    validator: The request validator to use.
    request_normalizer: The request normalizer to use.
N"_chat_completion_request_validator_instruct_request_normalizerrE   )selfrE   rF   rG   s       r:   __init__MistralTokenizer.__init__m   s     3</,>) 	r<   r2   .c                     [         R                  U R                  R                  R                  U R
                  R                  44$ )z
Provides a recipe for pickling (serializing) this object, which is necessary for use with multiprocessing.

Returns:
    A tuple of the factory function and the arguments to reconstruct the object from its source file.
)rC   	from_filerE   r1   	file_pathrJ   _mode)rL   s    r:   
__reduce__MistralTokenizer.__reduce__   s@      ))##--773399,
 
 	
r<   c                 @    [        [        5      R                  S   S-  $ )N   data)r   __file__parentsclss    r:   
_data_pathMistralTokenizer._data_path   s    H~%%a(611r<   c                 r    U R                  [        U R                  5       S-  5      [        R                  S9$ )zGet the Mistral tokenizer v1.ztokenizer.model.v1moderP   strr\   r   testrZ   s    r:   v1MistralTokenizer.v1   s0     }}S!14H!HIP^PcPc}ddr<   c                 r    U R                  [        U R                  5       S-  5      [        R                  S9$ )zGet the Mistral tokenizer v2.z*mistral_instruct_tokenizer_240216.model.v2r_   ra   rZ   s    r:   v2MistralTokenizer.v2   s8     }} #OOPWeWjWj  
 	
r<   Fr.   is_mmc                     U(       a
  U(       a  SnO,U(       a
  U(       d  SnOU(       d  U(       a  [        S5      eSnU R                  [        U R                  5       U-  5      [        R
                  S9$ )a  Get the Mistral tokenizer v3.

Args:
    is_tekken: Whether the tokenizer is a tekken tokenizer. See
        [Tekkenizer][mistral_common.tokens.tokenizers.tekken.Tekkenizer].
    is_mm: Whether to load image tokenizer.

Returns:
    The Mistral tokenizer v3.
ztekken_240911.jsonztekken_240718.jsonz;Multimodal tokenizer is currently only supported for tekkenz*mistral_instruct_tokenizer_240323.model.v3r_   )
ValueErrorrP   rb   r\   r   rc   )r[   r.   ri   tokenizer_names       r:   v3MistralTokenizer.v3   sX     1Nu1NuZ[[IN}}S!1N!BC.J]J]}^^r<   c                     U(       a8  U R                  [        U R                  5       S-  5      [        R                  S9$ U R                  [        U R                  5       S-  5      [        R                  S9$ )z}Get the Mistral tokenizer v7.

Args:
    is_mm: Whether to load the image tokenizer.

Returns:
    The Mistral tokenizer v7.
z,mistral_instruct_tokenizer_241114.model.v7m1r_   z*mistral_instruct_tokenizer_241114.model.v7ra   )r[   ri   s     r:   v7MistralTokenizer.v7   sq     ==CNN$'UUV]k]p]p !   ==CNN$'SST[i[n[n !  r<   modelstrictc                    U(       dT  [         R                  " S[        5        [        R	                  5        H!  u  p4X1R                  5       ;   d  M  U" 5       s  $    U[        ;  a  [        SU 35      e[        U   " 5       $ )a@  Get the Mistral tokenizer for a given model.

Args:
    model: The model name.
    strict: Whether to use strict model name matching. If `False`, the model name is matched as a substring.
        This is deprecated and will be removed in `mistral_common=1.10.0`.

Returns:
    The Mistral tokenizer for the given model.
a  Calling `MistralTokenizer.from_model(..., strict=False)` is deprecated as it can lead to incorrect tokenizers. It is strongly recommended to use MistralTokenizer.from_model(..., strict=True)` which will become the default in `mistral_common=1.10.0`.If you are using `mistral_common` for open-sourced model weights, we recommend using `MistralTokenizer.from_file('<path/to/tokenizer/file>')` instead.zUnrecognized model: )warningswarnFutureWarningMODEL_NAME_TO_TOKENIZER_CLSitemslowerr   )r[   rr   rs   
model_nametokenizer_clss        r:   
from_modelMistralTokenizer.from_model   sx     MMT
  .I-N-N-P)
.(?* .Q 33$';E7%CDD*5133r<   Nrepo_idtokenrevisionforce_downloadlocal_files_onlyr`   c                 B    [        U UUUUS9n[        R                  XeS9$ )a  Download the Mistral tokenizer for a given Hugging Face repository ID.

See [here](https://huggingface.co/mistralai/models) for a list of our OSS models.

Args:
    repo_id: The Hugging Face repo ID.
    token: The Hugging Face token to use to download the tokenizer.
    revision: The revision of the model to use. If `None`, the latest revision will be used.
    mode: The validation mode to use.
    force_download: Whether to force the download of the tokenizer. If `True`, the tokenizer will be downloaded
        even if it is already cached.
    local_files_only: Whether to only use local files. If `True`, the tokenizer will be downloaded only if it is
        already cached.

Returns:
    The Mistral tokenizer for the given model.
)r   r   r   r   r   r_   )r/   rC   rP   )r   r   r   r   r   r`   tokenizer_paths          r:   from_hf_hubMistralTokenizer.from_hf_hub   s3    4 8)-
  )).)DDr<   tokenizer_filenamec                    [        U5      (       a/  [        R                  " U5      nUR                  nUR                  nO7[        U5      (       a  [        U5      n[        U5      nSnO[        SU 35      eUb  [        XC5      OSnSnUb'  [        U[        5      (       d   S5       e[        XS5      n[        UR                  5      nUR                  [        R                  :X  a/  Ub   S5       eUb   S5       e[!        [#        U5      [%        US9US9$ UR                  [        R&                  :X  a/  Ub   S5       eUb   S5       e[!        [)        U5      [%        US9US9$ UR                  [        R*                  :X  a#  Ub   S5       e[!        [-        X6S9[/        US9US9$ UR                  [        R0                  :X  a  [!        [3        X6US	9[5        US9US9$ UR                  [        R6                  :X  a  [!        [9        X6US	9[5        US9US9$ UR                  [        R:                  :X  a  [!        [=        X6S9[?        US9US9$ [        S
U 35      e)zLoads a tokenizer from a file.

Args:
    tokenizer_filename: The path to the tokenizer file.
    mode: The validation mode to use.

Returns:
    The loaded tokenizer.
NzUnrecognized tokenizer file: z-Audio is only supported for tekken tokenizersz#Tokenizer version needs to be >= v3z#Tokenizer version needs to be >= v7r_   )rF   rG   )image_encoder)r   audio_encoderz!Unrecognized tokenizer filename: ) r.   r-   rP   imager?   r,   r*   r+   r   r;   
isinstancerA   r   versionr    rd   rC   r$   r   rg   r%   rm   r&   r   rp   r'   r   v11r(   v13r)   r   )	r[   r   r`   r1   r0   r=   r   r   rG   s	            r:   rP   MistralTokenizer.from_file  s   " '((",,-?@I$??L$??L011./ABI+,>?LL$'DEWDX%YZZGSG_*<Cei#i44e6ee4.|GM=i>O>OP 0 3 33 (O*OO( (O*OO(##I.1t<#5 
 "2"5"55 (O*OO( (O*OO(##I.1t<#5 
 "2"5"55 (O*OO(##IK3>#5 
 "2"5"55##IZgh3>#5 
 "2"6"66#$Y[hi3>#5 
 "2"6"66#$YL4$?#5  !#DEWDX!YZZr<   requestmax_model_input_lenc                    U R                   R                  U5      nUc  UR                  (       a  [        S5      eU R                  R                  U5      nUR                  (       a  X$l        U R                  R                  U5      $ )a  Encodes a chat completion request.

Args:
    request: The chat completion request to encode.
    max_model_input_len: The maximum length of the input to the model.
        If `None`, the input will not be truncated.

Returns:
    The encoded chat completion request.
zUencoding a chat completion request with truncation, but no max model len was provided)	rJ   validate_requesttruncate_for_context_lengthr   rK   from_chat_completion_requesttruncate_at_max_tokensrE   encode_instruct)rL   r   r   validated_requestinstruct_requests        r:   encode_chat_completion'MistralTokenizer.encode_chat_completion^  s}     !CCTTU\]&7+N+N %g   <<YYZkl..6I3&&667GHHr<   c                 8    U R                   R                  U5      $ )zEncodes a transcription request.

Args:
    request: The transcription request to encode.

Returns:
    The encoded transcription request.
)rE   encode_transcriptionrL   r   s     r:   r   %MistralTokenizer.encode_transcription}  s     &&;;GDDr<   c                 8    U R                   R                  U5      $ )zEncodes a fill in the middle request.

Args:
    request: The fill in the middle request to encode.

Returns:
    The encoded fill in the middle request.
)rE   
encode_fimr   s     r:   r   MistralTokenizer.encode_fim  s     &&11'::r<   tokensspecial_token_policyc                 4    U R                   R                  XS9$ )a'  Decodes a list of tokens into a string.

Args:
    tokens: The tokens to decode.
    special_token_policy: The policy to use for special tokens. Passing `None` is deprecated and will be changed
        to `SpecialTokenPolicy.IGNORE` in `mistral_common=1.10.0`.

Returns:
    The decoded string.
)r   )rE   decode)rL   r   r   s      r:   r   MistralTokenizer.decode  s     &&--f-``r<   c                 8    U R                   R                  U5      $ N)rE   
_to_string)rL   r   s     r:   r   MistralTokenizer._to_string  s    &&11&99r<   rI   )r2   rC   )FF)Fr   )0__name__
__module____qualname____firstlineno____doc__r   r   r   r   r
   r   r   r   r   r   r   rM   tupler   r   rS   classmethodr   r\   rd   rg   boolrm   rp   rb   r}   staticmethodr   rc   r   rP   r   r	   intr   r   r   r   listr   r   r   __static_attributes__ r<   r:   rC   rC   ]   s   
-oz=Zn.no
 +?<PRact+tu
 61?DUWjj
	
*

E(E#s(O";< 

 24 2 2 e e 
 
 _4 _ _AS _ _, t (:  $ 4s 4D 4=O 4 4@  $(#$!&-22 E EczD  E * E 	 E
  E  E 
 E  ED   .22O[$JO[ O[ 
	O[ O[d W[I,T2IILtI	I>	E,@ 	E] 	E	;* 	; 	;aT#Y a>PSW>W acf a:c :s :r<   rC   zministral-8b-2410c                  (    [         R                  SS9$ NT)r.   rC   rm   r   r<   r:   <lambda>r     s    !1!4!4t!4!Dr<   zmistral-tiny-2312zopen-mistral-nemo-2407c                  (    [         R                  SS9$ r   r   r   r<   r:   r   r     s    &6&9&9D&9&Ir<   zmistral-tiny-2407zmistral-small-2312zopen-mixtral-8x22b-2404zmistral-small-2402zmistral-small-2409c                  (    [         R                  SS9$ r   r   r   r<   r:   r   r     s    "2"5"5"5"Er<   zmistral-medium-2312zmistral-large-2402zmistral-large-2407zmistral-large-2411zpixtral-large-2411c                  (    [         R                  SS9$ NT)ri   rC   rp   r   r<   r:   r   r     s    "2"5"5D"5"Ar<   zcodestral-2405zcodestral-mamba-2407zpixtral-12b-2409c                  *    [         R                  SSS9$ NT)r.   ri   r   r   r<   r:   r   r     s     0 3 3d$ 3 Or<   zopen-mistral-7bc                  (    [         R                  SS9$ r   r   r   r<   r:   r   r     s    ,//$/?r<   c                  *    [         R                  SSS9$ r   r   r   r<   r:   r   r     s    '**T*Fr<   c                  (    [         R                  SS9$ r   r   r   r<   r:   r   r     s    -00t0<r<   )zopen-mixtral-8x7bzmistral-embedzmistral-small-v1zmistral-large-v1zmistral-smallzmistral-largezopen-mixtral-8x22bzcodestral-22bzmistral-nemopixtralzpixtral-largerx   )Iru   pathlibr   typingr   r   r   mistral_common.exceptionsr   #mistral_common.protocol.fim.requestr   )mistral_common.protocol.instruct.messagesr	   r
   r   r   r   *mistral_common.protocol.instruct.normalizer   r   (mistral_common.protocol.instruct.requestr   *mistral_common.protocol.instruct.validatorr   r   r   r   r   -mistral_common.protocol.transcription.requestr   &mistral_common.tokens.tokenizers.audior   r   r   %mistral_common.tokens.tokenizers.baser   r   r   r   r   r   r    &mistral_common.tokens.tokenizers.imager!   r"   r#   )mistral_common.tokens.tokenizers.instructr$   r%   r&   r'   r(   r)   .mistral_common.tokens.tokenizers.sentencepiecer*   r+   r,   'mistral_common.tokens.tokenizers.tekkenr-   r.   &mistral_common.tokens.tokenizers.utilsr/   r;   rA   rC   rg   rm   rd   rp   rx   dictrb   __annotations__r   r<   r:   <module>r      s     ) ) ;  s J  O ] ]   
  
 J Q3[ 3ZJ`=` 3eq 3$3[ 3Z 3L 3"D:O1?DUWddeD:N
JDJ),,J IJ ),,	J
 *--J /22J *--J EJ +..J *--J *--J *--J AJ &))J ,//J  O!J$ '**%J& *,,%(((++(++%((%((*--%((?F<;J T#x4D0D'E"EF r<   