import logging
import os
import warnings
from functools import cached_property
from pathlib import Path

from mistral_common.exceptions import TokenizerException
from mistral_common.imports import assert_sentencepiece_installed, is_sentencepiece_installed
from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy, Tokenizer, TokenizerVersion
from mistral_common.tokens.tokenizers.image import ImageConfig, MultiModalVersion

if is_sentencepiece_installed():
    from sentencepiece import SentencePieceProcessor


def is_sentencepiece(path: str | Path) -> bool:
    r"""Check if the given path is a SentencePiece model."""
    if isinstance(path, str):
        path = Path(path)

    instruct_versions = list(TokenizerVersion.__members__)
    mm_versions = list(MultiModalVersion.__members__) + [""]  # allow no multimodal suffix
    suffixes = [f".model.{v}{m}" for v in instruct_versions for m in mm_versions] + [".model"]

    return path.is_file() and any(path.name.endswith(suffix) for suffix in suffixes)


def get_spm_version(tokenizer_filename: str | Path, raise_deprecated: bool = False) -> TokenizerVersion:
    r"""Get the version of the tokenizer from the filename."""
    _version_str = str(tokenizer_filename).split(".")[-1]
    if _version_str != "model":
        # e.g. "v3m1" -> "v3": strip a trailing multimodal suffix, if any.
        _version_str = _version_str.split("m")[0]

    if _version_str == "model":
        if raise_deprecated:
            raise TokenizerException(f"Make sure to rename your tokenizer file to end with {tokenizer_filename}.v1.")
        return TokenizerVersion("v1")

    if _version_str not in TokenizerVersion.__members__:
        raise TokenizerException(f"Unrecognized tokenizer filename: {tokenizer_filename}")

    return TokenizerVersion(_version_str)


def get_image_config(tokenizer_filename: str | Path) -> ImageConfig | None:
    r"""Get the image config from the tokenizer filename."""
    _version_str = str(tokenizer_filename).split(".")[-1]
    if _version_str == "model" or "m" not in _version_str:
        return None

    # e.g. "v3m1" -> "m1": keep only the multimodal part of the version.
    _mm_version_str = "m" + _version_str.split("m")[-1]

    if _mm_version_str not in MultiModalVersion.__members__:
        raise TokenizerException(f"Unrecognized tokenizer filename: {tokenizer_filename}")

    return MultiModalVersion(_mm_version_str).config


class SentencePieceTokenizer(Tokenizer):
    r"""[SentencePiece](https://github.com/google/sentencepiece) tokenizer."""

    def __init__(self, model_path: str | Path, tokenizer_version: TokenizerVersion | None = None) -> None:
        r"""Initialize the `SentencePieceTokenizer`.

        Args:
            model_path: The path to the `SentencePiece` model.
            tokenizer_version: The version of the tokenizer. If not provided, it will be inferred from the
                model path.
        """
        assert_sentencepiece_installed()
        self._logger = logging.getLogger(self.__class__.__name__)
        assert os.path.isfile(model_path), model_path
        self._model = SentencePieceProcessor(
            model_file=model_path if isinstance(model_path, str) else model_path.as_posix()
        )

        assert self._model.vocab_size() == self._model.get_piece_size()
        self._vocab = [self._model.id_to_piece(i) for i in range(self.n_words)]

        self._version: TokenizerVersion = tokenizer_version or get_spm_version(model_path, raise_deprecated=False)

        self._file_path = Path(model_path)
        super().__init__()

    @property
    def file_path(self) -> Path:
        r"""The path to the tokenizer model."""
        return self._file_path

    @property
    def version(self) -> TokenizerVersion:
        r"""The version of the tokenizer."""
        return self._version

    def get_control_token(self, s: str) -> int:
        r"""Get the control token for the given string."""
        return self._model.piece_to_id(s)

    @property
    def n_words(self) -> int:
        r"""Vocabulary size of the tokenizer."""
        return self._model.vocab_size()

    def vocab(self) -> list[str]:
        r"""All tokens in the vocabulary as strings."""
        return self._vocab

    @property
    def bos_id(self) -> int:
        r"""The beginning of sentence token id."""
        return self._model.bos_id()

    @property
    def eos_id(self) -> int:
        r"""The end of sentence token id."""
        return self._model.eos_id()

    @cached_property
    def _control_tokens(self) -> set[int]:
        return {tok for tok in range(self.n_words) if self._model.IsControl(tok)}

    def encode(self, s: str, bos: bool, eos: bool) -> list[int]:
        r"""Encode the given string into a list of token ids.

        Args:
            s: The string to encode.
            bos: Whether to add the beginning of sentence token.
            eos: Whether to add the end of sentence token.

        Returns:
            The list of token ids.
        """
        assert isinstance(s, str)
        t: list[int] = self._model.encode(s)
        if bos:
            t = [self.bos_id, *t]
        if eos:
            t = [*t, self.eos_id]
        return t

    def decode(self, tokens: list[int], special_token_policy: SpecialTokenPolicy | None = None) -> str:
        r"""Decode the given list of token ids into a string.

        Note:
            Using `special_token_policy=SpecialTokenPolicy.KEEP` will keep the special tokens and the normal
            tokens as SentencePiece pieces.

        Args:
            tokens: The list of token ids.
            special_token_policy: The policy to use for special tokens. If `None`, the default policy
                is `SpecialTokenPolicy.IGNORE`. Passing `None` is deprecated and will be changed
                to `SpecialTokenPolicy.IGNORE` in `mistral_common=1.10.0`.

        Returns:
            The decoded string.
        """
        if special_token_policy is not None and not isinstance(special_token_policy, SpecialTokenPolicy):
            raise ValueError(
                f"Expected `special_token_policy` to be None or SpecialTokenPolicy, got {type(special_token_policy)}."
            )

        if special_token_policy is None:
            warnings.warn(
                "Using the tokenizer's special token policy `None` is deprecated. It will be removed in 1.10.0. "
                "Please pass a special token policy explicitly. "
                "Future default will be SpecialTokenPolicy.IGNORE.",
                FutureWarning,
            )
            special_token_policy = SpecialTokenPolicy.IGNORE

        if special_token_policy in [SpecialTokenPolicy.KEEP, SpecialTokenPolicy.RAISE]:
            return self._decode_with_special_tokens(tokens, special_token_policy)

        return self._model.decode(tokens)

    def id_to_piece(self, token_id: int) -> str:
        r"""Convert the given token id to a token piece."""
        return self._model.id_to_piece(token_id)

    def _decode_with_special_tokens(self, tokens: list[int], special_token_policy: SpecialTokenPolicy) -> str:
        text_list: list[str] = []
        curr_tokens: list[int] = []
        for tok in tokens:
            if tok in self._control_tokens:
                if special_token_policy == SpecialTokenPolicy.RAISE:
                    raise ValueError("Decoding `tokens` that contain special tokens with special_token_policy=RAISE.")
                # Flush the pending normal tokens before emitting the special token.
                if curr_tokens:
                    text_list.extend([self.id_to_piece(tok) for tok in curr_tokens])
                    curr_tokens = []

                text_list.append(self.id_to_piece(tok))
            else:
                curr_tokens.append(tok)

        if curr_tokens:
            text_list.extend([self.id_to_piece(tok) for tok in curr_tokens])

        return "".join(text_list)

    def to_string(self, tokens: list[int]) -> str:
        r"""[DEPRECATED] Converts a list of token ids into a string, keeping special tokens.

        Use `decode` with `special_token_policy=SpecialTokenPolicy.KEEP` instead.

        This is a convenient method for debugging.
        """
        warnings.warn(
            "`to_string` is deprecated and will be removed in 1.10.0. "
            "Use `decode` with `special_token_policy=SpecialTokenPolicy.KEEP` instead.",
            FutureWarning,
        )
        return self._to_string(tokens)

    def _to_string(self, tokens: list[int]) -> str:
        return self._decode_with_special_tokens(tokens, special_token_policy=SpecialTokenPolicy.KEEP)

    @property
    def pad_id(self) -> int:
        r"""The padding token id."""
        return self._model.pad_id()

    @property
    def unk_id(self) -> int:
        r"""The unknown token id."""
        return self._model.unk_id()