
    5hR                       S SK Jr  S SKJr  S SKrS SKrS SKrS SKrS SKJ	r	  S SK
JrJrJrJrJrJrJrJr   S SKJr   S SKJr  S SKJr  S S	KJr  S S
KJr  SrS SKrSSK J!r!  \RD                  " \#5      r$ " S S5      r%\ " S S\5      5       r&\ " S S\&\5      5       r' " S S\&5      r( " S S\'5      r) " S S\'5      r* " S S\'5      r+ " S S\,\5      r-S"S jr. " S  S!\'5      r/g! \ a    Sr Nf = f! \ a    SrSrSrSrSr Nf = f)#    )annotations)EnumN)Path)AnyCallableSequenceMappingIterableProtocolClassVarruntime_checkable)SentencePieceProcessor)MistralTokenizer)
Tekkenizer)_filter_valid_tokenizer_files)SentencePieceTokenizerTF   )
GGUFWriterc                      \ rS rSr% S\S'   S\S'   S\S'   S\S	'      S       SS jjrSS jrSSS jjrSS jrSS jr	SS jr
SS jrSS jrSrg
)SpecialVocab*   	list[str]mergeszdict[str, bool]add_special_tokendict[str, int]special_token_idsz(str | Sequence[Mapping[str, str]] | Nonechat_templateNc                    0 U l         0 U l        X@l        X l        / U l        S U l        Ub  X0l        OSU l        U R                  [        U5      5        g )N)boseosunkseppadclsmask)	r   r   n_vocabload_mergesr   r   special_token_types_loadr   )selfpathr'   r(   r&   s        2/home/james-whalen/llama.cpp/gguf-py/gguf/vocab.py__init__SpecialVocab.__init__0   sR    
 "$!#&!*':$'YD$

4:    c                    SR                  [        U R                  5      U R                  =(       d    SU R                  =(       d    S5      $ )NzG<SpecialVocab with {} merges, special tokens {}, add special tokens {}>unset)formatlenr   r   r   r*   s    r,   __repr__SpecialVocab.__repr__A   s<    X__d44?AWAWAb[b
 	
r/   c                   U R                   (       aO  U(       d,  [        R                  S[        U R                   5       S35        UR	                  U R                   5        O&U R
                  (       a  [        R                  S5        U R                  R                  5        Ha  u  p4[        USU S3S 5      nUc  [        R                  SU SU S35        M7  U(       d  [        R                  S	U S
U 35        U" U5        Mc     U R                  R                  5        Ha  u  p6[        USU S3S 5      nUc  [        R                  SU SU S35        M7  U(       d  [        R                  SU SU 35        U" U5        Mc     U R                  bE  U(       d"  [        R                  SU R                   35        UR                  U R                  5        g g )NzAdding z
 merge(s).zJAdding merges requested but no merges found, output may be non-functional.add_	_token_idz"No handler for special token type z	 with id  - skippingzSetting special token type z to add_add__tokenzNo handler for add_z_token with value zSetting add_z
_token to zSetting chat_template to )r   loggerinfor3   add_token_mergesr'   warningr   itemsgetattrr   r   add_chat_template)r*   gwquiettyptokid
id_handlervalueadd_handlers           r,   add_to_ggufSpecialVocab.add_to_ggufF   s   ;;gc$++&6%7zBC,NNgh00668JC7>rT#iCXZ^7_J!!CC5	RWQXXcde9#d5'JKu 9 00668JC9@xPSuTZE[]a9bK"!4SE9KE7R]^_l3%z%AB 9 )78J8J7KLM  !3!34 *r/   c                    U R                  U5        U R                  U5        U R                  (       a$  U R                  (       d  U R	                  U5        g g g N)_try_load_from_tokenizer_json_try_load_from_config_jsonr'   r   _try_load_merges_txt)r*   r+   s     r,   r)   SpecialVocab._loadb   sC    **40''-DKK%%d+ %0r/   c                8   US-  nUR                  5       (       d  g[        USSS9 n[        US5      R                  5       nUR	                  S5      (       d  UR                  S5        SnOS	n/ nU H  nUS	-  nUR                  5       nU(       d  M!  UR                  S S
5      n[        U5      S:w  a'  [        R                  UR                   SU S35        Mi  UR                  US    SUS	    35        M     S S S 5        WU l        g! , (       d  f       N= f)Nz
merges.txtFrutf-8encoding #r   r         z: Line z: Entry malformed, ignoring T)is_fileopennextstrip
startswithseeksplitr3   r=   r@   nameappendr   )	r*   r+   merges_filefp
first_lineline_numr   linepartss	            r,   rQ   !SpecialVocab._try_load_merges_txth   s   \)""$$+sw72b"++-J((--
FAzz|

4+u:?NNk&6&6%7wxjHc#deq
!E!H:67  8$ ' 87s   CD
Dc           	     &   [        U[        5      (       d  g US:  a  [        SU SU 35      eU R                  b  X R                  :  a  XR                  ;   a  g X R                  U'   g [
        R                  SU SU SU R                   S35        g )Nr   z%invalid value for special token type : zSpecial token type z, id z out of range, must be under r:   )
isinstanceint
ValueErrorr&   r   r=   r@   )r*   rF   tids      r,   _set_special_tokenSpecialVocab._set_special_token   s    #s##7DSEC5QRR<<3#5,,,*-""3',SEse;XY]YeYeXffqrsr/   c                R  ^' S nUS-  nUR                  5       (       Ga  [        USS9 n[        R                  " U5      nS S S 5        U R                  (       GaD  UR                  S0 5      R                  S5      n[        U[        5      (       Ga  U(       Ga  [        US   [        5      (       a  XPl	        O[        US   [        5      (       a  [        US   5      S:X  a  [        US   S   [        5      (       a  [        S U 5       5      (       a.  [        R                  S	[        [        S
5      S-   5      < 35        U VVs/ s H;  nS
R!                  U Vs/ s H  nSR!                  S U 5       5      PM     sn5      PM=     snnU l	        O[#        S5      eUR                  S0 5      nO0 nS n	US-  n
U
R                  5       (       a)  [        U
SS9 n[        R                  " U5      n	S S S 5        U(       Ga  U	=(       d    0 R                  S5      nU	=(       d    0 R                  S5      nU	=(       d    0 R                  S5      nU	=(       d    0 R                  S5      nU(       d  U(       a  U	(       a  U=U	S'   nU(       d  U(       a  U	(       a  U=U	S'   nUR                  S5      =n(       Ga  UR                  SU/5       GH  nUR                  S5      S:X  a  SU R$                  S'   SU R$                  S'   SU R$                  S'   U(       d!  U	(       a  UR                  SU/5      S   nXS'   U(       d!  U	(       a  UR                  SU/5      S   nXS'   M  UR                  S5      S:X  d  M  UR                  S/ 5      nUR                  S / 5      nS nS n[        U5      S!:  Gah  US   R                  S"0 5      R                  S#5      =n(       aB  U	(       d  UnUX4;   a  SOS$U R$                  S'   UX4;  a  [        R                  S%U< S&35        US'   R                  S"0 5      R                  S#5      =n(       a  U	(       d  UnOUU:w  a  S(U R&                  ;  a"  [)        U R&                  5      S)-   U l        XS*'   OPS+U R&                  ;  a"  [)        U R&                  5      S,-   U l        XS-'   O[        R                  S.U< S/U< S035        U=U	S'   nUU:X  a  SOS$U R$                  S'   UU:w  a  [        R                  S1U< S&35        U(       GaZ  U(       a*  US   R                  S"0 5      R                  S#5      U:X  a  S!OSnU(       a*  US'   R                  S"0 5      R                  S#5      U:X  a  S'OS nU(       a  US:X  d
  U(       a  Uc  [        R                  S25        U[+        UU5         =n(       Ga  US   R                  S30 5      R                  S#5      nUS'   R                  S30 5      R                  S#5      nUS4:w  d  US5:w  a  [        R                  S6U S7U S835        US4:X  Ga:  US5:X  Ga3  US!S' =n(       Ga&  S$nUS   R                  S"0 5      R                  S#5      =n(       a2  UX4;   a	  U(       d  SnUX4;  a  [        R                  S9U< S835        O[        R                  S:US   < S835        [        U5      S:X  as  US!   R                  S"0 5      R                  S#5      =n(       a+  UX4;   a  SnUX4;  a  [        R                  S;U< S835        O[        R                  S<US!   < S835        UU R$                  S'   U(       a  U(       d  U	(       a  XS'   GM     U	(       d  gS nUS=-  nUS>-  nUR                  5       (       a  [        USS9 nUR-                  5       nS S S 5        [        US?-  R/                  S@5      5      =n(       aN  SAUSB./nU HB  n[        USS9 n UR1                  UR2                  U R-                  5       SB.5        S S S 5        MD     OMUR                  5       (       a8  [        USS9 n[        R                  " U5      R                  SC5      nS S S 5        U	R                  SCU5      n!U!b  [        U![        [        45      (       a  U!U l        O[        R                  SDU
< SE35        U R&                   H  n"U	R                  SFU" SG35      n#[        U#[6        5      (       a  U#U R$                  U"'   U	R                  U" SG35      n$[        U$[        5      (       a  U$m'OB[        U$[8        5      (       a+  U$R                  SH5      n%[        U%[        5      (       d  M  U%m'OM  [;        U'4SI jU 5       S 5      n&U R=                  U"U&5        M     g! , (       d  f       G	N= fs  snf s  snnf ! , (       d  f       GN= f! , (       d  f       GN3= f! , (       d  f       GM  = f! , (       d  f       GN= f)JNtokenizer.jsonrU   rV   modelr   r   r[   c              3  >   #    U  H  o  H
  nS U;   v   M     M     g7f)r\   N ).0pairss      r,   	<genexpr>=SpecialVocab._try_load_from_tokenizer_json.<locals>.<genexpr>   s     IFDDqsaxDxFs   z'Spaces in merges detected, encoding as r\      rX   c              3  `   #    U  H$  nUS :X  a  [        [        U5      S-   5      OUv   M&     g7fr\   r   Nchrordrz   cs     r,   r}   r~      s/      ,&15A >?#XCFSL(91(L15s   ,.zUnknown tokenizer merges formatadded_tokensztokenizer_config.json	bos_token	cls_token	eos_token	sep_tokenpost_processor
processorstypeRobertaProcessingTr   r    r"   r$   TemplateProcessingsingler{   r   SpecialTokenidFzUnknown leading special token z in TemplateProcessing<single>eot)r   	eot_tokeneom)r   	eom_tokenzOverriding EOS token z with z without EOT/EOM fallback!zUnknown trailing special token z`TemplateProcessing<single> leading/trailing special tokens do not match TemplateProcessing<pair>r   ABzUnknown sequence z...z in TemplateProcessing<pair>zUnknown separator token zUnknown middle sequence zUnknown second separator token zUnknown second middle sequence zchat_template.jsonzchat_template.jinjaadditional_chat_templatesz*.jinjadefault)rd   templater   z$Bad type for chat_template field in z - ignoringr8   r<   contentc              3  r   >#    U  H,  oR                  S 5      T:X  d  M  UR                  S5      v   M.     g7f)r   r   N)get)rz   atok
tc_contents     r,   r}   r~   .  s,     ^LDHHY<OS]<]$Ls   77)r]   r^   jsonloadr'   r   ro   liststrr   r3   anyr=   r@   r   r   joinrq   r   r(   tupleslicereadglobre   stemr   booldictr_   rs   )(r*   r+   	tokenizertokenizer_filefr   r{   partr   tokenizer_configtokenizer_config_filespecial_bosspecial_clsspecial_eosspecial_sepr   	processortmpl_single	tmpl_pairspecial_firstspecial_last	seq_startseq_stoptmpl_atmpl_badd_sepspecial_entrychat_template_altchat_template_jsonchat_template_jinjaadditional_templatestemplate_pathrg   r   rF   	add_entryentryentry_contentmaybe_token_idr   s(                                          @r,   rO   *SpecialVocab._try_load_from_tokenizer_json   s	   	 00!!##n9Q IIaL	 :"w377Afd++!&)S11&,#F1It44VAY19LQ[\bcd\efg\hjmQnQn IFIII"NN-TUXY\]`YadgYgUhTk+lm )/' )/  HH 15!" 15	 %'GG ,&15,& %& 15!"	 )/' ))JKK$==<LL $'> > ((**+@A#'99Q<  A+1r66{CK+1r66{CK+1r66{CK+1r66{CK;3C>II -;3C>II -!*/?!@@~@!/!3!3L>BR!SI }}V,0CC8<..u58<..u58<..u5*/?*3--}*Ma*PK<G[9*/?*3--}*Ma*PK<G[9  !}}V,0DD&/mmHb&A$-MM&"$=	(,'+{+a/0;A0B0B>SU0V0Z0Z[_0``}`'72?KHUZeYsHsy~ 6 6u =#08R#R$*NN5STaSd  eC  4D  %E/:2/B/B>SU/V/Z/Z[_/``|`'72>K%1[%@',D4L4L'LCHIaIaCbenCn(@HS(E).d6N6N)NCHIaIaCbenCn(@HS(E(.9N{o]cdpcs  tN  8O  )PR^$^$4[$AKHTXcHcin 6 6u =#/;#>$*NN5TUaTd  eC  4D  %E$-:y|?O?OP^`b?c?g?ghl?mq~?~  EFI-9im>O>OP^`b>c>g>ghl>mq}>}r  DHH -)q.lW_Wg &  0R  !S,5eIx6P,QQyQ)21)9)9*b)I)M)Md)S)22):)::r)J)N)Nt)T#)S=FcM$*NN5FvhcRXQYYu3v$w#)S=Vs]U^_`acUdHd	Hd.3G8A!8H8HY[8\8`8`ae8f'f}'f+8[<V+V_k6:G+8@Z+Z,2NN=UVcUf  gC  <D  -E(.9QR[\]R^Qaa}7~('*9~':<EaL<L<L^]_<`<d<dei<j+j=+j/<@Z/Z:>/<[D^/^06A`an`q  rN  @O  1P,2NN=\]fgh]i\l  mI  <J  -KDKD$:$:5$A'.{GWHS(E S "TT   !$88"%::&&(()g>!$%FFH! ?'+T4O-O,U,UV_,`'aa#a.7EV%W$X!%9Mm@B)00-:L:LZ\ZaZaZc1de A@ &:  ''))(W=$(IIaL$4$4_$E! >(,,_>OP J}sDk$J$J!.DNNABWAZZefg++C(,,tC5-?@I)T**.7&&s+$((C58E%%%"
E4(( %		) 4!-55*
!^L^N ##C8' ,( C :9!"'* A@x ?>
 A@ >=sS   gg"g>gg 5g2-h &h
gg 
g/2
h
h	
h&c                ^   US-  nUR                  5       (       d  g[        USS9 n[        R                  " U5      nS S S 5        U R                   HH  nWR                  U S35      nUc  SU;   a  US   R                  U S35      nU R                  XV5        MJ     g! , (       d  f       Ng= f)Nzconfig.jsonFrU   rV   r9   text_configT)r]   r^   r   r   r(   r   rs   )r*   r+   config_filer   configrF   token_ids          r,   rP   'SpecialVocab._try_load_from_config_json4  s    ]*""$$+'2aYYq\F 3++CzzSE"34HMV$;!-044uI5FG##C2 ,  32s   B
B,)r   r   r'   r   r&   r   r(   )FNN)r+   zstr | os.PathLike[str]r'   r   r(   zIterable[str] | Noner&   z
int | Nonereturnr   )F)rD   r   rE   r   r   None)r+   r   r   r   )r+   r   r   r   )rF   r   rr   r   r   r   )__name__
__module____qualname____firstlineno____annotations__r-   r5   rK   r)   rQ   rs   rO   rP   __static_attributes__ry   r/   r,   r   r   *   st    &&%%;; AF48"*9=1 "

58,2
teNr/   r   c                  *    \ rS rSr% S\S'   S\S'   Srg)	BaseVocabiC  zClassVar[str]tokenizer_modelrd   ry   N)r   r   r   r   r   r   ry   r/   r,   r   r   C  s    ""
r/   r   c                  R    \ rS rSr% S\S'   S\S'   S\S'   S\S	'   SS
 jrSS jrSrg)VocabiI  rp   
vocab_sizer   added_tokens_dictr   added_tokens_listr   fname_tokenizerc                    g rN   ry   )r*   	base_paths     r,   r-   Vocab.__init__P  s    r/   c                    g rN   ry   r4   s    r,   
all_tokensVocab.all_tokensQ  s    3r/   ry   Nr   r   r   z-Iterable[tuple[bytes, float, gguf.TokenType]])r   r   r   r   r   r-   r   r   ry   r/   r,   r   r   I  s    O%%  ,Nr/   r   c                  &    \ rS rSrSrSrSS jrSrg)NoVocabiT  no_vocabc                    g)Nz3<NoVocab for a model without integrated vocabulary>ry   r4   s    r,   r5   NoVocab.__repr__X  s    Dr/   ry   Nr   )r   r   r   r   r   rd   r5   r   ry   r/   r,   r   r   T  s     ODEr/   r   c                  N    \ rS rSrSrSrSS jrSS jrSS jrSS jr	SS jr
S	rg
)BpeVocabi\  gpt2bpec           
        0 nUS-  =nR                  5       (       a\  [        USS9 n[        R                  " U5      U l        S S S 5         [        US-  SS9 n[        R                  " U5      nS S S 5        OUS-  n[        USS9 n[        R                  " U5      nS S S 5        WS   nUS   S:w  d#  UR                  S	S
5      (       d  US   S   S:w  a  [        S5      eUS   U l        UR                  S5      =nb.  U Vs0 s H!  nUS   U R                  ;  d  M  US   US   _M#     nn[        U R                  5      n	[        [        X[        U5      -   5      5      n
[        UR                  5       5      nX:w  a1  U	[        U5      -   S-
  n[        S[        U5       SU	 SU SU 35      e[        UR                  5       S S9nX l        U VVs/ s H  u  pUPM	     snnU l        Xl        U R                   [        U R                  5      -   U l        X0l        g ! , (       d  f       GN= f! , (       d  f       GN= f! [
         a     GN$f = f! , (       d  f       GN= fs  snf s  snnf )Nz
vocab.jsonrU   rV   added_tokens.jsonrv   rw   r   BPEbyte_fallbackFdecoder	ByteLevelzCannot find GPT-2 BPE tokenizervocabr   r   r   r   zExpected the z1 added token ID(s) to be sequential in the range z - z; got c                    U S   $ Nr   ry   )text_idxs    r,   <lambda>#BpeVocab.__init__.<locals>.<lambda>  s    (1+r/   key)existsr^   r   r   r   FileNotFoundErrorr   r3   r   rangesortedvaluesrq   rA   r   r   vocab_size_baser   r   )r*   r   r   r   r   tokenizer_jsonr   addeditemr   expected_ids
actual_idsexpected_end_idrA   textidxs                   r,   r-   BpeVocab.__init__`  sl   ')(<77O??AAo8A!YYq\
 9)&99GLPQ#'99Q<L ML (*::O o8A!%1 9 /=W.EO'50O4G4GY^4_4_!),V4C'(IJJ(1DJ'++N;;H -2 F,1D#'	?$**#D !<Yd ;,1   F 4::
E*3|;L.LMNl1134
%(3z?:Q>O}S_,==n *|3.?vj\S T T |))+1MN$0=B$CUktTU$C$.$($8$83t?U?U;V$V$3[ 98
 ML$  98 F %DsY   H"I H40I II) I)I."
H14
I>I I 
II
I&c              #     #    U R                   R                  5        VVs0 s H  u  pX!_M	     nnn[        U R                   5       H&  u  pEX4   S[        R                  R
                  4v   M(     g s  snnf 7f)Ng        )r   rA   	enumerategguf	TokenTypeNORMAL)r*   encoded_tokr   reverse_vocabi_s         r,   
bpe_tokensBpeVocab.bpe_tokens  sa     @D

@P@P@RS@R_[@RSdjj)DA"C)>)>>> * Ts   A7A1A	A7c              #     #    U R                    H3  nSnUR                  S5      U[        R                  R                  4v   M5     g 7fN     @rU   )r   encoder  r  CONTROLr*   r  scores      r,   r   BpeVocab.added_tokens  s:     **DE++g&t~~/E/EEE +   AAc              #  t   #    U R                  5        S h  vN   U R                  5        S h  vN   g  N N7frN   )r  r   r4   s    r,   r   BpeVocab.all_tokens  s/     ??$$$$$&&& 	%&   848688c                N    SU R                    S[        U R                  5       S3$ )Nz<BpeVocab with  base tokens and  added tokens>r  r3   r   r4   s    r,   r5   BpeVocab.__repr__  s,     !5!5 66GDLbLbHcGddrssr/   )r   r   r   r   r   r  Nr   r   r   )r   r   r   r   r   rd   r-   r  r   r   r5   r   ry   r/   r,   r   r   \  s(    OD24h?F
'tr/   r   c                  N    \ rS rSrSrSrSS jrSS jrSS jrSS jr	SS jr
S	rg
)SentencePieceVocabi  llamaspmc           	        [         c  [        S5      e0 nUS-  =nR                  5       (       a.   [        US-  SS9 n[        R
                  " U5      nS S S 5        O/UR                  S-  =nR                  5       (       d  [        S5      e[        5       U l        U R                  R                  [        U5      5        U R                  R                  5       nUR                  5        VVs0 s H  u  pgXu:  d  M  Xv_M     nnn[        [        XU[        U5      -   5      5      n	[!        UR#                  5       5      n
X:w  a  [%        SU	 SU
 35      eX l        U
 Vs/ s H  oxU   PM	     snU l        XPl        U R*                  [        U R(                  5      -   U l        X0l        g ! , (       d  f       GN*= f! [         a     GN:f = fs  snnf s  snf )	Nzsentencepiece is not installedztokenizer.modelr   rU   rV   zCannot find tokenizer.modelzExpected new token IDs z to be sequential; got )r   RuntimeErrorr  r^   r   r   r  parentsentencepiece_tokenizerLoadFromFiler   r   rA   r   r  r3   r	  keysrq   r   r   r  r   )r*   r   r   r   r   r   piecer   
new_tokensexpected_new_idsactual_new_idss              r,   r-   SentencePieceVocab.__init__  s   !)?@@')(+<<<ODDFF)&99GLPQ#'99Q<L ML &/%5%58I%II/QQSS#$ABB'='?$$$11#o2FG11<<>
7C7I7I7K`7K)%rO_IBI7K
`js:2N OP!*//"34-67G6HH_`n_opqq #/<J"KNbb>N"K","&"6"6T=S=S9T"T"11 ML$  a #Ls@   F9 F'F9 ,G
;G
G'
F61F9 6F9 9
GGc              #    #    U R                   n[        UR                  5       5       GH  nUR                  U5      nUR	                  S5      nUR                  U5      n[        R                  R                  nUR                  U5      (       a  [        R                  R                  nUR                  U5      (       a  [        R                  R                  nUR                  U5      (       a  [        R                  R                  nUR                  U5      (       a  [        R                  R                   nXEU4v   GM     g 7fNrU   )r8  r  r   	IdToPiecer#  GetScorer  r  r  	IsUnknownUNKNOWN	IsControlr$  IsUnusedUNUSEDIsByteBYTE)r*   r   r  r;  r  r&  toktypes          r,   sentencepiece_tokens'SentencePieceVocab.sentencepiece_tokens  s     00	y++-.A''*E <<0D$--a0Enn++G""1%%..00""1%%..00 !!!$$..//""..--w&&) /s   EEc              #     #    U R                    H3  nSnUR                  S5      U[        R                  R                  4v   M5     g 7fr!  )r   r#  r  r  USER_DEFINEDr%  s      r,   r   SentencePieceVocab.added_tokens  s:     **DE++g&t~~/J/JJJ +r(  c              #  t   #    U R                  5        S h  vN   U R                  5        S h  vN   g  N N7frN   )rL  r   r4   s    r,   r   SentencePieceVocab.all_tokens  s1     ,,...$$&&& 	/&r+  c                N    SU R                    S[        U R                  5       S3$ )Nz<SentencePieceVocab with r-  r.  r/  r4   s    r,   r5   SentencePieceVocab.__repr__  s.    *4+?+?*@@QRUVZVlVlRmQnn|}}r/   )r   r   r   r8  r   r  Nr   r   r   )r   r   r   r   r   rd   r-   rL  r   r   r5   r   ry   r/   r,   r2  r2    s(    OD 2D'0K
'~r/   r2  c                  h    \ rS rSrSrSrSS jrSS jrSS jrSS jr	SS jr
S	 rSS
 jrSS jrSrg)LlamaHfVocabi  r3  hfftc                H   US-  n[        USS9 n[        R                  " U5      nS S S 5        WS   nUS   S:H  =(       a0    UR                  SS5      =(       a    UR                  S	S
5      (       + nU(       a  [	        S5      eU(       d7  US   S:w  d#  UR                  S	S5      (       a  US   S   S:w  a  [        S5      e SSKJn  UR                  UUS
S9U l
        U R                  R                  (       d   e/ U l        [        5       U l        [        5       U l        [#        U R                  R%                  5       R'                  5       S S9 Hd  u  pXR                  R(                  :  d  M   U R                  R+                  U	5        XR                  U	'   U R                   R-                  U
5        Mf     U R                  R.                   V	s0 s H   n	XR                  R1                  5       U	   _M"     sn	U l        [        U R                  R4                  5      U l        U R                  R(                  U l        U R8                  [;        U R                  5      -   U l        X l        g ! , (       d  f       GNO= f! [         a  n[        S5      UeS nAff = fs  sn	f )Nrv   rU   rV   rw   r   r   ignore_mergesFr   Tz'Llama 3 must be converted with BpeVocabr   r   zCannot find Llama BPE tokenizerr   )AutoTokenizerzsTo use LlamaHfVocab, please install the `transformers` package. You can install it with `pip install transformers`.)	cache_dirlocal_files_onlyc                    U S   $ r   ry   )xs    r,   r  'LlamaHfVocab.__init__.<locals>.<lambda>#  s    AaDr/   r  )r^   r   r   r   	TypeErrorr  transformersrZ  ImportErrorfrom_pretrainedr   is_fastr   r   r   setadded_tokens_idsr	  get_added_vocabrA   r   re   addall_special_tokens	get_vocabspecialsall_special_idsspecial_idsr  r3   r   )r*   r   r   r   r  r   	is_llama3rZ  etoktokidxs              r,   r-   LlamaHfVocab.__init__  sb   #&66/G4!YYq\N 5 +9*AF#u, ?1D1D_V[1\ ?#''>> 	 EFFF#u,O4G4GY^4_4_i(0J>#$EFF	2 '66! 7 

 ~~%%%% "$!%!$ "NN**,224.
KC 222&&--c2.4&&s+%%))&1
 ~~88
8 ))+C008
 t~~==>  $~~88#33c$:P:P6QQ.{ 54(  	F 	>
s)   I/J 'J/
I>
JJJc              #  n  #    U R                   R                  5       R                  5        VVs0 s H  u  pX!_M	     nnn[        U R                  5       HV  nX@R
                  ;   a  M  X4   R                  S5      nXPR                  U5      U R                  XEU R                  5      4v   MX     g s  snnf 7frA  )
r   rj  rA   r  r  rf  r#  get_token_scoreget_token_typerm  )r*   r  r   r  r   
token_texts         r,   	hf_tokensLlamaHfVocab.hf_tokens8  s     37>>3K3K3M3S3S3U
3UBO3U 	 
 d223H000 '077@J 228<d>Q>Qd&6&6?   4	
s   ,B5B/A9B5c                    [         R                  " SU5      (       a  [        R                  R                  $ X;   a  [        R                  R
                  $ [        R                  R                  $ )Ns   <0x[0-9A-Fa-f]{2}>)re	fullmatchr  r  rJ  r$  r  )r*   r   rv  rm  s       r,   ru  LlamaHfVocab.get_token_typeJ  sJ    <<.
;;>>&&& *2)@t~~%%[dnnF[F[[r/   c                    g)Nr"  ry   )r*   r   s     r,   rt  LlamaHfVocab.get_token_scoreR  s     r/   c              #  D  #    U R                    H  nXR                  ;   aI  U R                  U R                  U   SU R                  5      nU R	                  U R                  U   5      nO[
        R                  R                  nSnUR                  S5      X24v   M     g 7f)Nr/   r"  rU   )	r   rk  ru  rm  rt  r  r  rO  r#  )r*   r  rK  r&  s       r,   r   LlamaHfVocab.added_tokensW  s     **D}}$--dmmD.A3HXHXY,,T]]4-@A..55++g&66 +s   BB c                t    SU R                   R                  ;   =(       d    SU R                   R                  ;   $ )Nz<0x0A>
)r   r   r4   s    r,   has_newline_tokenLlamaHfVocab.has_newline_tokenb  s+    4>>///O44>>;O;O3OOr/   c              #  t   #    U R                  5        S h  vN   U R                  5        S h  vN   g  N N7frN   )rw  r   r4   s    r,   r   LlamaHfVocab.all_tokense  s/     >>###$$&&& 	$&r+  c                N    SU R                    S[        U R                  5       S3$ )Nz<LlamaHfVocab with r-  r.  r/  r4   s    r,   r5   LlamaHfVocab.__repr__i  s-    $T%9%9$::KCPTPfPfLgKhhvwwr/   )	r   rf  r   r   rm  rk  r   r   r  Nr   r   )r   rp   rv  bytesrm  zset[int]r   zgguf.TokenType)r   rp   r   floatr   )r   r   r   r   r   rd   r-   rw  ru  rt  r   r  r   r5   r   ry   r/   r,   rV  rV    s9    OD@/D$\
	7P'xr/   rV  c                      \ rS rSrSrSrSrg)MistralTokenizerTypeim  r4  tekkenry   N)r   r   r   r   r4  r  r   ry   r/   r,   r  r  m  s    
CFr/   r  c            	        [        [        [        S5      [        S5      S-   5      5      [        [        [        S5      [        S5      S-   5      5      -   [        [        [        S5      [        S5      S-   5      5      -   n U SS nS	n[        S
5       H4  nX0;  d  M
  U R                  U5        UR                  S
U-   5        US-  nM6     U Vs/ s H  n[	        U5      PM     nn[        [        X5      5      $ s  snf )a  
Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
characters the bpe code barfs on.

The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
tables between utf-8 bytes and unicode strings.
!~r      ¡   ¬   ®   ÿNr   r   )r   r  r   re   r   r   zip)bscsnbcs_strs        r,   bytes_to_unicoder  u  s     	U3s8SX\*+
uSYD	A.
/	0
uSYD	A.
/	0 
 
AB	A4[;IIaLIIdQhFA	 
 !!bc!fbF!B   "s   C:c                  J   \ rS rSr% SrSr0 rS\S'   / rS\S'   SS jr	\
SS j5       r\
SS	 j5       rSS
 jrSS jrSS jr\
SS j5       r\
SS j5       r\
SS j5       r\
SS j5       r\
SS j5       r\
SS j5       r\
SS j5       r\
SS j5       rSS jr\S 5       rS rSrg)MistralVocabi  mistralr   r   r   r   c                   [         (       d  [        S5      e[        c   S5       e[        c   S5       e[        c   S5       e[
        R                  SU 35        UR                  S5       Vs/ s H)  o"R                  5       (       d  M  UR                  5       PM+     nn[        U5      n[        U5      S:X  a  [        SU 35      e[        U5      S:  a3  SU;   a  SnO[        U5      S	   n[
        R                  S
U SU 35        OUS   n[        R                  " X-  5      R                  R                   U l        [#        U R                   [        5      (       a  [$        R&                  O[$        R(                  U l        U R                   R,                  U l        X-  U l        SU R*                  R2                  -   S-   U R                   R4                  -   U l        g s  snf )NzwTo use MistralVocab, please install the `mistral-common` package. You can install it with `pip install mistral-common`.mistral_common is not installedzLoading Mistral tokenizer from z**/*r   z*No tokenizer file found in the directory: r   ztekken.jsonr   z"Multiple tokenizer files found in z. Using zmistral--)_mistral_common_installedrb  r   r   r   r=   r>   r   r]   as_posixr3   rq   r	  r@   	from_fileinstruct_tokenizerr   ro   r  r  r4  tokenizer_typen_wordsr   r   rI   version_name)r*   r   r   	all_filesvalid_tokenizer_filesr   s         r,   r-   MistralVocab.__init__  s   ((H  -8[:[[8+N-NN+%H'HH%5i[AB ,5>>&+AQ+AaYY[\QZZ\+A	Q =i H$%*I)UVV$%) 55!.!'(=!>r!BNN4YKxGWX 315N)33&


YY 	
 $..*55 !''%)) 	
 ..00(9,,222S84>>;Q;QQ 	
7 Rs   ,GGc                    U R                   $ rN   )r  r4   s    r,   tokenizer_nameMistralVocab.tokenizer_name  s    zzr/   c                F    U R                   [        R                  :X  a  S$ S$ )Nr3  r   )r  r  r4  r4   s    r,   gguf_tokenizer_model!MistralVocab.gguf_tokenizer_model  s!    --1E1I1IIwUvUr/   c              #  "  #    [         c   S5       e[        U R                  [         5      (       d   S[        U R                  5       35       e[	        U R                  R
                  R                  5       5       GH  nU R                  R
                  R                  U5      nUR                  S5      nU R                  R
                  R                  U5      n[        R                  R                  nU R                  R
                  R                  U5      (       a  [        R                  R                  nU R                  R
                  R                  U5      (       a  [        R                  R                   nU R                  R
                  R#                  U5      (       a  [        R                  R$                  nU R                  R
                  R'                  U5      (       a  [        R                  R(                  nX4U4v   GM     g 7f)Nr  z%Expected SentencePieceTokenizer, got rU   )r   ro   r   r   r  _modelr   rB  r#  rC  r  r  r  rD  rE  rF  r$  rG  rH  rI  rJ  )r*   r  r;  r  r&  rK  s         r,   _sentencepiece_tokens"MistralVocab._sentencepiece_tokens  sr    %1T3TT1$..*@AA 	
3D4H3IJ	
A t~~,,779:ANN))33A6E<<(D>>0099!<Enn++G~~$$..q11..00~~$$..q11..00~~$$--a00..//~~$$++A....--w&&! ;s   HHc              #  B  #    [         c   S5       e[        U R                  [         5      (       d   S[        U R                  5       35       e[	        5       n[        U R                  R                  5       HJ  nU R                  R                  U5      R                  S5      S[        R                  R                  4v   ML     U R                  R                   H@  nU R                  X15      R                  S5      S[        R                  R                  4v   MB     g 7f)Nr  Expected Tekkenizer, got rU   r   )r   ro   r   r   r  r  num_special_tokensid_to_piecer#  r  r  r$  _tekken_token2id_nospecialtoken_bytes_to_stringr  )r*   byte_encoderr   tokens       r,   _tekken_tokensMistralVocab._tekken_tokens  s     %H'HH%$..*55 	
'T^^(<'=>	
5 ()dnn??@H**84;;GD&&  A ^^>>E**5?FFwO%%  ?s   DDc                   [         b  [        c   S5       eU R                  [        R                  :X  aF  [        U R                  [         5      (       d   eU R                  R                  R                  U5      $ U R                  [        R                  :X  a]  [        U R                  [        5      (       d   eU R                  R                  R                  U5      U R                  R                  -   $ [        SU R                   35      e)Nr  Unknown tokenizer type: )r   r   r  r  r4  ro   r   _vocabindexr  r  rq   )r*   r  s     r,   get_token_idMistralVocab.get_token_id  s    %1j6LoNooL"6":"::dnn.DEEEE>>((..u55  $8$?$??dnnj9999%%++E2T^^5V5VV 78K8K7LMNNr/   c                .    U R                   R                  $ rN   )r   bos_idr4   s    r,   r  MistralVocab.bos_id      ~~$$$r/   c                .    U R                   R                  $ rN   )r   eos_idr4   s    r,   r  MistralVocab.eos_id  r  r/   c                z    U R                   R                  S:X  a  U R                  $ U R                   R                  $ )Nr   )r   pad_idr  r4   s    r,   r  MistralVocab.pad_id  s.    >>  B&;;~~$$$r/   c                .    U R                   R                  $ rN   )r   unk_idr4   s    r,   r  MistralVocab.unk_id  r  r/   c                `    U R                   R                  U R                   R                  5      $ rN   )r   r  r  r4   s    r,   r   MistralVocab.bos_token  !    ~~))$..*?*?@@r/   c                `    U R                   R                  U R                   R                  5      $ rN   )r   r  r  r4   s    r,   r   MistralVocab.eos_token  r  r/   c                `    U R                   R                  U R                   R                  5      $ rN   )r   r  r  r4   s    r,   	pad_tokenMistralVocab.pad_token  r  r/   c                `    U R                   R                  U R                   R                  5      $ rN   )r   r  r  r4   s    r,   	unk_tokenMistralVocab.unk_token!  r  r/   c              #    #    U R                   [        R                  :X  a  U R                  5        S h  vN   g U R                   [        R                  :X  a  U R                  5        S h  vN   g [        SU R                    35      e NT N7f)Nr  )r  r  r4  r  r  r  rq   r4   s    r,   r   MistralVocab.all_tokens%  sy     "6":"::11333  $8$?$??**,,, 78K8K7LMNN 4 -s!   2BB	6B+B,BBc           	         SR                  U R                  S5       Vs/ s H  o![        U5         PM     sn5      $ s  snf )NrX   latin-1)r   decoder   )r  r  chars      r,   r  "MistralVocab.token_bytes_to_string/  s5    wwAHHY<OP<ODSY/<OPQQPs   =c                   ^ [         b  [        U R                  [         5      (       d   S[        U R                  5       35       eU R                  R                  R
                  mTR                  5        VVs0 s H  u  pX!_M	     nnn/ n[        SU R                  U R                  R                  -
  5       H  nX5   n/ n[        S[        U5      5       H9  nUS U n	XhS  n
U	T;   d  M  U
T;   d  M  X-   T;   d  M&  UR                  XU45        M;     U(       d   [        SU SUR                  S5       35      e[        UU4S jSS	9nUR                  U5        M     [        US
 SS	9n[!        5       nU Vs/ s H-  nU R#                  US   U5      U R#                  US   U5      /PM/     nnU VVs/ s H;  nSR%                  U Vs/ s H  nSR%                  S U 5       5      PM     sn5      PM=     nnnU$ s  snnf s  snf s  snf s  snnf )Nr  r   r   z-Could not find valid merge for token at rank rn   r  c                $   > TU S      TU S      4$ )Nr   r   ry   )r^  mergeable_rankss    r,   r  >MistralVocab.extract_vocab_merges_from_model.<locals>.<lambda>R  s    qt4oad6KLr/   F)r  reversec                    U S   $ )Nr[   ry   )vals    r,   r  r  V  s    #a&r/   r   r\   rX   c              3  ^   #    U  H#  oS :X  a  [        [        U5      S-   5      OUv   M%     g7fr   r   r   s     r,   r}   ?MistralVocab.extract_vocab_merges_from_model.<locals>.<genexpr>f  s&     O$QcCA-q@$s   +-)r   ro   r   r   r  _mergeable_ranksrA   r  r   r  r3   re   rq   r  r	  extendr  r  r   )r*   token_bytesranktoken_bytes_mapmerge_pairsr  merged_tokenlocaljleftrightr  r  decoded_merge_pairsr{   r   r   r  s                    @r,   extract_vocab_merges_from_model,MistralVocab.extract_vocab_merges_from_model3  sH    %*T^^Z*P*P 	
'T^^(<'=>	
P ..//@@7F7L7L7N
7N"3+D7N 	 
  sDOOdnn.O.OOPA*-LE1c,/0#BQ'$R(O+0/9LL$q!12 1  CA3bI\I\]fIgHhi  LE
 u%+ Q, [.@%P') #

 # **3q6<@**3q6<@ # 	 
  ,	
 , HH !% !% GGO$OO $ , 	 	
 g
@
	
s$   8G:84H 3H
"H*H
H
)r  r   r   r  r   Nr   r   r   )r  r   r   rp   )r   rp   )r   r   r   r   r   rd   r   r   r   r-   propertyr  r  r  r  r  r  r  r  r  r   r   r  r  r   staticmethodr  r  r   ry   r/   r,   r  r    s+   OD(*~*#%y%*
X   V V'0(O % % % % % %
 % % A A A A A A A AO R R:r/   r  )r   zdict[int, str])0
__future__r   enumr   rz  loggingr   ospathlibr   typingr   r   r   r	   r
   r   r   r   sentencepiecer   rb  (mistral_common.tokens.tokenizers.mistralr   'mistral_common.tokens.tokenizers.tekkenr   &mistral_common.tokens.tokenizers.utilsr   .mistral_common.tokens.tokenizers.sentencepiecer   r  r  gguf_writerr   	getLoggerr   r=   r   r   r   r   r   r2  rV  r   r  r  r  ry   r/   r,   <module>r     sO   "  	   	  d d d"4%IB !%  #			8	$V Vr   
 OIx O OEi EHtu HtVH~ H~Vxx5 xxv3 !4^5 ^E  "!"  ) %J!$(!)s"   C) C7 )C43C47D
	D
