
    >Ci4Q                    t   S SK Jr  S SKrS SKrS SKrS SKrS SKJr  S SKJ	r	J
r
JrJrJrJrJrJr   S SKJr  S SKrSSKJr  \R.                  " \5      r " S S	5      r\ " S
 S\5      5       r\ " S S\\5      5       r " S S\5      r " S S\5      r " S S\5      r " S S\5      r g! \ a    Sr N~f = f)    )annotationsN)Path)AnyCallableSequenceMappingIterableProtocolClassVarruntime_checkable)SentencePieceProcessor   )
GGUFWriterc                      \ rS rSr% S\S'   S\S'   S\S'   S\S	'      S       SS jjrSS jrSSS jjrSS jrSS jr	SS jr
SS jrSS jrSrg
)SpecialVocab   	list[str]mergeszdict[str, bool]add_special_tokendict[str, int]special_token_idsz(str | Sequence[Mapping[str, str]] | Nonechat_templateNc                    0 U l         0 U l        X@l        X l        / U l        S U l        Ub  X0l        OSU l        U R                  [        U5      5        g )N)boseosunkseppadclsmask)	r   r   n_vocabload_mergesr   r   special_token_types_loadr   )selfpathr"   r#   r!   s        D/home/james-whalen/.local/lib/python3.13/site-packages/gguf/vocab.py__init__SpecialVocab.__init__   sR    
 "$!#&!*':$'YD$

4:    c                    SR                  [        U R                  5      U R                  =(       d    SU R                  =(       d    S5      $ )NzG<SpecialVocab with {} merges, special tokens {}, add special tokens {}>unset)formatlenr   r   r   r%   s    r'   __repr__SpecialVocab.__repr__-   s<    X__d44?AWAWAb[b
 	
r*   c                   U R                   (       aO  U(       d,  [        R                  S[        U R                   5       S35        UR	                  U R                   5        O&U R
                  (       a  [        R                  S5        U R                  R                  5        Ha  u  p4[        USU S3S 5      nUc  [        R                  SU SU S35        M7  U(       d  [        R                  S	U S
U 35        U" U5        Mc     U R                  R                  5        Ha  u  p6[        USU S3S 5      nUc  [        R                  SU SU S35        M7  U(       d  [        R                  SU SU 35        U" U5        Mc     U R                  bE  U(       d"  [        R                  SU R                   35        UR                  U R                  5        g g )NzAdding z
 merge(s).zJAdding merges requested but no merges found, output may be non-functional.add_	_token_idz"No handler for special token type z	 with id  - skippingzSetting special token type z to add_add__tokenzNo handler for add_z_token with value zSetting add_z
_token to zSetting chat_template to )r   loggerinfor.   add_token_mergesr"   warningr   itemsgetattrr   r   add_chat_template)r%   gwquiettyptokid
id_handlervalueadd_handlers           r'   add_to_ggufSpecialVocab.add_to_gguf2   s   ;;gc$++&6%7zBC,NNgh00668JC7>rT#iCXZ^7_J!!CC5	RWQXXcde9#d5'JKu 9 00668JC9@xPSuTZE[]a9bK"!4SE9KE7R]^_l3%z%AB 9 )78J8J7KLM  !3!34 *r*   c                    U R                  U5        U R                  U5        U R                  (       a$  U R                  (       d  U R	                  U5        g g g N)_try_load_from_tokenizer_json_try_load_from_config_jsonr"   r   _try_load_merges_txt)r%   r&   s     r'   r$   SpecialVocab._loadN   sC    **40''-DKK%%d+ %0r*   c                8   US-  nUR                  5       (       d  g[        USSS9 n[        US5      R                  5       nUR	                  S5      (       d  UR                  S5        SnOS	n/ nU H  nUS	-  nUR                  5       nU(       d  M!  UR                  S S
5      n[        U5      S:w  a'  [        R                  UR                   SU S35        Mi  UR                  US    SUS	    35        M     S S S 5        WU l        g! , (       d  f       N= f)Nz
merges.txtFrutf-8encoding #r   r         z: Line z: Entry malformed, ignoring T)is_fileopennextstrip
startswithseeksplitr.   r8   r;   nameappendr   )	r%   r&   merges_filefp
first_lineline_numr   linepartss	            r'   rL   !SpecialVocab._try_load_merges_txtT   s   \)""$$+sw72b"++-J((--
FAzz|

4+u:?NNk&6&6%7wxjHc#deq
!E!H:67  8$ ' 87s   CD
Dc           	     &   [        U[        5      (       d  g US:  a  [        SU SU 35      eU R                  b  X R                  :  a  XR                  ;   a  g X R                  U'   g [
        R                  SU SU SU R                   S35        g )Nr   z%invalid value for special token type z: zSpecial token type z, id z out of range, must be under r5   )
isinstanceint
ValueErrorr!   r   r8   r;   )r%   rA   tids      r'   _set_special_tokenSpecialVocab._set_special_tokenm   s    #s##7DSEC5QRR<<3#5,,,*-""3',SEse;XY]YeYeXffqrsr*   c                z  ^ US-  nUR                  5       (       Ga  [        USS9 n[        R                  " U5      nS S S 5        U R                  (       GaD  WR                  S0 5      R                  S5      n[        U[        5      (       Ga  U(       Ga  [        US   [        5      (       a  XPl	        O[        US   [        5      (       a  [        US   5      S:X  a  [        US   S   [        5      (       a  [        S U 5       5      (       a.  [        R                  S	[        [        S
5      S-   5      < 35        U VVs/ s H;  nS
R!                  U Vs/ s H  nSR!                  S U 5       5      PM     sn5      PM=     snnU l	        O[#        S5      eWR                  S0 5      nO0 nUS-  n	U	R                  5       (       d  g[        U	SS9 n[        R                  " U5      n
S S S 5        S nUS-  nUR                  5       (       a8  [        USS9 n[        R                  " U5      R                  S5      nS S S 5        W
R                  SU5      nUb  [        U[        [        45      (       a  Xl        O[        R                  SU	< S35        U R&                   H  nU
R                  SU S35      n[        U[(        5      (       a  XR*                  U'   U
R                  U S35      n[        U[        5      (       a  UmOB[        U[,        5      (       a+  UR                  S5      n[        U[        5      (       d  M  UmOM  [/        U4S jU 5       S 5      nU R1                  UU5        M     g! , (       d  f       GNE= fs  snf s  snnf ! , (       d  f       GN= f! , (       d  f       GNq= f)Ntokenizer.jsonrP   rQ   modelr   r   rV   c              3  >   #    U  H  o  H
  nS U;   v   M     M     g7f)rW   N ).0pairss      r'   	<genexpr>=SpecialVocab._try_load_from_tokenizer_json.<locals>.<genexpr>   s     IFDDqsaxDxFs   z'Spaces in merges detected, encoding as rW      rS   c              3  `   #    U  H$  nUS :X  a  [        [        U5      S-   5      OUv   M&     g7f)rW   ry   N)chrord)rt   cs     r'   rw   rx      s/      ,&15A >?#XCFSL(91(L15s   ,.zUnknown tokenizer merges formatadded_tokensztokenizer_config.jsonTzchat_template.jsonr   z$Bad type for chat_template field in z - ignoringr3   r7   contentc              3  r   >#    U  H,  oR                  S 5      T:X  d  M  UR                  S5      v   M.     g7f)r   idN)get)rt   atok
tc_contents     r'   rw   rx      s,     ^LDHHY<OS]<]$Ls   77)rX   rY   jsonloadr"   r   ri   liststrr   r.   anyr8   r;   r{   r|   joinrk   r   r#   boolr   dictrZ   rm   )r%   r&   tokenizer_filef	tokenizerr   ru   partr~   tokenizer_config_filetokenizer_configchat_template_altchat_template_filer   rA   	add_entryentryentry_contentmaybe_token_idr   s                      @r'   rJ   *SpecialVocab._try_load_from_tokenizer_jsony   sG    00!!##n9Q IIaL	 :"w377Afd++!&)S11&,#F1It44VAY19LQ[\bcd\efg\hjmQnQn IFIII"NN-TUXY\]`YadgYgUhTk+lm )/' )/  HH 15!" 15	 %'GG ,&15,& %& 15!"	 )/' ))JKK$==<LL $'> >$,,..'G<#yy| = !$88%%''(W=$(IIaL$4$4_$E! >(,,_>OP J}sDk$J$J!.NNABWAZZefg++C(,,tC5-?@I)T**.7&&s+$((C58E%%%"
E4(( %		) 4!-55*
!^L^N ##C8' ,( E :9!"'* =<
 >=s;   M<N"N<NN&N+<
NN
N(+
N:c                   US-  nUR                  5       (       d  g[        USS9 n[        R                  " U5      nS S S 5        U R                   H'  nU R                  UWR                  U S35      5        M)     g! , (       d  f       NF= f)Nzconfig.jsonFrP   rQ   r4   T)rX   rY   r   r   r#   rm   r   )r%   r&   config_filer   configrA   s         r'   rK   'SpecialVocab._try_load_from_config_json   sx    ]*""$$+'2aYYq\F 3++C##Cse94E)FG ,	 32s   A==
B)r   r   r"   r   r!   r   r#   )FNN)r&   zstr | os.PathLike[str]r"   r   r#   zIterable[str] | Noner!   z
int | Nonereturnr   )F)r?   r   r@   r   r   None)r&   r   r   r   )r&   r   r   r   )rA   r   rl   r   r   r   )__name__
__module____qualname____firstlineno____annotations__r(   r0   rF   r$   rL   rm   rJ   rK   __static_attributes__rs   r*   r'   r   r      st    &&%%;; AF48"*9=1 "

58,2
tENr*   r   c                  *    \ rS rSr% S\S'   S\S'   Srg)	BaseVocab   zClassVar[str]tokenizer_modelr_   rs   N)r   r   r   r   r   r   rs   r*   r'   r   r      s    ""
r*   r   c                  R    \ rS rSr% S\S'   S\S'   S\S'   S\S	'   SS
 jrSS jrSrg)Vocab   rj   
vocab_sizer   added_tokens_dictr   added_tokens_listr   fname_tokenizerc                    g rI   rs   )r%   	base_paths     r'   r(   Vocab.__init__   s    r*   c                    g rI   rs   r/   s    r'   
all_tokensVocab.all_tokens   s    3r*   rs   Nr   r   r   z-Iterable[tuple[bytes, float, gguf.TokenType]])r   r   r   r   r   r(   r   r   rs   r*   r'   r   r      s    O%%  ,Nr*   r   c                  &    \ rS rSrSrSrSS jrSrg)NoVocab   no_vocabc                    g)Nz3<NoVocab for a model without integrated vocabulary>rs   r/   s    r'   r0   NoVocab.__repr__   s    Dr*   rs   Nr   )r   r   r   r   r   r_   r0   r   rs   r*   r'   r   r      s     ODEr*   r   c                  N    \ rS rSrSrSrSS jrSS jrSS jrSS jr	SS jr
S	rg
)BpeVocab   gpt2bpec           
        0 nUS-  =nR                  5       (       a\  [        USS9 n[        R                  " U5      U l        S S S 5         [        US-  SS9 n[        R                  " U5      nS S S 5        OUS-  n[        USS9 n[        R                  " U5      nS S S 5        WS   nUS   S:w  d#  UR                  S	S
5      (       d  US   S   S:w  a  [        S5      eUS   U l        UR                  S5      =nb.  U Vs0 s H!  nUS   U R                  ;  d  M  US   US   _M#     nn[        U R                  5      n	[        [        X[        U5      -   5      5      n
[        UR                  5       5      nX:w  a1  U	[        U5      -   S-
  n[        S[        U5       SU	 SU SU 35      e[        UR                  5       S S9nX l        U VVs/ s H  u  pUPM	     snnU l        Xl        U R                   [        U R                  5      -   U l        X0l        g ! , (       d  f       GN= f! , (       d  f       GN= f! [
         a     GN$f = f! , (       d  f       GN= fs  snf s  snnf )Nz
vocab.jsonrP   rQ   added_tokens.jsonrp   rq   typeBPEbyte_fallbackFdecoder	ByteLevelzCannot find GPT-2 BPE tokenizervocabr~   r   r   r   zExpected the z1 added token ID(s) to be sequential in the range z - z; got c                    U S   $ Nr   rs   )text_idxs    r'   <lambda>#BpeVocab.__init__.<locals>.<lambda>  s    (1+r*   key)existsrY   r   r   r   FileNotFoundErrorr   r.   r   rangesortedvaluesrk   r<   r   r   vocab_size_baser   r   )r%   r   r~   r   r   tokenizer_jsonr   addeditemr   expected_ids
actual_idsexpected_end_idr<   textidxs                   r'   r(   BpeVocab.__init__   sl   ')(<77O??AAo8A!YYq\
 9)&99GLPQ#'99Q<L ML (*::O o8A!%1 9 /=W.EO'50O4G4GY^4_4_!),V4C'(IJJ(1DJ'++N;;H -2 F,1D#'	?$**#D !<Yd ;,1   F 4::
E*3|;L.LMNl1134
%(3z?:Q>O}S_,==n *|3.?vj\S T T |))+1MN$0=B$CUktTU$C$.$($8$83t?U?U;V$V$3[ 98
 ML$  98 F %DsY   H"I H40I II) I)I."
H14
I>I I 
II
I&c              #     #    U R                   R                  5        VVs0 s H  u  pX!_M	     nnn[        U R                   5       H&  u  pEX4   S[        R                  R
                  4v   M(     g s  snnf 7f)Ng        )r   r<   	enumerategguf	TokenTypeNORMAL)r%   encoded_tokr   reverse_vocabi_s         r'   
bpe_tokensBpeVocab.bpe_tokens  sa     @D

@P@P@RS@R_[@RSdjj)DA"C)>)>>> * Ts   A7A1A	A7c              #     #    U R                    H3  nSnUR                  S5      U[        R                  R                  4v   M5     g 7fN     @rP   )r   encoder   r   CONTROLr%   r   scores      r'   r~   BpeVocab.added_tokens"  s:     **DE++g&t~~/E/EEE +   AAc              #  t   #    U R                  5        S h  vN   U R                  5        S h  vN   g  N N7frI   )r   r~   r/   s    r'   r   BpeVocab.all_tokens'  s/     ??$$$$$&&& 	%&   848688c                N    SU R                    S[        U R                  5       S3$ )Nz<BpeVocab with  base tokens and  added tokens>r   r.   r   r/   s    r'   r0   BpeVocab.__repr__+  s,     !5!5 66GDLbLbHcGddrssr*   )r   r   r   r   r   r   Nr   r   r   )r   r   r   r   r   r_   r(   r   r~   r   r0   r   rs   r*   r'   r   r      s(    OD24h?F
'tr*   r   c                  N    \ rS rSrSrSrSS jrSS jrSS jrSS jr	SS jr
S	rg
)SentencePieceVocabi/  llamaspmc           	        [         c  [        S5      e0 nUS-  =nR                  5       (       a.   [        US-  SS9 n[        R
                  " U5      nS S S 5        O/UR                  S-  =nR                  5       (       d  [        S5      e[        5       U l        U R                  R                  [        U5      5        U R                  R                  5       nUR                  5        VVs0 s H  u  pgXu:  d  M  Xv_M     nnn[        [        XU[        U5      -   5      5      n	[!        UR#                  5       5      n
X:w  a  [%        SU	 SU
 35      eX l        U
 Vs/ s H  oxU   PM	     snU l        XPl        U R*                  [        U R(                  5      -   U l        X0l        g ! , (       d  f       GN*= f! [         a     GN:f = fs  snnf s  snf )	Nzsentencepiece is not installedztokenizer.modelr   rP   rQ   zCannot find tokenizer.modelzExpected new token IDs z to be sequential; got )r   RuntimeErrorr   rY   r   r   r   parentsentencepiece_tokenizerLoadFromFiler   r   r<   r   r   r.   r   keysrk   r   r   r   r   )r%   r   r~   r   r   r   piecer   
new_tokensexpected_new_idsactual_new_idss              r'   r(   SentencePieceVocab.__init__3  s   !)?@@')(+<<<ODDFF)&99GLPQ#'99Q<L ML &/%5%58I%II/QQSS#$ABB'='?$$$11#o2FG11<<>
7C7I7I7K`7K)%rO_IBI7K
`js:2N OP!*//"34-67G6HH_`n_opqq #/<J"KNbb>N"K","&"6"6T=S=S9T"T"11 ML$  a #Ls@   F9 F'F9 ,G
;G
G'
F61F9 6F9 9
GGc              #    #    U R                   n[        UR                  5       5       GH  nUR                  U5      nUR	                  S5      nUR                  U5      n[        R                  R                  nUR                  U5      (       a  [        R                  R                  nUR                  U5      (       a  [        R                  R                  nUR                  U5      (       a  [        R                  R                  nUR                  U5      (       a  [        R                  R                   nXEU4v   GM     g 7fNrP   )r  r   r   	IdToPiecer   GetScorer   r   r   	IsUnknownUNKNOWN	IsControlr   IsUnusedUNUSEDIsByteBYTE)r%   r   r   r	  r   r   toktypes          r'   sentencepiece_tokens'SentencePieceVocab.sentencepiece_tokensU  s     00	y++-.A''*E <<0D$--a0Enn++G""1%%..00""1%%..00 !!!$$..//""..--w&&) /s   EEc              #     #    U R                    H3  nSnUR                  S5      U[        R                  R                  4v   M5     g 7fr   )r   r   r   r   USER_DEFINEDr   s      r'   r~   SentencePieceVocab.added_tokensm  s:     **DE++g&t~~/J/JJJ +r   c              #  t   #    U R                  5        S h  vN   U R                  5        S h  vN   g  N N7frI   )r  r~   r/   s    r'   r   SentencePieceVocab.all_tokensr  s1     ,,...$$&&& 	/&r   c                N    SU R                    S[        U R                  5       S3$ )Nz<SentencePieceVocab with r   r   r   r/   s    r'   r0   SentencePieceVocab.__repr__v  s.    *4+?+?*@@QRUVZVlVlRmQnn|}}r*   )r   r   r   r  r   r   Nr   r   r   )r   r   r   r   r   r_   r(   r  r~   r   r0   r   rs   r*   r'   r   r   /  s(    OD 2D'0K
'~r*   r   c                  h    \ rS rSrSrSrSS jrSS jrSS jrSS jr	SS jr
S	 rSS
 jrSS jrSrg)LlamaHfVocabiz  r  hfftc                H   US-  n[        USS9 n[        R                  " U5      nS S S 5        WS   nUS   S:H  =(       a0    UR                  SS5      =(       a    UR                  S	S
5      (       + nU(       a  [	        S5      eU(       d7  US   S:w  d#  UR                  S	S5      (       a  US   S   S:w  a  [        S5      e SSKJn  UR                  UUS
S9U l
        U R                  R                  (       d   e/ U l        [        5       U l        [        5       U l        [#        U R                  R%                  5       R'                  5       S S9 Hd  u  pXR                  R(                  :  d  M   U R                  R+                  U	5        XR                  U	'   U R                   R-                  U
5        Mf     U R                  R.                   V	s0 s H   n	XR                  R1                  5       U	   _M"     sn	U l        [        U R                  R4                  5      U l        U R                  R(                  U l        U R8                  [;        U R                  5      -   U l        X l        g ! , (       d  f       GNO= f! [         a  n[        S5      UeS nAff = fs  sn	f )Nrp   rP   rQ   rq   r   r   ignore_mergesFr   Tz'Llama 3 must be converted with BpeVocabr   r   zCannot find Llama BPE tokenizerr   )AutoTokenizerzsTo use LlamaHfVocab, please install the `transformers` package. You can install it with `pip install transformers`.)	cache_dirlocal_files_onlyc                    U S   $ r   rs   )xs    r'   r   'LlamaHfVocab.__init__.<locals>.<lambda>  s    AaDr*   r   )rY   r   r   r   	TypeErrorr   transformersr(  ImportErrorfrom_pretrainedr   is_fastr   r   r   setadded_tokens_idsr   get_added_vocabr<   r   r`   addall_special_tokens	get_vocabspecialsall_special_idsspecial_idsr   r.   r   )r%   r   r   r   r   r   	is_llama3r(  etoktokidxs              r'   r(   LlamaHfVocab.__init__~  sb   #&66/G4!YYq\N 5 +9*AF#u, ?1D1D_V[1\ ?#''>> 	 EFFF#u,O4G4GY^4_4_i(0J>#$EFF	2 '66! 7 

 ~~%%%% "$!%!$ "NN**,224.
KC 222&&--c2.4&&s+%%))&1
 ~~88
8 ))+C008
 t~~==>  $~~88#33c$:P:P6QQ.{ 54(  	F 	>
s)   I/J 'J/
I>
JJJc              #  n  #    U R                   R                  5       R                  5        VVs0 s H  u  pX!_M	     nnn[        U R                  5       HV  nX@R
                  ;   a  M  X4   R                  S5      nXPR                  U5      U R                  XEU R                  5      4v   MX     g s  snnf 7fr  )
r   r8  r<   r   r   r4  r   get_token_scoreget_token_typer;  )r%   r   r   r   token_id
token_texts         r'   	hf_tokensLlamaHfVocab.hf_tokens  s     37>>3K3K3M3S3S3U
3UBO3U 	 
 d223H000 '077@J 228<d>Q>Qd&6&6?   4	
s   ,B5B/A9B5c                    [         R                  " SU5      (       a  [        R                  R                  $ X;   a  [        R                  R
                  $ [        R                  R                  $ )Ns   <0x[0-9A-Fa-f]{2}>)re	fullmatchr   r   r  r   r   )r%   rD  rE  r;  s       r'   rC  LlamaHfVocab.get_token_type  sJ    <<.
;;>>&&& *2)@t~~%%[dnnF[F[[r*   c                    g)Nr   rs   )r%   rD  s     r'   rB  LlamaHfVocab.get_token_score  s     r*   c              #  D  #    U R                    H  nXR                  ;   aI  U R                  U R                  U   SU R                  5      nU R	                  U R                  U   5      nO[
        R                  R                  nSnUR                  S5      X24v   M     g 7f)Nr*   r   rP   )	r   r9  rC  r;  rB  r   r   r  r   )r%   r   r  r   s       r'   r~   LlamaHfVocab.added_tokens  s     **D}}$--dmmD.A3HXHXY,,T]]4-@A..55++g&66 +s   BB c                t    SU R                   R                  ;   =(       d    SU R                   R                  ;   $ )Nz<0x0A>
)r   r   r/   s    r'   has_newline_tokenLlamaHfVocab.has_newline_token  s+    4>>///O44>>;O;O3OOr*   c              #  t   #    U R                  5        S h  vN   U R                  5        S h  vN   g  N N7frI   )rF  r~   r/   s    r'   r   LlamaHfVocab.all_tokens  s/     >>###$$&&& 	$&r   c                N    SU R                    S[        U R                  5       S3$ )Nz<LlamaHfVocab with r   r   r   r/   s    r'   r0   LlamaHfVocab.__repr__  s-    $T%9%9$::KCPTPfPfLgKhhvwwr*   )	r   r4  r   r   r;  r9  r   r   r   Nr   r   )rD  rj   rE  bytesr;  zset[int]r   zgguf.TokenType)rD  rj   r   floatr   )r   r   r   r   r   r_   r(   rF  rC  rB  r~   rR  r   r0   r   rs   r*   r'   r$  r$  z  s9    OD@/D$\
	7P'xr*   r$  )!
__future__r   rI  loggingr   ospathlibr   typingr   r   r   r   r	   r
   r   r   sentencepiecer   r0  r   gguf_writerr   	getLoggerr   r8   r   r   r   r   r   r   r$  rs   r*   r'   <module>rb     s    " 	   	  d d d"4  #			8	$r rj   
 OIx O OEi EHtu HtVH~ H~Vxx5 xx]  "!"s   B, ,B76B7