
    +ht4                         S SK r S SKrS SKrS SKJrJrJrJr  S SKJ	r	  S SK
Jr  S SKJrJr  S SKJr   " S S5      r " S	 S
\5      rg)    N)DictListOptionalUnion)SentencePieceProcessor)PreTrainedTokenizer)BatchEncodingEncodedInput)PaddingStrategyc            
           \ rS rSrS\4S jrSS\4S jjrSS\S\S\S\\	   4S	 jjr
S
\\	   S\4S jrS\\   S\4S jrS rS rSrg)SPTokenizer   
model_pathc                 .   [         R                  R                  U5      (       d   U5       e[        US9U l        U R                  R                  5       U l        U R                  R                  5       U l        U R                  R                  5       U l        U R                  R                  5       U l
        U R                  R                  5       U R                  R                  5       :X  d   e/ SQn/ SQU-   n0 U l        0 U l        U HI  nU R                  U R                  U'   X@R                  U R                  '   U =R                  S-  sl        MK     SR                  U Vs/ s H  n[        R                   " U5      PM     sn5      U l        g s  snf )N)
model_file)z
<|system|>z<|user|><|assistant|>z<|observation|>)z[MASK][gMASK]z[sMASK]sopeop   |)ospathisfiler   sp_model
vocab_sizen_wordsbos_ideos_idunk_idpad_idget_piece_sizespecial_tokensindex_special_tokensjoinreescaperole_special_token_expression)selfr   role_special_tokensr#   tokens        ^/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/pipelines/kolors/tokenizer.py__init__SPTokenizer.__init__   s6   ww~~j))5:5).*E !MM446==//1==//1==//1}}'')T]]-I-I-KKKK\GJ]] $&!#E)-D&6;%%dll3LLAL $ .1XXUh6iUhEryy7GUh6i-j*6is   % Fsc           	      0   U(       a  Sn/ n[         R                  " U R                  U5       H  nX5R                  5       :  a:  UR	                  U R
                  R                  XUR                  5        5      5        UR                  XR                  5       UR                  5        5        UR                  5       nM     U[        U5      :  a,  UR	                  U R
                  R                  XS  5      5        U$ U R
                  R                  U5      $ )Nr   )
r&   finditerr(   startextendr   EncodeAsPiecesappendendlen)r)   r/   encode_special_tokens
last_indextmatchs         r,   tokenizeSPTokenizer.tokenize1   s     JAT%G%GK-HHT]]99!:WXY;;=599;78"YY[
	 L
 CF"55anEFH==//22    boseosreturnc                     [        U[        5      (       d   eU R                  R                  U5      nU(       a  U R                  /U-   nU(       a  X@R
                  /-   nU$ N)
isinstancestrr   encoder   r   )r)   r/   r?   r@   r:   s        r,   rF   SPTokenizer.encode@   sR    !S!!!!MM  #!A[[M!Ar>   r:   c                    S/ p2U H\  nX@R                   ;   a9  U(       a  X R                  R                  U5      -  n/ nX R                   U   -  nMK  UR                  U5        M^     U(       a  X R                  R                  U5      -  nU$ )N )r$   r   decoder5   )r)   r:   textbufferr+   s        r,   rJ   SPTokenizer.decodeI   s~    2fE111MM0088DF11%88e$  MM((00Dr>   tokensc                 <    U R                   R                  U5      nU$ rC   )r   DecodePieces)r)   rN   rK   s      r,   decode_tokensSPTokenizer.decode_tokensW   s    }}))&1r>   c                 t    XR                   ;   a  U R                   U   $ U R                  R                  U5      $ z0Converts a token (str) in an id using the vocab.)r#   r   	PieceToIdr)   r+   s     r,   convert_token_to_idSPTokenizer.convert_token_to_id[   s4    '''&&u--}}&&u--r>   c                     XR                   ;   a  U R                   U   $ XR                  U R                  U R                  4;   d  US:  a  gU R                  R                  U5      $ )=Converts an index (integer) in a token (str) using the vocab.r   rI   )r$   r   r   r!   r   	IdToPiecer)   indexs     r,   convert_id_to_tokenSPTokenizer.convert_id_to_tokena   sV    ---,,U33[[$++t{{;;uqy}}&&u--r>   )r   r   r$   r   r!   r(   r   r#   N)F)FF)__name__
__module____qualname____firstlineno__rE   r-   r<   boolr   intrF   rJ   rQ   rW   r^   __static_attributes__ r>   r,   r   r      sy    k3 k,3# 3 $ T d3i S	 c DI # ..r>   r   c                   <  ^  \ rS rSrSS0r/ SQr   S(U 4S jjrS r\S\	4S j5       r
\
R                  S	\	4S
 j5       r
\S\	4S j5       r\R                  S	\	4S j5       r\S 5       r\S\	4S j5       r\R                  S	\	4S j5       r\S 5       r\S 5       rS rS rS rS rS\\	   S\	4S jrS)S jrS rS rS*S jr S)S\\   S\\\      S\\   4S jjrS\R>                  SSS4S \ \!\	\"4   \#4   S!\\   S"\S#\\   S$\\$   S%\\$   S\%4S& jjr&S'r'U =r($ )+ChatGLMTokenizerj   
vocab_fileztokenizer.model)	input_idsattention_maskposition_idsc                   > SU l         Xl        [        U5      U l        U R                  R                  U R                  R
                  U R                  R                  S.U l        X@l        [        TU ](  " SUUUS.UD6  g )NGLMTokenizer)z<bos><eos><pad>)padding_sideclean_up_tokenization_spacesr8   rg   )namerk   r   	tokenizerr   r   r!   r#   r8   superr-   )r)   rk   rs   rt   r8   kwargs	__class__s         r,   r-   ChatGLMTokenizer.__init__o   s{     #	$$Z0^^**^^**^^**

 &;" 	
%)E"7	
 		
r>   c                     XR                   ;   a  U R                   U   $ XR                  R                   ;   d   U SU R                   35       eU R                  R                   U   $ )Nz is not a special token for )r#   rv   ru   rV   s     r,   get_commandChatGLMTokenizer.get_command   sc    '''&&u--555h%@\]a]f]f\g7hh5~~,,U33r>   rA   c                     gNz<unk>rg   r)   s    r,   	unk_tokenChatGLMTokenizer.unk_token       r>   valuec                     Xl         g rC   )
_unk_tokenr)   r   s     r,   r   r          r>   c                     gr   rg   r   s    r,   	pad_tokenChatGLMTokenizer.pad_token   r   r>   c                     Xl         g rC   )
_pad_tokenr   s     r,   r   r      r   r>   c                 $    U R                  S5      $ )Nrr   r|   r   s    r,   pad_token_idChatGLMTokenizer.pad_token_id       ((r>   c                     g)Nz</s>rg   r   s    r,   	eos_tokenChatGLMTokenizer.eos_token   s    r>   c                     Xl         g rC   )
_eos_tokenr   s     r,   r   r      r   r>   c                 $    U R                  S5      $ )Nrq   r   r   s    r,   eos_token_idChatGLMTokenizer.eos_token_id   r   r>   c                 .    U R                   R                  $ rC   )rv   r   r   s    r,   r   ChatGLMTokenizer.vocab_size   s    ~~%%%r>   c                     [        U R                  5       Vs0 s H  oR                  U5      U_M     nnUR                  U R                  5        U$ s  snf )zReturns vocab as a dict)ranger   _convert_id_to_tokenupdateadded_tokens_encoder)r)   ivocabs      r,   	get_vocabChatGLMTokenizer.get_vocab   sL    :?:PQ:PQ**1-q0:PQT../ Rs   Ac                 H    U R                   R                  XR                  S9$ )N)r8   )rv   r<   r8   )r)   rK   rx   s      r,   	_tokenizeChatGLMTokenizer._tokenize   s     ~~&&tC]C]&^^r>   c                 8    U R                   R                  U5      $ rT   )rv   rW   rV   s     r,   _convert_token_to_id%ChatGLMTokenizer._convert_token_to_id       ~~11%88r>   c                 8    U R                   R                  U5      $ )rZ   )rv   r^   r\   s     r,   r   %ChatGLMTokenizer._convert_id_to_token   r   r>   rN   c                 8    U R                   R                  U5      $ rC   )rv   rQ   )r)   rN   s     r,   convert_tokens_to_string)ChatGLMTokenizer.convert_tokens_to_string   s    ~~++F33r>   Nc                    [         R                  R                  U5      (       a-  [         R                  R                  XR                  S   5      nOUn[        U R                  S5       nUR                  5       nSSS5        [        US5       nUR                  W5        SSS5        U4$ ! , (       d  f       N7= f! , (       d  f       U4$ = f)a9  
Save the vocabulary and special tokens file to a directory.

Args:
    save_directory (`str`):
        The directory in which to save the vocabulary.
    filename_prefix (`str`, *optional*):
        An optional prefix to add to the named of the saved files.

Returns:
    `Tuple(str)`: Paths to the files saved.
rk   rbNwb)	r   r   isdirr%   vocab_files_namesopenrk   readwrite)r)   save_directoryfilename_prefixrk   fin	proto_strwriters          r,   save_vocabulary ChatGLMTokenizer.save_vocabulary   s     77==((n6L6L\6Z[J'J$//4(C
I ) *d#vLL# $ } )( $# }s   *B,B=,
B:=
Cc                 J    U R                  S5      U R                  S5      /nU$ )Nr   r   r   )r)   prefix_tokenss     r,   get_prefix_tokens"ChatGLMTokenizer.get_prefix_tokens   s(    )))4d6F6Fu6MNr>   c                     US;   d   U5       eU R                  SU S35      /U R                  R                  U S35      -   nU R                  R                  U5      nXE-   nU$ )N)systemuser	assistantobservationz<|z|>
)r|   rv   rF   )r)   rolemetadatamessagerole_tokensmessage_tokensrN   s          r,   build_single_message%ChatGLMTokenizer.build_single_message   sp    EEKtKE''"TF"67$..:O:OS[R\\^P_:``..w7-r>   c           
         Uc  / n/ nU Hk  nUS   nUS   S:X  a%  SU;   a  US-   [         R                  " US   SSS9-   nUR                  U R                  US   UR	                  S	S
5      U5      5        Mm     UR                  U R                  US
U5      5        UR                  U R                  S5      /5        U R                  U/SSS9$ )Ncontentr   r   toolsr      F)indentensure_asciir   rI   r   ptT)return_tensorsis_split_into_words)jsondumpsr3   r   getr|   batch_encode_plus)r)   queryhistoryr   rl   itemr   s          r,   build_chat_input!ChatGLMTokenizer.build_chat_input   s    ?G	D9oGF|x'GtO!D.4::d7mA\a+bbT66tF|TXXjZ\E]_fgh	 
 	224UCD$**?;<=%%yk$\`%aar>   token_ids_0token_ids_1c                 b    U R                  5       nX1-   nUb  X-   U R                  S5      /-   nU$ )a6  
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A BERT sequence has the following format:

- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`

Args:
    token_ids_0 (`List[int]`):
        List of IDs to which the special tokens will be added.
    token_ids_1 (`List[int]`, *optional*):
        Optional second list of IDs for sequence pairs.

Returns:
    `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
rq   )r   r|   )r)   r   r   r   s       r,    build_inputs_with_special_tokens1ChatGLMTokenizer.build_inputs_with_special_tokens   sA    & ..0#1"%3t7G7G7P6QQKr>   encoded_inputs
max_lengthpadding_strategypad_to_multiple_ofreturn_attention_maskrs   c                 :   U R                   S:X  d   eXR                  S      n[        U5      nU[        R                  :X  a  [        U5      nUb  Ub  X$-  S:w  a
  X$-  S-   U-  nU[        R
                  :g  =(       a    [        U5      U:g  n	SU;  a	  S/U-  US'   SU;  a  [        [        U5      5      US'   U	(       aZ  U[        U5      -
  n
SU;   a  S/U
-  US   -   US'   SU;   a  S/U
-  US   -   US'   U R                  /U
-  U-   XR                  S   '   U$ )a  
Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

Args:
    encoded_inputs:
        Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
    max_length: maximum length of the returned list and optionally padding length (see below).
        Will truncate by taking into account the special tokens.
    padding_strategy: PaddingStrategy to use for padding.

        - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
        - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
        - PaddingStrategy.DO_NOT_PAD: Do not pad
        The tokenizer padding sides are defined in self.padding_side:

            - 'left': pads on the left of the sequences
            - 'right': pads on the right of the sequences
    pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
        This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
        `>= 7.5` (Volta).
    padding_side (`str`, *optional*):
        The side on which the model should have padding applied. Should be selected between ['right', 'left'].
        Default value is picked from the class attribute of the same name.
    return_attention_mask:
        (optional) Set to False to avoid returning attention mask (default: set to model specifics)
leftr   r   rm   rn   )	rs   model_input_namesr7   r   LONGEST
DO_NOT_PADlistr   r   )r)   r   r   r   r   r   rs   required_input
seq_lengthneeds_to_be_padded
differences              r,   _padChatGLMTokenizer._pad  s[   H   F***'(>(>q(AB(
666^,J!&8&D*JimnJn%;q@DVVJ-1K1KKqPSTbPcgqPq >101sZ/?N+,/-1%
2C-DN>*#c.&99J>14533CnUeFf3f/0/23z1ANSaDb1b~.9=9J9J8Kj8X[i8iN11!45r>   )r   r   r   r8   ru   r#   rv   rk   )r   FFrC   )Nr   ))r`   ra   rb   rc   r   r   r-   r|   propertyrE   r   setterr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   re   r   r   r   r   r   r   r
   r	   rd   dictr   rf   __classcell__)ry   s   @r,   ri   ri   j   s"   %'89G
 %*#
24 3    s     3    s     ) ) 3    s     ) ) & &_994tCy 4S 44b JN93;DI3F	c8 %),;,F,F,004'+Ad3#45}DEA SMA *	A
 %SMA  (~A tnA 
A Ar>   ri   )r   r   r&   typingr   r   r   r   sentencepiecer   transformersr   $transformers.tokenization_utils_baser	   r
   transformers.utilsr   r   ri   rg   r>   r,   <module>r     s>     	 	 . . 0 , L .M. M.`h* hr>   