
    cCiA
                     0    S r SSKJrJrJr   " S S5      rg)z Tokenization utils for RoFormer.    )NormalizedStringPreTokenizedStringnormalizersc                   H    \ rS rSrSS jrS\S\S\\   4S jrS\	4S	 jr
S
rg)JiebaPreTokenizer   returnNc                     Xl         [        R                  " SSSSS9U l         SS KnX l        g ! [         a    [	        S5      ef = f)NFT)
clean_texthandle_chinese_charsstrip_accents	lowercaser   zkYou need to install rjieba to use RoFormerTokenizer. See https://pypi.org/project/rjieba/ for installation.)vocabr   BertNormalizerrjiebaImportErrorjieba)selfr   r   s      i/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/roformer/tokenization_utils.py__init__JiebaPreTokenizer.__init__   sY    
&55!%	
	 
  	I 	s	   / Ainormalized_stringc                 h   / nU R                   R                  [        U5      SS9 H  u  pEnX@R                  ;   a  UR	                  X%U 5        M*  U R
                  R                  U5      R                  5       nU H/  nU(       d  M  U[        U5      -   nUR	                  X%U 5        UnM1     M     U$ )NF)hmm)	r   tokenizestrr   appendr   normalize_strsplitlen)r   r   r   splitstokenstartend
token_lists           r   jieba_splitJiebaPreTokenizer.jieba_split&   s     "&!4!4S9J5KQV!4!WE#

"/c:;!--;;EBHHJ
'Eu#c%j0&7c&BC #	 ( "X*     pretokc                 :    UR                  U R                  5        g )N)r    r'   )r   r*   s     r   pre_tokenizeJiebaPreTokenizer.pre_tokenizeA   s    T%%&r)   )r   r   r   )r	   N)__name__
__module____qualname____firstlineno__r   intr   listr'   r   r,   __static_attributes__ r)   r   r   r      s5    "S 5E $O_J` 6'#5 'r)   r   N)__doc__
tokenizersr   r   r   r   r5   r)   r   <module>r8      s    ' H H.' .'r)   