
    C1i                      P    S SK r S SKJr  S SKJrJrJr  S SKr " S S5      rSr	Sr
g)    N)cached_property)ListOptionalTuplec            
       R   \ rS rSrSr  SS\R                  S\S\\	   S\\	   4S jjr
\S	\4S
 j5       r\S	\4S j5       r\S	\4S j5       r\S	\4S j5       r\S	\4S j5       r\S	\4S j5       r\S	\4S j5       r\S	\4S j5       r\S	\4S j5       r\S	\\   4S j5       rS\	S	\\   4S jrS\\   S	\	4S jrS\\   S	\	4S jr\S	\\   4S j5       rS\\   S	\\\	   \\\      4   4S jrS\\   S	\\\	   \\\      4   4S jrS\\   S	\\\	   \\\      4   4S jr Sr!g)	Tokenizer	   z-Simple wrapper around a tokenizers.Tokenizer.N	tokenizermultilingualtasklanguagec                    Xl         U(       a  U[        ;  a'  [        SU< SSR                  [        5      < S35      eU[        ;  a'  [        SU< SSR                  [        5      < S35      eU R                   R                  SU-  5      U l        U R                   R                  SU-  5      U l        X@l        g S U l        S U l        SU l        g )N'z'' is not a valid task (accepted tasks: z, )z9' is not a valid language code (accepted language codes: z<|%s|>en)	r
   _TASKS
ValueErrorjoin_LANGUAGE_CODEStoken_to_idr   r   language_code)selfr
   r   r   r   s        R/home/james-whalen/.local/lib/python3.13/site-packages/faster_whisper/tokenizer.py__init__Tokenizer.__init__   s     #6! TYYv.0 
 . ?!;= 
 228d?CDI NN66x(7JKDM!)DI DM!%D    returnc                 8    U R                   R                  S5      $ )Nz<|transcribe|>r
   r   r   s    r   
transcribeTokenizer.transcribe*   s    ~~))*:;;r   c                 8    U R                   R                  S5      $ )Nz<|translate|>r   r    s    r   	translateTokenizer.translate.       ~~))/::r   c                 8    U R                   R                  S5      $ )Nz<|startoftranscript|>r   r    s    r   sotTokenizer.sot2   s    ~~))*ABBr   c                 8    U R                   R                  S5      $ )Nz<|startoflm|>r   r    s    r   sot_lmTokenizer.sot_lm6   r&   r   c                 8    U R                   R                  S5      $ )Nz<|startofprev|>r   r    s    r   sot_prevTokenizer.sot_prev:   s    ~~))*;<<r   c                 8    U R                   R                  S5      $ )Nz<|endoftext|>r   r    s    r   eotTokenizer.eot>   r&   r   c                 8    U R                   R                  S5      $ )Nz<|notimestamps|>r   r    s    r   no_timestampsTokenizer.no_timestampsB   s    ~~))*<==r   c                 |    U R                   R                  S5      =(       d    U R                   R                  S5      $ )Nz<|nospeech|>z<|nocaptions|>r   r    s    r   	no_speechTokenizer.no_speechF   s2    ~~)).9 
T^^=W=W>
 	
r   c                      U R                   S-   $ )N   )r4   r    s    r   timestamp_beginTokenizer.timestamp_beginL   s    !!A%%r   c                     U R                   /nU R                  b  UR                  U R                  5        U R                  b  UR                  U R                  5        U$ N)r(   r   appendr   )r   sequences     r   sot_sequenceTokenizer.sot_sequenceP   sF    HH:==$OODMM*99 OODII&r   textc                 J    U R                   R                  USS9R                  $ )NF)add_special_tokens)r
   encodeids)r   rC   s     r   rF   Tokenizer.encode\   s"    ~~$$Te$DHHHr   tokensc                     U Vs/ s H  o"U R                   :  d  M  UPM     nnU R                  R                  U5      $ s  snf r>   )r1   r
   decode)r   rI   tokentext_tokenss       r   rK   Tokenizer.decode_   s:    *0E&DHH4Du&E~~$$[11 Fs   ??c           	         / /nU H`  nX0R                   :  a:  SX0R                   -
  S-  S S3nUR                  U5        UR                  / 5        ML  US   R                  U5        Mb     SR                  U Vs/ s H5  n[        U[        5      (       a  UOU R
                  R                  U5      PM7     sn5      $ s  snf )Nz<|g{Gz?z.2fz|> )r;   r?   r   
isinstancestrr
   rK   )r   rI   outputsrL   	timestampss         r   decode_with_timestamps Tokenizer.decode_with_timestampsc   s    $E,,, %*>*>">$!Fs K2N	y)r"""5)  wwLSTGq*Q$$Q$..*?*?*BBGT
 	
Ts   9<B<c                    [        S5      nUSR                  5       -  n[        S5      n[        S U 5       5      (       d   eU R	                  S5      S   U R	                  S5      S   1nU[        U5      -    HY  nU R	                  U5      U R	                  SU-   5      4 H-  n[        U5      S	:X  d  XB;   d  M  UR                  US   5        M/     M[     [        [        U5      5      $ )
uM  
Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.

- ♪♪♪
- ( SPEAKING FOREIGN LANGUAGE )
- [DAVID] Hey there,

keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
u#   "#()*+/:;<=>@[\]^_`{|}~「」『』uK   << >> <<< >>> -- --- -( -[ (' (" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪u   ♩♪♫♬♭♮♯c              3   ^   #    U  H#  nS [        U5      s=:*  =(       a    S:*  Os  v   M%     g7f)i@&  i&  N)ord).0cs     r   	<genexpr>.Tokenizer.non_speech_tokens.<locals>.<genexpr>   s$     E}!6SV--v--}s   +-z -r   z ' r:   )	listsplitsetallrF   lenaddtuplesorted)r   symbolsmiscellaneousresultsymbolrI   s         r   non_speech_tokensTokenizer.non_speech_tokensr   s     =>Z``b	
 34E}EEEEE ++d#A&D(9!(<=] 33FF#C&L) v;!#v'>JJvay) 4 VF^$$r   c                 f    U R                   S;   a  U R                  U5      $ U R                  U5      $ )N>   jalomythzhyue)r   split_tokens_on_unicodesplit_tokens_on_spaces)r   rI   s     r   split_to_word_tokensTokenizer.split_to_word_tokens   s7     !FF //77**622r   c                    U R                  U5      nSn/ n/ n/ nSnU H  nUR                  U5        U R                  U5      n	 U	R                  U5      n
X-  n
U
b  U
[	        U5      :  d  MO  X*   U:X  d  MY  UR                  U	5        UR                  U5        / nU[	        U	5      -  nM     XE4$ ! [         a    S n
 Nef = f)Nu   �r   )rW   r?   indexr   re   )r   rI   decoded_fullreplacement_charwordsword_tokenscurrent_tokensunicode_offsetrL   decodedreplacement_char_indexs              r   rv   !Tokenizer.split_tokens_on_unicode   s     226:#E!!%(11.AG.)07G)H&&8& &-&\):: 8<LLW%"">2!##g,.# & !!  .)-&.s   B00B?>B?c                    U R                  U5      u  p#/ n/ n[        X#5       H  u  pgUS   U R                  :  nUR                  S5      n	UR	                  5       [
        R                  ;   n
U(       d  U	(       d  U
(       d  [        U5      S:X  a$  UR                  U5        UR                  U5        M  US   U-   US'   US   R                  U5        M     XE4$ )Nr   r`   rP   )
rv   zipr1   
startswithstripstringpunctuationre   r?   extend)r   rI   subwordssubword_tokens_listr~   r   subwordsubword_tokensspecial
with_spacer   s              r   rw    Tokenizer.split_tokens_on_spaces   s     )-(D(DV(L%'*8'I#G$Q'4883G ++C0J!--/V-?-??K*s5zQW%"">2!"I/b	B&&~6 (J !!r   )r   r   r   r
   )NN)"__name__
__module____qualname____firstlineno____doc__
tokenizersr   boolr   rS   r   r   intr!   r$   r(   r+   r.   r1   r4   r7   propertyr;   r   rA   rF   rK   rW   r   rm   rx   rv   rw   __static_attributes__ r   r   r   r   	   sQ   7 #"&&''& & sm	&
 3-&< <C < < ;3 ; ; CS C C ; ; ; =# = = ;S ; ; >s > > 
3 
 

 & & & 	d3i 	 	I3 I49 I2T#Y 23 2
T#Y 
3 
 !%5: !% !%F	33i	3	tCy$tCy/)	*	3"3i"	tCy$tCy/)	*"@"3i"	tCy$tCy/)	*"r   r   )r!   r$   )dafamarasazbabebgbnbobrbscacscydadeelr   eseteufafifofrglguhahawhehihrhthuhyidisitrp   jwkakkkmknkolalblnrq   ltlvmgmimkmlmnmrmsmtrr   nenlnnnoocpaplpsptrorusasdsiskslsnsosqsrsusvswtatetgrs   tktltrttukuruzviyiyort   ru   )r   	functoolsr   typingr   r   r   r   r   r   r   r   r   r   <module>r      s0     % ( ( J" J"Z

er   