
    bCi"                     V    S r SSKr\R                  S:  a  SSKrOSSKr " S S5      rg)z"English Normalizer class for CLVP.    N)      c                       \ rS rSrS rS\S\4S jrS\S\4S jrS\S\4S	 jr	S\S\4S
 jr
S\S\4S jrS\S\4S jrS\S\4S jrS\S\4S jrS\S\4S jrS\S\4S jrS rSrg)EnglishNormalizer   c                     S Vs/ s H3  n[         R                  " SUS   -  [         R                  5      US   4PM5     snU l        / SQU l        / SQU l        / SQU l        g s  snf )N))mrsmisess)mrmister)drdoctor)stsaint)cocompany)jrjunior)majmajor)gengeneral)drsdoctors)revreverend)lt
lieutenant)hon	honorable)sgtsergeant)captcaptain)esqesquire)ltdlimited)colcolonel)ftfortz\b%s\.r      )
 onetwothreefourfivesixseveneightnine)
teneleventwelvethirteenfourteenfifteensixteen	seventeeneighteennineteen)
r.   r.   twentythirtyfortyfiftysixtyseventyeightyninety)recompile
IGNORECASE_abbreviationsonesteenstens)selfxs     d/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/clvp/number_normalizer.py__init__EnglishNormalizer.__init__   sl    
 ZZ
QqT)2==91Q4@
0 a	

 k	K
s   :A#numreturnc                 @   US:X  a  gUS:  a  SU R                  [        U5      5      -   $ US:  a  U R                  U   $ US:  a  U R                  US-
     $ US:  a7  U R                  US-     US-  S:w  a  SU R                  US-  5      -   -   $ S-   $ US	:  a:  U R                  US-     S
-   US-  S:w  a  SU R                  US-  5      -   -   $ S-   $ US:  a<  U R                  US	-  5      S-   US	-  S:w  a  SU R                  US	-  5      -   -   $ S-   $ US:  a<  U R                  US-  5      S-   US-  S:w  a  SU R                  US-  5      -   -   $ S-   $ US:  a<  U R                  US-  5      S-   US-  S:w  a  SU R                  US-  5      -   -   $ S-   $ US:  a<  U R                  US-  5      S-   US-  S:w  a  SU R                  US-  5      -   -   $ S-   $ US:  a<  U R                  US-  5      S-   US-  S:w  a  SU R                  US-  5      -   -   $ S-   $ g)aP  
Converts numbers(`int`) to words(`str`).

Please note that it only supports upto - "'nine hundred ninety-nine quadrillion, nine hundred ninety-nine
trillion, nine hundred ninety-nine billion, nine hundred ninety-nine million, nine hundred ninety-nine
thousand, nine hundred ninety-nine'" or `number_to_words(999_999_999_999_999_999)`.
r   zerozminus 
      d   -r.      hundred i@B z	 thousand, i ʚ;z millionl    J)z billionl     I5 z	 trillionl     NZoz quadrillionznumber out of range)number_to_wordsabsrN   rO   rP   )rQ   rV   s     rS   rb   !EnglishNormalizer.number_to_wordsF   s	    !81Wd223s8<<<2X99S>!2X::cBh''3Y99SBY'SVY[S[_`S`31E1EcBh1O+Oiifhii4Z		#*%
2_beh_hlm_mcD<P<PQTWZQZ<[6[vsuv 9_$$SD[1>ADjAo4$..sTz::W TVW
 = $$SI%56CF?VWCW4$..sY??a ^`a
 $$$$SM%9:GJ]GZ^_G_4$..s]/BCCi fhi
 (($$S,=%=>KNQbKbfgKg4$..s5F/FGGq npq
 ,,$$S,A%AB ! 22a7 4//6K0KLL  )    textc                 D    UR                  SS5      R                  S5      $ )z
Converts unicode to ascii
asciiignorezutf-8)encodedecoderQ   rf   s     rS   convert_to_ascii"EnglishNormalizer.convert_to_ascii   s      {{7H-44W==re   mc                    UR                  S5      nUR                  S5      n[        U5      S:  a  US-   $ US   (       a  [        US   5      OSn[        U5      S:  a  US   (       a  [        US   5      OSnU(       a,  U(       a%  US:X  a  SOSnUS:X  a  SOS	nU< S
U< SU< S
U< 3$ U(       a  US:X  a  SOSnU< S
U< 3$ U(       a  US:X  a  SOS	nU< S
U< 3$ g)zJ
This method is used to expand numerical dollar values into spoken words.
r-   .   z dollarsr   dollardollarscentcentsr`   ra   zzero dollars)groupsplitlenint)rQ   ro   matchpartsrt   rv   dollar_unit	cent_units           rS   _expand_dollars!EnglishNormalizer._expand_dollars   s     
C u:>:%%#(8#eAh-!$UaE!HE!H!u&-l(	K"'1*'I%,k5)LL&-l(	K%{33"'1*'I#Y//!re   c                 D    UR                  S5      R                  SS5      $ )z6
This method is used to remove commas from sentences.
r-   ,r.   rw   replacerQ   ro   s     rS   _remove_commas EnglishNormalizer._remove_commas   s     wwqz!!#r**re   c                 D    UR                  S5      R                  SS5      $ )z?
This method is used to expand '.' into spoken word ' point '.
r-   rq   z point r   r   s     rS   _expand_decimal_point'EnglishNormalizer._expand_decimal_point   s     wwqz!!#y11re   c                     SSSS.n[        UR                  S5      SS 5      nSUS	-  ::  a  US	-  S
::  a  SnOUR                  US-  S5      nU R                  U5      U-   $ )zP
This method is used to expand ordinals such as '1st', '2nd' into spoken words.
r   ndrd)r-   rr   r   r   NrZ   r\   r[   th)rz   rw   getrb   )rQ   rV   ordinal_suffixessuffixs       rS   _expand_ordinal!EnglishNormalizer._expand_ordinal   sp      $6#))A,s#$s?sSyBF%))#(D9F##C(611re   c                 &   [        UR                  S5      5      nUS:  aa  US:  a[  US:X  a  gUS:  a  US:  a  SU R                  US-  5      -   $ US-  S:X  a  U R                  US-  5      S	-   $ U R                  U5      $ U R                  U5      $ )
z
This method acts as a preprocessing step for numbers between 1000 and 3000 (same as the original repository,
link :
https://github.com/neonbjb/tortoise-tts/blob/4003544b6ff4b68c09856e04d3eff9da26d023c2/tortoise/utils/tokenizer.py#L86)
r   r^   i  i  ztwo thousandi  ztwo thousand r\   r_   )rz   rw   rb   )rQ   ro   rV   s      rS   _expand_number EnglishNormalizer._expand_number   s     !''!*o:#*d{%td
&)=)=cCi)HHHsa++C3J7*DD++C00'',,re   c                    [         R                  " SU R                  U5      n[         R                  " SSU5      n[         R                  " SU R                  U5      n[         R                  " SU R                  U5      n[         R                  " SU R
                  U5      n[         R                  " SU R                  U5      nU$ )zw
This method is used to normalize numbers within a text such as converting the numbers to words, removing
commas, etc.
z([0-9][0-9,]+[0-9])u   £([0-9,]*[0-9])z	\1 poundsz\$([0-9.,]*[0-9])z([0-9]++\.[0-9]+)z[0-9]++(st|nd|rd|th)z[0-9]+)rJ   subr   r   r   r   r   rl   s     rS   normalize_numbers#EnglishNormalizer.normalize_numbers   s    
 vv,d.A.A4Hvv)<>vv*D,@,@$Gvv*D,F,FMvv-t/C/CTJvvi!4!4d;re   c                 ^    U R                    H  u  p#[        R                  " X#U5      nM     U$ )z
Expands the abbreviate words.
)rM   rJ   r   )rQ   rf   regexreplacements       rS   expand_abbreviations&EnglishNormalizer.expand_abbreviations   s,     #'"5"5E66%d3D #6re   c                 Z    [         R                  " [         R                  " S5      SU5      $ )z
Removes multiple whitespaces
z\s+r`   )rJ   r   rK   rl   s     rS   collapse_whitespace%EnglishNormalizer.collapse_whitespace   s      vvbjj(#t44re   c                     U R                  U5      nUR                  5       nU R                  U5      nU R                  U5      nU R	                  U5      nUR                  SS5      nU$ )zt
Converts text to ascii, numbers / number-like quantities to their spelt-out counterparts and expands
abbreviations
"r.   )rm   lowerr   r   r   r   rl   s     rS   __call__EnglishNormalizer.__call__   sd     $$T*zz|%%d+((.''-||C$re   )rM   rN   rO   rP   N)__name__
__module____qualname____firstlineno__rT   rz   strrb   rm   r   r   r   r   r   r   r   r   r   __static_attributes__ re   rS   r   r      s    'kR9)3 9)3 9)v>S >S >" " "0+ + +2s 2s 223 23 2- - -(c c   5 5 5re   r   )__doc__sysversion_inforJ   r   r   r   re   rS   <module>r      s/     ) 
 wX Xre   