
    cCiIY                     $   S SK r S SKrS SKJr  S SKJr  S SK Jr  S SKJrJ	r	  S SK
r
0 SS_SS	_S
S_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS _S!S"_rS.S#\4S$ jjrS#\4S% jr " S& S'5      r " S( S)5      r " S* S+5      r " S, S-5      rg)/    N)Iterator)Fraction)Match)OptionalUnionu   œoeu   ŒOE   øo   ØO   æae   ÆAE   ßssu   ẞSSu   đdu   ĐD   ð   Ð   þth   Þu   łlu   ŁLsc                 r   ^^ U4S jmSR                  U4S j[        R                  " SU 5       5       5      $ )z
Replace any other markers, symbols, and punctuations with a space, and drop any diacritics (category 'Mn' and some
manual mappings)
c                    > U T;   a  U $ U [         ;   a	  [         U    $ [        R                  " U 5      S:X  a  g[        R                  " U 5      S   S;   a  gU $ )NMn r   MSP )ADDITIONAL_DIACRITICSunicodedatacategory)charkeeps    h/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/whisper/english_normalizer.pyreplace_character8remove_symbols_and_diacritics.<locals>.replace_character5   sY    4<K**(..!!$'4/!!$'*e3    r"   c              3   4   >#    U  H  nT" U5      v   M     g 7fN ).0cr+   s     r*   	<genexpr>0remove_symbols_and_diacritics.<locals>.<genexpr>C   s     R1QA$Q''1Qs   NFKDjoinr&   	normalize)r   r)   r+   s    `@r*   remove_symbols_and_diacriticsr9   /   s,     77R1F1Fvq1QRRRr-   c                 \    SR                  S [        R                  " SU 5       5       5      $ )zS
Replace any other markers, symbols, punctuations with a space, keeping diacritics
r"   c              3   d   #    U  H&  n[         R                  " U5      S    S;   a  SOUv   M(     g7f)r   r#   r$   N)r&   r'   )r1   r2   s     r*   r3   !remove_symbols.<locals>.<genexpr>J   s.     oNn+..q1!4=31DNns   .0NFKCr6   r   s    r*   remove_symbolsr?   F   s(     77okNcNcdjlmNnooor-   c                   8    \ rS rSrS	S\S\4S jjrS\4S jrSrg)
BasicTextNormalizerM   remove_diacriticssplit_lettersc                 @    U(       a  [         O[        U l        X l        g r/   )r9   r?   cleanrD   )selfrC   rD   s      r*   __init__BasicTextNormalizer.__init__N   s    6G2^
*r-   r   c                    UR                  5       n[        R                  " SSU5      n[        R                  " SSU5      nU R                  U5      R                  5       nU R                  (       a5  SR                  [        R                  " SU[        R                  5      5      n[        R                  " SSU5      nU$ )N[<\[][^>\]]*[>\]]r"   \(([^)]+?)\)r$   z\X\s+)	lowerresubrF   rD   r7   regexfindallUrG   r   s     r*   __call__BasicTextNormalizer.__call__R   s    GGIFF'Q/FF?B*JJqM!ua9:AFF63"r-   )rF   rD   N)FF)	__name__
__module____qualname____firstlineno__boolrH   strrU   __static_attributes__r0   r-   r*   rA   rA   M   s!    +$ +t +# r-   rA   c                   t   ^  \ rS rSrSrU 4S jrS\\   S\\   4S jr	S\4S jr
S\4S	 jrS\4S
 jrSrU =r$ )EnglishNumberNormalizer`   aZ  
Convert any spelled-out numbers into arabic numbers, while handling:

- remove any commas
- keep the suffixes such as: `1960s`, `274th`, `32nd`, etc.
- spell out currency symbols after the number. e.g. `$20 million` -> `20000000 dollars`
- spell out `one` and `ones`
- interpret successive single-digit numbers as nominal: `one oh one` -> `101`
c                   > [         TU ]  5         1 SkU l        [        / SQSS9 VVs0 s H  u  pX!_M	     snnU l        U R                  R                  5        VVs0 s H  u  p#US:X  a  SOUS-   US4_M     snnU l        SS	S
SSSS.U R                  R                  5        VVs0 s H<  u  p#US:  d  M  US:w  d  M  US:w  d  M  X"R                  S5      (       a  SOS-   US4_M>     snnEU l        0 U R                  EU R                  EU l	        SSSSSSSSS.U l
        U R                  R                  5        VVs0 s H  u  p#UR                  SS5      US4_M     snnU l        U R                  R                  5        VVs0 s H  u  p#UR                  SS 5      US4_M     snnU l        0 U R                  EU R                  EU l        S!S"S#S$S%S&S'S(S)S*S+S,S-.U l        U R                  R                  5        VVs0 s H  u  p#US-   US4_M     snnU l        U R                  R                  5        VVs0 s H  u  p#US-   US4_M     snnU l        0 U R                   EU R"                  EU l        1 U R                  kU R                  kU R                  kU l        S.S.S/S/S0.U l        S1S1S2S2S3S3S4S4S5.U l        [-        [/        U R(                  R1                  5       5      [/        U R*                  R1                  5       5      -   5      U l        S6S70S7S8.U l        1 S9kU l        U R                  U R                  U R                  U R                  U R                  U R                  U R$                  U R(                  U R*                  U R4                  U R6                  4 VVs1 s H  nU  H  nUiM     M     snnU l        S:S;1U l        g s  snnf s  snnf s  snnf s  snnf s  snnf s  snnf s  snnf s  snnf )<N>   r   ohzero)onetwothreefourfivesixseveneightnineteneleventwelvethirteenfourteenfifteensixteen	seventeeneighteennineteen   )startri   sixesr   )r   r   )rw   st)   nd)   rd)   r   )   r   )zerothfirstsecondthirdfifthtwelfthr}   r   r   thr         (   2   <   F   P   Z   )twentythirtyfortyfiftysixtyseventyeightyninetyyiesiethd     i@B i ʚ;l    J)l     I5 l     NZol     @=7M.cl      B3v^!< l      P ~cegl       73Me'l       (l
F3YHqS )hundredthousandmillionbilliontrillionquadrillionquintillion
sextillion
septillion	octillion	nonillion	decillion-+)minusnegativepluspositive   £u   €$   ¢)poundpoundseuroeurosdollardollarscentcentsr   %)perpercent>   andpointdoubletriplerd   ones)superrH   zeros	enumerater   itemsones_pluralendswithones_ordinalones_suffixedtensreplacetens_pluraltens_ordinaltens_suffixedmultipliersmultipliers_pluralmultipliers_ordinalmultipliers_suffixeddecimalspreceding_prefixersfollowing_prefixerssetlistvaluesprefixes	suffixersspecialswordsliteral_words)rG   inamevaluemappingkey	__class__s         r*   rH    EnglishNumberNormalizer.__init__k   s=   (
 % G
 G
	 W[V_V_VeVeVg
Vg{ttu}G$*uclBVg
  !
 $(99??#4#4KD19 L!&! L05 L}}S11t<udmK#4
 G 0 0FD4E4EF 	
	 W[V_V_VeVeVghVg{tDLLe4uclBVghY]YbYbYhYhYjkYj+$T\\#v6EYjkF 0 0FD4E4EF  $)047;>BF
 PTO_O_OeOeOg"hOg4#:s|#;Og"hRVRbRbRhRhRj#kRj;4D4K%$>Rj#k $[t'>'>$[$BZBZ$[!=$))=dii=$**= 	$
  	$
  D!9!9!@!@!BCd4KcKcKjKjKlFmmnC=
 =
 

		""		""  ))((((
    

" $V_G

$ ik" #i#k6
sA   OO&O6O>O#O?!O !O& O,:O2%O8r   returnc              #     ^^#    S mS mSnS[         4S jnS[        [         [        4   4UU4S jjn[        U5      S:X  a  g [	        U5       GH  u  pVUS:w  a  XS-
     OS nU[        U5      S-
  :w  a  XS-      OS nU(       a  SnM;  US L=(       a    [
        R                  " SU5      n	US   U R                  ;   n
U
(       a  USS  OUn[
        R                  " SU5      (       a  U" U5      nUc  [        S	5      eTbN  [        T[         5      (       a/  TR                  S
5      (       a  [        T5      [        U5      -   mM  U" T5      v   U
(       a  US   OTmUR                  S:X  a  UR                  mGM-  UmGM2  X`R                  ;  a  Tb
  U" T5      v   U" U5      v   GM[  X`R                  ;   a  [        T=(       d    S5      S-   mGM  X`R                  ;   a  U R                  U   nTc  UmGM  [        T[         5      (       d  XpR                  ;   aC  XpR                   ;   a  US:  a  TS S [        U5      -   mGM  [        T5      [        U5      -   mGM  US:  a+  TS-  S:X  a  TU-  mGM(  [        T5      [        U5      -   mGMB  TS-  S:X  a  TU-  mGMS  [        T5      [        U5      -   mGMm  X`R"                  ;   Ga;  U R"                  U   u  pTc  U" [        U5      U-   5      v   GO
[        T[         5      (       d  XpR                  ;   aU  XpR                   ;   a#  US:  a  U" TS S [        U5      -   U-   5      v   OU" [        T5      [        U5      -   U-   5      v   OUS:  aF  TS-  S:X  a  U" [        TU-   5      U-   5      v   OhU" [        T5      [        U5      -   U-   5      v   OETS-  S:X  a  U" [        TU-   5      U-   5      v   O"U" [        T5      [        U5      -   U-   5      v   S mGM  X`R                   ;   aq  U R                   U   nTc  UmGM  [        T[         5      (       a  [        T5      [        U5      -   mGM  TS-  S:X  a  TU-  mGM  [        T5      [        U5      -   mGM8  X`R$                  ;   a  U R$                  U   u  pTc  U" [        U5      U-   5      v   GMt  [        T[         5      (       a%  U" [        T5      [        U5      -   U-   5      v   GM  TS-  S:X  a  U" [        TU-   5      U-   5      v   GM  U" [        T5      [        U5      -   U-   5      v   GM  X`R&                  ;   a  U R&                  U   nTc  UmGM  [        T[         5      (       d  TS:X  aC  U" T5      nUb  UU-  OS nUb  UR                  S:X  a  UR                  mGMm  U" T5      v   UmGM|  TS-  S-  nTS-  nUUU-  -   mGM  X`R(                  ;   a  U R(                  U   u  nnTc  U" [        U5      U-   5      v   O[        T[         5      (       ag  U" T5      nUb  UU-  OS nUb1  UR                  S:X  a!  U" [        UR                  5      U-   5      v   OLU" T5      v   U" [        U5      U-   5      v   O+TS-  S-  nTS-  nUUU-  -   mU" [        T5      U-   5      v   S mGM{  X`R*                  ;   aB  Tb
  U" T5      v   XR                  ;   d  U	(       a  U R*                  U   mGM  U" U5      v   GM  X`R,                  ;   a,  Tb  U R,                  U   mU" T5      v   GM  U" U5      v   GM  X`R.                  ;   a  Tbv  U R.                  U   n[        U[0        5      (       a9  X;   a  U" [        T5      X   -   5      v   SnGM_  U" T5      v   U" U5      v   GMv  U" [        T5      U-   5      v   GM  U" U5      v   GM  X`R2                  ;   Ga;  XR                  ;  a!  U	(       d  Tb
  U" T5      v   U" U5      v   GM  US:X  a,  XpR&                  ;  a  Tb
  U" T5      v   U" U5      v   GM  GM  US:X  d  US:X  a  XR                  ;   d  XR                  ;   aN  US:X  a  SOSnU R                  R5                  US5      n[        T=(       d    S5      [        U5      U-  -   mSnGM  Tb
  U" T5      v   U" U5      v   GM  US:X  a3  XR6                  ;   d  U	(       a  [        T=(       d    S5      S
-   mGM  GM  [        SU 35      e[        SU 35      e   Tb  U" T5      v   g g 7f)NFr   c                 :     [        U 5      $ ! [         a     g f = fr/   )r   
ValueErrorr>   s    r*   to_fraction:EnglishNumberNormalizer.process_words.<locals>.to_fraction   s#    {" s   
 
resultc                 6   > [        U 5      n Tb  TU -   n S mS mU $ r/   )r\   )r   prefixr   s    r*   output5EnglishNumberNormalizer.process_words.<locals>.output   s*    [F!&EFMr-   r   rw   z^\d+(\.\d+)?$zConverting the fraction failed.r"   0
   r   r   Tr   r   r   r{   r}   r   zUnexpected token: )r\   r   intlenr   rO   matchr   r   
isinstancer   denominator	numeratorr   r   r   r   r   r   r   r   r   r   r   dictr   getr   )rG   r   skipr   r   r   currentprevnextnext_is_numeric
has_prefixcurrent_without_prefixfr   suffixr   
multiplierpbeforeresidualrepeatsr   r   s                        @@r*   process_words%EnglishNumberNormalizer.process_words   sm     $+/	3 		5c? 	 	 u:?#E*JA#$65Q<tD#$E
Q#65Q<DD"$.S288<Ld3SO t}}4J4>WQR[G"xx(*@AA 679$%EFF$!%--%..2E2E #E
S\ 9 $Um+'1v==A%KKE2E

*$ -'Wo%JJ&EKR(3.II%yy)= Es++tyy/@yy(TBY %cr
SY 6 #E
SY 6BYrzQ #E
SY 6s{a' #E
SY 6...#11':= TV!344s++tyy/@yy(TBY$U3BZ#d)%;f%DEE$SZ#d)%;f%DEEBYrzQ$S%6%?@@$SZ#d)%;f%DEEs{a'$S%6%?@@$SZ#d)%;f%DEEII%yy)= Es++JT2Es{a' #E
SY 6...#11':= TV!344s++ Uc$i!7&!@AAs{a'$S%6%?@@$SZ#d)%;f%DEE,,,!--g6
=&Es++uz#E*A*+-JTA}!); !$Um+ *"d]T1F$t|H"X
%::E555%)%>%>w%G"
F= Z6!9::s++#E*A*+-JTA}!);$S%5%>??$Um+$S_v%=>>"d]T1F$t|H"X
%::E Uf!455444$ -'::%!55g>F /)444$!55g>F -' /)NN*$!^^G4F!&$//>"(Ufl)B"CC#'D"(-/"(/1$SZ&%899 /)MM)zz)/($Um+ /)%#3#33 ,"(-/$Wo- 4 (Gx,?yy(DJJ,>'.(':!#yy}}T15 #EKR 03t9w3F F# ,"(-/$Wo-'}}, #EKR 03 6 1@ %'9'%CDD !#5gY!?@@G +J - s   eer   c                 d   / n[         R                  " SU5      n[        U5       H  u  pE[        UR	                  5       5      S:X  a  M$  U[        U5      S-
  :X  a  UR                  U5        MI  UR                  U5        UR                  SS9S   nX`R                  ;   d  X`R                  ;   a  UR                  S5        M  UR                  S5        M     S	R                  U5      n[         R                  " S
SU5      n[         R                  " SSU5      n[         R                  " SSU5      nU$ )Nz\band\s+a\s+half\br   rw   r{   )maxsplitr   z
point fivez
and a halfr$   z([a-z])([0-9])z\1 \2z([0-9])([a-z])z([0-9])\s+(st|nd|rd|th|s)\b\1\2)rO   splitr   r   stripappendrsplitr   r   r7   rP   )rG   r   resultssegmentsr   segment	last_words          r*   
preprocess"EnglishNumberNormalizer.preprocess  s    88115#H-JA7==?#q(CMA%%w'w'#NNAN6r:	->N>N1NNN<0NN<0 . HHW FF$h2FF$h2 FF17A>r-   c                     S[         4S jnS[         4S jn[        R                  " SX!5      n[        R                  " SX15      n[        R                  " SSU5      nU$ )Nmc                      U R                  S5      nU R                  S5      n[        U R                  S5      5      nU U SUS 3$ ! [         a    U R                  s $ f = f)Nrw   r{   r}   r   02d)groupr   r   string)r  currencyintegerr   s       r*   combine_cents:EnglishNumberNormalizer.postprocess.<locals>.combine_cents  sa     771:''!*AGGAJ"G9AeC[99  xx s   AA A! A!c                 v     S[        U R                  S5      5       3$ ! [         a    U R                  s $ f = f)Nr   rw   )r   r  r   r  )r  s    r*   extract_cents:EnglishNumberNormalizer.postprocess.<locals>.extract_cents  s9     C
O,--  xx s    88u,   ([€£$])([0-9]+) (?:and )?¢([0-9]{1,2})\bu   [€£$]0.([0-9]{1,2})\bz	\b1(s?)\bzone\1)r   rO   rP   )rG   r   r"  r%  s       r*   postprocess#EnglishNumberNormalizer.postprocess  sW    	 U 	 	 U 	  FFBMUFF.A FF<1-r-   c                     U R                  U5      nSR                  S U R                  UR                  5       5       5       5      nU R	                  U5      nU$ )Nr$   c              3   .   #    U  H  oc  M  Uv   M     g 7fr/   r0   )r1   words     r*   r3   3EnglishNumberNormalizer.__call__.<locals>.<genexpr>  s     X&CdTT&Cs   	)r  r7   r  r  r'  rT   s     r*   rU    EnglishNumberNormalizer.__call__  sJ    OOAHHXd&8&8&CXXQr-   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )rW   rX   rY   rZ   __doc__rH   r   r\   r   r  r  r'  rU   r]   __classcell__)r   s   @r*   r_   r_   `   sW    h-T] 49 ] # ] ~C :S 2#  r-   r_   c                   ,    \ rS rSrSrS rS\4S jrSrg)EnglishSpellingNormalizeri  zr
Applies British-American spelling mappings as listed in [1].

[1] https://www.tysto.com/uk-us-spelling-list.html
c                     Xl         g r/   r   rG   english_spelling_mappings     r*   rH   "EnglishSpellingNormalizer.__init__  s    /r-   r   c                 V   ^  SR                  U 4S jUR                  5        5       5      $ )Nr$   c              3   Z   >#    U  H   nTR                   R                  X5      v   M"     g 7fr/   )r   r   )r1   r+  rG   s     r*   r3   5EnglishSpellingNormalizer.__call__.<locals>.<genexpr>  s#     K((44s   (+)r7   r  rT   s   ` r*   rU   "EnglishSpellingNormalizer.__call__  s    xxKKKKr-   r3  N)	rW   rX   rY   rZ   r.  rH   r\   rU   r]   r0   r-   r*   r1  r1    s    0L# Lr-   r1  c                   (    \ rS rSrS rS\4S jrSrg)EnglishTextNormalizeri   c                 R   SU l         0 SS_SS_SS_SS	_S
S_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS _S!S"_0 S#S$_S%S&_S'S(_S)S*_S+S,_S-S._S/S0_S1S2_S3S4_S5S6_S7S8_S9S:_S;S<_S=S>_S?S@_SASB_SCSD_ESESFSGSHSISJSKSLSMSNSOSPSLSQSRSS.EU l        [        5       U l        [	        U5      U l        g )TNz\b(hmm|mm|mhm|mmm|uh|um)\bz	\bwon't\bzwill notz	\bcan't\bzcan notz	\blet's\bzlet usz	\bain't\baintz	\by'all\bzyou allz	\bwanna\bzwant toz	\bgotta\bzgot toz	\bgonna\bzgoing toz\bi'ma\bzi am going toz\bimma\bz
\bwoulda\bz
would havez
\bcoulda\bz
could havez\bshoulda\bzshould havez	\bma'am\bmadamz\bmr\bzmister z\bmrs\bzmissus z\bst\bzsaint z\bdr\bzdoctor z\bprof\bz
professor z\bcapt\bzcaptain z\bgov\bz	governor z\bald\bz	alderman z\bgen\bzgeneral z\bsen\bzsenator z\brep\bzrepresentative z\bpres\bz
president z\brev\bz	reverend z\bhon\bz
honorable z\basst\bz
assistant z	\bassoc\bz
associate z\blt\bzlieutenant z\bcol\bzcolonel z\bjr\bzjunior z\bsr\bzsenior zesquire z	 had beenz	 has beenz	 had gonez	 has gonez	 had donez has gotz notz arez isz wouldz willz havez am)z\besq\bz	'd been\bz	's been\bz	'd gone\bz	's gone\bz	'd done\bz's got\bzn't\bz're\bz's\bz'd\bz'll\bz't\bz've\bz'm\b)ignore_patterns	replacersr_   standardize_numbersr1  standardize_spellingsr4  s     r*   rH   EnglishTextNormalizer.__init__  s   <6
*6
 )6
 (	6

 &6
 )6
 )6
 (6
 *6
 6
 6
 <6
 <6
 M6
 '6
" y#6
$ 	%6
& x'6
( y)6
* +6
, -6
. /6
0 16
2 
36
4 
56
6 )76
8 96
: ;6
< =6
> ?6
@ ,A6
B }C6
D 
E6
F yG6
H yI6
J #%%%%%#k6
n $;#< %>?W%X"r-   r   c                    UR                  5       n[        R                  " SSU5      n[        R                  " SSU5      n[        R                  " U R                  SU5      n[        R                  " SSU5      nU R                  R                  5        H  u  p#[        R                  " X#U5      nM     [        R                  " SSU5      n[        R                  " SS	U5      n[        US
S9nU R                  U5      nU R                  U5      n[        R                  " SS	U5      n[        R                  " SSU5      n[        R                  " SSU5      nU$ )NrK   r"   rL   z\s+''z	(\d),(\d)r  z\.([^0-9]|$)z \1u
   .%$¢€£)r)   u   [.$¢€£]([^0-9])z	([^0-9])%z\1 rM   r$   )	rN   rO   rP   r@  rA  r   r9   rB  rC  )rG   r   patternreplacements       r*   rU   EnglishTextNormalizer.__call__=  s   GGIFF'Q/FF?B*FF4''Q/FF7C#$(NN$8$8$: GwQ/A %; FF<!,FF?FA.)!,?$$Q'&&q) FF)615FF<+FF63"r-   )r@  rA  rB  rC  N)rW   rX   rY   rZ   rH   r\   rU   r]   r0   r-   r*   r<  r<     s    :Yx# r-   r<  )r"   )rO   r&   collections.abcr   	fractionsr   r   typingr   r   rQ   r%   r\   r9   r?   rA   r_   r1  r<  r0   r-   r*   <module>rM     s+    
  $   " $$ 	# 	#	
 	$ 	$ 	$ 
4 	# 	# 	# 	# 	$ 	$ 	#  	#! (SS S.pc p &O OdL LU Ur-   