
    љic                         S SK r S SKrS SKJrJr  S SKrS SKJr  / SQr " S S5      r	 " S S5      r
S5S
\S\S\ R                  4S jjrS5S
\S\ R                  S\4S jjr\ R                   " S5      4S\4S jjrS r\ R&                  " 5       SS	S\" S5      SSSSSSSSSS4S\ R                  S\S\S \S!\S"\S#\S$\S%\S&\S'\\/S4   S(\S)\S*\S+\4S, jj5       r " S- S.5      r  S6S/\\   S0\ R                  S1\S\S\ R                  4
S2 jjr  S6S/\\   S0\ R                  S1\S\S\ R                  4
S3 jjrS/\\   S\S\\   4S4 jrg)7    N)CallableList)version)ruendeesc                   R    \ rS rSrS
S jrS\4S jrSS jrS\4S jrS\4S jr	Sr
g	)OnnxWrapper
   c                 Z   SS K qSS KnUR                  5       nSUl        SUl        U(       a,  SUR                  5       ;   a  UR                  US/US9U l        OUR                  XS9U l        U R                  5         SU;   a  [        R                  " S5        S/U l        g S	S/U l        g )
Nr      CPUExecutionProvider)	providerssess_options)r   16kz,This model support only 16000 sampling rate!>  @  )numpynponnxruntimeSessionOptionsinter_op_num_threadsintra_op_num_threadsget_available_providersInferenceSessionsessionreset_stateswarningswarnsample_rates)selfpathforce_onnx_cpur   optss        N/home/james-whalen/.local/lib/python3.13/site-packages/silero_vad/utils_vad.py__init__OnnxWrapper.__init__   s    ))+$%!$%!48[8[8]]&77I_H`os7tDL&777PDLD=MMHI!&D!%uD    src                    UR                  5       S:X  a  UR                  S5      nUR                  5       S:  a  [        SUR                  5        35      eUS:w  a  US-  S:X  a  US-  nUS S 2S S U24   nSnX R                  ;  a  [        SU R                   S35      eX!R                  S   -  S:  a  [        S	5      eX4$ )
Nr   r      z*Too many dimensions for input audio chunk r   zSupported sampling rates: z (or multiply of 16000)g     @?@zInput audio chunk is too short)dim	unsqueeze
ValueErrorr!   shape)r"   xr*   steps       r&   _validate_inputOnnxWrapper._validate_input!   s    557a<AA557Q;I!%%'STT;BJ!O;D!FdF(AB&&&9$:K:K9LLcdee
?U"=>>ur)   c                     [         R                  " SUS45      R                  5       U l        [         R                  " S5      U l        SU l        SU l        g )Nr,      r   )torchzerosfloat_state_context_last_sr_last_batch_size)r"   
batch_sizes     r&   r   OnnxWrapper.reset_states3   s@    kk1j#"67==?A !r)   c                    U R                  X5      u  pUS:X  a  SOSnUR                  S   U:w  a  [        SUR                  S    S35      eUR                  S   nUS:X  a  SOS	nU R                  (       d  U R	                  U5        U R
                  (       a!  U R
                  U:w  a  U R	                  U5        U R                  (       a!  U R                  U:w  a  U R	                  U5        [        U R                  5      (       d  [        R                  " XE5      U l        [        R                  " U R                  U/S
S9nUS;   az  UR                  5       U R                  R                  5       [        R                  USS9S.nU R                  R!                  S U5      nUu  p[        R"                  " U	5      U l        O
[        5       eUSU* S 24   U l        X l        X@l        [        R"                  " U5      nU$ )Nr         zProvided number of samples is z< (Supported values: 256 for 8000 sample rate, 512 for 16000)r   @       r   r-   r   r   int64)dtype)inputstater*   .)r3   r0   r/   r=   r   r<   lenr;   r7   r8   catr   r:   r   arrayr   run
from_numpy)
r"   r1   r*   num_samplesr>   context_size
ort_inputsort_outsoutrK   s
             r&   __call__OnnxWrapper.__call__9   s   $$Q+5[cc772;+%=aggbk]  KG  H  I  IWWQZ
5[rb$$j)MM 3j)!!(=(=(Kj)4==!!!KK
ADMIIt}}a(a0#$779t{{7H7H7JRTRZRZ[]elRZRmnJ||''j9H!JC**51DK,#}~-. *s#
r)   c                    / nU R                  X5      u  pU R                  5         US:X  a  SOSnUR                  S   U-  (       a@  XAR                  S   U-  -
  n[        R                  R
                  R                  USU4SSS9n[        SUR                  S   U5       H2  nUS S 2XfU-   24   nU R                  Xr5      nUR                  U5        M4     [        R                  " USS	9n	U	R                  5       $ )
Nr   rA   rB   r   r   constantg        )valuerF   )r3   r   r0   r7   nn
functionalpadrangerV   appendrM   cpu)
r"   r1   r*   outsrQ   pad_numi
wavs_batch	out_chunkstackeds
             r&   audio_forwardOnnxWrapper.audio_forward^   s    $$Q+5[cc771:#!WWQZ+%=>G##''Aw<3'OAq!''!*k2A1a+o-.Jj5IKK	" 3
 ))Da({{}r)   )r;   r=   r<   r:   r!   r   N)F)r   )__name__
__module____qualname____firstlineno__r'   intr3   r   rV   rg   __static_attributes__ r)   r&   r   r   
   s1    .*S $"#c #J3 r)   r   c                   <    \ rS rSrS rS\R                  4S jrSrg)	Validatorq   c                 Z   UR                  S5      (       a  SOSU l        [        R                  R	                  US5        U R                  (       aM  SS KnU(       a+  SUR                  5       ;   a  UR                  SS/S9U l        g UR                  S5      U l        g [        SS9U l        g )	Nz.onnxTFz	inf.modelr   r   )r   )
model_path)
endswithonnxr7   hubdownload_url_to_filer   r   r   modelinit_jit_model)r"   urlr$   r   s       r&   r'   Validator.__init__r   s    LL11Du			&&sK899"8K<_<_<a"a(99+RhQi9j
(99+F
';?DJr)   inputsc                 z   [         R                  " 5          U R                  (       ac  SUR                  5       R	                  5       0nU R
                  R                  S U5      nU Vs/ s H  n[         R                  " U5      PM     nnOU R                  U5      nS S S 5        U$ s  snf ! , (       d  f       W$ = f)NrJ   )r7   no_gradrv   r`   r   ry   rO   Tensor)r"   r}   rS   ra   r1   s        r&   rV   Validator.__call__~   s    ]]_yy%vzz|'9'9';<
zz~~dJ7156AQ6zz&)  	 7	 _ s   AB+( B&B+&B++
B:)ry   rv   N)	ri   rj   rk   rl   r'   r7   r   rV   rn   ro   r)   r&   rq   rq   q   s    
@	u|| 	r)   rq   r   r#   sampling_ratereturnc                    [         R                  " [        R                  5      nU[         R                  " S5      :  a2   SS/S[	        U5      //n[        R
                  R                  XS9u  pEO [        R                  " U 5      u  pEUR                  S:  a%  UR!                  S5      S:  a  UR#                  SSS9nXQ:w  a%  [        R$                  R'                  XQ5      " U5      nUR)                  S5      $ !   [        R                  " U 5      u  pE N= f!    SSKJ	n  U" U 5      R                  5       nUR                  nUR                  n N! [         a!    [        S[        R                   S	3S
-   5      ef = f= f)N2.9channels1rate)effectsr   )AudioDecodertorchaudio version z$ requires torchcodec for audio I/O. *Install torchcodec or pin torchaudio < 2.9r   T)r-   keepdim)r   parse
torchaudio__version__strsox_effectsapply_effects_fileloadtorchcodec.decodersr   get_all_samplesdatasample_rateImportErrorRuntimeErrorndimsizemean
transformsResamplesqueeze)r#   r   ta_verr   wavr*   r   sampless           r&   
read_audior      sY   ]]:112Fe$$	,"C(&#m2D)EFG ,,???VGC	 ood+GC xx!|ahh1dh+	##,,R?D;;q>/	, ood+GC
		<&t,<<>ll(( ")**@*@)AAefBC s*    0C: 2D :DF 4E+E==F tensorc                    UR                  5       R                  5       nUR                  S:X  a  UR                  S5      n[        R
                  " [        R                  5      n [        R                  " XUSS9  g ! [         ak    U[        R
                  " S5      :  aO   SSK
Jn  U" USS9nUR                  U 5         g ! [         a!    [        S	[        R                   S
3S-   5      ef = fe f = f)Nr   r      )bits_per_sampler   )AudioEncoderr   )r   r   z! requires torchcodec for saving. r   )detachr`   r   r.   r   r   r   r   save	Exceptiontorchcodec.encodersr   to_filer   r   )r#   r   r   r   r   encoders         r&   
save_audior      s    ]]_  "F{{a!!!$]]:112FmRH W]]5))<&v5A% ")**@*@)AAbcBC  s   %A< <$C1!C+C,,C1r`   rt   c                 `    [         R                  R                  XS9nUR                  5         U$ )N)map_location)r7   jitr   eval)rt   devicery   s      r&   rz   rz      s$    IINN:N;E	JJLLr)   c           
          SS K nUR                  SU 0[        [        U 5      5       Vs/ s H  o3U-  PM	     snS9R	                  SSSS/S[        U 5      U-  /SSS	S
9  g s  snf )Nr   probs)index)r      areag?secondszspeech probabilitytab20)figsizekindylimxlimxlabelylabelcolormap)pandas	DataFramer^   rL   plot)r   r2   pdr1   s       r&   make_visualizationr      ss    LL'5!*/E
*;<*;QD*;<  >>Bd7At9As5zD7H3I!,!	 ?C ?#<s   A 
      ?   infd      Fr   rA   b   Taudio	thresholdmin_speech_duration_msmax_speech_duration_smin_silence_duration_msspeech_pad_msreturn_secondstime_resolutionvisualize_probsprogress_tracking_callbackneg_thresholdwindow_size_samplesmin_silence_at_max_speechuse_max_poss_sil_at_max_speechc                    [         R                  " U 5      (       d   [         R                  " U 5      n [	        U R
                  5      S:  aZ  [        [	        U R
                  5      5       H  nU R                  S5      n M     [	        U R
                  5      S:  a  [        S5      eUS:  a/  US-  S:X  a&  US-  nSnU SSU2   n [        R                  " S5        OSnUS;  a  [        S	5      eUS:X  a  S
OSnUR                  5         X4-  S-  nX7-  S-  nX5-  U-
  SU-  -
  nX6-  S-  nX>-  S-  n[	        U 5      n/ n[        SUU5       H  nU UUU-    n[	        U5      U:  aA  [         R                  R                  R                  US[        U[	        U5      -
  5      45      nU" UU5      R!                  5       nUR#                  U5        UU-   nUU:  a  UnUU-  S-  nU(       d  M  U" U5        M     Sn/ n0 n Uc  [%        US-
  S5      nSn!S=n"n#/ n$['        U5       GH  u  nnUU-  n%UU:  a/  U!(       a(  U%U!-
  n&U&U:  a  U$R#                  U!U&45        Sn!U#U":  a  U%n#UU:  a  U(       d	  SnU%U S'   MW  U(       a  U%U S   -
  U:  a  U(       aL  U$(       aE  [%        U$S S9u  n"n'U"U S'   UR#                  U 5        0 n U"U'-   n#U#U"U%-   :  a  U#U S'   OSnS=n"=n#n!/ n$O[U"(       a/  U"U S'   UR#                  U 5        0 n U#U":  a  SnOU#U S'   S=n"=n#n!/ n$O%U%U S'   UR#                  U 5        0 n S=n"=n#n!Sn/ n$GM  UU:  d  GM!  U(       d  GM+  U!(       d  U%n!U%U!-
  n(U(       d  U(U:  a  U!n"U(U:  a  GMQ  U!U S'   U S   U S   -
  U:  a  UR#                  U 5        0 n S=n"=n#n!Sn/ n$GM     U (       a"  UU S   -
  U:  a  UU S'   UR#                  U 5        ['        U5       GH  u  nn)US:X  a  [        [%        SU)S   U-
  5      5      U)S'   U[	        U5      S-
  :w  a  UUS-      S   U)S   -
  n*U*SU-  :  aH  U)S==   [        U*S-  5      -  ss'   [        [%        SUUS-      S   U*S-  -
  5      5      UUS-      S'   M  [        [)        UU)S   U-   5      5      U)S'   [        [%        SUUS-      S   U-
  5      5      UUS-      S'   M  [        [)        UU)S   U-   5      5      U)S'   GM
     U(       aM  UU-  n+U HA  n,[%        [+        U,S   U-  U	5      S5      U,S'   [)        [+        U,S   U-  U	5      U+5      U,S'   MC     O)US:  a#  U H  n,U,S==   U-  ss'   U,S==   U-  ss'   M     U
(       a  [-        UX-  5        U$ !   [        S5      e= f)a
  
This method is used for splitting long audios into speech chunks using silero VAD

Parameters
----------
audio: torch.Tensor, one dimensional
    One dimensional float torch.Tensor, other types are casted to torch if possible

model: preloaded .jit/.onnx silero VAD model

threshold: float (default - 0.5)
    Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value are considered as SPEECH.
    It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.

sampling_rate: int (default - 16000)
    Currently silero VAD models support 8000 and 16000 (or multiply of 16000) sample rates

min_speech_duration_ms: int (default - 250 milliseconds)
    Final speech chunks shorter min_speech_duration_ms are thrown out

max_speech_duration_s: int (default -  inf)
    Maximum duration of speech chunks in seconds
    Chunks longer than max_speech_duration_s will be split at the timestamp of the last silence that lasts more than 100ms (if any), to prevent aggressive cutting.
    Otherwise, they will be split aggressively just before max_speech_duration_s.

min_silence_duration_ms: int (default - 100 milliseconds)
    In the end of each speech chunk wait for min_silence_duration_ms before separating it

speech_pad_ms: int (default - 30 milliseconds)
    Final speech chunks are padded by speech_pad_ms each side

return_seconds: bool (default - False)
    whether return timestamps in seconds (default - samples)

time_resolution: bool (default - 1)
    time resolution of speech coordinates when requested as seconds

visualize_probs: bool (default - False)
    whether draw prob hist or not

progress_tracking_callback: Callable[[float], None] (default - None)
    callback function taking progress in percents as an argument

neg_threshold: float (default = threshold - 0.15)
    Negative threshold (noise or exit threshold). If model's current state is SPEECH, values BELOW this value are considered as NON-SPEECH.

min_silence_at_max_speech: int (default - 98ms)
    Minimum silence duration in ms which is used to avoid abrupt cuts when max_speech_duration_s is reached

use_max_poss_sil_at_max_speech: bool (default - True)
    Whether to use the maximum possible silence at max_speech_duration_s or not. If not, the last silence is used.

window_size_samples: int (default - 512 samples)
    !!! DEPRECATED, DOES NOTHING !!!

Returns
----------
speeches: list of dicts
    list containing ends and beginnings of speech chunks (samples or seconds based on return_seconds)
2Audio cannot be casted to tensor. Cast it manuallyr   r   zRMore than one dimension in audio. Are you trying to process audio with 2 channels?r   Nz@Sampling rate is a multiply of 16000, casting to 16000 manually!rG   zVCurrently silero VAD models support 8000 and 16000 (or multiply of 16000) sample ratesrA   rB     r,   r   F333333?g{Gz?Tstartc                     U S   $ )Nr   ro   )r1   s    r&   <lambda>'get_speech_timestamps.<locals>.<lambda>q  s    1r)   )keyend)r7   	is_tensorr   	TypeErrorrL   r0   r^   r   r/   r   r    r   r[   r\   r]   rm   itemr_   max	enumerateminroundr   )-r   ry   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rc   r2   min_speech_samplesspeech_pad_samplesmax_speech_samplesmin_silence_samples!min_silence_samples_at_max_speechaudio_length_samplesspeech_probscurrent_start_samplechunkspeech_probprogressprogress_percent	triggeredspeechescurrent_speechtemp_endprev_end
next_startpossible_ends
cur_samplesil_durdursil_dur_nowspeechsilence_durationaudio_length_secondsspeech_dicts-                                                r&   get_speech_timestampsr	     s   \ ??5!!	RLL'E 5;;!s5;;'(AMM!$E )u{{aqrru-%"71"<%ffXYM)qrr!.%!7#S	&?$F&6=&>ATTWX[mWmm'ADH(5(QTX(X%u:L %a)=?R S*,@CV,VWu:++HH''++EAs;NQTUZQ[;[7\3]^EE=1668K('*==**+H$';;sB%%&'78 !T IHNI,d3HHzM#L1;(1,
 9$( 8+G::$$h%89HH$'
 9$iI&0N7# *~g'>>ASS-- #M~ F#(0u%/!#%^
: 55.8N7+ %I3444: " ,4N5)OON3%'N!H,$)	2<w/788H8zH$&M -7N5)OON3%'N788H8zH %I$&M -'YY%$x/K1kDe6e#00(0u%"5)N7,CCGYYOON3!#3444:!	 "U 2X /.2IIM__ 4u'x(	66!#a;M)M"NOF7OH!!'!}W5uE!&8"88u%5%:!;;),SHQqSM'4JM]abMb4b-c)d1g& #C(<fUmN`>`$a bu),SHQqSM'4JM_4_-`)a1g&$8&-J\:\ ]^F5M ) 3mC#K#&u[-AM-QSb'cef#gK !$U;u+=+M%_au!vK $ 
#K D( $& $ <)<)LMOG	RPQQs   U> >Vc            	       z    \ rS rSr    SS\S\S\S\4S jjrS r\R                  " 5       SS\4S	 jj5       r
S
rg)VADIteratori  r   r   r   r   c                     Xl         X l        X0l        US;  a  [        S5      eX4-  S-  U l        X5-  S-  U l        U R                  5         g)a  
Class for stream imitation

Parameters
----------
model: preloaded .jit/.onnx silero VAD model

threshold: float (default - 0.5)
    Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value are considered as SPEECH.
    It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.

sampling_rate: int (default - 16000)
    Currently silero VAD models support 8000 and 16000 sample rates

min_silence_duration_ms: int (default - 100 milliseconds)
    In the end of each speech chunk wait for min_silence_duration_ms before separating it

speech_pad_ms: int (default - 30 milliseconds)
    Final speech chunks are padded by speech_pad_ms each side
rG   zDVADIterator does not support sampling rates other than [8000, 16000]r   N)ry   r   r   r/   r   r   r   )r"   ry   r   r   r   r   s         r&   r'   VADIterator.__init__  sT    : 
"*-cdd#0#JT#Q "/"?$"Fr)   c                 b    U R                   R                  5         SU l        SU l        SU l        g )NFr   )ry   r   r   r   current_sample)r"   s    r&   r   VADIterator.reset_states  s(    

!r)   r   c                 (   [         R                  " U5      (       d   [         R                  " U5      nUR	                  5       S:X  a  [        US   5      O
[        U5      nU =R                  U-  sl        U R                  XR                  5      R                  5       nXPR                  :  a  U R                  (       a  SU l        XPR                  :  ak  U R                  (       dZ  SU l        [        SU R                  U R                  -
  U-
  5      nSU(       d  [        U5      0$ [!        X`R                  -  U5      0$ XPR                  S-
  :  a  U R                  (       a  U R                  (       d  U R                  U l        U R                  U R                  -
  U R"                  :  a  gU R                  U R                  -   U-
  nSU l        SU l        S	U(       d  [        U5      0$ [!        XpR                  -  U5      0$ g!   [        S5      e= f)
a  
x: torch.Tensor
    audio chunk (see examples in repo)

return_seconds: bool (default - False)
    whether return timestamps in seconds (default - samples)

time_resolution: int (default - 1)
    time resolution of speech coordinates when requested as seconds
r   r,   r   Tr   r   NFr   )r7   r   r   r   r-   rL   r  ry   r   r   r   r   r   r   r   rm   r   r   )r"   r1   r   r   r   r   speech_start
speech_ends           r&   rV   VADIterator.__call__  s    q!!VLLO ,-557a<c!A$iSV22jj$6$67<<>>>)t}}DM>>)4>>!DNq$"5"58O8O"ORe"efLnS.  E  E%P\_q_qPq  tC  KD  E  E..4//T^^== $ 3 3""T]]2T5M5MM!]]T-D-DDGZZ
 !!&ns:  C  C%PZ]o]oPo  rA  KB  C  C7V TUUs   H H)r  r   ry   r   r   r   r   r   N)r   r   r   r   )Fr   )ri   rj   rk   rl   r9   rm   r'   r   r7   r   rV   rn   ro   r)   r&   r  r    sb     %(&+03&(&!& !$& +.	&
 !$&P  ]]_* * *r)   r  tssr   r   c                     U(       a  U(       d  [        S5      e[        5       nU(       a  [        X5      OU nU H  nUR                  XS   US    5        M     [        R
                  " U5      $ )a  Collect audio chunks from a longer audio clip

This method extracts audio chunks from an audio clip, using a list of
provided coordinates, and concatenates them together. Coordinates can be
passed either as sample numbers or in seconds, in which case the audio
sampling rate is also needed.

Parameters
----------
tss: List[dict]
    Coordinate list of the clips to collect from the audio.
wav: torch.Tensor, one dimensional
    One dimensional float torch.Tensor, containing the audio to clip.
seconds: bool (default - False)
    Whether input coordinates are passed as seconds or samples.
sampling_rate: int (default - None)
    Input audio sampling rate. Required if seconds is True.

Returns
-------
torch.Tensor, one dimensional
    One dimensional float torch.Tensor of the concatenated clipped audio
    chunks.

Raises
------
ValueError
    Raised if sampling_rate is not provided when seconds is True.

3sampling_rate must be provided when seconds is Truer   r   r/   list_seconds_to_samples_tssr_   r7   rM   )r  r   r   r   chunks_tssrc   s          r&   collect_chunksr  (  s`    D }NOOVF:A"36sDcG*QuX./  99Vr)   c                    U(       a  U(       d  [        S5      e[        5       nSnU(       a  [        X5      OU nU H  nUR                  XUS    5        US   nM      UR                  XS 5        [        R
                  " U5      $ )a  Drop audio chunks from a longer audio clip

This method extracts audio chunks from an audio clip, using a list of
provided coordinates, and drops them. Coordinates can be passed either as
sample numbers or in seconds, in which case the audio sampling rate is also
needed.

Parameters
----------
tss: List[dict]
    Coordinate list of the clips to drop from from the audio.
wav: torch.Tensor, one dimensional
    One dimensional float torch.Tensor, containing the audio to clip.
seconds: bool (default - False)
    Whether input coordinates are passed as seconds or samples.
sampling_rate: int (default - None)
    Input audio sampling rate. Required if seconds is True.

Returns
-------
torch.Tensor, one dimensional
    One dimensional float torch.Tensor of the input audio minus the dropped
    chunks.

Raises
------
ValueError
    Raised if sampling_rate is not provided when seconds is True.

r  r   r   r   Nr  )r  r   r   r   r  	cur_startr  rc   s           r&   drop_chunksr   V  sz    D }NOOVFI:A"36sDsaj13eH	  MM#j/"99Vr)   c                 t    U  Vs/ s H&  n[        US   5      U-  [        US   5      U-  S.PM(     sn$ s  snf )zDConvert coordinates expressed in seconds to sample coordinates.
    r   r   )r   r   )r   )r  r   crds      r&   r  r    sQ      # s7|$}4SZ =0   s   -5)r   )FN)r7   r   typingr   r   r   	packagingr   	languagesr   rq   r   rm   r   r   r   r   rz   r   r   r9   boolr	  r  dictr  r   r  ro   r)   r&   <module>r(     sA     !  $	d dN 2S   @S %,, s 2  ,,u-s #  .1/48;9>u9</1161227PT1558;=AEs s%*s *-s 36	s
 27s 47s *-s +/s ,/s ,0s 7?w}6Ms */s 03s 69s ;?s sl[ [@ $)(,+T
 ++ + #&+ 27+` !&%)0T$Z 0\\00  #0 /4ll0fd C DJ r)   