
    bCi                     H   S SK r S SKrS SKJrJrJr  S SKrS SKrS SKJ	r	  S SK
Jr  SSKJr  SSKJr  SSKJr  SS	KJrJr  SS
KJr  SSKJr  SSKJrJrJrJrJrJr  SSK J!r!J"r"  SSK#J$r$  SSK%J&r&J'r'J(r(  SSK)J*r*  \(" 5       (       a  SSK+J,r,   " S S\5      r- " S S\	R\                  5      r/ " S S\	R\                  5      r0 " S S\	R\                  5      r1 " S S\	R\                  5      r2 " S S\	R\                  5      r3   SPS\	R\                  S \Rh                  S!\Rh                  S"\Rh                  S#\\Rh                     S$\\5   S%\5S&\\Rh                     4S' jjr6 " S( S)\	R\                  5      r7 " S* S+\	R\                  5      r8 " S, S-\5      r9 " S. S/\	R\                  5      r: " S0 S1\	R\                  5      r; " S2 S3\	R\                  5      r<\& " S4 S5\"5      5       r=  SQS6\>\?\?4   S7\5S8\?S#\\R                     S9\?S:\R                  4S; jjrB\rC\& " S< S=\=5      5       rDS>rE\&" S?S@9 " SA SB\=5      5       rF\&" SCS@9 " SD SE\=5      5       rG\& " SF SG\=5      5       rH " SH SI\	R\                  5      rI " SJ SK\	R\                  5      rJ\&" SLS@9 " SM SN\=5      5       rK/ SOQrLg)R    N)CallableOptionalUnion)nn)CrossEntropyLoss   )ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputCausalLMOutputSequenceClassifierOutputTokenClassifierOutputWav2Vec2BaseModelOutputXVectorOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringis_peft_availableis_torch_flex_attn_available   )Data2VecAudioConfig)make_flex_block_causal_maskc                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )Data2VecAudioConvLayer7   c                   > [         TU ]  5         US:  a  UR                  US-
     OSU l        UR                  U   U l        [
        R                  " U R                  U R                  UR                  U   UR                  U   UR                  S9U l
        [
        R                  " U R                  SS9U l        [        UR                     U l        g )Nr   r   )kernel_sizestridebiasTelementwise_affine)super__init__conv_dimin_conv_dimout_conv_dimr   Conv1dconv_kernelconv_stride	conv_biasconv	LayerNorm
layer_normr	   feat_extract_activation
activationselfconfiglayer_id	__class__s      n/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/data2vec/modeling_data2vec_audio.pyr)   Data2VecAudioConvLayer.__init__8   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 ,,t'8'8TR !?!?@    c                     U R                  U5      nUR                  SS5      nU R                  U5      nUR                  SS5      nU R                  U5      nU$ )N)r1   	transposer3   r5   r7   hidden_statess     r;   forwardData2VecAudioConvLayer.forwardG   sV    		-0%//B76%//B76r=   )r5   r1   r+   r3   r,   r   __name__
__module____qualname____firstlineno__r)   rD   __static_attributes____classcell__r:   s   @r;   r    r    7   s    A r=   r    c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Data2VecAudioPadLayerR   c                 R   > [         TU ]  5         US-  S:X  a  SU l        g SU l        g )N   r   r   )r(   r)   num_pad_remove)r7   num_conv_pos_embeddingsr:   s     r;   r)   Data2VecAudioPadLayer.__init__S   s)    #:Q#>!#Car=   c                 X    U R                   S:  a  US S 2S S 2S U R                   * 24   nU$ Nr   rT   rB   s     r;   rD   Data2VecAudioPadLayer.forwardW   s6    ")!Q0F43F3F2F0F*FGMr=   rY   rG   rN   s   @r;   rP   rP   R   s    K r=   rP   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ ) Data2VecAudioPositionalConvLayer]   c                 r  > [         TU ]  5         [        R                  " UR                  UR                  UR
                  UR
                  S-  UR                  S9U l        [        UR
                  5      U l	        [        UR                     U l        [        R                  " UR                  SS9U l        g )NrS   )r#   paddinggroupsFr&   )r(   r)   r   r-   hidden_sizeconv_pos_kernel_sizenum_conv_pos_embedding_groupsr1   rP   r_   r	   r4   r5   r2   r3   r7   r8   r:   s     r;   r)   )Data2VecAudioPositionalConvLayer.__init__^   s    II33//1477
	 -V-H-HI !?!?@,,v'9'9eTr=   c                     U R                  U5      nU R                  U5      nUR                  SS5      nU R                  U5      nUR                  SS5      nU R	                  U5      nU$ Nr   rS   )r1   r_   rA   r3   r5   rB   s     r;   rD   (Data2VecAudioPositionalConvLayer.forwardm   sd    		-0]3%//156%//156r=   )r5   r1   r3   r_   rG   rN   s   @r;   r\   r\   ]   s    U r=   r\   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )$Data2VecAudioPositionalConvEmbeddingx   c                    > [         TU ]  5         [        R                  " [	        UR
                  5       Vs/ s H  n[        U5      PM     sn5      U l        g s  snf N)r(   r)   r   
ModuleListrangerU   r\   layersr7   r8   _r:   s      r;   r)   -Data2VecAudioPositionalConvEmbedding.__init__y   sF    mm?DVEcEc?de?d!-f5?de
es   Ac                     UR                  SS5      nU R                   H  nU" U5      nM     UR                  SS5      nU$ rg   )rA   rp   )r7   rC   layers      r;   rD   ,Data2VecAudioPositionalConvEmbedding.forward   sD    %//15[[E!-0M !%//15r=   )rp   rG   rN   s   @r;   rj   rj   x   s    
 r=   rj   c                   8   ^  \ rS rSrSrU 4S jrS rS rSrU =r	$ )Data2VecAudioFeatureEncoder   z.Construct the features from raw audio waveformc           
         > [         TU ]  5         [        R                  " [	        UR
                  5       Vs/ s H  n[        XS9PM     sn5      U l        SU l        SU l	        g s  snf )N)r9   FT)
r(   r)   r   rn   ro   num_feat_extract_layersr    conv_layersgradient_checkpointing_requires_grad)r7   r8   ir:   s      r;   r)   $Data2VecAudioFeatureEncoder.__init__   s\    ==AFvGeGeAfgAfA#F7Afg
 ',#" hs   A%c                 N    U R                  5        H
  nSUl        M     SU l        g NF)
parametersrequires_gradr~   r7   params     r;   _freeze_parameters.Data2VecAudioFeatureEncoder._freeze_parameters   s#    __&E"'E '#r=   c                     US S 2S 4   nU R                   (       a  U R                  (       a  SUl        U R                   H  nU" U5      nM     U$ )NT)r~   trainingr   r|   )r7   input_valuesrC   
conv_layers       r;   rD   #Data2VecAudioFeatureEncoder.forward   sK    $QW- 4==*.M'**J&}5M + r=   )r~   r|   r}   )
rH   rI   rJ   rK   __doc__r)   r   rD   rL   rM   rN   s   @r;   rx   rx      s    8#$

 
r=   rx   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Data2VecAudioFeatureProjection   c                 4  > [         TU ]  5         [        R                  " UR                  S   UR
                  S9U l        [        R                  " UR                  S   UR                  5      U l	        [        R                  " UR                  5      U l        g )Nr@   eps)r(   r)   r   r2   r*   layer_norm_epsr3   Linearra   
projectionDropoutfeat_proj_dropoutdropoutrd   s     r;   r)   'Data2VecAudioFeatureProjection.__init__   sf    ,,vr':@U@UV))FOOB$79K9KLzz&":":;r=   c                 n    U R                  U5      nU R                  U5      nU R                  U5      nX4$ rm   )r3   r   r   )r7   rC   norm_hidden_statess      r;   rD   &Data2VecAudioFeatureProjection.forward   s7    !__];(:;]300r=   )r   r3   r   rG   rN   s   @r;   r   r      s    <1 1r=   r   modulequerykeyvalueattention_maskscalingr   	head_maskc                    Uc  UR                  S5      S-  n[        R                  " XR                  SS5      5      U-  n	Ub  X-   n	[        R
                  R                  U	SS9n	Ub  XR                  SSSS5      -  n	[        R
                  R                  XU R                  S9n	[        R                  " X5      n
U
R                  SS5      R                  5       n
X4$ )Nr@         rS   r   dimr   )pr   )sizetorchmatmulrA   r   
functionalsoftmaxviewr   r   
contiguous)r   r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs              r;   eager_attention_forwardr      s     **R.D(<<}}Q':;gEL!#4==((2(>L#nnQAq&AA==((6??([L,,|3K''1-88:K$$r=   c                   Z  ^  \ rS rSrSr     SS\S\S\S\S\S\S	\\	   4U 4S
 jjjr
    SS\R                  S\\R                     S\\R                     S\\R                     S\\   S\\   S\\R                  \\R                     \\\R                        4   4S jjrSrU =r$ )Data2VecAudioAttention   z=Multi-headed attention from 'Attention Is All You Need' paper	embed_dim	num_headsr   
is_decoderr%   	is_causalr8   c                   > [         TU ]  5         Xl        X l        X0l        X-  U l        Xpl        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eU R
                  S-  U l        X@l	        X`l
        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r   )r%   )r(   r)   r   r   r   head_dimr8   
ValueErrorr   r   r   r   r   k_projv_projq_projout_proj)	r7   r   r   r   r   r%   r   r8   r:   s	           r;   r)   Data2VecAudioAttention.__init__   s     	""!.MMI%$..8MdnnM]$YKr3  }}d*$"ii	4@ii	4@ii	4@		)TBr=   rC   key_value_statesr   layer_head_maskoutput_attentionsr   returnc                     USLnUR                   SS u  pU(       a  UR                   S   OU	n
XSU R                  4nXSU R                  4nU R                  U5      R                  " U6 R	                  SS5      nU(       a  UOUnU R                  U5      R                  " U6 R	                  SS5      nU R                  U5      R                  " U6 R	                  SS5      n[        nU R                  R                  S:w  a  [        U R                  R                     nU" U UUUU4U R                  (       d  SOU R                  U R                  UUS.UD6u  nnUR                  XS5      R                  5       nU R!                  U5      nUUS4$ )z#Input shape: Batch x Time x ChannelNr@   r   rS   eager        )r   r   r   r   )shaper   r   r   rA   r   r   r   r8   _attn_implementationr   r   r   r   reshaper   r   )r7   rC   r   r   r   r   r   is_cross_attentionbsztgt_lensrc_lenq_input_shapekv_input_shapequery_statescurrent_states
key_statesvalue_statesattention_interfacer   r   s                       r;   rD   Data2VecAudioAttention.forward   s    .T9 %**3B//A"((+wr4==9DMM: {{=166FPPQRTUV-?)][[055~FPPQRTUV
{{>277HRRSTVWX(?;;++w6"9$++:Z:Z"[$7%
  $}}C$,,LL/%%
 %
!\ "))#;FFHmmK0L$..r=   )r8   r   r   r   r   r   r   r   r   r   r   r   )r   FTFN)NNNF)rH   rI   rJ   rK   r   intfloatboolr   r   r)   r   Tensorr   r   tuplerD   rL   rM   rN   s   @r;   r   r      s!   G  04CC C 	C
 C C C ,-C CD 481526,13/||3/ #5<<03/ !.	3/
 "%,,/3/ $D>3/ -.3/ 
u||Xell3XeELL>Q5RR	S3/ 3/r=   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Data2VecAudioFeedForwardi)  c                   > [         TU ]  5         [        R                  " UR                  5      U l        [        R                  " UR                  UR                  5      U l	        [        UR                  [        5      (       a  [        UR                     U l        OUR                  U l        [        R                  " UR                  UR                  5      U l        [        R                  " UR                   5      U l        g rm   )r(   r)   r   r   activation_dropoutintermediate_dropoutr   ra   intermediate_sizeintermediate_dense
isinstance
hidden_actstrr	   intermediate_act_fnoutput_densehidden_dropoutoutput_dropoutrd   s     r;   r)   !Data2VecAudioFeedForward.__init__*  s    $&JJv/H/H$I!"$))F,>,>@X@X"Yf''--'-f.?.?'@D$'-'8'8D$IIf&>&>@R@RS jj)>)>?r=   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU$ rm   )r   r   r   r   r   rB   s     r;   rD    Data2VecAudioFeedForward.forward7  sX    //>00?11-@))-8++M:r=   )r   r   r   r   r   rG   rN   s   @r;   r   r   )  s    @ r=   r   c                   2   ^  \ rS rSrU 4S jrSS jrSrU =r$ )Data2VecAudioEncoderLayeriA  c                   > [         TU ]  5         [        UR                  UR                  UR
                  SUS9U l        [        R                  " UR                  5      U l
        [        R                  " UR                  UR                  S9U l        [        U5      U l        [        R                  " UR                  UR                  S9U l        g )NF)r   r   r   r   r8   r   )r(   r)   r   ra   num_attention_headsattention_dropout	attentionr   r   r   r   r2   r   r3   r   feed_forwardfinal_layer_normrd   s     r;   r)   "Data2VecAudioEncoderLayer.__init__B  s    /((00,,
 zz&"7"78,,v'9'9v?T?TU4V< "V-?-?VEZEZ [r=   c                     UnU R                  XUS9u  pnU R                  U5      nXA-   nU R                  U5      nXR                  U5      -   nU R	                  U5      nU4nU(       a  Xu4-  nU$ )Nr   r   )r   r   r3   r   r   )r7   rC   r   r   attn_residualr   rr   outputss           r;   rD   !Data2VecAudioEncoderLayer.forwardQ  s    %)-L] *8 *
&Q ]3%56%(9(9-(HH--m< "&Gr=   )r   r   r   r   r3   r   rG   rN   s   @r;   r   r   A  s    \ r=   r   c                      ^  \ rS rSrU 4S jr    SS\R                  S\\R                     S\	S\	S\	4
S	 jjr
S\\R                  S4   S
\R                  4S jrSrU =r$ )Data2VecAudioEncoderie  c                   > [         TU ]  5         Xl        [        U5      U l        [
        R                  " UR                  UR                  S9U l	        [
        R                  " UR                  5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[!        U5      PM     sn5      U l        SU l        g s  snf )Nr   F)r(   r)   r8   rj   pos_conv_embedr   r2   ra   r   r3   r   r   r   rn   ro   num_hidden_layersr   rp   r}   rq   s      r;   r)   Data2VecAudioEncoder.__init__f  s    B6J,,v'9'9v?T?TUzz&"7"78mmPUV\VnVnPo$pPo1%>v%FPo$pq&+# %qs    C	NrC   r   r   output_hidden_statesreturn_dictc                    U(       a  SOS nU(       a  SOS nUb4  UR                  S5      R                  SSUR                  S   5      nSX) '   U R                  UU5      nU R	                  U5      n	X-   nU R                  U5      nU R                  U5      n[        5       =(       d    [        U 5      n
U R                   H  nU(       a  Xa4-   n[        R                  " / 5      nU R                  =(       a    XR                  R                  :  nU(       a  U
(       a  U" XUS9nUS   nU(       a  SnU(       d  M|  UWS   4-   nM     U(       a  Xa4-   nU(       d  [        S XU4 5       5      $ [!        UUUS	9$ )
N r@   r   rS   r   r   NNc              3   .   #    U  H  oc  M  Uv   M     g 7frm   r  ).0vs     r;   	<genexpr>/Data2VecAudioEncoder.forward.<locals>.<genexpr>  s     m$[q$[s   	)last_hidden_staterC   
attentions)	unsqueezerepeatr   _update_full_maskr  r3   r   r
   r   rp   r   randr   r8   	layerdropr   r   )r7   rC   r   r   r  r  all_hidden_statesall_self_attentionsexpand_attention_maskposition_embeddingssynced_gpusru   dropout_probabilityskip_the_layerlayer_outputss                  r;   rD   Data2VecAudioEncoder.forwardo  s    #7BD$5b4%$2$<$<R$@$G$G1mNaNabcNd$e!45M01//

 #11-@%;6]302R6LT6R[[E#$58H$H! #(**R.!]]Z/B[[EZEZ/ZN![ %!Te! !.a 0 ,  &9]1=M<O&O#' !*   14D Dm]GZ$[mmm++*
 	
r=   inputs_embedsc                 r   Ub  U R                   R                  S:X  a  SU;   a  UnU$ S nU$ U R                   R                  S:X  a  [        XR                  5      nU$ U R                   R                  S:X  a+  [	        U[
        R                  5      (       a
  [        USS9nU$ [        XR                  5      nU$ )Nflash_attention_2r   sdpaflex_attentionF)r   )	r8   r   r   dtyper   r   r   r   r   )r7   r   r  s      r;   r  &Data2VecAudioEncoder._update_full_mask  s    
 %{{//3FF343F  MQ  11V; "E^UhUh!i  115EEnell;;%@[`%aN
  "<NL_L_!`r=   )r8   r   r}   r3   rp   r  )NFFT)rH   rI   rJ   rK   r)   r   tensorr   r   r   rD   r   r  rL   rM   rN   s   @r;   r   r   e  s    , 26"'%* :
||:
 !.:
  	:

 #:
 :
xellD01 || r=   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Data2VecAudioAdapterLayeri  c                    > [         TU ]  5         [        R                  " UR                  SUR                  -  UR
                  UR                  SS9U l        g )NrS   r   )r$   r_   )r(   r)   r   r-   output_hidden_sizeadapter_kernel_sizeadapter_strider1   rd   s     r;   r)   "Data2VecAudioAdapterLayer.__init__  sJ    II%%)))&&((
	r=   c                 d    U R                  U5      n[        R                  R                  USS9nU$ )Nr   r   )r1   r   r   glurB   s     r;   rD   !Data2VecAudioAdapterLayer.forward  s/    		-0))-Q)?r=   )r1   rG   rN   s   @r;   r(  r(    s    
 r=   r(  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Data2VecAudioAdapteri  c                   >^ [         TU ]  5         TR                  TR                  :w  aV  [        R
                  " TR                  TR                  5      U l        [        R                  " TR                  5      U l        OS =U l        U l        [        R                  " U4S j[        TR                  5       5       5      U l        TR                  U l        g )Nc              3   :   >#    U  H  n[        T5      v   M     g 7frm   )r(  )r  rr   r8   s     r;   r  0Data2VecAudioAdapter.__init__.<locals>.<genexpr>  s     #pOo!$=f$E$EOos   )r(   r)   r*  ra   r   r   projr2   proj_layer_normrn   ro   num_adapter_layersrp   r  rd   s    `r;   r)   Data2VecAudioAdapter.__init__  s     $$(:(::		&"4"4f6O6OPDI#%<<0I0I#JD /33DI,mm#puU[UnUnOo#pp))r=   c                 |   U R                   b/  U R                  b"  U R                  U5      nU R                  U5      nUR                  SS5      nU R                   HK  n[        R
                  R                  5       nU R                  (       a  X0R                  :  d  MC  U" U5      nMM     UR                  SS5      nU$ rg   )r6  r7  rA   rp   nprandomr   r  )r7   rC   ru   layerdrop_probs       r;   rD   Data2VecAudioAdapter.forward  s    99 T%9%9%E IIm4M 00?M%//15[[EYY--/N==^nn%D %m 4 !
 &//15r=   )r  rp   r6  r7  rG   rN   s   @r;   r2  r2    s    * r=   r2  c                       \ rS rSr% \\S'   SrSrSrSr	Sr
SrS r SS\\R                  \4   S	\\   4S
 jjr SS\S\R                  4S jjrSrg)Data2VecAudioPreTrainedModeli  r8   data2vec_audior   Tc                    [        U[        5      (       a  [        R                  " SUR                  R
                  -  5      n[        R                  R                  UR                  R                  U* US9  [        R                  R                  UR                  R                  U* US9  g[        U[        5      (       a5  [        R                  R                  UR                  R                  S5        g[        U[        R                  5      (       ak  UR                  R                  R!                  SU R"                  R$                  S9  UR                  b%  UR                  R                  R'                  5         gg[        U[        R(                  [        R*                  45      (       ae  UR                  b$  UR                  R                  R'                  5         UR                  b&  UR                  R                  R-                  S5        gg[        U[        R.                  5      (       a  [        R                  R1                  UR                  5        UR                  bh  [        R                  " UR2                  UR4                  UR6                  S   -  -  5      n[        R                  R                  UR                  U* US9  ggg)zInitialize the weightsr   )abr   r   )meanstdNg      ?)r   r   mathsqrtr   in_featuresr   inituniform_weightr%   r\   	constant_r1   r   datanormal_r8   initializer_rangezero_r2   	GroupNormfill_r-   kaiming_normal_r`   in_channelsr#   )r7   r   ks      r;   _init_weights*Data2VecAudioPreTrainedModel._init_weights  s   f<==		!f//;;;<AGGV..55!qAGGV..33rQ? @AAGGfkk..2		**MM&&CT[[5R5R&S{{&  &&( 'r|| <=={{&  &&(}}(""((- )		**GG##FMM2{{&IIfmmv/A/AFDVDVWXDY/YZ[  a 8 ' +r=   Ninput_lengthsadd_adapterc                 d   Uc  U R                   R                  OUnS n[        U R                   R                  U R                   R                  5       H  u  pEU" XU5      nM     U(       aD  [        U R                   R                  5       H!  nU" USU R                   R                  5      nM#     U$ )z8
Computes the output length of the convolutional layers
c                 8    [         R                  " X-
  USS9S-   $ )Nfloor)rounding_moder   )r   divinput_lengthr#   r$   s      r;   _conv_out_lengthWData2VecAudioPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length  s      99\7wWZ[[[r=   r   )r8   rZ  zipr.   r/   ro   r8  r,  )r7   rY  rZ  rb  r#   r$   rr   s          r;    _get_feat_extract_output_lengths=Data2VecAudioPreTrainedModel._get_feat_extract_output_lengths  s     2=1Ddkk--+	\
 $'t{{'>'>@W@W#XK,]PM $Y 4;;99: 04;;C]C] ^ ; r=   feature_vector_lengthr   c                    UR                  SS9S S 2S4   nU R                  XCS9nUR                  [        R                  5      nUR
                  S   n[        R                  " Xa4UR                  UR                  S9nSU[        R                  " UR
                  S   UR                  S9US-
  4'   UR                  S/5      R                  S5      R                  S/5      R                  5       nU$ )Nr@   r   rZ  r   )r$  devicer   )rj  )cumsumre  tor   longr   zerosr$  rj  arangeflipr   )r7   rg  r   rZ  non_padded_lengthsoutput_lengths
batch_sizes          r;   "_get_feature_vector_attention_mask?Data2VecAudioPreTrainedModel._get_feature_vector_attention_mask-  s    
 ,22r2:1b5A>>?Q>k'**5::6#))!,
/~7K7KTbTiTi
 uv^%9%9!%<^EZEZ[]kno]opq',,bT299"=BBB4HMMOr=   r  rm   )rH   rI   rJ   rK   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnrW  r   r   
LongTensorr   r   r   re  rt  rL   r  r=   r;   r@  r@    s    ($O&*#N94 Z^"5#3#3S#89HPQU0 Y]%(:?:J:J r=   r@  r   	mask_probmask_length	min_masksr   c           	        ^^^^^ U u  nmTS:  a  [        S5      eTT:  a  [        ST ST S35      e[        R                  R                  S5      R	                  5       mUUUUU4S jnUb-  UR                  5       R                  S5      R                  5       O[        U5       Vs/ s H  nTPM     snn[        R                  " UT4[        S	9n	/ n
U" T5      nUS
:X  a  U	$ U H  nU" U5      n[        R                  R                  [        R                  " UTS-
  -
  5      USS9n[        U5      S
:X  a  TS-
  nOUS
   n[        R                  " U[        R                  " X-
  [        R                   S	9U-  /5      nU
R#                  U5        M     [        R$                  " U
5      n
[        R&                  " U
SS2SS2S4   X[T45      n
U
R)                  X[T-  5      n
[        R                  " T5      SSSS24   n[        R&                  " UX[T45      R)                  X[T-  5      nU
U-   n
U
R+                  5       TS-
  :  a  TS-
  XTS-
  :  '   [        R,                  " XSS5        U	$ s  snf )a2  
Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
CPU as part of the preprocessing during training.

Args:
    shape: The shape for which to compute masks. This should be of a tuple of size 2 where
           the first element is the batch size and the second element is the length of the axis to span.
    mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                independently generated mask spans of length `mask_length` is computed by
                `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                actual percentage will be smaller.
    mask_length: size of the mask
    min_masks: minimum number of masked spans
    attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                    each batch dimension.
r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                    > [        TU -  T-  T-   5      n[        UT5      nUT-  T:  a  TT-  nU TS-
  -
  U:  a  [        U TS-
  -
  S5      nU$ )z;Given input length, compute how many spans should be maskedr   r   )r   max)ra  num_masked_spanepsilonr  r~  r  sequence_lengths     r;   compute_num_masked_span6_compute_mask_indices.<locals>.compute_num_masked_spanh  so    i,6DwNOoy9 [(?:-<O ;?+o=!,+/"BAFOr=   Nr@   r$  r   F)replace)r   r;  r<  r  itemdetachsumtolistro   rn  r   choicero  lenconcatenateonesint32appendarraybroadcast_tor   r  put_along_axis)r   r~  r  r   r  rs  r  rr   rY  spec_aug_maskspec_aug_mask_idxsmax_num_masked_spanra  r  spec_aug_mask_idxdummy_mask_idxoffsetsr  r  s    `` `            @@r;   _compute_mask_indicesr  B  s   0 #(JQABB_$]^i]j&&7q:
 	
 iinnQ$$&G $ % 	##B'..0',Z'89'8!o'89  HHj/:$GM1/Ba%1,? II,,IIlkAo67RW - 
  !Q& -q0N.q1NNN(;(MUWU]U] ^ao op
 	!!"34/ &2 "45 1a:&+(V ,33JVa@ab ii$T4]3Goog
'UV^^+5G ,g5 /A"55GVYZGZ!0CCD mB?w :s   (I0c                   D  ^  \ rS rSrS\4U 4S jjrS r  SS\R                  S\	\R                     S\	\R                     4S jjr\     SS	\	\R                     S\	\R                     S\	\R                     S
\	\   S\	\   S\	\   S\\\4   4S jj5       rSrU =r$ )Data2VecAudioModeli  r8   c                   > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        UR                  S:  d  UR                  S:  aG  [        R                  " [        R                  " UR                  5      R                  5       5      U l        [!        U5      U l        UR$                  (       a  ['        U5      OS U l        U R+                  5         g )Nr   )r(   r)   r8   rx   feature_extractorr   feature_projectionmask_time_probmask_feature_probr   	Parameterr   r   ra   rK  masked_spec_embedr   encoderrZ  r2  adapter	post_initrd   s     r;   r)   Data2VecAudioModel.__init__  s     !<V!D"@"H   3&&*B*BS*H%'\\%,,v?Q?Q2R2[2[2]%^D"+F37=7I7I+F3t 	r=   c                 8    U R                   R                  5         g
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
N)r  r   r7   s    r;   freeze_feature_encoder)Data2VecAudioModel.freeze_feature_encoder  s    
 	113r=   rC   mask_time_indicesr   c                    [        U R                  SS5      (       d  U$ UR                  5       u  pEnUb(  U R                  R	                  UR
                  5      X'   OU R                  R                  S:  a  U R                  (       a  [        XE4U R                  R                  U R                  R                  UU R                  R                  S9n[        R                  " X!R                  [        R                  S9nU R                  R	                  UR
                  5      X'   U R                  R                  S:  a  U R                  (       a  [        XF4U R                  R                  U R                  R                   U R                  R"                  S9n[        R                  " XqR                  [        R                  S9nUSS2S4   R%                  SUS5      nSX'   U$ )	z
Masks extracted features along time axis and/or along feature axis according to
[SpecAugment](https://huggingface.co/papers/1904.08779).
apply_spec_augmentTNr   )r~  r  r   r  )rj  r$  )r~  r  r  r@   )getattrr8   r   r  rl  r$  r  r   r  mask_time_lengthmask_time_min_masksr   r&  rj  r   r  mask_feature_lengthmask_feature_min_masksexpand)r7   rC   r  r   rs  r  ra   mask_feature_indicess           r;   _mask_hidden_states&Data2VecAudioModel._mask_hidden_states  s    t{{$8$??   4A3E3E3G0
[(/3/E/E/H/HI\I\/]M,[[''!+ 5-++44 KK88-++99! !&->G[G[chcmcm n/3/E/E/H/HI\I\/]M,;;((1,#8)++77 KK;;++<<	$  $)<<0DMaMainisis#t #74#@#G#GO]_#` 23M/r=   r   r   r  r  r   c                 >   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  U5      nUR                  SS5      nUb  U R                  UR                  S   USS9nU R                  U5      u  pU R                  XUS9nU R                  UUUUUS9n	U	S   nU R                  b  U R                  U5      nU(       d	  X4U	SS -   $ [        UUU	R                  U	R                  S	9$ )
a  
mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
    masked extracted features in *config.proj_codevector_dim* space.
Nr   rS   Fri  )r  r   r   r   r  r  r   )r  extract_featuresrC   r  )r8   r   r  use_return_dictr  rA   rt  r   r  r  r  r  Data2VecAudioBaseModelOutputrC   r  )
r7   r   r   r  r   r  r  r  rC   encoder_outputss
             r;   rD   Data2VecAudioModel.forward  sY    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]11,?+55a;%!DD &&q)>u E N +/*A*ABR*S'00~ 1 
 ,,)/!5# ' 
 (*<<# LL7M!4qr7JJJ++-)77&11	
 	
r=   )r  r8   r  r  r  r  r	  NNNNN)rH   rI   rJ   rK   r   r)   r  r   FloatTensorr   r}  r  r   r   r   r   r   r  rD   rL   rM   rN   s   @r;   r  r    s    2 "4 :>59	,((, $E$5$56, !!1!12	,\  269=,0/3&*7
u||,7
 !.7
 $E$5$56	7

 $D>7
 'tn7
 d^7
 
u22	37
 7
r=   r  rS   zu
    Data2VecAudio Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
    )custom_introc                      ^  \ rS rSrU 4S jrS rS r\     SS\\	R                     S\\	R                     S\\   S\\   S	\\   S
\\	R                     S\\\4   4S jj5       rSrU =r$ )Data2VecAudioForCTCiB  c                   > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  5      U l        UR                  c  [        SU R                   S35      e[        US5      (       a  UR                  (       a  UR                  OUR                  n[        R                  " X!R                  5      U l        U R#                  5         g)a  
target_lang (`str`, *optional*):
    Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
    adapter.<lang>.bin. Only relevant when using an instance of [`Data2VecAudioForCTC`] with adapters. Uses 'eng' by
    default.
NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `Data2VecAudioForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.rZ  )r(   r)   r  rA  r   r   final_dropoutr   
vocab_sizer   r:   hasattrrZ  r*  ra   r   lm_headr  )r7   r8   r*  r:   s      r;   r)   Data2VecAudioForCTC.__init__H  s     	 08zz&"6"67$00@ AH H  *1)G)GFL^L^F%%djdvdv 	 yy!35F5FG 	r=   c                 Z    [         R                  " S[        5        U R                  5         gr  The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. Please use the equivalent `freeze_feature_encoder` method instead.NwarningswarnFutureWarningr  r  s    r;   freeze_feature_extractor,Data2VecAudioForCTC.freeze_feature_extractorc  '    
 	Q	

 	##%r=   c                 L    U R                   R                  R                  5         gr  rA  r  r   r  s    r;   r  *Data2VecAudioForCTC.freeze_feature_encodero      
 	--@@Br=   r   r   r   r  r  labelsr   c                    Ub  UOU R                   R                  nUbJ  UR                  5       U R                   R                  :  a"  [	        SU R                   R                   35      eU R                  UUUUUS9nUS   nU R                  U5      nU R                  U5      n	Sn
UGbX  Ub  UO"[        R                  " U[        R                  S9nU R                  UR                  S5      5      R                  [        R                  5      nUS:  nUR                  S5      nUR                  U5      n[        R                   R#                  U	S[        R$                  S9R'                  SS5      n[        R(                  R*                  R-                  S	S
9   [        R                   R/                  UUUUU R                   R0                  U R                   R2                  U R                   R4                  S9n
SSS5        U(       d  U	4U[6        S -   nU
b  U
4U-   $ U$ [9        XUR:                  UR<                  S9$ ! , (       d  f       NL= f)a  
labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
    Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
    the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
    All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
    config.vocab_size - 1]`.
Nz$Label values must be <= vocab_size: r  r   r  r@   )r   r$  r   F)enabled)blank	reductionzero_infinitylosslogitsrC   r  )r8   r  r  r  r   rA  r   r  r   	ones_likerm  re  r  rl  masked_selectr   r   log_softmaxfloat32rA   backendscudnnflagsctc_losspad_token_idctc_loss_reductionctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   rC   r  )r7   r   r   r   r  r  r  r   rC   r  r  rY  labels_masktarget_lengthsflattened_targets	log_probsoutputs                    r;   rD   Data2VecAudioForCTC.forwardv  s    " &1%<k$++B]B]&**,$++2H2H"HCDKKDZDZC[\]]%%)/!5# & 
  
]3m, #1"<%//R^fkfpfpBq  !AA.BTBTUWBXY\\]b]g]ghM !A+K(__R0N & 4 4[ A 11&b1V``abdefI%%++E+:}}--%!"++22"kk<<"&++"?"? .  ; Y)F)G!HHF)-)9TGf$EvEG4I4IV]VhVh
 	
 ;:s   A H??
I)rA  r   r  r  )rH   rI   rJ   rK   r)   r  r  r   r   r   r   r   r   r   r   rD   rL   rM   rN   s   @r;   r  r  B  s    6
&C  26,0/3&*)-D
u||,D
 !.D
 $D>	D

 'tnD
 d^D
 &D
 
un$	%D
 D
r=   r  z
    Data2VecAudio Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    c                      ^  \ rS rSrU 4S jrS rS rS r\     SS\	\
R                     S\	\
R                     S\	\   S	\	\   S
\	\   S\	\
R                     S\\\4   4S jj5       rSrU =r$ )&Data2VecAudioForSequenceClassificationi  c                 "  > [         TU ]  U5        [        US5      (       a  UR                  (       a  [	        S5      e[        U5      U l        UR                  S-   nUR                  (       a2  [        R                  " [        R                  " U5      U-  5      U l        [        R                  " UR                  UR                   5      U l        [        R                  " UR                   UR$                  5      U l        U R)                  5         g )NrZ  zdSequence classification does not support the use of Data2VecAudio adapters (config.add_adapter=True)r   )r(   r)   r  rZ  r   r  rA  r  use_weighted_layer_sumr   r  r   r  layer_weightsr   ra   classifier_proj_size	projector
num_labels
classifierr  r7   r8   
num_layersr:   s      r;   r)   /Data2VecAudioForSequenceClassification.__init__  s     6=))f.@.@v  18--1
((!#ejj.Dz.Q!RD6#5#5v7R7RS))F$?$?ARARS 	r=   c                 Z    [         R                  " S[        5        U R                  5         g)z
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
not be updated during training.
r  Nr  r  s    r;   r  ?Data2VecAudioForSequenceClassification.freeze_feature_extractor  r  r=   c                 L    U R                   R                  R                  5         gr  r  r  s    r;   r  =Data2VecAudioForSequenceClassification.freeze_feature_encoder  r  r=   c                 T    U R                   R                  5        H
  nSUl        M     gz
Calling this function will disable the gradient computation for the base model so that its parameters will not
be updated during training. Only the classification head will be updated.
FNrA  r   r   r   s     r;   freeze_base_model8Data2VecAudioForSequenceClassification.freeze_base_model  %    
 ((335E"'E 6r=   r   r   r   r  r  r  r   c                 0   Ub  UOU R                   R                  nU R                   R                  (       a  SOUnU R                  UUUUUS9nU R                   R                  (       ai  U[           n[
        R                  " USS9n[        R                  R                  U R                  SS9n	XR                  SSS5      -  R                  SS9nOUS   nU R                  U5      nUc  UR                  SS9n
OU R                  UR                   S   U5      nUR#                  S5      R%                  SSUR                   S   5      nS	X) '   UR                  SS9UR                  SS9R                  SS5      -  n
U R'                  U
5      nSnUbF  [)        5       nU" UR                  SU R                   R*                  5      UR                  S5      5      nU(       d  U4U[        S -   nUb  U4U-   $ U$ [-        UUUR.                  UR0                  S
9$ )  
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
    Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
    into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
    (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
    To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
    into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
NTr  r   r   r@   r   rS   r   r  )r8   r  r  rA  r  r   stackr   r   r   r  r   r  r   rE  rt  r   r  r  r  r   r  r   rC   r  )r7   r   r   r   r  r  r  r   rC   norm_weightspooled_outputpadding_maskexpand_padding_maskr  r  loss_fctr  s                    r;   rD   .Data2VecAudioForSequenceClassification.forward  s   . &1%<k$++B]B]'+{{'I'ItOc%%)/!5# & 
 ;;--#$ABM!KK1=M==001C1C0LL*->->r1a-HHMMRSMTM#AJM}5!)..1.5MBB=CVCVWXCY[ijL"."8"8"<"C"CAq-J]J]^_J`"a25M./)--!-4|7G7GA7G7N7S7STVXY7ZZM/')HFKKDKK,B,BCV[[QS_UDY)F)G!HHF)-)9TGf$EvE'!//))	
 	
r=   )r  rA  r  r   r  )rH   rI   rJ   rK   r)   r  r  r  r   r   r   r   r   r   r   r   rD   rL   rM   rN   s   @r;   r  r    s    "
&C(  26,0/3&*)-B
u||,B
 !.B
 $D>	B

 'tnB
 d^B
 &B
 
u..	/B
 B
r=   r  c                      ^  \ rS rSrU 4S jrS rS rS r\     SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\   S
\	\   S\	\   S\\\4   4S jj5       rSrU =r$ )(Data2VecAudioForAudioFrameClassificationi7  c                   > [         TU ]  U5        [        US5      (       a  UR                  (       a  [	        S5      e[        U5      U l        UR                  S-   nUR                  (       a2  [        R                  " [        R                  " U5      U-  5      U l        [        R                  " UR                  UR                   5      U l        UR                   U l        U R%                  5         g )NrZ  zgAudio frame classification does not support the use of Data2VecAudio adapters (config.add_adapter=True)r   )r(   r)   r  rZ  r   r  rA  r  r  r   r  r   r  r  r   ra   r  r  init_weightsr  s      r;   r)   1Data2VecAudioForAudioFrameClassification.__init__9  s     6=))f.@.@y  18--1
((!#ejj.Dz.Q!RD))F$6$68I8IJ ++r=   c                 Z    [         R                  " S[        5        U R                  5         gr  r  r  s    r;   r  AData2VecAudioForAudioFrameClassification.freeze_feature_extractorI  r  r=   c                 L    U R                   R                  R                  5         gr  r  r  s    r;   r  ?Data2VecAudioForAudioFrameClassification.freeze_feature_encoderU  r  r=   c                 T    U R                   R                  5        H
  nSUl        M     gr  r  r   s     r;   r  :Data2VecAudioForAudioFrameClassification.freeze_base_model\  r  r=   r   r   r  r   r  r  r   c           	         Ub  UOU R                   R                  nU R                   R                  (       a  SOUnU R                  UUUUUS9nU R                   R                  (       ai  U[           n[
        R                  " USS9n[        R                  R                  U R                  SS9n	XR                  SSS5      -  R                  SS9nOUS   nU R                  U5      n
SnUbZ  [        5       nU" U
R                  SU R                  5      [
        R                   " UR                  SU R                  5      SS95      nU(       d  U
4U[        S -   nU$ [#        UU
UR$                  UR&                  S	9$ )
r  NTr  r   r   r@   r   )axisr  )r8   r  r  rA  r  r   r  r   r   r   r  r   r  r  r   r  argmaxr   rC   r  )r7   r   r   r  r   r  r  r   rC   r  r  r  r  r  s                 r;   rD   0Data2VecAudioForAudioFrameClassification.forwardd  sh   . &1%<k$++B]B]'+{{'I'ItOc%%)/!5# & 
 ;;--#$ABM!KK1=M==001C1C0LL*->->r1a-HHMMRSMTM#AJM/')HFKKDOO<ell6;;WY[_[j[jKkrs>tuDY)F)G!HHFM$!//))	
 	
r=   )r  rA  r  r  r  )rH   rI   rJ   rK   r)   r  r  r  r   r   r   r   r   r   r   r   rD   rL   rM   rN   s   @r;   r  r  7  s     
&C(  26)-,0/3&*9
u||,9
 !.9
 &	9

 $D>9
 'tn9
 d^9
 
u++	,9
 9
r=   r  c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )AMSoftmaxLossi  c                    > [         TU ]  5         X0l        X@l        X l        [
        R                  " [        R                  " X5      SS9U l	        [
        R                  " 5       U l        g )NT)r   )r(   r)   scalemarginr  r   r  r   randnrL  r   r  )r7   	input_dimr  r+  r,  r:   s        r;   r)   AMSoftmaxLoss.__init__  sI    
$ll5;;y#EUYZ'')	r=   c                    UR                  5       n[        R                  R                  U R                  SS9n[        R                  R                  USS9n[
        R                  " X5      nX@R                  -
  n[        R                  R                  X R                  5      nU R                  [
        R                  " UR                  5       XT5      -  nU R                  Xr5      nU$ )Nr   r   r   )flattenr   r   	normalizerL  r   mmr,  one_hotr  r+  wherer   r  )	r7   rC   r  rL  	cos_thetapsionehotr  r  s	            r;   rD   AMSoftmaxLoss.forward  s    !((!(<//1/EHH]3	++%&&v?ekk&++-HHyy(r=   )r  r,  r  r+  rL  )g      >@g?rG   rN   s   @r;   r)  r)    s    * r=   r)  c                   f   ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jrSrU =r	$ )	TDNNLayeri  c                   > [         TU ]  5         US:  a  UR                  US-
     OUR                  U   U l        UR                  U   U l        UR
                  U   U l        UR                  U   U l        [        R                  " U R                  U R                  -  U R                  5      U l        [        R                  " 5       U l        g )Nr   r   )r(   r)   tdnn_dimr+   r,   tdnn_kernelr#   tdnn_dilationdilationr   r   kernelReLUr5   r6   s      r;   r)   TDNNLayer.__init__  s    <DqL6??8a<8foo^fNg"OOH5!--h7,,X6ii 0 043C3C CTEVEVW'')r=   rC   r   c                 >   [        5       (       a  SSKJn  [        5       (       a1  [        U R                  W5      (       a  [
        R                  " S5        UR                  SS5      nU R                  R                  R                  U R                  U R                  U R                  5      R                  SS5      n[        R                  R                  XU R                  R                   U R"                  S9nUR                  SS5      nU R%                  U5      nU$ )Nr   )	LoraLayerzDetected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. You should exclude TDNNLayer from LoRA's target modules.r   rS   )r@  )r   peft.tuners.lorarE  r   rA  r  r  rA   rL  r   r,   r#   r+   r   r   conv1dr%   r@  r5   )r7   rC   rE  rL  s       r;   rD   TDNNLayer.forward  s    2$++y11O &//15##(():):D<L<LdN^N^_iijkmno,,]DKKDTDT_c_l_l,m%//156r=   )r5   r@  r+   rA  r#   r,   rF   )
rH   rI   rJ   rK   r)   r   r   rD   rL   rM   rN   s   @r;   r;  r;    s(    $U\\ ell  r=   r;  zq
    Data2VecAudio Model with an XVector feature extraction head on top for tasks like Speaker Verification.
    c                     ^  \ rS rSrU 4S jrS rS rS rS\\	R                  \4   4S jr\     SS\\	R                     S	\\	R                     S
\\   S\\   S\\   S\\	R                     S\\\4   4S jj5       rSrU =r$ )Data2VecAudioForXVectori  c                 2  > [         TU ]  U5        [        U5      U l        UR                  S-   nUR
                  (       a2  [        R                  " [        R                  " U5      U-  5      U l
        [        R                  " UR                  UR                  S   5      U l        [        [!        UR                  5      5       Vs/ s H  n[#        X5      PM     nn[        R$                  " U5      U l        [        R                  " UR                  S   S-  UR(                  5      U l        [        R                  " UR(                  UR(                  5      U l        [/        UR(                  UR0                  5      U l        U R5                  5         g s  snf )Nr   r   r@   rS   )r(   r)   r  rA  r  r  r   r  r   r  r  r   ra   r=  r   ro   r  r;  rn   tdnnxvector_output_dimr  r  r)  r  	objectiver  )r7   r8   r  r   tdnn_layersr:   s        r;   r)    Data2VecAudioForXVector.__init__  s     08--1
((!#ejj.Dz.Q!RD6#5#5vq7IJ5:3v;O5PQ5Py+5PQMM+.	!#6??2+>+BFD]D]!^))F$=$=v?X?XY&v'@'@&BSBST Rs   Fc                 Z    [         R                  " S[        5        U R                  5         gr  r  r  s    r;   r  0Data2VecAudioForXVector.freeze_feature_extractor  r  r=   c                 L    U R                   R                  R                  5         gr  r  r  s    r;   r  .Data2VecAudioForXVector.freeze_feature_encoder  r  r=   c                 T    U R                   R                  5        H
  nSUl        M     gr  r  r   s     r;   r  )Data2VecAudioForXVector.freeze_base_model  r  r=   rY  c                 X    S nU R                   R                   H  nU" XS5      nM     U$ )z/
Computes the output length of the TDNN layers
c                     X-
  U-  S-   $ )Nr   r  r`  s      r;   rb  JData2VecAudioForXVector._get_tdnn_output_lengths.<locals>._conv_out_length  s     !.69A==r=   r   )r8   r>  )r7   rY  rb  r#   s       r;   _get_tdnn_output_lengths0Data2VecAudioForXVector._get_tdnn_output_lengths  s1    
	>
  ;;22K,]KM 3 r=   r   r   r   r  r  r  r   c                    Ub  UOU R                   R                  nU R                   R                  (       a  SOUnU R                  UUUUUS9nU R                   R                  (       ai  U[           n[
        R                  " USS9n[        R                  R                  U R                  SS9n	XR                  SSS5      -  R                  SS9nOUS   nU R                  U5      nU R                   H  n
U
" U5      nM     Uc  UR                  SS9nUR!                  SS9nOU R#                  UR                  SS95      nU R%                  U5      n/ n/ n['        U5       HN  u  nnUR)                  XSU24   R                  SS95        UR)                  XSU24   R!                  SS95        MP     [
        R                  " U5      n[
        R                  " U5      n[
        R*                  " X/SS9nU R-                  U5      nU R/                  U5      nSnUb  U R1                  UU5      nU(       d  UU4U[        S -   nUb  U4U-   $ U$ [3        UUUUR4                  UR6                  S9$ )	r  NTr  r   r   r@   r   )r  r  
embeddingsrC   r  )r8   r  r  rA  r  r   r  r   r   r   r  r   r  r   rL  rE  rF  re  rZ  	enumerater  catr  r  rN  r   rC   r  )r7   r   r   r   r  r  r  r   rC   r  
tdnn_layermean_featuresstd_featuresfeat_extract_output_lengthstdnn_output_lengthsr   lengthstatistic_poolingoutput_embeddingsr  r  r  s                         r;   rD   Data2VecAudioForXVector.forward  s   . &1%<k$++B]B]'+{{'I'ItOc%%)/!5# & 
 ;;--#$ABM!KK1=M==001C1C0LL*->->r1a-HHMMRSMTM#AJM}5))J&}5M $ !)..1.5M(,,,3L*.*O*OP^PbPbghPbPi*j'"&"?"?@["\ML&':;	6$$]gvg:%>%C%C%C%JK##MWfW*$=$A$Aa$A$HI < "KK6M ;;|4L!II}&CL 223DE!23>>&&1D/07;X;Y3ZZF)-)9TGf$EvE(!//))
 	
r=   )r  rA  r  r  rN  r   rL  r  )rH   rI   rJ   rK   r)   r  r  r  r   r   r}  r   rZ  r   r   r   r   r   r   rD   rL   rM   rN   s   @r;   rJ  rJ    s    &
&C(eE<L<Lc<Q6R   26,0/3&*)-O
u||,O
 !.O
 $D>	O

 'tnO
 d^O
 &O
 
um#	$O
 O
r=   rJ  )r  r  r  rJ  r  r@  )Nr   NrX   )MrG  r  typingr   r   r   numpyr;  r   r   torch.nnr   activationsr	   integrations.deepspeedr
   integrations.fsdpr   modeling_attn_mask_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   configuration_data2vec_audior   integrations.flex_attentionr   r    ModulerP   r\   rj   rx   r   r   r   r   r   r   r   r   r(  r2  r@  r   r   r}  ndarrayr  r  r  r  r  r  r  r)  r;  rJ  __all__r  r=   r;   <module>r{     s  ,   , ,    % ! @ 7 g B 9  G & T T =  !!J7 6BII ryy 6299 ")) :1RYY 1*  $(,%II%<<% 
% <<	%
 U\\*% e_% % %%<U/RYY U/pryy 0! : !HZ299 Zz		 $299 > K? K Kd 26tc?tt t U--.	t
 t ZZtn  7  
5 
 
D !"  
t
6 t

t
n p
-I p
p
f f
/K f
 f
RBII .		 @ 
N
: N

N
br=   