
    bCi$                        S r SSKrSSKrSSKJr  SSKJr  SSKJr  SSKJ	r	  SSK
Jr  S	S
KJrJrJrJrJrJrJrJrJrJrJr  SSKJr   " S S\5      r " S S\5      r " S S\R8                  5      r " S S\R8                  5      r " S S\5      r " S S\5      r  " S S\5      r! " S S\5      r" " S S\\5      r#\	r$ " S S \#\5      r% " S! S"\#\5      r& " S# S$\5      r' " S% S&\5      r( " S' S(\5      r)/ S)Qr*g)*zPyTorch Data2VecText model.    N)nn   )ACT2FN)GradientCheckpointingLayer)Wav2Vec2BaseModelOutput)PreTrainedModel   )Wav2Vec2AdapterWav2Vec2EncoderWav2Vec2FeatureEncoderWav2Vec2FeatureProjection#Wav2Vec2ForAudioFrameClassificationWav2Vec2ForCTC!Wav2Vec2ForSequenceClassificationWav2Vec2ForXVectorWav2Vec2ModelWav2Vec2PreTrainedModelWav2Vec2SamePadLayer   )Data2VecAudioConfigc                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )Data2VecAudioConvLayer*   c                   > [         TU ]  5         US:  a  UR                  US-
     OSU l        UR                  U   U l        [
        R                  " U R                  U R                  UR                  U   UR                  U   UR                  S9U l
        [
        R                  " U R                  SS9U l        [        UR                     U l        g )Nr   r   )kernel_sizestridebiasTelementwise_affine)super__init__conv_dimin_conv_dimout_conv_dimr   Conv1dconv_kernelconv_stride	conv_biasconv	LayerNorm
layer_normr   feat_extract_activation
activation)selfconfiglayer_id	__class__s      m/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/data2vec/modular_data2vec_audio.pyr!   Data2VecAudioConvLayer.__init__+   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 ,,t'8'8TR !?!?@    c                     U R                  U5      nUR                  SS5      nU R                  U5      nUR                  SS5      nU R                  U5      nU$ )N)r)   	transposer+   r-   r.   hidden_statess     r2   forwardData2VecAudioConvLayer.forward:   sV    		-0%//B76%//B76r4   )r-   r)   r#   r+   r$   )r   __name__
__module____qualname____firstlineno__r!   r;   __static_attributes____classcell__r1   s   @r2   r   r   *   s    A r4   r   c                       \ rS rSrSrg)Data2VecAudioPadLayerE    Nr>   r?   r@   rA   rB   rH   r4   r2   rF   rF   E       r4   rF   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ ) Data2VecAudioPositionalConvLayerI   c                 r  > [         TU ]  5         [        R                  " UR                  UR                  UR
                  UR
                  S-  UR                  S9U l        [        UR
                  5      U l	        [        UR                     U l        [        R                  " UR                  SS9U l        g )Nr	   )r   paddinggroupsFr   )r    r!   r   r%   hidden_sizeconv_pos_kernel_sizenum_conv_pos_embedding_groupsr)   rF   rO   r   r,   r-   r*   r+   )r.   r/   r1   s     r2   r!   )Data2VecAudioPositionalConvLayer.__init__J   s    II33//1477
	 -V-H-HI !?!?@,,v'9'9eTr4   c                     U R                  U5      nU R                  U5      nUR                  SS5      nU R                  U5      nUR                  SS5      nU R	                  U5      nU$ Nr   r	   )r)   rO   r8   r+   r-   r9   s     r2   r;   (Data2VecAudioPositionalConvLayer.forwardY   sd    		-0]3%//156%//156r4   )r-   r)   r+   rO   r=   rD   s   @r2   rL   rL   I   s    U r4   rL   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )$Data2VecAudioPositionalConvEmbeddingd   c                    > [         TU ]  5         [        R                  " [	        UR
                  5       Vs/ s H  n[        U5      PM     sn5      U l        g s  snf )N)r    r!   r   
ModuleListrangenum_conv_pos_embeddingsrL   layers)r.   r/   _r1   s      r2   r!   -Data2VecAudioPositionalConvEmbedding.__init__e   sF    mm?DVEcEc?de?d!-f5?de
es   Ac                     UR                  SS5      nU R                   H  nU" U5      nM     UR                  SS5      nU$ rV   )r8   r_   )r.   r:   layers      r2   r;   ,Data2VecAudioPositionalConvEmbedding.forwardk   sD    %//15[[E!-0M !%//15r4   )r_   r=   rD   s   @r2   rY   rY   d   s    
 r4   rY   c                       \ rS rSrS rSrg)Data2VecAudioFeatureEncoders   c           
          [         R                  R                  U 5        [         R                  " [	        UR
                  5       Vs/ s H  n[        XS9PM     sn5      U l        SU l        SU l	        g s  snf )N)r0   FT)
r   Moduler!   r\   r]   num_feat_extract_layersr   conv_layersgradient_checkpointing_requires_grad)r.   r/   is      r2   r!   $Data2VecAudioFeatureEncoder.__init__t   sb    
		4 ==AFvGeGeAfgAfA#F7Afg
 ',#" hs   A5)rm   rk   rl   N)r>   r?   r@   rA   r!   rB   rH   r4   r2   rf   rf   s   s    #r4   rf   c                       \ rS rSrSrg)Data2VecAudioFeatureProjection}   rH   NrI   rH   r4   r2   rq   rq   }   rJ   r4   rq   c                       \ rS rSrSrg)Data2VecAudioEncoder   rH   NrI   rH   r4   r2   rt   rt      rJ   r4   rt   c                       \ rS rSrSrg)Data2VecAudioAdapter   rH   NrI   rH   r4   r2   rw   rw      rJ   r4   rw   c                   P    \ rS rSr% \\S'   SrSrSrSr	Sr
SrS rS rS rS	 rS
rg)Data2VecAudioPreTrainedModel   r/   data2vec_audioinput_valuesTc                    [        U[        5      (       a  [        R                  " SUR                  R
                  -  5      n[        R                  R                  UR                  R                  U* US9  [        R                  R                  UR                  R                  U* US9  g[        U[        5      (       a5  [        R                  R                  UR                  R                  S5        g[        U[        R                  5      (       ak  UR                  R                  R!                  SU R"                  R$                  S9  UR                  b%  UR                  R                  R'                  5         gg[        U[        R(                  [        R*                  45      (       ae  UR                  b$  UR                  R                  R'                  5         UR                  b&  UR                  R                  R-                  S5        gg[        U[        R.                  5      (       a  [        R                  R1                  UR                  5        UR                  bh  [        R                  " UR2                  UR4                  UR6                  S   -  -  5      n[        R                  R                  UR                  U* US9  ggg)zInitialize the weightsr   )abr           )meanstdNg      ?)
isinstancerq   mathsqrt
projectionin_featuresr   inituniform_weightr   rL   	constant_r)   Lineardatanormal_r/   initializer_rangezero_r*   	GroupNormfill_r%   kaiming_normal_rP   in_channelsr   )r.   moduleks      r2   _init_weights*Data2VecAudioPreTrainedModel._init_weights   s   f<==		!f//;;;<AGGV..55!qAGGV..33rQ? @AAGGfkk..2		**MM&&CT[[5R5R&S{{&  &&( 'r|| <=={{&  &&(}}(""((- )		**GG##FMM2{{&IIfmmv/A/AFDVDVWXDY/YZ[  a 8 ' +r4   c                     [        S5      eNzNot needed for Data2VecAudioAttributeErrorr.   s    r2   _get_adapters*Data2VecAudioPreTrainedModel._get_adapters       ;<<r4   c                     [        S5      er   r   r   s    r2   init_adapter_layers0Data2VecAudioPreTrainedModel.init_adapter_layers   r   r4   c                     [        S5      er   r   r   s    r2   load_adapter)Data2VecAudioPreTrainedModel.load_adapter   r   r4   rH   N)r>   r?   r@   rA   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr   r   r   r   rB   rH   r4   r2   rz   rz      s>    ($O&*#N92===r4   rz   c                   B   ^  \ rS rSrS\4S jrS rS rU 4S jrSr	U =r
$ )Data2VecAudioModel   r/   c                    [         R                  X5        Xl        [        U5      U l        [        U5      U l        UR                  S:  d  UR                  S:  aG  [        R                  " [        R                  " UR                  5      R                  5       5      U l        [!        U5      U l        UR$                  (       a  ['        U5      OS U l        U R+                  5         g )Nr   )rz   r!   r/   rf   feature_extractorrq   feature_projectionmask_time_probmask_feature_probr   	ParametertorchTensorrQ   r   masked_spec_embedrt   encoderadd_adapterrw   adapter	post_init)r.   r/   s     r2   r!   Data2VecAudioModel.__init__   s    $--d;!<V!D"@"H   3&&*B*BS*H%'\\%,,v?Q?Q2R2[2[2]%^D"+F37=7I7I+F3t 	r4   c                     [        S5      er   r   r   s    r2   freeze_feature_extractor+Data2VecAudioModel.freeze_feature_extractor   r   r4   c                 8    U R                   R                  5         g)z
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
N)r   _freeze_parametersr   s    r2   freeze_feature_encoder)Data2VecAudioModel.freeze_feature_encoder   s    
 	113r4   c                 $   > [         TU ]  " S0 UD6$ NrH   r    r;   r.   super_kwargsr1   s     r2   r;   Data2VecAudioModel.forward       w...r4   )r   r/   r   r   r   r   )r>   r?   r@   rA   r   r!   r   r   r;   rB   rC   rD   s   @r2   r   r      s$    2 "=4/ /r4   r   c                   :   ^  \ rS rSrS rS rS rU 4S jrSrU =r	$ )Data2VecAudioForCTC   c                    [         R                  X5        [        U5      U l        [        R
                  " UR                  5      U l        UR                  c  [        SU R                   S35      e[        US5      (       a  UR                  (       a  UR                  OUR                  n[        R                  " X!R                  5      U l        U R#                  5         g )NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `Data2VecAudioForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.r   )rz   r!   r   r|   r   Dropoutfinal_dropoutdropout
vocab_size
ValueErrorr1   hasattrr   output_hidden_sizerQ   r   lm_headr   )r.   r/   r   s      r2   r!   Data2VecAudioForCTC.__init__   s    $--d;08zz&"6"67$00@ AH H  *1)G)GFL^L^F%%djdvdv 	 yy!35F5FG 	r4   c                     [        S5      er   r   r   s    r2   freeze_base_model%Data2VecAudioForCTC.freeze_base_model   r   r4   c                     [        S5      er   r   r   s    r2   tie_weightsData2VecAudioForCTC.tie_weights   r   r4   c                 $   > [         TU ]  " S0 UD6$ r   r   r   s     r2   r;   Data2VecAudioForCTC.forward   r   r4   )r|   r   r   )
r>   r?   r@   rA   r!   r   r   r;   rB   rC   rD   s   @r2   r   r      s    *==/ /r4   r   c                       \ rS rSrSrg)&Data2VecAudioForSequenceClassification   rH   NrI   rH   r4   r2   r   r      rJ   r4   r   c                       \ rS rSrSrg)(Data2VecAudioForAudioFrameClassification   rH   NrI   rH   r4   r2   r   r      rJ   r4   r   c                       \ rS rSrSrg)Data2VecAudioForXVector   rH   NrI   rH   r4   r2   r   r      rJ   r4   r   )r   r   r   r   r   rz   )+__doc__r   r   r   activationsr   modeling_layersr   modeling_outputsr   modeling_utilsr   wav2vec2.modeling_wav2vec2r
   r   r   r   r   r   r   r   r   r   r   configuration_data2vec_audior   r   rF   ri   rL   rY   rf   rq   rt   rw   rz   Data2VecAudioBaseModelOutputr   r   r   r   r   __all__rH   r4   r2   <module>r      s
   "    ! 9 7 -    >7 6	0 	ryy 6299 #"8 #	%> 		? 		? 	)=?4K )=X  7 /5} /@/6 /@	-N 		/R 		0 	r4   