
    cCiA                    J   S r SSKrSSKJrJr  SSKrSSKrSSKJr  SSK	J
r
JrJr  SSKJr  SSKJrJrJr  SS	KJr  SS
KJr  SSKJr  SSKJrJr  SSKJr  SSKJrJ r J!r!J"r"J#r#  SSK$J%r%J&r&  SSK'J(r(J)r)  SSK*J+r+  SSK,J-r-J.r.  \)R^                  " \05      r1Sr2S\Rf                  S\4S\44S jr5 S~S\Rf                  S\4S\\Rf                     4S jjr6  SS\7\4\44   S\8S\4S\\Rr                     S\4S \Rt                  4S! jjr; " S" S#\5      r< " S$ S%\5      r= " S& S'\5      r> " S( S)\R~                  5      r@ " S* S+\R~                  5      rA " S, S-\R~                  5      rB " S. S/\R                  R~                  5      rC " S0 S1\R~                  5      rD " S2 S3\R~                  5      rE " S4 S5\R~                  5      rF " S6 S7\R~                  5      rG " S8 S9\R~                  5      rH " S: S;\R~                  5      rI " S< S=\R~                  5      rJ " S> S?\R~                  \%5      rK " S@ SA\R~                  \%5      rL " SB SC\R~                  \%5      rM " SD SE\R~                  5      rN " SF SG\R~                  5      rO " SH SI\5      rP " SJ SK\5      rQ\( " SL SM\&5      5       rR " SN SO\R5      rS " SP SQ\R5      rT " SR SS\R5      rU " ST SU\R5      rV " SV SW\R5      rW " SX SY\R5      rX " SZ S[\R5      rY " S\ S]\R5      rZ " S^ S_\R~                  5      r[ " S` Sa\R~                  5      r\\(" SbSc9 " Sd Se\R5      5       r]\(" SfSc9 " Sg Sh\R\5      5       r^        SSi\RS\R                  Sj\\R                     S\\Rr                     Sk\8Sl\8Sm\8Sn\\R~                     So\`Sp\`S \\R                  \7\R                  \R                  4   4   4Sq jjra\(" SrSc9 " Ss St\R5      5       rb\(" SuSc9 " Sv Sw\R5      5       rc " Sx Sy\R~                  5      rd\(" SzSc9 " S{ S|\&5      5       re/ S}Qrfg)zPyTorch SpeechT5 model.    N)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossL1Loss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)is_deepspeed_zero3_enabled)is_fsdp_managed_module)_prepare_4d_attention_mask!_prepare_4d_causal_attention_mask)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutputSeq2SeqSpectrogramOutput)EmbeddingAccessMixinPreTrainedModel)auto_docstringlogging)deprecate_kwarg   )SpeechT5ConfigSpeechT5HifiGanConfig	input_idspad_token_iddecoder_start_token_idc                     U R                  U R                  5      nU SS2SS24   R                  5       USS2SS24'   X#SS2S4'   Uc  [        S5      eUR	                  US:H  U5        U$ )z)
Shift input ids one token to the right.
Nr   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosshapeclone
ValueErrormasked_fill_)r!   r"   r#   shifted_input_idss       h/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/speecht5/modeling_speecht5.pyshift_tokens_rightr-   6   sz     "++IOO<(CRC0668ae4adLMM""#4#<lK    input_valuesreduction_factorattention_maskc                     US:  a!  U SS2US-
  SU24   n Ub  USS2US-
  SU24   nU R                  U R                  5      nU SS2SS24   R                  5       USS2SS24'   UR                  US:H  S5        X24$ )zo
Shift input spectrograms one timestep to the right. Also applies the reduction factor to the sequence length.
r   Nr%         Y        )r&   r'   r(   r*   )r/   r0   r1   shifted_input_valuess       r,   shift_spectrograms_rightr6   F   s     !#A'7!';'O?O'O$OP%+A/?!/C/WGW/W,WXN'11,2D2DE".q#2#v"6"<"<">AB %%&:f&DcJ//r.   r'   	mask_probmask_length	min_masksreturnc           	        ^^^^^ U u  nmTS:  a  [        S5      eTT:  a  [        ST ST S35      e[        R                  R                  S5      R	                  5       mUUUUU4S jnUb-  UR                  5       R                  S5      R                  5       O[        U5       Vs/ s H  nTPM     snn[        R                  " UT4[        S	9n	/ n
U" T5      nUS
:X  a  U	$ U H  nU" U5      n[        R                  R                  [        R                  " UTS-
  -
  5      USS9n[        U5      S
:X  a  TS-
  nOUS
   n[        R                  " U[        R                  " X-
  [        R                   S	9U-  /5      nU
R#                  U5        M     [        R$                  " U
5      n
[        R&                  " U
SS2SS2S4   X[T45      n
U
R)                  X[T-  5      n
[        R                  " T5      SSSS24   n[        R&                  " UX[T45      R)                  X[T-  5      nU
U-   n
U
R+                  5       TS-
  :  a  TS-
  XTS-
  :  '   [        R,                  " XSS5        U	$ s  snf )a2  
Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
CPU as part of the preprocessing during training.

Args:
    shape: The shape for which to compute masks. This should be of a tuple of size 2 where
           the first element is the batch size and the second element is the length of the axis to span.
    mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                independently generated mask spans of length `mask_length` is computed by
                `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                actual percentage will be smaller.
    mask_length: size of the mask
    min_masks: minimum number of masked spans
    attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                    each batch dimension.
r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                    > [        TU -  T-  T-   5      n[        UT5      nUT-  T:  a  TT-  nU TS-
  -
  U:  a  [        U TS-
  -
  S5      nU$ )z;Given input length, compute how many spans should be maskedr   r   )intmax)input_lengthnum_masked_spanepsilonr8   r7   r9   sequence_lengths     r,   compute_num_masked_span6_compute_mask_indices.<locals>.compute_num_masked_span   so    i,6DwNOoy9 [(?:-<O ;?+o=!,+/"BAFOr.   Nr%   dtyper   F)replace)r)   nprandomranditemdetachsumtolistrangezerosboolchoicearangelenconcatenateonesint32appendarraybroadcast_toreshaper?   put_along_axis)r'   r7   r8   r1   r9   
batch_sizerD   _input_lengthsspec_aug_maskspec_aug_mask_idxsmax_num_masked_spanr@   rA   spec_aug_mask_idxdummy_mask_idxoffsetsrB   rC   s    `` `            @@r,   _compute_mask_indicesrg   \   s   0 #(JQABB_$]^i]j&&7q:
 	
 iinnQ$$&G $ % 	##B'..0',Z'89'8!o'89  HHj/:$GM1/Ba%1,? II,,IIlkAo67RW - 
  !Q& -q0N.q1NNN(;(MUWU]U] ^ao op
 	!!"34/ &2 "45 1a:&+(V ,33JVa@ab ii$T4]3Goog
'UV^^+5G ,g5 /A"55GVYZGZ!0CCD mB?w :s   (I0c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )SpeechT5NoLayerNormConvLayer   c                 b  > [         TU ]  5         US:  a  UR                  US-
     OSU l        UR                  U   U l        [
        R                  " U R                  U R                  UR                  U   UR                  U   UR                  S9U l
        [        UR                     U l        g )Nr   r   kernel_sizestridebias)super__init__conv_dimin_conv_dimout_conv_dimr   Conv1dconv_kernelconv_stride	conv_biasconvr
   feat_extract_activation
activationselfconfiglayer_id	__class__s      r,   rq   %SpeechT5NoLayerNormConvLayer.__init__   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@r.   c                 J    U R                  U5      nU R                  U5      nU$ N)ry   r{   r}   hidden_statess     r,   forward$SpeechT5NoLayerNormConvLayer.forward   s$    		-06r.   )r{   ry   rs   rt   r   __name__
__module____qualname____firstlineno__rq   r   __static_attributes____classcell__r   s   @r,   ri   ri      s    A r.   ri   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )SpeechT5LayerNormConvLayer   c                   > [         TU ]  5         US:  a  UR                  US-
     OSU l        UR                  U   U l        [
        R                  " U R                  U R                  UR                  U   UR                  U   UR                  S9U l
        [
        R                  " U R                  SS9U l        [        UR                     U l        g )Nr   r   rl   T)elementwise_affine)rp   rq   rr   rs   rt   r   ru   rv   rw   rx   ry   	LayerNorm
layer_normr
   rz   r{   r|   s      r,   rq   #SpeechT5LayerNormConvLayer.__init__   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 ,,t'8'8TR !?!?@r.   c                     U R                  U5      nUR                  SS5      nU R                  U5      nUR                  SS5      nU R                  U5      nU$ )Nr%   )ry   	transposer   r{   r   s     r,   r   "SpeechT5LayerNormConvLayer.forward   sV    		-0%//B76%//B76r.   r{   ry   rs   r   rt   r   r   r   s   @r,   r   r      s    A r.   r   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )SpeechT5GroupNormConvLayeri  c                   > [         TU ]  5         US:  a  UR                  US-
     OSU l        UR                  U   U l        [
        R                  " U R                  U R                  UR                  U   UR                  U   UR                  S9U l
        [        UR                     U l        [
        R                  " U R                  U R                  SS9U l        g )Nr   r   rl   T)
num_groupsnum_channelsaffine)rp   rq   rr   rs   rt   r   ru   rv   rw   rx   ry   r
   rz   r{   	GroupNormr   r|   s      r,   rq   #SpeechT5GroupNormConvLayer.__init__  s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@,,$2C2CRVRcRclpqr.   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )ry   r   r{   r   s     r,   r   "SpeechT5GroupNormConvLayer.forward  s2    		-066r.   r   r   r   r   s   @r,   r   r     s    r  r.   r   c            	         ^  \ rS rSrSrSS\S\S\\   4U 4S jjjrSS\S\S\\   4S jjr\	SS\S\S\\   4S	 jj5       r
\R                  " 5       SS
\R                  S\4S jj5       r SS
\R                  S\S\\   4S jjrSrU =r$ )%SpeechT5SinusoidalPositionalEmbeddingi  zDThis module produces sinusoidal positional embeddings of any length.num_positionsembedding_dimpadding_idxc                    > [         TU ]  5         SU l        X l        X0l        U R                  XR                  -   X#5        g N   )rp   rq   offsetr   r   make_weights)r}   r   r   r   r   s       r,   rq   .SpeechT5SinusoidalPositionalEmbedding.__init__"  s8    *&-++5}Rr.   num_embeddingsc                     U R                  XU5      n[        U S5      (       a8  UR                  U R                  R                  U R                  R
                  S9nU R                  SUSS9  g )NweightsrG   deviceF
persistent)get_embeddinghasattrtor   rG   r   register_buffer)r}   r   r   r   emb_weightss        r,   r   2SpeechT5SinusoidalPositionalEmbedding.make_weights)  s\    ((T4##%..t||/A/A$,,J]J].^KYFr.   c                    US-  n[         R                  " S5      US-
  -  n[        R                  " [        R                  " U[        R
                  S9R                  5       U* -  5      n[        R                  " U [        R
                  S9R                  5       R                  S5      UR                  S5      -  n[        R                  " [        R                  " U5      [        R                  " U5      /SS9R                  U S5      nUS-  S:X  a,  [        R                  " U[        R                  " U S5      /SS9nUb  SXBSS24'   UR                  [        R                  " 5       5      $ )	z
Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
description in Section 3.5 of "Attention Is All You Need".
r   i'  r   rF   r   dimr%   N)mathlogtorchexprT   int64float	unsqueezecatsincosviewrQ   r   get_default_dtype)r   r   r   half_dimembs        r,   r   3SpeechT5SinusoidalPositionalEmbedding.get_embedding1  s    !A%hhuoA.iiXU[[AGGISDPQll>=CCEOOPQRUXUbUbcdUeeii338a@EEnVXY1!))S%++na"@AqIC""#CQvve--/00r.   r!   past_key_values_lengthc                    UR                  5       u  p4U R                  XR                  U5      R                  UR                  5      nU R                  S-   U-   nX`R
                  R                  S5      :  a3  U R                  X`R                  -   U R                  U R                  5        U R
                  R                  SUR                  S5      5      R                  X4S5      R                  5       $ )Nr   r   r%   )size"create_position_ids_from_input_idsr   r   r   r   r   r   r   index_selectr   rM   )r}   r!   r   bszseq_lenposition_idsmax_poss          r,   r   -SpeechT5SinusoidalPositionalEmbedding.forwardC  s     ~~'>>yJZJZ\rsvv

 ""Q&0\\&&q))g3T5G5GIYIYZ||((L,=,=b,ABGGVXY``bbr.   c                     UR                  U5      R                  5       n[        R                  " USS9R	                  U5      U-   U-  nUR                  5       U-   $ )z
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
symbols are ignored. This is modified from fairseq's `utils.make_positions`.

Args:
    x: torch.Tensor x:
Returns: torch.Tensor
r   r   )ner>   r   cumsumtype_aslong)r}   r!   r   r   maskincremental_indicess         r,   r   HSpeechT5SinusoidalPositionalEmbedding.create_position_ids_from_input_idsR  sW     ||K(,,.$||Da8@@FI__cgg"'')K77r.   )r   r   r   r   r   )r   r   r   r   __doc__r>   r   rq   r   staticmethodr   r   no_gradTensorr   r   r   r   r   s   @r,   r   r     s    NSc S# SHUXM S SG3 Gs GQYZ]Q^ G 1c 1# 1HUXM 1 1" ]]_c cs c c bc88478QYZ]Q^8 8r.   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )SpeechT5PositionalConvEmbeddingid  c                   > [         TU ]  5         [        R                  " UR                  UR                  UR
                  UR
                  S-  UR                  S9U l        [        R                  R                  n[        [        R                  R                  S5      (       a$  [        R                  R                  R                  n[        5       (       Ga%  SS KnUR                  R                  U R                  R                   SS9   U" U R                  SSS9U l        S S S 5        [        U R                  S5      (       aU  U R                  R                  R                   R"                  nU R                  R                  R                   R$                  nO,U R                  R&                  nU R                  R(                  nUR                  R+                  X5        UR                  R+                  X5        OU" U R                  SSS9U l        [-        UR
                  5      U l        [0        UR2                     U l        g ! , (       d  f       GN,= f)	Nr   )rm   paddinggroupsweight_normr   )modifier_rankweight)namer   parametrizations)rp   rq   r   ru   hidden_sizenum_conv_pos_embeddingsnum_conv_pos_embedding_groupsry   utilsr   r   r   r   	deepspeedzeroGatheredParametersr   	original0	original1weight_gweight_vregister_external_parameterSpeechT5SamePadLayerr   r
   rz   r{   )r}   r~   r   r   r   r   r   s         r,   rq   (SpeechT5PositionalConvEmbedding.__init__e  s   II6622a777
	 hh**288,,m<<((33??K%''224993C3CST2U'		aH	 Vtyy"4559955<<FF9955<<FF99--99--NN66tFNN66tF#DIIH!DDI+F,J,JK !?!?@ VUs   I
Ic                     UR                  SS5      nU R                  U5      nU R                  U5      nU R                  U5      nUR                  SS5      nU$ Nr   r   )r   ry   r   r{   r   s     r,   r   'SpeechT5PositionalConvEmbedding.forward  sV    %//15		-0]36%//15r.   )r{   ry   r   r   r   s   @r,   r   r   d  s    AB r.   r   c                   6   ^  \ rS rSrSrSU 4S jjrS rSrU =r$ ) SpeechT5ScaledPositionalEncodingi  uS   
Scaled positional encoding, see §3.2 in https://huggingface.co/papers/1809.08895
c           	        > [         R                  " X25      n[         R                  " SU5      R                  S5      n[         R                  " [         R                  " SUS[         R
                  S9R                  5       [        R                  " S5      U-  * -  5      n[         R                  " UR                  5       U-  5      US S 2SS S24'   [         R                  " UR                  5       U-  5      US S 2SS S24'   UR                  S5      n[        TU ]1  5         U R                  SUSS9  [        R                  " US	9U l        X l        [        R$                  " [         R&                  " S
5      5      U l        g )Nr   r   r   rF   g     @peFr   p      ?)r   rQ   rT   r   r   r   r   r   r   r   r   rp   rq   r   r   Dropoutdropoutr   	Parametertensoralpha)r}   r  r   max_lenr  positiondiv_termr   s          r,   rq   )SpeechT5ScaledPositionalEncoding.__init__  s   [[&<<7+55a899U\\!S!5;;GMMOTXT\T\]dTehkTkRllmii 08 ;<1add7ii 08 ;<1add7\\!_T2%8zzG,\\%,,s"34
r.   c                     XR                   U R                  S S 2S UR                  S5      24   -  -   nU R                  U5      nU$ )Nr   )r  r  r   r  )r}   r   s     r,   r   (SpeechT5ScaledPositionalEncoding.forward  s@    JJMchhqkM)9!:::ll3
r.   )r  r   r  )i  )	r   r   r   r   r   rq   r   r   r   r   s   @r,   r  r    s    5 r.   r  c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )"SpeechT5RelativePositionalEncodingi  c                    > [         TU ]  5         Xl        X l        [        R
                  R                  SU-  U5      U l        g r   )rp   rq   r   
max_lengthr   r   	Embeddingpe_k)r}   r   r  r   s      r,   rq   +SpeechT5RelativePositionalEncoding.__init__  s4    $HH&&q:~s;	r.   c                 t   UR                   S   n[        R                  " SU5      R                  UR                  [        R
                  S9nUS S 2S 4   US S S 24   -
  nU R                  * X3U R                  * :  '   U R                  S-
  X3U R                  :  '   X0R                  -   nU R                  U5      $ )Nr   r   r   rG   )r'   r   rT   r   r   r   r  r  )r}   r   r   pos_seqs       r,   r   *SpeechT5RelativePositionalEncoding.forward  s    %%a(,,q'*--]5I5IQVQ[Q[-\!T'"WT1W%55/3.>4??**+.2oo.A4??*+OO+yy!!r.   )r   r  r  )i  r   r   s   @r,   r  r    s    <	" 	"r.   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )r   i  c                 R   > [         TU ]  5         US-  S:X  a  SU l        g SU l        g )Nr   r   r   )rp   rq   num_pad_remove)r}   r   r   s     r,   rq   SpeechT5SamePadLayer.__init__  s)    #:Q#>!#Car.   c                 X    U R                   S:  a  US S 2S S 2S U R                   * 24   nU$ Nr   r"  r   s     r,   r   SpeechT5SamePadLayer.forward  s6    ")!Q0F43F3F2F0F*FGMr.   r&  r   r   s   @r,   r   r     s    K r.   r   c                   8   ^  \ rS rSrSrU 4S jrS rS rSrU =r	$ )SpeechT5FeatureEncoderi  z.Construct the features from raw audio waveformc           	        > [         TU ]  5         UR                  S:X  a@  [        USS9/[	        UR
                  S-
  5       Vs/ s H  n[        XS-   S9PM     sn-   nOVUR                  S:X  a-  [	        UR
                  5       Vs/ s H  n[        XS9PM     nnO[        SUR                   S35      e[        R                  " U5      U l        SU l        S	U l        g s  snf s  snf )
Ngroupr   )r   r   layerz`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)rp   rq   feat_extract_normr   rP   num_feat_extract_layersri   r   r)   r   
ModuleListconv_layersgradient_checkpointing_requires_grad)r}   r~   ir0  r   s       r,   rq   SpeechT5FeatureEncoder.__init__  s    ##w.5fqIJNSTZTrTruvTvNwNNw,V!eDNwN K %%0HMfNlNlHmHm1*6>Hm  K 01I1I0JJst  ==5&+#"Ns   C C%c                 N    U R                  5        H
  nSUl        M     SU l        g )NF)
parametersrequires_gradr2  )r}   params     r,   _freeze_parameters)SpeechT5FeatureEncoder._freeze_parameters  s#    __&E"'E '#r.   c                     US S 2S 4   nU R                   (       a  U R                  (       a  SUl        U R                   H  nU" U5      nM     U$ NT)r2  trainingr7  r0  )r}   r/   r   
conv_layers       r,   r   SpeechT5FeatureEncoder.forward  sK    $QW- 4==*.M'**J&}5M + r.   )r2  r0  r1  )
r   r   r   r   r   rq   r9  r   r   r   r   s   @r,   r)  r)    s    8#&$

 
r.   r)  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )SpeechT5FeatureProjectioni  c                 4  > [         TU ]  5         [        R                  " UR                  S   UR
                  S9U l        [        R                  " UR                  S   UR                  5      U l	        [        R                  " UR                  5      U l        g )Nr%   eps)rp   rq   r   r   rr   layer_norm_epsr   Linearr   
projectionr
  feat_proj_dropoutr  r}   r~   r   s     r,   rq   "SpeechT5FeatureProjection.__init__  sf    ,,vr':@U@UV))FOOB$79K9KLzz&":":;r.   c                 n    U R                  U5      nU R                  U5      nU R                  U5      nX4$ r   )r   rG  r  )r}   r   norm_hidden_statess      r,   r   !SpeechT5FeatureProjection.forward  s7    !__];(:;]300r.   )r  r   rG  r   r   s   @r,   rA  rA    s    <1 1r.   rA  c                   L  ^  \ rS rSrU 4S jrS r  SS\R                  S\\R                     S\\R                     4S jjrS\S\R                  4S	 jrS
\\R                  \4   4S jr  SS\R                  S\\R                     S\\R                     4S jjrSrU =r$ )SpeechT5SpeechEncoderPreneti  c                   > [         TU ]  5         Xl        [        U5      U l        [        U5      U l        UR                  S:  d  UR                  S:  aG  [        R                  " [        R                  " UR                  5      R                  5       5      U l        [!        U5      U l        [%        UR&                  UR(                  -   S-   UR                  UR(                  5      U l        g )Nr4   r   )rp   rq   r~   r)  feature_encoderrA  feature_projectionmask_time_probmask_feature_probr   r  r   r   r   uniform_masked_spec_embedr   pos_conv_embedr   max_speech_positionsr"   pos_sinusoidal_embedrI  s     r,   rq   $SpeechT5SpeechEncoderPrenet.__init__  s    5f=";F"C   3&&*B*BS*H%'\\%,,v?Q?Q2R2[2[2]%^D"=fE$I''&*=*==A%
!r.   c                 8    U R                   R                  5         g r   )rQ  r9  r}   s    r,   freeze_feature_encoder2SpeechT5SpeechEncoderPrenet.freeze_feature_encoder  s    //1r.   r/   r1   mask_time_indicesc                    U R                  U5      nUR                  SS5      nUb  U R                  UR                  S   U5      nU R	                  U5      u  pTU R                  XSUS9nU R                  U5      nXV-   nUb   UR                  S5      R                  5       nO;[        R                  " UR                  S S [        R                  UR                  S9nU R                  U5      nXX-   nXR4$ )Nr   r   )r_  r1   r   )rQ  r   "_get_feature_vector_attention_maskr'   rR  _mask_hidden_statesrW  r   r   r   rQ   r   rY  )	r}   r/   r1   r_  extract_featuresr   positional_conv_embeddingpadding_mask positional_sinusoidal_embeddingss	            r,   r   #SpeechT5SpeechEncoderPrenet.forward  s     //=+55a;%!DD &&q)N
 +/*A*ABR*S'00~ 1 
 %)$7$7$F!%A%),,Q/446L ;;}':':2A'>ejjYfYmYmnL+/+D+D\+R(%H,,r.   feature_vector_lengthc                    UR                  SS9S S 2S4   nU R                  U5      R                  [        R                  5      nUR
                  S   n[        R                  " XQ4UR                  UR                  S9nSU[        R                  " UR
                  S   UR                  S9US-
  4'   UR                  S/5      R                  S5      R                  S/5      R                  5       nU$ )Nr%   r   r   r   r   r   )r    _get_feat_extract_output_lengthsr   r   r   r'   rQ   rG   r   rT   fliprR   )r}   rh  r1   non_padded_lengthsoutput_lengthsr^   s         r,   ra  >SpeechT5SpeechEncoderPrenet._get_feature_vector_attention_mask9  s     ,22r2:1b5A>>?QRUUV[V`V`a#))!,
/~7K7KTbTiTi
 uv^%9%9!%<^EZEZ[]kno]opq',,bT299"=BBB4HMMOr.   r`   c                     S n[        U R                  R                  U R                  R                  5       H  u  p4U" XU5      nM     U$ )z8
Computes the output length of the convolutional layers
c                 8    [         R                  " X-
  USS9S-   $ )Nfloor)rounding_moder   )r   div)r@   rm   rn   s      r,   _conv_out_lengthVSpeechT5SpeechEncoderPrenet._get_feat_extract_output_lengths.<locals>._conv_out_lengthN  s      99\7wWZ[[[r.   )zipr~   rv   rw   )r}   r`   ru  rm   rn   s        r,   rk  <SpeechT5SpeechEncoderPrenet._get_feat_extract_output_lengthsI  sG    
	\
 $'t{{'>'>@W@W#XK,]PM $Y r.   r   c                    [        U R                  SS5      (       d  U$ UR                  5       u  pEnUb(  U R                  R	                  UR
                  5      X'   OU R                  R                  S:  a  U R                  (       a  [        XE4U R                  R                  U R                  R                  UU R                  R                  S9n[        R                  " X!R                  [        R                  S9nU R                  R	                  UR
                  5      X'   U R                  R                  S:  a  U R                  (       a  [        XF4U R                  R                  U R                  R                   U R                  R"                  S9n[        R                  " XqR                  [        R                  S9nUSS2S4   R%                  SUS5      nSX'   U$ )	z
Masks extracted features along time axis and/or along feature axis according to
[SpecAugment](https://huggingface.co/papers/1904.08779).
apply_spec_augmentTNr   )r7   r8   r1   r9   r  )r7   r8   r9   r%   )getattrr~   r   rV  r   rG   rS  r=  rg   mask_time_lengthmask_time_min_masksr   r  r   rR   rT  mask_feature_lengthmask_feature_min_masksexpand)r}   r   r_  r1   r^   rC   r   mask_feature_indicess           r,   rb  /SpeechT5SpeechEncoderPrenet._mask_hidden_statesY  s    t{{$8$??   4A3E3E3G0
[(/3/E/E/H/HI\I\/]M,[[''!+ 5-++44 KK88-++99! !&->G[G[chcmcm n/3/E/E/H/HI\I\/]M,;;((1,#8)++77 KK;;++<<	$  $)<<0DMaMainisis#t #74#@#G#GO]_#` 23M/r.   )r~   rQ  rR  rV  rW  rY  NN)r   r   r   r   rq   r]  r   r   r   
LongTensorFloatTensorr   r>   ra  r   rk  rb  r   r   r   s   @r,   rO  rO    s    
"2 6:9=	 -ll - !!1!12 - $E$5$56	 -F ]b]m]m  eEDTDTVYDY>Z & :>59	,((, $E$5$56, !!1!12	, ,r.   rO  c                   t   ^  \ rS rSrU 4S jrS r SS\R                  S\\R                     4S jjr	Sr
U =r$ )	SpeechT5SpeechDecoderPreneti  c           	      f  > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H@  n[        R                  " US:X  a  UR                  OUR                  UR                  5      PMB     sn5      U l
        [        R                  " UR                  UR                  5      U l        [        UR                  UR                  UR                  5      U l        [        R                  " UR"                  UR                  -   UR                  5      U l        g s  snf r%  )rp   rq   r~   r   r/  rP   speech_decoder_prenet_layersrF  num_mel_binsspeech_decoder_prenet_unitslayersr   final_layerr  positional_dropoutrX  encode_positionsspeaker_embedding_dimspeaker_embeds_layerr}   r~   r3  r   s      r,   rq   $SpeechT5SpeechDecoderPrenet.__init__  s    mm vBBC
 DA	 		+,6F''v7Y7Y66 D
 99V%G%GI[I[\ @%%''!

 %'IIf.J.JVM_M_._agasas$t!s   AD.c                     [         R                  " US   US9nUR                  S5      R                  UR	                  S5      SS5      n[         R
                  " US:H  US5      S-  SU-
  -  $ )Nr   r  r   )r   	bernoullir   repeatr   where)r}   inputs_embedsr  r   	all_maskss        r,   _consistent_dropout/SpeechT5SpeechDecoderPrenet._consistent_dropout  sd    }Q/15NN1%,,]-?-?-BAqI	{{9>=!<q@AEJJr.   r/   speaker_embeddingsc                 6   UnU R                    HM  n[        R                  R                  U" U5      5      nU R	                  X0R
                  R                  5      nMO     U R                  U5      nU R                  U5      nUb  [        R                  R                  U5      nUR                  S5      R                  SUR                  S5      S5      n[        R                  " X2/SS9n[        R                  R                  U R                  U5      5      nU$ )Nr   r%   r   )r  r   
functionalrelur  r~   speech_decoder_prenet_dropoutr  r  	normalizer   r  r   r   r   r  )r}   r/   r  r  r,  s        r,   r   #SpeechT5SpeechDecoderPrenet.forward  s     %[[EMM..u]/CDM 44]KKDmDmnM ! ((7--m<)!#!8!89K!L!3!=!=a!@!G!GML^L^_`Lace!f!II}&IrRMMM..t/H/H/WXMr.   )r~   r  r  r  r  r   )r   r   r   r   rq   r  r   r   r   r   r   r   r   s   @r,   r  r    s=    u,K 6:ll %U\\2 r.   r  c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )SpeechT5BatchNormConvLayeri  c           	        > [         TU ]  5         US:X  a  UR                  nOUR                  nX!R                  S-
  :X  a  UR                  nOUR                  n[
        R                  " UUUR                  SUR                  S-
  S-  SS9U l        [
        R                  " U5      U l
        X!R                  S-
  :  a  [
        R                  " 5       U l        OS U l        [
        R                  " UR                  5      U l        g )Nr   r   r   F)rm   rn   r   ro   )rp   rq   r  speech_decoder_postnet_unitsspeech_decoder_postnet_layersr   ru   speech_decoder_postnet_kernelry   BatchNorm1d
batch_normTanhr{   r
  speech_decoder_postnet_dropoutr  )r}   r~   r   rs   rt   r   s        r,   rq   #SpeechT5BatchNormConvLayer.__init__  s    q= --K ==K;;a??!..L!>>LII<<99A=!C
	 ..6::Q>> ggiDO"DOzz&"G"GHr.   c                     U R                  U5      nU R                  U5      nU R                  b  U R                  U5      nU R                  U5      nU$ r   )ry   r  r{   r  r   s     r,   r   "SpeechT5BatchNormConvLayer.forward  sJ    		-06??& OOM:M]3r.   )r{   r  ry   r  r   r   r   s   @r,   r  r    s    I< r.   r  c                   l   ^  \ rS rSrU 4S jrS\R                  4S jrS\R                  4S jrSr	U =r
$ )SpeechT5SpeechDecoderPostneti  c           	        > [         TU ]  5         Xl        [        R                  " UR
                  UR                  UR                  -  5      U l        [        R                  " UR
                  UR                  5      U l	        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        g s  snf r   )rp   rq   r~   r   rF  r   r  r0   feat_outprob_outr/  rP   r  r  r  r  s      r,   rq   %SpeechT5SpeechDecoderPostnet.__init__  s    		&"4"4f6I6IFLcLc6cd		&"4"4f6M6MNmm<A&BfBf<gh<gq'2<gh
hs   *Cr   c                    U R                  U5      R                  UR                  S5      SU R                  R                  5      nU R                  U5      nU R                  U5      R                  UR                  S5      S5      nX#U4$ )Nr   r%   )r  r   r   r~   r  postnetr  )r}   r   outputs_before_postnetoutputs_after_postnetlogitss        r,   r   $SpeechT5SpeechDecoderPostnet.forward  s{    !%}!=!B!B=CUCUVWCXZ\^b^i^i^v^v!w $-C D}-22=3E3Ea3H"M%fDDr.   c                     UR                  SS5      nU R                   H  nU" U5      nM     XR                  SS5      -   $ r  )r   r  )r}   r   layer_outputr,  s       r,   r  $SpeechT5SpeechDecoderPostnet.postnet  sB    $..q!4[[E .L !55a;;;r.   )r~   r  r  r  )r   r   r   r   rq   r   r   r   r  r   r   r   s   @r,   r  r    s/    	
EU\\ E<U\\ < <r.   r  c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )SpeechT5TextEncoderPreneti  c                   > [         TU ]  5         Xl        [        R                  " UR
                  UR                  UR                  5      U l        [        UR                  UR                  UR                  5      U l        g r   )rp   rq   r~   r   r  
vocab_sizer   r"   embed_tokensr  r  max_text_positionsr  rI  s     r,   rq   "SpeechT5TextEncoderPrenet.__init__  sc    LL):):F<N<NPVPcPcd @%%%%!
r.   r!   c                 J    U R                  U5      nU R                  U5      nU$ r   )r  r  )r}   r!   r  s      r,   r   !SpeechT5TextEncoderPrenet.forward  s(    )))4--m<r.   )r~   r  r  )
r   r   r   r   rq   r   r   r   r   r   r   s   @r,   r  r    s    
  r.   r  c                   z   ^  \ rS rSrU 4S jr  SS\R                  S\\R                     S\\	   4S jjr
SrU =r$ )	SpeechT5TextDecoderPreneti  c                   > [         TU ]  5         Xl        [        R                  " UR
                  5      U l        UR                  (       a   [        R                  " UR                  5      OSU l        [        R                  " UR                  UR                  UR                  5      U l        [!        UR"                  UR                  -   S-   UR                  UR                  5      U l        g )Nr	  r   )rp   rq   r~   r   r
  r  r  scale_embeddingr   sqrtr   embed_scaler  r  r"   r  r   r  embed_positionsrI  s     r,   rq   "SpeechT5TextDecoderPrenet.__init__  s    zz&";";<<B<R<R499V%7%78X[LL):):F<N<NPVPcPcdD%%(;(;;a? 
r.   r!   r1   past_key_valuesc                 v   Ub&  UR                  5       nUR                  SUS   5      nO[        S5      eSnUb:  [        U[        5      (       d  US   S   R
                  S   OUR                  5       nU R                  X5      nU R                  U5      U R                  -  nXv-  nU R                  U5      nXr4$ )Nr%   z'You have to specify `decoder_input_ids`r   r   )r   r   r)   
isinstancer   r'   get_seq_lengthr  r  r  r  )r}   r!   r1   r  input_shaper   	positionsr  s           r,   r   !SpeechT5TextDecoderPrenet.forward  s      #..*K!r;r?;IFGG!"& "/599  "1%++B/$335 # ((K	)))4t7G7GG"]3,,r.   )r~   r  r  r  r  r  )r   r   r   r   rq   r   r   r   r  r   r   r   r   r   s   @r,   r  r    sI    
" 6:+/	-<<- !!1!12- "%	- -r.   r  c                   V   ^  \ rS rSrU 4S jrS\R                  4S jrS rS r	Sr
U =r$ )SpeechT5TextDecoderPostneti:  c                    > [         TU ]  5         Xl        [        R                  " UR
                  UR                  SS9U l        g )NFro   )rp   rq   r~   r   rF  r   r  lm_headrI  s     r,   rq   #SpeechT5TextDecoderPostnet.__init__;  s3    yy!3!3V5F5FUSr.   r   c                 $    U R                  U5      $ r   r  r   s     r,   r   "SpeechT5TextDecoderPostnet.forward@  s    ||M**r.   c                     U R                   $ r   r  r\  s    r,   get_output_embeddings0SpeechT5TextDecoderPostnet.get_output_embeddingsC  s     ||r.   c                     Xl         g r   r  r}   new_embeddingss     r,   set_output_embeddings0SpeechT5TextDecoderPostnet.set_output_embeddingsH  s    %r.   )r~   r  )r   r   r   r   rq   r   r   r   r  r  r   r   r   s   @r,   r  r  :  s(    T
+U\\ +
& &r.   r  c                     ^  \ rS rSrSr    SS\S\S\\   S\\   S\\   S\\   4U 4S	 jjjr	\
" S
SSS9       SS\R                  S\\R                     S\\   S\\R                     S\\R                     S\\R                     S\S\\R                     S\\R                  \\R                     \\   4   4S jj5       rSrU =r$ )SpeechT5AttentioniL  z
Multi-headed attention from 'Attention Is All You Need' paper with relative position bias (see
https://aclanthology.org/N18-2074.pdf)
	embed_dim	num_headsr  
is_decoderro   	layer_idxc                   > [         TU ]  5         Xl        X l        X0l        X-  U l        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eU R
                  S-  U l        X@l        X`l	        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      r  )rp   rq   r  r  r  head_dimr)   scalingr  r  r   rF  k_projv_projq_projout_proj)r}   r  r  r  r  ro   r  r   s          r,   rq   SpeechT5Attention.__init__R  s     	""!.MMI%$..8MdnnM]$YKr3  }}d*$"ii	4@ii	4@ii	4@		)TBr.   past_key_valuer  4.58new_nameversionr   key_value_statesr1   layer_head_maskposition_biasoutput_attentionscache_positionr:   c	                    USLn	UR                  5       u  pnU R                  U5      U R                  -  nSnUb]  [        U[        5      (       aF  UR
                  R                  U R                  5      nU	(       a  UR                  nOUR                  nOUnU	(       a  UOUnU	(       aQ  UbN  U(       aG  WR                  U R                     R                  nUR                  U R                     R                  nOU R                  U5      nU R                  U5      nUR                  U
SU R                   U R"                  5      R%                  SS5      nUR                  U
SU R                   U R"                  5      R%                  SS5      nUbc  U	(       d  UOSnWR'                  UUU R                  SU05      u  nnU	(       a.  [        U[        5      (       a  SUR
                  U R                  '   XR                   -  SU R"                  4nUR                  XU R                   U R"                  5      R%                  SS5      nUR(                  " U6 nUR(                  " U6 nUR(                  " U6 nUR                  S5      n[*        R,                  " UUR%                  SS5      5      nUR                  5       XR                   -  UU4:w  a.  [/        SXR                   -  UU4 S	UR                  5        35      eUb  UR1                  5       R                  XR                   -  SU R"                  5      R%                  S
S5      n[*        R2                  " UUR%                  SS5      5      nUR%                  S
S5      R                  XR                   -  UR                  S
5      UR                  S5      5      nUU-  nUbz  UR                  5       U
SUU4:w  a#  [/        SU
SUU4 S	UR                  5        35      eUR                  XR                   UU5      U-   nUR                  XR                   -  UU5      n[4        R6                  R9                  USS9nUb  UR                  5       U R                   4:w  a*  [/        SU R                   4 S	UR                  5        35      eUR                  SSSS5      UR                  XR                   UU5      -  nUR                  XR                   -  UU5      nU(       a=  UR                  XR                   UU5      nUR                  XR                   -  UU5      nOSn[4        R6                  R;                  UU R:                  U R<                  S9n[*        R,                  " UU5      nUR                  5       XR                   -  XR"                  4:w  a5  [/        SXR                   XR"                  4 S	UR                  5        35      eUR                  XR                   XR"                  5      nUR%                  SS5      nUR)                  XU R>                  5      nU RA                  U5      nUU4$ )z#Input shape: Batch x Time x ChannelNFr%   r   r   r  Tz$Attention weights should be of size z	, but is r   r   z!Attention mask should be of size r   z/Head mask for a single layer should be of size )r  r=  z `attn_output` should be of size )!r   r  r  r  r   
is_updatedgetr  cross_attention_cacheself_attention_cacher  keysvaluesr  r  r   r  r  r   updater\   r   bmmr)   
contiguousmatmulr   r  softmaxr  r=  r  r  )r}   r   r  r  r1   r  r  r  r  is_cross_attentionr   tgt_lenr_   query_statesr  curr_past_key_valuecurrent_states
key_statesvalue_states
proj_shapesrc_lenattn_weights	reshape_qrel_pos_biasattn_weights_reshaped
attn_probsattn_outputs                              r,   r   SpeechT5Attention.forwardo  s     .T9',,.a {{=1DLL@
&/+>??,77;;DNNK
%*9*O*O'*9*N*N'&5#-?)]/"=*,33DNNCHHJ.55dnnELLL^4J;;~6L#b$..$--PZZ[\^_`J',,S"dnndmmT^^_`bcdL*7It+>+E+Ednn?OQ_>`,(
L &*_FY*Z*ZAEO..t~~>NN*B>
#((t~~t}}U__`acde#++Z8''4
#++Z8//!$yyz/C/CAq/IJ3#7'"JJ6nn8LgW^7_6` a %%'(*  $$//166s^^7KRQUQ^Q^_iijkmnoI <<	=3J3J2r3RSL'11!Q7<<nn$m&8&8&;]=O=OPQ=RL L(L%""$a'(BB 7a'8R7SS\]k]p]p]r\st  (,,S..'7SVddL',,S>>-A7GTL}},,\r,B&##%$..):: Et~~FWEX Y',,./1  +//2q!<|?P?PQTVdVdfmov?wwL',,S>>-A7GTL
 %1$5$5c>>7T[$\!055cNN6JGU\]L$(!]]**<4<<RVR_R_*`
ii
L9#"6!OO2CR_R_3`2a b$$&') 
 "&&sNNG]]S!++Aq1 "))#GmmK0111r.   )r  r  r  r  r  r  r  r  r  r  r  )r4   FTN)NNNNNFN)r   r   r   r   r   r>   r   r   rR   rq   r   r   r   r   tupler   r   r   r   s   @r,   r  r  L  s^    $'%*#$(CC C %	C
 TNC tnC D>C C: %0A6R 48+/152604"'152||2 #5<<02 "%	2
 !.2 "%,,/2  -2  2 !.2 
u||Xell3Xe_D	E2 S2r.   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )SpeechT5FeedForwardi  c                   > [         TU ]  5         [        R                  " UR                  5      U l        [        R                  " UR                  U5      U l        [        UR                  [        5      (       a  [        UR                     U l        OUR                  U l        [        R                  " X!R                  5      U l        [        R                  " UR                  5      U l        g r   )rp   rq   r   r
  activation_dropoutintermediate_dropoutrF  r   intermediate_denser  
hidden_actstrr
   intermediate_act_fnoutput_densehidden_dropoutoutput_dropout)r}   r~   intermediate_sizer   s      r,   rq   SpeechT5FeedForward.__init__  s    $&JJv/H/H$I!"$))F,>,>@Q"Rf''--'-f.?.?'@D$'-'8'8D$II&79K9KL jj)>)>?r.   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU$ r   )r"  r%  r!  r&  r(  r   s     r,   r   SpeechT5FeedForward.forward   sX    //>00?11-@))-8++M:r.   )r%  r"  r!  r&  r(  r   r   s   @r,   r  r    s    @ r.   r  c                      ^  \ rS rSrS\4U 4S jjr    SS\R                  S\\R                     S\\R                     S\\R                     S\	4
S	 jjr
S
rU =r$ )SpeechT5EncoderLayeri
  r~   c                   > [         TU ]  5         [        UR                  UR                  UR
                  SS9U l        [        R                  " UR                  5      U l
        [        R                  " UR                  UR                  S9U l        [        XR                  5      U l        [        R                  " UR                  UR                  S9U l        g )NF)r  r  r  r  rC  )rp   rq   r  r   encoder_attention_headsattention_dropout	attentionr   r
  r'  r  r   rE  r   r  encoder_ffn_dimfeed_forwardfinal_layer_normrI  s     r,   rq   SpeechT5EncoderLayer.__init__  s    *((44,,	
 zz&"7"78,,v'9'9v?T?TU/8N8NO "V-?-?VEZEZ [r.   r   r1   r  r  r  c                     UnU R                  UUUUUS9u  pU R                  U5      nXa-   nU R                  U5      nXR                  U5      -   nU R	                  U5      nU4nU(       a  X4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`):
        input to the layer of shape `(batch, seq_len, hidden_size)`
    attention_mask (`torch.FloatTensor`):
        attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
        large negative values.
    layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
        `(config.encoder_attention_heads,)`.
    position_bias (`torch.FloatTensor`):
        relative position embeddings of size `(seq_len, seq_len, hidden_size // encoder_attention_heads)`
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)r   r1   r  r  r  )r2  r  r   r4  r5  )	r}   r   r1   r  r  r  residualr  outputss	            r,   r   SpeechT5EncoderLayer.forward  s    . !&*nn')+'/ '5 '
# ]3 06%(9(9-(HH--m< "&Gr.   )r2  r  r4  r5  r   )NNNF)r   r   r   r   r   rq   r   r   r   rR   r   r   r   r   s   @r,   r.  r.  
  sx    \~ \  262604"',||, !., "%,,/	,
  -,  , ,r.   r.  c                   T  ^  \ rS rSrSS\4U 4S jjjr\" SSSS9         SS\R                  S	\	\R                     S
\	\R                     S\	\R                     S\	\R                     S\	\R                     S\	\
   S\	\   S\	\   S\	\R                     4S jj5       rSrU =r$ )SpeechT5DecoderLayeriG  r~   c                 t  > [         TU ]  5         [        UR                  UR                  UR
                  SUS9U l        [        R                  " UR                  5      U l
        [        R                  " UR                  UR                  S9U l        [        UR                  UR                  UR
                  SUS9U l        [        R                  " UR                  UR                  S9U l        [!        XR"                  5      U l        [        R                  " UR                  UR                  S9U l        g )NT)r  r  r  r  r  rC  )r  r  r  )rp   rq   r  r   decoder_attention_headsr1  	self_attnr   r
  r'  r  r   rE  self_attn_layer_normencoder_attnencoder_attn_layer_normr  decoder_ffn_dimr4  r5  )r}   r~   r  r   s      r,   rq   SpeechT5DecoderLayer.__init__H  s    *((44,,
 zz&"7"78$&LL1C1CI^I^$_!-**,,
 (*||F4F4FFLaLa'b$/8N8NO "V-?-?VEZEZ [r.   r  r  r  r  r   r1   encoder_hidden_statesencoder_attention_maskr  cross_attn_layer_head_maskr  	use_cacher  c           
      r   UnU R                  UUUUUU
S9u  pU R                  U5      nX-   nU R                  U5      nSnUb?  UnU R                  UUUUUUU
S9u  pU R                  U5      nX-   nU R	                  U5      nXR                  U5      -   nU R                  U5      nU4nU(       a  XU4-  nU$ )ay  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, hidden_size)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    encoder_hidden_states (`torch.FloatTensor`):
        cross attention input to the layer of shape `(batch, seq_len, hidden_size)`
    encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
        `(encoder_attention_heads,)`.
    cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
        size `(decoder_attention_heads,)`.
    past_key_values (`Cache`): cached past key and value projection states
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)r   r  r1   r  r  r  N)r   r  r1   r  r  r  r  )r?  r  r@  rA  rB  r4  r5  )r}   r   r1   rE  rF  r  rG  r  r  rH  r  r8  self_attn_weightscross_attn_weightsr9  s                  r,   r   SpeechT5DecoderLayer.forward`  s   @ ! ,0>>'+)+/) ,: ,
( ]3 011-@ " ,$H040A0A+!65 : /"3- 1B 1-M !LL7M$4M 88GM &(9(9-(HH--m< "+=>>Gr.   )r  rA  rB  r4  r5  r?  r@  r   )	NNNNNNFTN)r   r   r   r   r   rq   r   r   r   r   r   rR   r   r   r   r   s   @r,   r<  r<  G  s   \~ \ \0 %0A6R 268<9=26=A+/,1$(15I||I !.I  (5	I
 !) 6I "%,,/I %-U\\$:I "%I $D>I D>I !.I SIr.   r<  c                   N    \ rS rSr% \\S'   SrSrSrS\	R                  4S jrSrg	)
SpeechT5PreTrainedModeli  r~   speecht5r/   Tmodulec           
      |   U R                   R                  n[        U[        5      (       a  [        R
                  R                  UR                  R                  SS[        R                  " SUR                  R                  S   UR                  R                  -  -  5      -  S9  [        R
                  R                  UR                  R                  S5        GO%[        U[        5      (       a'  UR                   R"                  R%                  S5        GO[        U[&        5      (       a  [        R                  " SUR(                  R*                  -  5      n[        R
                  R-                  UR(                  R                  U* US9  [        R
                  R-                  UR(                  R                  U* US9  GO=[        U[        R.                  5      (       aW  UR                  R"                  R                  SUS9  UR                  b$  UR                  R"                  R1                  5         GO[        U[        R2                  [        R4                  [        R6                  45      (       aK  UR                  R"                  R1                  5         UR                  R"                  R%                  S5        GO>[        U[        R8                  5      (       a  [        R
                  R;                  UR                  5        UR                  bg  [        R                  " UR<                  UR                  UR                  S   -  -  5      n[        R
                  R-                  UR                  U* US9  O[        U[        R>                  5      (       ab  UR                  R"                  R                  SUS9  UR@                  b1  UR                  R"                  UR@                     R1                  5         [C        US	5      (       a*  [        R
                  R-                  URD                  5        gg)
zInitialize the weightsr   r   r   meanstdr	  )abr4   NrV  )#r~   initializer_ranger  r   r   initnormal_ry   r   r   r  rm   in_channels	constant_ro   r  r  datafill_rA  rG  in_featuresrU  rF  zero_r   r   r  ru   kaiming_normal_r   r  r   r   rV  )r}   rP  rT  ks       r,   _init_weights%SpeechT5PreTrainedModel._init_weights  s   kk++f=>>GGOO""		!v{{'>'>q'AFKKD[D['["\]]  
 GGfkk..2 @AALL##C( 9::		!f//;;;<AGGV..55!qAGGV..33rQ?		**MM&&CS&9{{&  &&(r||R^^ LMMKK""$MM$$S)		**GG##FMM2{{&IIfmmv/A/AFDVDVWXDY/YZ[  a 8--MM&&CS&9!!-""6#5#56<<>6.//GGV556 0r.    N)r   r   r   r   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointingr   Modulerb  r   rd  r.   r,   rN  rN    s)    "$O&*#"7BII "7r.   rN  c                      ^  \ rS rSrSrS\4U 4S jjr     SS\R                  S\	\R                     S\	\R                     S\	\   S	\	\   S
\	\   S\\\4   4S jjrSrU =r$ )SpeechT5Encoderi  zm
Transformer encoder consisting of *config.encoder_layers* layers. Each layer is a [`SpeechT5EncoderLayer`].
r~   c                   > [         TU ]  U5        [        R                  " UR                  UR
                  S9U l        [        R                  " UR                  5      U l	        UR                  U l        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        [#        UR                  UR$                  -  UR&                  5      U l        SU l        U R-                  5         g s  snf )NrC  F)rp   rq   r   r   r   rE  r   r
  r'  r  encoder_layerdrop	layerdropr/  rP   encoder_layersr.  r  r  r0  encoder_max_relative_positionr  r1  	post_init)r}   r~   r_   r   s      r,   rq   SpeechT5Encoder.__init__  s     ,,v'9'9v?T?TUzz&"7"7811mm5QWQfQfKg$hKga%9&%AKg$hiA&"@"@@&BfBf 
 ',# 	 %is   Dr   r1   	head_maskr  output_hidden_statesreturn_dictr:   c           	         Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  [	        X!R
                  5      nU R                  U5      nU R                  U5      nU R                  U5      n[        5       =(       d    [        U 5      nU(       a  SOSn	U(       a  SOSn
Ub`  UR                  5       S   [        U R                  5      :w  a6  [        S[        U R                  5       SUR                  5       S    S35      e[        U R                  5       H  u  pU(       a  X4-   n	SnU R                   (       a$  ["        R$                  " / 5      nXR&                  :  nU(       a  U(       a  U" UUUUb  X;   OSUS9nUS   nU(       a  S	nU(       d  M~  U
WS
   4-   n
M     U(       a  X4-   n	U(       d  [)        S XU
4 5       5      $ [+        UU	U
S9$ )aA  
Args:
    hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
        Features extracted from the speech or text input by the encoder prenet.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
        `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
        Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

        - 1 indicates the head is **not masked**,
        - 0 indicates the head is **masked**.

    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
Nrd  r   z&The head_mask should be specified for  layers, but it is for .F)r1   r  r  r  r  r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   rd  .0vs     r,   	<genexpr>*SpeechT5Encoder.forward.<locals>.<genexpr>O  s     m$[q$[s   	last_hidden_stater   
attentions)r~   r  rt  use_return_dictr   rG   r   r  r  r   r   r   rU   r  r)   	enumerater=  r   rK   rn  r  r   )r}   r   r1   rs  r  rt  ru  r  synced_gpusall_hidden_statesall_self_attentionsidxencoder_layerskip_the_layerdropout_probabilitylayer_outputss                   r,   r   SpeechT5Encoder.forward  s   H 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] %7H[H[\N6]3,,];02R6LT6R"6BD$5b4  ~~"c$++&66 <S=M<N O!(+,A/ 
 #,DKK"8C#$58H$H! #N}}&+jjn#!4~~!E![ -!#1"/7@7LY^RV&7! !.a 0 ,  &9]1=M<O&O#3 #96   14D Dm]GZ$[mmm++*
 	
r.   )r  r  r1  r   rn  r  NNNNNr   r   r   r   r   r   rq   r   r  r   r   rR   r   r  r   r   r   r   r   s   @r,   rk  rk    s    ~ ( 26,0,0/3&*f
((f
 !.f
 ELL)	f

 $D>f
 'tnf
 d^f
 
uo%	&f
 f
r.   rk  c                      ^  \ rS rSrSrS\4U 4S jjr     SS\R                  S\	\R                     S\	\R                     S\	\   S	\	\   S
\	\   S\\\4   4S jjrSrU =r$ )SpeechT5EncoderWithSpeechPrenetiX  z
Wrapper around SpeechT5Encoder that applies SpeechT5SpeechEncoderPrenet to convert the audio waveform data to
hidden features.
r~   c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r   )rp   rq   rO  prenetrk  wrapped_encoderrq  rI  s     r,   rq   (SpeechT5EncoderWithSpeechPrenet.__init__^  5     1&9.v6 	r.   r/   r1   rs  r  rt  ru  r:   c           	      T    U R                  X5      u  prU R                  UUUUUUS9nU$ N)r   r1   rs  r  rt  ru  r  r  	r}   r/   r1   rs  r  rt  ru  r   r9  s	            r,   r   'SpeechT5EncoderWithSpeechPrenet.forwardf  sC     )-L(Q%&&')/!5# ' 
 r.   r  r  r  r   s   @r,   r  r  X  s    
~  26,0,0/3&*'' !. ELL)	
 $D> 'tn d^ 
uo%	& r.   r  c                      ^  \ rS rSrSrS\4U 4S jjrS rS r     SS\	R                  S\\	R                     S	\\	R                     S
\\   S\\   S\\   S\\\4   4S jjrSrU =r$ )SpeechT5EncoderWithTextPreneti}  zt
Wrapper around SpeechT5Encoder that applies SpeechT5TextEncoderPrenet to convert the input_ids to hidden features.
r~   c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r   )rp   rq   r  r  rk  r  rq  rI  s     r,   rq   &SpeechT5EncoderWithTextPrenet.__init__  5     /7.v6 	r.   c                 6    U R                   R                  5       $ r   r  get_input_embeddingsr\  s    r,   r  2SpeechT5EncoderWithTextPrenet.get_input_embeddings      {{//11r.   c                 :    U R                   R                  U5        g r   r  set_input_embeddingsr}   values     r,   r  2SpeechT5EncoderWithTextPrenet.set_input_embeddings      ((/r.   r/   r1   rs  r  rt  ru  r:   c           	      P    U R                  U5      nU R                  UUUUUUS9nU$ r  r  r  s	            r,   r   %SpeechT5EncoderWithTextPrenet.forward  s@     L1&&')/!5# ' 
 r.   r  r  )r   r   r   r   r   r   rq   r  r  r   r  r   r   rR   r   r  r   r   r   r   r   s   @r,   r  r  }  s    ~ 20 26,0,0/3&*'' !. ELL)	
 $D> 'tn d^ 
uo%	& r.   r  c                      ^  \ rS rSrSrS\4U 4S jjr     SS\R                  S\	\R                     S\	\R                     S\	\   S	\	\   S
\	\   S\\\4   4S jjrSrU =r$ )SpeechT5EncoderWithoutPreneti  
This wrapper class is a helper class to correctly load pretrained checkpoints when used in combination with
[`SpeechT5Model`].
r~   c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r   )rp   rq   rk  r  rq  rI  s     r,   rq   %SpeechT5EncoderWithoutPrenet.__init__  )     .v6 	r.   r/   r1   rs  r  rt  ru  r:   c           	      *    U R                  UUUUUUS9$ r  r  )r}   r/   r1   rs  r  rt  ru  s          r,   r   $SpeechT5EncoderWithoutPrenet.forward  s.     ##&)/!5# $ 
 	
r.   r  r  r  r   s   @r,   r  r    s    
~  26,0,0/3&*
''
 !.
 ELL)	

 $D>
 'tn
 d^
 
uo%	&
 
r.   r  c                   l  ^  \ rS rSrSrS\4U 4S jjr            SS\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\   S\\   S\\   S\\   S\\   S\\R                     S\\\4   4S jjrSrU =r$ )SpeechT5Decoderi  zl
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`SpeechT5DecoderLayer`]
r~   c           
      
  > [         TU ]  U5        UR                  U l        [        R
                  " [        UR                  5       Vs/ s H  n[        XS9PM     sn5      U l	        SU l
        U R                  5         g s  snf )N)r  F)rp   rq   decoder_layerdroprn  r   r/  rP   decoder_layersr<  r  r1  rq  r  s      r,   rq   SpeechT5Decoder.__init__  sl     11mmX]^d^s^sXt$uXtST%9&%NXt$uv&+# 	 %vs   	B r   r1   rE  rF  rs  cross_attn_head_maskr  rH  r  rt  ru  r  r:   c                 Z   U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nUR                  5       SS nU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnU(       a1  Uc.  [        [        U R                   S9[        U R                   S95      nU(       a@  [        U[        5      (       a+  [        R                  S5        [        R                  " U5      nUb  UR                  5       OSn[!        X-X5      nUb  Ub  [#        XAR$                  US   S9n['        5       =(       d    [)        U 5      nU
(       a  S	OSnU	(       a  S	OSnU	(       a  Ub  S	OSn[+        XV/S
S/5       Hn  u  nnUc  M  UR                  5       S   [-        U R.                  5      :w  d  M7  [1        SU S[-        U R.                  5       SUR                  5       S    S35      e   [3        U R.                  5       H  u  nnU
(       a  UU4-   nSnU R                  (       a%  [4        R6                  " / 5      nUU R8                  :  nU(       a	  U(       d  M[  U" UUUUUb  UU   OSUb  UU   OSUU	UUS9
nUS   nU	(       d  M  UUS   4-   nUc  M  UUS   4-   nM     U
(       a  UU4-   nU(       d  [        S XUUU4 5       5      $ [;        UUUUUS9$ )a  
Args:
    hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
        Features extracted from the speech or text input by the decoder prenet.
    attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
        of the decoder.
    encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
        Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
        selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
        Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

        - 1 indicates the head is **not masked**,
        - 0 indicates the head is **masked**.

    cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
        Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
        cross-attention on hidden heads. Mask values selected in `[0, 1]`:

        - 1 indicates the head is **not masked**,
        - 0 indicates the head is **masked**.

    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
        cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

        If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
        that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
        all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
Nr%   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r~   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   )r  rd  rs  r  zThe `z` should be specified for rw  rx  )rF  r  rG  r  r  rH  r  r   r   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   rd  rz  s     r,   r}  *SpeechT5Decoder.forward.<locals>.<genexpr>y  s      wA ws   	)r  r  r   r  cross_attentions)r~   r  rt  rH  r  r   r1  r=  loggerwarning_oncer   r   r  r  from_legacy_cacher  r   r   rG   r   r   rw  rU   r  r)   r  r   rK   rn  r   )r}   r   r1   rE  rF  rs  r  r  rH  r  rt  ru  r  r  r   r  r  r  all_cross_attentions	attn_mask	mask_namer  decoder_layerr  r  r  s                             r,   r   SpeechT5Decoder.forward  sR   P 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]#((*3B/&&4==##p "	01,dkk2RT`hlhshsTtuOOU;;\
 2CCOTOETE`!?!?!Afg:

 !,1G1S%?&(;(;[QS_&" 12R6LT6R #7BD$5b4&7<Q<]rdh %((IKYoKp$q Iy$>>#A&3t{{+;<$	{*DSEUDV W%NN,Q/03  %r #,DKK"8C#$58H$H! #N}}&+jjn#!4t~~!Ek)%'=3<3H3dI]Ii,@,Eos /"3#-M *!,M  &9]1=M<O&O#(4+?=QRCSBU+U(; #9>   1]4D D ':KM`bvw   9+++*1
 	
r.   )r1  rn  r  NNNNNNNNNNNNr   r   r   r   r   r   rq   r   r   r  r  r   r   rR   r   r  r   r   r   r   r   s   @r,   r  r    s9   	~ 	 6:59=A=A,07;+/$(,0/3&*15n
 1 12n
 !!1!12n
  ((9(9:	n

 !))9)9 :n
 ELL)n
 'u||4n
 "%n
 D>n
 $D>n
 'tnn
 d^n
 !.n
 
u??	@n
 n
r.   r  c                     ^  \ rS rSrSrS\4U 4S jjr             SS\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\   S\\R                     S\\\4   4S jjrSrU =r$ )SpeechT5DecoderWithSpeechPreneti  z|
Wrapper around SpeechT5Decoder that applies SpeechT5SpeechDecoderPrenet to convert log-mel filterbanks to hidden
features.
r~   c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r   )rp   rq   r  r  r  wrapped_decoderrq  rI  s     r,   rq   (SpeechT5DecoderWithSpeechPrenet.__init__  r  r.   r/   r1   rE  rF  r  rs  r  r  rH  r  rt  ru  r  r:   c                 \    U R                  X5      nU R                  UUUUUUUU	U
UUUS9nU$ N)r   r1   rE  rF  rs  r  r  rH  r  rt  ru  r  r  r  )r}   r/   r1   rE  rF  r  rs  r  r  rH  r  rt  ru  r  decoder_hidden_statesr9  s                   r,   r   'SpeechT5DecoderWithSpeechPrenet.forward  sS      !%L M&&/)"7#9!5+/!5#) ' 
 r.   r  )NNNNNNNNNNNNNr  r   s   @r,   r  r    s@   
~  5959=A=A59,07;+/$(,0/3&*15!u001! !!1!12!  ((9(9:	!
 !))9)9 :! %U\\2! ELL)! 'u||4! "%! D>! $D>! 'tn! d^! !.! 
u??	@! !r.   r  c                   x  ^  \ rS rSrSrS\4U 4S jjrS rS r            SS\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\	\   S\	\   S\	\
R                     S\\\4   4S jjrSrU =r$ )SpeechT5DecoderWithTextPreneti  zs
Wrapper around SpeechT5Decoder that applies SpeechT5TextDecoderPrenet to convert input tokens to hidden features.
r~   c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r   )rp   rq   r  r  r  r  rq  rI  s     r,   rq   &SpeechT5DecoderWithTextPrenet.__init__  r  r.   c                 6    U R                   R                  5       $ r   r  r\  s    r,   r  2SpeechT5DecoderWithTextPrenet.get_input_embeddings  r  r.   c                 :    U R                   R                  U5        g r   r  r  s     r,   r  2SpeechT5DecoderWithTextPrenet.set_input_embeddings  r  r.   r/   r1   rE  rF  rs  r  r  rH  r  rt  ru  r  r:   c                 b    U R                  XU5      u  pU R                  UUUUUUUUU	U
UUS9nU$ r  r  )r}   r/   r1   rE  rF  rs  r  r  rH  r  rt  ru  r  r  r9  s                  r,   r   %SpeechT5DecoderWithTextPrenet.forward  sY     15LZi0j-&&/)"7#9!5+/!5#) ' 
 r.   r  r  )r   r   r   r   r   r   rq   r  r  r   r   r  r  r   r   rR   r   r  r   r   r   r   r   s   @r,   r  r    s4   ~ 20
 5959=A=A,07;+/$(,0/3&*15 u001  !!1!12   ((9(9:	 
 !))9)9 :  ELL)  'u||4  "%  D>  $D>  'tn  d^  !.  
u??	@   r.   r  c                   l  ^  \ rS rSrSrS\4U 4S jjr            SS\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\   S\\   S\\   S\\   S\\   S\\R                     S\\\4   4S jjrSrU =r$ )SpeechT5DecoderWithoutPreneti  r  r~   c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r   )rp   rq   r  r  rq  rI  s     r,   rq   %SpeechT5DecoderWithoutPrenet.__init__  r  r.   r/   r1   rE  rF  rs  r  r  rH  r  rt  ru  r  r:   c                 :    U R                  UUUUUUUUU	U
UUS9nU$ r  r  )r}   r/   r1   rE  rF  rs  r  r  rH  r  rt  ru  r  r9  s                 r,   r   $SpeechT5DecoderWithoutPrenet.forward  sD     &&&)"7#9!5+/!5#) ' 
 r.   r  r  r  r   s   @r,   r  r    s*   
~  5959=A=A,07;+/$(,0/3&*15u001 !!1!12  ((9(9:	
 !))9)9 : ELL) 'u||4 "% D> $D> 'tn d^ !. 
u??	@ r.   r  c                      ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\R                  S\R                  4S	 jrS
 r\S 5       rSrU =r$ )$SpeechT5GuidedMultiheadAttentionLossi  z
Guided attention loss from the paper [Efficiently Trainable Text-to-Speech System Based on Deep Convolutional
Networks with Guided Attention](https://huggingface.co/papers/1710.08969), adapted for multi-head attention.
r~   c                 f   > [         TU ]  5         UR                  U l        UR                  U l        g r   )rp   rq   guided_attention_loss_sigmasigmaguided_attention_loss_scalescalerI  s     r,   rq   -SpeechT5GuidedMultiheadAttentionLoss.__init__#  s(    77
77
r.   r  input_masksoutput_masksr:   c                 D   U R                  X#UR                  5      nUR                  S5      UR                  S5      -  nUR                  UR                  5      R                  S5      nXA-  n[        R
                  " UR                  U5      5      nU R                  U-  $ )a  
Compute the attention loss.

Args:
    attentions (`torch.FloatTensor` of shape `(batch_size, layers * heads, output_sequence_length, input_sequence_length)`):
        Batch of multi-head attention weights
    input_masks (`torch.BoolTensor` of shape `(batch_size, input_sequence_length)`):
        Input attention mask as booleans.
    output_masks (`torch.BoolTensor` of shape `(batch_size, output_sequence_length)`):
        Target attention mask as booleans.

Returns:
    `torch.Tensor` with the loss value
r%   r   r   )_make_guided_attention_masksr   r   r   r   rS  masked_selectr  )r}   r  r  r  guided_attn_masksmaskslosseslosss           r,   r   ,SpeechT5GuidedMultiheadAttentionLoss.forward(  s    " !==kYcYjYjk&&r*[-B-B2-FF**+55a8"/zz&..u56zzD  r.   c                 j   UR                  S5      nUR                  S5      n[        R                  " [        U5      UR                  S   UR                  S   4US9n[        [        XE5      5       H.  u  nu  pU R                  XU R                  U5      XgS U	2S U24'   M0     UR                  S5      $ )Nr%   r   rj  )
rN   r   rQ   rU   r'   r  rw  _make_guided_attention_maskr  r   )
r}   r  r  r   r`   rn  r  r  ilenolens
             r,   r  ASpeechT5GuidedMultiheadAttentionLoss._make_guided_attention_masksA  s    #+%))"-!KK[)9<;M;Ma;PR]RcRcdeRf(gpvw!*3}+M!NC$373S3STX`d`j`jlr3s5D5%4%/0 "O !**1--r.   c                    [         R                  " [         R                  " XS9[         R                  " XS9SS9u  pEUR                  5       U-  nUR                  5       U -  nS[         R                  " XE-
  S-  * SUS-  -  -  5      -
  $ )Nrj  xy)indexingr	  r   )r   meshgridrT   r   r   )r@   output_lengthr  r   grid_ygrid_xs         r,   r  @SpeechT5GuidedMultiheadAttentionLoss._make_guided_attention_maskL  sz    LL5LL6

 -/,.UYY&/a!78ANKLLLr.   )r  r  )r   r   r   r   r   r   rq   r   r  
BoolTensorr   r   r  r   r  r   r   r   s   @r,   r  r    sj    
8~ 8
!++!:?:J:J!Z_ZjZj!	!2	. M Mr.   r  c                      ^  \ rS rSrSrS\4U 4S jjr SS\R                  S\R                  S\R                  S\R                  S	\R                  S
\
\R                     S\R                  4S jjrSrU =r$ )SpeechT5SpectrogramLossiX  z3
Loss computation used by SpeechT5ForTextToSpeech.
r~   c                 .  > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        [        5       U l        [        [        R                  " S5      S9U l
        U R                  (       a  [        U5      U l        g g )Ng      @)
pos_weight)rp   rq   use_guided_attention_lossguided_attention_loss_num_headsr0   r   l1_criterionr   r   r  bce_criterionr  attn_criterionrI  s     r,   rq    SpeechT5SpectrogramLoss.__init__]  ss    )/)I)I&/5/U/U, & 7 7"H.%,,s:KL))"Fv"ND *r.   r1   r  r  r  labelsr  r:   c           	      V   US:g  nUR                  U5      nUR                  U5      nUR                  U5      nU R                  X55      U R                  X%5      -   nUS S 2S S 2S4   n	[        R                  " U	) S-  [        R                  " U	R                  S5      S5      R                  U	R                  5      /SS9n
U
S S 2SS 24   R                  U	5      n
UR                  U	5      nU R                  XJ5      nX-   nU R                  (       a  [        R                  " U Vs/ s H  oS S 2S U R                  24   PM     snSS9nUS:H  nUS S 2S S 2S4   nU R                  S:  a#  US S 2U R                  S-
  S U R                  24   nU R                  XU5      nUU-  nU$ s  snf )Nr3   r   r	  r   r   )r  r  r   r   rW   r   r   r   r  r  r  r0   r  )r}   r1   r  r  r  r  r  re  l1_lossr  stop_labelsbce_lossr  xattnr  r  	attn_losss                     r,   r   SpeechT5SpectrogramLoss.forwardi  s    ' %%l3!7!E!El!S 5 C CL Q ##$9BTEVEVWmEvv Q1W%ii%#uzz%**Q-/K/N/Nu||/\ ]cde!!QR%(66u=%%e, %%f: ! ))99TdeTdq#IT%I%I#I IJTdeklmD(A-K'1a0L$$q(+At/D/Dq/H/aDLaLa/a,ab++D|LIID fs   %F&)r  r  r  r  r0   r  r   )r   r   r   r   r   r   rq   r   r  r  r   r   r   r   r   r   s   @r,   r  r  X  s    
O~ 
O& 9=)(() !& 1 1)  %00	)
 !!) !!) #5#4#45) 
) )r.   r  zv
    The bare SpeechT5 Encoder-Decoder Model outputting raw hidden-states without any specific pre- or post-nets.
    custom_introc            $       T  ^  \ rS rSr  SS\S\\R                     S\\R                     4U 4S jjjrS r	S r
S rS	 r\               SS
\\R                     S\\R                      S\\R                     S\\R                      S\\R"                     S\\R"                     S\\R                     S\\\\R"                           S\\   S\\   S\\R"                     S\\   S\\   S\\   S\\R                     S\\\R"                     \4   4 S jj5       rSrU =r$ )SpeechT5Modeli  r~   encoderdecoderc                    > [         TU ]  U5        Xl        Uc  [        U5      OUU l        Uc  [        U5      OUU l        U R                  5         g)z
encoder (`PreTrainedModel`, *optional*):
    The encoder model to use.
decoder (`PreTrainedModel`, *optional*):
    The decoder model to use.
N)rp   rq   r~   r  r  r  r  rq  )r}   r~   r  r  r   s       r,   rq   SpeechT5Model.__init__  sK     	 ?F3F;T[?F3F;T[ 	r.   c                     [        U R                  [        5      (       a  U R                  R                  5       $ [        U R                  [
        5      (       a  U R                  R                  5       $ [        er   )r  r  r  r  r  r  NotImplementedErrorr\  s    r,   r  "SpeechT5Model.get_input_embeddings  sR    dll$ABB<<4466dll$ABB<<4466!!r.   c                     [        U R                  [        5      (       a  U R                  R                  U5        [        U R                  [
        5      (       a  U R                  R                  U5        g g r   )r  r  r  r  r  r  r  s     r,   r  "SpeechT5Model.set_input_embeddings  sP    dll$ABBLL--e4dll$ABBLL--e4 Cr.   c                     U R                   $ r   )r  r\  s    r,   get_encoderSpeechT5Model.get_encoder  s    ||r.   c                     [        U R                  [        5      (       a%  U R                  R                  R	                  5         ggz
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
N)r  r  r  r  r]  r\  s    r,   r]  $SpeechT5Model.freeze_feature_encoder  s2    
 dll$CDDLL668 Er.   r/   r1   decoder_input_valuesdecoder_attention_maskrs  decoder_head_maskr  encoder_outputsr  rH  r  r  rt  ru  r  r:   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nU
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUc  U R                  UUUUUUS9nORU(       aK  [        U[        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nUbV  [        U R
                  [        5      (       a7  U R
                  R                  R                  US   R                  S   U5      nOUn[        U R                  [        5      (       a  SU0nO0 nU R                  " S
UUUS   UUUU	U
UUUUS.UD6nU(       d  UU-   $ [        UR                   UR"                  UR$                  UR&                  UR(                  UR                   UR$                  UR&                  S	9$ )a1  
input_values (`torch.Tensor` of shape `(batch_size, sequence_length)`):
    Depending on which encoder is being used, the `input_values` are either: float values of the input raw
    speech waveform, or indices of input sequence tokens in the vocabulary, or hidden states.
decoder_input_values (`torch.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Depending on which decoder is being used, the `decoder_input_values` are either: float values of log-mel
    filterbank features extracted from the raw speech waveform, or indices of decoder input sequence tokens in
    the vocabulary, or hidden states.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
    also be used by default.

    If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
    information on the default strategy.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
    Tensor containing the speaker embeddings.
N)r/   r1   rs  r  rt  ru  r   r   r   r  r  )r/   r1   rE  rF  rs  r  r  rH  r  rt  ru  r  )r  r  r  decoder_attentionsr  encoder_last_hidden_staterE  encoder_attentionsrd  )r~   r  rt  rH  r  r  r  r   rU   r  r  ra  r'   r  r  r   r  r  r   r  r  )r}   r/   r1   r-  r.  rs  r/  r  r0  r  rH  r  r  rt  ru  r  rF  decoder_argsdecoder_outputss                      r,   r   SpeechT5Model.forward  s   T 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B] ""ll)-#"3%9' + O O_!M!M-"1!"4474H14Loa0RV14_1E1I?1-tO %*T\\Cb*c*c%)\\%8%8%[%["((+^&" &4"dll$CDD02DELL,, 
-1"1!"4#9'!5+/!5#)
 
  "_44!-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r.   )r~   r  r  r  NNNNNNNNNNNNNNN)r   r   r   r   r   r   r   ri  rq   r  r  r(  r]  r   r   r   r  r  r  r   rR   r   r   r   r   r   r   s   @r,   r  r    s    (,'+	 "))$ "))$	 ("59  04597;=A159=7;EI+/$(:>,0/3&*15!k
u||,k
 !!1!12k
 'u||4	k

 !))9)9 :k
 E--.k
 $E$5$56k
 'u||4k
 "%e.?.?(@"ABk
 "%k
 D>k
 %U%6%67k
 $D>k
 'tnk
 d^k
  !.!k
" 
uU&&');;	<#k
 k
r.   r  zB
    SpeechT5 Model with a speech encoder and a text decoder.
    c            $         ^  \ rS rSrS/rS\4U 4S jjrS rS rS r	S r
S	 r\               SS
\\R                     S\\R                      S\\R                      S\\R                      S\\R                     S\\R                     S\\R"                     S\\\\R                           S\\   S\\   S\\   S\\   S\\   S\\R                      S\\R"                     S\\\4   4 S jj5       rSrU =r$ )SpeechT5ForSpeechToTexti6  z#text_decoder_postnet.lm_head.weightr~   c                    > [         TU ]  U5        UR                  c  [        SU R                   S35      e[        U5      n[        U5      n[        XU5      U l        [        U5      U l
        U R                  5         g )NYou are trying to instantiate a    with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `SpeechT5ForSpeechToText.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.)rp   rq   r  r)   r   r  r  r  rO  r  text_decoder_postnetrq  )r}   r~   speech_encodertext_decoderr   s       r,   rq    SpeechT5ForSpeechToText.__init__>  s}     $00@ A/ /  9@4V<%flK$>v$F! 	r.   c                 6    U R                   R                  5       $ r   rO  r(  r\  s    r,   r(  #SpeechT5ForSpeechToText.get_encoderR      }}((**r.   c                 6    U R                   R                  5       $ r   rO  get_decoderr\  s    r,   rG  #SpeechT5ForSpeechToText.get_decoderU  rD  r.   c                 T    U R                  5       R                  R                  5         gr+  r(  r  r]  r\  s    r,   r]  .SpeechT5ForSpeechToText.freeze_feature_encoderX      
 	!!88:r.   c                 6    U R                   R                  5       $ r   )r=  r  r\  s    r,   r  -SpeechT5ForSpeechToText.get_output_embeddings_  s    ((>>@@r.   c                 :    U R                   R                  U5        g r   )r=  r  r  s     r,   r  -SpeechT5ForSpeechToText.set_output_embeddingsb  s    !!77Gr.   r/   r1   decoder_input_idsr.  rs  r/  r  r0  r  rH  r  rt  ru  r  r  r:   c                    Ub  UOU R                   R                  nUb7  Uc4  [        XR                   R                  U R                   R                  5      nU R                  UUUUUUUUU	U
UUSUS9nU R                  US   5      nSnUbF  [        5       nU" UR                  SU R                   R                  5      UR                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  UR                  UR                   UR"                  S9	$ )a`  
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
    Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
    into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
    (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
    To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding
    and conversion into a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`SpeechT5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    SpeechT5 uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
    also be used by default.

    If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
    information on the default strategy.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the language modeling loss. Indices should either be in `[0, ..., config.vocab_size]`
    or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is
    only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

    Label indices can be obtained using [`SpeechT5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

Example:

```python
>>> from transformers import SpeechT5Processor, SpeechT5ForSpeechToText
>>> from datasets import load_dataset

>>> dataset = load_dataset(
...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
... )  # doctest: +IGNORE_RESULT
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate

>>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_asr")
>>> model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr")

>>> # audio file is decoded on the fly
>>> inputs = processor(audio=dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
>>> predicted_ids = model.generate(**inputs, max_length=100)

>>> # transcribe speech
>>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
>>> transcription[0]
'mister quilter is the apostle of the middle classes and we are glad to welcome his gospel'
```

```python
>>> inputs["labels"] = processor(text_target=dataset[0]["text"], return_tensors="pt").input_ids

>>> # compute loss
>>> loss = model(**inputs).loss
>>> round(loss.item(), 2)
19.68
```
NT)r/   r1   r-  r.  rs  r/  r  r0  r  rH  r  rt  ru  r  r   r%   r   )	r  r  r  r  r2  r  r3  rE  r4  )r~   r  r-   r"   r#   rO  r=  r   r   r  r   r  r  r2  r  r3  rE  r4  )r}   r/   r1   rQ  r.  rs  r/  r  r0  r  rH  r  rt  ru  r  r  r9  r  r  loss_fctoutputs                        r,   r   SpeechT5ForSpeechToText.forwarde  s[   v &1%<k$++B]B] ($6KK44dkk6X6X%! --%)!2#9/!5++/!5)   
" **71:6')HFKKDKK,B,BCV[[QS_UDY,F)-)9TGf$EvE#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r.   )rO  r=  r8  )r   r   r   r   _tied_weights_keysr   rq   r(  rG  r]  r  r  r   r   r   r  r  r   r  r   rR   r   r   r   r   r   r   s   @r,   r:  r:  6  s    @@~ (++;AH  59598<=A159=7;EI+/$(,0/3&*-115!H
u001H
 !!1!12H
 $E$4$45	H

 !))9)9 :H
 E--.H
 $E$5$56H
 'u||4H
 "%e.?.?(@"ABH
 "%H
 D>H
 $D>H
 'tnH
 d^H
 ))*H
  !.!H
" 
uo%	&#H
 H
r.   r:  modelr  	thresholdminlenratiomaxlenratiovocoderoutput_cross_attentionsreturn_output_lengthsc
                    Uc  [        S5      eUc*  SXR                  R                  :H  R                  5       -
  n
OUn
UR	                  S5      nU R
                  R                  UU
SS9nUR                  n[        U R
                  R                  [        5      (       a@  U R
                  R                  R                  R                  US   R                  S   U
5      n
[        UR	                  S5      U-  U R                  R                  -  5      n[        UR	                  S5      U-  U R                  R                  -  5      nUR                  USU R                  R                  5      n/ n/ nS nSn0 n US-  nU R
                  R                   R                  UU5      nU R
                  R                   R#                  US S 2SS 24   S UU
USUSS9nU(       a.  UR%                  [&        R(                  " UR*                  SS95        UR                  R-                  S5      nUR.                  nU R0                  R3                  U5      nUR5                  XR                  R                  U R                  R                  5      nUR%                  U5        US S 2SS S 24   R5                  USU R                  R                  5      n[&        R(                  " UU4SS9n[&        R6                  " U R0                  R9                  U5      5      nUU:  a  GM  UU:  a@  [&        R:                  " USS9U:  n[&        R<                  " U5      S   R?                  5       nO[A        [C        U5      5      nU Vs/ s H  nUU;  d  M  UPM     nn[C        U5      S:  ad  [&        RD                  " U5      nURG                  SS5      RI                  SS	5      nU R0                  RK                  U5      nU H  n UU    UU '   M     [C        U5      U:  a  OGM  [A        [C        U5      5       Vs/ s H  nUU   PM
     nnU	(       d  US:X  a  US   O1[&        RL                  RN                  RP                  RS                  USS
9nUb	  U" U5      n!OUn!U(       a_  [&        R(                  " US	S9nUS:  a@  UR4                  " U[        UR	                  S5      U-  5      /UR	                  5       SS  Q76 nU!U4n!U!$ / n"[A        U5       H&  nU"R%                  UU   R	                  S5      5        M(     Uc7  [&        RL                  RN                  RP                  RS                  USS
9nUU"4n!Oy/ n#[&        RL                  RN                  RP                  RS                  USS
9nU" U5      n#U" Vs/ s H,  n[        U#R	                  S5      [U        U"5      -  5      U-  PM.     n$nU#U$4n!U(       a\  [&        R(                  " US	S9nUR4                  " U[        UR	                  S5      U-  5      /UR	                  5       SS  Q76 n/ U!QUP7n!U!$ s  snf s  snf s  snf )Na  `speaker_embeddings` must be specified. For example, you can use a speaker embeddings by following
                    the code snippet provided in this link:
                    https://huggingface.co/datasets/Matthijs/cmu-arctic-xvectors
                    r   r   T)r/   r1   ru  r%   )r   r1   rE  rF  r  rH  r  ru  r   r   )batch_first)+r)   r~   r"   r>   r   rO  r  r  r  r  r  ra  r'   r0   r&   r  r  r  rY   r   r   r  squeezer  speech_decoder_postnetr  r   sigmoidr  rN   r  rO   rP   rU   stackr   flattenr  r   r   rnnpad_sequencer?   )%rW  r/   r  r1   rX  rY  rZ  r[  r\  r]  rF  r   encoder_outr3  maxlenminlenoutput_sequencespectrogramr  r  r  result_spectrogramr  decoder_outlast_decoder_outputspectrumnew_spectrogramprobmeet_thresholdsmeet_indexesr3  spectrograms
meet_indexr9  spectrogram_lengths	waveformswaveform_lengthss%                                        r,   _generate_speechrz    s    !
 	
 !"lll6O6O&O%T%T%V!V!/


A
C..((!- ) K !, = = %..((*IJJ!&!7!7!>!>!a!aN  #%;"
 *//2[@5<<C`C``aF*//2[@5<<C`C``aF 099#q%,,B[B[\OKO
C
q !& 6 6 = =oOa bnn,,<</237";#9+5 = 	
 ###EIIk.J.JPQ$RS);;CCAF%55 //889LM==ll&C&CU\\E^E^_8$ #1b!8,11#q%,,:S:ST))_o$FAN}}U99BBCVWX< V|"'))Db"9Y"F${{?;A>EEG$SY/'3S|!q@R7RA|LS< 1$${{;7+55a;CCAqI$;;CCLQ".J5A*5M&z2 #/%&#-i j 49=O9P3QR3Qa&q)3QLR ),l1ouxx~~7I7I7V7VWcqu7V7vk*G!G"$yy)9qAQw#3#8#8-2215;<$?O?T?T?VWYWZ?[$   01G* N% !sA&&|A';';A'>? ? 88>>--::<UY:ZL#%89GI 88>>--::<UY:ZL-I_rs_rZ[INN1$5<O8P$P QTU U_rs "23G"$yy)9qA/44S)..q1C78 ;K;P;P;RSUSV;W  32!12GNW T S4  ts   7
YY)Y%3YzB
    SpeechT5 Model with a text encoder and a speech decoder.
    c            (         ^  \ rS rSrSrS\4U 4S jjr\S\4S j5       r	S r
S r\                 S#S\\R                     S	\\R                     S
\\R                      S\\R                     S\\R                      S\\R                      S\\R"                     S\\\\R                            S\\   S\\   S\\   S\\   S\\   S\\R                      S\\R                      S\\R"                     S\\R"                     S\\\4   4$S jj5       r\R.                  " 5               S$S\R                  S	\\R                     S\\R                      S\S\S\S\\R4                     S\S\S\\R                   \\R                   \R                   4   4   4S  jj5       r\R.                  " 5               S$S\R                  S\\R                      S	\\R                     S\S\S\S\\R4                     S\S\S\\R                   \\R                   \R                   4   4   4S! jj5       rS"rU =r$ )%SpeechT5ForTextToSpeechi	  r!   r~   c                    > [         TU ]  U5        UR                  c  [        SU R                   S35      e[        U5      n[        U5      n[        XU5      U l        [        U5      U l
        U R                  5         g )Nr<  a    with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `SpeechT5ForTextToSpeech.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.)rp   rq   r  r)   r   r  r  r  rO  r  rb  rq  )r}   r~   text_encoderspeech_decoderr   s       r,   rq    SpeechT5ForTextToSpeech.__init__	  s}     $00@ A/ /  5V<8@%fNK&B6&J# 	r.   r:   c                     gr<  rd  )clss    r,   can_generate$SpeechT5ForTextToSpeech.can_generate	  s    
 r.   c                 6    U R                   R                  5       $ r   rB  r\  s    r,   r(  #SpeechT5ForTextToSpeech.get_encoder	  rD  r.   c                 6    U R                   R                  5       $ r   rF  r\  s    r,   rG  #SpeechT5ForTextToSpeech.get_decoder	  rD  r.   r1   r-  r.  rs  r/  r  r0  r  rH  r  rt  ru  r  r  r  r  c                 r   Ub  UOU R                   R                  nUbB  Uc"  [        XR                   R                  U5      u  p4U R                   R                  (       a  SnU R                  UUUUUUUUU	U
UUUSUS9nU R                  US   5      u  nnnSnUb,  [        U R                   5      nU" UUUUUUR                  5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  UR                  UR                  UR                  S9	$ )aH  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
    [`~PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
decoder_input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`):
    Float values of input mel spectrogram.

    SpeechT5 uses an all-zero spectrum as the starting token for `decoder_input_values` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_values` have to be input (see
    `past_key_values`).
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
    also be used by default.

    If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
    information on the default strategy.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
    Tensor containing the speaker embeddings.
labels (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`, *optional*):
    Float values of target mel spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss
    computation. Spectrograms can be obtained using [`SpeechT5Processor`]. See [`SpeechT5Processor.__call__`]
    for details.
stop_labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Binary tensor indicating the position of the stop token in the sequence.

Example:

```python
>>> from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, set_seed
>>> import torch

>>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
>>> model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
>>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

>>> inputs = processor(text="Hello, my dog is cute", return_tensors="pt")
>>> speaker_embeddings = torch.zeros((1, 512))  # or load xvectors from a file

>>> set_seed(555)  # make deterministic

>>> # generate speech
>>> speech = model.generate(inputs["input_ids"], speaker_embeddings=speaker_embeddings, vocoder=vocoder)
>>> speech.shape
torch.Size([15872])
```
NTr/   r1   r-  r.  rs  r/  r  r0  r  rH  r  r  rt  ru  r  r   r   	r  rl  r  r  r2  r  r3  rE  r4  )r~   r  r6   r0   r  rO  rb  r  r  r   r  r  r2  r3  rE  r4  )r}   r!   r1   r-  r.  rs  r/  r  r0  r  rH  r  rt  ru  r  r  r  r  r9  r  r  r  r  	criterionrT  s                            r,   r   SpeechT5ForTextToSpeech.forward	  s{   Z &1%<k$++B]B]#+?WKK88:P@<$ {{44$(!--")!5#9/!5++1/!5)   
$ AE@[@[\cde\f@g= 5v/<I&%((D +-;F)-)9TGf$EvE'-#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r.   rX  rY  rZ  r[  r\  r]  c
                     UbY  UR                  S5      nUR                  S5      U:w  a3  UR                  S5      S:X  a  UR                  US5      nO[        S5      e[        U UUUUUUUUU	5
      $ )a  
Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a
speech waveform using a vocoder.

Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary.

        Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
        [`~PreTrainedTokenizer.__call__`] for details.

        [What are input IDs?](../glossary#input-ids)
    attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Attention mask from the tokenizer, required for batched inference to signal to the model where to
        ignore padded tokens from the input_ids.
    speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
        Tensor containing the speaker embeddings.
    threshold (`float`, *optional*, defaults to 0.5):
        The generated sequence ends when the predicted stop token probability exceeds this value.
    minlenratio (`float`, *optional*, defaults to 0.0):
        Used to calculate the minimum required length for the output sequence.
    maxlenratio (`float`, *optional*, defaults to 20.0):
        Used to calculate the maximum allowed length for the output sequence.
    vocoder (`nn.Module`, *optional*):
        The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
        spectrogram.
    output_cross_attentions (`bool`, *optional*, defaults to `False`):
        Whether or not to return the attentions tensors of the decoder's cross-attention layers.
    return_output_lengths (`bool`, *optional*, defaults to `False`):
        Whether or not to return the concrete spectrogram/waveform lengths.

Returns:
    `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
    - when `return_output_lengths` is False
        - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
        `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
        - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
        `(num_frames,)` -- The predicted speech waveform.
        - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
        `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
        output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
    - when `return_output_lengths` is True
        - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
        `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
        are padded to the maximum length.
        - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `list[Int]` -- A list of
        all the concrete lengths for each spectrogram.
        - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
        `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
        - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `list[Int]` -- A list of all
        the concrete lengths for each waveform.
        - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
        `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
        output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
r   r   zUThe first dimension of speaker_embeddings must be either 1 or the same as batch_size.r   r  r)   rz  )r}   r!   r1   r  rX  rY  rZ  r[  r\  r]  kwargsr^   s               r,   generate SpeechT5ForTextToSpeech.generate0
  s    J )"*J!&&q)Z7%**1-2);)B)B:q)Q&$o   #!
 	
r.   c
                     UbY  UR                  S5      n
UR                  S5      U
:w  a3  UR                  S5      S:X  a  UR                  U
S5      nO[        S5      e[        U UUUUUUUUU	5
      $ )aW  
Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a
speech waveform using a vocoder.

Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary.

        Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
        [`~PreTrainedTokenizer.__call__`] for details.

        [What are input IDs?](../glossary#input-ids)
    speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
        Tensor containing the speaker embeddings.
    attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
        `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    threshold (`float`, *optional*, defaults to 0.5):
        The generated sequence ends when the predicted stop token probability exceeds this value.
    minlenratio (`float`, *optional*, defaults to 0.0):
        Used to calculate the minimum required length for the output sequence.
    maxlenratio (`float`, *optional*, defaults to 20.0):
        Used to calculate the maximum allowed length for the output sequence.
    vocoder (`nn.Module`, *optional*, defaults to `None`):
        The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
        spectrogram.
    output_cross_attentions (`bool`, *optional*, defaults to `False`):
        Whether or not to return the attentions tensors of the decoder's cross-attention layers.
    return_output_lengths (`bool`, *optional*, defaults to `False`):
        Whether or not to return the concrete spectrogram/waveform lengths.

Returns:
    `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
    - when `return_output_lengths` is False
        - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
        `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
        - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
        `(num_frames,)` -- The predicted speech waveform.
        - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
        `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
        output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
    - when `return_output_lengths` is True
        - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
        `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
        are padded to the maximum length.
        - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `list[Int]` -- A list of
        all the concrete lengths for each spectrogram.
        - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
        `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
        - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `list[Int]` -- A list of all
        the concrete lengths for each waveform.
        - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
        `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
        output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
r   r   zUThe first dimension of speaker_embeddings must be either 1 or the same as batch size.r  )r}   r!   r  r1   rX  rY  rZ  r[  r\  r]  r^   s              r,   generate_speech'SpeechT5ForTextToSpeech.generate_speech
  s    R )"*J!&&q)Z7%**1-2);)B)B:q)Q&$o   #!
 	
r.   rb  rO  NNNNNNNNNNNNNNNNNNNg      ?r4   g      4@NFF)r   r   r   r   rg  r   rq   classmethodrR   r  r(  rG  r   r   r   r  r  r   r  r   r   r   r   r   r   r   ri  r  r  r   r   r   s   @r,   r|  r|  	  s    "O~ ( T  ++  1559<@=A159=7;EI+/$(,0/3&*:>.2.215%D
E,,-D
 !!1!12D
 'u'8'89	D

 !))9)9 :D
 E--.D
 $E$5$56D
 'u||4D
 "%e.?.?(@"ABD
 "%D
 D>D
 $D>D
 'tnD
 d^D
 %U%6%67D
  **+!D
" ell+#D
$ !.%D
& 
u..	/'D
 D
L ]]_ 6::> !'+(-&+Y
##Y
 !!1!12Y
 %U%6%67	Y

 Y
 Y
 Y
 "))$Y
 "&Y
  $Y
 
u  %(9(95;L;L(L"MM	NY
 Y
v ]]_ ;?59 !'+(-&+]
##]
 %U%6%67]
 !!1!12	]

 ]
 ]
 ]
 "))$]
 "&]
  $]
 
u  %(9(95;L;L(L"MM	N]
 ]
r.   r|  zD
    SpeechT5 Model with a speech encoder and a speech decoder.
    c            (         ^  \ rS rSrS\4U 4S jjrS rS rS r\	                 S"S\
\R                     S\
\R                     S	\
\R                     S
\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\\\R                           S\
\   S\
\   S\
\   S\
\   S\
\   S\
\R                     S\
\R                     S\
\R                     S\
\R                     S\\\4   4$S jj5       r\R*                  " 5               S#S\R                  S\
\R                     S\
\R                     S\S\S\S\
\R0                     S\S\S\R                  4S  jj5       rS!rU =r$ )$SpeechT5ForSpeechToSpeechi
  r~   c                    > [         TU ]  U5        [        U5      n[        U5      n[	        XU5      U l        [        U5      U l        U R                  5         g r   )	rp   rq   r  r  r  rO  r  rb  rq  )r}   r~   r>  r  r   s       r,   rq   "SpeechT5ForSpeechToSpeech.__init__
  sK     8@8@%fnM&B6&J# 	r.   c                 6    U R                   R                  5       $ r   rB  r\  s    r,   r(  %SpeechT5ForSpeechToSpeech.get_encoder
  rD  r.   c                 6    U R                   R                  5       $ r   rF  r\  s    r,   rG  %SpeechT5ForSpeechToSpeech.get_decoder  rD  r.   c                 T    U R                  5       R                  R                  5         gr+  rJ  r\  s    r,   r]  0SpeechT5ForSpeechToSpeech.freeze_feature_encoder  rL  r.   r/   r1   r-  r.  rs  r/  r  r0  r  rH  r  rt  ru  r  r  r  r  r:   c                    Ub  UOU R                   R                  nUb%  Uc"  [        XR                   R                  U5      u  p4U R	                  UUUUUUUUU	U
UUUSUS9nU R                  US   5      u  nnnSnU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  UR                  UR                  UR                  S9	$ )a[  
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
    Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
    into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
    (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
    To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding and conversion into
    a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
decoder_input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`):
    Float values of input mel spectrogram.

    SpeechT5 uses an all-zero spectrum as the starting token for `decoder_input_values` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_values` have to be input (see
    `past_key_values`).
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
    also be used by default.

    If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
    information on the default strategy.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
    Tensor containing the speaker embeddings.
labels (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`, *optional*):
    Float values of target mel spectrogram. Spectrograms can be obtained using [`SpeechT5Processor`]. See
    [`SpeechT5Processor.__call__`] for details.
stop_labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Binary tensor indicating the position of the stop token in the sequence.

Example:

```python
>>> from transformers import SpeechT5Processor, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan, set_seed
>>> from datasets import load_dataset
>>> import torch

>>> dataset = load_dataset(
...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
... )  # doctest: +IGNORE_RESULT
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate

>>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_vc")
>>> model = SpeechT5ForSpeechToSpeech.from_pretrained("microsoft/speecht5_vc")
>>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

>>> # audio file is decoded on the fly
>>> inputs = processor(audio=dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

>>> speaker_embeddings = torch.zeros((1, 512))  # or load xvectors from a file

>>> set_seed(555)  # make deterministic

>>> # generate speech
>>> speech = model.generate_speech(inputs["input_values"], speaker_embeddings, vocoder=vocoder)
>>> speech.shape
torch.Size([77824])
```
NTr  r   r   r  )r~   r  r6   r0   rO  rb  r   r  r  r2  r  r3  rE  r4  )r}   r/   r1   r-  r.  rs  r/  r  r0  r  rH  r  rt  ru  r  r  r  r  r9  r_   rl  r  r  rT  s                           r,   r   !SpeechT5ForSpeechToSpeech.forward  s(   h &1%<k$++B]B]#+?WKK88:P@<$ --%)!5#9/!5++1/!5)   
$ "&!<!<WQZ!H;!^gabk1F)-)9TGf$EvE'##33")"?"?&99$55&-&G&G")"?"?&99

 
	
r.   rX  rY  rZ  r[  r\  r]  c
                 n    Uc  [         R                  " SUR                  S9n[        U UUUUUUUUU	5
      $ )ao  
Converts a raw speech waveform into a sequence of mel spectrograms, which are subsequently turned back into a
speech waveform using a vocoder.

Args:
    input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
        Float values of input raw speech waveform.

        Values can be obtained by loading a *.flac* or *.wav* audio file into an array of type `list[float]`,
        a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`)
        or the soundfile library (`pip install soundfile`).
        To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding and
        conversion into a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
    speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
        Tensor containing the speaker embeddings.
    attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
        `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    threshold (`float`, *optional*, defaults to 0.5):
        The generated sequence ends when the predicted stop token probability exceeds this value.
    minlenratio (`float`, *optional*, defaults to 0.0):
        Used to calculate the minimum required length for the output sequence.
    maxlenratio (`float`, *optional*, defaults to 20.0):
        Used to calculate the maximum allowed length for the output sequence.
    vocoder (`nn.Module`, *optional*, defaults to `None`):
        The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
        spectrogram.
    output_cross_attentions (`bool`, *optional*, defaults to `False`):
        Whether or not to return the attentions tensors of the decoder's cross-attention layers.
    return_output_lengths (`bool`, *optional*, defaults to `False`):
        Whether or not to return the concrete spectrogram/waveform lengths.

Returns:
    `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
    - when `return_output_lengths` is False
        - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
        `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
        - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
        `(num_frames,)` -- The predicted speech waveform.
        - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
        `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
        output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
    - when `return_output_lengths` is True
        - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
        `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
        are padded to the maximum length.
        - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `list[Int]` -- A list of
        all the concrete lengths for each spectrogram.
        - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
        `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
        - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `list[Int]` -- A list of all
        the concrete lengths for each waveform.
        - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
        `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
        output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
)r   i   rj  )r   rQ   r   rz  )
r}   r/   r  r1   rX  rY  rZ  r[  r\  r]  s
             r,   r  )SpeechT5ForSpeechToSpeech.generate_speech  sM    T %!&Xl>Q>Q!R#!
 	
r.   r  r  r  )r   r   r   r   r   rq   r(  rG  r]  r   r   r   r  r  r   r  r   rR   r   r   r   r   r   r   ri  r  r   r   r   s   @r,   r  r  
  s   
~ 
++;  5959<@=A159=7;EI+/$(,0/3&*:>.2.215%
u001
 !!1!12
 'u'8'89	

 !))9)9 :
 E--.
 $E$5$56
 'u||4
 "%e.?.?(@"AB
 "%
 D>
 $D>
 'tn
 d^
 %U%6%67
  **+!
" ell+#
$ !.%
& 
u..	/'
 
B ]]_ ;?59 !'+(-&+W
''W
 %U%6%67W
 !!1!12	W

 W
 W
 W
 "))$W
 "&W
  $W
 
		W
 W
r.   r  c                   H   ^  \ rS rSrSU 4S jjrS	S jrS rS rS rSr	U =r
$ )
HifiGanResidualBlocki  c                   > [         TU ]  5         X@l        [        R                  " [        [        U5      5       Vs/ s H0  n[        R                  " UUUSX5   U R                  X#U   5      S9PM2     sn5      U l	        [        R                  " [        [        U5      5       Vs/ s H,  n[        R                  " UUUSSU R                  US5      S9PM.     sn5      U l
        g s  snf s  snf )Nr   )rn   dilationr   )rp   rq   leaky_relu_sloper   r/  rP   rU   ru   get_paddingconvs1convs2)r}   channelsrm   r  r  r3  r_   r   s          r,   rq   HifiGanResidualBlock.__init__  s     0mm s8}-
 .A 		%[ ,,[1+F .

 mm s8}-
 .A 		 ,,[!< .



s   7C%%3C*c                     X-  U-
  S-  $ r   rd  )r}   rm   r  s      r,   r   HifiGanResidualBlock.get_padding	  s    &1a77r.   c                 >   [         R                  R                  n[        [         R                  R                  S5      (       a$  [         R                  R                  R                  nU R
                   H  nU" U5        M     U R                   H  nU" U5        M     g Nr   )r   r   r   r   r   r  r  r}   r   r,  s      r,   apply_weight_norm&HifiGanResidualBlock.apply_weight_norm  si    hh**288,,m<<((33??K[[E ![[E !r.   c                     U R                    H"  n[        R                  R                  U5        M$     U R                   H"  n[        R                  R                  U5        M$     g r   )r  r   r   remove_weight_normr  r}   r,  s     r,   r  'HifiGanResidualBlock.remove_weight_norm  sB    [[EHH''. ![[EHH''. !r.   c                 (   [        U R                  U R                  5       Hm  u  p#Un[        R                  R                  XR                  5      nU" U5      n[        R                  R                  XR                  5      nU" U5      nX-   nMo     U$ r   )rw  r  r  r   r  
leaky_relur  )r}   r   conv1conv2r8  s        r,   r   HifiGanResidualBlock.forward  sz    T[[9LE$HMM44]DYDYZM!-0MMM44]DYDYZM!-0M)4M : r.   )r  r  r  )r	   )r   r	      g?)r   )r   r   r   r   rq   r  r  r  r   r   r   r   s   @r,   r  r    s!    
>8/ r.   r  z
    HiFi-GAN vocoder.
    c                      ^  \ rS rSr% \\S'   SrS\4U 4S jjrS\R                  4S jr
S rS r\" S	S
9S\R                  S\R                  4S j5       rSrU =r$ )SpeechT5HifiGani'  r~   rl  c                   > [         TU ]  U5        [        UR                  5      U l        [        UR
                  5      U l        [        R                  " UR                  UR                  SSSS9U l        [        R                  " 5       U l        [        [        UR
                  UR                   5      5       Ha  u  nu  p4U R                  R#                  [        R$                  " UR                  SU-  -  UR                  SUS-   -  -  UUXC-
  S-  S95        Mc     [        R                  " 5       U l        [)        [        U R                  5      5       Hp  nUR                  SUS-   -  -  n[        UR                  UR*                  5       H4  u  pFU R&                  R#                  [-        XTXaR.                  5      5        M6     Mr     [        R                  " WSSSSS9U l        U R3                  S[4        R6                  " UR                  5      5        U R3                  S[4        R8                  " UR                  5      5        U R;                  5         g )N   r   r	   )rm   rn   r   r   rS  r  )rp   rq   rU   resblock_kernel_sizesnum_kernelsupsample_ratesnum_upsamplesr   ru   model_in_dimupsample_initial_channelconv_prer/  	upsamplerr  rw  upsample_kernel_sizesrY   ConvTranspose1d	resblocksrP   resblock_dilation_sizesr  r  	conv_postr   r   rQ   rW   rq  )r}   r~   r3  upsample_raterm   r  r  r   s          r,   rq   SpeechT5HifiGan.__init__0  s    v;;< !6!67		++
 /8V=R=RTZTpTp9q/r+A+NN!!""331=33a!eE +((8Q> 0s s4>>*+A661Q<HH),V-I-I6KiKi)j%%%&:8RZ\s\s&tu *k ,
 8QAaQRSVU[[1D1D%EFWejj1D1D&EF 	r.   rP  c                 8   [        U[        R                  [        R                  45      (       ak  UR                  R
                  R                  SU R                  R                  S9  UR                  b%  UR                  R
                  R                  5         ggg)zInitialize the weights.r4   rR  N)r  r   ru   r  r   r\  rY  r~   rW  ro   r_  )r}   rP  s     r,   rb  SpeechT5HifiGan._init_weightsV  so    fryy"*<*<=>>MM&&CT[[5R5R&S{{&  &&( ' ?r.   c                    [         R                  R                  n[        [         R                  R                  S5      (       a$  [         R                  R                  R                  nU" U R
                  5        U R                   H  nU" U5        M     U R                   H  nUR                  5         M     U" U R                  5        g r  )
r   r   r   r   r   r  r  r  r  r  r  s      r,   r  !SpeechT5HifiGan.apply_weight_norm]  s    hh**288,,m<<((33??KDMM"^^E $^^E##% $DNN#r.   c                 R   [         R                  R                  U R                  5        U R                   H"  n[         R                  R                  U5        M$     U R
                   H  nUR                  5         M     [         R                  R                  U R                  5        g r   )r   r   r  r  r  r  r  r  s     r,   r  "SpeechT5HifiGan.remove_weight_normi  sh    
##DMM2^^EHH''. $^^E$$& $
##DNN3r.   a  
        Converts a log-mel spectrogram into a speech waveform. Passing a batch of log-mel spectrograms returns a batch
        of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a single, un-batched speech
        waveform.
        r  r:   c                    U R                   R                  (       a  XR                  -
  U R                  -  nUR	                  5       S:H  nU(       d  UR                  S5      nUR                  SS5      nU R                  U5      n[        U R                  5       H  n[        R                  R                  X0R                   R                  5      nU R                  U   " U5      nU R                  X@R                   -     " U5      n[        SU R                   5       H)  nXPR                  X@R                   -  U-      " U5      -  nM+     XPR                   -  nM     [        R                  R                  U5      nU R#                  U5      n[$        R&                  " U5      nU(       d2  UR)                  S5      R                  SS5      R+                  S5      nU$ UR)                  S5      nU$ )a  
spectrogram (`torch.FloatTensor`):
    Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
    config.model_in_dim)`, or un-batched and of shape `(sequence_length, config.model_in_dim)`.

Returns:
    `torch.FloatTensor`: Tensor containing the speech waveform. If the input spectrogram is batched, will be of
    shape `(batch_size, num_frames,)`. If un-batched, will be of shape `(num_frames,)`.
r	   r   r   r   r%   )r~   normalize_beforerS  r  r   r   r   r  rP   r  r   r  r  r  r  r  r  r  r   tanhra  r   )r}   rl  
is_batchedr   r3  	res_statejwaveforms           r,   r   SpeechT5HifiGan.forwardq  s   " ;;''&2djj@K __&!+
%//2K#--a3m4t))*AMM44]KKD`D`aM NN1-m<Mq+;+;';<]KI1d../^^A0@0@,@1,DEmTT	 0%(8(88M + 00?}5

=1$,,Q/99!Q?DDRHH
  %,,Q/Hr.   )r  r  r  r  r  r  )r   r   r   r   r    re  rg  rq   r   ri  rb  r  r  r   r   r  r   r   r   r   s   @r,   r  r  '  sp     "!#O$4 $L)BII )
$4 (5#4#4 (9J9J ((r.   r  )r:  r  r|  r  rN  r  )r   Nr%  r  )gr   r   typingr   r   numpyrI   r   r   torch.nnr   r   r   activationsr
   cache_utilsr   r   r   
generationr   integrations.deepspeedr   integrations.fsdpr   modeling_attn_mask_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   r   r   r   r   utils.deprecationr   configuration_speecht5r   r    
get_loggerr   r  _HIDDEN_STATES_START_POSITIONr   r>   r-   r6   r  r   r  ndarrayrg   ri   r   r   ri  r   r   r  r  r   r)  rA  rO  r  r  r  r  r  r  r  r  r.  r<  rN  rk  r  r  r  r  r  r  r  r  r  r  r:  r  rR   rz  r|  r  r  r  __all__rd  r.   r,   <module>r     s     "    @ @ ! C C ) @ 7 e 9  D , 0 I 
		H	% !" %,, c [^ " ei0,,0250KSTYT`T`Ka04 26tc?tt t U--.	t
 t ZZtp#= ,!; 8!; 2A8BII A8J*bii *Zryy 0" "(299 %RYY %R1		 1D")) DN1")) 1h% %P<299 <2		+? ")-		+? )-X&,@ &$c2		 c2L")) 0:5 :zc5 cL (7o (7 (7V|
- |
~"&= "J'$; 'T
#: 
@~
- ~
B/&= /d3$; 3l*#: *Z8M299 8Mv:bii :z 
Y
+ Y

Y
x 
s
5 s

s
r 7;15#'$)"'L"L##L !!2!23L U--.	L
 L L L bii L "L  L 5eE$5$5u7H7H$HIIJL^ 
e
5 e

e
P 
t
 7 t

t
n;299 ;| 
to t
tnr.   