
    cCiJ~                     "   S SK r S SKJr  S SKJrJrJr  S SKrS SKJr  SSK	J
r
  SSKJr  SSKJrJr  SS	KJrJr  SS
KJr  SSKJrJrJrJr  SSKJr  SSKJr  SSKJrJ r    " S S\RB                  5      r" " S S\RB                  5      r# " S S\RB                  5      r$S\RJ                  S\&S\RJ                  4S jr' S5S\RB                  S\RJ                  S\RJ                  S\RJ                  S\\RJ                     S\(S \(S!\\   4S" jjr) " S# S$\RB                  5      r* " S% S&\RB                  5      r+ " S' S(\5      r,\ " S) S*\5      5       r-\" S+S,9 " S- S.\-5      5       r.\ " S/ S0\5      5       r/\" S1S,9 " S2 S3\-5      5       r0/ S4Qr1g)6    N)	dataclass)CallableOptionalUnion)nn   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputCausalLMOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuple)deprecate_kwarg)check_model_inputs   )ParakeetCTCConfigParakeetEncoderConfigc                      ^  \ rS rSr% Sr\R                  \S'   S	S\4U 4S jjjr	\R                  " 5       S\R                  4S j5       rSrU =r$ )
$ParakeetEncoderRelPositionalEncoding(   z*Relative positional encoding for Parakeet.inv_freqconfigc           	      &  > [         TU ]  5         UR                  U l        SnSU[        R                  " SUR
                  S[        R                  S9R                  U[        R                  S9UR
                  -  -  -  nU R                  SUSS	9  g )
Ng     @      ?r      dtype)devicer"   r   F)
persistent)
super__init__max_position_embeddingstorcharangehidden_sizeint64tofloatregister_buffer)selfr   r#   baser   	__class__s        h/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/parakeet/modeling_parakeet.pyr&   -ParakeetEncoderRelPositionalEncoding.__init__-   s    '-'E'E$Q 2 2AU[[ILLTZbgbmbmLn$$%
 	ZeD    hidden_statesc                    UR                   S   nX R                  :  a  [        SU SU R                   S35      e[        R                  " US-
  U* SUR
                  S9nU R                  S S S 2S 4   R                  5       R                  UR                   S   SS5      R                  UR
                  5      nUS S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OS	n[        R                  " US
S9   UR                  5       UR                  5       -  R                  SS5      nUR                  5       nUR!                  5       n	[        R"                  " X/SS9n
U
R$                  " / U
R                   S S QSP76 n
S S S 5        W
R                  UR&                  S9$ ! , (       d  f       N'= f)Nr   zSequence Length: z= has to be less or equal than config.max_position_embeddings .r#   r   mpscpuF)device_typeenabledr    dimr!   )shaper'   
ValueErrorr(   r)   r#   r   r-   expandr,   
isinstancetypestrautocast	transposesincosstackreshaper"   )r/   r5   
seq_lengthposition_idsinv_freq_expandedposition_ids_expandedr<   freqsrI   rJ   	pos_embeds              r2   forward,ParakeetEncoderRelPositionalEncoding.forward;   s   "((+
444#J< 02262N2N1OqR 
 ||JNZKML`L`aMM$4-(..0778K8KA8NPRTUVYYZgZnZno 	 !-T4] ; A A C -..33S99m>R>R>W>W[`>`   %% 	
 ^^UC&,,.1F1L1L1NNYYZ[]^_E))+C))+CSJB7I!))D9??3B+?DDI D ||-"5"5|66 DCs   B	G++
G9)r'   N)__name__
__module____qualname____firstlineno____doc__r(   Tensor__annotations__r   r&   no_gradrS   __static_attributes____classcell__r1   s   @r2   r   r   (   sJ    4llE4 E E ]]_7U\\ 7 7r4   r   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )ParakeetEncoderFeedForwardZ   r   c                 X  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        UR                     U l
        [        R                  " UR
                  UR                  UR                  S9U l        UR                  U l        g )Nbias)r%   r&   r   Linearr*   intermediate_sizeattention_biaslinear1r	   
hidden_act
activationlinear2activation_dropoutr/   r   r1   s     r2   r&   #ParakeetEncoderFeedForward.__init__[   s|    yy!3!3V5M5MTZTiTij !2!23yy!9!96;M;MTZTiTij"(";";r4   c                     U R                  U R                  U5      5      n[        R                  R	                  XR
                  U R                  S9nU R                  U5      nU$ )Nptraining)rl   rj   r   
functionaldropoutrn   rt   rm   )r/   r5   s     r2   rS   "ParakeetEncoderFeedForward.forwardb   sS    ](CD--m?V?Vaeanan-o]3r4   )rl   rn   rj   rm   	rV   rW   rX   rY   r   r&   rS   r^   r_   r`   s   @r2   rb   rb   Z   s    <4 < r4   rb   c                   >   ^  \ rS rSrSS\4U 4S jjjrSS jrSrU =r$ ) ParakeetEncoderConvolutionModulei   r   c           
        > [         TU ]  5         UR                  nUc&  UR                  n[        [        USS5         U l        O#US   n[        UR                  SS5         U l        US-
  S-  U l        [        R                  " USU-  SSSS	S
9U l        [        R                  " X3USU R                  US	S9U l        [        R                  " U5      U l        [        R                  " X3SSSS	S
9U l        g)z
Args:
    config (ParakeetEncoderConfig): Configuration for the model.
    module_config (dict): Configuration for the module (e.g., encoder or decoder).
Nrk   silukernel_sizerl   r   r    r   T)r~   stridepaddingrf   )r   r   groupsrf   )r%   r&   r*   conv_kernel_sizer	   getattrrl   getr   r   Conv1dpointwise_conv1depthwise_convBatchNorm1dnormpointwise_conv2)r/   r   module_configchannelsr~   r1   s        r2   r&   )ParakeetEncoderConvolutionModule.__init__j   s     	%%  11K$WV\6%JKDO'6K$]%6%6|V%LMDO#aA-!yy1x<QWXbcjno iiAt||T\cg
 NN8,	!yyST^_fjkr4   c                    UR                  SS5      nU R                  U5      n[        R                  R	                  USS9nUb(  [
        R                  " U) SS9nUR                  US5      nU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nUR                  SS5      $ )a  
Compute convolution module.

Args:
    hidden_states (`torch.Tensor` of shape `(batch, time, channels)`): Input tensor.
    attention_mask (`torch.Tensor` of shape `(batch, 1, time)`): Attention mask.

Returns:
    `torch.Tensor`: Output tensor of shape `(batch, time, channels)`.

r   r    r>   r8           )rH   r   r   ru   glur(   allmasked_fillr   r   rl   r   )r/   r5   attention_maskall_masked_rowss       r2   rS   (ParakeetEncoderConvolutionModule.forward   s     &//15 ,,];))-Q)? %#iiR@O)55osKM ++M:		-06,,];&&q!,,r4   )rl   r   r   r   r   r   rU   rx   r`   s   @r2   rz   rz   i   s"    l4 l l0- -r4   rz   r5   n_repreturnc                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r   N)rA   rC   rL   )r5   r   batchnum_key_value_headsslenhead_dims         r2   	repeat_kvr      s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr4   modulequerykeyvaluer   scalingrv   kwargsc                 @   [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub"  US S 2S S 2S S 2S UR
                  S   24   nX-   n
[        R                  R                  U
S[        R                  S9R                  UR                  5      n
[        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr    r   r@   r8   r?   r"   rr   r   )r   num_key_value_groupsr(   matmulrH   rA   r   ru   softmaxfloat32r,   r"   rv   rt   
contiguous)r   r   r   r   r   r   rv   r   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r2   eager_attention_forwardr      s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#1==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r4   c                      ^  \ rS rSrSrS\S\4U 4S jjr\" SSSS	9 SS
\	R                  S\\	R                     S\\	R                     S\\   S\\	R                  \	R                  4   4
S jj5       rS rSrU =r$ )ParakeetEncoderAttention   ztMulti-head attention with relative positional encoding. See section 3.3 of https://huggingface.co/papers/1901.02860.r   	layer_idxc                   > [         TU ]  5         Xl        X l        [	        USUR
                  UR                  -  5      U l        UR                  UR                  -  U l	        U R                  S-  U l
        UR                  U l        SU l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR                  U R                  -  UR
                  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  SS9U l        [        R*                  " [,        R.                  " UR                  U R                  5      5      U l        [        R*                  " [,        R.                  " UR                  U R                  5      5      U l        g )Nr   g      Fre   )r%   r&   r   r   r   r*   num_attention_headsr   r   r   r   attention_dropout	is_causalr   rg   ri   q_projk_projv_projo_projrelative_k_proj	Parameterr(   zerosbias_ubias_vr/   r   r   r1   s      r2   r&   !ParakeetEncoderAttention.__init__   s   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
  "yy););V=W=WZ^ZgZg=gnstll5;;v/I/I4==#YZll5;;v/I/I4==#YZr4   past_key_valuepast_key_valuesz4.58)new_nameversionr5   position_embeddingsr   r   r   c           
         UR                   S S nUu  pgXgSU R                  4nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      n[        nU R                  R                  S:w  a  [        U R                  R                     nXR                  R                  SU R                  R                  SU R                  5      -   nXR                  R                  SU R                  R                  SU R                  5      -   nU R                  U5      nUR                  USU R                  R                  U R                  5      nXR                  SSSS5      -  nU R!                  U5      nUSS U24   nUU R"                  -  nUb)  UR%                  UR'                  5       [)        S5      5      nU" U 4UU
UUU R*                  (       d  S	OU R,                  U R"                  S
.UD6u  nnUR.                  " / UQSP76 R1                  5       nU R3                  U5      nUU4$ )Nr8   r   r    eagerr   r   .z-infr   )r   r   r   r   rv   r   )rA   r   r   viewrH   r   r   r   r   _attn_implementationr   r   r   r   r   permute
_rel_shiftr   masked_fill_logical_notr-   rt   r   rL   r   r   )r/   r5   r   r   r   input_shape
batch_sizerM   hidden_shapequery_statesr   r   attention_interfacequery_states_with_bias_uquery_states_with_bias_vrelative_key_states	matrix_bdr   r   s                      r2   rS    ParakeetEncoderAttention.forward   sg    $))#2.!,
"DMMB{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST(?;;++w6"9$++:Z:Z"[#/++2B2Bt{{..4==3
 $
  $0++2B2Bt{{..4==3
 $
  #223FG166z2t{{GfGfhlhuhuv -/J/J1aQRTU/VV	OOI.	c;J;./	,	% "..~/I/I/KUSY][I %8	%
*$#}}C$2H2HLL	%
 	%
!\ "));;;;FFHkk+.L((r4   c                     UR                   u  p#pE[        R                  R                  USS9nUR	                  X#SU5      nUSS2SS2SS24   R	                  X#XE5      nU$ )ztRelative position shift for Shaw et al. style attention. See appendix B of https://huggingface.co/papers/1901.02860.)r   r   )padr8   Nr   )rA   r   ru   r   r   )r/   attention_scoresr   	num_headsquery_lengthposition_lengths         r2   r   #ParakeetEncoderAttention._rel_shift$  si    ?O?U?U<
|==,,-=6,J+00LY+Aq!"H5:::R^pr4   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   rU   )rV   rW   rX   rY   rZ   r   intr&   r   r(   r[   r   r   r   tuplerS   r   r^   r_   r`   s   @r2   r   r      s    ~[4 [ [: %0A6R
 26	7)||7) &ell37) !.	7)
 +,7) 
u||U\\)	*7) S7)r   r4   r   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jr
SS\R                  S\R                  4S	 jjrS
rU =r$ ) ParakeetEncoderSubsamplingConv2Di-  r   c                    > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        U R                  S-
  S-  U l        [        [        R                  " UR                  5      5      U l        [        R                  " 5       U l        U R                   R#                  [        R$                  " SU R                  U R                  U R
                  U R                  S95        U R                   R#                  [        R&                  " 5       5        [)        U R                  S-
  5       H  nU R                   R#                  [        R$                  " U R                  U R                  U R                  U R
                  U R                  U R                  S95        U R                   R#                  [        R$                  " U R                  U R                  SS95        U R                   R#                  [        R&                  " 5       5        M     UR*                  U R
                  U R                  -  -  n[        R,                  " UR                  U-  UR.                  SS9U l        g )Nr   r    )r~   r   r   )r~   r   r   r   r~   Tre   )r%   r&   subsampling_conv_kernel_sizer~   subsampling_conv_strider   subsampling_conv_channelsr   r   r   mathlog2subsampling_factor
num_layersr   
ModuleListlayersappendConv2dReLUrangenum_mel_binsrg   r*   linear)r/   r   i
out_lengthr1   s       r2   r&   )ParakeetEncoderSubsamplingConv2D.__init__.  s   !>>4488((1,2dii(A(ABC mmoIIaD4D4DT[[bfbnbno	
 	2779%t*+AKK		MMMM $ 0 0;; LL==	 KKryySTUVKKrwwy) ," ((T[[$//-IJ
ii @ @: MvOaOahlmr4   input_lengths
conv_layerc                     [        US5      (       aR  UR                  S:w  aB  UR                  nUR                  S   nUR                  S   nXS   -   US   -   U-
  U-  S-   nU$ U$ )Nr   )r   r   r   r   )hasattrr   r   r~   )r/   r   r   r   r~   r   output_lengthss          r2   _get_output_length3ParakeetEncoderSubsamplingConv2D._get_output_lengthQ  sy    :x((Z->->&-H ((G$003K&&q)F+aj871:ESX^^abbN!!r4   input_featuresr   c                     UR                  S5      nUb  UR                  S5      OS nU R                   H  nU" U5      n[        U[        R
                  5      (       d  M,  Uc  M1  U R                  XE5      nUR                  S   n[        R                  " XbR                  S9US S 2S 4   :  nX7S S 2S S S 2S 4   -  nM     UR                  SS5      R                  UR                  S   UR                  S   S5      nU R                  U5      nU$ )Nr   r8   r    r9   r   )	unsqueezesumr   rD   r   r   r   rA   r(   r)   r#   rH   rL   r   )r/   r  r   r5   current_lengthslayercurrent_seq_lengthchannel_masks           r2   rS   (ParakeetEncoderSubsamplingConv2D.forward\  s   &0034B4N.,,R0TX[[E!-0M %++0J"&"9"9/"Q%2%8%8%;"LL!3<Q<QRUdefhlelUmm  aq$.>!?? ! &//15==m>Q>QRS>TVcViVijkVlnpqM2r4   )r   r~   r   r   r   r   r   rU   )rV   rW   rX   rY   r   r&   r(   r[   r   r   r   rS   r^   r_   r`   s   @r2   r   r   -  sN    !n4 !nF	 	")) 	ell ELL  r4   r   c                      ^  \ rS rSrSS\S\\   4U 4S jjjr  SS\R                  S\\R                     S\\R                     S\
\   S	\R                  4
S
 jjrSrU =r$ )ParakeetEncoderBlockir  r   r   c                 "  > [         TU ]  5         SU l        [        U5      U l        [        X5      U l        [        U5      U l        [        U5      U l	        [        R                  " UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  5      U l        g NF)r%   r&   gradient_checkpointingrb   feed_forward1r   	self_attnrz   convfeed_forward2r   	LayerNormr*   norm_feed_forward1norm_self_att	norm_convnorm_feed_forward2norm_outr   s      r2   r&   ParakeetEncoderBlock.__init__s  s    &+#7?1&D4V<	7?"$,,v/A/A"B\\&*<*<=f&8&89"$,,v/A/A"BV%7%78r4   r5   r   r   r   r   c                 l   UnU R                  U R                  U5      5      nUSU-  -   nU R                  U5      nU R                  " SUUUS.UD6u  pxX-   nU R	                  U R                  U5      US9n	X-   nU R                  U R                  U5      5      n
USU
-  -   nU R                  U5      nU$ )Ng      ?)r5   r   r   )r    )	r  r  r  r  r  r  r  r  r  )r/   r5   r   r   r   residualnormalized_hidden_statesr   _conv_output
ff2_outputs              r2   rS   ParakeetEncoderBlock.forward  s     !**4+B+B=+QR 3#66#'#5#5m#D  
2) 3
 	
 &3ii} =ni]%3''(?(?(NO
%j(88m4r4   )
r  r  r  r  r  r  r  r  r  r  rU   NN)rV   rW   rX   rY   r   r   r   r&   r(   r[   r   r   rS   r^   r_   r`   s   @r2   r  r  r  s    94 9# 9 9$ 266:	|| !. &ell3	
 +, 
 r4   r  c                      ^  \ rS rSr% \\S'   SrSrSrS/r	Sr
SrSrSrSrSr\\S.rU 4S	 jrS
\R*                  4S jrSS\R*                  S\\   4S jjrSrU =r$ )ParakeetPreTrainedModeli  r   modelr  Tr  F)r5   
attentionsc                   > [         TU ]  U5        [        U R                  S5      (       a  U R                  R                  nO%[        U R                  R                  5       SS5      n[        U[        5      (       aI  UR                  R                  R                  SUS9  UR                  R                  R                  SUS9  g g )Ninitializer_rangeg{Gz?r   )meanstd)r%   _init_weightsr   r   r(  r   get_text_configrD   r   r   datanormal_r   )r/   r   r*  r1   s      r2   r+  %ParakeetPreTrainedModel._init_weights  s    f%4;; 344++//C $++5579LdSCf677MM&&CS&9MM&&CS&9 8r4   r   c                 "   [        U R                  [        5      (       a  U R                  R                  OU R                  nUR                  nUR
                  n[        [        R                  " UR                  5      5      nUS-
  S-  S-  nXc-
  nUn[        U5       HQ  n	[        R                  " UR                  [        R                  S9U-   U5      S-   n[        R                  " U5      nMS     UR                  [        R                  S9$ )Nr   r    r!   r   )rD   r   r   encoder_configr   r   r   r   r   r   r   r(   divr,   r-   floor)
r/   r   r1  r~   r   r   all_paddingsadd_padlengthsr  s
             r2   _get_subsampling_output_length6ParakeetPreTrainedModel._get_subsampling_output_length  s    7A$++O`7a7a33gkgrgr$AA77>#D#DEF
#aA-1,z"Aii


 = GPSVVGkk'*G # zz		z**r4   r   target_lengthc                     U R                  UR                  S5      5      nUb  UOUR                  5       n[        R                  " XAR
                  S9USS2S4   :  nU$ )z
Convert the input attention mask to its subsampled form. `target_length` sets the desired output length, useful
when the attention mask length differs from `sum(-1).max()` (i.e., when the longest sequence in the batch is padded)
r8   Nr9   )r7  r  maxr(   r)   r#   )r/   r   r9  r   
max_lengths        r2   _get_output_attention_mask2ParakeetPreTrainedModel._get_output_attention_mask  sa    
 <<^=O=OPR=ST&3&?]^EWEWEY
j9N9NOR`abdhahRiir4   r  rU   )rV   rW   rX   rY   r   r\   base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_flat_attention_mask_supports_sdpa_supports_flex_attn_supports_flash_attn_can_compile_fullgraph_supports_attention_backendr  r   _can_record_outputsr+  r(   r[   r7  r   r   r=  r^   r_   r`   s   @r2   r$  r$    s    &O&*#/0$(!N !!"&-.
:+ELL +"	 	V^_bVc 	 	r4   r$  z{
    The Parakeet Encoder model, based on the [Fast Conformer architecture](https://huggingface.co/papers/2305.05084).
    )custom_introc                      ^  \ rS rSr% \\S'   SrS\4U 4S jjr\\	" 5       \
 SS\R                  S\\R                     S\\   S\4S	 jj5       5       5       rS
rU =r$ )ParakeetEncoderi  r   encoderc           	        > [         TU ]  U5        Xl        SU l        UR                  U l        UR
                  U l        UR                  U l        UR                  (       a   [        R                  " UR                  5      OSU l        [        U5      U l        [        U5      U l        [         R"                  " [%        UR&                  5       Vs/ s H  n[)        X5      PM     sn5      U l        U R-                  5         g s  snf )NFr   )r%   r&   r   r  rv   dropout_positions	layerdropscale_inputr   sqrtr*   input_scaler   subsamplingr   encode_positionsr   r   r   num_hidden_layersr  r   	post_initr   s      r2   r&   ParakeetEncoder.__init__  s     &+#~~!'!9!9))<B<N<N499V%7%78TW;FC DV LmmFKFLdLdFefFe!&4Fef
 	 gs   Dr  r   r   r   c                    U R                  X5      nX@R                  -  nU R                  U5      n[        R                  R                  X@R
                  U R                  S9n[        R                  R                  XPR                  U R                  S9nUbp  U R                  X$R                  S   S9nUR                  S5      R                  SUR                  S   S5      nX"R                  SS5      -  nUR                  S5      nU R                   HR  nSnU R                  (       a'  [        R                  " / 5      nXR                   :  a  SnU(       a  MF  U" U4UUS.UD6nMT     [#        US	9$ )
al  
Example:

```python
>>> from transformers import AutoProcessor, ParakeetEncoder
>>> from datasets import load_dataset, Audio

>>> model_id = "nvidia/parakeet-ctc-1.1b"
>>> processor = AutoProcessor.from_pretrained(model_id)
>>> encoder = ParakeetEncoder.from_pretrained(model_id)

>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

>>> inputs = processor(ds[0]["audio"]["array"])
>>> encoder_outputs = encoder(**inputs)

>>> print(encoder_outputs.last_hidden_state.shape)
```
rr   r   r9  r8   r    FT)r   r   )last_hidden_state)rT  rS  rU  r   ru   rv   rt   rO  r=  rA   r  rC   rH   r   r(   randrP  r   )	r/   r  r   r   r5   r   encoder_layerto_dropdropout_probabilitys	            r2   rS   ParakeetEncoder.forward  sc   < ((H%(8(88"33MB--m||VZVcVc-d mm33#9#9DMM 4 
 %!<<^[n[nop[q<rN+55a8??MDWDWXYDZ\^_N+.F.Fq!.LLN+55a8N![[MG}}&+jjn#&7"G7 -!!#1(;! 	! )  ??r4   )	r   rv   rO  rU  r  rS  rP  r   rT  rU   )rV   rW   rX   rY   r   r\   r?  r&   r   r   r   r(   r[   r   r   r   r   rS   r^   r_   r`   s   @r2   rL  rL    s     "!!4 &  26:@:@ !.:@ +,	:@
 
:@   :@r4   rL  c                       \ rS rSr% Sr\R                  \S'   Sr\	\
\R                        \S'   Sr\	\
\
\R                           \S'   Sr\	\
\
\R                           \S'   Srg)	ParakeetGenerateOutputi=  a,  
Outputs of Parakeet models.

Args:
    sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
        if all batches finished early due to the `eos_token_id`.
    logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
        Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
        at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
        each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
    attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
        Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
        `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
    hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
        Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
        `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
	sequencesNlogitsr&  r5   r  )rV   rW   rX   rY   rZ   r(   
LongTensorr\   rd  r   r   FloatTensorr&  r5   r^   r  r4   r2   rb  rb  =  sm    & 15FHU5,,-.5<@JuU%6%6789@?CM8E%(9(9":;<Cr4   rb  zS
    Parakeet Encoder with a Connectionist Temporal Classification (CTC) head.
    c                   d  ^  \ rS rSr% \\S'   S\4U 4S jjr\\  SS\	R                  S\\	R                     S\\	R                     S\\   S\4
S	 jj5       5       r\	R                   " 5         SS\	R                  S\\	R                     S
\S\\   S\\\	R(                  4   4
S jj5       rSrU =r$ )ParakeetForCTCiX  r   c                    > [         TU ]  U5        [        UR                  5      U l        [
        R                  " UR                  R                  UR                  SS9U l	        U R                  5         g )Nr   r   )r%   r&   rL  r1  rM  r   r   r*   
vocab_sizectc_headrW  ro   s     r2   r&   ParakeetForCTC.__init__`  sS     &v'<'<=		&"7"7"C"CVEVEVdefr4   r  r   labelsr   r   c                    U R                   " SUUS.UD6nUR                  nU R                  UR                  SS5      5      R                  SS5      nSnUGbN  Ub  UO"[        R
                  " U[        R                  S9nU R                  UR                  S5      5      n	X0R                  R                  :g  n
U
R                  S5      nUR                  U
5      n[        R                  R                  US[        R                  S9R                  SS5      n[        R                   R"                  R%                  S	S
9   [        R                  R'                  UUU	UU R                  R                  U R                  R(                  U R                  R*                  S9nSSS5        [-        UUUR.                  UR0                  S9$ ! , (       d  f       N.= f)aV  
Example:

```python
>>> from transformers import AutoProcessor, ParakeetForCTC
>>> from datasets import load_dataset, Audio

>>> model_id = "nvidia/parakeet-ctc-1.1b"
>>> processor = AutoProcessor.from_pretrained(model_id)
>>> model = ParakeetForCTC.from_pretrained(model_id)

>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

>>> inputs = processor(ds[0]["audio"]["array"], text=ds[0]["text"])
>>> outputs = model(**inputs)

>>> print(outputs.loss)
```r  r   r   r    Nr!   r8   r   r   F)r=   )blank	reductionzero_infinity)lossrd  r5   r&  r  )rM  r[  rk  rH   r(   	ones_likelongr7  r  r   pad_token_idmasked_selectr   ru   log_softmaxr   backendscudnnflagsctc_lossctc_loss_reductionctc_zero_infinityr   r5   r&  )r/   r  r   rm  r   encoder_outputsr5   rd  rs  r   labels_masktarget_lengthsflattened_targets	log_probss                 r2   rS   ParakeetForCTC.forwardh  s   : ,, 
))
 
 (99}66q!<=GG1M #1"<%//R`hmhrhrBs  !??@R@RSU@VWM !KK$<$<<K(__R0N & 4 4[ A 11&b1V``abdefI%%++E+:}}--%!"++22"kk<<"&++"?"? .  ; )77&11	
 	
 ;:s   ?A G
Greturn_dict_in_generatec                 >   SUS'   U R                   " S	UUS.UD6nUR                  R                  SS9nUb5  U R                  X&R                  S   S9nU R
                  R                  Xb) '   U(       a*  [        UUR                  UR                  UR                  S9$ U$ )
a  
Example:

```python
>>> from transformers import AutoProcessor, ParakeetForCTC
>>> from datasets import load_dataset, Audio

>>> model_id = "nvidia/parakeet-ctc-1.1b"
>>> processor = AutoProcessor.from_pretrained(model_id)
>>> model = ParakeetForCTC.from_pretrained(model_id)

>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

>>> inputs = processor(ds[0]["audio"]["array"], text=ds[0]["text"])
>>> predicted_ids = model.generate(**inputs)
>>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

>>> print(transcription)
```
Treturn_dictro  r8   r>   r   rZ  )rc  rd  r&  r5   r  )
rS   rd  argmaxr=  rA   r   rv  rb  r&  r5   )r/   r  r   r  r   outputsrc  s          r2   generateParakeetForCTC.generate  s    : !%}"&,, #
))#
 #
 NN))b)1	 %!<<^[j[jkl[m<nN)-)A)AIo&")#~~"--%33	  r4   )rk  rM  r"  r  )rV   rW   rX   rY   r   r\   r&   r   r   r(   r[   r   r   r   r   rS   r]   boolr   rb  re  r  r^   r_   r`   s   @r2   rh  rh  X  s    0   26)-	E
E
 !.E
 &	E

 +,E
 
E
  E
N ]]_ 26(-	33 !.3 "&	3
 +,3 
%u'7'77	83 3r4   rh  )rh  rL  r$  )r   )2r   dataclassesr   typingr   r   r   r(   r   activationsr	   modeling_layersr
   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.genericr   configuration_parakeetr   r   Moduler   rb   rz   r[   r   r   r-   r   r   r   r  r$  rL  rb  rh  __all__r  r4   r2   <module>r     s  ,  ! , ,   ! 9 ? F & V V 0 / L/7299 /7d 8-ryy 8-v	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % % '(%4` ryy ` FBryy BJ,5 ,^ <o < <~ 
T@- T@
T@n D[ D D4 
H, H
HV Kr4   