
    cCih                     0   S r SSKrSSKJr  SSKJrJrJr  SSKrSSKJ	r	  SSK
Jr  SSKJr  SS	KJrJr  SS
KJrJr  SSKJr  SSKJrJrJrJr  SSKJr  SSKJr  SSKJ r J!r!  SSK"J#r#J$r$   " S S\	RJ                  5      r& " S S\	RJ                  5      r' " S S\5      r( " S S\ 5      r) " S S\	RJ                  5      r* " S S\5      r+\ " S S \5      5       r,\" S!S"9 " S# S$\,5      5       r-\ " S% S&\5      5       r.\" S'S"9 " S( S)\,5      5       r// S*Qr0g)+zPyTorch Parakeet model.    N)	dataclass)CallableOptionalUnion)nn   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputCausalLMOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuple)check_model_inputs   )%FastSpeech2ConformerConvolutionModule)LlamaAttentioneager_attention_forward   )ParakeetCTCConfigParakeetEncoderConfigc                      ^  \ rS rSr% Sr\R                  \S'   S	S\4U 4S jjjr	\R                  " 5       S\R                  4S j5       rSrU =r$ )
$ParakeetEncoderRelPositionalEncoding$   z*Relative positional encoding for Parakeet.inv_freqconfigc           	      &  > [         TU ]  5         UR                  U l        SnSU[        R                  " SUR
                  S[        R                  S9R                  U[        R                  S9UR
                  -  -  -  nU R                  SUSS	9  g )
Ng     @      ?r   r   dtype)devicer$   r   F)
persistent)
super__init__max_position_embeddingstorcharangehidden_sizeint64tofloatregister_buffer)selfr    r%   baser   	__class__s        g/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/parakeet/modular_parakeet.pyr(   -ParakeetEncoderRelPositionalEncoding.__init__)   s    '-'E'E$Q 2 2AU[[ILLTZbgbmbmLn$$%
 	ZeD    hidden_statesc                    UR                   S   nX R                  :  a  [        SU SU R                   S35      e[        R                  " US-
  U* SUR
                  S9nU R                  S S S 2S 4   R                  5       R                  UR                   S   SS5      R                  UR
                  5      nUS S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OS	n[        R                  " US
S9   UR                  5       UR                  5       -  R                  SS5      nUR                  5       nUR!                  5       n	[        R"                  " X/SS9n
U
R$                  " / U
R                   S S QSP76 n
S S S 5        W
R                  UR&                  S9$ ! , (       d  f       N'= f)Nr   zSequence Length: z= has to be less or equal than config.max_position_embeddings .r%   r   mpscpuF)device_typeenabledr   dimr#   )shaper)   
ValueErrorr*   r+   r%   r   r/   expandr.   
isinstancetypestrautocast	transposesincosstackreshaper$   )r1   r7   
seq_lengthposition_idsinv_freq_expandedposition_ids_expandedr>   freqsrK   rL   	pos_embeds              r4   forward,ParakeetEncoderRelPositionalEncoding.forward7   s   "((+
444#J< 02262N2N1OqR 
 ||JNZKML`L`aMM$4-(..0778K8KA8NPRTUVYYZgZnZno 	 !-T4] ; A A C -..33S99m>R>R>W>W[`>`   %% 	
 ^^UC&,,.1F1L1L1NNYYZ[]^_E))+C))+CSJB7I!))D9??3B+?DDI D ||-"5"5|66 DCs   B	G++
G9)r)   N)__name__
__module____qualname____firstlineno____doc__r*   Tensor__annotations__r   r(   no_gradrU   __static_attributes____classcell__r3   s   @r4   r   r   $   sJ    4llE4 E E ]]_7U\\ 7 7r6   r   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )ParakeetEncoderFeedForwardV   r    c                 X  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        UR                     U l
        [        R                  " UR
                  UR                  UR                  S9U l        UR                  U l        g )Nbias)r'   r(   r   Linearr,   intermediate_sizeattention_biaslinear1r	   
hidden_act
activationlinear2activation_dropoutr1   r    r3   s     r4   r(   #ParakeetEncoderFeedForward.__init__W   s|    yy!3!3V5M5MTZTiTij !2!23yy!9!96;M;MTZTiTij"(";";r6   c                     U R                  U R                  U5      5      n[        R                  R	                  XR
                  U R                  S9nU R                  U5      nU$ )Nptraining)rn   rl   r   
functionaldropoutrp   rv   ro   )r1   r7   s     r4   rU   "ParakeetEncoderFeedForward.forward^   sS    ](CD--m?V?Vaeanan-o]3r6   )rn   rp   rl   ro   )	rX   rY   rZ   r[   r   r(   rU   r`   ra   rb   s   @r4   rd   rd   V   s    <4 < r6   rd   c                   4   ^  \ rS rSrSS\4U 4S jjjrSrU =r$ ) ParakeetEncoderConvolutionModulee   r    c                 $   > [         TU ]  X5        g rW   )r'   r(   )r1   r    module_configr3   s      r4   r(   )ParakeetEncoderConvolutionModule.__init__f   s    /r6    rW   )rX   rY   rZ   r[   r   r(   r`   ra   rb   s   @r4   r{   r{   e   s    04 0 0r6   r{   c                      ^  \ rS rSrSrS\S\4U 4S jjr SS\R                  S\
\R                     S\
\R                     S	\\   S
\\R                  \R                  4   4
S jjrS rSrU =r$ )ParakeetEncoderAttentionj   ztMulti-head attention with relative positional encoding. See section 3.3 of https://huggingface.co/papers/1901.02860.r    	layer_idxc                   > [         TU ]  XS9  SU l        [        R                  " UR
                  UR                  U R                  -  SS9U l        [        R                  " [        R                  " UR                  U R                  5      5      U l        [        R                  " [        R                  " UR                  U R                  5      5      U l        g )N)r   Frg   )r'   r(   	is_causalr   ri   r,   num_attention_headshead_dimrelative_k_proj	Parameterr*   zerosbias_ubias_vr1   r    r   r3   s      r4   r(   !ParakeetEncoderAttention.__init__m   s    5!yy););V=W=WZ^ZgZg=gnstll5;;v/I/I4==#YZll5;;v/I/I4==#YZr6   r7   position_embeddingsattention_maskkwargsreturnc           
         UR                   S S nUu  pgXgSU R                  4nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      n[        nU R                  R                  S:w  a  [        U R                  R                     nXR                  R                  SU R                  R                  SU R                  5      -   nXR                  R                  SU R                  R                  SU R                  5      -   nU R                  U5      nUR                  USU R                  R                  U R                  5      nXR                  SSSS5      -  nU R!                  U5      nUSS U24   nUU R"                  -  nUb)  UR%                  UR'                  5       [)        S5      5      nU" U 4UU
UUU R*                  (       d  S	OU R,                  U R"                  S
.UD6u  nnUR.                  " / UQSP76 R1                  5       nU R3                  U5      nUU4$ )Nr:   r   r   eagerr   r   .z-inf        )querykeyvaluer   rx   scaling)rC   r   q_projviewrJ   k_projv_projr   r    _attn_implementationr   r   r   r   r   permute
_rel_shiftr   masked_fill_logical_notr/   rv   attention_dropoutrN   
contiguouso_proj)r1   r7   r   r   r   input_shape
batch_sizerO   hidden_shapequery_states
key_statesvalue_statesattention_interfacequery_states_with_bias_uquery_states_with_bias_vrelative_key_states	matrix_bdattn_outputattn_weightss                      r4   rU    ParakeetEncoderAttention.forwardw   sg    $))#2.!,
"DMMB{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST(?;;++w6"9$++:Z:Z"[#/++2B2Bt{{..4==3
 $
  $0++2B2Bt{{..4==3
 $
  #223FG166z2t{{GfGfhlhuhuv -/J/J1aQRTU/VV	OOI.	c;J;./	,	% "..~/I/I/KUSY][I %8	%
*$#}}C$2H2HLL	%
 	%
!\ "));;;;FFHkk+.L((r6   c                     UR                   u  p#pE[        R                  R                  USS9nUR	                  X#SU5      nUSS2SS2SS24   R	                  X#XE5      nU$ )ztRelative position shift for Shaw et al. style attention. See appendix B of https://huggingface.co/papers/1901.02860.)r   r   )padr:   Nr   )rC   r   rw   r   r   )r1   attention_scoresr   	num_headsquery_lengthposition_lengths         r4   r   #ParakeetEncoderAttention._rel_shift   si    ?O?U?U<
|==,,-=6,J+00LY+Aq!"H5:::R^pr6   )r   r   r   r   rW   )rX   rY   rZ   r[   r\   r   intr(   r*   r]   r   r   r   tuplerU   r   r`   ra   rb   s   @r4   r   r   j   s    ~[4 [ [ 26	7)||7) &ell37) !.	7)
 +,7) 
u||U\\)	*7)r   r6   r   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jr
SS\R                  S\R                  4S	 jjrS
rU =r$ ) ParakeetEncoderSubsamplingConv2D   r    c                    > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        U R                  S-
  S-  U l        [        [        R                  " UR                  5      5      U l        [        R                  " 5       U l        U R                   R#                  [        R$                  " SU R                  U R                  U R
                  U R                  S95        U R                   R#                  [        R&                  " 5       5        [)        U R                  S-
  5       H  nU R                   R#                  [        R$                  " U R                  U R                  U R                  U R
                  U R                  U R                  S95        U R                   R#                  [        R$                  " U R                  U R                  SS95        U R                   R#                  [        R&                  " 5       5        M     UR*                  U R
                  U R                  -  -  n[        R,                  " UR                  U-  UR.                  SS9U l        g )Nr   r   )kernel_sizestridepadding)r   r   r   groupsr   Trg   )r'   r(   subsampling_conv_kernel_sizer   subsampling_conv_strider   subsampling_conv_channelschannelsr   r   mathlog2subsampling_factor
num_layersr   
ModuleListlayersappendConv2dReLUrangenum_mel_binsri   r,   linear)r1   r    i
out_lengthr3   s       r4   r(   )ParakeetEncoderSubsamplingConv2D.__init__   s   !>>4488((1,2dii(A(ABC mmoIIaD4D4DT[[bfbnbno	
 	2779%t*+AKK		MMMM $ 0 0;; LL==	 KKryySTUVKKrwwy) ," ((T[[$//-IJ
ii @ @: MvOaOahlmr6   input_lengths
conv_layerc                     [        US5      (       aR  UR                  S:w  aB  UR                  nUR                  S   nUR                  S   nXS   -   US   -   U-
  U-  S-   nU$ U$ )Nr   )r   r   r   r   )hasattrr   r   r   )r1   r   r   r   r   r   output_lengthss          r4   _get_output_length3ParakeetEncoderSubsamplingConv2D._get_output_length   sy    :x((Z->->&-H ((G$003K&&q)F+aj871:ESX^^abbN!!r6   input_featuresr   c                     UR                  S5      nUb  UR                  S5      OS nU R                   H  nU" U5      n[        U[        R
                  5      (       d  M,  Uc  M1  U R                  XE5      nUR                  S   n[        R                  " XbR                  S9US S 2S 4   :  nX7S S 2S S S 2S 4   -  nM     UR                  SS5      R                  UR                  S   UR                  S   S5      nU R                  U5      nU$ )Nr   r:   r   r;   r   )	unsqueezesumr   rF   r   r   r   rC   r*   r+   r%   rJ   rN   r   )r1   r   r   r7   current_lengthslayercurrent_seq_lengthchannel_masks           r4   rU   (ParakeetEncoderSubsamplingConv2D.forward   s   &0034B4N.,,R0TX[[E!-0M %++0J"&"9"9/"Q%2%8%8%;"LL!3<Q<QRUdefhlelUmm  aq$.>!?? ! &//15==m>Q>QRS>TVcViVijkVlnpqM2r6   )r   r   r   r   r   r   r   rW   )rX   rY   rZ   r[   r   r(   r*   r]   r   r   r   rU   r`   ra   rb   s   @r4   r   r      sN    !n4 !nF	 	")) 	ell ELL  r6   r   c                      ^  \ rS rSrSS\S\\   4U 4S jjjr  SS\R                  S\\R                     S\\R                     S\
\   S	\R                  4
S
 jjrSrU =r$ )ParakeetEncoderBlock   r    r   c                 "  > [         TU ]  5         SU l        [        U5      U l        [        X5      U l        [        U5      U l        [        U5      U l	        [        R                  " UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  5      U l        g NF)r'   r(   gradient_checkpointingrd   feed_forward1r   	self_attnr{   convfeed_forward2r   	LayerNormr,   norm_feed_forward1norm_self_att	norm_convnorm_feed_forward2norm_outr   s      r4   r(   ParakeetEncoderBlock.__init__   s    &+#7?1&D4V<	7?"$,,v/A/A"B\\&*<*<=f&8&89"$,,v/A/A"BV%7%78r6   r7   r   r   r   r   c                 l   UnU R                  U R                  U5      5      nUSU-  -   nU R                  U5      nU R                  " SUUUS.UD6u  pxX-   nU R	                  U R                  U5      US9n	X-   nU R                  U R                  U5      5      n
USU
-  -   nU R                  U5      nU$ )Ng      ?)r7   r   r   )r   r   )	r   r   r   r   r   r   r   r   r   )r1   r7   r   r   r   residualnormalized_hidden_statesr   _conv_output
ff2_outputs              r4   rU   ParakeetEncoderBlock.forward  s     !**4+B+B=+QR 3#66#'#5#5m#D  
2) 3
 	
 &3ii} =ni]%3''(?(?(NO
%j(88m4r6   )
r   r   r   r   r   r   r   r   r   r   rW   NN)rX   rY   rZ   r[   r   r   r   r(   r*   r]   r   r   rU   r`   ra   rb   s   @r4   r   r      s    94 9# 9 9$ 266:	|| !. &ell3	
 +, 
 r6   r   c                      ^  \ rS rSr% \\S'   SrSrSrS/r	Sr
SrSrSrSrSr\\S.rU 4S	 jrS
\R*                  4S jrSS\R*                  S\\   4S jjrSrU =r$ )ParakeetPreTrainedModeli-  r    modelr   Tr   F)r7   
attentionsc                   > [         TU ]  U5        [        U R                  S5      (       a  U R                  R                  nO%[        U R                  R                  5       SS5      n[        U[        5      (       aI  UR                  R                  R                  SUS9  UR                  R                  R                  SUS9  g g )Ninitializer_rangeg{Gz?r   )meanstd)r'   _init_weightsr   r    r  getattrget_text_configrF   r   r   datanormal_r   )r1   moduler
  r3   s      r4   r  %ParakeetPreTrainedModel._init_weightsB  s    f%4;; 344++//C $++5579LdSCf677MM&&CS&9MM&&CS&9 8r6   r   c                 "   [        U R                  [        5      (       a  U R                  R                  OU R                  nUR                  nUR
                  n[        [        R                  " UR                  5      5      nUS-
  S-  S-  nXc-
  nUn[        U5       HQ  n	[        R                  " UR                  [        R                  S9U-   U5      S-   n[        R                  " U5      nMS     UR                  [        R                  S9$ )Nr   r   r#   r"   )rF   r    r   encoder_configr   r   r   r   r   r   r   r*   divr.   r/   floor)
r1   r   r  r   r   r   all_paddingsadd_padlengthsr   s
             r4   _get_subsampling_output_length6ParakeetPreTrainedModel._get_subsampling_output_lengthP  s    7A$++O`7a7a33gkgrgr$AA77>#D#DEF
#aA-1,z"Aii


 = GPSVVGkk'*G # zz		z**r6   r   target_lengthc                     U R                  UR                  S5      5      nUb  UOUR                  5       n[        R                  " XAR
                  S9USS2S4   :  nU$ )z
Convert the input attention mask to its subsampled form. `target_length` sets the desired output length, useful
when the attention mask length differs from `sum(-1).max()` (i.e., when the longest sequence in the batch is padded)
r:   Nr;   )r  r   maxr*   r+   r%   )r1   r   r  r   
max_lengths        r4   _get_output_attention_mask2ParakeetPreTrainedModel._get_output_attention_maska  sa    
 <<^=O=OPR=ST&3&?]^EWEWEY
j9N9NOR`abdhahRiir6   r   rW   )rX   rY   rZ   r[   r   r^   base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_flat_attention_mask_supports_sdpa_supports_flex_attn_supports_flash_attn_can_compile_fullgraph_supports_attention_backendr   r   _can_record_outputsr  r*   r]   r  r   r   r  r`   ra   rb   s   @r4   r  r  -  s    &O&*#/0$(!N !!"&-.
:+ELL +"	 	V^_bVc 	 	r6   r  z{
    The Parakeet Encoder model, based on the [Fast Conformer architecture](https://huggingface.co/papers/2305.05084).
    )custom_introc                      ^  \ rS rSr% \\S'   SrS\4U 4S jjr\\	" 5       \
 SS\R                  S\\R                     S\\   S\4S	 jj5       5       5       rS
rU =r$ )ParakeetEncoderim  r    encoderc           	        > [         TU ]  U5        Xl        SU l        UR                  U l        UR
                  U l        UR                  U l        UR                  (       a   [        R                  " UR                  5      OSU l        [        U5      U l        [        U5      U l        [         R"                  " [%        UR&                  5       Vs/ s H  n[)        X5      PM     sn5      U l        U R-                  5         g s  snf )NFr"   )r'   r(   r    r   rx   dropout_positions	layerdropscale_inputr   sqrtr,   input_scaler   subsamplingr   encode_positionsr   r   r   num_hidden_layersr   r   	post_initr   s      r4   r(   ParakeetEncoder.__init__v  s     &+#~~!'!9!9))<B<N<N499V%7%78TW;FC DV LmmFKFLdLdFefFe!&4Fef
 	 gs   Dr   r   r   r   c                    U R                  X5      nX@R                  -  nU R                  U5      n[        R                  R                  X@R
                  U R                  S9n[        R                  R                  XPR                  U R                  S9nUbp  U R                  X$R                  S   S9nUR                  S5      R                  SUR                  S   S5      nX"R                  SS5      -  nUR                  S5      nU R                   HR  nSnU R                  (       a'  [        R                  " / 5      nXR                   :  a  SnU(       a  MF  U" U4UUS.UD6nMT     [#        US	9$ )
al  
Example:

```python
>>> from transformers import AutoProcessor, ParakeetEncoder
>>> from datasets import load_dataset, Audio

>>> model_id = "nvidia/parakeet-ctc-1.1b"
>>> processor = AutoProcessor.from_pretrained(model_id)
>>> encoder = ParakeetEncoder.from_pretrained(model_id)

>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

>>> inputs = processor(ds[0]["audio"]["array"])
>>> encoder_outputs = encoder(**inputs)

>>> print(encoder_outputs.last_hidden_state.shape)
```
rt   r   r  r:   r   FT)r   r   )last_hidden_state)r6  r5  r7  r   rw   rx   rv   r1  r  rC   r   rE   rJ   r   r*   randr2  r   )	r1   r   r   r   r7   r   encoder_layerto_dropdropout_probabilitys	            r4   rU   ParakeetEncoder.forward  sc   < ((H%(8(88"33MB--m||VZVcVc-d mm33#9#9DMM 4 
 %!<<^[n[nop[q<rN+55a8??MDWDWXYDZ\^_N+.F.Fq!.LLN+55a8N![[MG}}&+jjn#&7"G7 -!!#1(;! 	! )  ??r6   )	r    rx   r1  r7  r   r5  r2  r   r6  rW   )rX   rY   rZ   r[   r   r^   r!  r(   r   r   r   r*   r]   r   r   r   r   rU   r`   ra   rb   s   @r4   r.  r.  m  s     "!!4 &  26:@:@ !.:@ +,	:@
 
:@   :@r6   r.  c                       \ rS rSr% Sr\R                  \S'   Sr\	\
\R                        \S'   Sr\	\
\
\R                           \S'   Sr\	\
\
\R                           \S'   Srg)	ParakeetGenerateOutputi  a,  
Outputs of Parakeet models.

Args:
    sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
        if all batches finished early due to the `eos_token_id`.
    logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
        Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
        at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
        each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
    attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
        Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
        `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
    hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
        Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
        `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
	sequencesNlogitsr  r7   r   )rX   rY   rZ   r[   r\   r*   
LongTensorr^   rF  r   r   FloatTensorr  r7   r`   r   r6   r4   rD  rD    sm    & 15FHU5,,-.5<@JuU%6%6789@?CM8E%(9(9":;<Cr6   rD  zS
    Parakeet Encoder with a Connectionist Temporal Classification (CTC) head.
    c                   d  ^  \ rS rSr% \\S'   S\4U 4S jjr\\  SS\	R                  S\\	R                     S\\	R                     S\\   S\4
S	 jj5       5       r\	R                   " 5         SS\	R                  S\\	R                     S
\S\\   S\\\	R(                  4   4
S jj5       rSrU =r$ )ParakeetForCTCi  r    c                    > [         TU ]  U5        [        UR                  5      U l        [
        R                  " UR                  R                  UR                  SS9U l	        U R                  5         g )Nr   r   )r'   r(   r.  r  r/  r   Conv1dr,   
vocab_sizectc_headr9  rq   s     r4   r(   ParakeetForCTC.__init__  sS     &v'<'<=		&"7"7"C"CVEVEVdefr6   r   r   labelsr   r   c                    U R                   " SUUS.UD6nUR                  nU R                  UR                  SS5      5      R                  SS5      nSnUGbN  Ub  UO"[        R
                  " U[        R                  S9nU R                  UR                  S5      5      n	X0R                  R                  :g  n
U
R                  S5      nUR                  U
5      n[        R                  R                  US[        R                  S9R                  SS5      n[        R                   R"                  R%                  S	S
9   [        R                  R'                  UUU	UU R                  R                  U R                  R(                  U R                  R*                  S9nSSS5        [-        UUUR.                  UR0                  S9$ ! , (       d  f       N.= f)aV  
Example:

```python
>>> from transformers import AutoProcessor, ParakeetForCTC
>>> from datasets import load_dataset, Audio

>>> model_id = "nvidia/parakeet-ctc-1.1b"
>>> processor = AutoProcessor.from_pretrained(model_id)
>>> model = ParakeetForCTC.from_pretrained(model_id)

>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

>>> inputs = processor(ds[0]["audio"]["array"], text=ds[0]["text"])
>>> outputs = model(**inputs)

>>> print(outputs.loss)
```r   r   r   r   Nr#   r:   )rA   r$   r   F)r?   )blank	reductionzero_infinity)lossrF  r7   r  r   )r/  r=  rN  rJ   r*   	ones_likelongr  r   r    pad_token_idmasked_selectr   rw   log_softmaxfloat32backendscudnnflagsctc_lossctc_loss_reductionctc_zero_infinityr   r7   r  )r1   r   r   rP  r   encoder_outputsr7   rF  rV  r   labels_masktarget_lengthsflattened_targets	log_probss                 r4   rU   ParakeetForCTC.forward  s   : ,, 
))
 
 (99}66q!<=GG1M #1"<%//R`hmhrhrBs  !??@R@RSU@VWM !KK$<$<<K(__R0N & 4 4[ A 11&b1V``abdefI%%++E+:}}--%!"++22"kk<<"&++"?"? .  ; )77&11	
 	
 ;:s   ?A G
Greturn_dict_in_generatec                 >   SUS'   U R                   " S	UUS.UD6nUR                  R                  SS9nUb5  U R                  X&R                  S   S9nU R
                  R                  Xb) '   U(       a*  [        UUR                  UR                  UR                  S9$ U$ )
a  
Example:

```python
>>> from transformers import AutoProcessor, ParakeetForCTC
>>> from datasets import load_dataset, Audio

>>> model_id = "nvidia/parakeet-ctc-1.1b"
>>> processor = AutoProcessor.from_pretrained(model_id)
>>> model = ParakeetForCTC.from_pretrained(model_id)

>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

>>> inputs = processor(ds[0]["audio"]["array"], text=ds[0]["text"])
>>> predicted_ids = model.generate(**inputs)
>>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

>>> print(transcription)
```
Treturn_dictrR  r:   r@   r   r<  )rE  rF  r  r7   r   )
rU   rF  argmaxr  rC   r    rY  rD  r  r7   )r1   r   r   ri  r   outputsrE  s          r4   generateParakeetForCTC.generate=  s    : !%}"&,, #
))#
 #
 NN))b)1	 %!<<^[j[jkl[m<nN)-)A)AIo&")#~~"--%33	  r6   )rN  r/  r  r   )rX   rY   rZ   r[   r   r^   r(   r   r   r*   r]   r   r   r   r   rU   r_   boolr   rD  rG  rn  r`   ra   rb   s   @r4   rJ  rJ    s    0   26)-	E
E
 !.E
 &	E

 +,E
 
E
  E
N ]]_ 26(-	33 !.3 "&	3
 +,3 
%u'7'77	83 3r6   rJ  )rJ  r.  r  )1r\   r   dataclassesr   typingr   r   r   r*   r   activationsr	   modeling_layersr
   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   4fastspeech2_conformer.modeling_fastspeech2_conformerr   llama.modeling_llamar   r   configuration_parakeetr   r   Moduler   rd   r{   r   r   r   r  r.  rD  rJ  __all__r   r6   r4   <module>r     s4     ! , ,   ! 9 ? F & V V / h J L/7299 /7d 0'L 0
L ~ L ^Bryy BJ,5 ,^ <o < <~ 
T@- T@
T@n D[ D D4 
H, H
HV Kr6   