
    cCi                        S SK JrJrJr  S SKrS SKrS SKJr  S SK	J
r
Jr  SSKJr  SSKJrJrJr  SSKJr  SSKJr  SS	KJrJr  SS
KJr  SSKJr  SSKJrJrJ r J!r!J"r"  SSK#J$r$J%r%  SSK&J'r'J(r(  SSK)J*r*  SSK+J,r,J-r-J.r.  SSK/J0r0  SSK1J2r2   " S S\Rf                  5      r4 " S S\Rf                  5      r5S\Rl                  S\7S\Rl                  4S jr8 SES\Rf                  S\Rl                  S\Rl                  S\Rl                  S \\Rl                     S!\9S"\9S#\*\,   4S$ jjr:S% r;SFS& jr< " S' S(\Rf                  5      r= " S) S*\Rf                  5      r> " S+ S,\5      r? " S- S.\5      r@\- " S/ S0\(5      5       rA " S1 S2\A5      rB\- " S3 S4\A5      5       rC  SGS5\D\7\74   S6\9S7\7S \\R                     S8\7S\R                  4S9 jjrG\- " S: S;\A5      5       rHS<\Rl                  S=\7S>\74S? jrI\-" S@SA9 " SB SC\A\5      5       rJ/ SDQrKg)H    )CallableOptionalUnionN)OutputRecordercheck_model_inputs   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_causal_mask)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPast)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)deprecate_kwarg   )MoonshineConfigc                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )MoonshineEncoderMLP3   c                 
  > [         TU ]  5         Xl        [        U   U l        [
        R                  " UR                  UR                  5      U l	        [
        R                  " UR                  UR                  5      U l
        g Nsuper__init__configr	   activation_fnnnLinearhidden_sizeintermediate_sizefc1fc2selfr+   
hidden_act	__class__s      j/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/moonshine/modeling_moonshine.pyr*   MoonshineEncoderMLP.__init__4   s\    #J/99V//1I1IJ99V55v7I7IJ    hidden_statesreturnc                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r'   )r1   r,   r2   )r4   r:   s     r7   forwardMoonshineEncoderMLP.forward;   s4    /**=9/r9   r,   r+   r1   r2   
__name__
__module____qualname____firstlineno__r*   torchTensorr=   __static_attributes____classcell__r6   s   @r7   r$   r$   3   s)    KU\\ ell  r9   r$   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )MoonshineDecoderMLPB   c                   > [         TU ]  5         Xl        [        U   U l        [
        R                  " UR                  UR                  S-  5      U l	        [
        R                  " UR                  UR                  5      U l
        g )N   r(   r3   s      r7   r*   MoonshineDecoderMLP.__init__C   sa    #J/99V//1I1IA1MN99V55v7I7IJr9   r:   r;   c                     U R                  U5      nUR                  SSS9u  pU R                  U5      U-  nU R                  U5      nU$ )NrN   dim)r1   chunkr,   r2   )r4   r:   gates      r7   r=   MoonshineDecoderMLP.forwardJ   sQ    /+11!1<**40=@/r9   r?   r@   rI   s   @r7   rK   rK   B   s)    KU\\ ell  r9   rK   r:   n_repr;   c                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r!   N)shapeexpandreshape)r:   rW   batchnum_key_value_headsslenhead_dims         r7   	repeat_kvr`   R   s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr9   modulequerykeyvalueattention_maskscalingdropoutkwargsc                 @   [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub"  US S 2S S 2S S 2S UR
                  S   24   nX-   n
[        R                  R                  U
S[        R                  S9R                  UR                  5      n
[        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )NrN   r   rQ   )rS   dtype)ptrainingr!   )r`   num_key_value_groupsrE   matmul	transposerY   r-   
functionalsoftmaxfloat32tork   rg   rm   
contiguous)ra   rb   rc   rd   re   rf   rg   rh   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r7   eager_attention_forwardr{   ^   s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#1==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r9   c                 x    U SSSS24   nU SSSS24   n[         R                  " U* U4SS9R                  S5      $ )	z*Rotates half the hidden dims of the input..r   NrN   r!   rQ   rR   rj   )rE   stackflatten)xx1x2s      r7   rotate_halfr   x   sJ    	
319B	
319B;;Ryb)11"55r9   c                    UR                  U5      nUR                  U5      nUSSUR                  S   S-  24   R                  SSS9nUSSUR                  S   S-  24   R                  SSS9nUR                  S   nU SSU24   U SUS24   pUSSU24   USUS24   pXr-  [        U5      U-  -   nX-  [        U	5      U-  -   n[        R
                  " X/SS9n[        R
                  " X/SS9nX4$ )a  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
.NrQ   rN   rR   )	unsqueezerY   repeat_interleaver   rE   cat)qkcossinposition_idsunsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embeds                r7   apply_rotary_pos_embr      s6   ( --
&C
--
&C c'SYYr]a'''
(
:
:1"
:
EC
c'SYYr]a'''
(
:
:1"
:
EC 2Jc;J;&'3
+;)<6c;J;&'3
+;)<6 {{51C78G{{51C78G ii)r2Gii)r2Gr9   c                     ^  \ rS rSrSrS\S\S\S\S\4
U 4S jjr\	" S	S
SS9     SS\
R                  S\\\
R                  \
R                  4      S\\
R                     S
\\   S\\
R                     S\\
R                     S\\   S\\
R                  \\
R                     \\\
R                        4   4S jj5       rSrU =r$ )MoonshineAttention   z=Multi-headed attention from 'Attention Is All You Need' paperr+   	layer_idx	is_causalnum_attention_headsr]   c                   > [         TU ]  5         UR                  XES.5        Xl        X l        [        USUR                  UR                  -  5      U l        UR                  UR                  -  U l
        U R                  S-  U l        UR                  U l        X0l        [        R                  " UR                  UR                  U R                  -  UR                   S9U l        [        R                  " UR                  UR                  U R                  -  UR                   S9U l        [        R                  " UR                  UR                  U R                  -  UR                   S9U l        [        R                  " UR                  U R                  -  UR                  SS9U l        U R                  R*                  bA  U R                  R*                  nX`R                  U-   S-
  U-  -  nXpR                  -
  U l        g SU l        g )N)r   r]   r_   g      ࿩biasFr!   r   )r)   r*   updater+   r   getattrr/   r   r_   r]   rn   rf   attention_dropoutr   r-   r.   attention_biasq_projk_projv_projo_projpad_head_dim_to_multiple_ofhead_dim_padding)	r4   r+   r   r   r   r]   target_multipletarget_head_dimr6   s	           r7   r*   MoonshineAttention.__init__   s    	.Ano"
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9"ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JFL^L^ejk ;;22>"kkEEO---/2QTU2UZi1ijO$3mm$CD!$%D!r9   past_key_valuepast_key_values4.58new_nameversionr:   position_embeddingsre   cache_positionkey_value_statesrh   r;   c                    UR                   S S u  pU R                  U5      R                  XU R                  R                  U R
                  5      R                  SS5      n
US LnUb^  UR                  R                  U R                  5      nU(       a&  SUR                  U R                  '   UR                  nOUR                  nUb  UOUnU(       aU  U(       aN  W(       aG  UR                  U R                     R                  nUR                  U R                     R                  nOU R                  U5      R                  USU R                  R                  U R
                  5      R                  SS5      nU R!                  U5      R                  USU R                  R                  U R
                  5      R                  SS5      nU(       a$  Ub!  UR#                  XU R                  SU05      u  pU(       d<  Uu  nn[%        XUU5      u  pUb%  UUUS.nUR#                  XU R                  U5      u  p[&        nU R                  R(                  S:w  a  [*        U R                  R(                     nU R,                  =(       a    US L =(       a    U	S:  nU R.                  S:  a  [0        R2                  R4                  R7                  U
SU R.                  45      n
[0        R2                  R4                  R7                  USU R.                  45      n[0        R2                  R4                  R7                  USU R.                  45      nU" U U
UUU4U R8                  (       d  S	OU R:                  U R<                  US
.UD6u  nnU R.                  S:  a  USS U R.                  * 24   nUR?                  XS5      RA                  5       nU RC                  U5      nUU4$ )NrQ   r!   rN   Tr   )r   r   r   eagerr           )rg   rf   r   .)"rY   r   viewr+   r]   r_   rp   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesr   r   r   r   r{   _attn_implementationr   r   r   rE   r-   rq   padrm   r   rf   r[   ru   r   )r4   r:   r   re   r   r   r   rh   bszq_lenquery_statesis_cross_attentionr   current_statesrv   rw   r   r   cache_kwargsattention_interfacer   rz   rx   s                          r7   r=   MoonshineAttention.forward   sq    #(("-
 KK&++C8W8WY]YfYfgqqrsuvw 	 .T9&(3377GJ!=A**4>>:"1"G"G"1"F"F .>-I)}/j(//?DDJ*11$..AHHL N+c2t{{>>N1a  N+c2t{{>>N1a 
 "o&A+:+A+Adnn?OQ_>`,(
 "*HC';LVY[^'_$L*'*3.Y+:+A+Adnnl,(
 )@;;++w6"9$++:Z:Z"[NNK~'=K%!)	  1$ 88..22<!TEZEZA[\L,,00aAVAV=WXJ 88..22<!TEZEZA[\L$7
%
  $}}C$2H2HLL
%
 
%
!\   1$%c+Cd.C.C-C+C&CDK!))#b9DDFkk+.L((r9   )r   r+   r_   r   r   r   r   rn   r   r   rf   r   )NNNNN)rA   rB   rC   rD   __doc__r"   intboolr*   r    rE   rF   r   tupler
   
LongTensorr   r   r=   rG   rH   rI   s   @r7   r   r      s0   G#&#& #& 	#&
 !#& !#&J %0A6R LP15+/5937U)||U) &eELL%,,,F&GHU) !.	U)
 "%U) !!1!12U) #5<<0U) -.U) 
u||Xell3XeELL>Q5RR	SU) SU)r9   r   c                      ^  \ rS rSr% \R
                  \S'   SS\4U 4S jjjr\R                  " 5       \
S 5       5       rSrU =r$ )MoonshineRotaryEmbeddingi*  inv_freqr+   c                   > [         TU ]  5         [        US5      (       aZ  [        UR                  [
        5      (       a;  UR                  R                  SUR                  R                  S5      5      U l        OSU l        UR                  U l	        UR                  U l
        Xl        [        U R                     U l        U R                  U R                  U5      u  o0l        U R                  SUSS9  U R                   U l        g )Nrope_scaling	rope_typetypedefaultr   F)
persistent)r)   r*   hasattr
isinstancer   dictr   r   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr+   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)r4   r+   devicer   r6   s       r7   r*   !MoonshineRotaryEmbedding.__init__-  s    6>**z&:M:Mt/T/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q((ZeD!%r9   c                 b   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        R                  " USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   rQ   r!   mpscpuF)device_typeenabledrN   rR   rk   )r   floatrZ   rY   rt   r   r   r   strrE   autocastrp   r   r   r   r   rk   )
r4   r   r   inv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r7   r=    MoonshineRotaryEmbedding.forward>  sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfk^^UC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   $BF  
F.)r   r+   r   r   r   r   r   r'   )rA   rB   rC   rD   rE   rF   __annotations__r"   r*   no_gradr   r=   rG   rH   rI   s   @r7   r   r   *  s@    ll/ / /" ]]_<  <r9   r   c                   H  ^  \ rS rSrS\S\4U 4S jjr\" SSSS9      SS	\R                  S
\
\R                     S\
\R                     S\
\   S\
\   S\
\R                     S\
\\R                  \R                  4      S\\   S\R                  4S jj5       rSrU =r$ )MoonshineEncoderLayeriN  r+   r   c                 T  > [         TU ]  5         UR                  U l        [        UUSUR                  UR
                  S9U l        [        XR                  5      U l	        [        R                  " UR                  SS9U l        [        R                  " UR                  SS9U l        g )NFr+   r   r   r   r]   r   )r)   r*   r/   r   encoder_num_attention_headsencoder_num_key_value_heads	self_attnr$   encoder_hidden_actmlpr-   	LayerNorminput_layernormpost_attention_layernormr4   r+   r   r6   s      r7   r*   MoonshineEncoderLayer.__init__O  s    !--+ & B B & B B
 'v/H/HI!||F,>,>UK(*V5G5Ge(T%r9   r   r   r   r   r:   re   r   	use_cacher   r   rh   r;   c                     Un	U R                  U5      nU R                  " SUUUUUUUS.UD6u  pX-   nUn	U R                  U5      nU R                  U5      nX-   nU$ )Nr:   re   r   r   r  r   r    )r   r   r   r   )r4   r:   re   r   r   r  r   r   rh   residual_s              r7   r=   MoonshineEncoderLayer.forward_  s     !,,];>> 	
')%+) 3	
 	
 !0 !55mD/ 0r9   )r/   r   r   r   r   )NNNFNN)rA   rB   rC   rD   r"   r   r*   r    rE   rF   r   r   r
   r   r   r   r   r=   rG   rH   rI   s   @r7   r   r   N  s    U U3 U  %0A6R 2637+/$)59KO|| !. u//0	
 "% D> !!1!12 &eELL%,,,F&GH +, 
 Sr9   r   c            !       2  ^  \ rS rSrSS\S\\   4U 4S jjjr\" SSSS9          SS	\	R                  S
\\	R                     S\\	R                     S\\	R                     S\\	R                     S\\	R                     S\\   S\\   S\\	R                     S\\\	R                  \	R                  4      S\\\	R                  \	R                  4      S\\   S\\	R"                  \\\	R"                  \	R"                  4      4   4S jj5       rSrU =r$ )MoonshineDecoderLayeri  r+   r   c                   > [         TU ]  5         UR                  U l        [        UUSUR                  UR
                  S9U l        [        UUSUR                  UR
                  S9U l        [        XR                  5      U l
        [        R                  " UR                  SS9U l        [        R                  " UR                  SS9U l        [        R                  " UR                  SS9U l        g )NTr   Fr   )r)   r*   r/   r   decoder_num_attention_headsdecoder_num_key_value_headsr   encoder_attnrK   decoder_hidden_actr   r-   r   r   r   final_layernormr   s      r7   r*   MoonshineDecoderLayer.__init__  s    !--+ & B B & B B
 / & B B & B B
 'v/H/HI!||F,>,>UK(*V5G5Ge(T%!||F,>,>UKr9   r   r   r   r   r:   re   encoder_hidden_statesencoder_attention_maskr   encoder_position_idsr  r   r   encoder_position_embeddingsrh   r;   c                    UnU R                  U5      nU R                  " SUUUUUU	U
S.UD6u  pX-   nUb,  UnU R                  U5      nU R                  UUUUUS9u  pX-   nUnU R	                  U5      nU R                  U5      nX-   nU$ )Nr  )r:   r   re   r   r  r  )r   r   r   r  r  r   )r4   r:   re   r  r  r   r  r   r  r   r   r  rh   r  r  s                  r7   r=   MoonshineDecoderLayer.forward  s      !,,];>> 	
')%+) 3	
 	
 !0 ,$H 99-HM#00+!65 /#  1  M %4M ,,];/ 0r9   )r  r  r/   r   r   r   r   r'   )
NNNNNNFNNN)rA   rB   rC   rD   r"   r   r   r*   r    rE   rF   r   r
   r   r   r   r   FloatTensorr=   rG   rH   rI   s   @r7   r	  r	    s   L L8C= L L0 %0A6R 268<9=37;?+/$)59KOSW.||. !..  (5	.
 !) 6. u//0. 'u'7'78. "%. D>. !!1!12. &eELL%,,,F&GH. &.eELL%,,4N.O%P. +,. 
u  (51B1BEDUDU1U+V"WW	X. S.r9   r	  c                   b    \ rS rSr% \\S'   SrSrSrSS/r	Sr
SrSrS\R                  4S	 jrS
rg)MoonshinePreTrainedModeli  r+   modelinput_valuesTr   r	  input_lengthsc                 ~    [        US-
  S-  S-   5      n[        US-
  S-  S-   5      n[        US-
  S-  S-   5      nU$ )z8
Computes the output length of the convolutional layers
   @   r!      r   rN   )r   )r4   r  output_conv1_lengthoutput_conv2_lengthoutput_conv3_lengths        r7    _get_feat_extract_output_lengths9MoonshinePreTrainedModel._get_feat_extract_output_lengths  sZ     "=3#6""<q"@A!#6#:a"?!"CD!#6#:a"?!"CD""r9   r  N)rA   rB   rC   rD   r"   r   base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_can_compile_fullgraphrE   r   r$  rG   r  r9   r7   r  r    sH    $O&*#02IJN!#e>N>N #r9   r  c            
          ^  \ rS rSrSrSr\\S.rS\	4U 4S jjr
S\R                  4S jrS	\R                  4S
 jr\" 5        SS\R"                  S\\R&                     S\\   S\4S jj5       rSrU =r$ )MoonshineEncoderi  z
Transformer encoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MoonshineEncoderLayer`]

Args:
    config: MoonshineConfig
r  )
attentionsr:   r+   c           	      L  > [         TU ]  U5        Xl        UR                  n[        R
                  " SUSSSS9U l        [        R
                  " USU-  SSS	9U l        [        R
                  " SU-  USSS	9U l        [        R                  " SUS
S9U l
        [        US9U l        [        R                  " [        UR                  5       Vs/ s H  n[!        X5      PM     sn5      U l        [        R$                  " USS9U l        SU l        U R+                  5         g s  snf )Nr!   r  r  F)kernel_sizestrider   rN   r   r   )r1  r2  gh㈵>)
num_groupsnum_channelsepsr+   r   )r)   r*   r+   r/   r-   Conv1dconv1conv2conv3	GroupNorm	groupnormr   
rotary_emb
ModuleListrangeencoder_num_hidden_layersr   r   r   
layer_normgradient_checkpointing	post_init)r4   r+   	embed_dimidxr6   s       r7   r*   MoonshineEncoder.__init__  s     &&	YYq)ReT
YYy!i-QqQ
YYq9}iQqQ
PTU2&Amm;@AaAa;bc;bC"6/;bc
 ,,yu=&+#	 ds   D!r;   c                     U R                   $ r'   r8  r4   s    r7   get_input_embeddings%MoonshineEncoder.get_input_embeddings  s    zzr9   rd   c                     Xl         g r'   rH  r4   rd   s     r7   set_input_embeddings%MoonshineEncoder.set_input_embeddings  s    
r9   re   rh   c                    UR                  S5      n[        R                  R                  U R	                  U5      5      nU R                  U5      n[        R                  R                  U R                  U5      5      n[        R                  R                  U R                  U5      5      nUR                  SSS5      nUb  U R                  UR                  S   5      nSnUSSSU24   SSU24   nU R                  R                  S:X  a  US	:H  R                  5       (       a  UOSnOEU R                  R                  S
:X  a  [        X$R                   5      nO[#        X$R                   5      n[$        R&                  " SUR                  S   UR(                  S9R                  S5      nU R+                  XG5      nU R,                   H  n	U	" U4UUUS.UD6nM     U R/                  U5      n[1        US9$ )a  
Args:
    input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
        Float values of the raw speech waveform. Raw speech waveform can be
        obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
        `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
        the soundfile library (`pip install soundfile`). To prepare the array into
        `input_values`, the [`AutoFeatureExtractor`] should be used for padding
        and conversion into a tensor of type `torch.FloatTensor`.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding indices in `input_values`. Mask values selected in `[0, 1]`:
        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.
        [What are attention masks?](../glossary#attention-mask)
r!   r   rN   NrQ     .flash_attention_2r   sdpar   )re   r   r   )last_hidden_state)r   r-   rq   tanhr8  r<  gelur9  r:  permuter$  rY   r+   r   anyr   rk   r   rE   aranger   r=  r   rA  r   )
r4   r  re   rh   r:   mask_lendownsample_strider   r   encoder_layers
             r7   r=   MoonshineEncoder.forward  s   , $--a0**4::l+CD}5**4::m+DE**4::m+DE%--aA6 %<<^=Q=QRT=UVH *+C1D3D1D,DEc9H9nUN{{//3FF4Bc4I3N3N3P3PVZ11V;!D^UhUh!i!;NL_L_!`||A}':':1'=mFZFZ[eefgh"oomJ![[M)-)$7	
 M ) 6&+
 	
r9   )	r+   r8  r9  r:  rB  r<  rA  r   r=  r'   )rA   rB   rC   rD   r   r'  r   r   _can_record_outputsr"   r*   r-   ModulerJ  rN  r   rE   r  r   rF   r   r   r   r=   rG   rH   rI   s   @r7   r.  r.    s     %O(.
 $bii "))   268
''8
 !.8
 +,	8

 
!8
 8
r9   r.  c                     ^  \ rS rSrSr\" \SSS9\\" \SSS9S.rS\	4U 4S	 jjr
\" 5                SS\\R                     S
\\R                     S\\R                     S\\   S\\R"                     S\\   S\\R                     S\\R"                     S\\R                     S\\   S\\\4   4S jj5       rSrU =r$ )MoonshineDecoderiG  	input_idsr!   r   )index
layer_namer  )r/  r:   cross_attentionsr+   c           	      
  > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [
        R                  " UR                  SS9U l        [!        US9U l        SU l        U R'                  5         g s  snf )NFr   r6  )r)   r*   pad_token_idpadding_idx
vocab_sizer-   	Embeddingr/   embed_tokensr>  r?  decoder_num_hidden_layersr	  r   r   normr   r=  rB  rC  )r4   r+   rE  r6   s      r7   r*   MoonshineDecoder.__init__P  s     !.. ++LL):):F<N<NPTP`P`amm;@AaAa;bc;bC"6/;bc
 LL!3!3%@	2&A&+# 	 ds   D re   r   r   inputs_embedsr  r   r  r  rh   r;   c
                    USL USL-  (       a  [        S5      eUc  U R                  U5      nU(       a1  Uc.  [        [        U R                  S9[        U R                  S95      nUcD  Ub  UR                  5       OSn[        R                  " XUR                  S   -   UR                  S9nUc  UR                  S5      n[        U R                  UUUUUS9nUnU R                  X5      nU	b  UR                  S   nS	nU	S
SSU24   S
SU24   n	U R                  R                  S:X  a  U	S:H  R                  5       (       a  U	OSn	OaU R                  R                  S:X  a$  [        XR                   UR                  S   5      n	O#[#        XR                   UR                  S   5      n	U R$                   H  nU" UUU4U	UUUUUS.U
D6nM     U R'                  U5      n[)        UU(       a  US9$ SS9$ )a\  
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
    Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
    of the decoder.
encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding indices in `encoder_hidden_states`. Mask values selected in `[0, 1]`:
    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.
    [What are attention masks?](../glossary#attention-mask)
Nz:You must specify exactly one of input_ids or inputs_embedsr6  r   r!   rT  )r+   input_embedsre   r   r   r   rj   rQ  .rR  r   rS  )r  r   r   r  r   r   )rU  r   )
ValueErrorrl  r   r   r+   get_seq_lengthrE   rZ  rY   r   r   r   r=  r   rY  r   rk   r   r   rn  r   )r4   rc  re   r   r   rp  r  r   r  r  rh   past_seen_tokensry   r:   r   r[  r\  decoder_layers                     r7   r=   MoonshineDecoder.forward`  s>   0 -t";<YZZ  --i8M01,dkk2RT`hlhshsTtuO!CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L(;;&))+%
 &"oomJ!-,2226H *%;CATCTAT<T%UVY[d\d[dVd%e"{{//3FFDZ^aDaCfCfChCh)?nr&11V;)L*,?,?ATATUWAX*& *D*,?,?ATATUWAX*& "[[M)% (>) /#-$7 M ) 		-08+/8O
 	
>B
 	
r9   )rl  rB  r   rn  ri  r=  rj  )	NNNNNNNNN)rA   rB   rC   rD   r'  r   r   r	  r_  r"   r*   r   r   rE   r   rF   r
   r  r   r   r   r   r   r   r=   rG   rH   rI   s   @r7   rb  rb  G  sI   !O$%7q[Y.*+=QSab    151537+/59$(59=A9=W
E,,-W
 !.W
 u//0	W

 "%W
   1 12W
 D>W
 !!1!12W
  ((9(9:W
 !) 6W
 +,W
 
u--	.W
 W
r9   rb  rY   	mask_probmask_length	min_masksc           	        ^^^^^ U u  nmTS:  a  [        S5      eTT:  a  [        ST ST S35      e[        R                  R                  S5      R	                  5       mUUUUU4S jnUb-  UR                  5       R                  S5      R                  5       O[        U5       Vs/ s H  nTPM     snn[        R                  " UT4[        S	9n	/ n
U" T5      nUS
:X  a  U	$ U H  nU" U5      n[        R                  R                  [        R                  " UTS-
  -
  5      USS9n[        U5      S
:X  a  TS-
  nOUS
   n[        R                  " U[        R                  " X-
  [        R                   S	9U-  /5      nU
R#                  U5        M     [        R$                  " U
5      n
[        R&                  " U
SS2SS2S4   X[T45      n
U
R)                  X[T-  5      n
[        R                  " T5      SSSS24   n[        R&                  " UX[T45      R)                  X[T-  5      nU
U-   n
U
R+                  5       TS-
  :  a  TS-
  XTS-
  :  '   [        R,                  " XSS5        U	$ s  snf )a2  
Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
CPU as part of the preprocessing during training.

Args:
    shape: The shape for which to compute masks. This should be of a tuple of size 2 where
           the first element is the batch size and the second element is the length of the axis to span.
    mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                independently generated mask spans of length `mask_length` is computed by
                `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                actual percentage will be smaller.
    mask_length: size of the mask
    min_masks: minimum number of masked spans
    attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                    each batch dimension.
r!   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                    > [        TU -  T-  T-   5      n[        UT5      nUT-  T:  a  TT-  nU TS-
  -
  U:  a  [        U TS-
  -
  S5      nU$ )z;Given input length, compute how many spans should be maskedr!   r   )r   max)input_lengthnum_masked_spanepsilonry  rx  rz  sequence_lengths     r7   compute_num_masked_span6_compute_mask_indices.<locals>.compute_num_masked_span  so    i,6DwNOoy9 [(?:-<O ;?+o=!,+/"BAFOr9   NrQ   r   r   F)replace)rs  nprandomranditemdetachsumtolistr?  zerosr   choicerZ  lenconcatenateonesint32appendarraybroadcast_tor[   r~  put_along_axis)rY   rx  ry  re   rz  
batch_sizer  r  r  spec_aug_maskspec_aug_mask_idxsmax_num_masked_spanr  r  spec_aug_mask_idxdummy_mask_idxoffsetsr  r  s    `` `            @@r7   _compute_mask_indicesr    s   0 #(JQABB_$]^i]j&&7q:
 	
 iinnQ$$&G $ % 	##B'..0',Z'89'8!o'89  HHj/:$GM1/Ba%1,? II,,IIlkAo67RW - 
  !Q& -q0N.q1NNN(;(MUWU]U] ^ao op
 	!!"34/ &2 "45 1a:&+(V ,33JVa@ab ii$T4]3Goog
'UV^^+5G ,g5 /A"55GVYZGZ!0CCD mB?w :s   (I0c                     ^  \ rS rSrS\4U 4S jjrS rS rS rS r	 SS\
R                  S	\\
R                     4S
 jjr\\          SS\\
R                     S	\\
R                     S\\
R                     S\\
R                     S\\\\
R                           S\\\\\
R                     4      S\\\
R                        S\\\
R                        S\\   S\\
R                     S\\   S\4S jj5       5       rSrU =r$ )MoonshineModeli2  r+   c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r'   )r)   r*   r.  encoderrb  decoderrC  r4   r+   r6   s     r7   r*   MoonshineModel.__init__4  s2     '/'/r9   c                 .    U R                   R                  $ r'   r  rl  rI  s    r7   rJ  #MoonshineModel.get_input_embeddings<  s    ||(((r9   c                 $    XR                   l        g r'   r  rM  s     r7   rN  #MoonshineModel.set_input_embeddings?  s    $)!r9   c                     U R                   $ r'   )r  rI  s    r7   get_encoderMoonshineModel.get_encoderB  s    ||r9   c                 8    U R                   R                  5         g)z
Calling this function will disable the gradient computation for the Moonshine encoder so that its parameters will
not be updated during training.
N)r  _freeze_parametersrI  s    r7   freeze_encoderMoonshineModel.freeze_encoderE  s    
 	'')r9   input_featuresre   c                 2   [        U R                  SS5      (       d  U$ UR                  5       u  p4nU R                  R                  S:  a  U R                  (       a  [        X54U R                  R                  U R                  R                  UU R                  R                  S9n[        R                  " XaR                  [        R                  S9nUSS2S4   R                  SUS5      nSX'   U R                  R                  S:  a  U R                  (       az  [        X44U R                  R                  U R                  R                  U R                  R                  S9n[        R                  " XqR                  [        R                  S9nSX'   U$ )	z
Masks extracted features along time axis and/or along feature axis according to
[SpecAugment](https://huggingface.co/papers/1904.08779).
apply_spec_augmentTr   )rx  ry  re   rz  )r   rk   NrQ   )rx  ry  rz  )r   r+   sizemask_time_probrm   r  mask_time_lengthmask_time_min_masksrE   tensorr   r   rZ   mask_feature_probmask_feature_lengthmask_feature_min_masks)r4   r  re   r  r/   r  mask_time_indicesmask_feature_indicess           r7   _mask_input_features#MoonshineModel._mask_input_featuresL  sN    t{{$8$??!! 4B3F3F3H0
;;%%)dmm 5-++44 KK88-++99! !&->G\G\didndn o 1!T' : A A"kSU V01N-;;((1,#8)++77 KK;;++<<	$  $)<<0DMbMbjojtjt#u 34N0r9   r  decoder_input_idsdecoder_attention_maskencoder_outputsr   decoder_inputs_embedsdecoder_position_idsr  r   rh   r;   c                 >   Uc  U R                   " U4SU0UD6nU R                  " SUUUUR                  UUUU	U
S.	UD6n[        UR                  UR                  UR
                  UR                  UR                  UR                  UR
                  UR                  S9$ )a:  
input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
    Float values of the raw speech waveform. Raw speech waveform can be
    obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
    `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
    the soundfile library (`pip install soundfile`). To prepare the array into
    `input_values`, the [`AutoFeatureExtractor`] should be used for padding
    and conversion into a tensor of type `torch.FloatTensor`.
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
    Indices of positions of each input sequence tokens in the position embeddings.
    Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

Example:

```python
>>> import torch
>>> from transformers import AutoFeatureExtractor, MoonshineModel
>>> from datasets import load_dataset

>>> model = MoonshineModel.from_pretrained("UsefulSensors/moonshine-tiny")
>>> feature_extractor = AutoFeatureExtractor.from_pretrained("UsefulSensors/moonshine-tiny")
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
>>> input_values = inputs.input_values
>>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
>>> last_hidden_state = model(input_values, decoder_input_ids=decoder_input_ids).last_hidden_state
>>> list(last_hidden_state.shape)
[1, 2, 288]
```
re   )	rc  re   r  r  r   rp  r   r  r   )rU  r   decoder_hidden_statesdecoder_attentionsrf  encoder_last_hidden_stater  encoder_attentionsr  )r  r  rU  r   r   r:   r/  rf  )r4   r  re   r  r  r  r   r  r  r  r   rh   decoder_outputss                r7   r=   MoonshineModel.forwardw  s    \ "/3||L/rYg/rkq/rOEI\\ F
'1#1"1"C"C+/-)F
 F
 "-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r9   )r  r  r'   )
NNNNNNNNNN)rA   rB   rC   rD   r"   r*   rJ  rN  r  r  rE   r  r   r   r  r   r   r   r   r   r   r   r   r   r=   rG   rH   rI   s   @r7   r  r  2  s    )** 6:)))) !!1!12)V  59598<=AEIZ^DHBF$(59E
u001E
 !!1!12E
 $E$4$45	E

 !))9)9 :E
 "%e.?.?(@"ABE
 "%(;U5CTCT=U(U"VWE
  (e.?.?(@AE
 'uU-=-='>?E
 D>E
 !!1!12E
 +,E
 
E
  E
r9   r  rc  rh  decoder_start_token_idc                     U R                  U R                  5      nU SS2SS24   R                  5       USS2SS24'   X#SS2S4'   Uc  [        S5      eUR	                  US:H  U5        U$ )z)
Shift input ids one token to the right.
NrQ   r!   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosrY   cloners  masked_fill_)rc  rh  r  shifted_input_idss       r7   shift_tokens_rightr    sz     "++IOO<(CRC0668ae4adLMM""#4#<lKr9   zj
    The Moonshine Model with a language modeling head. Can be used for automatic speech recognition.
    )custom_introc                     ^  \ rS rSrS/rS\4U 4S jjrS rS rS r	S r
S	\R                  4S
 jr\\           SS\\R$                     S\\R&                     S\\R&                     S\\R&                     S\\\\R$                           S\\\\\R$                     4      S\\\R$                        S\\\R&                        S\\   S\\R&                     S\\R&                     S\\   S	\4S jj5       5       rSrU =r$ )!MoonshineForConditionalGenerationi  zproj_out.weightr+   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g )NFr   )
r)   r*   r  r  r-   r.   r/   rj  proj_outrC  r  s     r7   r*   *MoonshineForConditionalGeneration.__init__  sH     #F+
		&"4"4f6G6GeT 	r9   c                 6    U R                   R                  5       $ r'   )r  r  rI  s    r7   r  -MoonshineForConditionalGeneration.get_encoder      zz%%''r9   c                 6    U R                   R                  5       $ r'   )r  get_decoderrI  s    r7   r  -MoonshineForConditionalGeneration.get_decoder  r  r9   c                     U R                   $ r'   r  rI  s    r7   get_output_embeddings7MoonshineForConditionalGeneration.get_output_embeddings  s    }}r9   c                     Xl         g r'   r  )r4   new_embeddingss     r7   set_output_embeddings7MoonshineForConditionalGeneration.set_output_embeddings  s    &r9   r;   c                 6    U R                   R                  5       $ r'   )r  rJ  rI  s    r7   rJ  6MoonshineForConditionalGeneration.get_input_embeddings  s    zz..00r9   r  re   r  r  r  r   r  r  r  r   labelsrh   c                    Ub:  Uc7  Uc4  [        XR                  R                  U R                  R                  5      nU R                  " U4UUUUUUUU	U
S.	UD6nU R                  UR                  5      nSnUb$  U R                  XU R                  R                  S9n[        UUUR                  UR                  UR                  UR                  UR                  UR                  UR                   S9	$ )ah  
input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
    Float values of the raw speech waveform. Raw speech waveform can be
    obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
    `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
    the soundfile library (`pip install soundfile`). To prepare the array into
    `input_values`, the [`AutoFeatureExtractor`] should be used for padding
    and conversion into a tensor of type `torch.FloatTensor`.
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
    Indices of positions of each input sequence tokens in the position embeddings.
    Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

Example:

```python
>>> import torch
>>> from transformers import AutoProcessor, MoonshineForConditionalGeneration
>>> from datasets import load_dataset

>>> processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
>>> model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny")

>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

>>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
>>> input_values = inputs.input_values

>>> generated_ids = model.generate(input_values, max_new_tokens=100)

>>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
>>> transcription
'Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
```N)	re   r  r  r  r   r  r  r  r   )logitsr  rj  )	lossr  r   r  r  rf  r  r  r  )r  r+   rh  r  r  r  rU  loss_functionrj  r   r   r  r  rf  r  r  r  )r4   r  re   r  r  r  r   r  r  r  r   r  rh   outputsr  r  s                   r7   r=   )MoonshineForConditionalGeneration.forward  s
   f  (-B-J$6KK44dkk6X6X%! '+jj'
)/+#9+"7!5)'
 '
 w889%%Vt{{OeOe%fD#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r9   )r  r  )NNNNNNNNNNN)rA   rB   rC   rD   _tied_weights_keysr"   r*   r  r  r  r  r-   r`  rJ  r   r   r   rE   r  r   r   r   r   r   r   r   r   r=   rG   rH   rI   s   @r7   r  r    s    ,, (('1bii 1  59598<=AEIZ^DHBF$(59-1T
u001T
 !!1!12T
 $E$4$45	T

 !))9)9 :T
 "%e.?.?(@"ABT
 "%(;U5CTCT=U(U"VWT
  (e.?.?(@AT
 'uU-=-='>?T
 D>T
 !!1!12T
 ))*T
 +,T
 
T
  T
r9   r  )r  r  r  )r   )Nr!   )Nr   )Ltypingr   r   r   numpyr  rE   torch.nnr-   transformers.utils.genericr   r   activationsr	   cache_utilsr
   r   r   
generationr   masking_utilsr   modeling_attn_mask_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.deprecationr    configuration_moonshiner"   r`  r$   rK   rF   r   r`   r   r{   r   r   r   r   r   r	  r  r.  rb  r   r   ndarrayr  r  r  r  __all__r  r9   r7   <module>r     s}  * - ,    I ! C C ) / g B 9  L F & I I 0 4")) "))  	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % % '(%46'T~) ~)B!<ryy !<H16 1hH6 HV # # #._
/ _
D p
/ p
 p
n 26tc?tt t U--.	t
 t ZZtn K
- K
 K
\%,, c [^   
p
(@/ p

p
f ^r9   