
    cCiK                     6   S SK JrJrJr  S SKrS SKJr  SSKJr  SSKJ	r	J
r
Jr  SSKJr  SSKJr  SS	KJrJr  SS
KJr  SSKJr  SSKJrJrJrJr  SSKJrJr  SSKJ r J!r!  SSK"J#r#  SSK$J%r%J&r&J'r'J(r(J)r)J*r*  SSK+J,r,  SSK-J.r.J/r/J0r0  SSK1J2r2  \(" 5       (       a  SSK3J4r4  \*Rj                  " \65      r7\& " S S\!5      5       r8 " S S\Rr                  5      r: " S S\Rr                  5      r;\" S5       " S S\Rr                  5      5       r< " S S \Rr                  5      r=S! r>SDS" jr?S#\R                  S$\AS%\R                  4S& jrB SES'\Rr                  S(\R                  S)\R                  S*\R                  S+\\R                     S,\CS-\CS.\#\%   4S/ jjrD " S0 S1\Rr                  5      rE " S2 S3\Rr                  5      rF " S4 S5\5      rG " S6 S7\85      rH " S8 S9\5      rI " S: S;\85      rJ\&" S<S=9 " S> S?\85      5       rK\&" S@S=9 " SA SB\8\25      5       rL/ SCQrMg)F    )CallableOptionalUnionN)nn   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)use_kernel_forward_from_hub)create_causal_mask)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torch_flex_attn_availableis_torchdynamo_compilinglogging)deprecate_kwarg   )	DiaConfigDiaDecoderConfigDiaEncoderConfig)DiaGenerationMixin)make_flex_block_causal_maskc                   D    \ rS rSr% \\S'   SrSrSrSr	Sr
SrSrSS/rSrg	)
DiaPreTrainedModel?   configmodelT	input_idsDiaEncoderLayerDiaDecoderLayer N)__name__
__module____qualname____firstlineno__r#   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraphmain_input_name_no_split_modules__static_attributes__r0       ^/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/dia/modeling_dia.pyr)   r)   ?   s<    &*#N!!O*,=>r?   r)   c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	DiaMultiChannelEmbeddingL   a  In order to efficiently compute the audio embedding from the 9 different channels,
we vectorize the embedding process by using a single embedding layer and an offset.
Example:
- num_embeds = 4
- vocab_size = 8
- num_channels = 3
We would have offsets = [0, 8, 16]
If audio_codes = [0, 1, 2, 3], [1, 3, 4, 7], [5, 6, 7, 8],
then tokens = audio_codes + offsets
            = [0, 1, 2, 3, 9, 11, 12, 15, 21, 22, 23, 24]
This allows us to use a single embedding layer for all channels.
r+   c                 v  > [         TU ]  5         [        R                  " UR                  UR
                  -  UR                  5      U l        UR                  U l        UR
                  U l        [        R                  " UR
                  [        R                  S9UR                  -  nU R                  SUSS9  g )NdtypeoffsetsF
persistent)super__init__r   	Embedding
vocab_sizenum_channelshidden_sizeembedtorcharangelongregister_buffer)selfr+   rG   	__class__s      r@   rK   !DiaMultiChannelEmbedding.__init__Z   s    \\&"3"3f6I6I"I6K]K]^
!--"//,,v22%**EHYHYYYEBr?   audio_codesreturnc                    XR                   R                  UR                  5      -   R                  S5      nU R	                  U5      R                  UR                  S   UR                  S   SU R                  5      nUR                  SS9$ )Nr"   r      dim)	rG   todevicesqueezerP   viewshaperO   sum)rU   rX   tokensembedss       r@   forward DiaMultiChannelEmbedding.forwardb   ss    0B0B CCLLQOF#((a+:K:KA:NPRTXTdTdezzaz  r?   )rP   rO   rN   )r1   r2   r3   r4   __doc__r$   rK   rQ   Tensorrg   r>   __classcell__rV   s   @r@   rB   rB   L   s7    C/ C!5<< !ELL ! !r?   rB   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )DiaMLPh   c                    > [         TU ]  5         Xl        [        R                  " UR
                  SUR                  -  SS9U l        [        R                  " UR                  UR
                  SS9U l        [        UR                     U l        g )Nr\   Fbias)rJ   rK   r+   r   LinearrO   intermediate_sizegate_up_proj	down_projr   
hidden_actactivation_fnrU   r+   rV   s     r@   rK   DiaMLP.__init__i   sn    IIf&8&8!f>V>V:V]bc6#;#;V=O=OV[\#F$5$56r?   hidden_statesrY   c                     U R                  U5      nUR                  SSS9u  p2X R                  U5      -  nU R                  U5      $ )Nr\   r[   r]   )ru   chunkrx   rv   )rU   r{   	up_statesgates       r@   rg   DiaMLP.forwardq   sH    %%m4	#//!/4 2 24 88	~~i((r?   )rx   r+   rv   ru   )
r1   r2   r3   r4   rK   rQ   FloatTensorrg   r>   rk   rl   s   @r@   rn   rn   h   s,    7)U%6%6 )5;L;L ) )r?   rn   RMSNormc                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )
DiaRMSNormz   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z)
DiaRMSNorm is equivalent to T5LayerNorm
N)rJ   rK   r   	ParameterrQ   onesweightvariance_epsilon)rU   rO   epsrV   s      r@   rK   DiaRMSNorm.__init__|   s/     	ll5::k#:; #r?   c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )Nr\   r[   T)keepdim)	rF   r_   rQ   float32powmeanrsqrtr   r   )rU   r{   input_dtypevariances       r@   rg   DiaRMSNorm.forward   sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r?   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler   rc   r   rU   s    r@   
extra_reprDiaRMSNorm.extra_repr   s*    ))*+6$2G2G1HIIr?   )r   r   )gư>)	r1   r2   r3   r4   rK   rg   r   r>   rk   rl   s   @r@   r   r   z   s    $;J Jr?   r   c                      ^  \ rS rSr% \R
                  \S'   SS\4U 4S jjjr\R                  " 5       \
S 5       5       rSrU =r$ )DiaRotaryEmbedding   inv_freqr+   c                   > [         TU ]  5         [        US5      (       aZ  [        UR                  [
        5      (       a;  UR                  R                  SUR                  R                  S5      5      U l        OSU l        UR                  U l	        UR                  U l
        Xl        [        U R                     U l        U R                  U R                  U5      u  o0l        U R                  SUSS9  U R                   U l        g )Nrope_scaling	rope_typetypedefaultr   FrH   )rJ   rK   hasattr
isinstancer   dictgetr   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr+   r   rope_init_fnattention_scalingrT   r   original_inv_freq)rU   r+   r`   r   rV   s       r@   rK   DiaRotaryEmbedding.__init__   s    6>**z&:M:Mt/T/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q((ZeD!%r?   c                 b   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        R                  " USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   r[   r"   mpscpuF)device_typeenabledr\   r]   rE   )r   floatexpandrc   r_   r`   r   r   strrQ   autocast	transposecatcosr   sinrF   )
rU   xposition_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r@   rg   DiaRotaryEmbedding.forward   sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfk^^UC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   $BF  
F.)r   r+   r   r   r   r   r   N)r1   r2   r3   r4   rQ   rj   r5   r#   rK   no_gradr   rg   r>   rk   rl   s   @r@   r   r      s@    ll/y / /" ]]_<  <r?   r   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr[   r\   r]   )rc   rQ   r   )r   x1x2s      r@   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r?   c                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXg4$ )a  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezer   )qkr   r   r   unsqueeze_dimq_embedk_embeds           r@   apply_rotary_pos_embr      sS    ( --
&C
--
&Cw;q>C/0Gw;q>C/0Gr?   r{   n_reprY   c                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r"   N)rc   r   reshape)r{   r   batchnum_key_value_headsslenhead_dims         r@   	repeat_kvr      s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr?   modulequerykeyvalueattention_maskscalingdropoutkwargsc                 @   [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub"  US S 2S S 2S S 2S UR
                  S   24   nX-   n
[        R                  R                  U
S[        R                  S9R                  UR                  5      n
[        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr\   r   r[   )r^   rF   )ptrainingr"   )r   num_key_value_groupsrQ   matmulr   rc   r   
functionalsoftmaxr   r_   rF   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r@   eager_attention_forwardr      s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#1==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r?   c                   F  ^  \ rS rSrSrSS\\\4   S\S\	4U 4S jjjr
\" SSS	S
9  SS\R                  S\\R                  \R                  4   S\\R                     S\\   S\\R"                     S\\   S\\R                  \R                  4   4S jj5       rSrU =r$ )DiaSelfAttention   =Multi-headed attention from 'Attention Is All You Need' paperr+   	layer_idx	is_causalc                   > [         TU ]  5         Xl        X l        UR                  U l        U R                  R
                  U l        U R                  R                  =(       d    U R                  U l        U R                  U R                  -  U l        [        USUR                  U R                  -  5      U l
        SU l        SU l        X0l        [        R                  " U R                  U R                  U R                  -  SS9U l        [        R                  " U R                  U R                  U R                  -  SS9U l        [        R                  " U R                  U R                  U R                  -  SS9U l        [        R                  " U R                  U R                  -  U R                  SS9U l        g )Nr   r"           Frq   )rJ   rK   r+   r   rO   num_attention_heads	num_headsr   r   getattrr   r   attention_dropoutr   r   rs   q_projk_projv_projo_proj)rU   r+   r   r   rV   s       r@   rK   DiaSelfAttention.__init__   s@   "!--88#';;#B#B#Tdnn $(NNd6N6N$N!
F4F4F$..4XY!$"ii 0 0$..4==2PW\]ii 0 0$2J2JT]]2Zafgii 0 0$2J2JT]]2Zafgii >@P@PW\]r?   past_key_valuepast_key_valuesz4.58)new_nameversionr{   position_embeddingsr   cache_positionr   rY   c                 4   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      nUu  p[        XX5      u  pUb$  XUS.nUR                  XU R                  U5      u  p[        nU R                  R                  S:w  a  [        U R                  R                     nU" U U	U
UU4U R                  (       d  SOU R                  U R                   S.UD6u  nnUR"                  " / UQSP76 R%                  5       nU R'                  U5      nUU4$ )Nr[   r"   r\   )r   r   r  eagerr   )r   r   )rc   r   r   rb   r   r   r   r   updater   r   r+   _attn_implementationr   r   r   r   r   r   r   )rU   r{   r  r   r  r  r   input_shapehidden_shapequery_statesr   r   r   r   cache_kwargsattention_interfacer   r   s                     r@   rg   DiaSelfAttention.forward  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&#7RU#[ &#&nUL'6'='=jX\XfXfht'u$J(?;;++w6"9$++:Z:Z"[$7	%
  $}}C$2H2HLL	%
 	%
!\ "));;;;FFHkk+.L((r?   )r   r+   r   rO   r   r   r   r   r   r   r   r   r   r   )FNN)r1   r2   r3   r4   ri   r   r%   r$   intboolrK   r!   rQ   rj   r   r   r	   
LongTensorr   r   rg   r>   rk   rl   s   @r@   r   r      s    G^u%57G%GH ^UX ^ei ^ ^$ %0A6R ,059))||)) #5<<#=>)) !.	))
 "%)) !!1!12)) +,)) 
u||U\\)	*)) S))r?   r   c                      ^  \ rS rSrSrS\S\4U 4S jjr  SS\R                  S\R                  S\
\R                     S	\
\   S
\\   S\\R                  \
\R                     4   4S jjrSrU =r$ )DiaCrossAttentioni=  r   r+   r   c                 R  > [         TU ]  5         Xl        X l        UR                  U l        UR
                  U l        U R                  R                  U l        U R                  R                  U l	        U R                  U R                  -  U l
        UR                  U l        SU l        SU l        SU l        [         R"                  " U R                  U R                  U R                  -  SS9U l        [         R"                  " U R
                  U R                  U R                  -  SS9U l        [         R"                  " U R
                  U R                  U R                  -  SS9U l        [         R"                  " U R                  U R                  -  U R                  SS9U l        g )Nr"   r   Frq   )rJ   rK   r+   r   rO   cross_hidden_sizecross_num_attention_headsr   cross_num_key_value_headsr   r   cross_head_dimr   r   r   r   r   rs   r   r   r   r   rU   r+   r   rV   s      r@   rK   DiaCrossAttention.__init__@  s;   "!--!'!9!9>>#';;#H#H $(NNd6N6N$N!--!$ii 0 0$..4==2PW\]ii 6 68P8PSWS`S`8`glmii 6 68P8PSWS`S`8`glmii >@P@PW\]r?   r{   cross_attention_statesr   r  r   rY   c                 n   UR                   S S n/ UQSPU R                  P7n/ UR                   S S QSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	Ub%  UR
                  R                  U R                  5      OSn
Ubb  U
(       a[  UR                  R                  U R                     R                  nUR                  R                  U R                     R                  nOU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nUbB  UR                  R                  UUU R                  5      u  pSUR
                  U R                  '   [        nU R                   R"                  S:w  a  [$        U R                   R"                     nU" U U	UUU4SU R&                  0UD6u  pUR)                  / UQSP75      R+                  5       nU R-                  U5      nX4$ )Nr[   r"   r\   FTr  r   )rc   r   r   rb   r   
is_updatedr   r   cross_attention_cachelayerskeysvaluesr   r   r	  r   r+   r
  r   r   r   r   r   )rU   r{   r  r   r  r   r  r  cross_shaper  r   r   r   r  r   r   s                   r@   rg   DiaCrossAttention.forwardS  s    $))#2.88b8$--8M.44Sb9M2Mt}}M{{=166|DNNqRSTGVGb_//33DNNChm
&:(>>EEdnnUZZJ*@@GGW^^L%;<AA+NXXYZ\]^J;;'=>CCKPZZ[\^_`L*+:+P+P+W+W NN,(
 >B**4>>:(?;;++w6"9$++:Z:Z"[$7%
 LL%
 %
! "))*<K*<*<=HHJkk+.((r?   )r   r+   r  r   rO   r   r   r   r   r   r   r   r   r   r   r  )r1   r2   r3   r4   ri   r$   r  rK   rQ   rj   r   r   r   r   r   rg   r>   rk   rl   s   @r@   r  r  =  s    G^/ ^C ^. 269=1)||1) !&1) !.	1)
 ""561) -.1) 
u||Xell33	41) 1)r?   r  c                      ^  \ rS rSrS\S\4U 4S jjr  SS\R                  S\	\
\R                  \R                  4      S\	\R                     S\\   S	\
\R                  \	\R                     4   4
S
 jjrSrU =r$ )r.   i  r+   r   c                    > [         TU ]  5         [        UR                  UR                  S9U l        [        XSS9U l        [        UR                  UR                  S9U l        [        U5      U l
        g )Nr   Fr   )rJ   rK   r   rO   norm_epspre_sa_normr   self_attentionpost_sa_normrn   mlpr  s      r@   rK   DiaEncoderLayer.__init__  sZ    %f&8&8fooN.vER&v'9'9vO&>r?   r{   r  r   r   rY   c                     UnU R                  U5      nU R                  " U4UUS.UD6u  pxXW-   nUnU R                  U5      nU R                  U5      n	XY-   nX4$ )Nr  r   )r,  r-  r.  r/  )
rU   r{   r  r   r   residualnormed_statesself_attn_outputself_attn_weightsmlp_outs
             r@   rg   DiaEncoderLayer.forward  s     !((7.2.A.A/
 3)/
 	/
+ !3 ))-8((=) *//r?   )r/  r.  r,  r-  r  )r1   r2   r3   r4   r%   r  rK   rQ   rj   r   r   r   r   rg   r>   rk   rl   s   @r@   r.   r.     s    "/ "C " LP15	0||0 &eELL%,,,F&GH0 !.	0
 -.0 
u||Xell33	40 0r?   r.   c                      ^  \ rS rSrS\4U 4S jjr\\   SS\R                  S\
\R                     S\
\   S\
\   S	\\   S
\\\4   4S jj5       5       rS\\R                  S4   S\R                  4S jrSrU =r$ )
DiaEncoderi  r+   c           	        > [         TU ]  U5        Xl        [        R                  " UR
                  UR                  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                  S9U l        [!        U5      U l        g s  snf Nr)  )rJ   rK   r+   r   rL   rM   rO   	embedding
ModuleListrangenum_hidden_layersr.   r"  r   r+  normr   rotary_embeddingsr  s      r@   rK   DiaEncoder.__init__  s     f&7&79K9KLmmAFvG_G_A`aA`I_V/A`a
 v11vG	!3F!; bs   .CNr-   r   output_attentionsoutput_hidden_statesr   rY   c                    U R                  U5      n[        R                  " UR                  S   UR                  S9S S S 24   nU R                  Xg5      nU R                  UU5      nU(       a  SOS n	U(       a  SOS n
U R                   H1  nU(       a  X4-   n	U" U4UUS.UD6nUS   nU(       d  M)  XS   4-   n
M3     U R                  U5      nU(       a  X4-  n	[        XiU
S9$ )Nr[   r`   r0   r2  r   r"   last_hidden_stater{   
attentions)
r=  rQ   rR   rc   r`   rB  _update_full_maskr"  rA  r   )rU   r-   r   rD  rE  r   r{   r   r  encoder_statesall_attentionsencoder_layerlayer_outputss                r@   rg   DiaEncoder.forward  s    y1
 ||IOOB$7	@P@PQRVXYRYZ"44]Q//

  40d![[M#!/2B!B)$7- 	M *!,M  !/3C2E!E ) 		-0..N+Vd
 	
r?   inputs_embedsc                 r   Ub  U R                   R                  S:X  a  SU;   a  UnU$ S nU$ U R                   R                  S:X  a  [        XR                  5      nU$ U R                   R                  S:X  a+  [	        U[
        R                  5      (       a
  [        USS9nU$ [        XR                  5      nU$ )Nflash_attention_2r   sdpaflex_attentionFr*  	r+   r
  r   rF   r   rQ   rj   r'   r   )rU   r   rQ  s      r@   rK  DiaEncoder._update_full_mask  s    
 %{{//3FF343F  MQ  11V; "E^UhUh!i  115EEnell;;%@[`%aN
  "<NL_L_!`r?   )r+   r=  r"  rA  rB  )NFF)r1   r2   r3   r4   r%   rK   r   r   rQ   rj   r   r  r   r   r   r   r   rg   rK  r>   rk   rl   s   @r@   r:  r:    s    	</ 	<  26,1/4.
<<.
 !..
 $D>	.

 'tn.
 -..
 
%	&.
  .
bellD01 || r?   r:  c                   |  ^  \ rS rSrS\S\4U 4S jjr      SS\R                  S\	\
\R                  \R                  4      S\	\R                     S\	\R                     S	\	\R                     S
\	\   S\	\R                     S\
\R                  \	\R                     \	\R                     4   4S jjrSrU =r$ )r/   i  r+   r   c                 t  > [         TU ]  5         UR                  U l        [	        XSS9U l        [        X5      U l        [        UR                  UR                  S9U l
        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        [        U5      U l        g )NTr*  r)  )rJ   rK   rO   	embed_dimr   r-  r  cross_attentionr   r+  r,  pre_ca_normpre_mlp_normrn   r/  r  s      r@   rK   DiaDecoderLayer.__init__  s    ++.vDQ0C%f&8&8fooN%f&8&8fooN&v'9'9vO&>r?   r{   r  r   encoder_hidden_statesencoder_attention_maskr  r  rY   c                 d   Un	[        U	[        5      (       a  U	R                  n	Un
U R                  U5      nU R                  " UUUU	4SU0UD6u  pX-   nUn
U R                  U5      nU R                  " UU4UUS.UD6u  pX-   nUn
U R                  U5      nU R                  U5      nU
U-   nXU4$ )Nr  )r   r  )	r   r   self_attention_cacher,  r-  r\  r[  r]  r/  )rU   r{   r  r   r_  r`  r  r  r   self_attn_cacher3  r4  r5  r6  cross_statescross_attn_weightsr7  s                    r@   rg   DiaDecoderLayer.forward	  s     *o':;;-BBO ((7.2.A.A 	/
 *	/
 	/
+ !3 ((7+/+?+?!,
 2+	,

 ,
( !/ ))-8((=) 7*1CCCr?   )r[  rZ  r/  r\  r]  r,  r-  )NNNNNN)r1   r2   r3   r4   r$   r  rK   rQ   rj   r   r   r   r  rg   r>   rk   rl   s   @r@   r/   r/     s    "/ "C " LP158<9=9=59-D||-D &eELL%,,,F&GH-D !.	-D
  (5-D !) 6-D ""56-D !!1!12-D 
u||Xell3Xell5KK	L-D -Dr?   r/   c                     ^  \ rS rSrSrS\4U 4S jjr\\        SS\	R                  S\\	R                     S\\	R                     S	\\	R                     S
\\	R                     S\\   S\\   S\\   S\\	R                     S\\\4   4S jj5       5       rS	\\	R                  S4   S
\\	R                  S4   S\	R(                  S\	R                  4S jrSrU =r$ )
DiaDecoderi9  z-Transformer Decoder Stack using DenseGeneral.r+   c           	        > [         TU ]  U5        UR                  U l        UR                  U l        [	        U5      U l        [        U5      U l        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                   S9U l        g s  snf r<  )rJ   rK   rN   rM   rB   
embeddingsr   rB  r   r>  r?  r@  r/   r"  r   rO   r+  rA  r  s      r@   rK   DiaDecoder.__init__<  s     "// ++26:!3F!;mmAFvG_G_A`aA`I_V/A`a
 v11vG	 bs   :B?Nr-   r   r   r_  r`  r  rD  rE  r  rY   c
           	      "   UR                  5       SS u  pUb  UR                  5       OSnU	c"  [        R                  " XU-   UR                  S9n	Uc	  U	SSS24   nU R                  U5      nU R                  X5      nUc3  [        5       (       d$  X-   n[        R                  " UUUR                  S9n[        U R                  UUU	UUS9nU R                  UUUR                  SS U5      nU(       a  SOSnU(       a  SOSnU(       a  Ub  SOSnU R                   HE  nU(       a  UU4-  nU" UUUU4UUU	S.U
D6nUS   nU(       d  M.  UUS	   4-   nUc  M<  UUS   4-   nMG     U R                  U5      nU(       a  UU4-  n[        UUUUUS
9$ )z
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks)`):
    The original `decoder_input_ids` in 3D shape to facilitate more efficient computations.

    [What are input IDs?](../glossary#input-ids)
Nr[   r   rG  )r+   input_embedsr   r  r  r   r\   r0   )r`  r  r  r"   )rI  r  r{   rJ  cross_attentions)sizeget_seq_lengthrQ   rR   r`   rj  rB  r   r   r   r+   _update_cross_attn_maskrc   r"  rA  r   )rU   r-   r   r   r_  r`  r  rD  rE  r  r   
batch_size
seq_lengthpast_key_values_lengthr{   r  mask_seq_lengthall_hidden_statesall_self_attnsall_cross_attentionslayerrO  s                         r@   rg   DiaDecoder.forwardG  s   , "+!1#2!6
ETE`!?!?!Afg!"\\&(KT]TdTdN )$'2L 	2"44]Q!*B*D*D4AO"ZZ
OIL\L\]N+;;&))+%
 "&!=!=!"#	"
 #7BD0d&7<Q<]rdh[[E#!m%55!!#%		
 (> /-	 	M *!,M  !/=3C2E!E(4+?=QRCSBU+U() !, 		-0-!118+++%1
 	
r?   r  rQ  c                    Ub  Ub  U R                   R                  S:X  a  SU;   a  UnU$ S nU$ U R                   R                  S:X  a  [        UUR                  US   S9nU$ U R                   R                  S:X  a/  [	        U[
        R                  5      (       a  [        UUS   SS9nU$ [        X$R                  US   S9nU$ )	NrS  r   rT  r[   )tgt_lenrU  F)query_lengthr   rV  )rU   r_  r`  r  rQ  s        r@   rq  "DiaDecoder._update_cross_attn_mask  s     !,1G1S{{//3FFCDH^C^)?&. &%/ ei&. &%- 11V; *M*!'''O*&$ &% 115EE4ellCC-H.%0_"'.* &%	 *D*,?,?UW*& &%r?   )rj  r"  rA  rN   rB  rM   )NNNNNFFN)r1   r2   r3   r4   ri   r$   rK   r   r   rQ   rj   r   r  r   r   r  r   r   r   rg   Sizerq  r>   rk   rl   s   @r@   rh  rh  9  s`   7	H/ 	H  4815=A=A9=,1/459Z
<<Z
 u//0Z
 !.	Z

  ((9(9:Z
 !))9)9 :Z
 ""56Z
 $D>Z
 'tnZ
 !!1!12Z
 
8%?	@Z
  Z
z!&$U\\4%78!& !&ellD&8 9!& ZZ	!&
 ||!& !&r?   rh  z[
    The bare Dia model outputting raw hidden-states without any specific head on top.
    )custom_introc                   l  ^  \ rS rSrS\4U 4S jjrS r\\           SS\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\\\4      S\	\   S\	\   S\	\   S\	\   S\	\
R                     S\\\4   4S jj5       5       rSrU =r$ )DiaModeli  r+   c                    > [         TU ]  U5        Xl        [        UR                  5      U l        [        UR                  5      U l        U R                  5         g r   )
rJ   rK   r+   r:  encoder_configencoderrh  decoder_configdecoder	post_initry   s     r@   rK   DiaModel.__init__  sC     !&"7"78!&"7"78r?   c                     U R                   $ r   )r  r   s    r@   get_encoderDiaModel.get_encoder  s    ||r?   r-   r   decoder_input_idsdecoder_position_idsdecoder_attention_maskencoder_outputsr  	use_cacherD  rE  r  rY   c                    Uc  Uc  [        S5      eU	b  U	OU R                  R                  n	U
b  U
OU R                  R                  n
Ub  UOU R                  R                  nU R
                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnU(       a1  Uc.  [        [        U R                  S9[        U R                  S95      nUc  U R                  " SUUU	U
S.UD6nOK[        U[        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S	:  a  US	   OSS
9nUS   R                  S   SU R                  R                   R"                  pnUc7  [$        R&                  " USU4U R                  R(                  U R*                  S9nUR,                  S	:X  a"  UR/                  XU5      R1                  SS	5      nU R2                  " SUUUUS   UUU	U
UUS.
UD6n[5        UR6                  UR8                  UR:                  UR<                  UR>                  US   UR:                  UR<                  S9$ )a  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
    1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
    the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
    tened audio logits which are used to calculate the loss.

    2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
    Dia to calculate embeddings and subsequent steps more efficiently.

    If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
    `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
    [`DiaProcessor.__call__`] for more details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
    Indices of positions of each input sequence tokens in the position embeddings.
    Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.

    [What are position IDs?](../glossary#position-ids)
NzXYou should either provide text ids or the cached text encodings. Neither has been found.zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r+   )r-   r   rD  rE  r   r"   r\   rH  r[   )ro  
fill_valuer`   )
r-   r   r   r_  r`  r  rD  rE  r  r  )rI  r  decoder_hidden_statesdecoder_attentionsrn  encoder_last_hidden_stater_  encoder_attentionsr0   ) 
ValueErrorr+   rD  rE  r  is_gradient_checkpointingr   loggerwarning_oncer   r
   r  r   r   lenrc   r  rN   rQ   fullbos_token_idr`   ndimr   r   r  r   rI  r  r{   rJ  rn  )rU   r-   r   r  r  r  r  r  r  rD  rE  r  r   bszseq_lenchannelsdecoder_outputss                    r@   rg   DiaModel.forward  sj   N !8j  2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	))dmm##p "	01,dkk2RT`hlhshsTtuO""ll #-"3%9	
 O O_==-"1!"4474H14Loa0RV14_1E1I?1-tO #2!"4":":1"=r4;;C]C]CjCjh$ %

1h'DKK4L4LUYU`U`! !!Q& 1 9 9# Q [ [\]_` a,, 
'-1"1!"4#1+/!5)
 
 "-??+;;"1"?"?.99,==&5a&8"1"?"?.99	
 		
r?   )r+   r  r  )NNNNNNNNNNN)r1   r2   r3   r4   r#   rK   r  r   r   r   rQ   r  r   r   r   r   r  r   rg   r>   rk   rl   s   @r@   r  r    sE   y   15598<;?=ACG9=$(,0/359k
E,,-k
 !!1!12k
 $E$4$45	k

 'u'7'78k
 !))9)9 :k
 "%(>"?@k
 ""56k
 D>k
 $D>k
 'tnk
 !!1!12k
 
u((	)k
  k
r?   r  zl
    The Dia model consisting of a (byte) text encoder and audio decoder with a prediction head on top.
    c                     ^  \ rS rSrSrS\4U 4S jjrS rS r\	\
            SS\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\\\4      S\\   S\\   S\\   S\\   S\\R                     S\\R                     S\\\4   4S jj5       5       rSrU =r$ )DiaForConditionalGenerationiJ  r,   r+   c                 v  > [         TU ]  U5        Xl        [        U5      U l        UR
                  R                  U l        UR
                  R                  U l        [        R                  " UR
                  R                  U R                  U R                  -  SS9U l        SU l        U R                  5         g )NFrq   ForMaskedLM)rJ   rK   r+   r  r,   r  rN   rM   r   rs   rO   logits_dense	loss_typer  ry   s     r@   rK   $DiaForConditionalGeneration.__init__R  s     f%
"11>> //::II!!--0A0ADOO0S[`
 ' 	r?   c                 6    U R                   R                  5       $ r   )r,   r  r   s    r@   r  'DiaForConditionalGeneration.get_encodera      zz%%''r?   c                 6    U R                   R                  5       $ r   )r,   get_decoderr   s    r@   r  'DiaForConditionalGeneration.get_decoderd  r  r?   r-   r   r  r  r  r  r  r  rD  rE  labelsr  rY   c                 X   U R                   " S	UUUUUUUUU	U
US.UD6nUS   nUR                  S   nU R                  U5      R                  USU R                  U R
                  45      R                  SS5      R                  5       R                  UU R                  -  SU R
                  5      nSnUb  U R                  " S	UXR
                  S.UD6n[        UUUR                  UR                  UR                  UR                  UR                  UR                  UR                   S9	$ )
a   
decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
    1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
    the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
    tened audio logits which are used to calculate the loss.

    2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
    Dia to calculate embeddings and subsequent steps more efficiently.

    If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
    `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
    [`DiaProcessor.__call__`] for more details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
    Indices of positions of each input sequence tokens in the position embeddings.
    Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.

    [What are position IDs?](../glossary#position-ids)
labels (`torch.LongTensor` of shape `(batch_size * num_codebooks,)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in
    `[0, ..., config.decoder_config.vocab_size - 1]` or -100. Tokens with indices set to `-100`
    are ignored (masked).
)r-   r   r  r  r  r  r  r  rD  rE  r  r   r[   r"   r\   N)logitsr  rM   )	lossr  r  r  r  rn  r  r_  r  r0   )r,   rc   r  rb   rN   rM   r   r   loss_functionr   r  r  r  rn  r  r_  r  )rU   r-   r   r  r  r  r  r  r  rD  rE  r  r  r   outputsrI  rr  audio_logitsr  s                      r@   rg   #DiaForConditionalGeneration.forwardg  sA   X ** 
)/!5#9++/!5)
 
 $AJ&,,Q/
 /0T:r4#4#4dooFGYq!_Z\T*t000"dooF 	 %%o\&UdUdohnoD#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r?   )r+   r  r  r,   rN   rM   )NNNNNNNNNNNN)r1   r2   r3   r4   r6   r#   rK   r  r  r   r   r   rQ   r  r   r   r   r   r  r   rg   r>   rk   rl   s   @r@   r  r  J  sj     y ((  15598<;?=ACG9=$(,0/3-159R
E,,-R
 !!1!12R
 $E$4$45	R

 'u'7'78R
 !))9)9 :R
 "%(>"?@R
 ""56R
 D>R
 $D>R
 'tnR
 ))*R
 !!1!12R
 
uo%	&R
  R
r?   r  )r  r)   r  )Nr"   )r   )Ntypingr   r   r   rQ   r   activationsr   cache_utilsr	   r
   r   integrationsr   masking_utilsr   modeling_attn_mask_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r    utils.deprecationr!   configuration_diar#   r$   r%   generation_diar&   integrations.flex_attentionr'   
get_loggerr1   r  r)   ModulerB   rn   r   r   r   r   rj   r  r   r   r   r   r  r.   r:  r/   rh  r  r  __all__r0   r?   r@   <module>r     sR  , - ,   ! C C 7 / g B 9  L F &  1 L L .  !!J 
		H	% 	? 	? 	?!ryy !8)RYY )$ Y'J J (J(!< !<H(6	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % % '(%4?)ryy ?)DG)		 G)T00 0BS# Sl8D0 8DvN&# N&b 
x
! x

x
v 
l
"46H l

l
^ Lr?   