
    +hk                        S SK r S SKrS SKJrJrJrJrJrJr  S SK	r	S SK
Jr  S SKJs  Jr  SSKJrJr  SSKJrJr  SSKJrJrJrJr  SSKJr  SS	KJr  SS
KJr  SSK J!r!  SSK"J#r#  SSK$J%r%J&r&  SSK'J(r(  SSK)J*r*  SSK+J,r,J-r-  \R\                  " \/5      r0    S(S\	Rb                  S\2S\3S\4S\4S\2S\	Rb                  4S jjr5  S)S\	Rb                  S\\	Rb                  \\	Rb                     4   S\3S\2S\\	Rb                  \	Rb                  4   4
S jjr6 " S S\Rn                  5      r8 " S  S!\Rn                  5      r9 " S" S#5      r:\ " S$ S%\Rn                  5      5       r; " S& S'\*\\\\#5      r<g)*    N)AnyDictListOptionalTupleUnion   )ConfigMixinregister_to_config)FromOriginalModelMixinPeftAdapterMixin)USE_PEFT_BACKENDloggingscale_lora_layersunscale_lora_layers)maybe_allow_in_graph   )FeedForward)dispatch_attention_fn)	Attention)
CacheMixin)TimestepEmbedding	Timesteps)Transformer2DModelOutput)
ModelMixin)AdaLayerNormContinuousRMSNorm	timestepsembedding_dimflip_sin_to_cosdownscale_freq_shiftscale
max_periodreturnc                    [        U R                  5      S:X  d   S5       eUS-  n[        R                  " U5      * [        R
                  " SU[        R                  U R                  S9-  nXvU-
  -  n[        R                  " U5      R                  U R                  5      nU SS2S4   R                  5       USSS24   -  nXH-  n[        R                  " [        R                  " U5      [        R                  " U5      /SS9nU(       a)  [        R                  " USS2US24   USS2SU24   /SS9nUS-  S:X  a*  [        R                  R                   R#                  US	5      nU$ )
a  
This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.

Args
    timesteps (torch.Tensor):
        a 1-D Tensor of N indices, one per batch element. These may be fractional.
    embedding_dim (int):
        the dimension of the output.
    flip_sin_to_cos (bool):
        Whether the embedding order should be `cos, sin` (if True) or `sin, cos` (if False)
    downscale_freq_shift (float):
        Controls the delta between frequencies between dimensions
    scale (float):
        Scaling factor applied to the embeddings.
    max_period (int):
        Controls the maximum frequency of the embeddings
Returns
    torch.Tensor: an [N x dim] Tensor of positional embeddings.
   zTimesteps should be a 1d-arrayr   r   )startenddtypedeviceNdim)r   r&   r   r   )lenshapemathlogtorcharangefloat32r*   exptor)   floatcatsincosnn
functionalpad)	r   r   r    r!   r"   r#   half_dimexponentembs	            m/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/models/transformers/transformer_qwenimage.pyget_timestep_embeddingrB   (   sB   6 y1$F&FF$!H$$u||XU]]9;K;K( H &::;H
))H

 
 
1C
AtG

"
"
$s47|
3C +C ))UYYs^UYYs^4"
=C iiQ	\*C9H9,=>BG qAhh!!%%c<8J    x	freqs_cisuse_realuse_real_unbind_dimc                    U(       GaI  Uu  pEUS   nUS   nUR                  U R                  5      UR                  U R                  5      pTUS:X  a\  U R                  " / U R                  SS QSPSP76 R	                  S5      u  pg[
        R                  " U* U/SS9R                  S5      nObUS:X  aM  U R                  " / U R                  SS QSPSP76 R	                  S5      u  pg[
        R                  " U* U/SS9nO[        SU S	35      eU R                  5       U-  UR                  5       U-  -   R                  U R                  5      n	U	$ [
        R                  " U R                  5       R                  " / U R                  SS QSPSP76 5      nUR                  S
5      n[
        R                  " X-  5      R                  S5      n
U
R                  U 5      $ )a  
Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
tensors contain rotary embeddings and are returned as real tensors.

Args:
    x (`torch.Tensor`):
        Query or key tensor to apply rotary embeddings. [B, S, H, D] xk (torch.Tensor): Key tensor to apply
    freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)

Returns:
    Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
NNr+   Nr   r,   r	   z`use_real_unbind_dim=z` but should be -1 or -2.r&   )r6   r*   reshaper/   unbindr2   stackflattenr8   
ValueErrorr7   r)   view_as_complex	unsqueezeview_as_realtype_as)rD   rE   rF   rG   r:   r9   x_realx_imag	x_rotatedoutx_outs              rA   apply_rotary_emb_qwenrY   ^   s   ( *o*o66!((#SVVAHH%5S"$YY<<b<!<CCBGNFfWf$52>FFqII B&YY<<a<<CCBGNF		F7F"3<I45H4IIbcddwwy3!2S!88<<QWWE
))!'')*;*;*QQWWSb\*Q2*Qq*QR	''*	""9#89AA!D}}QrC   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )QwenTimestepProjEmbeddings   c                 b   > [         TU ]  5         [        SSSSS9U l        [	        SUS9U l        g )N   Tr     )num_channelsr    r!   r"   )in_channelstime_embed_dim)super__init__r   	time_projr   timestep_embedder)selfr   	__class__s     rA   rd   #QwenTimestepProjEmbeddings.__init__   s2    "T`aimn!2sS`!arC   c                 |    U R                  U5      nU R                  UR                  UR                  S95      nUnU$ )N)r)   )re   rf   r6   r)   )rg   timestephidden_statestimesteps_projtimesteps_embconditionings         rA   forward"QwenTimestepProjEmbeddings.forward   s?    1..~/@/@}GZGZ/@/[\$rC   )re   rf   )__name__
__module____qualname____firstlineno__rd   rp   __static_attributes____classcell__rh   s   @rA   r[   r[      s    b rC   r[   c                   ~   ^  \ rS rSrSS\S\\   4U 4S jjjrSS jrS r\	R                  " SS9SS	 j5       rS
rU =r$ )QwenEmbedRope   thetaaxes_dimc           	        > [         TU ]  5         Xl        X l        [        R
                  " S5      n[        R
                  " S5      R                  S5      S-  S-
  n[        R                  " U R                  X@R                  S   U R                  5      U R                  X@R                  S   U R                  5      U R                  X@R                  S   U R                  5      /SS9U l	        [        R                  " U R                  XPR                  S   U R                  5      U R                  XPR                  S   U R                  5      U R                  XPR                  S   U R                  5      /SS9U l
        0 U l        X0l        g )Ni   r   r+   r&   r   r,   )rc   rd   r|   r}   r2   r3   flipr8   rope_params	pos_freqs	neg_freqs
rope_cache
scale_rope)rg   r|   r}   r   	pos_index	neg_indexrh   s         rA   rd   QwenEmbedRope.__init__   s=   
 LL&	LL&++A.3a7	  MM!,<djjI  MM!,<djjI  MM!,<djjI
 
   MM!,<djjI  MM!,<djjI  MM!,<djjI
 
  %rC   c                 T   US-  S:X  d   e[         R                  " US[         R                  " U[         R                  " SUS5      R	                  [         R
                  5      R                  U5      5      -  5      n[         R                  " [         R                  " U5      U5      nU$ )zV
Args:
    index: [0, 1, 2, 3] 1D Tensor representing the position index of the token
r   r         ?)	r2   outerpowr3   r6   r4   divpolar	ones_like)rg   indexr-   r|   freqss        rA   r   QwenEmbedRope.rope_params   s}    
 Qw!||E35%,,q#q:Q:T:TUZUbUb:c:g:ghk:l)m#mnEOOE2E:rC   c                 f   U R                   R                  U:w  a@  U R                   R                  U5      U l         U R                  R                  U5      U l        [	        U[
        5      (       a  US   n[	        U[
        5      (       d  U/n/ nSn[        U5       H  u  pgUu  pn
U SU	 SU
 3n[        R                  R                  5       (       d>  XR                  ;  a  U R                  XX5      U R                  U'   U R                  U   nOU R                  XX5      nUR                  U5      nUR                  U5        U R                  (       a  [        U	S-  U
S-  U5      nM  [        XU5      nM     [        U5      nU R                   XUU-   2S4   n[        R                  " USS9nXN4$ )z
Args: video_fhw: [frame, height, width] a list of 3 integers representing the shape of the video Args:
txt_length: [bs] a list of 1 integers representing the length of the text
r   _r   .r,   )r   r*   r6   r   
isinstancelist	enumerater2   compileris_compilingr   _compute_video_freqsappendr   maxr8   )rg   	video_fhwtxt_seq_lensr*   	vid_freqsmax_vid_indexidxfhwframeheightwidthrope_key
video_freqmax_len	txt_freqss                  rA   rp   QwenEmbedRope.forward   s   
 >>  F*!^^..v6DN!^^..v6DNi&&!!I)T**"I	!),HC#& E5axq0H>>..00??2040I0I%Y^0dDOOH-!__X6
!66ueQ
#v.JZ( #FaK!] K #F= A! -$ l#NN=73J#JC#OP	IIiQ/	##rC   N)maxsizec                    X-  U-  nU R                   R                  U R                   Vs/ s H  ofS-  PM	     snSS9nU R                  R                  U R                   Vs/ s H  ofS-  PM	     snSS9nUS   XDU-    R	                  USSS5      R                  XUS5      n	U R                  (       a  [        R                  " US   X"S-  -
  * S  US   S US-   /SS9n
U
R	                  SUSS5      R                  XUS5      n
[        R                  " US   X3S-  -
  * S  US   S US-   /SS9nUR	                  SSUS5      R                  XUS5      nOVUS   S U R	                  SUSS5      R                  XUS5      n
US   S U R	                  SSUS5      R                  XUS5      n[        R                  " XU/SS9R                  US5      nUR                  5       R                  5       $ s  snf s  snf )Nr   r&   r,   r   r+   )r   splitr}   r   viewexpandr   r2   r8   rK   clone
contiguous)rg   r   r   r   r   seq_lensrD   	freqs_pos	freqs_negfreqs_framefreqs_heightfreqs_widthr   s                rA   r   "QwenEmbedRope._compute_video_freqs   s   >E)NN(($--)H-Qq&-)Ha(P	NN(($--)H-Qq&-)Ha(P	l3u5::5!QKRRSXbgikl?? 99ilVk5I3J3L&MyYZ|\i^dhi^iOj%kqrsL',,Q2>EEeUZ\^_L))Yq\EQJ4F2G2I%JIVWLYe[`de[eLf$gmnoK%**1a;BB5RWY[\K$Q<055aBGNNu^ceghL#A,v.33Aq%DKKE[`bdeK		;kBKSST\^`a{{}'')) *I)Hs   G2"G7)r}   r   r   r   r   r|   )F)'  )r   )rr   rs   rt   ru   intr   rd   r   rp   	functools	lru_cacher   rv   rw   rx   s   @rA   rz   rz      sI    %c %T#Y % %6&$P &* '*rC   rz   c                       \ rS rSrSrSrS r    SS\S\R                  S\R                  S\R                  S	\
\R                     S
\
\R                     S\R                  4S jjrSrg) QwenDoubleStreamAttnProcessor2_0r^   z
Attention processor for Qwen double-stream architecture, matching DoubleStreamLayerMegatron logic. This processor
implements joint attention computation where text and image streams are processed together.
Nc                 D    [        [        S5      (       d  [        S5      eg )Nscaled_dot_product_attentionz`QwenDoubleStreamAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.)hasattrFImportError)rg   s    rA   rd   )QwenDoubleStreamAttnProcessor2_0.__init__  s%    q899r  :rC   attnrl   encoder_hidden_statesencoder_hidden_states_maskattention_maskimage_rotary_embr$   c           
         Uc  [        S5      eUR                  S   nUR                  U5      nUR                  U5      n	UR	                  U5      n
UR                  U5      nUR                  U5      nUR                  U5      nUR                  SUR                  S45      nU	R                  SUR                  S45      n	U
R                  SUR                  S45      n
UR                  SUR                  S45      nUR                  SUR                  S45      nUR                  SUR                  S45      nUR                  b  UR                  U5      nUR                  b  UR                  U	5      n	UR                  b  UR                  U5      nUR                  b  UR                  U5      nUb,  Uu  p[        XSS9n[        XSS9n	[        XSS9n[        XSS9n[        R                   " X/SS9n[        R                   " X/SS9n[        R                   " X/SS9n[#        UUUUSSU R$                  S9nUR'                  S	S
5      nUR)                  UR*                  5      nUS S 2S U2S S 24   nUS S 2US 2S S 24   nUR,                  S   " U5      n[/        UR,                  5      S:  a  UR,                  S   " U5      nUR1                  U5      nUU4$ )NzMQwenDoubleStreamAttnProcessor2_0 requires encoder_hidden_states (text stream)r&   r+   F)rF   r,   g        )	attn_mask	dropout_p	is_causalbackendr   r	   r   )rO   r/   to_qto_kto_v
add_q_proj
add_k_proj
add_v_proj	unflattenheadsnorm_qnorm_knorm_added_qnorm_added_krY   r2   r8   r   _attention_backendrN   r6   r)   to_outr.   
to_add_out)rg   r   rl   r   r   r   r   seq_txt	img_queryimg_key	img_value	txt_querytxt_key	txt_value	img_freqsr   joint_query	joint_keyjoint_valuejoint_hidden_statestxt_attn_outputimg_attn_outputs                         rA   __call__)QwenDoubleStreamAttnProcessor2_0.__call__  s    !(lmm'--a0 IIm,	))M*IIm,	 OO$9:	//"78OO$9:	 ''TZZ,<=	##BR(89''TZZ,<=	''TZZ,<=	##BR(89''TZZ,<=	 ;;"I.I;;"kk'*G()))4I(''0G '#3 I-iUSI+GOG-iUSI+GOG ii 6A>IIw0a8	ii 6A> 4$++
 299!Q?144[5F5FG .a'1n=-a1n= ++a.9t{{a"kk!n_=O///://rC    )NNNN)rr   rs   rt   ru   __doc__r   rd   r   r2   FloatTensorr   Tensorr   rv   r   rC   rA   r   r      s    
  488<6:37S0S0 ((S0  %00	S0
 %*$5$5S0 !!2!23S0 #5<<0S0 
		S0 S0rC   r   c                   D  ^  \ rS rSr SS\S\S\S\S\4
U 4S jjjrS r  SS	\	R                  S
\	R                  S\	R                  S\	R                  S\\\	R                  \	R                  4      S\\\\4      S\\	R                  \	R                  4   4S jjrSrU =r$ )QwenImageTransformerBlockid  r-   num_attention_headsattention_head_dimqk_normepsc                   > [         TU ]  5         Xl        X l        X0l        [
        R                  " [
        R                  " 5       [
        R                  " USU-  SS95      U l	        [
        R                  " USUS9U l        [        US UUUUSS[        5       UUS9U l        [
        R                  " USUS9U l        [!        XSS9U l        [
        R                  " [
        R                  " 5       [
        R                  " USU-  SS95      U l        [
        R                  " USUS9U l        [
        R                  " USUS9U l        [!        XSS9U l        g )	N   TbiasFelementwise_affiner   )	query_dimcross_attention_dimadded_kv_proj_dimdim_headr   out_dimcontext_pre_onlyr   	processorr   r   zgelu-approximate)r-   dim_outactivation_fn)rc   rd   r-   r   r   r;   
SequentialSiLULinearimg_mod	LayerNorm	img_norm1r   r   r   	img_norm2r   img_mlptxt_mod	txt_norm1	txt_norm2txt_mlp)rg   r-   r   r   r   r   rh   s         rA   rd   "QwenImageTransformerBlock.__init__f  s#    	#6 "4 }}GGIIIc1s7.
 ceM $!'%"68
	 ceM"sGYZ }}GGIIIc1s7.
 ceMceM"sGYZrC   c                     UR                  SSS9u  p4nUSUR                  S5      -   -  UR                  S5      -   UR                  S5      4$ )z Apply modulation to input tensorr	   r+   r,   r&   )chunkrQ   )rg   rD   
mod_paramsshiftr"   gates         rA   	_modulate#QwenImageTransformerBlock._modulate  sR    '--aR-8dA**+eooa.@@$..QRBSSSrC   rl   r   r   tembr   joint_attention_kwargsr$   c                 P   U R                  U5      nU R                  U5      nUR                  SSS9u  pUR                  SSS9u  pU R                  U5      nU R	                  X5      u  pU R                  U5      nU R	                  UU5      u  nnU=(       d    0 nU R                  " SUUUUS.UD6nUu  nnXU-  -   nUUU-  -   nU R                  U5      nU R	                  UU
5      u  nnU R                  U5      nUUU-  -   nU R                  U5      nU R	                  UU5      u  nnU R                  U5      nUUU-  -   nUR                  [        R                  :X  a  UR                  SS5      nUR                  [        R                  :X  a  UR                  SS5      nX!4$ )Nr   r+   r,   )rl   r   r   r   i  i  r   )r	  r  r  r  r  r  r   r  r  r  r  r)   r2   float16clip)rg   rl   r   r   r  r   r  img_mod_paramstxt_mod_paramsimg_mod1img_mod2txt_mod1txt_mod2
img_normedimg_modulated	img_gate1
txt_normedtxt_modulated	txt_gate1attn_outputr   r   img_normed2img_modulated2	img_gate2img_mlp_outputtxt_normed2txt_modulated2	txt_gate2txt_mlp_outputs                                 rA   rp   !QwenImageTransformerBlock.forward  s    d+d+ ,11!1<+11!1< ^^M2
#'>>*#G  ^^$9:
#'>>*h#G y "8!=2ii 
'"/'A-	

 %
 ,7( &O(CC 5	O8S S nn]3$(NN;$I!	n5%	N(BB nn%:;$(NN;$I!	n5 5	N8R R !&&%--7$9$>$>vu$M!%--/)..vu=M$33rC   )r   r   r-   r  r	  r  r  r   r  r  r  r  )rms_normư>rI   )rr   rs   rt   ru   r   strr7   rd   r  r2   r   r   r   r   r   rp   rv   rw   rx   s   @rA   r   r   d  s     tx'['[-0'[FI'[TW'[kp'[ '[RT IM;?A4||A4  %||A4 %*LL	A4
 llA4 #5u||)C#DEA4 !)c3h 8A4 
u||U\\)	*A4 A4rC   r   c                     ^  \ rS rSrSrSrS/rSS/rS/r\	         SS\
S\
S	\\
   S
\
S\
S\
S\
S\S\\
\
\
4   4U 4S jjj5       r        SS\R                   S\R                   S\R                   S\R"                  S\\\\
\
\
4         S\\\
      S\R                   S\\\\4      S\S\\R                   \4   4S jjrSrU =r$ )QwenImageTransformer2DModeli  a  
The Transformer model introduced in Qwen.

Args:
    patch_size (`int`, defaults to `2`):
        Patch size to turn the input data into small patches.
    in_channels (`int`, defaults to `64`):
        The number of channels in the input.
    out_channels (`int`, *optional*, defaults to `None`):
        The number of channels in the output. If not specified, it defaults to `in_channels`.
    num_layers (`int`, defaults to `60`):
        The number of layers of dual stream DiT blocks to use.
    attention_head_dim (`int`, defaults to `128`):
        The number of dimensions to use for each attention head.
    num_attention_heads (`int`, defaults to `24`):
        The number of attention heads to use.
    joint_attention_dim (`int`, defaults to `3584`):
        The number of dimensions to use for the joint attention (embedding/channel dimension of
        `encoder_hidden_states`).
    guidance_embeds (`bool`, defaults to `False`):
        Whether to use guidance embeddings for guidance-distilled variant of the model.
    axes_dims_rope (`Tuple[int]`, defaults to `(16, 56, 56)`):
        The dimensions to use for the rotary positional embeddings.
Tr   	pos_embednorm
patch_sizera   out_channels
num_layersr   r   joint_attention_dimguidance_embedsaxes_dims_ropec
                   > [         TU ]  5         U=(       d    UU l        Xe-  U l        [	        S[        U	5      SS9U l        [        U R                  S9U l        [        USS9U l
        [        R                  " X R                  5      U l        [        R                  " XpR                  5      U l        [        R                  " [!        U5       V
s/ s H  n
[#        U R                  UUS9PM     sn
5      U l        ['        U R                  U R                  SSS	9U l        [        R                  " U R                  X-  U R                  -  SS
9U l        SU l        g s  sn
f )Nr   T)r|   r}   r   )r   r6  )r   )r-   r   r   Fr   r   )rc   rd   r=  	inner_dimrz   r   r:  r[   time_text_embedr   txt_normr;   r  img_intxt_in
ModuleListranger   transformer_blocksr   norm_outproj_outgradient_checkpointing)rg   r<  ra   r=  r>  r   r   r?  r@  rA  r   rh   s              rA   rd   $QwenImageTransformer2DModel.__init__  s!    	(7K,A&UT.=Q^bc9W 3>ii^^<ii 3^^D"$-- z* +A *(;'9
 +	#
 /t~~t~~bgmqr		$..*2IDL]L]2]dhi&+#s   Erl   r   r   rk   
img_shapesr   guidanceattention_kwargsreturn_dictr$   c
           
         Ub#  UR                  5       nUR                  SS5      n
OSn
[        (       a  [        X
5        O+Ub(  UR	                  SS5      b  [
        R                  S5        U R                  U5      nUR                  UR                  5      nU R                  U5      nU R                  U5      nUb  UR                  UR                  5      S-  nUc  U R                  XA5      OU R                  XGU5      nU R                  XVUR                  S9n[        U R                   5       HW  u  p["        R$                  " 5       (       a+  U R&                  (       a  U R)                  UUUUUU5      u  p!MJ  U" UUUUUUS9u  p!MY     U R+                  X5      nU R-                  U5      n[        (       a  [/        X
5        U	(       d  U4$ [1        US9$ )	a  
The [`QwenTransformer2DModel`] forward method.

Args:
    hidden_states (`torch.Tensor` of shape `(batch_size, image_sequence_length, in_channels)`):
        Input `hidden_states`.
    encoder_hidden_states (`torch.Tensor` of shape `(batch_size, text_sequence_length, joint_attention_dim)`):
        Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
    encoder_hidden_states_mask (`torch.Tensor` of shape `(batch_size, text_sequence_length)`):
        Mask of the input conditions.
    timestep ( `torch.LongTensor`):
        Used to indicate denoising step.
    attention_kwargs (`dict`, *optional*):
        A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
        `self.processor` in
        [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
        tuple.

Returns:
    If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
    `tuple` where the first element is the sample tensor.
Nr"   r   z\Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective.r_   )r*   )rl   r   r   r  r   r  )sample)copypopr   r   getloggerwarningrF  r6   r)   rE  rG  rD  r:  r*   r   rJ  r2   is_grad_enabledrM  _gradient_checkpointing_funcrK  rL  r   r   )rg   rl   r   r   rk   rO  r   rP  rQ  rR  
lora_scaler  r   index_blockblockoutputs                   rA   rp   #QwenImageTransformer2DModel.forward!  s   H '/446)--gs;JJd/+0@0D0DWd0S0_r M2;;}223 $.C D $,A B{{=#6#67$>H    9%%h-H 	  >>*=K_K_>`"+D,C,C"DK$$&&4+F+F7;7X7X!).$84%} 8="/*?/I%5+;84%} #E, m:}-19'v66rC   )rM  rF  rC  rK  r=  r:  rL  rD  rJ  rG  rE  )	r   @      <         i   F)rb  8   rf  )NNNNNNNT)rr   rs   rt   ru   r    _supports_gradient_checkpointing_no_split_modules _skip_layerwise_casting_patterns_repeated_blocksr   r   r   boolr   rd   r2   r   
LongTensorr   r   r7  r   r   r   rp   rv   rw   rx   s   @rA   r9  r9    s   2 (,$45(3V'<$34 &("%#%#' %/;',', ', sm	',
 ',  ', !', !', ', c3m,', ',X /337%);?,0!%59 d7||d7  %||d7 %*LL	d7
 ""d7 T%S#"678d7 tCy)d7 ,,d7 #4S>2d7 d7 
u||55	6d7 d7rC   r9  )Fr&   r&   r   )Tr+   )=r   r0   typingr   r   r   r   r   r   r2   torch.nnr;   torch.nn.functionalr<   r   configuration_utilsr
   r   loadersr   r   utilsr   r   r   r   utils.torch_utilsr   	attentionr   attention_dispatchr   attention_processorr   cache_utilsr   
embeddingsr   r   modeling_outputsr   modeling_utilsr   normalizationr   r   
get_loggerrr   rX  r   r   rk  r7   rB   rY   Moduler[   rz   r   r   r9  r   rC   rA   <module>r~     s     : :     B ? V V 5 # 6 + $ 5 7 ' ; 
		H	% ""#3||33 3  	3
 3 3 \\3r !	- ||- U\\5#667-  -  	- 
 5<<%&- `  _*BII _*Da0 a0H p4		 p4 p4fm7*k;KMceo m7rC   