
    cCi                     z   S SK r S SKJr  S SKJrJr  S SKrS SKJr  S SK	Js  J
r  SSKJr  SSKJr  SSKJr  SSKJr  SS	KJr  SS
KJrJrJr  SSKJr  SSKJrJr  SSKJ r J!r!J"r"J#r#J$r$  SSK%J&r&  SSK'J(r(J)r)J*r*  \RV                  " \,5      r- " S S\ 5      r. " S S\!5      r/ " S S\R`                  5      r1 " S S\5      r2 " S S\R`                  5      r3 " S S\R`                  5      r4 " S S\R`                  5      r5 " S  S!\R`                  5      r6 " S" S#\R`                  5      r7 " S$ S%\R`                  5      r8 " S& S'\R`                  5      r9 " S( S)\&5      r: " S* S+\Rv                  5      r< " S, S-\R`                  5      r= " S. S/\R`                  5      r> " S0 S1\R`                  5      r? " S2 S3\R`                  5      r@ " S4 S5\R`                  5      rA\" S6S79 " S8 S9\5      5       rB " S: S;5      rC " S< S=\\B5      rD " S> S?\#\D5      rE " S@ SA\"\D\5      rF " SB SC\D5      rG " SD SE\D\5      rH/ SFQrIg)G    N)cached_property)OptionalUnion   )Cache)GenerationMixin)CausalLMOutputWithPast)PreTrainedModel)Unpack)auto_docstringcan_return_tuplelogging)deprecate_kwarg   )ChameleonPreTrainedModel#ChameleonVQVAEEncoderConvDownsample)LlamaAttentionLlamaDecoderLayerLlamaForCausalLM
LlamaModelTransformersKwargs)SiglipAttention   )
Emu3ConfigEmu3TextConfigEmu3VQVAEConfigc                       \ rS rSrSrg)Emu3Attention,    N__name__
__module____qualname____firstlineno____static_attributes__r        _/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/emu3/modular_emu3.pyr   r   ,       r'   r   c                   H  ^  \ rS rSrS\S\4U 4S jjr\" SSSS9      SS	\R                  S
\
\R                     S\
\R                     S\
\   S\
\   S\
\R                     S\
\\R                  \R                  4      S\\   S\R                  4S jj5       rSrU =r$ )Emu3DecoderLayer1   config	layer_idxc                 n   > [         TU ]  X5        [        R                  " UR                  5      U l        g N)super__init__nnDropoutattention_dropoutdropoutselfr-   r.   	__class__s      r(   r2   Emu3DecoderLayer.__init__2   s&    +zz&":":;r'   past_key_valuepast_key_valuesz4.58)new_nameversionhidden_statesattention_maskposition_ids	use_cachecache_positionposition_embeddingskwargsreturnc                     Un	U R                  U5      nU R                  " SUUUUUUUS.UD6u  pXR                  U5      -   nUn	U R                  U5      nU R	                  U5      nXR                  U5      -   nU$ )N)r?   r@   rA   r<   rB   rC   rD   r    )input_layernorm	self_attnr6   post_attention_layernormmlp)r8   r?   r@   rA   r<   rB   rC   rD   rE   residual_s              r(   forwardEmu3DecoderLayer.forward6   s     !,,];>> 	
')%+) 3	
 	
 !<<#>> 55mD/ <<#>>r'   )r6   )NNNFNN)r"   r#   r$   r%   r   intr2   r   torchTensorr   
LongTensorr   booltupler   r   rN   r&   __classcell__r9   s   @r(   r+   r+   1   s    <z <c < %0A6R 2637+/$)59KO|| !. u//0	
 "% D> !!1!12 &eELL%,,,F&GH +, 
 Sr'   r+   c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )Emu3VQVAEVectorQuantizerX   a  
A module for vector quantization using learned embedding vectors.

This module implements the quantization process similar to te one described in
the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
input vectors into discrete codebook vectors, which are learned during training.
Current implementation improves over previous ones by avoiding costly matrix multiplications
and allowing for post-hoc remapping of indices.
r-   c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        U R                  R                  R                  R                  SUR                  -  SUR                  -  5        g )Ng            ?)
r1   r2   r3   	Embeddingcodebook_size	embed_dim	embeddingweightdatauniform_r8   r-   r9   s     r(   r2   !Emu3VQVAEVectorQuantizer.__init__c   sb    f&:&:F<L<LM""++D63G3G,GvOcOcIcdr'   hidden_statec                    UR                   u  p#pEnUR                  SSSSS5      R                  5       nUR                  SU5      n[        R
                  " US-  SSS9n[        R
                  " U R                  R                  S-  SS	9n	S[        R                  " XpR                  R                  R                  SS5      5      -  n
X-   U
-
  n
[        R                  " U
SS	9nUR                  X#XV5      nU$ )
Nr   r   r      r   T)dimkeepdimrj   )shapepermute
contiguousviewrQ   sumr`   ra   matmul	transposeargmin)r8   rf   
batch_sizetemporalchannelsheightwidthhidden_state_flattenedhidden_state_sumembedding_sum	distancesmin_encoding_indicess               r(   rN    Emu3VQVAEVectorQuantizer.forwardh   s    8D8J8J5
h#++Aq!Q:EEG!-!2!22x!@ !99%;Q%>AtT		$.."7"7":B %;^^=R=R=\=\]^`a=bcc	$4y@	$||I1=388v]##r'   )r`   )r"   r#   r$   r%   __doc__r   r2   rQ   rR   rN   r&   rV   rW   s   @r(   rY   rY   X   s+    e e
$ELL $ $r'   rY   c                       \ rS rSrSrg)Emu3VQVAEEncoderConvDownsamplez   r    Nr!   r    r'   r(   r   r   z   r)   r'   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Emu3VQVAEEncoderConvUpsample~   c                 Z   > [         TU ]  5         [        R                  " XSSSS9U l        g )Nr   r   kernel_sizestridepadding)r1   r2   r3   Conv2dconv)r8   in_channelsr9   s     r(   r2   %Emu3VQVAEEncoderConvUpsample.__init__   s%    IIkAaYZ[	r'   c                 T    [         R                  " USSS9nU R                  U5      nU$ )N       @nearestscale_factormode)Finterpolater   r8   r?   s     r(   rN   $Emu3VQVAEEncoderConvUpsample.forward   s(    m#IV		-0r'   r   r"   r#   r$   r%   r2   rN   r&   rV   rW   s   @r(   r   r   ~   s    \ r'   r   c            	       j   ^  \ rS rSrS\S\S\\   S\\   4U 4S jjrS\R                  4S jr	S	r
U =r$ )
Emu3VQVAEConv3d   
in_channelout_channelr   r   c                 R  > [         T	U ]  5         [        USS  USS  5       VVs/ s H	  u  pVXV-
  PM     nnnSU l        US S S2    H&  nU =R                  US-  US-  -   US-  4-  sl        M(     U =R                  S-  sl        [        R
                  " UUUUS9U l        g s  snnf )Nr   r    ri   r   )r   r   )r   )r1   r2   zipr   r3   Conv3dr   )
r8   r   r   r   r   
one_kernel
one_stridepadding_sizespad_sizer9   s
            r(   r2   Emu3VQVAEConv3d.__init__   s     	ORS^_`_aSbdjklkmdnOopOo5KZ0Oop%dd+HLLX]X\98q=IIL ,II	
	 qs   B#r?   c                 h    [         R                  " XR                  5      nU R                  U5      nU$ r0   )r   padr   r   r   s     r(   rN   Emu3VQVAEConv3d.forward   s(    m\\:		-0r'   )r   r   )r"   r#   r$   r%   rP   rU   r2   rQ   rR   rN   r&   rV   rW   s   @r(   r   r      sK    

 
 3Z	

 c

,U\\  r'   r   c                   n   ^  \ rS rSrS\S\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )	Emu3VQVAESpatialNorm   r   out_channelsc                    > [         TU ]  5         [        R                  " USSSS9U l        [        R
                  " UUSSSS9U l        [        R
                  " UUSSSS9U l        g )N    ư>Tnum_channels
num_groupsepsaffiner   r   r   )r1   r2   r3   	GroupNorm
norm_layerr   conv_yconv_br8   r   r   r9   s      r(   r2   Emu3VQVAESpatialNorm.__init__   sn    
 	,,%	
 ii
 ii
r'   r?   quant_statesc                     [         R                  " X!R                  SS  SS9nU R                  U5      nXR	                  U5      -  U R                  U5      -   nU$ )Nr   )sizer   )r   r   rm   r   r   r   )r8   r?   r   s      r(   rN   Emu3VQVAESpatialNorm.forward   sT    }}\8K8KBC8PW`a6%L(AADKKP\D]]r'   )r   r   r   r"   r#   r$   r%   rP   r2   rQ   rR   rN   r&   rV   rW   s   @r(   r   r      s:    

 
8U\\   r'   r   c                   V   ^  \ rS rSrS\S\4U 4S jjrS\R                  4S jrSr	U =r
$ )Emu3VQVAETemporalUpsample   r   r   c                 D   > [         TU ]  5         [        UUSSS9U l        g )Nr   r   r   r   r   r   r   r   r1   r2   r   r   r8   r   r   r9   s      r(   r2   "Emu3VQVAETemporalUpsample.__init__   (    
 	#!	
	r'   r?   c                 D   UR                   u  p#pEnUR                  SSSSS5      R                  5       R                  USU5      n[        R
                  " USSS	9nUR                  X#XVS5      R                  SSSSS5      R                  5       nU R                  U5      nU$ )
Nr   r   r   rh   r   ri   r   r   r   )rm   rn   ro   rp   r   r   r   )r8   r?   ru   rw   rv   rx   ry   s          r(   rN   !Emu3VQVAETemporalUpsample.forward   s    8E8K8K5
h%--aAq!<GGINNz[]_ghm#IV%**:PRS[[\]_`bcefhijuuw		-0r'   r   r   rW   s   @r(   r   r      s/    

 
U\\  r'   r   c                   V   ^  \ rS rSrS\S\4U 4S jjrS\R                  4S jrSr	U =r
$ )Emu3VQVAETemporalDownsample   r   r   c                 D   > [         TU ]  5         [        UUSSS9U l        g )N)rh   r   r   )r   r   r   r   r   r   s      r(   r2   $Emu3VQVAETemporalDownsample.__init__   r   r'   r?   c                 (    U R                  U5      nU$ r0   r   r   s     r(   rN   #Emu3VQVAETemporalDownsample.forward   s    		-0r'   r   r   rW   s   @r(   r   r      s/    

 
U\\  r'   r   c                   4   ^  \ rS rSr SU 4S jjrS rSrU =r$ )Emu3VQVAETemporalResnetBlock   c                 f  > [         TU ]  5         Xl        Uc  UOUU l        [        R
                  " U5      U l        [        UUSSS9U l        [        R
                  " U5      U l	        [        UUSSS9U l
        U R                  U R                  :w  a  [        R                  " UUSSSS9U l        g g )Nr   r   r   r   r   r   )r1   r2   r   r   r3   BatchNorm3dnorm1r   conv1norm2conv2r   nin_shortcutr   s      r(   r2   %Emu3VQVAETemporalResnetBlock.__init__   s    
 	&+7+?K\^^K0
$!	

 ^^L1
$!	

 t000 "		!D 1r'   c                 P   UnU R                  U5      nU[        R                  " U5      -  nU R                  U5      nU R	                  U5      nU[        R                  " U5      -  nU R                  U5      nU R                  U R                  :w  a  U R                  U5      nX!-   $ r0   )	r   rQ   sigmoidr   r   r   r   r   r   )r8   r?   rL   s      r(   rN   $Emu3VQVAETemporalResnetBlock.forward  s     

=1}55

=1

=1}55

=1t000((2H''r'   )r   r   r   r   r   r   r   r0   r   rW   s   @r(   r   r      s     @( (r'   r   c                      ^  \ rS rSr  S	S\S\\   S\\   4U 4S jjjrS
S\R                  S\\R                     4S jjr	Sr
U =r$ )Emu3VQVAEResnetBlocki%  r   r   quant_channelsc                   > [         TU ]  5         Xl        Uc  UOUnX l        X0l        Uc9  [
        R                  " USSSS9U l        [
        R                  " USSSS9U l        O [        X15      U l        [        X25      U l        [
        R                  " UUSSSS9U l        [
        R                  " UUSSSS9U l        U R                  U R                  :w  a  [
        R                  " UUSSSS9U l        g g )	Nr   r   Tr   r   r   r   r   )r1   r2   r   r   r   r3   r   r   r   r   r   r   r   r   )r8   r   r   r   r9   s       r(   r2   Emu3VQVAEResnetBlock.__init__&  s     	&&2&:{(,!;2SW`deDJ<BTXaefDJ-nJDJ-nKDJYY

 YY

 t000 "		!D 1r'   r?   c                 |   U R                   c  SOU4nUnU R                  " U/UQ76 nU[        R                  " U5      -  nU R	                  U5      nU R
                  " U/UQ76 nU[        R                  " U5      -  nU R                  U5      nU R                  U R                  :w  a  U R                  U5      nXA-   $ Nr    )
r   r   rQ   r   r   r   r   r   r   r   )r8   r?   r   	norm_argsrL   s        r(   rN   Emu3VQVAEResnetBlock.forwardR  s    --5BN;L	 

==9=}55

=1

==9=}55

=1t000((2H''r'   )r   r   r   r   r   r   r   r   )NNr0   )r"   r#   r$   r%   rP   r   r2   rQ   rR   rN   r&   rV   rW   s   @r(   r   r   %  s_     '+(,	** sm* !	* *X(U\\ (8ELLCY ( (r'   r   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )Emu3VQVAEAttentionBlockid  r-   c                 2   > [         TU ]  U5        SU l        g )Nr   )r1   r2   num_key_value_groupsrd   s     r(   r2    Emu3VQVAEAttentionBlock.__init__e  s      %&!r'   )r   )r"   r#   r$   r%   r   r2   r&   rV   rW   s   @r(   r   r   d  s    & & &r'   r   c                   6   ^  \ rS rSrSrU 4S jrSS jrSrU =r$ )Emu3VQVAEGroupNormil  z
Same as the torch GroupNorm with the only difference that this ones accepts
an optional kwarg `quant_states` which is not used. This class makes it easier to
use SpatialNorm or GroupNorm without conditionals
c                 &   > [         TU ]  " S0 UD6  g r   )r1   r2   )r8   rE   r9   s     r(   r2   Emu3VQVAEGroupNorm.__init__s  s    "6"r'   c                     [         R                  " XR                  U R                  U R                  U R
                  5      $ r0   )r   
group_normr   ra   biasr   )r8   inputr   s      r(   rN   Emu3VQVAEGroupNorm.forwardv  s'    ||E??DKKDHHUUr'   r    r0   )	r"   r#   r$   r%   r   r2   rN   r&   rV   rW   s   @r(   r   r   l  s    #V Vr'   r   c                   p   ^  \ rS rSrSU 4S jjrSS\R                  S\\R                     4S jjrSr	U =r
$ )Emu3VQVAEMiddleBlockiz  c                    > [         TU ]  5         [        UUUS9U l        [	        U5      U l        Uc  [        USSSS9U l        O[        X25      U l        [        UUUS9U l	        g )Nr   r   r   r   r   Tr   )
r1   r2   r   block_1r   attn_1r   	attn_normr   block_2)r8   r-   r   r   r9   s       r(   r2   Emu3VQVAEMiddleBlock.__init__{  sm    +#$)

 .f5!/[UW]ajnoDN1.NDN+#$)
r'   r?   r   c                 N   U R                  X5      nUnU R                  X5      nUR                  u  pEpgUR                  XEXg-  5      R	                  SS5      nU R                  U5      S   nUR                  XFXu5      R                  SSSS5      nX1-   nU R                  X5      nU$ )Nr   r   r   r   )	r  r  rm   rp   rs   r  reshapern   r  )r8   r?   r   rL   ru   rw   rx   ry   s           r(   rN   Emu3VQVAEMiddleBlock.forward  s    ]A }C.;.A.A+
f%**:PZZ[\^_`M215%--j%RZZ[\^_abdef 0]Ar'   )r  r  r  r  r0   )r"   r#   r$   r%   r2   rQ   FloatTensorr   rN   r&   rV   rW   s   @r(   r   r   z  s1    
(
U%6%6 
huO`O`Fa 
 
r'   r   c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )Emu3VQVAEDownBlocki  c                   > [         TU ]  5         [        UR                  5      U l        UR
                  U l        UR                  nUR                  nS[        U5      -   nX@l        [        R                  " 5       U l        [        U R                  5       GHL  n[        R                  " 5       n[        R                  " 5       n[        R                  " 5       nX$U   -  n	X#U   -  n
[        U R
                  5       H~  nUR                  [        U	U
S95        U
n	UR                  c  M-  XQR                  ;   d  M>  UR                  [!        U5      5        UR                  [        R"                  " U	SSSS95        M     [        R$                  " 5       nXll        X|l        Xl        XPR                  S-
  :w  a  [-        U	5      Ul        U R                  R                  U5        GMO     g )N)r   r   r   r   r   Tr   r   )r1   r2   lenchannel_multipliernum_resolutionsnum_res_blocksbase_channelsrU   in_channel_multiplierr3   
ModuleListdownrangeappendr   attn_resolutionsr   r   Moduleblockattn
attn_normsr   
downsample)r8   r-   r  r  r  i_levelr  r  r  block_in	block_outi_blockr  r9   s                r(   r2   Emu3VQVAEDownBlock.__init__  s   "6#<#<=$33,,#66 $u-?'@ @%:"MMO	T112GMMOE==?DJ$W'EEH%7(CCI !4!45($,%. %**67F]F];]KK 7 ?@%%bllUW]ajn&op 6 99;DJI(O..22"@"JIIT"1 3r'   r?   c                 <   [        U R                  5       GH  u  p#[        U R                  5       H  nUR                  U   " U5      n[        UR                  5      S:  d  M3  UnUR                  U   " U5      nUR                  u  pgpUR                  XgX-  5      R                  SS5      nUR                  U   " U5      S   nUR                  XhX5      R                  SSSS5      nXQ-   nM     X R                  S-
  :w  d  M  UR                  U5      nGM     U$ )Nr   r   r   r   )	enumerater  r  r  r  r  r  r  rm   rp   rs   r  rn   r  r  )
r8   r?   r  blocksr!  rL   ru   rw   rx   ry   s
             r(   rN   Emu3VQVAEDownBlock.forward  s   (3OG !4!45 &W 5m Dv{{#a',H$*$5$5g$>}$MM:G:M:M7J&$1$6$6zV^$\$f$fghjk$lM$*KK$8$G$JM$1$9$9*e$^$f$fghjkmnpq$rM$,$<M 6 ..22 & 1 1- @  4" r'   )r  r  r  r  
r"   r#   r$   r%   r2   rQ   r	  rN   r&   rV   rW   s   @r(   r  r    s     ##JU%6%6  r'   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )Emu3VQVAEUpBlocki  c           
        > [         TU ]  5         [        UR                  5      U l        UR
                  U l        UR                  nUR                  UR                  S   -  n[        R                  " 5       U l
        [        [        U R                  5      5       GH8  n[        R                  " 5       n[        R                  " 5       n[        R                  " 5       nUR                  UR                  U   -  n[        U R
                  S-   5       Hd  n	UR                  [        UUUS95        UnXAR                  ;   d  M0  UR                  [!        U5      5        UR                  [#        X#5      5        Mf     [        R$                  " 5       n
XZl        Xjl        Xzl        US:w  a  [-        U5      U
l        U R                  R1                  SU
5        GM;     g )Nri   r   r   r   )r1   r2   r  r  r  r  r_   r  r3   r  upreversedr  r  r   r  r   r   r  r  r  r  r   upsampleinsert)r8   r-   r   r  r  r  r  r  r   r!  r+  r9   s              r(   r2   Emu3VQVAEUpBlock.__init__  si   "6#<#<=$33))''&*C*CB*GG--/d&:&: ;<GMMOE==?DJ,,v/H/H/QQI !4!4q!89($,%.'5 %555KK 7 ?@%%&:>&TU : BHG&M!|:8DGGNN1b!3 =r'   r?   r   c                 b   [        U R                  S S S2   5       GH  u  p4[        U R                  S-   5       H  nUR                  U   " X5      n[        UR                  5      S:  d  M3  UnUR                  U   " X5      nUR                  u  pxpUR                  XxX-  5      R                  SS5      nUR                  U   " U5      S   nUR                  XyX5      R                  SSSS5      nXa-   nM     U[        U R                  5      S-
  :w  d  M  UR                  U5      nGM     U$ )Nri   r   r   r   r   )r$  r+  r  r  r  r  r  r  rm   rp   rs   r  rn   r-  )r8   r?   r   r  r%  r!  rL   ru   rw   rx   ry   s              r(   rN   Emu3VQVAEUpBlock.forward  s   (27OG !4!4q!89 &W 5m Rv{{#a',H$*$5$5g$>}$[M:G:M:M7J&$1$6$6zV^$\$f$fghjk$lM$*KK$8$G$JM$1$9$9*e$^$f$fghjkmnpq$rM$,$<M : #dgg,** & >  8  r'   )r  r  r+  r'  rW   s   @r(   r)  r)    s-    #"JU%6%6 eFWFW  r'   r)  c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )Emu3VQVAEEncoderi  c                   > [         TU ]  5         UR                  nUR                  nUR                  nUR
                  nUR                  nU(       a  SU-  OUnX&S   -  n[        R                  R                  X2SSSS9U l
        [        U5      U l        [        X5      U l        [        R                  R                  SUSSS	9U l        [        R                  R                  UUSSSS9U l        [%        [&        R(                  " UR*                  5      5      n	[        R,                  " 5       U l        [        R,                  " 5       U l        [3        U	5       H)  n
[5        Xw5      nU R.                  R7                  U5        M+     [3        UR8                  5       H(  n[;        UUS
9nU R0                  R7                  U5        M*     g )Nr   ri   r   r   r   r   r   T)r   r   r   r   r  )r1   r2   r  r   double_latentlatent_channelsr  rQ   r3   r   conv_inr  
down_blockr   middle_blockr   norm_outconv_outrP   mathlog2temporal_downsample_factorr  	time_convtime_res_stackr  r   r  r  r   )r8   r-   r  r   r5  r6  r  r   r  temporal_down_blocksir   rM   time_res_convr9   s                 r(   r2   Emu3VQVAEEncoder.__init__  s   ,,((,, 00#66.;q?* b#99xx{qYZdef,V40B**bxUYbf*g ( 
  #499V-N-N#OP mmo+,A.|JDNN!!$' - v,,-A8()M &&}5 .r'   pixel_valuesc                 t   UR                   S   nUR                  " S/UR                   SS  Q76 nU R                  U5      nU R                  U5      nU R	                  U5      nU R                  U5      nU[        R                  " U5      -  nU R                  U5      nUR                  " SU/UR                   SS  Q76 nUR                  SSSSS5      nU R                   H$  nU" U5      nU[        R                  " U5      -  nM&     U R                   H  nU" U5      nM     UR                  SSSSS5      nU$ )Nr   ri   r   r   r   rh   )rm   r  r7  r8  r9  r:  rQ   r   r;  rn   r?  r@  )r8   rE  temporal_dimr?   r   layers         r(   rN   Emu3VQVAEEncoder.forward9  s:   #))!,#++BH1C1CAB1GH \26))-8 m4}55m4%--b,YATATUVUWAXY%--aAq!< NND /MU]]=99M # ((E!-0M ) &--aAq!<r'   )r7  r;  r8  r9  r:  r?  r@  )
r"   r#   r$   r%   r2   rQ   rS   rN   r&   rV   rW   s   @r(   r3  r3    s     %6NE$4$4  r'   r3  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Emu3VQVAEDecoderiW  r-   c                   > [         T	U ]  5         UR                  nUR                  UR                  S   -  n[
        R                  " 5       U l        [        UR                  5       H<  n[        UR                  UR                  S9nU R                  R                  U5        M>     [        [        R                  " UR                   5      5      n[
        R                  " 5       U l        [        U5       H>  n[%        UR                  UR                  5      nU R"                  R                  U5        M@     [
        R&                  " UR                  USSSS9U l        [+        XUS9U l        [/        U5      U l        UR                  UR                  S   -  n[3        X#5      U l        [
        R&                  " UUR6                  SSSS9U l        g )Nri   r  r   r   r   )r   r   )r1   r2   r_   r  r  r3   r  r@  r  r  r   r6  r  rP   r<  r=  r>  r?  r   r   r7  r   r9  r)  up_blockr   r:  r   r;  )
r8   r-   r   r  rM   rC  temp_upsample_block_numrB  r   r9   s
            r(   r2   Emu3VQVAEDecoder.__init__X  s|   ))''&*C*CB*GG mmov,,-A8"22AWAWM &&}5	 . #&dii0Q0Q&R"S./A,V-C-CVE[E[\DNN!!$' 0 yy""
 1R`a(0''&*C*CA*FF,^F		
r'   r?   r   c                    [         R                  " X4SS9nUR                  SSSSS5      nU R                   H  nU" U5      nM     U R                   H$  nU" U5      nU[         R
                  " U5      -  nM&     UR                  SSSSS5      n[         R                  " USSS9u  pUR                  " S/UR                  SS  Q76 nUR                  " S/UR                  SS  Q76 nU R                  U5      nU R                  X5      nU R                  X5      nU R                  X5      nU[         R
                  " U5      -  nU R                  U5      nU$ )Nr   rl   r   r   r   rh   ri   )rQ   catrn   r@  r?  r   chunkr  rm   r7  r9  rM  r:  r;  )r8   r?   r   hidden_quant_statesrH  s        r(   rN   Emu3VQVAEDecoder.forward  sV   #ii(E1M199!Q1aH ((E"'(;"< ) ^^E"'(;"<5==1D#EE $ 299!Q1aH&+kk2Eqa&P#%--bK=3F3Fqr3JK#++BH1C1CAB1GH]3 ))-FmBmB}55m4r'   )r7  r;  r9  r:  r?  r@  rM  )r"   r#   r$   r%   r   r2   rQ   rR   rN   r&   rV   rW   s   @r(   rK  rK  W  s0    %
 %
NU\\   r'   rK  aR  
    The VQ-VAE model used in Emu3 for encoding/decoding images into discrete tokens.
    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv
    Taigman](https://huggingface.co/papers/2203.13131).
    )custom_introc                      ^  \ rS rSr% \\S'   SrSrSrSr	Sr
Sr/ SQrS rS\4U 4S jjrS\R                   S	\R                   4S
 jrS\R                   4S jrSrU =r$ )	Emu3VQVAEi  r-   
emuvideovqrE  T)r   r   r   rY   c                    [        U[        R                  [        R                  45      (       a  [        R                  R                  UR                  SSS9  UR                  bq  [        R                  R                  UR                  5      u  p#S[        R                  " U5      -  n[        R                  R                  UR                  U* U5        g g [        U[        R                  5      (       a  [        R                  R                  UR                  [        R                  " S5      S9  UR                  by  [        R                  R                  UR                  5      u  p#US:  a  S[        R                  " U5      -  OSn[        R                  R                  UR                  U* U5        g g [        U[        R                  [        R                  [        R                   45      (       aU  [        R                  R#                  UR                  S5        [        R                  R#                  UR                  S	5        g [        U[        R$                  5      (       ad  UR                  R&                  R)                  5         UR*                  b2  UR                  R&                  UR*                     R-                  5         g g g )
Nfan_outrelu)r   nonlinearityr      )ar   r\   g        )
isinstancer3   r   r   initkaiming_normal_ra   r   _calculate_fan_in_and_fan_outr<  sqrtrc   Linearkaiming_uniform_BatchNorm2dr   r   	constant_r]   rb   normal_padding_idxzero_)r8   modulefan_inrM   bounds        r(   _init_weightsEmu3VQVAE._init_weights  s   fryy"))455GG##FMM	PV#W{{&GGAA&--P	DIIf--  ufe< ' 		**GG$$V]]diil$C{{&GGAA&--P	17!DIIf--  ufe< '  NOOGGfmmS1GGfkk3/--MM&&(!!-""6#5#56<<> . .r'   c                   > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        U5      U l        S[        UR                  5      S-
  -  U l        [        UR                  UR                  SSS9U l        [        UR                  UR                  SSS9U l        S[        UR                  5      S-
  -  U l        U R%                  5         U R'                  5         g )Nr   r   )r   r   r   r   r   )r1   r2   r-   r3  encoderrK  decoderrY   quantizer  r  vision_spatial_factorr   r6  r_   
quant_convpost_quant_convspatial_scale_factoreval	post_initrd   s     r(   r2   Emu3VQVAE.__init__  s     '/'/08%&3v/H/H+IA+M%N")""F$4$4)T]
  /f44)T] 
 %&#f.G.G*H1*L$M!		r'   image_sizesc                    UR                   S:H  nU(       aJ  U R                  R                  nUR                  u  pVpxUR	                  S5      R                  SUSSS5      nOUR                  u  pTpgnU R                  U5      n	U	R                  SSSSS5      n	U R                  U	5      n	U	R                  SSSSS5      n	U R                  U	5      n
U(       a  U
R                  S5      OU
n[        X5       VVs/ s HB  u  pUS [        US   U R                  -  5      2S [        US   U R                  -  5      24   PMD     nnnU$ s  snnf )Nrh   r   r   r   r   )ndimr-   r>  rm   	unsqueezerepeatrq  rn   ru  rs  squeezer   rP   rt  )r8   rE  r{  is_imagerv   ru   rw   rx   ry   r?   codesimage_tokenssingle_imager   s                 r(   encodeEmu3VQVAE.encode  sP   $$){{==H2>2D2D/J&'11!4;;AxAqQL<H<N<N9J(E\2 &--aAq!<6 &--aAq!<m,+3u}}Q' '*,&D
&D" D3tAw)C)CCDDFqDQRGVZVpVpLpHqFqqr&D 	 

 
s   6A	Er?   c                    UR                   S:H  nU(       a  UR                  S5      nUR                  u  p4pVU R                  R	                  UR                  5       5      nUR                  S   nUR                  X4XVU5      R                  SSSSS5      R                  5       nU R                  U5      n	UR                  SSSSS5      nU	R                  SSSSS5      n	U R                  X5      n
U
R                  UX@R                  R                  -  U R                  R                  XPR                  -  X`R                  -  5      n
U(       a	  U
S S 2S4   $ U
$ )Nr   r   ri   r   rh   r   )r}  r~  rm   rs  r`   flattenrp   rn   ro   rv  rr  r  r-   r>  r   rw  )r8   r?   r  ru   rv   rx   ry   quantrw   
post_quantvideos              r(   decodeEmu3VQVAE.decode  s;    %%*)33A6M.;.A.A+
f''(=(=(?@;;r?

:IQQRSUVXY[\^_`kkm))%0
aAq!,''1aA6
Z/{{===KK$$...---
 'uQT{1E1r'   )r-   rr  rq  rv  ru  rs  rw  rt  )r"   r#   r$   r%   r   __annotations__base_model_prefixmain_input_name_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backend_no_split_modulesrn  r2   rQ   rR   r  r  r&   rV   rW   s   @r(   rW  rW    sv     $$ON"&?* *5<< ell 82ELL 2 2r'   rW  c                       \ rS rSrSrS r\S 5       r\S 5       r\S 5       r	\S 5       r
\S 5       r\S	 5       rS
\\R                     S\R                  4S jrS
\R                  S\R                  4S jrSrg)Emu3ImageVocabularyMappingi  zE
A class for mapping discrete image tokens from VQGAN to BPE tokens.
c                 h    Xl         UR                  S5      U l        UR                  S5      U l        g )Nz<|extra_200|>z<image>)	vocab_mapgeteol_token_idimage_token_id)r8   r  s     r(   r2   #Emu3ImageVocabularyMapping.__init__  s)    "%MM/:'mmI6r'   c           	          [        U R                  R                  5        VVs/ s H  u  pUR                  S5      (       d  M  UPM!     snn5      $ s  snnf Nz<|visual tokensortedr  items
startswithr8   namevals      r(   r  'Emu3ImageVocabularyMapping.image_tokens  s<    DNN,@,@,Bh,BytdooVfFgs,Bhiih   A
A
c           	          [        U R                  R                  5        VVs/ s H  u  pUR                  S5      (       d  M  UPM!     snn5      $ s  snnf r  r  r  s      r(   image_tokens_str+Emu3ImageVocabularyMapping.image_tokens_str"  s<    T^^-A-A-Ci-C	tWgGht-Cijjir  c                 z    U R                    Vs0 s H  n[        USS 5      U R                  U   _M!     sn$ s  snf )Nir   )r  rP   r  )r8   tokens     r(   img2bpe"Emu3ImageVocabularyMapping.img2bpe&  s;    FJF[F[\F[UE"RL!4>>%#88F[\\\s   &8c                 l    U R                   R                  5        VVs0 s H  u  pX!_M	     snn$ s  snnf r0   )r  r  )r8   kvs      r(   bpe2img"Emu3ImageVocabularyMapping.bpe2img*  s-    !%!3!3!56!5!5666s   0c                     [         R                  " [        U R                  R	                  5       5      S-   [         R
                  S9nU R                  R                  5        H	  u  p#X1U'   M     U$ Nr   dtype)rQ   zerosmaxr  keysrP   r  r8   mappingr  r  s       r(   bpe2img_mapping_tensor1Emu3ImageVocabularyMapping.bpe2img_mapping_tensor.  R    ++c$,,"3"3"56:%))LLL&&(DAAJ )r'   c                     [         R                  " [        U R                  R	                  5       5      S-   [         R
                  S9nU R                  R                  5        H	  u  p#X1U'   M     U$ r  )rQ   r  r  r  r  rP   r  r  s       r(   img2bpe_mapping_tensor1Emu3ImageVocabularyMapping.img2bpe_mapping_tensor5  r  r'   	img_batchrF   c                 "   UR                   n[        R                  " UR                  S   S4[        R                  S9U R
                  -  nU R                  UR                  S5         n[        R                  " XC/SS9nUR                  U5      $ )Nr   r   r  cpuri   rl   )	devicerQ   onesrm   rP   r  r  torQ  )r8   r  r  eol_row
img_tokenss        r(   convert_img2bpe*Emu3ImageVocabularyMapping.convert_img2bpe<  su    !!**iooa0!4EIIFIZIZZ00e1DE
YY
4"=
}}V$$r'   c                     UR                   nUSS S24   nU R                  UR                  S5         nUR                  U5      $ )N.ri   r  )r  r  r  )r8   r  r  r  s       r(   convert_bpe2img*Emu3ImageVocabularyMapping.convert_bpe2imgC  sG    !!c3B3h'	00e1DE
}}V$$r'   )r  r  r  N)r"   r#   r$   r%   r   r2   r   r  r  r  r  r  r  listrQ   rR   r  r  r&   r    r'   r(   r  r    s    7
 j j k k ] ] 7 7    %ell); % %% %%,, %r'   r  c                   "    \ rS rSrS/rSrSrSrg)Emu3PreTrainedModeliJ  r+   Tr    N)r"   r#   r$   r%   r  r  r  r&   r    r'   r(   r  r  J  s     "&r'   r  c                   :   ^  \ rS rSr\\S.rS\4U 4S jjrSr	U =r
$ )Emu3TextModeliR  )r?   
attentionsr-   c           	         > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ s H  n[        X5      PM     sn5      U l        g s  snf r0   )r1   r2   r3   r  r  num_hidden_layersr+   layersr7   s      r(   r2   Emu3TextModel.__init__X  sH     mmBGH`H`BabBaYf0Bab
bs   A)r  )r"   r#   r$   r%   r+   r   _can_record_outputsr   r2   r&   rV   rW   s   @r(   r  r  R  s"    )#

z 
 
r'   r  c                   @   ^  \ rS rSr% \\S'   U 4S jrU 4S jrSrU =r	$ )Emu3ForCausalLMi_  r-   c                 D   > [         TU ]  U5        [        U5      U l        g r0   )r1   r2   r  modelrd   s     r(   r2   Emu3ForCausalLM.__init__b  s     "6*
r'   c                  6   > [        5       R                  5         g)aC  
Example:

```python
>>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
>>> import torch
>>> import requests
>>> from PIL import Image

>>> model = Emu3ForCausalLM.from_pretrained("BAAI/Emu3-Chat-hf", dtype=torch.bfloat16)
>>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

>>> inputs = processor(text=["Can you write me a poem about winter."], return_tensors="pt").to(model.device)

>>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
>>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
```N)r1   rN   )super_kwargsr9   s    r(   rN   Emu3ForCausalLM.forwardf  s    $ 	r'   )r  )
r"   r#   r$   r%   r   r  r2   rN   r&   rV   rW   s   @r(   r  r  _  s    + r'   r  c                     ^  \ rS rSrSS0rU 4S jrS rS rS rS r	S	\
R                  S
\
R                  4S jrS	\
R                  S
\
R                  4S jr\
R                  S\
R                  S\S\4S j5       rS\
R                  S\
R                  S\
R                  4S jr\\         SS\\
R                     S	\\
R                     S
\\
R,                     S\\
R,                     S\\
R                     S\\   S\\
R                     S\\   S\\
R                     S\\   S\\\4   4S jj5       5       rSrU =r $ )	Emu3Modeli{  ztext_model.model
text_modelc                    > [         TU ]  U5        [        R                  UR                  5      U l        [        UR                  5      U l        [        UR                  5      U l        U R                  5         g r0   )r1   r2   r  _from_configtext_configr  rW  	vq_configvqmodelr  vocabulary_mapvocabulary_mappingry  rd   s     r(   r2   Emu3Model.__init__~  sY     '44V5G5GH !1!12"<V=R=R"S 	r'   c                 6    U R                   R                  5       $ r0   )r  get_input_embeddingsr8   s    r(   r  Emu3Model.get_input_embeddings  s    3355r'   c                 :    U R                   R                  U5        g r0   )r  set_input_embeddingsr8   values     r(   r  Emu3Model.set_input_embeddings  s    ,,U3r'   c                     Xl         g r0   r  r8   rr  s     r(   set_decoderEmu3Model.set_decoder  s    !r'   c                     U R                   $ r0   r  r  s    r(   get_decoderEmu3Model.get_decoder  s    r'   rE  r{  c                     U R                   R                  X5      nU Vs/ s H+  o@R                  R                  U5      R	                  5       PM-     nn[
        R                  " U5      nU$ s  snf )a  
Tokenizes images into discrete tokens with VQGAN module. Converts
obtained image tokens into BPE tokens and wraps with "boi" and "eoi"
special tokens.

Args:
    pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        The tensors corresponding to the input images.
    image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
        The sizes of the images in the batch, being (height, width) for each image.
)r  r  r  r  r  rQ   rQ  )r8   rE  r{  image_tokens_listtokensbpe_tokens_list
bpe_tokenss          r(   get_image_tokensEmu3Model.get_image_tokens  sb     !LL//JctuctY_22BB6JRRTctuYY/
 vs   2A,c                    U R                  X5      nU VVs/ s H9  u  pEX@R                  R                  -  XPR                  R                  -  S-   -  PM;     nnnU R                  5       " U5      n[        R
                  " Xv5      nU$ s  snnf )a  
Tokenizes images into discrete tokens with VQGAN module and embeds
them with text embeddings layer

Args:
    pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
        The tensors corresponding to the input images.
r   )r  r  rt  r  rQ   split)r8   rE  r{  r  rx   ry   split_sizesimage_featuress           r(   get_image_featuresEmu3Model.get_image_features  s     ,,\G "-
!, ||999e||GiGi>ilm>mn!, 	 
 224\B^A
s   A Br  rx   ry   c                     USS2SS24   R                  SX#S-   5      nU R                  R                  U5      nU R                  R	                  U5      nU$ )a  
Decodes generated image tokens from language model to continuous pixel values
with VQGAN module via upsampling.

Args:
    image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
        The tensors corresponding to the input images.
    height (`int`):
        Height of the generated image before upsampling.
    width (`int`):
        Width of the generated image before upsampling.
Nri   r   )rp   r  r  r  r  )r8   r  rx   ry   	sequencesimages         r(   decode_image_tokensEmu3Model.decode_image_tokens  sV     !CRC(--b&!)D	..>>yI##L1r'   	input_idsinputs_embedsr
  c           	      J   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  5       nUR                  S5      R                  U5      R                  UR                  5      nUR                  S   UR                  S   -  nX$   R                  5       UR                  5       :w  a  [        SU SU 35      eU$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
)r  r  ri   r   r   z6Image features and image tokens do not match: tokens: z, features )r  rQ   tensorr  r  longr  allrq   r~  	expand_asr  rm   numel
ValueError)r8   r  r  r
  special_image_maskn_image_tokensn_image_featuress          r(   get_placeholder_maskEmu3Model.get_placeholder_mask  s    !.2K2K2MT44CC5::^k^r^rs3 " "4!7!7!;!*.E.E.T.T!T+//1/99"=GGVYYZgZnZno)//2^5I5I!5LL,2248L8L8NNHHXXcdtcuv  "!r'   r@   rA   r<   rB   rC   rE   rF   c
           
      0   USL USL-  (       a  [        S5      eUc  U R                  5       " U5      nUbG  U R                  X#5      n[        R                  " USS9nU R                  XUS9nUR                  X5      nU R                  " SUUUUUU	S.U
D6nU$ )aH  
image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
    The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
    [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
    [`Emu3ImageProcessor`] for processing images).
NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either oner   rl   )r  r
  )r@   rA   r<   r  rB   rC   r    )r  r  r  rQ   rQ  r  masked_scatterr  )r8   r  rE  r{  r@   rA   r<   r  rB   rC   rE   image_embedsr  outputss                 r(   rN   Emu3Model.forward  s    * -t";<s    557	BM#22<ML 99\q9L!%!:!:| "; " *889KZM // 
)%+')
 
 r'   )r  r  r  )	NNNNNNNNN)!r"   r#   r$   r%   _checkpoint_conversion_mappingr2   r  r  r  r  rQ   r	  rS   r  r  no_gradrP   r  r  r   r   r   rR   r   rT   r   r   r   rU   r	   rN   r&   rV   rW   s   @r(   r  r  {  s   &8,%G"64"U->-> UM]M] "u/@/@ uO_O_ $ ]]0@0@ # VY  $"))":?:K:K"]b]n]n"0  1548.21537+/59$(59.E,,-. u001. ell+	.
 !.. u//0. "%.   1 12. D>. !!1!12. +,. 
u,,	-.  .r'   r  c                   B  ^  \ rS rSrSrS/rSSSS.rU 4S jrS	 rS
 r	S\
R                  4S jrS rS r\S 5       r\S 5       r\S 5       rS r\\           S"S\\R0                     S\\R2                     S\\R4                     S\\R4                     S\\R0                     S\\   S\\R2                     S\\   S\\R0                     S\\R0                     S\\\R4                  4   S\\    S\\!\"4   4S jj5       5       r#       S#U 4S  jjr$S!r%U =r&$ )$Emu3ForConditionalGenerationi   zlm_head.weightzmodel.text_modelzmodel.vqmodellm_head)z^text_model.modelz^vqmodelz^text_model.lm_headc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  R                  UR                  R                  SS9U l	        U R                  5         g )NF)r   )r1   r2   r  r  r3   rd  r  hidden_size
vocab_sizer+  ry  rd   s     r(   r2   %Emu3ForConditionalGeneration.__init__  sS     v&
yy!3!3!?!?ASASA^A^ejkr'   c                 6    U R                   R                  5       $ r0   )r  r  r  s    r(   r  1Emu3ForConditionalGeneration.get_input_embeddings$  s    zz..00r'   c                 :    U R                   R                  U5        g r0   )r  r  r  s     r(   r  1Emu3ForConditionalGeneration.set_input_embeddings'  s    

''.r'   rF   c                     U R                   $ r0   )r+  r  s    r(   get_output_embeddings2Emu3ForConditionalGeneration.get_output_embeddings*  s    ||r'   c                 :    U R                   R                  U5        g r0   )r  r  r  s     r(   r  (Emu3ForConditionalGeneration.set_decoder-  s    

w'r'   c                 6    U R                   R                  5       $ r0   )r  r  r  s    r(   r  (Emu3ForConditionalGeneration.get_decoder0  s    zz%%''r'   c                 .    U R                   R                  $ r0   )r  r  r  s    r(   r  'Emu3ForConditionalGeneration.text_model4  s    zz$$$r'   c                 .    U R                   R                  $ r0   )r  r  r  s    r(   r  $Emu3ForConditionalGeneration.vqmodel8  s    zz!!!r'   c                 .    U R                   R                  $ r0   )r  r  r  s    r(   r  /Emu3ForConditionalGeneration.vocabulary_mapping<  s    zz,,,r'   c                 :    U R                   R                  " S0 UD6$ r   )r  r  )r8   rE   s     r(   r  0Emu3ForConditionalGeneration.decode_image_tokens@  s    zz--777r'   r  rE  r{  r@   rA   r<   r  rB   rC   labelslogits_to_keeprE   c                    U R                   " SUUUUUUU	S.UD6nUS   n[        U[        5      (       a  [        U* S5      OUnU R	                  USS2USS24   5      nSnU
b3  U R
                  " SUXR                  R                  R                  S.UD6n[        UUUR                  UR                  UR                  S9$ )aF  
image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
    The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
    [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
    [`Emu3ImageProcessor`] for processing images).
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
>>> import torch
>>> import requests
>>> from PIL import Image

>>> model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", dtype=torch.bfloat16)
>>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

>>> conversation = [
...     {
...     "role": "system",
...     "content": [
...         {"type": "text", "text": "You are a helpful assistant."},
...         ],
...     },
...     {
...     "role": "user",
...     "content": [
...         {"type": "image"},
...         {"type": "text", "text": "Please describe the image."},
...         ],
...     },
... ]

>>> prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
>>> image = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)

>>> inputs = processor(images=[image], text=[prompt], return_tensors="pt").to(model.device, torch.bfloat16)

>>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
>>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
```)r  r@   rA   r<   r  rB   rC   r   N)logitsrC  r.  )lossrF  r<   r?   r  r    )r  r_  rP   slicer+  loss_functionr-   r  r.  r	   r<   r?   r  )r8   r  rE  r{  r@   rA   r<   r  rB   rC   rC  rD  rE   r$  r?   slice_indicesrF  rG  s                     r(   rN   $Emu3ForConditionalGeneration.forwardC  s    | ** 	
)%+')	
 	
  
8B>SV8W8W~ot4]kmA}a,?@A%% f9P9P9[9[_eD &#33!//))
 	
r'   c	                 V   > [         TU ]  " U4UUUUUUUS.U	D6n
US   S:w  a  S U
S'   U
$ )N)r<   r@   r  rC   rA   rE  rB   r   rE  )r1   prepare_inputs_for_generation)r8   r  r<   r@   r  rC   rA   rB   rE  rE   model_inputsr9   s              r(   rM  :Emu3ForConditionalGeneration.prepare_inputs_for_generation  sZ     w<

+)')%%

 

 !!+/L(r'   )r+  r  )NNNNNNNNNNr   )NNNNNTN)'r"   r#   r$   r%   r  _tied_weights_keysr&  r2   r  r  r3   r  r5  r  r  propertyr  r  r  r  r   r   r   rQ   rS   r	  rR   r   rT   r   rP   r   r   rU   r	   rN   rM  r&   rV   rW   s   @r(   r)  r)    s   *+/#(&"1/ryy (( % % " " - -8  1548.21537+/59$(59-134X
E,,-X
 u001X
 ell+	X

 !.X
 u//0X
 "%X
   1 12X
 D>X
 !!1!12X
 ))*X
 c5<</0X
 +,X
 
u,,	-X
  X
z  r'   r)  )r)  r  r  r  rW  r  )Jr<  	functoolsr   typingr   r   rQ   torch.nnr3   torch.nn.functional
functionalr   cache_utilsr   
generationr   modeling_outputsr	   modeling_utilsr
   processing_utilsr   utilsr   r   r   utils.deprecationr   chameleon.modeling_chameleonr   r   llama.modeling_llamar   r   r   r   r   siglip.modeling_siglipr   configuration_emu3r   r   r   
get_loggerr"   loggerr   r+   r  rY   r   r   r   r   r   r   r   r   r   r   r   r   r  r)  r3  rK  rW  r  r  r  r  r  r)  __all__r    r'   r(   <module>re     s  "  % "       ) 6 - & > > 0 w v 4 K K 
		H	%	N 	
$( $N$ryy $D	%H 	299 bii :!299 !H		 .")) &.(299 .(b<(299 <(~&o &V V299 D8 8v7ryy 7tCryy CLCryy CL l2 l2l2^3% 3%l'2I '

J 3 

&(;_ 8V# Vrh#6 hVr'   