
    cCi                     P   S SK r S SKJr  S SKJrJrJr  S SKrS SKJ	s  J
r  S SKJ	r	  SSKJr  SSKJr  SSKJrJrJrJr  SS	KJr  SS
KJr  SSKJrJrJr  SSKJrJr  SSK J!r!  SSK"J#r#J$r$J%r%J&r&J'r'  SSK(J)r)  SSK*J+r+  SSK,J-r-J.r.J/r/  \&R`                  " \15      r2\$ " S S\5      5       r3\\$" SS9 " S S\5      5       5       r4\\$" SS9 " S S\5      5       5       r5\\$" SS9 " S S\5      5       5       r6 " S  S!\	Rn                  5      r8S"\Rr                  S#\:S$\Rr                  4S% jr; S^S&\	Rn                  S'\Rr                  S(\Rr                  S)\Rr                  S*\\Rr                     S+\<S,\<S-\!\#   4S. jjr= " S/ S0\	Rn                  5      r> " S1 S2\	Rn                  5      r? " S3 S4\5      r@ " S5 S6\	Rn                  5      rA " S7 S8\	Rn                  5      rB " S9 S:\	Rn                  5      rC " S; S<\5      rD\$ " S= S>\35      5       rE " S? S@\	Rn                  5      rF " SA SB\	Rn                  5      rG " SC SD\	Rn                  5      rH " SE SF\	Rn                  5      rI " SG SH\	Rn                  5      rJ " SI SJ\	Rn                  5      rK " SK SL\	Rn                  5      rL " SM SN\	Rn                  5      rM " SO SP\	Rn                  5      rN\$" SQS9 " SR SS\35      5       rO " ST SU\	Rn                  5      rP " SV SW\	Rn                  5      rQ\$" SXS9 " SY SZ\35      5       rR " S[ S\\3\5      rS/ S]QrTg)_    N)	dataclass)CallableOptionalUnion)nn   )ACT2FN)Cache)%ClassifierFreeGuidanceLogitsProcessorGenerationMixinGenerationModeLogitsProcessorList)GenerateDecoderOnlyOutput)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)check_model_inputs   )	AutoModel   )JanusConfigJanusVisionConfigJanusVQVAEConfigc                   H    \ rS rSr% \\S'   SrSrSS/rSS/r	Sr
SrSrS	rS
rg)JanusPreTrainedModel/   configmodelTLlamaDecoderLayerJanusVisionEncoderLayerpast_key_valuescausal_maskF N)__name__
__module____qualname____firstlineno__r    __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph!_supports_param_buffer_assignment__static_attributes__r,       b/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/janus/modeling_janus.pyr$   r$   /   sB    &*#,.GH#4m"DN!(-%r;   r$   z9
    Base class for Janus VQ-VAE mode model outputs.
    )custom_introc                   j    \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Srg)JanusVQVAEOutput=   z
decoded_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
    Reconstructed pixel values after encoding and decoding the input.
embedding_loss (`torch.FloatTensor`):
    Embedding loss.
Ndecoded_pixel_valuesembedding_lossr,   )r-   r.   r/   r0   __doc__rA   r   torchFloatTensorr1   rB   r:   r,   r;   r<   r?   r?   =   s4     9=(5#4#45<26NHU../6r;   r?   zy
    Base class for Janus model's outputs that may also contain a past key/values (to speed up sequential decoding).
    c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\   \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   S	rg)
JanusBaseModelOutputWithPastO   aa  
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
    Sequence of hidden-states at the output of the last layer of the model.

    If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
    hidden_size)` is output.
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
    `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
    input) to speed up sequential decoding.
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
    Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
    sequence_length, hidden_size)`.

    image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
Nlast_hidden_stater*   hidden_states
attentionsimage_hidden_statesr,   )r-   r.   r/   r0   rC   rI   r   rD   rE   r1   r*   r
   rJ   tuplerK   rL   r:   r,   r;   r<   rG   rG   O   s|    & 6:x 1 129'+OXe_+8<M8E%"3"345<59Ju00129>B%(9(9":;Br;   rG   zQ
    Base class for Janus causal language model (or autoregressive) outputs.
    c                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\   \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S	'   S
rg)JanusCausalLMOutputWithPastp   a1  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
    Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
    sequence_length, hidden_size)`.

    image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
Nlosslogitsr*   rJ   rK   rL   r,   )r-   r.   r/   r0   rC   rQ   r   rD   rE   r1   rR   r*   r
   rJ   rM   rK   rL   r:   r,   r;   r<   rO   rO   p   s    " )-D(5$$
%,*.FHU&&'.'+OXe_+8<M8E%"3"345<59Ju00129>B%(9(9":;Br;   rO   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\S\S\R                  4S jr	SS	\R                  S
\
S\R                  4S jjrSrU =r$ )JanusVisionEmbeddings   r&   c                 ^  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        [        R                  " UR                  U R                  U R                  U R                  SS9U l
        U R
                  U R                  -  S-  U l        U R                  U l        [        R                  " U R                  U R                  5      U l        U R                  S[         R"                  " U R                  5      R%                  S5      SS9  g )Nvalid)in_channelsout_channelskernel_sizestridepaddingr   position_ids)r   F)
persistent)super__init__r&   hidden_size	embed_dim
image_size
patch_sizer   Conv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferrD   arangeexpandselfr&   	__class__s     r<   ra   JanusVisionEmbeddings.__init__   s    ++ ++ ++!yy++?? 
 !OOt>1D!--"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr;   
embeddingsheightwidthreturnc                    UR                   S   nU R                  R                  R                   S   n[        R                  R                  5       (       d%  XE:X  a   X#:X  a  U R                  U R                  5      $ U R                  R                  R                  S5      nUR                   S   nX R                  -  nX0R                  -  n	[        US-  5      n
UR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SSS	9nUR                  SSSS5      R                  SSU5      nU$ )
a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing and no class embeddings.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   r   r^   g      ?r   r   bicubicF)sizemodealign_corners)shaperl   weightrD   jit
is_tracingr]   	unsqueezere   r   reshapepermuter   
functionalinterpolateview)rq   rt   ru   rv   ri   rj   patch_pos_embeddim
new_height	new_widthsqrt_num_positionss              r<   interpolate_pos_encoding.JanusVisionEmbeddings.interpolate_pos_encoding   s:    !&&q)//66<<Q? yy##%%+*F6?**4+<+<==1188BB1Er".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nr;   pixel_valuesr   c                 V   UR                   u    p4nU R                  R                  R                  nU R                  UR	                  US95      nUR                  S5      R                  SS5      nU(       a  U R                  XU5      n	OU R                  U R                  5      n	X-   nU$ )N)dtyper   r   )
r}   rh   r~   r   toflatten	transposer   rl   r]   )
rq   r   r   _ru   rv   target_dtypepatch_embedsrt   
pos_embedss
             r<   forwardJanusVisionEmbeddings.forward   s    *001e++2288++LOO,O,OP!))!,66q!<
#66z5QJ001B1BCJ,
r;   )r&   rc   rd   ri   rj   rh   re   rl   )F)r-   r.   r/   r0   r!   ra   rD   Tensorintr   boolr   r:   __classcell__rr   s   @r<   rT   rT      sj    q0 q($5<< $ $UX $]b]i]i $LELL D ]b]i]i  r;   rT   rJ   n_reprw   c                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r   N)r}   ro   r   )rJ   r   batchnum_key_value_headsslenhead_dims         r<   	repeat_kvr      s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr;   modulequerykeyvalueattention_maskscalingdropoutkwargsc                 @   [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub"  US S 2S S 2S S 2S UR
                  S   24   nX-   n
[        R                  R                  U
S[        R                  S9R                  UR                  5      n
[        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr   r   r^   )r   r   )ptrainingr   )r   num_key_value_groupsrD   matmulr   r}   r   r   softmaxfloat32r   r   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightsr+   attn_outputs                r<   eager_attention_forwardr      s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#1==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r;   c                      ^  \ rS rSrSrS\4U 4S jjr S
S\R                  S\	\R                     S\
\   4S jjrS	rU =r$ )JanusVisionAttentioni  z(Attention Class for Janus Vision Encoderr&   c                 ^  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l
        UR                  nUR                  nSU l        SU l        [        R                   " U R                  U R                  U R                  -  UR"                  S9U l        [        R                   " U R                  U R                  U R                  -  UR"                  S9U l        [        R                   " U R                  U R                  U R                  -  UR"                  S9U l        [        R                   " U R                  U R                  5      U l        US:  a  [        R,                  " U5      O[        R.                  " 5       U l        U(       a   [        R0                  " U R                  5      O[        R.                  " 5       U l        U(       a&  [        R0                  " U R                  5      U l        g [        R.                  " 5       U l        g )	N;embed_dim must be divisible by num_heads (got `embed_dim`:  and `num_heads`: ).      Fr   biasr   )r`   ra   r&   rb   rc   num_attention_heads	num_headsr   
ValueErrorscaleattention_dropoutprojection_dropoutuse_qk_norm	is_causalr   r   Linearattention_biasq_projk_projv_projprojection_layerDropoutIdentity	LayerNormq_normk_norm)rq   r&   proj_dropoutqk_normrr   s       r<   ra   JanusVisionAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
!'!9!900$$ %&!ii0NU[UjUjkii0NU[UjUjkii0NU[UjUjk "		$..$.. I>JQ>N"**\":TVT_T_Ta6=bll4>>22;;=6=bll4>>22;;=r;   rJ   r   r   c                 <   UR                  5       u  pEnU R                  U5      nU R                  U5      nU R                  U5      n	UR	                  SU R
                  U R                  5      nU R                  U5      nUR	                  SU R
                  U R                  5      nU R                  U5      nUR	                  XEU R
                  U R                  5      R                  SS5      nUR	                  XEU R
                  U R                  5      R                  SS5      nU	R                  XEU R
                  U R                  5      R                  SS5      n	[        n
U R                  R                  S:w  a  [        U R                  R                     n
U
" U UUU	U4U R                  (       d  SOU R                   U R"                  U R$                  S.UD6u  pUR	                  XEU R&                  5      nU R)                  U5      nU R+                  U5      nX4$ )Nr^   r   r   eager        )r   r   r   )rz   r   r   r   r   r   r   r   r   r   r   r   r&   _attn_implementationr   r   r   r   r   rc   r   r   )rq   rJ   r   r   
batch_sizeseq_lenr   query_statesr   r   attention_interfacer   r   outputs                 r<   r   JanusVisionAttention.forward!  s    "/!3!3!5
Q{{=1[[/
{{=1#++BN{{<0''DNNDMMJ
[[,
#++JQUQ^Q^_iijkmno''
T^^T]][eefgijk
#((dnndmm\ffghjkl(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$2H2HJJnn
%
 
%
! "))*t~~N&&{3((0##r;   )r   r&   rc   r   r   r   r   r   r   r   r   r   r   r   r   N)r-   r.   r/   r0   rC   r!   ra   rD   r   r   r   r   r   r:   r   r   s   @r<   r   r     sT    2Q0 Q@ 26)$||)$ !.)$ +,	)$ )$r;   r   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )JanusVisionMLPiM  r&   c                    > [         TU ]  5         Xl        [        UR                  UR
                  -  5      U l        [        UR                     U l	        [        R                  " UR                  U R                  5      U l        [        R                  " U R                  UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  5      U l        g r   )r`   ra   r&   r   rb   	mlp_ratiointermediate_sizer	   
hidden_actactivation_fnr   r   fc1fc2r   hidden_dropout_ratedropout1dropout2rp   s     r<   ra   JanusVisionMLP.__init__N  s    !$V%7%7&:J:J%J!K#F$5$5699V//1G1GH99T33V5G5GH

6#=#=>

6#=#=>r;   rJ   rw   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU$ r   )r   r   r   r   r   rq   rJ   s     r<   r   JanusVisionMLP.forwardX  sP    /**=9m4/m4r;   )r   r&   r   r   r   r   r   )r-   r.   r/   r0   r!   ra   rD   r   r   r:   r   r   s   @r<   r   r   M  s0    ?0 ?U\\ ell  r;   r   c            	          ^  \ rS rSrS\4U 4S jjr\S\R                  S\R                  S\	\
   S\R                  4S j5       rS	rU =r$ )
r)   ia  r&   c                 H  > [         TU ]  5         UR                  U l        [        R
                  " U R                  UR                  S9U l        [        U5      U l	        [        R
                  " U R                  UR                  S9U l
        [        U5      U l        Xl        g N)eps)r`   ra   rb   rc   r   r   layer_norm_epslayer_norm1r   	self_attnlayer_norm2r   mlpr&   rp   s     r<   ra    JanusVisionEncoderLayer.__init__b  sr    ++<<F<Q<QR-f5<<F<Q<QR!&)r;   rJ   r   r   rw   c                     UnU R                  U5      nU R                  " SUUS.UD6u  pXA-   nUnU R                  U5      nU R                  U5      nXA-   nU$ )N)rJ   r   r,   r   r   r   r   rq   rJ   r   r   residualr   s         r<   r   JanusVisionEncoderLayer.forwardk  sz     !((7>> 
')
 

 !0 ((7/ 0r;   )r&   rc   r   r   r   r   )r-   r.   r/   r0   r!   ra   r   rD   r   r   r   rE   r   r:   r   r   s   @r<   r)   r)   a  s^    0  ||  +,	
 
		 r;   r)   c                   z   ^  \ rS rSrSrS\4U 4S jjr\ S
S\\	R                     S\\   S\4S jj5       rS	rU =r$ )JanusVisionEncoderi  z
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`JanusVisionEncoderLayer`].

Args:
    config: JanusVisionConfig
r&   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf NF)
r`   ra   r&   r   
ModuleListrangenum_hidden_layersr)   layersgradient_checkpointingrq   r&   r   rr   s      r<   ra   JanusVisionEncoder.__init__  sT    mmeTZTlTlNm$nNm%<V%DNm$no&+# %os   A&r   r   rw   c                 R    UnU R                    H  nU" UU40 UD6nM     [        US9$ )N)rI   )r  r   )rq   inputs_embedsr   r   rJ   encoder_layers         r<   r   JanusVisionEncoder.forward  s>     &![[M) M ) ??r;   )r&   r  r  r   )r-   r.   r/   r0   rC   r!   ra   r   r   rD   r   r   r   r   r   r:   r   r   s   @r<   r  r    s`    ,0 ,  26@ !.@ +,	@
 
@ @r;   r  c                      ^  \ rS rSrSrU 4S jrS\R                  S\S\4S jr	 SS\R                  S	\
\R                     S
\\R                  \
\R                     \
\\R                        4   4S jjrSrU =r$ )JanusAttentioni  z=Multi-headed attention from 'Attention Is All You Need' paperc                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        SU l
        UR                  U l        [        R                  " U R                  SU R                  -  SS9U l        UR                  (       ai  [        R                   " ["        R$                  " U R                  5      5      n[        R                   " ["        R$                  " U R                  5      5      nOS nS nUbQ  ["        R&                  " U["        R(                  " USS9U45      n[        R                   " U5      U R                  l        [        R                  " U R                  U R                  5      U l        g )	Nr   r   r   r   Fr   r   )requires_grad)r`   ra   r&   rb   rc   r   r   r   r   r   r   r   r   r   qkvqkv_bias	ParameterrD   zeroscat
zeros_liker   
projection)rq   r&   q_biasv_biasr  rr   s        r<   ra   JanusAttention.__init__  ss   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
!'!9!9 99T^^Q-?eL??\\%++dnn"=>F\\%++dnn"=>FFFyy&%*:*:6QV*WY_!`aHLL2DHHM))DNNDNNCr;   tensorr   bszc                     UR                  X2U R                  U R                  5      R                  SS5      R	                  5       $ )Nr   r   )r   r   r   r   r   )rq   r"  r   r#  s       r<   _shapeJanusAttention._shape  s5    {{3GQQRSUVWbbddr;   rJ   	head_maskrw   c                 4   UR                  5       u  pEnU R                  U5      nUR                  XESU R                  X`R                  -  5      R	                  SSSSS5      nUS   US   US   pn[
        nU R                  R                  S:w  a  [        U R                  R                     nU" U UU	U
4SU R                  (       d  SOU R                  U R                  S	.UD6u  pUR                  XES
5      R                  5       nU R                  U5      nX4$ )z#Input shape: Batch x Time x Channelr   r   r   r      r   Nr   )r   r   r   r^   )rz   r  r   r   r   r   r&   r   r   r   r   r   r   r  )rq   rJ   r'  r   r#  tgt_lenrc   	mixed_qkvr   r   r   r   r   r   s                 r<   r   JanusAttention.forward  s    #0"4"4"6iHH]+	%%cAt~~yTbTbGbckkq!Q
	 2;1y|YWX\,(?;;++w6"9$++:Z:Z"[$7		%

  #}}C$2H2HJJ	%
 	%
! "))#;FFHook2((r;   )	r   r&   rc   r   r   r   r  r  r   r   )r-   r.   r/   r0   rC   ra   rD   r   r   r%  r   rM   r   r:   r   r   s   @r<   r  r    s    GD>eU\\ eC ec e -1$)||$) ELL)$)
 
u||Xell3XeELL>Q5RR	S$) $)r;   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )JanusMLPi  c                   > [         TU ]  5         Xl        [        UR                     U l        [        R                  " UR                  UR                  5      U l
        [        R                  " UR                  UR                  5      U l        g r   )r`   ra   r&   r	   r   r   r   r   rb   r   r   r   rp   s     r<   ra   JanusMLP.__init__  sb    #F$5$5699V//1I1IJ99V55v7I7IJr;   rJ   rw   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r   r   s     r<   r   JanusMLP.forward  s4    /**=9/r;   )r   r&   r   r   )
r-   r.   r/   r0   ra   rD   r   r   r:   r   r   s   @r<   r.  r.    s)    KU\\ ell  r;   r.  c            	          ^  \ rS rSrS\4U 4S jjr\S\R                  S\R                  S\	\
   S\R                  4S j5       rS	rU =r$ )
JanusEncoderLayeri  r&   c                 <  > [         TU ]  5         UR                  U l        [	        U5      U l        [        R                  " U R                  UR                  S9U l	        [        U5      U l        [        R                  " U R                  UR                  S9U l        g r   )r`   ra   rb   rc   r  r   r   r   r   r   r.  r   r   rp   s     r<   ra   JanusEncoderLayer.__init__  sm    ++'/<<F<Q<QRF#<<F<Q<QRr;   rJ   r   r   rw   c                     UnU R                  U5      nU R                  " SUUS.UD6u  pX-   nUnU R                  U5      nU R                  U5      nX-   nU$ )N)rJ   r'  r,   r  r  s         r<   r   JanusEncoderLayer.forward
  sz     !((7>> 
'$
 

 &0 ((7/%0r;   )rc   r   r   r   r   )r-   r.   r/   r0   r    ra   r   rD   r   r   r   rE   r   r:   r   r   s   @r<   r4  r4    s_    S{ S ||  +,	
 
		 r;   r4  c                      ^  \ rS rSr% Sr\\S'   \\S.r	S\4U 4S jjr
\" SS9\  SS\\R                     S\S	\\   S
\\\4   4S jj5       5       rS rSrU =r$ )JanusVisionModeli#  r   r&   )rJ   rK   c                    > [         TU ]  U5        Xl        UR                  n[	        U5      U l        [        U5      U l        [        R                  " X!R                  S9U l        U R                  5         g r   )r`   ra   r&   rb   rT   rt   r  encoderr   r   r   post_layernorm	post_init)rq   r&   rc   rr   s      r<   ra   JanusVisionModel.__init__,  sY     &&	/7)&1 ll9:O:OPr;   F)tie_last_hidden_statesr   r   rw   c                     Uc  [        S5      eU R                  XS9nU R                  " SSU0UD6nUR                  nU R	                  U5      nUS S 2SS S 24   nU R	                  U5      n[        UUS9$ )Nz You have to specify pixel_values)r   r  r   )rI   pooler_outputr,   )r   rt   r<  rI   r=  r   )rq   r   r   r   rJ   encoder_outputsrI   pooled_outputs           r<   r   JanusVisionModel.forward7  s     ?@@h+/<< ,
',
,

 ,== //0AB)!Q'2++M:)/'
 	
r;   c                     U R                   $ r   )rt   rq   s    r<   get_input_embeddings%JanusVisionModel.get_input_embeddingsT  s    r;   )r&   rt   r<  r=  r  )r-   r.   r/   r0   main_input_namer!   r1   r4  r  _can_record_outputsra   r   r   r   rD   rE   r   r   r   r   rM   r   r   rH  r:   r   r   s   @r<   r:  r:  #  s    $O*$
	0 	 u5 59).
u001
 #'
 +,	

 
u00	1
  6
6 r;   r:  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )JanusVisionAlignerMLPiX  r&   c           	        > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " [        SUR                  5       Vs/ s H.  n[        R                  " UR
                  UR
                  5      PM0     sn5      U l
        [        UR                     U l        g s  snf Nr   )r`   ra   r   r   rb   projection_dimr   r	  r
  depthhidden_layersr	   r   r   r  s      r<   ra   JanusVisionAlignerMLP.__init__Y  s    99V//1F1FG]]NSTUW]WcWcNdeNdRYYv,,f.C.CDNde
 $F$5$56 f   (5Cc                     U R                  U5      nU R                   H  nU R                  U5      nU" U5      nM     U$ r   r   rR  r   rq   rJ   layers      r<   r   JanusVisionAlignerMLP.forwardb  B    /''E ..}=M!-0M ( r;   r   r   rR  )	r-   r.   r/   r0   r!   ra   r   r:   r   r   s   @r<   rM  rM  X  s    70 7 r;   rM  c                      ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	S\R                  S\R                  4S	 jrS
rU =r$ )JanusVQVAEVectorQuantizerij  a  
A module for vector quantization using learned embedding vectors.

This module implements the quantization process similar to te one described in
the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
input vectors into discrete codebook vectors, which are learned during training.
Current implementation improves over previous ones by avoiding costly matrix multiplications
and allowing for post-hoc remapping of indices.
r&   c                   > [         TU ]  5         UR                  U l        UR                  U l        [        USS5      U l        [        R                  " U R                  U R                  5      U l	        UR                  /S-  U l        g )Nbetag      ?r   )r`   ra   num_embeddingsrc   embedding_dimgetattrr_  r   rk   	embeddingri   quant_state_dimsrp   s     r<   ra   "JanusVQVAEVectorQuantizer.__init__u  sn    $33#--FFD1	d&9&94;M;MN!'!3!3 4q 8r;   hidden_statec           
      >   UR                  SSSS5      R                  5       nUR                  SU R                  5      n[        R
                  " US-  SSS9[        R
                  " U R                  R                  S-  SS9-   S[        R                  " S	X R                  R                  R                  SS5      5      -  -
  n[        R                  " USS9nU R                  U5      R                  UR                  5      n[        R                  " UR                  5       U-
  S-  5      U R                  [        R                  " XQR                  5       -
  S-  5      -  -   nXU-
  R                  5       -   nUR                  SSSS5      R                  5       nXVU4$ )
Nr   r   r   r   r^   T)r   keepdimr   z	bd,dn->bn)r   r   r   ra  rD   sumrc  r~   einsumr   argminr}   meandetachr_  )rq   rf  hidden_state_flattened	distancesmin_encoding_indiceshidden_state_quantrQ   s          r<   r   !JanusVQVAEVectorQuantizer.forward~  s   #++Aq!Q7BBD!-!2!22t7I7I!J II,a/QEii--q0a89%,,{,BNNDYDYDcDcdeghDijjk 	  %||I1=!^^,@AFF|GYGYZ zz-446E!KLtyy[`[e[e"5"5"77A=\
 P
 

 *,-N,V,V,XX 0771aCNNP!)===r;   image_tokensrw   c                 >   UR                   S   nU R                  R                  R                   S   nU R                  U5      n[        R                  " USSS9nUR                  U/U R                  QUP75      nUR                  SSSS5      R                  5       nU$ )Nr   r^   r   )r   r   r   r   )	r}   rc  r~   F	normalizer   rd  r   r   )rq   rt  r   emb_dimrr  s        r<   get_codebook_entry,JanusVQVAEVectorQuantizer.get_codebook_entry  s    !''*
~~,,2226 "^^L9[[);qbI 044j5b4CXCX5bZa5bc/771aCNNP!!r;   )r_  rc  ra  r`  rd  )r-   r.   r/   r0   rC   r"   ra   rD   r   r   
LongTensorrE   ry  r:   r   r   s   @r<   r]  r]  j  sI    9/ 9>ELL >6"u/?/? "EDUDU " "r;   r]  c                   6   ^  \ rS rSr  SU 4S jjrS rSrU =r$ )JanusVQVAEResnetBlocki  c                   > [         TU ]  5         X l        Uc  UOUU l        X@l        [
        R                  R                  SUSSS9U l        [
        R                  R                  X#SSSS9U l
        [
        R                  R                  SUSSS9U l        [
        R                  R                  UR                  5      U l        [
        R                  R                  X3SSSS9U l        U R                  U R                  :w  a]  U R                  (       a&  [
        R                  R                  X#SSSS9U l        g [
        R                  R                  X#SSSS9U l        g g )	N    ư>T
num_groupsrg   r   affiner   r   rZ   r[   r\   r   )r`   ra   rX   rY   use_conv_shortcutrD   r   	GroupNormnorm1rf   conv1norm2r   r   conv2conv_shortcutnin_shortcut)rq   r&   rX   rY   r  rr   s        r<   ra   JanusVQVAEResnetBlock.__init__  s%    	&+7+?K\!.XX''2KUYbf'g
XX__[AVWab_c
XX''2LVZcg'h
xx''7XX__\QWXbc_d
t000%%%*XX__[\]fgqr_%s"$)HHOOK[\efpqO$r!	 1r;   c                    UnU R                  U5      nU[        R                  " U5      -  nU R                  U5      nU R	                  U5      nU[        R                  " U5      -  nU R                  U5      nU R                  U5      nU R                  U R                  :w  a7  U R                  (       a  U R                  U5      nX!-   $ U R                  U5      nX!-   $ r   )r  rD   sigmoidr  r  r   r  rX   rY   r  r  r  )rq   rJ   r  s      r<   r   JanusVQVAEResnetBlock.forward  s     

=1}55

=1

=1}55]3

=1t000%%--h7 ''  ,,X6''r;   )
r  r  r  r   rX   r  r  r  rY   r  r  r-   r.   r/   r0   ra   r   r:   r   r   s   @r<   r}  r}    s    
 s.( (r;   r}  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )JanusVQVAEAttnBlocki  c                   > [         TU ]  5         Xl        [        R                  R                  SUSSS9U l        [        R                  R                  XSSSS9U l        [        R                  R                  XSSSS9U l	        [        R                  R                  XSSSS9U l
        [        R                  R                  XSSSS9U l        g )Nr  r  Tr  r   r   r  )r`   ra   rX   rD   r   r  normrf   qkvproj_outrq   rX   rr   s     r<   ra   JanusVQVAEAttnBlock.__init__  s    &HH&&";TXae&f	qQR\]^qQR\]^qQR\]^aXYcder;   c                 Z   UnU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nUR                  u  pgpUR                  XgX-  5      R                  SSS5      nUR                  XgX-  5      n[        R                  " X45      n
U
[        U5      S-  -  n
[        R                  " U
SS9n
UR                  XgX-  5      nU
R                  SSS5      n
[        R                  " XZ5      R                  XgX5      nU R                  U5      nX+-   $ )Nr   r   r   r   ri  )r  r  r  r  r}   r   r   rD   bmmr   rv  r   r  )rq   rJ   r  r   r   r   r   channelsru   rv   r   r   s               r<   r   JanusVQVAEAttnBlock.forward  s    		-0vvm,VVM*
vvm, /;.@.@+
f#++J&.QYYZ[]^`ab''
fnM
yy:#s8}'>?yy15 $++J&.Q#++Aq!4ii;CCJZ`hmmK0%%r;   )rX   r  r  r  r  r  r  r   s   @r<   r  r    s    f& &r;   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )JanusVQVAEConvDownsamplei  c                 Z   > [         TU ]  5         [        R                  " XSSSS9U l        g )Nr   r   r   r  )r`   ra   r   rf   convr  s     r<   ra   !JanusVQVAEConvDownsample.__init__  s%    IIkAaYZ[	r;   c                 V    [         R                  " USSSS9nU R                  U5      nU$ )N)r   r   r   r   constantr   )padr{   r   )rv  r  r  r   s     r<   r    JanusVQVAEConvDownsample.forward  s+    mJVWX		-0r;   r  r  r   s   @r<   r  r    s    \ r;   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )JanusVQVAEConvUpsamplei  c                 l   > [         TU ]  5         [        R                  R	                  XSSSS9U l        g )Nr   r   r  )r`   ra   rD   r   rf   r  r  s     r<   ra   JanusVQVAEConvUpsample.__init__  s,    HHOOK!TU_`Oa	r;   c                 T    [         R                  " USSS9nU R                  U5      nU$ )Ng       @nearest)scale_factorr{   )rv  r   r  r   s     r<   r   JanusVQVAEConvUpsample.forward	  s(    m#IV		-0r;   r  r  r   s   @r<   r  r    s    b r;   r  c                   n   ^  \ rS rSrS\S\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	JanusVQVAEMidBlocki  r&   r  c                    > [         TU ]  5         [        UUUS9U l        [	        U5      U l        [        UUUS9U l        g )Nr&   rX   rY   )r`   ra   r}  block_1r  attn_1block_2)rq   r&   r  rr   s      r<   ra   JanusVQVAEMidBlock.__init__  sF    , !

 *(3, !
r;   rJ   rw   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r  r  r  r   s     r<   r   JanusVQVAEMidBlock.forward  s2    ]3M2]3r;   )r  r  r  )r-   r.   r/   r0   r"   r   ra   rD   r   r   r:   r   r   s   @r<   r  r    s7    
/ 
3 
U\\ ell  r;   r  c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )JanusVQVAEEncoderi%  c           
        > [         TU ]  5         [        UR                  5      U l        UR
                  U l        UR                  nUR                  nUR                  nUR                  nUR                  n[        R                  R                  X2SSSS9U l        S[        U5      -   nXpl        [        R                   " 5       U l        [%        U R                  5       GH   n[        R                   " 5       n	[        R                   " 5       n
X'U   -  nX&U   -  n[%        U R
                  5       HM  nU	R'                  [)        UUUS95        UnXR                  S-
  :X  d  M3  U
R'                  [+        U5      5        MO     [        R,                  " 5       nXl        Xl        XR                  S-
  :w  a  [3        U5      Ul        U R"                  R'                  U5        GM     [7        UW5      U l        [        R                  R;                  SUSSS	9U l        [        R                  R                  UU(       a  S
U-  OUSSSS9U l        g )Nr   r   r  )r   r  r  r  Tr  r   ) r`   ra   lenchannel_multipliernum_resolutionsnum_res_blocksbase_channelsrX   double_latentlatent_channelsrD   r   rf   conv_inrM   in_channel_multiplierr	  downr
  appendr}  r  Moduleblockattnr  
downsampler  midr  norm_outconv_out)rq   r&   r  rX   r  r  r  r  i_levelr  r  block_in	block_outi_blockr  rr   s                  r<   ra   JanusVQVAEEncoder.__init__&  s   "6#<#<=$33,,((,, 00#66xx{qYZdef $u-?'@ @%:"MMO	T112GMMOE==?D$W'EEH%7(CCI !4!45)%$,%. %22Q66KK 3H => 6 99;DJI..22":8"DIIT"- 30 &fh7**bxUYbf*g#0Ao ( 
r;   r   c                    U R                  U5      /n[        U R                  5       H  n[        U R                  5       H  nU R                  U   R
                  U   " US   5      n[        U R                  U   R                  5      S:  a"  U R                  U   R                  U   " U5      nUR                  U5        M     X0R                  S-
  :w  d  M  UR                  U R                  U   R                  US   5      5        M     US   nU R                  U5      nU R                  U5      nU[        R                  " U5      -  nU R                  U5      nU$ )Nr^   r   r   )r  r
  r  r  r  r  r  r  r  r  r  r  rD   r  r  )rq   r   rJ   r  r  rf  rI   s          r<   r   JanusVQVAEEncoder.forwardY  sB   l34T112G !4!45#yy177@!"%  tyy)../!3#'99W#5#:#:7#CL#QL$$\2 6 ..22$$TYYw%7%B%B=QSCT%UV 3 *"- HH%67 !MM*;<U]]+<== MM*;<  r;   )r  r  r  r  r  r  r  r  )
r-   r.   r/   r0   ra   rD   r{  r   r:   r   r   s   @r<   r  r  %  s     1
f!E$4$4 ! !r;   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )JanusVQVAEDecoderir  c           
      d  > [         TU ]  5         [        UR                  5      U l        UR
                  U l        UR                  nUR                  nUR                  nX!R                  U R                  S-
     -  n[        R                  R                  X5SSSS9U l        [        X5      U l        [        R                  " 5       U l        [#        [%        U R                  5      5       H  n[        R                  " 5       n[        R                  " 5       nX!R                  U   -  n	[%        U R
                  S-   5       HM  n
UR'                  [)        UUU	S95        U	nX`R                  S-
  :X  d  M3  UR'                  [+        U5      5        MO     [        R,                  " 5       nX{l        Xl        US:w  a  [3        U5      Ul        U R                   R'                  U5        M     [        R                  R7                  SUSSS	9U l        [        R                  R                  XTSSSS9U l        g )
Nr   r   r  r  r   r  r  Tr  )r`   ra   r  r  r  r  r  r  rY   rD   r   rf   r  r  r  r	  upreversedr
  r  r}  r  r  r  r  r  upsampler  r  r  )rq   r&   r  r  rY   r  r  r  r  r  r  r  rr   s               r<   ra   JanusVQVAEDecoder.__init__s  s   "6#<#<=$33,, 00** !#<#<T=Q=QTU=U#VV xxaXYcde &f7 --/d&:&: ;<GMMOE==?D%(A(A'(JJI !4!4q!89)%$,%. %22Q66KK 3H => : BHG!|4X>GGNN2) =. **bxUYbf*gAVWabcr;   rf  rw   c                 r   U R                  U5      nU R                  U5      n[        U R                  5       H  n[        U R                  S-   5       Ho  nU R
                  U   R                  U   " U5      n[        U R
                  U   R                  5      S:  d  MM  U R
                  U   R                  U   " U5      nMq     X R                  S-
  :w  d  M  U R
                  U   R                  U5      nM     U R                  U5      nU[        R                  " U5      -  nU R                  U5      nU$ )Nr   r   )r  r  r
  r  r  r  r  r  r  r  r  rD   r  r  )rq   rf  r  r  s       r<   r   JanusVQVAEDecoder.forward  s   ||L1 xx- T112G !4!4q!89#www/55g>|Ltwww',,-1#'777#3#8#8#A,#OL : ..22#www/88F 3 }}\2l33}}\2r;   )r  r  r  r  r  r  r  )
r-   r.   r/   r0   ra   rD   rE   r   r:   r   r   s   @r<   r  r  r  s.    ,d\E$5$5 %:K:K  r;   r  aS  
    The VQ-VAE model used in Janus for encoding/decoding images into discrete tokens.
    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv
    Taigman](https://huggingface.co/papers/2203.13131).
    c                     ^  \ rS rSr% \\S'   / SQrSrS\4U 4S jjrS\	R                  4S jrS\	R                  S\	R                  4S	 jr\\S\	R                  S\\	R                  \	R                  4   4S
 j5       5       rSrU =r$ )
JanusVQVAEi  r&   )r  r}  r]  r   c                   > [         TU ]  U5        [        U5      U l        [	        U5      U l        [        R                  R                  UR                  UR                  S5      U l        [        R                  R                  UR                  UR                  S5      U l        U R                  5         [        U5      U l        SU l        U R#                  5         g )Nr   F)r`   ra   r  r<  r]  quantizerD   r   rf   r  rc   
quant_convpost_quant_convevalr  decoderr  r>  rp   s     r<   ra   JanusVQVAE.__init__  s     (01&9((//&*@*@&BRBRTUV$xxv/?/?AWAWYZ[		(0&+# 	r;   c                 v    U R                  U5      nU R                  U5      nU R                  U5      u  p4nX4U4$ r   )r<  r  r  )rq   r   rJ   quantemb_lossindicess         r<   encodeJanusVQVAE.encode  s<    \26#'==#? ''r;   rt  rw   c                    UR                   S   U R                  R                  S   U R                  R                  S   -  :w  aM  [        SU R                  R                  S   U R                  R                  S   -   SUR                    S35      eU R                  R	                  U5      nU R                  U5      nU R                  U5      nU$ )a  
Decodes quantized token IDs into pixel values.
Args:
    image_tokens (torch.LongTensor): Batch of token IDs.
Returns:
    pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        Pixel values decoded from the token IDs.
r   r   z4Expected `image_tokens` to have shape `(batch_size, z)`, but got shape `z`.)r}   r  rd  r   ry  r  r  )rq   rt  codebook_entryrJ   r   s        r<   decodeJanusVQVAE.decode  s     a DMM$B$B1$EHfHfghHi$iiFt}}GeGefgGhkokxkx  lJ  lJ  KL  lM  HM  GN N""."4"4!5R9  99,G,,^<||M2r;   c                     UR                   S   nU R                  U5      u  p4nU R                  UR                  US5      5      n[	        Xd5      $ )Nr   r^   )r}   r  r  r   r?   )rq   r   r   r  rB   r  rA   s          r<   r   JanusVQVAE.forward  sM     "''*
)-\)B&w#{{7<<
B+GH 4EEr;   )r  r<  r  r  r  r  )r-   r.   r/   r0   r"   r1   r4   rJ  ra   rD   r{  r  rE   r  r   r   rM   r   r:   r   r   s   @r<   r  r    s     
 %O/ (5#3#3 (5#3#3 8I8I & F''F 
u  %"3"33	4F  Fr;   r  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )JanusVQVAEAlignerMLPi  r&   c           	        > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " [        SUR                  5       Vs/ s H.  n[        R                  " UR
                  UR
                  5      PM0     sn5      U l
        [        UR                     U l        g s  snf rO  )r`   ra   r   r   rc   rP  r   r	  r
  r  rR  r	   r   r   r  s      r<   ra   JanusVQVAEAlignerMLP.__init__  s    99V--v/D/DE]]NSTUW]WoWoNpqNpRYYv,,f.C.CDNpq
 $F$5$56 rrT  c                     U R                  U5      nU R                   H  nU R                  U5      nU" U5      nM     U$ r   rV  rW  s      r<   r   JanusVQVAEAlignerMLP.forward  rZ  r;   r[  )	r-   r.   r/   r0   r"   ra   r   r:   r   r   s   @r<   r  r    s    7/ 7 r;   r  c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr
SrU =r$ )	JanusVQVAEHeadi  zOHead used for sampling tokens in image generation, replacing the usual lm head.r&   c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                     U l	        [        R                  " UR
                  UR                  5      U l        g r   )r`   ra   r   r   image_token_embed_dimrP  r  r	   r   r   r`  vision_headrp   s     r<   ra   JanusVQVAEHead.__init__  s^    		&">">@U@UV#F$5$5699V%:%:F<Q<QRr;   rJ   rw   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r  r   r   r   s     r<   r   JanusVQVAEHead.forward  s6    m4**=9((7r;   )r   r  r   )r-   r.   r/   r0   rC   r"   ra   rD   r   r"  r   r:   r   r   s   @r<   r  r    s5    YS/ SU\\ ell  r;   r  zl
    The Janus model which consists of a siglip vision backbone, a Llama language model and a VQ model.
    c                     ^  \ rS rSrS\4U 4S jjrS rS rS rS\	R                  S\	R                  S	\	R                  4S
 jr\\         SS\\	R                     S\\	R                     S\\	R                      S\\	R                     S\\   S\\	R                     S\\	R                     S\\   S\\\	R                   4   4S jj5       5       rSrU =r$ )
JanusModeli  r&   c                   > [         TU ]  U5        Xl        [        R	                  UR
                  5      U l        [        U R                  R                  5      U l        [        R	                  UR                  5      U l        [        R                  " U R                  R                  R                  U R                  R                  R                  5      U l        [#        U R                  R                  5      U l        ['        U R                  R                  5      U l        [*        R,                  " UR.                  S9U l        SU l        U R5                  5         g )N)r&   F)r`   ra   r&   r:  _from_configvision_configvision_modelrM  alignerr  	vq_configvqmodelr   rk   r`  rc   generation_embeddingsr  generation_alignerr  generation_headr   from_configtext_configlanguage_modelr  r>  rp   s     r<   ra   JanusModel.__init__#  s     ,99&:N:NO,T->->-E-EF!..v/?/?@ &(\\$,,2E2E2T2TVZVbVbViViVsVs%t""6t||7J7J"K-dll.A.AB'336;M;MN&+#r;   c                 6    U R                   R                  5       $ r   )r  rH  rG  s    r<   rH  JanusModel.get_input_embeddings8  s    ""7799r;   c                 :    U R                   R                  U5        g r   )r  set_input_embeddingsrq   r   s     r<   r  JanusModel.set_input_embeddings;  s    007r;   c                 ^    U R                  U5      nU R                  UR                  5      nU$ r   )r	  r
  rI   )rq   r   image_embedss      r<   get_image_featuresJanusModel.get_image_features>  s,    ((6||L$B$BCr;   	input_idsr  image_featuresc           	      J   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  5       nUR                  S5      R                  U5      R                  UR                  5      nX$   R                  5       UR                  5       :w  a0  UR                  S   UR                  S   -  n[        SU SU 35      eU$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
r   devicer^   r   r   z6Image features and image tokens do not match: tokens: z, features )rH  rD   r"  r&   image_token_idlongr"  allrj  r   	expand_asr   numelr}   r   )rq   r  r  r  special_image_maskn_image_tokensn_image_featuress          r<   get_placeholder_maskJanusModel.get_placeholder_maskC  s    !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*kk.H.H!H+//1/99"=GGVYYZgZnZno,2248L8L8NN-33A69M9Ma9PPHHXXcdtcuv  "!r;   r   r   r]   r*   cache_position	use_cachelogits_to_keepc
                    US L US L-  (       a  [        S5      eUc  U R                  5       " U5      nUbw  U R                  U5      nUR                  SUR                  S   5      nUR                  UR                  UR                  5      nU R                  XUS9nUR                  X5      nU R                  " SUUUUUUU	S.U
D6n[        UR                  UR                  UR                  UR                  Ub  WS9$ S S9$ )NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either oner^   )r  r  )r  r   r]   r*   r.  r-  r/  )rI   r*   rJ   rK   rL   r,   )r   rH  r  r   r}   r   r"  r   r+  masked_scatterr  rG   rI   r*   rJ   rK   )rq   r  r   r   r]   r*   r-  r  r.  r/  r   r  r  image_attention_mask	lm_outputs                  r<   r   JanusModel.forward[  s@    -t";<s    557	BM#22<@L)11"m6I6I"6MNN+..}/C/C]EXEXYN#'#<#<~ $= $  *889M^M'' 	
')%+))	
 	
	 ,'99%55#11 ++0<0H
 	

 OS
 	
r;   )	r
  r&   r  r  r  r  r  r	  r  )	NNNNNNNNr   )r-   r.   r/   r0   r    ra   rH  r  r  rD   r{  rE   r+  r   r   r   r   r
   r   r   r   r   r:   r   r   s   @r<   r  r    s4   { *:8
"))":?:K:K"]b]n]n"0  15481537+/5959$(34.
E,,-.
 u001.
 !.	.

 u//0.
 "%.
 !!1!12.
   1 12.
 D>.
 c5<</0.
  .
r;   r  c                     ^  \ rS rSrSS/rSrS\4U 4S jjrS rS r	S	\
R                  S
\
R                  4S jr\\          SS\\
R                      S\\
R"                     S\\
R                     S\\
R                      S\\   S\\
R                      S\\
R"                     S\\
R                      S\\   S\\\
R                  4   S\\   4S jj5       5       r      SU 4S jjrS\
R                  4S jr\
R6                     S S	\\
R                     S\\
R                      S\\   4U 4S jjj5       rSrU =r$ )!JanusForConditionalGenerationi  z(model.language_model.embed_tokens.weightzlm_head.weightTr&   c                    > [         TU ]  U5        Xl        [        U5      U l        [
        R                  " UR                  R                  UR                  R                  SS9U l
        U R                  5         g )NFr   )r`   ra   r&   r  r'   r   r   r  rb   
vocab_sizelm_headr>  rp   s     r<   ra   &JanusForConditionalGeneration.__init__  sZ     '
yy!3!3!?!?ASASA^A^ejk 	r;   c                 J    U R                   R                  R                  5       $ r   )r'   r  rH  rG  s    r<   rH  2JanusForConditionalGeneration.get_input_embeddings  s    zz((==??r;   c                 N    U R                   R                  R                  U5        g r   )r'   r  r  r  s     r<   r  2JanusForConditionalGeneration.set_input_embeddings  s    

!!66u=r;   inputsrw   c                 r    U R                   R                  U5      nU R                   R                  U5      nU$ r   )r'   r  r  )rq   r?  rf  s      r<   'prepare_embeddings_for_image_generationEJanusForConditionalGeneration.prepare_embeddings_for_image_generation  s0    zz77?zz44\Br;   r  r   r   r]   r*   r-  r  labelsr.  r/  r   c                    U R                   " SUUUUUUU	US.UD6nUR                  n[        U
[        5      (       a  [	        U
* S5      OU
nU R                  USS2USS24   5      nSnUb3  U R                  " SXU R                  R                  R                  S.UD6n[        UUUR                  UR                  UR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
)r  r   r   r]   r*   r  r.  r-  N)rR   rC  r8  )rQ   rR   r*   rJ   rK   rL   r,   )r'   rI   
isinstancer   slicer9  loss_functionr&   r  r8  rO   r*   rJ   rK   rL   )rq   r  r   r   r]   r*   r-  r  rC  r.  r/  r   outputsrJ   slice_indicesrR   rQ   s                    r<   r   %JanusForConditionalGeneration.forward  s    , ** 

%)%+')

 

  118B>SV8W8W~ot4]kmA}a,?@A%% 9P9P9[9[_eD +#33!//)) ' ; ;
 	
r;   c           	      P   > [         T
U ]  " U4UUUUUS.UD6n	US   S:X  a  X)S'   U	$ )N)r*   r  r   r-  r/  r   r   )r`   prepare_inputs_for_generation)rq   r  r   r*   r   r  r-  r/  r   model_inputsrr   s             r<   rL  ;JanusForConditionalGeneration.prepare_inputs_for_generation  sR     w<
+')))
 
 !!+7(r;   rt  c                 x    U R                   R                  R                  U5      nUR                  SSSS5      nU$ )z
Decodes generated image tokens from language model to continuous pixel values
with VQGAN module via upsampling.
Args:
    image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
        The tensors corresponding to the input images.
r   r   r   r   )r'   r  r  r   )rq   rt  decoded_images      r<   decode_image_tokens1JanusForConditionalGeneration.decode_image_tokens  s:     

**11,?%--aAq9r;   logits_processorc           	      L  > UR                  SU R                  5      n[        R                  " U5      nUR                  SS5      nUS:X  a  [        T%U ]  " SUUUS S.UD6$ UR                  " S0 UD6nUR                  5       [        R                  [        R                  4;  a  [        S5      eUR                  5         U R                  UR                  5       5        Ub  UO	[        5       nSUS'   UR                  c  [         R#                  S5        S	Ul        UR                  US
'   U R%                  XR&                  U5      u  pnUR(                  UR*                  p[-        UR.                  5      S:w  a  [        SUR.                   S35      eUS LnU R1                  X\UR*                  S9  UR                  (       a;  UR                  S:  a+  UR3                  [5        UR                  5      5        S Ul        U R7                  UUR.                  S   US UUS9nU R8                  " SUUUR:                  S.UD6u  pU R<                  R>                  R@                  RB                  nUR.                  u  pURE                  SS5      nUR                  SS 5      nURE                  SS5      nX'S'   UUS 2S S 24   UR&                  :g  UUS 2S S 24   URF                  S   :g  -  nUUS 2S S 24   RI                  UURJ                  5        U RM                  5       " U5      nU RO                  XU5      nURQ                  SS 5      cA  U RS                  URT                  =(       d    SUS-  [W        URX                  X-   5      US9US'   [Z        R\                  " X4XS9nUR^                  nUR`                  nURb                  nURd                  nURf                  nU(       a	  U(       a  SOS nU(       a	  U(       a  SOS nU(       a	  U(       a  SOS nU(       a	  U(       a  SOS n[i        U5       GHy  nU Rj                  " SUUS.UD6nUS   Rm                  UR*                  5      US'   US   Rm                  UR*                  5      US'   U R<                  Rn                  " S0 UDUUS.D6nU Rq                  UU5      nURr                  S S 2SS S 24   Ru                  5       n U R<                  Rw                  U 5      n!U" UU!5      n"URx                  (       a:  [Z        Rz                  " U"SS9n#[Z        R|                  " U#SS9R                  S5      n$O[Z        R                  " U"SS9n$U$US S 2U4'   [Z        R                  " U$U$/5      n$U$R                  S5      n$U R                  U$5      nGM|     U(       aT  U(       a  UW!4-  nU(       a  UW R                  5       4-  nU(       a  UWR                  -  nU(       a  UWR                  -  nU(       a  [        UW!UUUWR                  S9$ U$ ) Ngeneration_configgeneration_modetext)r?  r   rU  guidance_scalezGot incompatible mode for Image Generation, should be one of greedy or sampling. Ensure that beam search is de-activated by setting `num_beams=1`.Tr.  zU`guidance_scale` is required for CFG but not provided. Setting to default value of 5.   rX  r   z;Expected input ids of shape (batch_size, seq_len), but got z3Passing `inputs embeds` is not supported currently.)r"  r   )rU  input_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnrS  r"  )r  r   expand_sizer   boi_token_idr*   static)cache_implementationr   max_cache_lenmodel_kwargsr!  r,   )r  r  r-  )output_attentionsoutput_hidden_statesr^   ri  )num_samples)	sequencesscoresrR   rK   rJ   r*   )IpoprU  copydeepcopyr`   generateupdateget_generation_moder   SAMPLEGREEDY_SEARCHr   validate_validate_model_kwargsr   rX  loggerwarning_prepare_model_inputsbos_token_idr   r"  r  r}   _prepare_special_tokensr  r   _get_logits_processor_expand_inputs_for_generationnum_return_sequencesr'   r	  r&   num_image_tokensrepeatgeneration_kwargsmasked_fill_pad_token_idrH  _get_initial_cache_positionget
_get_cacher`  max
max_lengthrD   r  rc  rd  output_scoresoutput_logitsreturn_dict_in_generater
  rL  r   r  #_update_model_kwargs_for_generationrI   cloner  	do_sampler   multinomialsqueezeargmaxr  r   rA  floatrK   rJ   r   r*   )&rq   r?  r   rS  r   rU  rV  rb  r  model_input_namer   r"  kwargs_has_attention_maskrz  r   r   input_tokensmaskr  generated_tokensrc  rd  r  r  r  
raw_scores
raw_logitsdecoder_hidden_statesdecoder_attentionsirM  rH  rf  rg  next_token_scoresprobs
next_tokenrr   s&                                        r<   rk  &JanusForConditionalGeneration.generate  sM    #JJ':D<R<RS MM*;< !**%6?f$7# -"3#	
   )//9&9 002>;P;PR`RnRn:ooT  	""$##L$5$5$78 0@/K+QdQf %)[!++3NNrs/0,):)I)I%& 594N4N22L5
1	\ ")9)9vy1$MiooM^EF  %3$$>!$$%6ZcZjZj$k ++0A0P0PST0T##$IJ[JjJj$kl/3,  55/!*!3'%)- 6 
 #'"D"D #
))>>#
 	#
	  ::2299JJ'oo
 ''1-%))*:DA'..q!4)7%& Z[!^,0A0N0NNa(,=,O,OP^,__
 	Z[!^$11$8I8V8VW113LA77V-t4<.2oo%6%K%K%Wx%>!"3">">@P@Z[) /> /L*+ !;;
'EUb .??0EE)77)77"3"K"K3RD
3RD
'>CW^b$;@QRX\'(A== +|GSL .::J-K-N-N}OcOc-dL)*-9:J-K-N-N}OcOc-dL)*jj// "3%9G  CCG\ZL"44QAX>DDFL ZZ//=F 0F C !**&7R@"..u!DLLRP
"\\*;D
%/QT" J
#;<J#--b1J HHTMG )J #vi'
|11355
 "g&8&88"#%)>)>>%",*!-3 ' 7 7  $#r;   )r&   r9  r'   )
NNNNNNNNNr   )NNNNNN)NNN) r-   r.   r/   r0   _tied_weights_keysr8   r    ra   rH  r  rD   r   rA  r   r   r   r{  rE   r
   r   r   r   r   r   r   rL  rQ  no_gradr   rk  r:   r   r   s   @r<   r6  r6    s   DFVW!{ @>ell u|| 
  15481537+/5959-1$(341
E,,-1
 u0011
 !.	1

 u//01
 "%1
 !!1!121
   1 121
 ))*1
 D>1
 c5<</01
 +,1
  1
l <
 
 ]] *.59:>	|$&|$ !!1!12|$ ##67	|$ |$r;   r6  )r$   r6  r  r  r:  )r   )Uri  dataclassesr   typingr   r   r   rD   torch.nn.functionalr   r   rv  activationsr	   cache_utilsr
   
generationr   r   r   r   generation.utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   autor   configuration_janusr    r!   r"   
get_loggerr-   rr  r$   r?   rG   rO   r  rT   r   r   r   r  r   r   r   r)   r  r  r.  r4  r:  rM  r]  r}  r  r  r  r  r  r  r  r  r  r  r6  __all__r,   r;   r<   <module>r     sp  ,  ! , ,     !   u u 9 9 X X F & ] ] /  Q Q 
		H	% 
.? 
. 
. 
	7{ 	7 	7 
C; C C6 
C+ C C4HBII HV	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % % '(%4I$299 I$XRYY ( 8  F@ @DI)RYY I)Xryy 2 D 1+ 1 1hBII $<"		 <"~)(BII )(X &"))  &F	ryy 	RYY  ,J!		 J!ZA		 AH :F% :F:Fz299 $RYY   
i
% i

i
Xt$$8/ t$n	 tr;   