
    cCi.                       S SK r S SKJr  S SKJr  S SKJrJrJr  S SK	r
S SKrS SKJs  Jr  S SKrS SKJr  S SKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJrJrJr  SSKJr  SSK J!r!J"r"  SSK#J$r$J%r%J&r&  SSK'J(r(J)r)J*r*J+r+J,r,J-r-J.r.J/r/J0r0J1r1  SSK2J3r3  SSK4J5r5J6r6  SSK7J8r8  SSK9J:r:J;r;J<r<J=r=J>r>J?r?J@r@  SSKAJBrBJCrCJDrD  SSKEJFrF  SSKGJHrH  SSKIJJrJJKrKJLrLJMrMJNrN  SSKOJPrPJQrQ  SSKRJSrS  SSKTJUrU  SSKVJWrWJXrXJYrY  \?" 5       (       a  S SKZrZ\@R                  " \\5      r] " S S\U5      r^ " S S \H5      r_ " S! S"\5      r`\< " S# S$\65      5       ra\\<" S%S&9 " S' S(\35      5       5       rb " S) S*\P5      rc " S+ S,\Q5      rd " S- S.\Y5      re " S/ S0\R                  5      rg " S1 S2\R                  5      rh " S3 S4\X5      ri " S5 S6\W5      rj " S7 S8\F5      rk " S9 S:\R                  5      rl " S; S<\N5      rm " S= S>\M5      rn " S? S@\K5      ro " SA SB\L5      rp " SC SD\R                  5      rq " SE SF\R                  5      rr " SG SH\R                  5      rs " SI SJ\R                  5      rt " SK SL\J5      ru " SM SN\R                  5      rv " SO SP\R                  5      rw\<" SQS&9 " SR SS\a5      5       rx " ST SU\a\5      ry " SV SW\5      rz/ SXQr{g)Y    N)Iterable)	dataclass)CallableOptionalUnion)nn)BlipImageProcessor   )ACT2FN)Cache)PretrainedConfig)%ClassifierFreeGuidanceLogitsProcessorGenerationMixinGenerationModeLogitsProcessorList)GenerateDecoderOnlyOutput)BatchFeatureget_size_dict)convert_to_rgbresizeto_channel_dimension_format)
ChannelDimension
ImageInputPILImageResamplingget_image_sizeinfer_channel_dimension_formatis_scaled_imagemake_flat_list_of_imagesto_numpy_arrayvalid_imagesvalidate_preprocess_arguments)ModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)
TensorTypeTransformersKwargsauto_docstringcan_return_tuplefilter_out_non_signature_kwargsis_vision_availablelogging   )CONFIG_MAPPING
AutoConfig	AutoModel)Blip2VisionModel)ChameleonVQVAEConfig)ChameleonVQVAEChameleonVQVAEEncoderAttnBlock#ChameleonVQVAEEncoderConvDownsample ChameleonVQVAEEncoderResnetBlockChameleonVQVAEVectorQuantizer)IdeficsBaseModelOutputWithPastIdeficsCausalLMOutputWithPast)eager_attention_forward)SiglipVisionConfig)SiglipEncoderSiglipEncoderLayerSiglipVisionEmbeddingsc                   \   ^  \ rS rSrSrSrSr                  SU 4S jjrSrU =r	$ )JanusVisionConfigT   a^
  
This is the configuration class to store the configuration of a [`JanusVisionModel`]. It is used to instantiate a
`JanusVisionModel` according to the specified arguments, defining the model architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
    hidden_size (`int`, *optional*, defaults to 1024):
        Dimensionality of the encoder layers and the pooler layer.
    num_hidden_layers (`int`, *optional*, defaults to 24):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 16):
        Number of attention heads for each attention layer in the Transformer encoder.
    num_channels (`int`, *optional*, defaults to 3):
        The number of input channels.
    patch_size (`int`, *optional*, defaults to 16):
        The size (resolution) of each patch.
    image_size (`int`, *optional*, defaults to 384):
        The size (resolution) of each image.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        Dropout probability for attention weights.
    layer_norm_eps (`float`, *optional*, defaults to 1e-06):
        The epsilon used by the layer normalization layers.
    hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"`, and `"gelu_new"` are supported.
    mlp_ratio (`float`, *optional*, defaults to 4.0):
        Ratio of MLP hidden dimensionality to embedding dimensionality.
    attention_bias (`bool`, *optional*, defaults to `True`):
        Whether to add a bias to the queries, keys, and values in the attention layers.
    hidden_dropout_rate (`float`, *optional*, defaults to 0.0):
        The dropout probability for fully connected layers in the encoder.
    projection_dim (`int`, *optional*, defaults to 2048):
        Dimensionality of the MLP projection head.
    projection_dropout (`float`, *optional*, defaults to 0.0):
        Dropout probability for the projection layer.
    use_qk_norm (`bool`, *optional*, defaults to `False`):
        Whether to normalize the query and key matrices.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated normal initializer for initializing all weight matrices.
    depth (`int`, *optional*, defaults to 2):
        Number of hidden layers in the aligner module.
    num_image_tokens (`int`, *optional*, defaults to 576):
        Number of image tokens.
janus_vision_modelvision_configc                    > [         TU ]  " SUUUUUUUUU	S.	UD6  U ?Xl        Xl        Xl        Xl        Xl        Xl        UU l	        UU l
        UU l        g )N)	hidden_sizenum_hidden_layersnum_attention_headsnum_channels
patch_size
image_sizeattention_dropoutlayer_norm_eps
hidden_act )super__init__intermediate_size	mlp_ratioattention_biashidden_dropout_rateprojection_dimprojection_dropoutuse_qk_norminitializer_rangedepthnum_image_tokens)selfrE   rF   rG   rH   rI   rJ   rK   rL   rM   rR   rS   rT   rU   rV   rW   rX   rY   rZ   kwargs	__class__s                       a/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/janus/modular_janus.pyrP   JanusVisionConfig.__init__   s~    , 	 	
#/ 3%!!/)!	
 	
 "",#6 ,"4&!2
 0    )	rS   rY   rT   rX   rR   rZ   rU   rV   rW   )i         r
   rb   i          ư>gelug      @Trc      rc   F{Gz?r-   i@  )
__name__
__module____qualname____firstlineno____doc__
model_typebase_config_keyrP   __static_attributes____classcell__r]   s   @r^   r@   r@   T   sW    ,\ &J%O ',1 ,1r`   r@   c                      ^  \ rS rSrSrSSSSSSSS	/ S
QSSSSSSS4S\S\S\S\S\S\S\S\S\\   S\S\4U 4S jjjr	Sr
U =r$ )JanusVQVAEConfig   a	  
This is the configuration class to store the configuration of a [`JanusVQVAEModel`]. It is used to instantiate a
`JanusVQVAEModel` according to the specified arguments, defining the model architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information. Instantiating a
configuration with the defaults will yield a similar configuration to the VQModel of the
[deepseek-community/Janus-Pro-1B](https://huggingface.co/deepseek-community/Janus-Pro-1B).

Args:
    embed_dim (`int`, *optional*, defaults to 8):
        Dimensionality of each embedding vector.
    num_embeddings (`int`, *optional*, defaults to 16384):
        Number of codebook embeddings.
    double_latent (`bool`, *optional*, defaults to `False`):
        Whether to use double z channels.
    latent_channels (`int`, *optional*, defaults to 256):
        Number of channels for the latent space.
    num_patches (`int`, *optional*, defaults to 32):
        Num of patches the input images can be divided into.
    in_channels (`int`, *optional*, defaults to 3):
        Number of input channels.
    out_channels (`int`, *optional*, defaults to 3):
        Number of out channels.
    base_channels (`int`, *optional*, defaults to 128):
        Base channel count.
    channel_multiplier (`list[int]`, *optional*, defaults to `[1, 1, 2, 2, 4]`):
        Channel multipliers for each resolution.
    num_res_blocks (`int`, *optional*, defaults to 2):
        Number of residual blocks.
    dropout (`float`, *optional*, defaults to 0.0):
        Dropout rate.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    projection_dim (`int`, *optional*, defaults to 2048):
        Dimensionality of the MLP projection head.
    num_hidden_layers (`int`, *optional*, defaults to 2):
        Number of hidden layers in VAVAE MLP Connecter module.
    hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"silu"` and `"gelu_new"` are supported.
    image_token_embed_dim (`int`, *optional*, defaults to 2048):
        Dimension of image embeddings. It should be same as the dimensionality of text embeddings.
   i @  F       r
      )   ry   r-   r-      r-   rc   rg   rf   re   	embed_dimnum_embeddingsdouble_latentlatent_channelsnum_patchesin_channelsout_channelsbase_channelschannel_multipliernum_res_blocksdropoutc                    > [         TU ]  " SUUUUUUU	U
UUS.
UD6  XPl        Xpl        Xl        Xl        Xl        UU l        U ?U ?	U ?
g )N)
r{   r|   r}   r~   r   r   r   r   r   rX   rN   )rO   rP   r   r   rU   rF   rM   image_token_embed_dim
resolutionattn_resolutions	attn_type)r[   r{   r|   r}   r~   r   r   r   r   r   r   r   rX   rU   rF   rM   r   r\   r]   s                     r^   rP   JanusVQVAEConfig.__init__   sv    ( 	 	
)'+#'1)/	
 	
 '(,!2$%:"O!Nr`   )rM   r   rF   r   r   rU   )rh   ri   rj   rk   rl   intboollistfloatrP   ro   rp   rq   s   @r^   rs   rs      s    *\ ##" (7"#** * 	*
 * * * * * !I* * * *r`   rs   c                   H   ^  \ rS rSrSrSr\\\S.r	    SU 4S jjr
SrU =r$ )JanusConfigi  a  
This is the configuration class to store the configuration of a [`JanusModel`]. It is used to instantiate an
Janus model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the Janus-1B or Janus-7B models.

e.g. [deepseek-community/Janus-Pro-1B](https://huggingface.co/deepseek-community/Janus-Pro-1B) or
[deepseek-community/Janus-Pro-7B](https://huggingface.co/deepseek-community/Janus-Pro-7B)

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`):
        The config object or dictionary of the text backbone.
    vision_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `JanusVisionConfig`):
        The config object or dictionary of the vision backbone.
    vq_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `JanusVQVAEConfig`):
        The config object or dictionary of the VQVAE backbone.
    image_token_id (`int`, *optional*, defaults to 100581):
        Token index of a placeholder image token.

Example:

```python
>>> from transformers import JanusForConditionalGeneration, JanusConfig, JanusVisionConfig, JanusVQVAEConfig, LlamaConfig

>>> # Initializing a Janus vision config
>>> vision_config = JanusVisionConfig()

>>> # Initializing a Llama config
>>> text_config = LlamaConfig()

>>> # Initializing a VQ config
>>> vq_config = JanusVQVAEConfig()

>>> # Initializing a Janus Pro 1B style configuration
>>> configuration = JanusConfig(vision_config=vision_config, text_config=text_config, vq_config=vq_config)

>>> # Initializing a model from the Janus Pro 1B style configuration
>>> model = JanusForConditionalGeneration(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```janus)text_configrC   	vq_configc                 &  > [        U[        5      (       a-  UR                  SS5      US'   [        US      " S	0 UD6U l        O_Uc)  [
        R                  S5        [        S   " 5       U l        O3[        U[        5      (       a  Xl        O[        S[        U5       35      eUc%  [
        R                  S5        [        5       U l        OY[        U[        5      (       a  [        S	0 UD6U l        O3[        U[        5      (       a  X l        O[        S[        U5       35      eUc%  [
        R                  S5        [        5       U l        OY[        U[        5      (       a  [        S	0 UD6U l        O3[        U[        5      (       a  X0l        O[        S[        U5       35      eU R                  R                  U l        U R                  R                  U R                  R                   -  U R                  l        X@l        [&        TU ]P  " S	0 UD6  g )
Nrm   llamaz7`text_config` is None. Initializing with default valueszTInvalid type for `text_config`. Must be either `dict` or `LlamaConfig`. Type found: zK`vision_config` is None. Initializing with default JanusVisionConfig valuesz\Invalid type for `vision_config`. Must be either `dict` or `JanusVisionConfig`. Type found: zF`vq_config` is None. Initializing with default JanusVQVAEConfig valueszWInvalid type for `vq_config`. Must be either `dict` or `JanusVQVAEConfig`. Type found: rN   )
isinstancedictgetr.   r   loggerinfor   
ValueErrortyper@   rC   rs   r   rX   rJ   rI   r   image_token_idrO   rP   )r[   r   rC   r   r   r\   r]   s         r^   rP   JanusConfig.__init__D  s    k4(((3g(NK%-k,.GHW;WD KKQR-g68D%566*  $[ 124 
  KKef!2!4Dt,,!2!C]!CD'899!.  $] 346 
 KK`a-/DN	4((-:	:DN	#344&N  $Y02 
 "&!3!3!E!E%)%7%7%B%BdFXFXFcFc%c","6"r`   )r   rX   r   rC   r   )NNNi )rh   ri   rj   rk   rl   rm   r/   r@   rs   sub_configsrP   ro   rp   rq   s   @r^   r   r     s8    +Z J!*%K 6# 6#r`   r   c                   H    \ rS rSr% \\S'   SrSrSS/rSS/r	Sr
SrSrS	rS
rg)JanusPreTrainedModeli}  configmodelTLlamaDecoderLayerJanusVisionEncoderLayerpast_key_valuescausal_maskFrN   N)rh   ri   rj   rk   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph!_supports_param_buffer_assignmentro   rN   r`   r^   r   r   }  sB    &*#,.GH#4m"DN!(-%r`   r   z9
    Base class for Janus VQ-VAE mode model outputs.
    )custom_introc                   j    \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Srg)JanusVQVAEOutputi  z
decoded_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
    Reconstructed pixel values after encoding and decoding the input.
embedding_loss (`torch.FloatTensor`):
    Embedding loss.
Ndecoded_pixel_valuesembedding_lossrN   )rh   ri   rj   rk   rl   r   r   torchFloatTensorr   r   ro   rN   r`   r^   r   r     s4     9=(5#4#45<26NHU../6r`   r   c                       \ rS rSrSrg)JanusBaseModelOutputWithPasti  rN   Nrh   ri   rj   rk   ro   rN   r`   r^   r   r         r`   r   c                       \ rS rSrSrg)JanusCausalLMOutputWithPasti  rN   Nr   rN   r`   r^   r   r     r   r`   r   c                   V    \ rS rSrSS\R
                  S\S\R
                  4S jjrSrg)	JanusVisionEmbeddingsi  pixel_valuesinterpolate_pos_encodingreturnc                 V   UR                   u    p4nU R                  R                  R                  nU R                  UR	                  US95      nUR                  S5      R                  SS5      nU(       a  U R                  XU5      n	OU R                  U R                  5      n	X-   nU$ )Ndtyper-   ry   )
shapepatch_embeddingweightr   toflatten	transposer   position_embeddingposition_ids)
r[   r   r   _heightwidthtarget_dtypepatch_embeds
embeddings
pos_embedss
             r^   forwardJanusVisionEmbeddings.forward  s    *001e++2288++LOO,O,OP!))!,66q!<
#66z5QJ001B1BCJ,
r`   rN   N)F)	rh   ri   rj   rk   r   Tensorr   r   ro   rN   r`   r^   r   r     s,    ELL D ]b]i]i  r`   r   c                      ^  \ rS rSrSrS\4U 4S jjr S
S\R                  S\	\R                     S\
\   4S jjrS	rU =r$ )JanusVisionAttentioni  z(Attention Class for Janus Vision Encoderr   c                 ^  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l
        UR                  nUR                  nSU l        SU l        [        R                   " U R                  U R                  U R                  -  UR"                  S9U l        [        R                   " U R                  U R                  U R                  -  UR"                  S9U l        [        R                   " U R                  U R                  U R                  -  UR"                  S9U l        [        R                   " U R                  U R                  5      U l        US:  a  [        R,                  " U5      O[        R.                  " 5       U l        U(       a   [        R0                  " U R                  5      O[        R.                  " 5       U l        U(       a&  [        R0                  " U R                  5      U l        g [        R.                  " 5       U l        g )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Fry   biasr   )rO   rP   r   rE   r{   rG   	num_headshead_dimr   scalerK   rV   rW   	is_causalnum_key_value_groupsr   LinearrS   q_projk_projv_projprojection_layerDropoutIdentity	LayerNormq_normk_norm)r[   r   proj_dropoutqk_normr]   s       r^   rP   JanusVisionAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
!'!9!900$$ %&!ii0NU[UjUjkii0NU[UjUjkii0NU[UjUjk "		$..$.. I>JQ>N"**\":TVT_T_Ta6=bll4>>22;;=6=bll4>>22;;=r`   hidden_statesattention_maskr\   c                 <   UR                  5       u  pEnU R                  U5      nU R                  U5      nU R                  U5      n	UR	                  SU R
                  U R                  5      nU R                  U5      nUR	                  SU R
                  U R                  5      nU R                  U5      nUR	                  XEU R
                  U R                  5      R                  SS5      nUR	                  XEU R
                  U R                  5      R                  SS5      nU	R                  XEU R
                  U R                  5      R                  SS5      n	[        n
U R                  R                  S:w  a  [        U R                  R                     n
U
" U UUU	U4U R                  (       d  SOU R                   U R"                  U R$                  S.UD6u  pUR	                  XEU R&                  5      nU R)                  U5      nU R+                  U5      nX4$ )Nry   r-   eagerrc   )r   scalingr   )sizer   r   r   reshaper   r   r   r   r   viewr:   r   _attn_implementationr#   trainingrK   r   r   r{   r   rV   )r[   r   r   r\   
batch_sizeseq_lenr   query_states
key_statesvalue_statesattention_interfaceattn_outputattn_weightsoutputs                 r^   r   JanusVisionAttention.forward  s    "/!3!3!5
Q{{=1[[/
{{=1#++BN{{<0''DNNDMMJ
[[,
#++JQUQ^Q^_iijkmno''
T^^T]][eefgijk
#((dnndmm\ffghjkl(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$2H2HJJnn
%
 
%
! "))*t~~N&&{3((0##r`   )rK   r   r{   r   r   r   r   r   r   rV   r   r   r   r   r   N)rh   ri   rj   rk   rl   r@   rP   r   r   r   r%   r'   r   ro   rp   rq   s   @r^   r   r     sT    2Q0 Q@ 26)$||)$ !.)$ +,	)$ )$r`   r   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )JanusVisionMLPi  r   c                    > [         TU ]  5         Xl        [        UR                  UR
                  -  5      U l        [        UR                     U l	        [        R                  " UR                  U R                  5      U l        [        R                  " U R                  UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  5      U l        g r  )rO   rP   r   r   rE   rR   rQ   r   rM   activation_fnr   r   fc1fc2r   rT   dropout1dropout2r[   r   r]   s     r^   rP   JanusVisionMLP.__init__  s    !$V%7%7&:J:J%J!K#F$5$5699V//1G1GH99T33V5G5GH

6#=#=>

6#=#=>r`   r   r   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU$ r  )r
  r	  r  r  r  r[   r   s     r^   r   JanusVisionMLP.forward  sP    /**=9m4/m4r`   )r	  r   r  r  r
  r  rQ   )rh   ri   rj   rk   r@   rP   r   r   r   ro   rp   rq   s   @r^   r  r    s0    ?0 ?U\\ ell  r`   r  c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )r   i  r   c                 J  > [         TU ]  U5        Xl        UR                  U l        [        U5      U l        [        R                  " U R                  UR                  S9U l
        [        R                  " U R                  UR                  S9U l        [        U5      U l        g )N)eps)rO   rP   r   rE   r{   r   	self_attnr   r   rL   layer_norm1layer_norm2r  mlpr  s     r^   rP    JanusVisionEncoderLayer.__init__  st     ++-f5<<F<Q<QR<<F<Q<QR!&)r`   )r   r{   r  r  r  r  rh   ri   rj   rk   r@   rP   ro   rp   rq   s   @r^   r   r     s    *0 * *r`   r   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )JanusVisionEncoderi!  r   c                    > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ s H  n[        U5      PM     sn5      U l        g s  snf r  )rO   rP   r   
ModuleListrangerF   r   layersr[   r   r   r]   s      r^   rP   JanusVisionEncoder.__init__"  sF     mmeTZTlTlNm$nNm%<V%DNm$no$ns   A)r!  r  rq   s   @r^   r  r  !  s    p0 p pr`   r  c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )JanusVisionModeli'  r   c                 D   > [         TU ]  U5        [        U5      U l        g r  )rO   rP   r  encoderr  s     r^   rP   JanusVisionModel.__init__(  s     )&1r`   )r'  r  rq   s   @r^   r%  r%  '  s    20 2 2r`   r%  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )JanusVisionAlignerMLPi-  r   c           	        > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " [        SUR                  5       Vs/ s H.  n[        R                  " UR
                  UR
                  5      PM0     sn5      U l
        [        UR                     U l        g s  snf Nry   )rO   rP   r   r   rE   rU   r
  r  r   rY   hidden_layersr   rM   r	  r"  s      r^   rP   JanusVisionAlignerMLP.__init__.  s    99V//1F1FG]]NSTUW]WcWcNdeNdRYYv,,f.C.CDNde
 $F$5$56 f   (5Cc                     U R                  U5      nU R                   H  nU R                  U5      nU" U5      nM     U$ r  r
  r-  r	  r[   r   layers      r^   r   JanusVisionAlignerMLP.forward7  B    /''E ..}=M!-0M ( r`   r	  r
  r-  )	rh   ri   rj   rk   r@   rP   r   ro   rp   rq   s   @r^   r*  r*  -  s    70 7 r`   r*  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )JanusVQVAEVectorQuantizeri?  r   c                 N   > [         TU ]  U5        UR                  /S-  U l        g )Nr-   )rO   rP   r   quant_state_dimsr  s     r^   rP   "JanusVQVAEVectorQuantizer.__init__@  s&     !'!3!3 4q 8r`   image_tokensr   c                 >   UR                   S   nU R                  R                  R                   S   nU R                  U5      n[        R                  " USSS9nUR                  U/U R                  QUP75      nUR                  SSSS5      R                  5       nU$ )Nr   r   r-   )pdimr
   ry   )	r   	embeddingr   F	normalizer   r:  permute
contiguous)r[   r<  r   emb_dimhidden_state_quants        r^   get_codebook_entry,JanusVQVAEVectorQuantizer.get_codebook_entryD  s    !''*
~~,,2226 "^^L9[[);qbI 044j5b4CXCX5bZa5bc/771aCNNP!!r`   )r:  )rh   ri   rj   rk   rs   rP   r   
LongTensorr   rG  ro   rp   rq   s   @r^   r8  r8  ?  s4    9/ 9"u/?/? "EDUDU " "r`   r8  c                       \ rS rSrSrg)JanusVQVAEResnetBlockiT  rN   Nr   rN   r`   r^   rK  rK  T  r   r`   rK  c                       \ rS rSrSrg)JanusVQVAEAttnBlockiX  rN   Nr   rN   r`   r^   rM  rM  X  r   r`   rM  c                       \ rS rSrSrg)JanusVQVAEConvDownsamplei\  rN   Nr   rN   r`   r^   rO  rO  \  r   r`   rO  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )JanusVQVAEConvUpsamplei`  c                 l   > [         TU ]  5         [        R                  R	                  XSSSS9U l        g )Nr
   ry   kernel_sizestridepadding)rO   rP   r   r   Conv2dconv)r[   r   r]   s     r^   rP   JanusVQVAEConvUpsample.__init__a  s,    HHOOK!TU_`Oa	r`   c                 T    [         R                  " USSS9nU R                  U5      nU$ )Ng       @nearest)scale_factormode)rA  interpolaterX  r  s     r^   r   JanusVQVAEConvUpsample.forwarde  s(    m#IV		-0r`   )rX  )rh   ri   rj   rk   rP   r   ro   rp   rq   s   @r^   rQ  rQ  `  s    b r`   rQ  c                   n   ^  \ rS rSrS\S\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	JanusVQVAEMidBlockik  r   channelsc                    > [         TU ]  5         [        UUUS9U l        [	        U5      U l        [        UUUS9U l        g )Nr   r   r   )rO   rP   rK  block_1rM  attn_1block_2)r[   r   rb  r]   s      r^   rP   JanusVQVAEMidBlock.__init__l  sF    , !

 *(3, !
r`   r   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r  )re  rf  rg  r  s     r^   r   JanusVQVAEMidBlock.forwardz  s2    ]3M2]3r`   )rf  re  rg  )rh   ri   rj   rk   rs   r   rP   r   r   r   ro   rp   rq   s   @r^   ra  ra  k  s7    
/ 
3 
U\\ ell  r`   ra  c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )JanusVQVAEEncoderi  c           
        > [         TU ]  5         [        UR                  5      U l        UR
                  U l        UR                  nUR                  nUR                  nUR                  nUR                  n[        R                  R                  X2SSSS9U l        S[        U5      -   nXpl        [        R                   " 5       U l        [%        U R                  5       GH   n[        R                   " 5       n	[        R                   " 5       n
X'U   -  nX&U   -  n[%        U R
                  5       HM  nU	R'                  [)        UUUS95        UnXR                  S-
  :X  d  M3  U
R'                  [+        U5      5        MO     [        R,                  " 5       nXl        Xl        XR                  S-
  :w  a  [3        U5      Ul        U R"                  R'                  U5        GM     [7        UW5      U l        [        R                  R;                  SUSSS	9U l        [        R                  R                  UU(       a  S
U-  OUSSSS9U l        g )Nr
   ry   rS  )ry   rd  rw   rd   T
num_groupsrH   r  affiner-   ) rO   rP   lenr   num_resolutionsr   r   r   r}   r~   r   r   rW  conv_intuplein_channel_multiplierr  downr   appendrK  rM  ModuleblockattnrO  
downsamplera  mid	GroupNormnorm_outconv_out)r[   r   r   r   r}   r~   r   ru  i_levelry  rz  block_in	block_outi_blockrv  r]   s                  r^   rP   JanusVQVAEEncoder.__init__  s   "6#<#<=$33,,((,, 00#66xx{qYZdef $u-?'@ @%:"MMO	T112GMMOE==?D$W'EEH%7(CCI !4!45)%$,%. %22Q66KK 3H => 6 99;DJI..22":8"DIIT"- 30 &fh7**bxUYbf*g#0Ao ( 
r`   r   c                    U R                  U5      /n[        U R                  5       H  n[        U R                  5       H  nU R                  U   R
                  U   " US   5      n[        U R                  U   R                  5      S:  a"  U R                  U   R                  U   " U5      nUR                  U5        M     X0R                  S-
  :w  d  M  UR                  U R                  U   R                  US   5      5        M     US   nU R                  U5      nU R                  U5      nU[        R                  " U5      -  nU R                  U5      nU$ )Nr   r   ry   )rs  r   rr  r   rv  ry  rq  rz  rw  r{  r|  r~  r   sigmoidr  )r[   r   r   r  r  hidden_statelast_hidden_states          r^   r   JanusVQVAEEncoder.forward  sB   l34T112G !4!45#yy177@!"%  tyy)../!3#'99W#5#:#:7#CL#QL$$\2 6 ..22$$TYYw%7%B%B=QSCT%UV 3 *"- HH%67 !MM*;<U]]+<== MM*;<  r`   )rs  r  rv  ru  r|  r~  r   rr  )
rh   ri   rj   rk   rP   r   rI  r   ro   rp   rq   s   @r^   rl  rl    s     1
f!E$4$4 ! !r`   rl  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )JanusVQVAEDecoderi  c           
      d  > [         TU ]  5         [        UR                  5      U l        UR
                  U l        UR                  nUR                  nUR                  nX!R                  U R                  S-
     -  n[        R                  R                  X5SSSS9U l        [        X5      U l        [        R                  " 5       U l        [#        [%        U R                  5      5       H  n[        R                  " 5       n[        R                  " 5       nX!R                  U   -  n	[%        U R
                  S-   5       HM  n
UR'                  [)        UUU	S95        U	nX`R                  S-
  :X  d  M3  UR'                  [+        U5      5        MO     [        R,                  " 5       nX{l        Xl        US:w  a  [3        U5      Ul        U R                   R'                  U5        M     [        R                  R7                  SUSSS	9U l        [        R                  R                  XTSSSS9U l        g )
Nry   r
   rS  rd  r   rw   rd   Trn  )rO   rP   rq  r   rr  r   r   r~   r   r   r   rW  rs  ra  r|  r  upreversedr   rw  rK  rM  rx  ry  rz  rQ  upsampler}  r~  r  )r[   r   r   r~   r   r  r  ry  rz  r  r  r  r]   s               r^   rP   JanusVQVAEDecoder.__init__  s   "6#<#<=$33,, 00** !#<#<T=Q=QTU=U#VV xxaXYcde &f7 --/d&:&: ;<GMMOE==?D%(A(A'(JJI !4!4q!89)%$,%. %22Q66KK 3H => : BHG!|4X>GGNN2) =. **bxUYbf*gAVWabcr`   r  r   c                 r   U R                  U5      nU R                  U5      n[        U R                  5       H  n[        U R                  S-   5       Ho  nU R
                  U   R                  U   " U5      n[        U R
                  U   R                  5      S:  d  MM  U R
                  U   R                  U   " U5      nMq     X R                  S-
  :w  d  M  U R
                  U   R                  U5      nM     U R                  U5      nU[        R                  " U5      -  nU R                  U5      nU$ )Nry   r   )rs  r|  r   rr  r   r  ry  rq  rz  r  r~  r   r  r  )r[   r  r  r  s       r^   r   JanusVQVAEDecoder.forward  s   ||L1 xx- T112G !4!4q!89#www/55g>|Ltwww',,-1#'777#3#8#8#A,#OL : ..22#www/88F 3 }}\2l33}}\2r`   )rs  r  r|  r~  r   rr  r  )
rh   ri   rj   rk   rP   r   r   r   ro   rp   rq   s   @r^   r  r    s.    ,d\E$5$5 %:K:K  r`   r  c                      ^  \ rS rSr/ SQrSrS\4U 4S jjrS\R                  S\R                  4S jr\\S\R                  S\\R                  \R                  4   4S	 j5       5       rS
rU =r$ )
JanusVQVAEi  )rM  rK  r8  r   r   c                 r   > [         TU ]  U5        [        U5      U l        SU l        U R                  5         g )NF)rO   rP   r  decodergradient_checkpointing	post_initr  s     r^   rP   JanusVQVAE.__init__  s0     (0&+# 	r`   r<  r   c                    UR                   S   U R                  R                  S   U R                  R                  S   -  :w  aM  [        SU R                  R                  S   U R                  R                  S   -   SUR                    S35      eU R                  R	                  U5      nU R                  U5      nU R                  U5      nU$ )a  
Decodes quantized token IDs into pixel values.
Args:
    image_tokens (torch.LongTensor): Batch of token IDs.
Returns:
    pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        Pixel values decoded from the token IDs.
ry   r   z4Expected `image_tokens` to have shape `(batch_size, z)`, but got shape `z`.)r   quantizer:  r   rG  post_quant_convr  )r[   r<  codebook_entryr   r   s        r^   decodeJanusVQVAE.decode"  s     a DMM$B$B1$EHfHfghHi$iiFt}}GeGefgGhkokxkx  lJ  lJ  KL  lM  HM  GN N""."4"4!5R9  99,G,,^<||M2r`   c                     UR                   S   nU R                  U5      u  p4nU R                  UR                  US5      5      n[	        Xd5      $ )Nr   r   )r   encoder  r   r   )r[   r   r   quantr   indicesr   s          r^   r   JanusVQVAE.forward5  sM     "''*
)-\)B&w#{{7<<
B+GH 4EEr`   )r  r  )rh   ri   rj   rk   r   main_input_namers   rP   r   rI  r   r  r)   r(   rt  r   ro   rp   rq   s   @r^   r  r    s    
 %O/ 5#3#3 8I8I & F''F 
u  %"3"33	4F  Fr`   r  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )JanusVQVAEAlignerMLPiB  r   c           	        > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " [        SUR                  5       Vs/ s H.  n[        R                  " UR
                  UR
                  5      PM0     sn5      U l
        [        UR                     U l        g s  snf r,  )rO   rP   r   r   r{   rU   r
  r  r   rF   r-  r   rM   r	  r"  s      r^   rP   JanusVQVAEAlignerMLP.__init__C  s    99V--v/D/DE]]NSTUW]WoWoNpqNpRYYv,,f.C.CDNpq
 $F$5$56 rr/  c                     U R                  U5      nU R                   H  nU R                  U5      nU" U5      nM     U$ r  r1  r2  s      r^   r   JanusVQVAEAlignerMLP.forwardL  r5  r`   r6  )	rh   ri   rj   rk   rs   rP   r   ro   rp   rq   s   @r^   r  r  B  s    7/ 7 r`   r  c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr
SrU =r$ )	JanusVQVAEHeadiT  zOHead used for sampling tokens in image generation, replacing the usual lm head.r   c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                     U l	        [        R                  " UR
                  UR                  5      U l        g r  )rO   rP   r   r   r   rU   proj_outr   rM   r	  r|   vision_headr  s     r^   rP   JanusVQVAEHead.__init__W  s^    		&">">@U@UV#F$5$5699V%:%:F<Q<QRr`   r   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r  )r  r	  r  r  s     r^   r   JanusVQVAEHead.forward]  s6    m4**=9((7r`   )r	  r  r  )rh   ri   rj   rk   rl   rs   rP   r   r   tensorr   ro   rp   rq   s   @r^   r  r  T  s5    YS/ SU\\ ell  r`   r  zl
    The Janus model which consists of a siglip vision backbone, a Llama language model and a VQ model.
    c                     ^  \ rS rSrS\4U 4S jjrS rS rS rS\	R                  S\	R                  S	\	R                  4S
 jr\\         SS\\	R                     S\\	R                     S\\	R                      S\\	R                     S\\   S\\	R                     S\\	R                     S\\   S\\\	R                   4   4S jj5       5       rSrU =r$ )
JanusModelid  r   c                   > [         TU ]  U5        Xl        [        R	                  UR
                  5      U l        [        U R                  R                  5      U l        [        R	                  UR                  5      U l        [        R                  " U R                  R                  R                  U R                  R                  R                  5      U l        [#        U R                  R                  5      U l        ['        U R                  R                  5      U l        [*        R,                  " UR.                  S9U l        SU l        U R5                  5         g )N)r   F)rO   rP   r   r%  _from_configrC   vision_modelr*  alignerr  r   vqmodelr   	Embeddingr|   r{   generation_embeddingsr  generation_alignerr  generation_headr0   from_configr   language_modelr  r  r  s     r^   rP   JanusModel.__init__j  s     ,99&:N:NO,T->->-E-EF!..v/?/?@ &(\\$,,2E2E2T2TVZVbVbViViVsVs%t""6t||7J7J"K-dll.A.AB'336;M;MN&+#r`   c                 6    U R                   R                  5       $ r  )r  get_input_embeddingsr[   s    r^   r  JanusModel.get_input_embeddings  s    ""7799r`   c                 :    U R                   R                  U5        g r  )r  set_input_embeddingsr[   values     r^   r  JanusModel.set_input_embeddings  s    007r`   c                 ^    U R                  U5      nU R                  UR                  5      nU$ r  )r  r  r  )r[   r   image_embedss      r^   get_image_featuresJanusModel.get_image_features  s,    ((6||L$B$BCr`   	input_idsinputs_embedsimage_featuresc           	      J   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  5       nUR                  S5      R                  U5      R                  UR                  5      nX$   R                  5       UR                  5       :w  a0  UR                  S   UR                  S   -  n[        SU SU 35      eU$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
r   devicer   r   ry   z6Image features and image tokens do not match: tokens: z, features )r  r   r  r   r   longr  allsum	unsqueeze	expand_asr   numelr   r   )r[   r  r  r  special_image_maskn_image_tokensn_image_featuress          r^   get_placeholder_maskJanusModel.get_placeholder_mask  s    !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*kk.H.H!H+//1/99"=GGVYYZgZnZno,2248L8L8NN-33A69M9Ma9PPHHXXcdtcuv  "!r`   r   r   r   r   cache_position	use_cachelogits_to_keepc
                    US L US L-  (       a  [        S5      eUc  U R                  5       " U5      nUbw  U R                  U5      nUR                  SUR                  S   5      nUR                  UR                  UR                  5      nU R                  XUS9nUR                  X5      nU R                  " SUUUUUUU	S.U
D6n[        UR                  UR                  UR                  UR                  Ub  WS9$ S S9$ )NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either oner   )r  r  )r  r   r   r   r  r  r  )r  r   r   
attentionsimage_hidden_statesrN   )r   r  r  r   r   r   r  r   r  masked_scatterr  r   r  r   r   r  )r[   r  r   r   r   r   r  r  r  r  r\   r  r  image_attention_mask	lm_outputs                  r^   r   JanusModel.forward  s@    -t";<s    557	BM#22<@L)11"m6I6I"6MNN+..}/C/C]EXEXYN#'#<#<~ $= $  *889M^M'' 	
')%+))	
 	
	 ,'99%55#11 ++0<0H
 	

 OS
 	
r`   )	r  r   r  r  r  r  r  r  r  )	NNNNNNNNr   )rh   ri   rj   rk   r   rP   r  r  r  r   rI  r   r  r)   r(   r   r   r   r   r   r   r   ro   rp   rq   s   @r^   r  r  d  s4   { *:8
"))":?:K:K"]b]n]n"0  15481537+/5959$(34.
E,,-.
 u001.
 !.	.

 u//0.
 "%.
 !!1!12.
   1 12.
 D>.
 c5<</0.
  .
r`   r  c                     ^  \ rS rSrSS/rSrS\4U 4S jjrS rS r	S	\
R                  S
\
R                  4S jr\\          SS\\
R                      S\\
R"                     S\\
R                     S\\
R                      S\\   S\\
R                      S\\
R"                     S\\
R                      S\\   S\\\
R                  4   S\\   4S jj5       5       r      SU 4S jjrS\
R                  4S jr\
R6                     S S	\\
R                     S\\
R                      S\\   4U 4S jjj5       rSrU =r$ )!JanusForConditionalGenerationi  z(model.language_model.embed_tokens.weightzlm_head.weightTr   c                    > [         TU ]  U5        Xl        [        U5      U l        [
        R                  " UR                  R                  UR                  R                  SS9U l
        U R                  5         g )NFr   )rO   rP   r   r  r   r   r   r   rE   
vocab_sizelm_headr  r  s     r^   rP   &JanusForConditionalGeneration.__init__  sZ     '
yy!3!3!?!?ASASA^A^ejk 	r`   c                 J    U R                   R                  R                  5       $ r  )r   r  r  r  s    r^   r  2JanusForConditionalGeneration.get_input_embeddings  s    zz((==??r`   c                 N    U R                   R                  R                  U5        g r  )r   r  r  r  s     r^   r  2JanusForConditionalGeneration.set_input_embeddings  s    

!!66u=r`   inputsr   c                 r    U R                   R                  U5      nU R                   R                  U5      nU$ r  )r   r  r  )r[   r  r  s      r^   'prepare_embeddings_for_image_generationEJanusForConditionalGeneration.prepare_embeddings_for_image_generation  s0    zz77?zz44\Br`   r  r   r   r   r   r  r  labelsr  r  r\   c                    U R                   " SUUUUUUU	US.UD6nUR                  n[        U
[        5      (       a  [	        U
* S5      OU
nU R                  USS2USS24   5      nSnUb3  U R                  " SXU R                  R                  R                  S.UD6n[        UUUR                  UR                  UR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
)r  r   r   r   r   r  r  r  N)logitsr  r  )lossr  r   r   r  r  rN   )r   r  r   r   slicer  loss_functionr   r   r  r   r   r   r  r  )r[   r  r   r   r   r   r  r  r  r  r  r\   outputsr   slice_indicesr  r  s                    r^   r   %JanusForConditionalGeneration.forward  s    , ** 

%)%+')

 

  118B>SV8W8W~ot4]kmA}a,?@A%% 9P9P9[9[_eD +#33!//)) ' ; ;
 	
r`   c           	      P   > [         T
U ]  " U4UUUUUS.UD6n	US   S:X  a  X)S'   U	$ )N)r   r  r   r  r  r   r   )rO   prepare_inputs_for_generation)r[   r  r   r   r   r  r  r  r\   model_inputsr]   s             r^   r  ;JanusForConditionalGeneration.prepare_inputs_for_generation"  sR     w<
+')))
 
 !!+7(r`   r<  c                 x    U R                   R                  R                  U5      nUR                  SSSS5      nU$ )z
Decodes generated image tokens from language model to continuous pixel values
with VQGAN module via upsampling.
Args:
    image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
        The tensors corresponding to the input images.
r   r-   r
   ry   )r   r  r  rC  )r[   r<  decoded_images      r^   decode_image_tokens1JanusForConditionalGeneration.decode_image_tokens@  s:     

**11,?%--aAq9r`   logits_processorc           	      L  > UR                  SU R                  5      n[        R                  " U5      nUR                  SS5      nUS:X  a  [        T%U ]  " SUUUS S.UD6$ UR                  " S0 UD6nUR                  5       [        R                  [        R                  4;  a  [        S5      eUR                  5         U R                  UR                  5       5        Ub  UO	[        5       nSUS'   UR                  c  [         R#                  S5        S	Ul        UR                  US
'   U R%                  XR&                  U5      u  pnUR(                  UR*                  p[-        UR.                  5      S:w  a  [        SUR.                   S35      eUS LnU R1                  X\UR*                  S9  UR                  (       a;  UR                  S:  a+  UR3                  [5        UR                  5      5        S Ul        U R7                  UUR.                  S   US UUS9nU R8                  " SUUUR:                  S.UD6u  pU R<                  R>                  R@                  RB                  nUR.                  u  pURE                  SS5      nUR                  SS 5      nURE                  SS5      nX'S'   UUS 2S S 24   UR&                  :g  UUS 2S S 24   URF                  S   :g  -  nUUS 2S S 24   RI                  UURJ                  5        U RM                  5       " U5      nU RO                  XU5      nURQ                  SS 5      cA  U RS                  URT                  =(       d    SUS-  [W        URX                  X-   5      US9US'   [Z        R\                  " X4XS9nUR^                  nUR`                  nURb                  nURd                  nURf                  nU(       a	  U(       a  SOS nU(       a	  U(       a  SOS nU(       a	  U(       a  SOS nU(       a	  U(       a  SOS n[i        U5       GHy  nU Rj                  " SUUS.UD6nUS   Rm                  UR*                  5      US'   US   Rm                  UR*                  5      US'   U R<                  Rn                  " S0 UDUUS.D6nU Rq                  UU5      nURr                  S S 2SS S 24   Ru                  5       n U R<                  Rw                  U 5      n!U" UU!5      n"URx                  (       a:  [Z        Rz                  " U"SS9n#[Z        R|                  " U#SS9R                  S5      n$O[Z        R                  " U"SS9n$U$US S 2U4'   [Z        R                  " U$U$/5      n$U$R                  S5      n$U R                  U$5      nGM|     U(       aT  U(       a  UW!4-  nU(       a  UW R                  5       4-  nU(       a  UWR                  -  nU(       a  UWR                  -  nU(       a  [        UW!UUUWR                  S9$ U$ ) Ngeneration_configgeneration_modetext)r  r   r  guidance_scalezGot incompatible mode for Image Generation, should be one of greedy or sampling. Ensure that beam search is de-activated by setting `num_beams=1`.Tr  zU`guidance_scale` is required for CFG but not provided. Setting to default value of 5.   r  r-   z;Expected input ids of shape (batch_size, seq_len), but got z3Passing `inputs embeds` is not supported currently.)r  ry   )r  input_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnr
  r  )r  r   expand_sizer   boi_token_idr   static)cache_implementationr   max_cache_lenmodel_kwargsr  rN   )r  r  r  )output_attentionsoutput_hidden_statesr   )r?  )num_samples)	sequencesscoresr  r  r   r   )Ipopr  copydeepcopyrO   generateupdateget_generation_moder   SAMPLEGREEDY_SEARCHr   validate_validate_model_kwargsr   r  r   warning_prepare_model_inputsbos_token_idr   r  rq  r   _prepare_special_tokensrw  r   _get_logits_processor_expand_inputs_for_generationnum_return_sequencesr   r  r   rZ   repeatgeneration_kwargsmasked_fill_pad_token_idr  _get_initial_cache_positionr   
_get_cacher  max
max_lengthr   zerosr  r  output_scoresoutput_logitsreturn_dict_in_generater   r  r   r  #_update_model_kwargs_for_generationr  cloner  	do_samplesoftmaxmultinomialsqueezeargmaxcatr  r  r   r  r   r   r   )&r[   r  r   r
  r\   r  r  r  r  model_input_namer   r  kwargs_has_attention_maskrZ   r   r   input_tokensmaskr  generated_tokensr  r  r9  r:  r;  
raw_scores
raw_logitsdecoder_hidden_statesdecoder_attentionsir  r  r  r  next_token_scoresprobs
next_tokenr]   s&                                        r^   r"  &JanusForConditionalGeneration.generateL  sM    #JJ':D<R<RS MM*;< !**%6?f$7# -"3#	
   )//9&9 002>;P;PR`RnRn:ooT  	""$##L$5$5$78 0@/K+QdQf %)[!++3NNrs/0,):)I)I%& 594N4N22L5
1	\ ")9)9vy1$MiooM^EF  %3$$>!$$%6ZcZjZj$k ++0A0P0PST0T##$IJ[JjJj$kl/3,  55/!*!3'%)- 6 
 #'"D"D #
))>>#
 	#
	  ::2299JJ'oo
 ''1-%))*:DA'..q!4)7%& Z[!^,0A0N0NNa(,=,O,OP^,__
 	Z[!^$11$8I8V8VW113LA77V-t4<.2oo%6%K%K%Wx%>!"3">">@P@Z[) /> /L*+ !;;
'EUb .??0EE)77)77"3"K"K3RD
3RD
'>CW^b$;@QRX\'(A== +|GSL .::J-K-N-N}OcOc-dL)*-9:J-K-N-N}OcOc-dL)*jj// "3%9G  CCG\ZL"44QAX>DDFL ZZ//=F 0F C !**&7R@"..u!DLLRP
"\\*;D
%/QT" J
#;<J#--b1J HHTMG )J #vi'
|11355
 "g&8&88"#%)>)>>%",*!-3 ' 7 7  $#r`   )r   r  r   )
NNNNNNNNNr   )NNNNNN)NNN) rh   ri   rj   rk   _tied_weights_keysr   r   rP   r  r  r   r   r  r)   r(   r   rI  r   r   r   r   r   r%   r'   r   r  r  no_gradr   r"  ro   rp   rq   s   @r^   r  r    s   DFVW!{ @>ell u|| 
  15481537+/5959-1$(341
E,,-1
 u0011
 !.	1

 u//01
 "%1
 !!1!121
   1 121
 ))*1
 D>1
 c5<</01
 +,1
  1
l <
 
 ]] *.59:>	|$&|$ !!1!12|$ ##67	|$ |$r`   r  c            #       H  ^  \ rS rSrSrSSS\R                  SSSSSSS4S\S\\	\
\4      S	\S
\S\S\\\4   S\S\\\\\   4      S\\\\\   4      S\\   S\\   4U 4S jjjr   S S\R"                  S\\\\\\4   4   S\\\
\4      S\\\
\4      S\R"                  4
S jjr\R                  SS4S\R"                  S\\	\
\4   \4   S
\S\\\
\4      S\\\
\4      S\R"                  4S jjr\" 5       SSSSSSSSSSSS\R.                  S4S\S\\   S\\	\
\4      S
\\   S\\   S\\   S\\   S\\\\\   4      S\\\\\   4      S\\\
\4      S\\   S\\\\\\\4   4      S\\   S\S\\\
\4      S\R6                  R6                  4 S jj5       r       S!S\S\\   S\\   S\\   S\\\      S\\\      S\\
   S\\
   4S jjr S"S\R"                  S\\\\   4   S\\\\   4   S\\\
\4      S\R"                  4
S jjrSr U =r!$ )#JanusImageProcessori  a
  
Constructs a JANUS image processor.

Args:
    do_resize (`bool`, *optional*, defaults to `True`):
        Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
        `do_resize` parameter in the `preprocess` method.
    size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`):
        Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
        method.
    min_size (`int`, *optional*, defaults to 14):
        The minimum allowed size for the resized image. Ensures that neither the height nor width
        falls below this value after resizing.
    resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
        Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
        overridden by the `resample` parameter in the `preprocess` method.
    do_rescale (`bool`, *optional*, defaults to `True`):
        Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
        `do_rescale` parameter in the `preprocess` method.
    rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
        Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
        overridden by the `rescale_factor` parameter in the `preprocess` method.
    do_normalize (`bool`, *optional*, defaults to `True`):
        Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
        method. Can be overridden by the `do_normalize` parameter in the `preprocess` method.
    image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
        Mean to use if normalizing the image. This is a float or list of floats the length of the number of
        channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be
        overridden by the `image_mean` parameter in the `preprocess` method.
    image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
        Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
        number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
        Can be overridden by the `image_std` parameter in the `preprocess` method.
    do_convert_rgb (`bool`, *optional*, defaults to `True`):
        Whether to convert the image to RGB.
    do_pad (`bool`, *optional*, defaults to `True`):
        Whether to pad the image to square or not.
TN   gp?	do_resizer   min_sizeresample
do_rescalerescale_factordo_normalize
image_mean	image_stddo_convert_rgbdo_padc                    > [         TU ]  " S0 UD6  Xl        X0l        Uc  SU l        g [        S U 5       5      U l        g )N)   rb  rb  c              3   >   #    U  H  n[        US -  5      v   M     g7f)   N)r   ).0xs     r^   	<genexpr>/JanusImageProcessor.__init__.<locals>.<genexpr>J  s     )K
1#a#g,,
s   rN   )rO   rP   r`  rX  background_colorrt  )r[   rW  r   rX  rY  rZ  r[  r\  r]  r^  r_  r`  r\   r]   s                r^   rP   JanusImageProcessor.__init__4  s@     	"6" $3D!$))K
)K$KD!r`   imageri  data_formatinput_data_formatr   c                 6   [        X5      u  pVU[        R                  :X  a  UR                  S   OUR                  S   nXV:X  a  Ub  [	        XU5      nU$ UnU$ [        XV5      n[        U[        5      (       a  U/nO[        U5      U:w  a  [        SU S35      eU[        R                  :X  av  [        R                  " XxU4UR                  S9n	[        U5       H  u  pXU
SS2SS24'   M     Xe:  a  X-
  S-  nXSS2XU-   2SS24'   U	$ X-
  S-  nXSS2SS2XU-   24'    U	$ [        R                  " XU4UR                  S9n	[        U5       H  u  pXSS2SS2U
4'   M     Xe:  a  X-
  S-  nXXU-   2SS2SS24'   U	$ X-
  S-  nXSS2XU-   2SS24'   U	$ )a  
Pads an image to a square based on the longest edge.

Args:
    image (`np.ndarray`):
        The image to pad.
    background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
        The color to use for the padding. Can be an integer for single channel or a
        tuple of integers representing for multi-channel images. If passed as integer
        in multi-channel mode, it will default to `0` in subsequent channels.
    data_format (`str` or `ChannelDimension`, *optional*):
        The channel dimension format for the output image. Can be one of:
            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        If unset, will use same as the input image.
    input_data_format (`str` or `ChannelDimension`, *optional*):
        The channel dimension format for the input image. Can be one of:
            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.

Returns:
    `np.ndarray`: The padded image.
r   r   Nz(background_color must have no more than z) elements to match the number of channelsr   r-   )r   r   FIRSTr   r   r6  r   r   rq  r   npr8  r   	enumerate)r[   rk  ri  rl  rm  r   r   rH   max_dimresultrM  colorstarts                r^   pad_to_square!JanusImageProcessor.pad_to_squareL  s   < 'u@):>N>T>T)Tu{{1~Z_ZeZefhZi? * ,E@QR 
 L  
 Lf$ &,, 01!"l2:<.Hqr   0 6 66XX|g>ekkRF%&67"'q!Qw 8~ )a/7<q%&.0!34  !Q.6;q!UU]223  XXw>ekkRF%&67"'q!Qw 8~ )a/7<uv~-q!34
  !Q.6;q%%-/23r`   c                 d   Uc  [        U5      n[        X5      u  px[        Xx5      n	[        USS9nUS   US   :w  a  [	        SUS    SUS    35      eUS   nX)-  n
[        [        Xz-  5      U R                  5      [        [        X-  5      U R                  5      /n[        U4UUUUS.UD6nU$ )a  
Resize an image to dynamically calculated size.

Args:
    image (`np.ndarray`):
        Image to resize.
    size (`dict[str, int]` or `int`):
        The size to resize the image to. If a dictionary, it should have the keys `"height"` and `"width"`.
    resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
        `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
    data_format (`ChannelDimension` or `str`, *optional*):
        The channel dimension format for the output image. If unset, the channel dimension format of the input
        image is used. Can be one of:
        - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
        - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        - `None`: will be inferred from input
    input_data_format (`ChannelDimension` or `str`, *optional*):
        The channel dimension format for the input image. If unset, the channel dimension format is inferred
        from the input image. Can be one of:
        - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
        - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

Returns:
    `np.ndarray`: The resized image.
Tdefault_to_squarer   r   z5Output height and width must be the same. Got height=z and width=)r   rY  rl  rm  )r   r   r6  r   r   r   rX  r   )r[   rk  r   rY  rl  rm  r\   r   r   max_sizedeltaoutput_size_nonpaddeds               r^   r   JanusImageProcessor.resize  s    F $ >u E&u@v%TT:>T']*GXGWWbcghocpbqr  H~ FN#T]]3EM"DMM2!

 
&#/
 
 r`   imagesreturn_tensorsc                    Ub  UOU R                   nUb  UOU R                  nUb  UOU R                  nUb  UOU R                  nUb  UOU R                  nUb  UOU R
                  nU	b  U	OU R                  n	Ub  UOU R                  nUb  UOU R                  nUb  UOU R                  nUb  UOU R                  n[        USS9nU R                  U5      n[        U5      n[        U5      (       d  [        S5      e[!        UUUUU	UUUS9  U(       a  U Vs/ s H  n[#        U5      PM     nnU Vs/ s H  n[%        U5      PM     nnU(       a(  ['        US   5      (       a  [(        R+                  S5        Uc  [-        US   5      nU(       a!  U Vs/ s H  nU R/                  UX4US9PM     nnU(       a!  U Vs/ s H  nU R1                  UUUS9PM     nnU(       a   U Vs/ s H  nU R3                  UXoS	9PM     nnU(       a!  U Vs/ s H  nU R5                  UXUS
9PM     nnU Vs/ s H  n[7        UXS9PM     nn[9        SU0U
S9nU$ s  snf s  snf s  snf s  snf s  snf s  snf s  snf )a  
Preprocess an image or batch of images.

Args:
    images (`ImageInput`):
        Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
        passing in images with pixel values between 0 and 1, set `do_rescale=False`.
    do_resize (`bool`, *optional*, defaults to `self.do_resize`):
        Whether to resize the image.
    size (`dict[str, int]`, *optional*, defaults to `self.size`):
        Controls the size of the image after `resize`. The shortest edge of the image is resized to
        `size["shortest_edge"]` whilst preserving the aspect ratio. If the longest edge of this resized image
        is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest
        edge equal to `int(size["shortest_edge"] * (1333 / 800))`.
    resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
        Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`.
    do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
        Whether to rescale the image values between [0 - 1].
    rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
        Rescale factor to rescale the image by if `do_rescale` is set to `True`.
    do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
        Whether to normalize the image.
    image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
        Image mean to normalize the image by if `do_normalize` is set to `True`.
    image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
        Image standard deviation to normalize the image by if `do_normalize` is set to `True`.
    do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
        Whether to convert the image to RGB.
    background_color (`tuple[int, int, int]`):
        The background color to use for the padding.
    do_pad (`bool`, *optional*, defaults to `self.do_pad`):
        Whether to pad the image to square or not.
    return_tensors (`str` or `TensorType`, *optional*):
        The type of tensors to return. Can be one of:
            - Unset: Return a list of `np.ndarray`.
            - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
            - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
            - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
            - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
    data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
        The channel dimension format for the output image. Can be one of:
        - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
        - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        - Unset: Use the channel dimension format of the input image.
    input_data_format (`ChannelDimension` or `str`, *optional*):
        The channel dimension format for the input image. If unset, the channel dimension format is inferred
        from the input image. Can be one of:
        - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
        - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
Fry  zkInvalid image type. Must be of type PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or jax.ndarray.)rZ  r[  r\  r]  r^  rW  r   rY  r   zIt looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.)rk  r   rY  rm  )rk  ri  rm  )rk  r   rm  rk  meanstdrm  input_channel_dimr   datatensor_type)rW  rY  rZ  r[  r\  r]  r^  r_  r`  ri  r   r   fetch_imagesr   r    r   r!   r   r   r   r   warning_oncer   r   rv  rescalerB  r   r   )r[   r  rW  r   rY  rZ  r[  r\  r]  r^  r  r_  ri  r`  rl  rm  rk  encoded_outputss                     r^   
preprocessJanusImageProcessor.preprocess  s   L "+!6IDNN	'38#-#9Zt
+9+E4K^K^'3'?|TEVEV#-#9Zt
!*!6IDNN	+9+E4K^K^!-4;;/?/K+QUQfQf'tTYYTU;""6*)&1F##: 
 	&!)%!		
 9?@nU+F@ 6<<VE.'V</&)44s
 $ >vay I $#E %dYjk#  
  $ $E ""%5&7 # 
 $    $#E 5d#  
  $#E U^op#   ou
ntej'{`nt 	 
 '^V,DR`ae A =

s*   I3II=I %I%I*-I/c	                 <   Ub  UOU R                   nUc  SU R                  -  OUnUb  UOU R                  nUb  UOU R                  nUb  UOU R                  n[        U5      n[        US   [        R                  R                  5      (       a  [        U5      S:  a  U$ US   $ Uc  [        US   5      n/ n	U H  n
[        U
5      n
U(       a  U R                  XXgS9n
U(       a?  U R                  XUS9n
U
R                  SS5      R                  [         R"                  5      n
U(       aE  U(       a>  US:X  a8  [%        U
[&        R(                  US	9n
[        R                  R+                  U
5      n
U	R-                  U
5        M     S
U	0nUS:w  a  UOSn[/        XS9$ )znApplies post-processing to the decoded image tokens by reversing transformations applied during preprocessing.Ng      ?r   ry   )rk  r]  r^  rm  )r   rm  rd  zPIL.Image.Imager  r   r  )rZ  r[  r\  r]  r^  r   r   PILImagerq  r   r   unnormalizer  clipastyperp  uint8r   r   LAST	fromarrayrw  r   )r[   r  rZ  r[  r\  r]  r^  rm  r  r   rk  r  s               r^   postprocessJanusImageProcessor.postprocesss  s    $.#9Zt
6D6Lt222R`'3'?|TEVEV#-#9Zt
!*!6IDNN	)&1fQi11 [1_6;&);$ >vay IE"5)E(() )  UTef

1c*11"((;
~AR/R3E;K;P;Pduv		++E2&! $ -+9=N+NTXBBr`   c                    Sn[        U[        5      (       a*  [        U5      U:w  a  [        SU S[        U5       35      eOU/U-  n[        U[        5      (       a*  [        U5      U:w  a  [        SU S[        U5       35      eOU/U-  n[	        S [        X#5       5       5      n[	        S U 5       5      nU R                  XXtS9nU$ )a  
Unnormalizes `image` using the mean and standard deviation specified by `mean` and `std`.
image = (image * image_std) + image_mean
Args:
    image (`torch.Tensor` of shape `(batch_size, num_channels, image_size, image_size)` or `(num_channels, image_size, image_size)`):
        Batch of pixel values to postprocess.
    image_mean (`float` or `Iterable[float]`):
        The mean to use for unnormalization.
    image_std (`float` or `Iterable[float]`):
        The standard deviation to use for unnormalization.
    input_data_format (`ChannelDimension` or `str`, *optional*):
        The channel dimension format for the input image. If unset, the channel dimension format is inferred
        from the input image. Can be one of:
        - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
        - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
r
   zmean must have z$ elements if it is an iterable, got zstd must have c              3   2   #    U  H  u  pU* U-  v   M     g 7fr  rN   )re  r  r  s      r^   rg  2JanusImageProcessor.unnormalize.<locals>.<genexpr>  s     W<Vytus{<Vs   c              3   ,   #    U  H
  nS U-  v   M     g7f)ry   NrN   )re  r  s     r^   rg  r    s     ;#a#gs   r  )r   r   rq  r   rt  ziprB  )r[   rk  r]  r^  rm  rH   rev_image_meanrev_image_stds           r^   r  JanusImageProcessor.unnormalize  s    0 j(++:,. ?<.@dehisetdu!vww / %4Ji**9~- >,?cdghqdrcs!tuu . #l2IWC
<VWW;;;-  
 r`   )ri  r`  rX  )r   NN)NNNNNNNr  )"rh   ri   rj   rk   rl   r   BICUBICr   r   r   strr   r   r   r   rP   rp  ndarrayrt  r   rv  r   r*   ro  r   r&   r  r  r  r  r   r  ro   rp   rq   s   @r^   rU  rU    s   %R )-'9'A'A,3!:>9=)-!%LL tCH~&L 	L
 %L L c5j)L L U5$u+#567L E%e"456L !L L L6 >?>BDHHzzH  U3S=%9 9:H eC)9$9:;	H
 $E#/?*?$@AH 
H\ (:'A'A>BDH?zz? DcNC'(? %	?
 eC)9$9:;? $E#/?*?$@A? 
?B %& %))-15%)*.'+:>9=;?)-GK!%(8(>(>DH!YY D>Y tCH~&	Y
 -.Y TNY !Y tnY U5$u+#567Y E%e"456Y !sJ!78Y !Y #5eCcM.B)B#CDY Y &Y  $E#/?*?$@A!Y" 
#Y 'Y| &**.'+,0+/+/(,1C1C TN1C !	1C
 tn1C T%[)1C DK(1C $C=1C !1Cp EI+zz+ %%01+ /0	+
 $E#/?*?$@A+ 
+ +r`   rU  )	rU  r   r  r  r  r%  rs   r@   r   )|r   collections.abcr   dataclassesr   typingr   r   r   numpyrp  r   torch.nn.functionalr   
functionalrA  torch.utils.checkpoint.transformers.models.blip.image_processing_blipr	   activationsr   cache_utilsr   configuration_utilsr   
generationr   r   r   r   generation.utilsr   image_processing_utilsr   r   image_transformsr   r   r   image_utilsr   r   r   r   r   r   r   r   r    r!   modeling_outputsr"   modeling_utilsr#   r$   processing_utilsr%   utilsr&   r'   r(   r)   r*   r+   r,   autor.   r/   r0   blip_2.modeling_blip_2r1   !chameleon.configuration_chameleonr2   chameleon.modeling_chameleonr3   r4   r5   r6   r7   idefics.modeling_ideficsr8   r9   llama.modeling_llamar:   siglip.configuration_siglipr;   siglip.modeling_siglipr<   r=   r>   r  
get_loggerrh   r   r@   rs   r   r   r   r   r   r   rx  r   r  r   r  r%  r*  r8  rK  rM  rO  rQ  ra  rl  r  r  r  r  r  r  rU  __all__rN   r`   r^   <module>r     s     $ ! , ,       M !   3 u u 9 A S S   , F &   9 8 5 D  e : < ^ ^ 			H	%
^1* ^1BW+ Wtk#" k#\ 
.? 
. 
. 
	7{ 	7 	7	#A 		"? 	2 "I$299 I$XRYY (*0 *p p2' 2BII $" = "*	< 		8 		B 	RYY  ,J!		 J!ZA		 AH-F -F`299 $RYY   
i
% i

i
Xt$$8/ t$n	E, EP
r`   