
    cCi                        S r SSKJr  SSKJrJrJrJr  SSKrSSK	J
s  Jr  SSKJ
r
  SSKJr  SSKJrJr  SS	KJr  SS
KJr  SSKJr  SSKJr  SSKJrJrJr  SSKJr  SSK J!r!J"r"J#r#J$r$  SSK%J&r&  SSK'J(r(J)r)  SSK*J+r+  SSK,J-r-  SSK.J/r/J0r0  \$Rb                  " \25      r3\\"" SS9 " S S\5      5       5       r4\\"" SS9 " S S\5      5       5       r5    S@S jr6/ 4S jr7 " S S \
Rp                  5      r9 " S! S"\
Rt                  5      r; " S# S$\
Rx                  5      r= " S% S&\R                  Rx                  5      r>S' r?SAS( jr@ " S) S*\
Rx                  5      rA SBS+\
Rx                  S,\R                  S-\R                  S.\R                  S/\\R                     S0\CS1\C4S2 jjrD " S3 S4\
Rx                  5      rE " S5 S6\5      rF " S7 S8\5      rG\" " S9 S:\5      5       rH\" " S; S<\H5      5       rI " S= S>\H\5      rJ/ S?QrKg)CzPyTorch Idefics model.    )	dataclass)AnyCallableOptionalUnionN)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask)GradientCheckpointingLayer)ModelOutput)ALL_ATTENTION_FUNCTIONSPretrainedConfigPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg)OutputRecordercheck_model_inputs   )IdeficsConfig)IdeficsPerceiverResampler)IdeficsVisionEmbeddingsIdeficsVisionTransformerz{
    Base class for Idefics model's outputs that may also contain a past key/values (to speed up sequential decoding).
    )custom_introc                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\   \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   S	rg)
IdeficsBaseModelOutputWithPast0   aa  
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
    Sequence of hidden-states at the output of the last layer of the model.

    If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
    hidden_size)` is output.
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
    `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
    input) to speed up sequential decoding.
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
    Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
    sequence_length, hidden_size)`.

    image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
Nlast_hidden_statepast_key_valueshidden_states
attentionsimage_hidden_states )__name__
__module____qualname____firstlineno____doc__r%   r   torchFloatTensor__annotations__r&   r   r'   tupler(   r)   __static_attributes__r*       f/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/idefics/modeling_idefics.pyr#   r#   0   s|    & 6:x 1 129'+OXe_+8<M8E%"3"345<59Ju00129>B%(9(9":;Br5   r#   zS
    Base class for Idefics causal language model (or autoregressive) outputs.
    c                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\   \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S	'   S
rg)IdeficsCausalLMOutputWithPastQ   a1  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
    Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
    sequence_length, hidden_size)`.

    image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
Nlosslogitsr&   r'   r(   r)   r*   )r+   r,   r-   r.   r/   r:   r   r0   r1   r2   r;   r&   r   r'   r3   r(   r)   r4   r*   r5   r6   r8   r8   Q   s    " )-D(5$$
%,*.FHU&&'.'+OXe_+8<M8E%"3"345<59Ju00129>B%(9(9":;Br5   r8   c                    [         R                  " U R                  S   5      R                  SS5      R	                  SU5      R                  S5      R                  U R                  5      nU R                  SU5      n UR                  S5      US'   UR                  S5      US'   UR                  S5      US'   UR                  S5      US'   SU;   a  US   nUR                  SU5      US'   Ub  UR                  SU5      US	'   US   b  US   R                  SU5      US'   US   b  US   R                  SU5      US'   X4$ US   b  US   R                  SU5      US'   X4$ US   b  US   R                  SU5      US'   X4$ )
Nr   r   pixel_valuesimage_encoder_embeddingsperceiver_embeddingsimage_attention_masktoken_type_idsattention_mask)	r0   arangeshapeviewrepeattodeviceindex_selectget)	input_idsexpand_sizeis_encoder_decoderrC   encoder_outputsmodel_kwargsexpanded_return_idxrB   s           r6   expand_inputs_for_generationrR   q   s    	Y__Q'(--b!4;;A{KPPQSTWWXaXhXhi  &&q*=>I#/#3#3N#CL /;/?/?@Z/[L+,+7+;+;<R+SL'(+7+;+;<R+SL'(<'%&67)7)D)DQH[)\%&!)7)D)DQH[)\%&*+7/;<R/S/`/`"0
+, N#/'3N'C'P'PQRTg'h^$ "" 
0	1	=3?@Z3[3h3h"4
/0 "" 
,	-	9/;<R/S/`/`"0
+, ""r5   c                 R  ^ [         R                  [         R                  [         R                  S.nU Vs/ s H  o2U   PM	     nnU R	                  5        HH  mU(       a-  [        U4S jU 5       5      (       a  TR                  S5        M7  TR                  S5        MJ     U $ s  snf )N)	LayerNormLinear	Embeddingc              3   <   >#    U  H  n[        TU5      v   M     g 7fN)
isinstance).0tmodules     r6   	<genexpr>freeze_model.<locals>.<genexpr>   s     $]D\qZ%:%:D\s   TF)r   rT   rU   rV   modulesanyrequires_grad_)modelmodule_exceptionsmappingmmodule_exceptions_mappedr\   s        @r6   freeze_modelrg      s    \\))\\G
 5FF4Eq
4EF--/$]D\$]!]!]!!$'!!%(	 "
 L  Gs   B$c                   ^   ^  \ rS rSrSr    S
S\\   SS4U 4S jjjrS rS\	4S jr
S	rU =r$ )IdeficsDecoupledEmbedding   a  
Implements a decoupling of parameters to allow freezing (or not) a subset of the embeddings. In practise, the
regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `num_additional_embeddings` > 0,
then it will create `num_additional_embeddings` additional parameters that are always trained. If
`num_additional_embeddings=0`, then the module defaults back to the regular behavior of `nn.Embedding`.
Npartially_freezereturnc           	      F  > Ub  Xq:  a  [        SU SU 35      e[        T	U ]  " SUUUUUS.UD6  Xl        Xpl        X l        X@l        U(       a  U R                  R                  S5        U R
                  S:  a'  [        R                  " U R
                  UUUS9U l        gg)	a  
Args:
    num_embeddings (`int`):
        Size of the dictionary of embeddings
    num_additional_embeddings (`int`):
        Number of additional embeddings. Only useful when you `partially_freeze=True`.
    embedding_dim (`int`):
        The size of each embedding vector
    partially_freeze: (`bool`, *optional*, defaults to `False`):
        If `True`, the regular `weight` will be frozen. `additional_weight` is never frozen.
    padding_idx (`int`, *optional*):
        The padding index (needs to be less than num_embeddings)

Note: there are a lot of other parameters to initialize a standard `nn.Embedding` such as `padding_idx`,
`max_norm` or `norm_type`. We are not supporting these.
Nz/padding_idx must be within num_embeddings. Got z and )num_embeddingsembedding_dimrI   dtypepadding_idxFr   )rn   ro   rI   rp   r*   )
ValueErrorsuper__init__rn   rq   num_additional_embeddingsrk   weightra   r   rV   additional_embedding)
selfrn   ru   ro   rk   rI   rp   rq   kwargs	__class__s
            r6   rt   "IdeficsDecoupledEmbedding.__init__   s    6 "{'CN{m[`ao`pqrr 	
)'#	
 	
 -&)B& 0KK&&u-))A-(*#==+	)D% .r5   c                 \   U R                   S:X  a   [        R                  " XR                  5      $ UR	                  5       n[
        R                  " XR                  :  5      nX   nU R                  X0R                  -
  5      nSX'   [        R                  " XR                  5      nXEU'   U$ )a{  
we have 2 embeddings, with different indices - one pretrained self.weight and another
self.additional_embedding.weight that is being trained.

in order to make a lookup of the input ids, we:
1. find out the indices of the entries belonging to the 2nd embedding
2. extract those values while subtracting the size of the first embedding (num_embeddings), since the 2nd
   embedding starts from 0 and not num_embeddings
3. perform the 2nd embedding lookup
4. now we handle the 1st embedding, we overwrite indices belonging to the 2nd embedding with a padding index
5. perform the 1st embedding lookup
6. now we overwrite the values in the 1st embedding lookup with the values of the 2nd embedding lookup

note: for the 1st embedding lookup we could have looked up only the low indices and not do the padding, but
then we have to create a new tensor and populate it with 2 tensors that are spread out across various indices -
i.e. not a simple concat - I haven't benchmarked the complex case if it's any faster, given that seqlens are
usually relatively short it's probably not faster or if faster not by much - but might be a good idea to
measure.

r   )	ru   F	embeddingrv   cloner0   wherern   rw   )rx   rL   additional_vocab_indicesinput_ids_additional_vocabadditional_embeddingsfull_vectors         r6   forward!IdeficsDecoupledEmbedding.forward   s    * ))Q.;;y++66 OO%	#(;;y<O<O/O#P %.%H" $ 9 9:TWjWj:j k /0	+kk)[[9 1F,-r5   c                 n    SU R                    SU R                   SU R                   SU R                   3$ )Nznum_embeddings=z, num_additional_embeddings=z, embedding_dim=, partially_freeze=)rn   ru   ro   rk   rx   s    r6   
extra_repr$IdeficsDecoupledEmbedding.extra_repr  sq     !4!4 55QRVRpRpQq  rB  CG  CU  CU  BV  Vi  jn  j  j  i@  A  	Ar5   )rw   ru   rn   rq   rk   )FNNN)r+   r,   r-   r.   r/   r   boolrt   r   strr   r4   __classcell__rz   s   @r6   ri   ri      sS     ,13
 #4.3 
3 3j%NAC A Ar5   ri   c                      ^  \ rS rSrSr     SS\S\S\S\S\S	S4U 4S
 jjjrS\R                  S	\R                  4S jr
S	\4S jrSrU =r$ )IdeficsDecoupledLineari  a  
Implements a decoupling of parameters to allow freezing (or not) a subset of the parameters. In practise, the
regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `out_additional_features` > 0,
then it will create `out_additional_features * in_features` additional parameters that are always trained. If
`out_additional_features=0`, then the module defaults back to the regular behavior of `nn.Linear`.
Nin_featuresout_featuresout_additional_featuresbiasrk   rl   c                 (  > [         TU ]  XXFU5        X0l        XPl        Xl        X l        U(       a=  U R                  R                  S5        U(       a  U R                  R                  S5        US:  a  [        R                  " UUUUUS9U l        gg)a'  
out_additional_features: int. Number of additional trainable dimensions. Only makes sense when
`partially_freeze=True`. partially_freeze: bool. If True, the regular `weight` will be frozen and extra
parameters (if any) will be trainable. If False, default to the regular behavior of nn.Linear.
Fr   )r   r   r   rI   rp   N)rs   rt   r   rk   r   r   rv   ra   r   r   rU   additional_fc)	rx   r   r   r   r   rk   rI   rp   rz   s	           r6   rt   IdeficsDecoupledLinear.__init__  s     	D%H'>$ 0&(KK&&u-		((/"Q&!#'4"D 'r5   inputc                     [         R                  " XR                  U R                  5      nU R                  S:  a)  U R                  U5      n[        R                  " X#4S5      nU$ )Nr   r=   )r}   linearrv   r   r   r   r0   cat)rx   r   outputadditional_featuress       r6   r   IdeficsDecoupledLinear.forwardC  sQ    %dii8''!+"&"4"4U";YY<bAFr5   c           
          SU R                    SU R                   SU R                   SU R                  SL SU R                   3
$ )z=Overwriting `nn.Linear.extra_repr` to include new parameters.zin_features=z, out_features=z, out_additional_features=z, bias=Nr   r   r   r   r   rk   r   s    r6   r   !IdeficsDecoupledLinear.extra_reprL  s    d../t?P?P>QQklp  mI  mI  lJ  JQ  RV  R[  R[  cg  Rg  Qh  h{  |@  |Q  |Q  {R  S  	Sr5   )r   r   r   r   rk   )r   TTNN)r+   r,   r-   r.   r/   intr   rt   r0   Tensorr   r   r   r4   r   r   s   @r6   r   r     s     ()!%"" " "%	"
 " " 
" "HU\\ ell SC S Sr5   r   c                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )IdeficsRMSNormiR  c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z-
IdeficsRMSNorm is equivalent to T5LayerNorm
N)rs   rt   r   	Parameterr0   onesrv   variance_epsilon)rx   hidden_sizeepsrz   s      r6   rt   IdeficsRMSNorm.__init__S  s/     	ll5::k#:; #r5   c                    UR                  [        R                  5      R                  S5      R	                  SSS9nU[        R
                  " X R                  -   5      -  nU R                  R                  [        R                  [        R                  4;   a%  UR                  U R                  R                  5      nU R                  U-  $ )N   r=   T)keepdim)rH   r0   float32powmeanrsqrtr   rv   rp   float16bfloat16)rx   r'   variances      r6   r   IdeficsRMSNorm.forward[  s     ##EMM266q9>>r4>P%H?T?T4T(UU ;; ??),,T[[->->?M{{]**r5   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)r3   rv   rE   r   r   s    r6   r   IdeficsRMSNorm.extra_repre  s*    ))*+6$2G2G1HIIr5   )r   rv   )gư>)	r+   r,   r-   r.   rt   r   r   r4   r   r   s   @r6   r   r   R  s    $+J Jr5   r   c                   <   ^  \ rS rSrSU 4S jjrS rSS jrSrU =r$ )IdeficsEmbeddingij  c           	        > [         TU ]  5         Xl        X l        X0l        SU R                  [
        R                  " SU R                  S[
        R                  S9R                  U[
        R                  S9U R                  -  -  -  nU R                  SUSS9  U R                  X R                  R                  [
        R                  " 5       S	9  g )
N      ?r   r   rp   rI   rp   inv_freqF
persistentseq_lenrI   rp   )rs   rt   dimmax_position_embeddingsbaser0   rD   int64rH   floatregister_buffer_set_cos_sin_cacher   rI   get_default_dtype)rx   r   r   r   rI   r   rz   s         r6   rt   IdeficsEmbedding.__init__k  s    '>$	IIQ!5;;?BB&X]XcXcBdgkgogooq
 	ZeD 	+MM4H4HPUPgPgPi 	  	
r5   c                    Xl         [        R                  " U R                   U[        R                  S9R	                  U R
                  5      n[        R                  " SX@R
                  5      n[        R                  " XU4SS9nU R                  SUR                  5       R                  U5      SS9  U R                  SUR                  5       R                  U5      SS9  g )	Nr   zi,j->ijr=   r   
cos_cachedFr   
sin_cached)max_seq_len_cachedr0   rD   r   type_asr   einsumr   r   cosrH   sin)rx   r   rI   rp   r[   freqsembs          r6   r   #IdeficsEmbedding._set_cos_sin_cache|  s    ")LL00u{{S[[\`\i\ijY==9iiB/\3779<<+>5Q\3779<<+>5Qr5   c                     X R                   :  a$  U R                  X!R                  UR                  S9  U R                  S U R                  UR                  S9U R                  S U R                  UR                  S94$ )Nr   r   )r   r   rI   rp   r   rH   r   )rx   xr   s      r6   r   IdeficsEmbedding.forward  su    ,,,##GHHAGG#T OOHW%((qww(7OOHW%((qww(7
 	
r5   )r   r   r   r   )i   i'  NrX   )	r+   r,   r-   r.   rt   r   r   r4   r   r   s   @r6   r   r   j  s    
"R
 
r5   r   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr=   r   r   )rE   r0   r   )r   x1x2s      r6   rotate_halfr     sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r5   c                     X$   R                  U5      nX4   R                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXg4$ )a&  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`):
        The position indices of the tokens corresponding to the query and key tensors. For example, this can be
        used to pass offsetted position ids when working with a KV-cache.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezer   )qkr   r   position_idsunsqueeze_dimq_embedk_embeds           r6   apply_rotary_pos_embr     s]    * 

%
%m
4C


%
%m
4Cw;q>C/0Gw;q>C/0Gr5   c                   >   ^  \ rS rSrS\S\S\4U 4S jjrS rSrU =r	$ )
IdeficsMLPi  r   intermediate_size
hidden_actc                    > [         TU ]  5         [        R                  " XSS9U l        [        R                  " X!SS9U l        [        R                  " XSS9U l        [        U   U l        g )NFr   )	rs   rt   r   rU   	gate_proj	down_projup_projr
   act_fn)rx   r   r   r   rz   s       r6   rt   IdeficsMLP.__init__  sS     	;N#4NyyeLZ(r5   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      $ rX   )r   r   r   r   )rx   r   s     r6   r   IdeficsMLP.forward  s0    ~~dkk$..*;<t||ANOOr5   )r   r   r   r   )
r+   r,   r-   r.   r   r   rt   r   r4   r   r   s   @r6   r   r     s0    
)
) 
) 	
)P Pr5   r   r\   querykeyvaluerC   scalingdropoutc                    [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr=   )r   rp   ptrainingr   r   )r0   matmul	transposer   
functionalsoftmaxr   rH   rp   r   r  
contiguous)
r\   r   r   r   rC   r   r   ry   attn_weightsattn_outputs
             r6   eager_attention_forwardr	    s     <<}}R'<=GL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r5   c                     ^  \ rS rSrSr     SS\S\S\S\S\\	   S\S	\\   4U 4S
 jjjr
S\R                  S\S\4S jr\" SSSS9     SS\R                  S\\R                     S\\R                     S\\R                     S\\   S\\R                     S\\   S\\R                  \R                  4   4S jj5       rSrU =r$ )IdeficsAttentioni  z=Multi-headed attention from 'Attention Is All You Need' paperr   	num_headsr   is_cross_attentionconfigqk_layer_norms	layer_idxc                   > [         T	U ]  5         XPl        Xl        X l        X-  U l        X0l        SU l        U R
                  S-  U l        Xpl	        Uc-  [        R                  SU R                  R                   S35        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eX@l        [!        ["        R$                  S5      (       d  [        S	5      eU R                  (       a  [!        UR&                  S
5      (       d  U R                  OUR&                  R(                  n["        R*                  " U R                  X R
                  -  SS9U l        ["        R*                  " XU R
                  -  SS9U l        ["        R*                  " UX R
                  -  SS9U l        O["        R*                  " U R                  X R
                  -  SS9U l        ["        R*                  " U R                  X R
                  -  SS9U l        ["        R*                  " U R                  X R
                  -  SS9U l        ["        R*                  " X R
                  -  USS9U l        [5        U R
                  5      U l        X`l        U R8                  (       aG  [;        U R
                  UR<                  S9U l        [;        U R
                  UR<                  S9U l         g g )NTg      zInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.z?hidden_size must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).scaled_dot_product_attentionz)this model requires pytorch 2.0 or higher	embed_dimFr   r   )!rs   rt   r  r   r  head_dimr   	is_causalr   r  loggerwarning_oncerz   r+   rr   r  hasattrr   r  vision_configr  rU   q_projk_projv_projo_projr   
rotary_embr  r   rms_norm_epsq_layer_normk_layer_norm)
rx   r   r  r   r  r  r  r  kv_input_dimrz   s
            r6   rt   IdeficsAttention.__init__  st    	&"#0}}d*" !8!8 9 :, , MMI%$*:*::QRVRbRbQc$YKr3 
 #5r}}&DEEHII""(/0D0Dk(R(R  X^XlXlXvXv  ))  MM)DK
 ))Ldmm2KRWXDK))MM)DK ))  MM)DK
 ))  MM)DK
 ))  MM)DK
 ii%

 +4==9, .t}}&BUBU VD .t}}&BUBU VD r5   tensorr   bszc                     UR                  X2U R                  U R                  5      R                  SS5      R	                  5       $ )Nr   r   )rF   r  r  r  r  )rx   r%  r   r&  s       r6   _shapeIdeficsAttention._shape3  s5    {{3GQQRSUVWbbddr5   past_key_valuer&   4.58new_nameversionr'   key_value_statesrC   r   cache_positionry   rl   c                    U R                   =(       d    US LnUR                  5       u  pnU R                  U5      R                  XU R                  U R
                  5      R                  SS5      nU(       d  U R                  U5      R                  XU R                  U R
                  5      R                  SS5      nU R                  U5      R                  XU R                  U R
                  5      R                  SS5      nOUR                  5       u  pnU R                  U5      R                  XU R                  U R
                  5      R                  SS5      nU R                  U5      R                  XU R                  U R
                  5      R                  SS5      nUR                  S   nUb  UUS   -  nU(       d-  U R                  U[        UU
5      S9u  nn[        XUUU5      u  pUb#  SU0nUR                  XU R                  U5      u  pU R                  (       a"  U R!                  U5      nU R#                  U5      n[$        nU R&                  R(                  S:w  a  [*        U R&                  R(                     nU" U UUUU4U R,                  (       d  SOU R.                  U R0                  S	.UD6u  nnUR3                  XS
5      R5                  5       nU R7                  U5      nUU4$ )Nr   r   r   r   )r   r0  eager        )r   r   r=   )r  sizer  rF   r  r  r  r  r  rE   r  maxr   updater  r  r!  r"  r	  r  _attn_implementationr   r  r   r   reshaper  r  )rx   r'   r/  rC   r   r&   r0  ry   r  r&  q_len_query_states
key_statesvalue_stateskv_len
kv_seq_lenr   r   cache_kwargsattention_interfacer  r  s                          r6   r   IdeficsAttention.forward6  s    "44T8HPT8T%**,A{{=166s4>>SWS`S`akklmopq!]388T^^UYUbUbcmmnoqrsJ;;}5::3t~~W[WdWdeoopqstuL+002LAq%56;;CY]YfYfgqqrsuvwJ,-223PTP]P]^hhijlmn   %%b)
&.++J!|SU=STHC';LVY[^`l'm$L &,n=L'6'='=jX\XfXfht'u$J,,\:L**:6J(?;;++w6"9$++:Z:Z"[$7	%
  $}}C$,,LL	%
 	%
!\ "))#b9DDFkk+.L((r5   )r  r   r  r   r  r  r"  r  r  r  r  r!  r  r  r  r   r  )r3  FNFNNNNNN)r+   r,   r-   r.   r/   r   r   r   r   r   rt   r0   r   r(  r   
LongTensorr   r   r   r3   r   r4   r   r   s   @r6   r  r    sk   G #(-1$#'OWOW OW 	OW
 !OW )*OW OW C=OW OWbeU\\ eC ec e %0A6R 481537+/59?)||?) #5<<0?) !.	?)
 u//0?) "%?) !!1!12?) +,?) 
u||U\\)	*?) S?)r5   r  c                     ^  \ rS rSrSS\S\\   4U 4S jjjr\" SSSS9\	    SS	\
R                  S
\\
R                     S\\
R                     S\\   S\\
R                     S\\   S\
R                   4S jj5       5       rSrU =r$ )IdeficsDecoderLayeriz  r  r  c                   > [         TU ]  5         UR                  U l        [        U R                  UR                  UR
                  UUS9U l        [        U R                  UR                  UR                  S9U l
        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        UR
                  U l        g )N)r   r  r   r  r  r   r   r   r  )rs   rt   r   r  num_attention_headsr   	self_attnr   r   r   mlpr   r   input_layernormpost_attention_layernormrx   r  r  rz   s      r6   rt   IdeficsDecoderLayer.__init__{  s    !--)((00NN
 (($66((

  .f.@.@fFYFYZ(6v7I7IvObOb(c%~~r5   r*  r&   r+  r,  r'   rC   r   r0  ry   rl   c           	         UnU R                  U5      nU R                  " SUUUUUS.UD6u  p[        R                  R	                  XR                  U R
                  S9nXq-   nUnU R                  U5      nU R                  U5      n[        R                  R	                  XR                  U R
                  S9nXq-   nU$ )N)r'   rC   r   r&   r0  r   r*   )rL  rJ  r   r  r   r  rM  rK  )	rx   r'   rC   r   r&   r0  ry   residualr:  s	            r6   r   IdeficsDecoderLayer.forward  s     !,,];  >> 
')%+)
 
 --m||VZVcVc-d 0 !55mD/--m||VZVcVc-d 0r5   )r   r   rL  rK  rM  rJ  rX   )NNNN)r+   r,   r-   r.   r   r   r   rt   r   r   r0   r   rD  r   r   r   r1   r   r4   r   r   s   @r6   rF  rF  z  s    &} &# & && %0A6R 2637+/59 ||  !.  u//0	 
 "%  !!1!12  +,  
		   S r5   rF  c                   2  ^  \ rS rSrSS\S\\   4U 4S jjjr\" SSSS9\	     SS	\
R                  S
\\
R                     S\\
R                     S\\
R                     S\\
R                     S\\   S\\   S\
R                  4S jj5       5       rSrU =r$ )IdeficsGatedCrossAttentionLayeri  r  r  c           
      	  > [         TU ]  5         UR                  U l        [        U R                  UR                  SUR
                  UUR                  US9U l        [        U R                  UR                  UR                  S9U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        UR
                  U l        ["        R$                  " 5       U l        ["        R$                  " 5       U l        UR*                  S:X  Ga  UR,                  S:X  ax  ["        R.                  " [0        R2                  " SSU R                  5      5      U l        ["        R.                  " [0        R2                  " SSU R                  5      5      U l        GOUR,                  S:X  a`  ["        R.                  " [0        R2                  " S5      5      U l        ["        R.                  " [0        R2                  " S5      5      U l        GO[9        S	UR,                   S
35      eUR*                  S:X  Ga  UR,                  S:X  ax  ["        R.                  " [0        R:                  " SSU R                  5      5      U l        ["        R.                  " [0        R:                  " SSU R                  5      5      U l        GOUR,                  S:X  a`  ["        R.                  " [0        R:                  " S5      5      U l        ["        R.                  " [0        R:                  " S5      5      U l        GO|[9        S	UR,                   S
35      eUR*                  S;   Ga9  UR,                  S:X  a  ["        R.                  " [0        R<                  " SUR>                  SSU R                  4S95      U l        ["        R.                  " [0        R<                  " SUR>                  SSU R                  4S95      U l        OUR,                  S:X  as  ["        R.                  " [0        R<                  " SUR>                  SS95      U l        ["        R.                  " [0        R<                  " SUR>                  SS95      U l        O2[9        S	UR,                   S
35      e[A        SUR*                   S35      e[C        U S5      (       a  [C        U S5      (       d  [9        S5      eg )NT)r   r  r  r   r  r  r  rH  r  zerosvectorr   r   z Unknown value for `alpha_type` ()r   >   normalrandomgaussianr3  )r   stdr4  zAlpha initialization scheme z not yet implemented!alpha_cross_attnalpha_densez+Alpha parameters not initialized correctly!)"rs   rt   r   r  rI  r   r  
cross_attnr   r   r   rK  r   r   rL  rM  r  r   Tanhact_cross_attn	act_densealpha_initializer
alpha_typer   r0   rV  r]  r^  rr   r   rY  alphas_initializer_rangeNotImplementedErrorr  rN  s      r6   rt   (IdeficsGatedCrossAttentionLayer.__init__  s   !--*((00#NN!00
 (($66((

  .f.@.@fFYFYZ(6v7I7IvObOb(c%nn ggi##w.  H,(*U[[AtGWGW5X(Y%#%<<Aq$BRBR0S#T ""g-(*U[[^(D%#%<<A#?  #CFDUDUCVVW!XYY%%/  H,(*UZZ1dFVFV5W(X%#%<<

1aAQAQ0R#S ""g-(*UZZ](C%#%<<

1#>  #CFDUDUCVVW!XYY%%)II  H,(*LLcv/N/NVWYZ\`\l\lUmn)% $&<<LLcv/N/NVWYZ\`\l\lUmn$  ""g-(*LLcv/N/NVWY)% $&<<#6KjKjrs0u#v  #CFDUDUCVVW!XYY &(DVE]E]D^^s&tuu011gdM6R6RJKK 7Sr5   r*  r&   r+  r,  r'   rC   r)   rA   cross_attention_gatery   rl   c                 ~   Uc  [        S5      eUc  [        S5      eUb  [        S5      eUnU R                  U5      nU R                  " S	UUUS.UD6u  p[        R
                  R                  XR                  U R                  S9nUR                  US:H  SS2SS2S4   S5      nXR                  U R                  5      U-  -   nUnU R                  U5      nU R                  U5      n[        R
                  R                  XR                  U R                  S9nXR                  U R                  5      U-  -   nU$ )
a  
image_hidden_states (`torch.FloatTensor`):
    Input to the layer of shape `(batch, seq_len, embed_dim)`
image_attention_mask (`torch.FloatTensor`, *optional*):
    image attention mask of size
    `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
cross_attention_gate (`torch.FloatTensor`, *optional*):
    gate of size `(batch, seq_len)` used to zero-out cross-attention output for tokens attending no images.
Nzt`image_hidden_states` is required for Idefics cross attention module which are visual features to be conditioned on.z`cross_attention_gate` is required for Idefics cross attention module to zero-out the cross-attention hidden_states attending to no images.zMPast key value states are not implemented for Idefics cross attention module.)r'   r/  rC   r   r   r3  r*   )rr   rf  rL  r_  r   r  r   r  r  masked_fillra  r]  rM  rK  rb  r^  )
rx   r'   rC   r)   rA   rh  r&   ry   rQ  r:  s
             r6   r   'IdeficsGatedCrossAttentionLayer.forward  sa   * &# 
  ' ^  &%&uvv ,,];  ?? 
'0/
 	
 --m{{UYUbUb-c%113G13LaQRTXj2Y[^_ #6#6t7L7L#MP]#]] !55mD/--m{{UYUbUb-c >>$2B2B#Cm#SSr5   )
ra  rb  r]  r^  r  r_  r   rL  rK  rM  rX   rC  )r+   r,   r-   r.   r   r   r   rt   r   r   r0   r   r   r   r   r1   r   r4   r   r   s   @r6   rT  rT    s    @L} @L# @L @LD %0A6R 266:7;7;+/8||8 !.8 &ell3	8
 'u||48 'u||48 "%8 +,8 
		8  S8r5   rT  c                   \    \ rS rSr% \\S'   SrSrSS/rSr	Sr
SrSr\\" \SS	S
9S.rS rSrg)IdeficsPreTrainedModeli3  r  rb   TrF  rT  Fr   rJ  )index
layer_name)r'   r(   c                 F   U R                   R                  n[        U[        R                  [        R
                  45      (       aW  UR                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        R                  5      (       ad  UR                  R                  R                  SUS9  UR                  b2  UR                  R                  UR                     R                  5         g g [        U[        R                  5      (       aJ  UR                  R                  R                  S5        UR                  R                  R                  5         g [        U[        5      (       a&  UR                  R                  R                  S5        g [        U[         5      (       a%  UR"                  R                  R                  5         g [        U[$        5      (       GaT  U R                   R&                  S:X  aI  UR(                  R                  R                  5         UR*                  R                  R                  5         g U R                   R&                  S:X  aK  UR(                  R                  R                  S5        UR*                  R                  R                  S5        g U R                   R&                  S;   aq  UR(                  R                  R                  SU R                   R,                  S9  UR*                  R                  R                  SU R                   R,                  S9  g g [        U[.        5      (       a%  UR0                  R                  R                  5         g g )Nr3  )r   r\  r   rV  r   >   rY  rZ  r[  )r  initializer_rangerY   r   rU   Conv2drv   datanormal_r   zero_rV   rq   rT   fill_r   r   class_embeddingrT  rc  r]  r^  re  r   latents)rx   r\   r\  s      r6   _init_weights$IdeficsPreTrainedModel._init_weightsD  s    kk++fryy"))455MM&&CS&9{{&  &&( '--MM&&CS&9!!-""6#5#56<<> .--MM$$S)KK""$//MM$$S) 788""''//1 ?@@{{,,7'',,224""''--/..&8'',,2237""''--c2..2RR'',,44#4;;CgCg4h""''//Sdkk>b>b/c S  9::NN'') ;r5   r*   N)r+   r,   r-   r.   r   r2   base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_can_compile_fullgraph_supports_attention_backendrF  r   r  _can_record_outputsry  r4   r*   r5   r6   rm  rm  3  sV    &*#.0QRN ""& -$%5Q;W
*r5   rm  c            !         ^  \ rS rSrSrS\4U 4S jjrSS jr/ 4S jr/ 4S jr	\
" 5       \            SS\\R                     S	\\R                     S
\\R                     S\\   S\\R"                     S\\R"                     S\\R"                     S\\R"                     S\\R                     S\\   S\\   S\\R                     S\\   S\\\4   4S jj5       5       rSrU =r$ )IdeficsModelif  z
Transformer decoder consisting of `config.num_hidden_layers` layers. Each layer is a [`IdeficsDecoderLayer`]

Args:
    config: IdeficsConfig
r  c           
        > [         TU ]  U5        Xl        UR                  U l        UR
                  U l        [        UR
                  UR                  UR                  UR                  U R                  S9U l
        UR                  R                  U l        UR                  U l        UR                  U R                  l        [        UR                  5      U l        UR                   (       a]  UR"                  n[%        UUR                  R&                  UR(                  UR*                  UR,                  UR.                  5      U l        [2        R4                  " [7        UR8                  5       Vs/ s H  n[;        XS9PM     sn5      U l        UR>                  U l        UR8                  U R>                  -  n[2        R4                  " [7        U5       Vs/ s H  n[A        XS9PM     sn5      U l!        SU l"        [G        UR                  URH                  S9U l%        U RM                  5         U RO                  U5        g s  snf s  snf )N)rn   ru   ro   rk   rq   )r  Fr  )(rs   rt   r  pad_token_idrq   
vocab_sizeri   additional_vocab_sizer   freeze_text_layersembed_tokensr  
image_sizer7  r    vision_modeluse_resamplerperceiver_configr   r  resampler_depthresampler_n_headsresampler_head_dimresampler_n_latentsperceiver_resamplerr   
ModuleListrangenum_hidden_layersrF  layerscross_layer_intervalrT  gated_cross_attn_layersgradient_checkpointingr   r   norm	post_initfreeze_relevant_params)rx   r  r  inum_cross_layersrz   s        r6   rt   IdeficsModel.__init__o  s    !.. ++5!,,&,&B&B ,,#66((
 !..99#11282M2M/4V5I5IJ %66'@$$.. 00 22 33 44(D$ mm?DVE]E]?^_?^! 5?^_
 %+$?$?!!33t7P7PP')}}KPQaKbcKba,VAKbc(
$ ',#"6#5#56;N;NO	 	##F+ ` ds   3II	c                     Uc  U R                   nUR                  (       a  U R                  UR                  5        UR                  (       a  [	        U R
                  UR                  S9  g g N)rc   )r  r  freeze_text_module_exceptionsfreeze_vision_layersrg   r  freeze_vision_module_exceptions)rx   r  s     r6   r  #IdeficsModel.freeze_relevant_params  sQ    >[[F$$##F$H$HI&&**f>d>de 'r5   c                 T    U R                   U R                  4 H  n[        X!S9  M     g r  )r  r  rg   )rx   rc   r\   s      r6   r  IdeficsModel.freeze_text_layers  s!    {{DII.FE /r5   c                 ,    [        U R                  US9  g r  )rg   r  )rx   rc   s     r6   r  !IdeficsModel.freeze_vision_layers  s    T&&:KLr5   rL   rC   r   r&   inputs_embedsr>   r?   r@   rA   	use_cacheinterpolate_pos_encodingr0  ry   rl   c           	         Ub  UR                   OUR                   nUSL USL-  (       a  [        S5      eUc  U R                  U5      nU
(       a  Uc  [        U R                  S9nUR
                  u  nnnUb  UR                  5       OSnUU-   nUc0  [        R                  " UUUR
                  S   -   UR                   S9nUbG  UcD  UR                  5       R                  S5      S-
  nUR                  US:H  S5        USS2U* S24   nOUc  UR                  S5      n[        S XgU4 5       5      S	:w  a  [        S
5      eUbw  UR                  U R                  US9nUR
                  SS	 u  nnUR!                  5       R"                  " UU-  /UR
                  S	S Q76 nU R%                  XkS9R&                  nOHUbE  UR)                  5       u  nnnnUR                  U R                  US9nUR#                  UU-  UU5      nU R                  R*                  (       aO  Uc4  U R-                  W5      nUR)                  S5      UR)                  S	5      nnOUR)                  5       u  nnnnUnO1Uc#  WR)                  S5      UR)                  S	5      nnO[        S5      eUR#                  UWU-  U5      nU	R)                  S5      nU	R                  S5      n	U	R/                  SSSU5      n	U	R#                  UUUU-  5      n	UbB  UR)                  5       u  nnnUU4nU	c  [        R0                  " UUS9n	U R3                  U	5      n	OSn	U	S:H  R5                  SS9R                  U R                  S9R7                  SS9R                  U5      nUc0  [        R0                  " UU4[        R8                  UR                   S9n[;        U R                  UUUUUS9nUn[=        U R>                  5       HR  u  nn UU R@                  -  S:X  a+  U RB                  UU R@                  -     n!U!" UUU4U	USS.UD6nU " U4UUUUS.UD6nMT     U RE                  U5      nUR#                  UUUU5      n[G        UUUS9$ )a*  
image_encoder_embeddings (`torch.FloatTensor`, *optional*):
    The output of the image encoder.
perceiver_embeddings (`torch.FloatTensor`, *optional*):
    The output of the perceiver resampler.
image_attention_mask (`torch.LongTensor`, *optional*):
    The attention mask for the image encoder.
Nz:You must specify exactly one of input_ids or inputs_embeds)r  r   r   )rI   r=   c              3   (   #    U  H  oS L v   M
     g 7frX   r*   )rZ   r   s     r6   r]   'IdeficsModel.forward.<locals>.<genexpr>  s     a"`QDy"`s   r   z_Exactly 1 of pixel_values, image_encoder_embeddings or perceiver_embeddings has to be not-None.)rp   rI   )r>   r  zBIf `perceiver_embeddings` are passed, use_resampler should be Truer3  r   r   )r  input_embedsrC   r0  r&   r   )rA   rh  r&   )rC   r   r&   r0  )r%   r)   r&   )$rI   rr   r  r   r  rE   get_seq_lengthr0   rD   longcumsummasked_fill_r   sumrH   rp   r  rF   r  r%   r4  r  r  rG   r   invert_attention_maskr`   squeezer   r   	enumerater  r  r  r  r#   )"rx   rL   rC   r   r&   r  r>   r?   r@   rA   r  r  r0  ry   rI   
batch_size
seq_lengthr:  past_key_values_lengthseq_length_with_past
num_imagesr)   image_seq_lenimage_hidden_sizetext_seq_lenimage_batch_sizeimage_sequence_lengthimage_hidden_shaperh  causal_maskr'   idxdecoder_layercross_attn_blocks"                                     r6   r   IdeficsModel.forward  s,   4 &/%:!!@T@T-t";<YZZ  --i8M0*$++>O$1$7$7!
JETE`!?!?!Afg),BB!"\\&(>ATATUVAW(W`m`t`tN %,*>)..077;a?L%%n&91='J;<8L!)33A6La<K_"`aaeffq  %'??F?KL%1%7%7%;"J
'22499*z:QkT`TfTfghgiTjkL #'"3"3) #4 #   &1G_GdGdGfDJ
M3D":"="=DJJW]"="^"5":"::
;RTact"u;;$$#+'+'?'?@S'T$3G3L3LQ3OQeQjQjklQm00K_KdKdKfH
J7H"6!)/B/G/G/JL_LdLdefLg,M,abb166z:P]C]_pq ,0033==bA3::1aMR388\S]`mSmn*9L9Q9Q9S63Q"24I!J#+',zz2DV'T$#'#=#=>R#S #'  $83#>"C"C"C"K!O!OVZV`V`!O!a j jop j quu 

 !"ZZ12%**]MaMaN );;&))+%
 &"+DKK"8CT...!3#'#?#?tG`G`@`#a  0!'! *>)=$(! ! **) /- M #9. 		-0166z:}^op-+ 3+
 	
r5   )r  r  r  r  r  r  r  r  rq   r  r  r  r  rX   )NNNNNNNNNNFN)r+   r,   r-   r.   r/   r   rt   r  r  r  r   r   r   r0   rD  r   r   r1   r   r   r   r   r3   r#   r   r4   r   r   s   @r6   r  r  f  s   0,} 0,df 46 F 68 M  151537+/5948@D<@7;$(3859^
E,,-^
 !.^
 u//0	^

 "%^
   1 12^
 u001^
 #+5+<+<"=^
 'u'8'89^
 'u||4^
 D>^
 #+4.^
 !!1!12^
 +,^
 
u44	5^
  ^
r5   r  c            #       4  ^  \ rS rSrSS/rSU 4S jjrS r\\             SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\   S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\
R                     S\\   S\\\4   4S jj5       5       r         SU 4S jjr SS\S\\\4   S\S\\\4   4U 4S jjjrSrU =r$ ) IdeficsForVisionText2TextiU  zmodel.embed_tokens.weightzlm_head.weightc                    > [         TU ]  U5        [        U5      U l        [	        UR
                  UR                  UR                  SUR                  S9U l	        U R                  5         g )NFr   )rs   rt   r  rb   r   r   r  r  freeze_lm_headlm_headr  )rx   r  r  rz   s      r6   rt   "IdeficsForVisionText2Text.__init__X  s[     !&)
-****$*$@$@#22
 	r5   c                    U R                  5       nU R                  5       n[        U R                  SS5      (       ab  UR                  Ul        UR
                  S:  aA  UR                  UR
                  :X  d   eUR                  R                  UR                  l        [        US5      (       aY  [        US5      (       aG  UR                  Ul        [        US5      (       a$  [        US5      (       a  UR
                  Ul        ggggg)	z
Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of
IdeficsDecoupledLinear and IdeficsDecoupledEmbedding.
tie_word_embeddingsTr   r   rn   r   ru   N)get_output_embeddingsget_input_embeddingsgetattrr  rv   ru   r   rw   r   r  rn   r   )rx   output_embeddingsinput_embeddingss      r6   tie_weights%IdeficsForVisionText2Text.tie_weightsg  s    
 !6684464;; 5t<<'7'>'>$99A=(@@DTDnDnnnn9I9^9^9e9e!//6$n55'BRTd:e:e-=-L-L*(*CDD "=J J =M<f<f!9JD ;f5r5   rL   rC   r   r&   r  r>   r?   r@   rA   labelsr  r  r0  ry   rl   c                 6   U R                   " SUUUUUUUUU	UUSUS.UD6nUS   nU R                  U5      nSnU
b)  U R                  " SUXR                  R                  S.UD6n[        UUUR                  UR                  UR                  UR                  S9$ )aK  
image_encoder_embeddings (`torch.FloatTensor`, *optional*):
    The output of the image encoder.
perceiver_embeddings (`torch.FloatTensor`, *optional*):
    The output of the perceiver resampler.
image_attention_mask (`torch.LongTensor`, *optional*):
    The attention mask for the image encoder.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoProcessor, IdeficsForVisionText2Text

>>> model = IdeficsForVisionText2Text.from_pretrained("HuggingFaceM4/idefics-9b")
>>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics-9b")

>>> dogs_image_url_1 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image1.jpeg"
>>> dogs_image_url_2 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image2.jpeg"

>>> prompts = [
...     [
...         "User:",
...         dogs_image_url_1,
...         "Describe this image.\nAssistant: An image of two dogs.\n",
...         "User:",
...         dogs_image_url_2,
...         "Describe this image.\nAssistant:",
...     ]
... ]
>>> inputs = processor(prompts, return_tensors="pt")
>>> generate_ids = model.generate(**inputs, max_new_tokens=6)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True)
```T)rL   rC   r   r&   r  r>   r?   r@   rA   r  r  return_dictr0  r   N)r;   r  r  )r:   r;   r&   r'   r(   r)   r*   )
rb   r  loss_functionr  r  r8   r&   r'   r(   r)   )rx   rL   rC   r   r&   r  r>   r?   r@   rA   r  r  r  r0  ry   outputsr'   r;   r:   s                      r6   r   !IdeficsForVisionText2Text.forward|  s    r ** 
)%+'%%=!5!5%=)
 
"  
m,%%pVF{{OeOepiopD,#33!//)) ' ; ;
 	
r5   c                   > 0 nUb%  U R                   R                  (       a  XS'   O	XS'   OX|S'   UR                  SS5      US'   [        TU ]  " U4UUUUUU
U	S.UDUD6nU	b$  Uc!  US   R
                  S   nU	S S 2U* S 24   US	'   U$ )
Nr@   r?   r>   r  F)r&   rC   r  r0  r   r  rA   rL   r   rA   )r  r  poprs   prepare_inputs_for_generationrE   )rx   rL   rC   r   r  r&   r0  r>   r)   rA   r  ry   images_kwargsmodel_inputsr  rz   s                  r6   r  7IdeficsForVisionText2Text.prepare_inputs_for_generation  s      *{{((8K45<O89,8.)4:JJ?Y[`4a01w<
+)')%!5
 
 
  +0E%k288;J3GJ;<3XL/0r5   r  rP   rN   c                   > [         TU ]  " UUU40 UD6nSU;   aU  US   nUS S 2SS S 24   R                  S5      nUR                  SS5      (       a  XbS'   O[        R
                  " XV/SS9US'   UR                  US'   U$ )NrA   r=   r   r  Tr   r)   )rs   #_update_model_kwargs_for_generationr   rK   r0   r   r)   )rx   r  rP   rN   ry   rA   	last_maskrz   s          r6   r  =IdeficsForVisionText2Text._update_model_kwargs_for_generation  s     wB
 	
 "\1#/0F#G ,QAX6@@CIT227@347<yyBVAbhi7j34 /6.I.I*+r5   )r  rb   rX   )NNNNNNNNNNNFN)	NNNNNNNNN)F)r+   r,   r-   r.   _tied_weights_keysrt   r  r   r   r   r0   rD  r   r   r1   r   r   r   r   r3   r8   r   r  r   dictr   r   r  r4   r   r   s   @r6   r  r  U  s   57GHg*  151537+/5948@D<@7;-1$(3859V
E,,-V
 !.V
 u//0	V

 "%V
   1 12V
 u001V
 #+5+<+<"=V
 'u'8'89V
 'u||4V
 ))*V
 D>V
 #+4.V
 !!1!12V
 +,V
  
u33	4!V
  V
v  !+b $)	 38n !	 
c3h r5   r  )r  r  rm  )r   FNN)r   )r3  )Lr/   dataclassesr   typingr   r   r   r   r0   torch.nn.functionalr   r  r}   activationsr
   cache_utilsr   r   
generationr   masking_utilsr   modeling_layersr   modeling_outputsr   modeling_utilsr   r   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.genericr   r   configuration_ideficsr   	perceiverr   visionr   r    
get_loggerr+   r  r#   r8   rR   rg   rV   ri   rU   r   Moduler   r   r   r   r   r   r   r	  r  rF  rT  rm  r  r  __all__r*   r5   r6   <module>r     s6  (  ! 1 1     ! . ) / 9 + X X & R R 0 ? 0 0 E 
		H	% 
C[ C C6 
CK C C8 *#Z +- fA fAR8SRYY 8SxJRYY J0$
uxx $
N(:P P2 %II%<<% 
% <<	%
 U\\*% % %0W)ryy W)v64 6r}&@ }@ /*_ /* /*d k
) k
 k
\F 6 FR Rr5   