
    bCi                        S r SSKJr  SSKJrJrJr  SSKrSSKJ	s  J
r  SSKJ	r	  SSKJr  SSKJrJr  SS	KJr  SS
KJr  SSKJr  SSKJr  SSKJrJr  SSKJrJr  SSKJ r   SSK!J"r"J#r#J$r$J%r%  SSK&J'r'  SSK(J)r)J*r*  \%RV                  " \,5      r- " S S\	R\                  5      r/ " S S\	R\                  5      r0 " S S\05      r1 " S S\05      r2S r3SMS jr4 " S S\	R\                  5      r5 " S  S!\	Rl                  5      r7S"\Rp                  S#\9S$\Rp                  4S% jr: SNS&\	R\                  S'\Rp                  S(\Rp                  S)\Rp                  S*\\Rp                     S+\;S,\;S-\ \"   4S. jjr< " S/ S0\	R\                  5      r= " S1 S2\5      r> " S3 S4\5      r? " S5 S6\	R\                  5      r@ " S7 S8\	R\                  5      rA " S9 S:\	R\                  5      rB " S; S<\	R\                  5      rC " S= S>\	R\                  5      rD " S? S@5      rE\# " SA SB\5      5       rF\#" SCSD9 " SE SF\F5      5       rG\# " SG SH\F5      5       rH\#" SISD9 " SJ SK\F\5      5       rI/ SLQrJg)OzPyTorch Chameleon model.    )cached_property)CallableOptionalUnionN)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg   )ChameleonConfigChameleonVQVAEConfigc                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )ChameleonRMSNorm/   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z/
ChameleonRMSNorm is equivalent to T5LayerNorm
N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      j/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/chameleon/modeling_chameleon.pyr"   ChameleonRMSNorm.__init__0   s/     	ll5::k#:; #    c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )N   T)keepdim)	dtypetor$   float32powmeanrsqrtr'   r&   )r(   hidden_statesinput_dtypevariances       r,   forwardChameleonRMSNorm.forward8   sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r.   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler&   shaper'   r(   s    r,   
extra_reprChameleonRMSNorm.extra_repr?   s*    ))*+6$2G2G1HIIr.   )r'   r&   )ư>)	__name__
__module____qualname____firstlineno__r"   r<   rB   __static_attributes____classcell__r+   s   @r,   r   r   /   s    $;J Jr.   r   c                   z   ^  \ rS rSr% \R
                  \S'   SU 4S jjr\R                  " 5       S 5       r	Sr
U =r$ )ChameleonRotaryEmbeddingE   inv_freqc           	      P  > [         TU ]  5         XPl        Xl        X l        X0l        SU R
                  [        R                  " SU R                  S[        R                  S9R                  U[        R                  S9U R                  -  -  -  nU R                  SUSS9  X l        g )	N      ?r   r0   r3   devicer3   rO   F
persistent)r!   r"   scaling_factordimmax_position_embeddingsbaser$   arangeint64r4   floatregister_buffermax_seq_len_cached)r(   rX   rY   rZ   rT   rW   rO   r+   s          r,   r"   !ChameleonRotaryEmbedding.__init__H   s    ,'>$	IIQ!5;;?BB&X]XcXcBdgkgogooq
 	ZeD"9r.   c                    U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      nUS S 2S S S 24   R                  5       nUR                  R
                  nUS:w  a  UOSn[        R                  " USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       nUR                  5       n	S S S 5        WR                  UR                  S
9W	R                  UR                  S
94$ ! , (       d  f       N@= f)Nr   r1   r   mpscpuF)device_typeenabledr0   rX   rR   )rO   r]   expandr@   rT   typer$   autocast	transposecatcossinr4   r3   )
r(   xposition_idsinv_freq_expandedposition_ids_expandedrd   freqsembrl   rm   s
             r,   r<    ChameleonRotaryEmbedding.forwardV   s    !MM$4-8>>@GGHZHZ[\H]_acde ,QaZ 8 > > @ hhmm%0E%9ku^^UC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')C'')C	 D
 vvAGGv$cff177f&;;; DCs   A(D22
E )rZ   rX   rY   r_   rW   )i   i'  NrQ   )rE   rF   rG   rH   r$   Tensor__annotations__r"   no_gradr<   rI   rJ   rK   s   @r,   rM   rM   E   s,    ll: ]]_< <r.   rM   c                   ,   ^  \ rS rSrSrU 4S jrSrU =r$ )%ChameleonLinearScalingRotaryEmbeddingg   z_ChameleonRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendevc                 f   > UR                  5       U R                  -  n[        TU ]  X5      u  p4X44$ N)r]   rW   r!   r<   )r(   rn   ro   rl   rm   r+   s        r,   r<   -ChameleonLinearScalingRotaryEmbedding.forwardj   s3    #))+d.A.AA7?13xr.    rE   rF   rG   rH   __doc__r<   rI   rJ   rK   s   @r,   ry   ry   g   s    i r.   ry   c                   ,   ^  \ rS rSrSrU 4S jrSrU =r$ ))ChameleonDynamicNTKScalingRotaryEmbeddingq   zqChameleonRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozillac           	        > [         R                  " U5      S-   nX0R                  :  a  U R                  U R                  U-  U R                  -  U R                  S-
  -
  U R
                  U R
                  S-
  -  -  -  nSU[         R                  " SU R
                  S[         R                  S9R                  UR                  [         R                  S9U R
                  -  -  -  nU R                  SUSS	9  [        TU ]5  X5      u  pgXg4$ )
Nr   r0   rQ   r   rR   rS   rO   FrU   )r$   maxrY   rZ   rW   rX   r[   r\   r4   rT   r]   r^   r!   r<   )	r(   rn   ro   seq_lenrZ   rO   rl   rm   r+   s	           r,   r<   1ChameleonDynamicNTKScalingRotaryEmbedding.forwardt   s    ))L)A-11199$$w.1M1MMRVReRehiRij((dhhl+ - -D LLDHHau{{CFFahh^c^i^iFjmqmumuuwH   X% H7?13xr.   r~   r   rK   s   @r,   r   r   q   s    { r.   r   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr1   r0   rf   )r@   r$   rk   )rn   x1x2s      r,   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r.   c                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXg4$ )a  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezer   )qkrl   rm   ro   unsqueeze_dimq_embedk_embeds           r,   apply_rotary_pos_embr      sS    ( --
&C
--
&Cw;q>C/0Gw;q>C/0Gr.   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )ChameleonMLP   c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  UR                  S9U l        [
        R                  " U R                  U R                  UR                  S9U l	        [
        R                  " U R                  U R                  UR                  S9U l
        [        UR                     U l        g )Nbias)r!   r"   configr)   intermediate_sizer   Linearmlp_bias	gate_projup_proj	down_projr	   
hidden_actact_fnr(   r   r+   s     r,   r"   ChameleonMLP.__init__   s    !--!'!9!94#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../r.   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ r|   )r   r   r   r   )r(   rn   r   s      r,   r<   ChameleonMLP.forward   s6    NN4;;t~~a/@#ADLLQRO#ST	r.   )r   r   r   r   r)   r   r   rE   rF   rG   rH   r"   r<   rI   rJ   rK   s   @r,   r   r      s    0 r.   r   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )ChameleonLayerNorm   ar  
LayerNorm but computes stats only over the last dim because Chameleon applies gamma and beta
from each shard separately to each head, instead of reducing. We can apply each head's own
gamma/beta by repeat-interleaving weights from each shard, but the stats have to be computed
in the last dimension. This module applies gamma/beta manually to fulfill this requirement.
c                 D   > [         TU ]  " U/UQ70 UD6  US   4U l        g )Nr1   )r!   r"   normalized_shape)r(   r)   argskwargsr+   s       r,   r"   ChameleonLayerNorm.__init__   s)    6t6v6!,R 2r.   c                 ~    [         R                  " XR                  S S SS9nXR                  -  U R                  -   nU$ )Ngh㈵>r*   )F
layer_normr   r&   r   r(   r9   s     r,   r<   ChameleonLayerNorm.forward   s9    ]4I4I4QU[_`%3dii?r.   )r   )	rE   rF   rG   rH   r   r"   r<   rI   rJ   rK   s   @r,   r   r      s    3 r.   r   r9   n_repreturnc                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r   N)r@   rg   reshape)r9   r   batchnum_key_value_headsslenhead_dims         r,   	repeat_kvr      s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr.   modulequerykeyvalueattention_maskscalingdropoutr   c                 @   [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub"  US S 2S S 2S S 2S UR
                  S   24   nX-   n
[        R                  R                  U
S[        R                  S9R                  UR                  5      n
[        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr0   r   r1   )rX   r3   )ptrainingr   )r   num_key_value_groupsr$   matmulrj   r@   r   
functionalsoftmaxr5   r4   r3   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r,   eager_attention_forwardr      s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#1==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r.   c                   Z  ^  \ rS rSrSrSS\S\\   4U 4S jjjrS r	\
" SSS	S
9      SS\R                  S\\R                     S\\R                     S\\   S\S\S\\R                     S\\R                  \\R                     \\\R                        4   4S jj5       rSrU =r$ )ChameleonAttention   z=Multi-headed attention from 'Attention Is All You Need' paperr   	layer_idxc                   > [         TU ]  5         Xl        X l        Uc-  [        R                  SU R                  R                   S35        UR                  U l        UR                  U l	        UR                  U l        U R                  U R                  -  U l        UR                  U l        U R                  U R                  -  U l        UR                  U l        UR                   U l        SU l        UR$                  U l        U R                  S-  U l        U R                  U R                  -  U R                  :w  a&  [)        SU R                   SU R                   S35      e[*        R,                  " U R                  U R                  U R                  -  UR.                  S9U l        [*        R,                  " U R                  U R                  U R                  -  UR.                  S9U l        [*        R,                  " U R                  U R                  U R                  -  UR.                  S9U l        [*        R,                  " U R                  U R                  UR.                  S9U l        [9        U R                  U R                  45      U l        [9        U R                  U R                  45      U l        U R?                  5         g )	NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.T      z?hidden_size must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).r   ) r!   r"   r   r   loggerwarning_oncer+   rE   attention_dropoutr)   num_attention_heads	num_headsr   r   r   rY   
rope_theta	is_causalmodel_parallel_sizer   
ValueErrorr   r   attention_biasq_projk_projv_projo_projr   q_normk_norm
_init_roper(   r   r   r+   s      r,   r"   ChameleonAttention.__init__   s,   " !8!8 9 :, , "(!9!9!--33((DNN:#)#=#= $(NNd6N6N$N!'-'E'E$ ++#)#=#= }}d*MMDNN*t/?/??QRVRbRbQc$T^^$4B8 
 ii 0 0$..4==2PW]WlWlmii 0 0$2J2JT]]2Zagavavwii 0 0$2J2JT]]2Zagavavwii 0 0$2B2BI^I^_($..$--)HI($*B*BDMM)RSr.   c                    U R                   R                  c/  [        U R                  U R                  U R
                  S9U l        g U R                   R                  S   nU R                   R                  S   nUS:X  a0  [        U R                  U R                  UU R
                  S9U l        g US:X  a0  [        U R                  U R                  UU R
                  S9U l        g [        SU 35      e)N)rY   rZ   rh   factorlinear)rY   rW   rZ   dynamiczUnknown RoPE scaling type )
r   rope_scalingrM   r   rY   r   
rotary_embry   r   r   )r(   scaling_typerW   s      r,   r   ChameleonAttention._init_rope  s    ;;##+6(,(D(D__DO  ;;33F;L![[55h?Nx'"GMM,0,H,H#1	# *"KMM,0,H,H#1	# !#=l^!LMMr.   past_key_valuepast_key_values4.58new_nameversionr9   r   ro   output_attentions	use_cachecache_positionr   c                    UR                  5       u  pnU R                  U5      nU R                  U5      nU R                  U5      nUR	                  SU R
                  U R                  5      nU R                  U5      nUR	                  SU R                  U R                  5      nU R                  U5      nUR	                  XU R
                  U R                  5      R                  SS5      nUR	                  XU R                  U R                  5      R                  SS5      nUR                  XU R                  U R                  5      R                  SS5      nU R                  X5      u  nn[        XUU5      u  pUb$  UXS.nUR                  XU R                  U5      u  p[         nU R"                  R$                  S:w  a  [&        U R"                  R$                     nU" U UUUU4U R(                  (       d  SOU R*                  U R,                  S.UD6u  nnUR	                  XS5      R/                  5       nU R1                  U5      nUU4$ )Nr1   r   r0   )rm   rl   r   eager        )r   r   )sizer   r   r   r   r   r   r   r   r   rj   viewr   r   updater   r   r   _attn_implementationr   r   r   r   r   r   )r(   r9   r   ro   r   r   r   r   r   bszq_len_query_statesr   r   rl   rm   cache_kwargsattention_interfacer   r   s                        r,   r<   ChameleonAttention.forward:  s    &**,A{{=1[[/
{{=1#++BN{{<0''D,D,DdmmT
[[,
#++CV``abdef''D4L4Ldmm\ffghjkl
#((T5M5Mt}}]gghiklm??<>S#7RUWZ#[ &#&sUL'6'='=jX\XfXfht'u$J(?;;++w6"9$++:Z:Z"[$7	%
  $}}C$2H2HLL	%
 	%
!\ "))#b9DDFkk+.L((r.   )r   r   r   r)   r   r   r   r   rY   r   r   r   r   r   r   r   r   r   r   r   r|   NNNFFN)rE   rF   rG   rH   r   r   r   intr"   r   r   r$   ru   
LongTensorr
   boolr?   r<   rI   rJ   rK   s   @r,   r   r      s    G# #8C= # #NN6 %0A6R 2637+/"'595)||5) !.5) u//0	5)
 "%5)  5) 5) !!1!125) 
u||Xell3XeELL>Q5RR	S5) S5)r.   r   c                   N  ^  \ rS rSrS\S\4U 4S jjr\" SSSS9      SS	\R                  S
\
\R                     S\
\R                     S\
\   S\
\   S\
\   S\
\R                     S\\R                  \
\\R                  \R                  4      4   4S jj5       rSrU =r$ )ChameleonDecoderLayerit  r   r   c                   > [         TU ]  5         UR                  U l        [        XS9U l        [        U5      U l        [        UR                  UR                  S9U l	        [        UR                  UR                  S9U l
        g N)r   r   r   r!   r"   r)   r   	self_attnr   mlpr   rms_norm_epsinput_layernormpost_attention_layernormr   s      r,   r"   ChameleonDecoderLayer.__init__u  k    !--+6O'/0B0BH[H[\(89K9KQWQdQd(e%r.   r   r   r   r   r9   r   ro   r   r   r   r   c                     Un	U R                  U5      nU R                  " SUUUUUUUS.UD6u  pX-   nUn	U R                  U5      nU R                  U5      nX-   nU4nU(       a  X4-  nU$ )ab  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`, *optional*):
        attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
        query_sequence_length, key_sequence_length)` if default attention is used.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    past_key_values (`Cache`, *optional*): cached past key and value projection states
    cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
        Indices depicting the position of the input sequence tokens in the sequence
    kwargs (`dict`, *optional*):
        Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
        into the model
r9   r   ro   r   r   r   r   r~   )r  r  r  r  r(   r9   r   ro   r   r   r   r   r   residualself_attn_weightsoutputss               r,   r<   ChameleonDecoderLayer.forward  s    > !,,]; ,0>> 	,
')%+/)	,
 	,
( !0 !55mD/ 0 "++Gr.   r)   r  r  r  r  r  rE   rF   rG   rH   r   r  r"   r   r$   ru   r   r  r
   r  r?   FloatTensorr<   rI   rJ   rK   s   @r,   r  r  t  s    f f3 f %0A6R 2637+/,1$)59:||: !.: u//0	:
 "%: $D>: D>: !!1!12: 
u  (51B1BEDUDU1U+V"WW	X: S:r.   r  c                   N  ^  \ rS rSrS\S\4U 4S jjr\" SSSS9      SS	\R                  S
\
\R                     S\
\R                     S\
\   S\
\   S\
\   S\
\R                     S\\R                  \
\\R                  \R                  4      4   4S jj5       rSrU =r$ )ChameleonSwinDecoderLayeri  r   r   c                   > [         TU ]  5         UR                  U l        [        XS9U l        [        U5      U l        [        UR                  UR                  S9U l	        [        UR                  UR                  S9U l
        g r  r  r   s      r,   r"   "ChameleonSwinDecoderLayer.__init__  r  r.   r   r   r   r   r9   r   ro   r   r   r   r   c                     Un	U R                   " SUUUUUUUS.UD6u  pU R                  U5      nX-   nUn	U R                  U5      nU R                  U5      nX-   nU4nU(       a  X4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`):
        input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`, *optional*):
        attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
        query_sequence_length, key_sequence_length)` if default attention is used.
    position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Indices of positions of each input sequence tokens in the position embeddings
    past_key_values (`Cache`, *optional*): cached past key and value projection states
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
        Indices depicting the position of the input sequence tokens in the sequence.
r  r~   )r  r  r  r  r  s               r,   r<   !ChameleonSwinDecoderLayer.forward  s    @ ! ,0>> 	,
')%+/)	,
 	,
( ,,]; 0 /55mD 0 "++Gr.   r$  r  r%  rK   s   @r,   r(  r(    s    f f3 f %0A6R 2637+/,1$)598||8 !.8 u//0	8
 "%8 $D>8 D>8 !!1!128 
u  (51B1BEDUDU1U+V"WW	X8 S8r.   r(  c                   N   ^  \ rS rSrSrU 4S jrS\R                  4S jrSr	U =r
$ )ChameleonVQVAEVectorQuantizeri  a  
A module for vector quantization using learned embedding vectors.

This module implements the quantization process similar to te one described in
the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
input vectors into discrete codebook vectors, which are learned during training.
Current implementation improves over previous ones by avoiding costly matrix multiplications
and allowing for post-hoc remapping of indices.
c                    > [         TU ]  5         UR                  U l        UR                  U l        [        USS5      U l        [        R                  " U R                  U R                  5      U l	        g )Nbetag      ?)
r!   r"   num_embeddings	embed_dimembedding_dimgetattrr0  r   	Embedding	embeddingr   s     r,   r"   &ChameleonVQVAEVectorQuantizer.__init__  sX    $33#--FFD1	d&9&94;M;MNr.   hidden_statec           
      >   UR                  SSSS5      R                  5       nUR                  SU R                  5      n[        R
                  " US-  SSS9[        R
                  " U R                  R                  S-  SS9-   S[        R                  " S	X R                  R                  R                  SS5      5      -  -
  n[        R                  " USS9nU R                  U5      R                  UR                  5      n[        R                  " UR                  5       U-
  S-  5      U R                  [        R                  " XQR                  5       -
  S-  5      -  -   nXU-
  R                  5       -   nUR                  SSSS5      R                  5       nXVU4$ )
Nr   r0   r   r   r1   T)rX   r2   rf   z	bd,dn->bn)permuter   r  r3  r$   sumr6  r&   einsumrj   argminr@   r7   detachr0  )r(   r8  hidden_state_flattened	distancesmin_encoding_indiceshidden_state_quantlosss          r,   r<   %ChameleonVQVAEVectorQuantizer.forward  s   #++Aq!Q7BBD!-!2!22t7I7I!J II,a/QEii--q0a89%,,{,BNNDYDYDcDcdeghDijjk 	  %||I1=!^^,@AFF|GYGYZ zz-446E!KLtyy[`[e[e"5"5"77A=\
 P
 

 *,-N,V,V,XX 0771aCNNP!)===r.   )r0  r6  r3  r1  )rE   rF   rG   rH   r   r"   r$   ru   r<   rI   rJ   rK   s   @r,   r.  r.    s#    O>ELL > >r.   r.  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )#ChameleonVQVAEEncoderConvDownsamplei3  c                 Z   > [         TU ]  5         [        R                  " XSSSS9U l        g )Nr   r0   r   kernel_sizestridepadding)r!   r"   r   Conv2dconvr(   in_channelsr+   s     r,   r"   ,ChameleonVQVAEEncoderConvDownsample.__init__4  s%    IIkAaYZ[	r.   c                 V    [         R                  " USSSS9nU R                  U5      nU$ )N)r   r   r   r   constantr   )padmoder   )r   rS  rM  r   s     r,   r<   +ChameleonVQVAEEncoderConvDownsample.forward8  s+    mJVWX		-0r.   )rM  r   rK   s   @r,   rF  rF  3  s    \ r.   rF  c                   6   ^  \ rS rSr  SU 4S jjrS rSrU =r$ ) ChameleonVQVAEEncoderResnetBlocki?  c                   > [         TU ]  5         X l        Uc  UOUU l        X@l        [
        R                  R                  SUSSS9U l        [
        R                  R                  X#SSSS9U l
        [
        R                  R                  SUSSS9U l        [
        R                  R                  UR                  5      U l        [
        R                  R                  X3SSSS9U l        U R                  U R                  :w  a]  U R                  (       a&  [
        R                  R                  X#SSSS9U l        g [
        R                  R                  X#SSSS9U l        g g )	N    rD   T
num_groupsnum_channelsr*   affiner   r   rH  r   )r!   r"   rO  out_channelsuse_conv_shortcutr$   r   	GroupNormnorm1rL  conv1norm2Dropoutr   conv2conv_shortcutnin_shortcut)r(   r   rO  r^  rf  r+   s        r,   r"   )ChameleonVQVAEEncoderResnetBlock.__init__@  s%    	&+7+?K\!.XX''2KUYbf'g
XX__[AVWab_c
XX''2LVZcg'h
xx''7XX__\QWXbc_d
t000%%%*XX__[\]fgqr_%s"$)HHOOK[\efpqO$r!	 1r.   c                    UnU R                  U5      nU[        R                  " U5      -  nU R                  U5      nU R	                  U5      nU[        R                  " U5      -  nU R                  U5      nU R                  U5      nU R                  U R                  :w  a7  U R                  (       a  U R                  U5      nX!-   $ U R                  U5      nX!-   $ r|   )ra  r$   sigmoidrb  rc  r   re  rO  r^  r_  rf  rg  )r(   r9   r   s      r,   r<   (ChameleonVQVAEEncoderResnetBlock.forwardW  s     

=1}55

=1

=1}55]3

=1t000%%--h7 ''  ,,X6''r.   )
rb  re  rf  r   rO  rg  ra  rc  r^  r_  )NFr   rK   s   @r,   rW  rW  ?  s    
 s.( (r.   rW  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )ChameleonVQVAEEncoderAttnBlockik  c                   > [         TU ]  5         Xl        [        R                  R                  SUSSS9U l        [        R                  R                  XSSSS9U l        [        R                  R                  XSSSS9U l	        [        R                  R                  XSSSS9U l
        [        R                  R                  XSSSS9U l        g )NrY  rD   TrZ  r   r   rH  )r!   r"   rO  r$   r   r`  normrL  r   r   vproj_outrN  s     r,   r"   'ChameleonVQVAEEncoderAttnBlock.__init__l  s    &HH&&";TXae&f	qQR\]^qQR\]^qQR\]^aXYcder.   c                 Z   UnU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nUR                  u  pgpUR                  XgX-  5      R                  SSS5      nUR                  XgX-  5      n[        R                  " X45      n
U
[        U5      S-  -  n
[        R                  " U
SS9n
UR                  XgX-  5      nU
R                  SSS5      n
[        R                  " XZ5      R                  XgX5      nU R                  U5      nX+-   $ )Nr   r0   r   r   rf   )ro  r   r   rp  r@   r   r:  r$   bmmr  r   r   rq  )r(   r9   r   r	  r   r   
batch_sizechannelsheightwidthr   r   s               r,   r<   &ChameleonVQVAEEncoderAttnBlock.forwardv  s    		-0vvm,VVM*
vvm, /;.@.@+
f#++J&.QYYZ[]^`ab''
fnM
yy:#s8}'>?yy15 $++J&.Q#++Aq!4ii;CCJZ`hmmK0%%r.   )rO  r   ro  rq  r   rp  r   rK   s   @r,   rm  rm  k  s    f& &r.   rm  c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )ChameleonVQVAEEncoderi  c           
        > [         TU ]  5         [        UR                  5      U l        UR
                  U l        UR                  nUR                  nUR                  nUR                  nUR                  nUR                  n[        R                  R                  XBSSSS9U l        UnS[        U5      -   n	Xl        [        R"                  " 5       U l        ['        U R                  5       GH%  n
[        R"                  " 5       n[        R"                  " 5       nX)U
   -  nX'U
   -  n['        U R
                  5       Hk  nUR)                  [+        UUUS95        UnUR,                  c  M.  XR,                  ;   d  M?  UR.                  S:X  d  MQ  UR)                  [1        U5      5        Mm     [        R2                  " 5       nUUl        UUl        XR                  S-
  :w  a  [9        U5      Ul        US-  nU R$                  R)                  U5        GM(     [        R2                  " 5       U l        [+        UWUS9U R<                  l        UR.                  S:X  a  [1        U5      O[        R@                  " 5       U R<                  l!        [+        UUUS9U R<                  l"        [        R                  RG                  SUS	S
S9U l$        [        R                  R                  UU(       a  SU-  OUSSSS9U l%        g )Nr   r   rH  )r   )r   rO  r^  vanillar0   rY  rD   TrZ  )&r!   r"   lenchannel_multipliernum_resolutionsnum_res_blocksbase_channels
resolutionrO  double_latentlatent_channelsr$   r   rL  conv_inr?   in_channel_multiplier
ModuleListdownrangeappendrW  attn_resolutions	attn_typerm  ModuleblockattnrF  
downsamplemidblock_1Identityattn_1block_2r`  norm_outconv_out)r(   r   r  r  rO  r  r  r  curr_resr  i_levelr  r  block_in	block_outi_blockr  r+   s                    r,   r"   ChameleonVQVAEEncoder.__init__  s   "6#<#<=$33,,&&
((,, 00#66xx{qYZdef $u-?'@ @%:"MMO	T112GMMOE==?D$W'EEH%7(CCI !4!454%$,%. %++7 $;$;;((I5KK >x HI 6  99;DDJDI..22"Eh"O#q=IIT"7 3: 99;; !

 GMFVFVZcFc8Bikititiv; !
 **bxUYbf*g#0Ao ( 
r.   pixel_valuesc                 @   U R                  U5      /n[        U R                  5       H  n[        U R                  5       H  nU R                  U   R
                  U   " US   5      n[        U R                  U   R                  5      S:  a"  U R                  U   R                  U   " U5      nUR                  U5        M     X0R                  S-
  :w  d  M  UR                  U R                  U   R                  US   5      5        M     US   nU R                  R                  U5      nU R                  R                  U5      nU R                  R                  U5      nU R                  U5      nU[        R                   " U5      -  nU R#                  U5      nU$ )Nr1   r   r   )r  r  r  r  r  r  r~  r  r  r  r  r  r  r  r  r$   rj  r  )r(   r  r9   r  r  r8  last_hidden_states          r,   r<   ChameleonVQVAEEncoder.forward  sr   l34T112G !4!45#yy177@!"%  tyy)../!3#'99W#5#:#:7#CL#QL$$\2 6 ..22$$TYYw%7%B%B=QSCT%UV 3 *"- HH,,->? HHOO,=> HH,,->? !MM*;<U]]+<== MM*;<  r.   )r  r  r  r  r  r  r  r  )
rE   rF   rG   rH   r"   r$   r  r<   rI   rJ   rK   s   @r,   r{  r{    s!    C
J!E$4$4 ! !r.   r{  c                       \ rS rSrSrS r\S 5       r\S 5       r\S 5       r	\S 5       r
\S 5       r\S	 5       rS
\R                  S\R                  4S jrSrg)ChameleonImageVocabularyMappingi  zE
A class for mapping discrete image tokens from VQGAN to BPE tokens.
c                 <    Xl         UR                  S5      U l        g )Nz<image>)	vocab_mapgetimage_token_id)r(   r  s     r,   r"   (ChameleonImageVocabularyMapping.__init__  s    "'mmI6r.   c                 l    U R                   R                  5        VVs0 s H  u  pX!_M	     snn$ s  snnf r|   )r  itemsr(   r   rp  s      r,   val2name(ChameleonImageVocabularyMapping.val2name  s-    !%!5!5!78!7!7888   0c           	          [        U R                  R                  5        VVs/ s H  u  pUR                  S5      (       d  M  UPM!     snn5      $ s  snnf )NIMGIMG)sortedr  r  
startswith)r(   namevals      r,   image_tokens,ChameleonImageVocabularyMapping.image_tokens  s<    DNN,@,@,B`,BytdooV^F_s,B`aa`s   A
A
c           
      (  ^ [        S5       Vs0 s H$  n[        [        S5      U-   5      [        U5      _M&     snmS[        S[        4U4S jjnU R                   Vs0 s H!  o3[        U" U R                  U   5      5      _M#     sn$ s  snf s  snf )N
   Aold_namer   c                 R   > SR                  U4S jU [        S5      S  5       5      $ )N c              3   F   >#    U  H  nTR                  X5      v   M     g 7fr|   )r  ).0cimg_tkn_chr_mappings     r,   	<genexpr>IChameleonImageVocabularyMapping.bpe2img.<locals>.remap.<locals>.<genexpr>  s"     _B^Q.22188B^s   !r  r1   )joinr~  )r  r  s    r,   remap6ChameleonImageVocabularyMapping.bpe2img.<locals>.remap  s$    77_(3x=[]B^___r.   )r  chrordstrr  r  r  )r(   ir  tokr  s       @r,   bpe2img'ChameleonImageVocabularyMapping.bpe2img   s    BG)L)Qs3s8a<0#a&8)L	`C 	`C 	` @D?P?PQ?PSt}}S1233?PQQ M
 Rs   +B
(Bc                 l    U R                   R                  5        VVs0 s H  u  pX!_M	     snn$ s  snnf r|   )r  r  r  s      r,   img2bpe'ChameleonImageVocabularyMapping.img2bpe	  s-    !%!3!3!56!5!5666r  c                     [         R                  " [        U R                  R	                  5       5      5      [         R                  " [        U R                  R                  5       5      5      4$ r|   )r$   tensorr  r  keysvaluesrA   s    r,   bpe2img_search_tensors6ChameleonImageVocabularyMapping.bpe2img_search_tensors  sC    ||F4<<#4#4#678%,,vdllNaNaNcGd:eeer.   c                     [         R                  " [        U R                  R	                  5       5      S-   [         R
                  S9nU R                  R                  5        H	  u  p#X1U'   M     U$ )Nr   rR   )r$   zerosr   r  r  r  r  )r(   mappingr   rp  s       r,   img2bpe_mapping_tensor6ChameleonImageVocabularyMapping.img2bpe_mapping_tensor  sR    ++c$,,"3"3"56:%))LLL&&(DAAJ )r.   	img_batchr   c                 x    UR                   nU R                  UR                  S5         nUR                  U5      $ )Nrc   )rT   r  r4   )r(   r  rT   
img_tokenss       r,   convert_img2bpe/ChameleonImageVocabularyMapping.convert_img2bpe  s5    !!00e1DE
}}V$$r.   )r  r  N)rE   rF   rG   rH   r   r"   r   r  r  r  r  r  r  r$   ru   r  rI   r~   r.   r,   r  r    s    7 9 9 b b R R 7 7 f f  % %%,, %r.   r  c                   P    \ rS rSr% \\S'   SrSrSS/rSS/r	Sr
SrSrS	rSrSrS
rg)ChameleonPreTrainedModeli  r   modelTr  r(  r   r   Fr~   N)rE   rF   rG   rH   r   rv   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph!_supports_param_buffer_assignment_supports_flex_attn_supports_attention_backendrI   r~   r.   r,   r  r    sN    &*#02MN#4m"DN!(-%"&r.   r  aW  
    The VQ-VAE model used in Chameleon for encoding/decoding images into discrete tokens.
    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv
    Taigman](https://huggingface.co/papers/2203.13131).
    )custom_introc                   f   ^  \ rS rSr% \\S'   / SQrS\4U 4S jjrS\R                  4S jr
SrU =r$ )ChameleonVQVAEi.  r   )r.  rm  rW  c                 l  > [         TU ]  U5        [        U5      U l        [	        U5      U l        [        R                  R                  UR                  UR                  S5      U l        [        R                  R                  UR                  UR                  S5      U l        U R                  5         g Nr   )r!   r"   r{  encoderr.  quantizer$   r   rL  r  r2  
quant_convpost_quant_convevalr   s     r,   r"   ChameleonVQVAE.__init__>  s|     ,V45f=((//&*@*@&BRBRTUV$xxv/?/?AWAWYZ[		r.   r  c                 v    U R                  U5      nU R                  U5      nU R                  U5      u  p4nX4U4$ r|   )r  r  r  )r(   r  r9   quantemb_lossindicess         r,   encodeChameleonVQVAE.encodeG  s<    \26#'==#? ''r.   )r  r  r  r  )rE   rF   rG   rH   r   rv   r  r"   r$   r  r  rI   rJ   rK   s   @r,   r  r  .  s7     ! 3 (5#3#3 ( (r.   r  c                     ^  \ rS rSrS\4U 4S jjrS\R                  4S jrS\R                  4S jr	S\R                  S\R                  S	\R                  4S
 jr\           SS\\R                     S\\R                     S\\R                     S\\R                     S\\   S\\R                     S\\   S\\   S\\   S\\   S\\R                     S\\   S\\\4   4S jj5       rSrU =r$ )ChameleonModeliN  r   c           	        > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [        UR                  5      U l        U R                  R                  (       d  [        O[        n[
        R                   " [#        UR$                  5       Vs/ s H
  o2" X5      PM     sn5      U l        [)        UR                  UR*                  S9U l        [.        R1                  UR2                  5      U l        SU l        U R9                  5         g s  snf )Nr   F)r!   r"   pad_token_idpadding_idx
vocab_sizer   r5  r)   embed_tokensr  vocabulary_mapvocabulary_mappingr   	swin_normr  r(  r  r  num_hidden_layerslayersr   r  ro  r  _from_config	vq_configvqmodelgradient_checkpointing	post_init)r(   r   decoder_layerr   r+   s       r,   r"   ChameleonModel.__init__P  s     !.. ++LL):):F<N<NPTP`P`a"A&BWBW"X59[[5J5J-Pimm?DVE]E]?^_?^)]6-?^_
 %V%7%7V=P=PQ	%2263C3CD&+# 	 `s   Er  c                     UR                   S   nU R                  R                  U5      u    p4U R                  R	                  U5      nUR                  US5      nU$ )a;  
Tokenizes images into discrete tokens with VQGAN module. Converts
obtained image tokens into BPE tokens and wraps with "boi" and "eoi"
special tokens.

Args:
    pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
        The tensors corresponding to the input images.
r   r1   )r@   r  r  r  r  r  )r(   r  ru  r  
image_toksbpe_tokss         r,   get_image_tokensChameleonModel.get_image_tokensb  sX     "''*
<<..|<1**:::F==R0r.   c                 T    U R                  U5      nU R                  5       " U5      nU$ )a  
Tokenizes images into discrete tokens with VQGAN module and embeds
them with text embeddings layer

Args:
    pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
        The tensors corresponding to the input images.
)r  get_input_embeddings)r(   r  r  vision_embeddingss       r,   get_image_features!ChameleonModel.get_image_featuresr  s.     ,,\: 557E  r.   	input_idsinputs_embedsimage_featuresc           	      J   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  5       nUR                  S5      R                  U5      R                  UR                  5      nUR                  S   UR                  S   -  nX$   R                  5       UR                  5       :w  a  [        SU SU 35      eU$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
)r3   rT   r1   r   r   z6Image features and image tokens do not match: tokens: z, features )r  r$   r  r  r  longrT   allr;  r   	expand_asr4   r@   numelr   )r(   r  r  r  special_image_maskn_image_tokensn_image_featuress          r,   get_placeholder_mask#ChameleonModel.get_placeholder_mask  s    !.2K2K2MT44CC5::^k^r^rs3 " "4!7!7!;!*.E.E.T.T!T+//1/99"=GGVYYZgZnZno)//2^5I5I!5LL,2248L8L8NNHHXXcdtcuv  "!r.   r   ro   r   r   r   output_hidden_statesreturn_dictr   r   r   c                    Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  UOU R                   R                  nU
b  U
OU R                   R                  n
U R
                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUS L US L-  (       a  [        S5      eUc  U R                  U5      nUb2  U R                  U5      nU R                  XUS9nUR                  X5      nU(       a9  Uc6  [        R                  R!                  5       (       d  [#        U R                   S9nUcD  Ub  UR%                  5       OSn[        R&                  " XUR(                  S   -   UR*                  S9nUc  UR-                  S5      n[/        U R                   UUUUUS	9nUnU	(       a  S
OS nU(       a  S
OS nU R0                   H7  nU	(       a  UU4-  nU" U4UUUUUUS.UD6nUS   nU(       d  M.  UUS   4-  nM9     U R3                  U5      nU	(       a  UU4-  nU
(       d  [5        S UUUU4 5       5      $ [7        UUUUS9$ )NzX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fz:You must specify exactly one of input_ids or inputs_embeds)r  r  )r   r   r   )rT   )r   input_embedsr   r   r   ro   r~   )r   ro   r   r   r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr|   r~   )r  rp  s     r,   r  )ChameleonModel.forward.<locals>.<genexpr>  s      ^a^s   	)r  r   r9   
attentions)r   r   r)  r   use_return_dictr  r   r   r   r   r  r  r'  masked_scatterr$   jit
is_tracingr   get_seq_lengthr[   r@   rT   r   r   r
  ro  r?   r   )r(   r  r  r   ro   r   r  r   r   r)  r*  r   r   image_embedsr$  past_seen_tokensr   r9   all_hidden_statesall_self_attnsr  layer_outputss                         r,   r<   ChameleonModel.forward  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]&&4==Yj I-t";<YZZ  --i8M#22<@L!%!:!:| "; " *889KZM 09M9M9O9O*$++>O!CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L(;;&))+%
 & #7BD0d![[M#!m%55!)	*) /"3#-	 	M *!,M  =#3"55% )( 		-0  -!11 )?<M~^   '+++%	
 	
r.   )r  r  r
  ro  r  r  r  r  NNNNNNNNNNN)rE   rF   rG   rH   r   r"   r$   r&  r  r  r  r'  r   r   ru   r
   r  r   r   r   r?   r   r<   rI   rJ   rK   s   @r,   r   r   N  s    $U->->  !u/@/@ !"))":?:K:K"]b]n]n"0  15481537+/59$(,0/3&*59j
E,,-j
 u001j
 !.	j

 u//0j
 "%j
   1 12j
 D>j
 $D>j
 'tnj
 d^j
 !!1!12j
 -.j
 
u--	.j
 j
r.   r   zb
    Chameleon Model with a head on top used for outputting logits for next token prediction.
    c                     ^  \ rS rSrS/rU 4S jrS rS r\\	           SS\
\R                     S\
\R                     S\
\R                     S	\
\R                     S
\
\   S\
\R                     S\
\R                     S\
\   S\
\   S\
\   S\
\R                     S\\   S\\\4   4S jj5       5       r       SU 4S jjrSrU =r$ )!ChameleonForConditionalGenerationi  zlm_head.weightc                    > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        U R                  5         g )NFr   )
r!   r"   r   r  r  r   r   r)   lm_headr  r   s     r,   r"   *ChameleonForConditionalGeneration.__init__  sU     #F+
 ++yy!3!3V5F5FUS 	r.   c                 8    U R                   R                  U5      $ r|   )r  r  r(   r  s     r,   r  2ChameleonForConditionalGeneration.get_image_tokens  s    zz**<88r.   c                 8    U R                   R                  U5      $ r|   )r  r  rB  s     r,   r  4ChameleonForConditionalGeneration.get_image_features  s    zz,,\::r.   r  r  r   ro   r   r  labelsr   r   r)  r   r   r   c                 0   U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
U R                  " SUUUUUUUU	U
SUS.UD6nUS   nU R	                  U5      nU R                  R
                  R                  n[        R                  " UR                  5      R                  USS2SS2U4'   SnUb)  U R                  " SXU R                   R                  S.UD6n[        UUUR                  UR                  UR                   S9$ )a?  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import ChameleonProcessor, ChameleonForConditionalGeneration
>>> import torch
>>> import requests
>>> from PIL import Image

>>> model = ChameleonForConditionalGeneration.from_pretrained("facebook/chameleon-7b", dtype=torch.bfloat16)
>>> processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b")

>>> prompt = "I used to know a lot about constellations when I was younger, but as I grew older, I forgot most of what I knew. These are the only two constellations that I really remember now.<image><image>I would like for you to tell me about 3 more constellations and give me a little bit of history about the constellation."
>>> image = Image.open(requests.get("https://nineplanets.org/wp-content/uploads/2020/12/the-big-dipper-1.jpg", stream=True).raw)
>>> image_2 = Image.open(requests.get("https://www.kxan.com/wp-content/uploads/sites/40/2020/10/ORION.jpg", stream=True).raw)

>>> inputs = processor(images=[image, image_2], text=prompt, return_tensors="pt").to(model.device, torch.bfloat16)

>>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
>>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
```NT)r  r  r   ro   r   r  r   r   r)  r*  r   r   )logitsrF  r  )rC  rH  r   r9   r/  r~   )r   r   r)  r  r?  r  r  r$   finfor3   minloss_functionr  r   r   r9   r/  )r(   r  r  r   ro   r   r  rF  r   r   r)  r   r   r"  r9   rH  r  rC  s                     r,   r<   )ChameleonForConditionalGeneration.forward  s-   V 2C1N-TXT_T_TqTq$8$D $++JjJj 	
 ** 
%)%+'/!5)
 
  
m, zz44AA%*[[%>%B%Bq!\!"%%pVt{{OeOepiopD%#33!//))
 	
r.   c	                 V   > [         TU ]  " U4UUUUUUUS.U	D6n
US   S:w  a  S U
S'   U
$ )N)r  r   r   r  r   ro   r   r   r  )r!   prepare_inputs_for_generation)r(   r  r  r   r   r  r   ro   r   r   model_inputsr+   s              r,   rN  ?ChameleonForConditionalGeneration.prepare_inputs_for_generationo  s\     w<

%+)')%

 

 !! ,0L(r.   )r?  r  r  r;  )NNNNNNT)rE   rF   rG   rH   _tied_weights_keysr"   r  r  r   r   r   r$   r  r&  ru   r
   r  r   r   r   r?   r   r<   rN  rI   rJ   rK   s   @r,   r=  r=    sy    ++9;  15481537+/59-1$(,0/359O
E,,-O
 u001O
 !.	O

 u//0O
 "%O
   1 12O
 ))*O
 D>O
 $D>O
 'tnO
 !!1!12O
 +,O
 
u,,	-O
  O
h  r.   r=  )r=  r   r  r  r  )r  )Kr   	functoolsr   typingr   r   r   r$   torch.nn.functionalr   r   r   activationsr	   cache_utilsr
   r   
generationr   masking_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   configuration_chameleonr   r   
get_loggerrE   r   r  r   rM   ry   r   r   r   r   	LayerNormr   ru   r  r   r]   r   r   r  r(  r.  rF  rW  rm  r{  r  r  r  r   r=  __all__r~   r.   r,   <module>rd     s    % , ,     ! . ) / B 9 O F &  1 J 
		H	%Jryy J,<ryy <D,D 0H *(8299 " &	UU\\ 	U# 	U%,, 	U( %II%<<% 
% <<	%
 U\\*% % % '(%4{) {)~F6 FRD : DN,>BII ,>^	")) 	)(ryy )(X &RYY  &F^!BII ^!B,% ,%^ ' ' ' (- ((0 s
- s
 s
l 
D(@/ D
DN pr.   