"""PyTorch ViT model."""

import collections.abc
import math
from typing import Callable, Optional, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    ImageClassifierOutput,
    MaskedImageModelingOutput,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import TransformersKwargs, auto_docstring, logging, torch_int
from ...utils.generic import can_return_tuple, check_model_inputs
from .configuration_vit import ViTConfig


logger = logging.get_logger(__name__)


class ViTEmbeddings(nn.Module):
    """
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
    """

    def __init__(self, config: ViTConfig, use_mask_token: bool = False) -> None:
        super().__init__()

        self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None
        self.patch_embeddings = ViTPatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches
        self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size))
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.patch_size = config.patch_size
        self.config = config

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1

        # always interpolate when tracing so the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        class_pos_embed = self.position_embeddings[:, :1]
        patch_pos_embed = self.position_embeddings[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        interpolate_pos_encoding: bool = False,
    ) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        if bool_masked_pos is not None:
            seq_length = embeddings.shape[1]
            mask_tokens = self.mask_token.expand(batch_size, seq_length, -1)
            # replace the masked visual tokens by mask_tokens
            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask

        # add the [CLS] token to the embedded patch tokens
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        # add positional encoding to each token
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embeddings

        embeddings = self.dropout(embeddings)

        return embeddings


class ViTPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
                f" Expected {self.num_channels} but got {num_channels}."
            )
        if not interpolate_pos_encoding and (height != self.image_size[0] or width != self.image_size[1]):
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
            )
        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    # Take the dot product between "query" and "key" to get the raw attention scores.
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling

    # Normalize the attention scores to probabilities in full precision.
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)

    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    # Mask heads if we want to
    if attention_mask is not None:
        attn_weights = attn_weights * attention_mask

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class ViTSelfAttention(nn.Module):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.config = config
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.dropout_prob = config.attention_probs_dropout_prob
        self.scaling = self.attention_head_size**-0.5
        self.is_causal = False

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        batch_size = hidden_states.shape[0]
        new_shape = (batch_size, -1, self.num_attention_heads, self.attention_head_size)

        query_layer = self.query(hidden_states).view(*new_shape).transpose(1, 2)
        key_layer = self.key(hidden_states).view(*new_shape).transpose(1, 2)
        value_layer = self.value(hidden_states).view(*new_shape).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        context_layer, attention_probs = attention_interface(
            self,
            query_layer,
            key_layer,
            value_layer,
            head_mask,
            is_causal=self.is_causal,
            scaling=self.scaling,
            dropout=0.0 if not self.training else self.dropout_prob,
        )

        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.reshape(new_context_layer_shape)

        return context_layer, attention_probs
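

# Illustrative sketch, not part of the original module: `eager_attention_forward`
# computes softmax(Q @ K^T * scaling) @ V with scaling = head_dim ** -0.5 and, without
# dropout or masking, agrees with PyTorch's fused SDPA kernel up to numerical tolerance.
# The helper name below is hypothetical and never called by the library.
def _demo_eager_attention():
    q, k, v = (torch.randn(1, 2, 5, 8) for _ in range(3))  # (batch, heads, seq, head_dim)
    module = nn.Identity()  # any module works; only `.training` is read by the dropout call
    module.eval()
    output, weights = eager_attention_forward(module, q, k, v, None, scaling=8**-0.5)
    reference = nn.functional.scaled_dot_product_attention(q, k, v)
    # the eager output is transposed to (batch, seq, heads, head_dim); undo that to compare
    assert torch.allclose(output.transpose(1, 2), reference, atol=1e-5)
    assert weights.shape == (1, 2, 5, 5)  # one seq-by-seq attention map per head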


class ViTSelfOutput(nn.Module):
    """
    The residual connection is defined in ViTLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class ViTAttention(nn.Module):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        self.attention = ViTSelfAttention(config)
        self.output = ViTSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads: set[int]) -> None:
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        self_attn_output, _ = self.attention(hidden_states, head_mask)
        output = self.output(self_attn_output, hidden_states)
        return output


class ViTIntermediate(nn.Module):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class ViTOutput(nn.Module):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = hidden_states + input_tensor
        return hidden_states


class ViTLayer(GradientCheckpointingLayer):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = ViTAttention(config)
        self.intermediate = ViTIntermediate(config)
        self.output = ViTOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        # in ViT, layernorm is applied before self-attention
        hidden_states_norm = self.layernorm_before(hidden_states)
        attention_output = self.attention(hidden_states_norm, head_mask)

        # first residual connection
        hidden_states = attention_output + hidden_states

        # in ViT, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)

        # second residual connection is done here
        layer_output = self.output(layer_output, hidden_states)

        return layer_output


class ViTEncoder(nn.Module):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([ViTLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
    ) -> BaseModelOutput:
        for i, layer_module in enumerate(self.layer):
            layer_head_mask = head_mask[i] if head_mask is not None else None
            hidden_states = layer_module(hidden_states, layer_head_mask)
        return BaseModelOutput(last_hidden_state=hidden_states)


@auto_docstring
class ViTPreTrainedModel(PreTrainedModel):
    config: ViTConfig
    base_model_prefix = "vit"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["ViTEmbeddings", "ViTLayer"]
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True
    _supports_attention_backend = True
    _can_record_outputs = {
        "hidden_states": ViTLayer,
        "attentions": ViTSelfAttention,
    }

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # Upcast to fp32 for `trunc_normal_`, then cast back to the original dtype.
            module.weight.data = nn.init.trunc_normal_(
                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
            ).to(module.weight.dtype)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, ViTEmbeddings):
            module.position_embeddings.data = nn.init.trunc_normal_(
                module.position_embeddings.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            ).to(module.position_embeddings.dtype)
            module.cls_token.data = nn.init.trunc_normal_(
                module.cls_token.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            ).to(module.cls_token.dtype)


@auto_docstring
class ViTModel(ViTPreTrainedModel):
    def __init__(self, config: ViTConfig, add_pooling_layer: bool = True, use_mask_token: bool = False):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        """
        super().__init__(config)
        self.config = config

        self.embeddings = ViTEmbeddings(config, use_mask_token=use_mask_token)
        self.encoder = ViTEncoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pooler = ViTPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> ViTPatchEmbeddings:
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        interpolate_pos_encoding: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Prepare head mask if needed; 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        # TODO: maybe have a cleaner way to cast the input (from `ImageProcessor` side?)
        expected_dtype = self.embeddings.patch_embeddings.projection.weight.dtype
        if pixel_values.dtype != expected_dtype:
            pixel_values = pixel_values.to(expected_dtype)

        embedding_output = self.embeddings(
            pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding
        )

        encoder_outputs: BaseModelOutput = self.encoder(embedding_output, head_mask=head_mask)

        sequence_output = encoder_outputs.last_hidden_state
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        return BaseModelOutputWithPooling(last_hidden_state=sequence_output, pooler_output=pooled_output)


class ViTPooler(nn.Module):
    def __init__(self, config: ViTConfig):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.pooler_output_size)
        self.activation = ACT2FN[config.pooler_act]

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


@auto_docstring(
    custom_intro="""
    ViT Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://huggingface.co/papers/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    """
)
class ViTForMaskedImageModeling(ViTPreTrainedModel):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__(config)

        self.vit = ViTModel(config, add_pooling_layer=False, use_mask_token=True)

        self.decoder = nn.Sequential(
            nn.Conv2d(
                in_channels=config.hidden_size,
                out_channels=config.encoder_stride**2 * config.num_channels,
                kernel_size=1,
            ),
            nn.PixelShuffle(config.encoder_stride),
        )

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        interpolate_pos_encoding: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> MaskedImageModelingOutput:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, ViTForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
        >>> model = ViTForMaskedImageModeling.from_pretrained("google/vit-base-patch16-224-in21k")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 224, 224]
        ```"""
        if bool_masked_pos is not None and (self.config.patch_size != self.config.encoder_stride):
            raise ValueError(
                "When `bool_masked_pos` is provided, `patch_size` must be equal to `encoder_stride` to ensure that "
                "the reconstructed image has the same dimensions as the input. "
                f"Got `patch_size` = {self.config.patch_size} and `encoder_stride` = {self.config.encoder_stride}."
            )

        outputs = self.vit(
            pixel_values,
            bool_masked_pos=bool_masked_pos,
            head_mask=head_mask,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )

        sequence_output = outputs.last_hidden_state

        # Reshape to (batch_size, num_channels, height, width)
        sequence_output = sequence_output[:, 1:]
        batch_size, sequence_length, num_channels = sequence_output.shape
        height = width = math.floor(sequence_length**0.5)
        sequence_output = sequence_output.permute(0, 2, 1).reshape(batch_size, num_channels, height, width)

        # Reconstruct pixel values
        reconstructed_pixel_values = self.decoder(sequence_output)

        masked_im_loss = None
        if bool_masked_pos is not None:
            size = self.config.image_size // self.config.patch_size
            bool_masked_pos = bool_masked_pos.reshape(-1, size, size)
            mask = (
                bool_masked_pos.repeat_interleave(self.config.patch_size, 1)
                .repeat_interleave(self.config.patch_size, 2)
                .unsqueeze(1)
                .contiguous()
            )
            reconstruction_loss = nn.functional.l1_loss(pixel_values, reconstructed_pixel_values, reduction="none")
            masked_im_loss = (reconstruction_loss * mask).sum() / (mask.sum() + 1e-5) / self.config.num_channels

        return MaskedImageModelingOutput(
            loss=masked_im_loss,
            reconstruction=reconstructed_pixel_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
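

# Illustrative sketch, not part of the original module: the SimMIM-style decoder maps the
# (batch, hidden_size, h, w) patch grid back to pixel space with a 1x1 convolution and
# PixelShuffle. Numbers assume ViT-Base (hidden_size=768, encoder_stride=16, 14x14 grid);
# the helper name is hypothetical and never called by the library.
def _demo_simmim_decoder_shape():
    decoder = nn.Sequential(
        nn.Conv2d(in_channels=768, out_channels=16**2 * 3, kernel_size=1),  # -> 768 channels
        nn.PixelShuffle(16),  # (1, 768, 14, 14) -> (1, 3, 224, 224)
    )
    features = torch.randn(1, 768, 14, 14)  # encoder output reshaped to the patch grid
    assert decoder(features).shape == (1, 3, 224, 224)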


@auto_docstring(
    custom_intro="""
    ViT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune ViT on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    """
)
class ViTForImageClassification(ViTPreTrainedModel):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.vit = ViTModel(config, add_pooling_layer=False)

        # Classifier head
        self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        interpolate_pos_encoding: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> ImageClassifierOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        outputs = self.vit(
            pixel_values,
            head_mask=head_mask,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )

        sequence_output = outputs.last_hidden_state

        logits = self.classifier(sequence_output[:, 0, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(labels, logits, self.config, **kwargs)

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["ViTForImageClassification", "ViTForMaskedImageModeling", "ViTModel", "ViTPreTrainedModel"]