
    cCi-p                     (   S r SSKrSSKJrJrJr  SSKrSSKJr  SSK	J
r
  SSKJr  SSKJrJrJrJr  SS	KJrJr  SS
KJr  SSKJrJr  SSKJrJrJrJr  SSKJ r   SSK!J"r"J#r#  SSK$J%r%  \RL                  " \'5      r( " S S\RR                  5      r* " S S\RR                  5      r+ S@S\RR                  S\RX                  S\RX                  S\RX                  S\\RX                     S\-S\-4S jjr. " S S\RR                  5      r/ " S S \RR                  5      r0 " S! S"\RR                  5      r1 " S# S$\RR                  5      r2SAS%\RX                  S&\-S'\3S(\RX                  4S) jjr4 " S* S+\RR                  5      r5 " S, S-\RR                  5      r6 " S. S/\RR                  5      r7 " S0 S1\5      r8 " S2 S3\RR                  5      r9\ " S4 S5\5      5       r:\ " S6 S7\:5      5       r;\" S8S99 " S: S;\:5      5       r<\" S<S99 " S= S>\:\ 5      5       r=/ S?Qr>g)BzPyTorch DINOv2 model.    N)CallableOptionalUnion)nn   )ACT2FN)GradientCheckpointingLayer)BackboneOutputBaseModelOutputBaseModelOutputWithPoolingImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack) find_pruneable_heads_and_indicesprune_linear_layer)TransformersKwargsauto_docstringlogging	torch_int)BackboneMixin)can_return_tuplecheck_model_inputs   )Dinov2Configc                      ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\	S	\	S\R                  4S
 jr
SS\R                  S\\R                     S\R                  4S jjrSrU =r$ )Dinov2Embeddings&   zE
    """
    Construct the CLS token, mask token, position and patch embeddings.
    """

    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
        self.use_mask_token = config.use_mask_token
        if config.use_mask_token:
            self.mask_token = nn.Parameter(torch.zeros(1, config.hidden_size))
        self.patch_embeddings = Dinov2PatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches
        self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size))
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.patch_size = config.patch_size
        self.config = config

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows interpolating the pre-trained position encodings so that the model can be used on higher
        resolution images. It is also adapted to support torch.jit tracing and interpolation at torch.float32
        precision.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1

        # always interpolate when tracing so the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        class_pos_embed = self.position_embeddings[:, :1]
        patch_pos_embed = self.position_embeddings[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
        target_dtype = patch_pos_embed.dtype
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed.to(dtype=torch.float32),
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        ).to(dtype=target_dtype)

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        target_dtype = self.patch_embeddings.projection.weight.dtype
        embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype))

        if bool_masked_pos is not None and self.use_mask_token:
            embeddings = torch.where(
                bool_masked_pos.unsqueeze(-1), self.mask_token.to(embeddings.dtype).unsqueeze(0), embeddings
            )

        # add the [CLS] token to the embedded patch tokens
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        # add positional encoding to each token
        embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)

        embeddings = self.dropout(embeddings)

        return embeddings
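

# A minimal shape sketch of the embedding pipeline above, assuming a config with
# hidden_size=768, patch_size=14, image_size=518 (the facebook/dinov2-base
# checkpoint values); illustrative only, not part of the module API:
#
#     config = Dinov2Config(image_size=518, patch_size=14)
#     embeddings = Dinov2Embeddings(config)
#     pixel_values = torch.randn(2, 3, 518, 518)  # (batch, channels, height, width)
#     out = embeddings(pixel_values)
#     # (518 // 14) ** 2 = 1369 patch tokens, plus one [CLS] token
#     assert out.shape == (2, 1370, config.hidden_size)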


class Dinov2PatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        num_channels = pixel_values.shape[1]
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values matches the one set in the configuration."
                f" Expected {self.num_channels} but got {num_channels}."
            )
        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    # scaled dot product between "query" and "key" gives the raw attention scores
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling

    # normalize the attention scores to probabilities, in float32 for stability
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)

    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    # mask heads if we want to
    if attention_mask is not None:
        attn_weights = attn_weights * attention_mask

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class Dinov2SelfAttention(nn.Module):
    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.config = config
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.dropout_prob = config.attention_probs_dropout_prob
        self.scaling = self.attention_head_size**-0.5
        self.is_causal = False

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

    def forward(
        self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None
    ) -> tuple[torch.Tensor, torch.Tensor]:
        batch_size = hidden_states.shape[0]
        new_shape = batch_size, -1, self.num_attention_heads, self.attention_head_size
        query_layer = self.query(hidden_states).view(*new_shape).transpose(1, 2)
        key_layer = self.key(hidden_states).view(*new_shape).transpose(1, 2)
        value_layer = self.value(hidden_states).view(*new_shape).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        context_layer, attention_probs = attention_interface(
            self,
            query_layer,
            key_layer,
            value_layer,
            head_mask,
            is_causal=self.is_causal,
            scaling=self.scaling,
            dropout=0.0 if not self.training else self.dropout_prob,
        )

        new_context_layer_shape = context_layer.size()[:-1] + (self.all_head_size,)
        context_layer = context_layer.reshape(new_context_layer_shape)

        return context_layer, attention_probs
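

# A rough shape walk-through of Dinov2SelfAttention above, assuming hidden_size=768
# and num_attention_heads=12 (so attention_head_size=64); illustrative only:
#
#     attn = Dinov2SelfAttention(config)
#     hidden_states = torch.randn(2, 1370, 768)
#     context, probs = attn(hidden_states)
#     # query/key/value are viewed as (2, 12, 1370, 64) before the scaled dot
#     # product; the context is flattened back to (batch, seq_len, hidden_size)
#     assert context.shape == (2, 1370, 768)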


class Dinov2SelfOutput(nn.Module):
    """
    The residual connection is defined in Dinov2Layer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class Dinov2Attention(nn.Module):
    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        self.attention = Dinov2SelfAttention(config)
        self.output = Dinov2SelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads: set[int]) -> None:
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        self_attn_output, _ = self.attention(hidden_states, head_mask)
        attention_output = self.output(self_attn_output, hidden_states)
        return attention_output


class Dinov2LayerScale(nn.Module):
    def __init__(self, config) -> None:
        super().__init__()
        self.lambda1 = nn.Parameter(config.layerscale_value * torch.ones(config.hidden_size))

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        return hidden_state * self.lambda1


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output


class Dinov2DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return f"p={self.drop_prob}"


class Dinov2MLP(nn.Module):
    def __init__(self, config) -> None:
        super().__init__()
        in_features = out_features = config.hidden_size
        hidden_features = int(config.hidden_size * config.mlp_ratio)
        self.fc1 = nn.Linear(in_features, hidden_features, bias=True)
        if isinstance(config.hidden_act, str):
            self.activation = ACT2FN[config.hidden_act]
        else:
            self.activation = config.hidden_act
        self.fc2 = nn.Linear(hidden_features, out_features, bias=True)

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        hidden_state = self.fc1(hidden_state)
        hidden_state = self.activation(hidden_state)
        hidden_state = self.fc2(hidden_state)
        return hidden_state


class Dinov2SwiGLUFFN(nn.Module):
    def __init__(self, config) -> None:
        super().__init__()
        in_features = out_features = config.hidden_size
        hidden_features = int(config.hidden_size * config.mlp_ratio)
        # the SwiGLU hidden width is 2/3 of the MLP width, rounded up to a multiple of 8
        hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8

        self.weights_in = nn.Linear(in_features, 2 * hidden_features, bias=True)
        self.weights_out = nn.Linear(hidden_features, out_features, bias=True)

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        hidden_state = self.weights_in(hidden_state)
        x1, x2 = hidden_state.chunk(2, dim=-1)
        hidden = nn.functional.silu(x1) * x2
        return self.weights_out(hidden)


class Dinov2Layer(GradientCheckpointingLayer):
    """This corresponds to the Block class in the original implementation."""

    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()

        self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.attention = Dinov2Attention(config)
        self.layer_scale1 = Dinov2LayerScale(config)
        self.drop_path = Dinov2DropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()

        self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        if config.use_swiglu_ffn:
            self.mlp = Dinov2SwiGLUFFN(config)
        else:
            self.mlp = Dinov2MLP(config)
        self.layer_scale2 = Dinov2LayerScale(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        hidden_states_norm = self.norm1(hidden_states)
        self_attention_output = self.attention(hidden_states_norm, head_mask)

        # first residual connection
        self_attention_output = self.layer_scale1(self_attention_output)
        hidden_states = self.drop_path(self_attention_output) + hidden_states

        # in Dinov2, layernorm is also applied after self-attention
        layer_output = self.norm2(hidden_states)
        layer_output = self.mlp(layer_output)
        layer_output = self.layer_scale2(layer_output)

        # second residual connection
        layer_output = self.drop_path(layer_output) + hidden_states

        return layer_output


class Dinov2Encoder(nn.Module):
    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([Dinov2Layer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_hidden_states: bool = False,
    ) -> BaseModelOutput:
        all_hidden_states = [hidden_states] if output_hidden_states else None

        for i, layer_module in enumerate(self.layer):
            layer_head_mask = head_mask[i] if head_mask is not None else None
            hidden_states = layer_module(hidden_states, layer_head_mask)
            if output_hidden_states:
                all_hidden_states.append(hidden_states)

        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=tuple(all_hidden_states) if output_hidden_states else None,
        )


@auto_docstring
class Dinov2PreTrainedModel(PreTrainedModel):
    config: Dinov2Config
    base_model_prefix = "dinov2"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Dinov2Layer"]
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True
    _supports_attention_backend = True
    _can_record_outputs = {
        "hidden_states": Dinov2Layer,
        "attentions": Dinov2SelfAttention,
    }

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # upcast the input to `fp32` and cast back to the desired `dtype`, since
            # `trunc_normal_` is not implemented in `half` precision
            module.weight.data = nn.init.trunc_normal_(
                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
            ).to(module.weight.dtype)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, Dinov2Embeddings):
            module.position_embeddings.data = nn.init.trunc_normal_(
                module.position_embeddings.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            ).to(module.position_embeddings.dtype)
            module.cls_token.data = nn.init.trunc_normal_(
                module.cls_token.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            ).to(module.cls_token.dtype)
            if self.config.use_mask_token:
                module.mask_token.data.zero_()
        elif isinstance(module, Dinov2LayerScale):
            module.lambda1.data.fill_(self.config.layerscale_value)


@auto_docstring
class Dinov2Model(Dinov2PreTrainedModel):
    def __init__(self, config: Dinov2Config):
        super().__init__(config)
        self.config = config

        self.embeddings = Dinov2Embeddings(config)
        self.encoder = Dinov2Encoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> Dinov2PatchEmbeddings:
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See base
        class `PreTrainedModel`.
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for
            pre-training.
        """
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)

        encoder_outputs = self.encoder(
            embedding_output, head_mask=head_mask, output_hidden_states=output_hidden_states
        )
        sequence_output = encoder_outputs.last_hidden_state
        sequence_output = self.layernorm(sequence_output)
        pooled_output = sequence_output[:, 0, :]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
        )


@auto_docstring(
    custom_intro="""
    Dinov2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state
    of the [CLS] token) e.g. for ImageNet.
    """
)
class Dinov2ForImageClassification(Dinov2PreTrainedModel):
    def __init__(self, config: Dinov2Config) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.dinov2 = Dinov2Model(config)

        # Classifier head
        self.classifier = (
            nn.Linear(config.hidden_size * 2, config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> ImageClassifierOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        outputs: BaseModelOutputWithPooling = self.dinov2(pixel_values, head_mask=head_mask, **kwargs)

        sequence_output = outputs.last_hidden_state  # batch_size, sequence_length, hidden_size

        cls_token = sequence_output[:, 0]
        patch_tokens = sequence_output[:, 1:]

        # the classifier sees the [CLS] token concatenated with the mean of the patch tokens
        linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1)

        logits = self.classifier(linear_input)

        loss = None
        if labels is not None:
            loss = self.loss_function(labels, logits, self.config, **kwargs)

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    Dinov2 backbone, to be used with frameworks like DETR and MaskFormer.
    """
)
class Dinov2Backbone(Dinov2PreTrainedModel, BackboneMixin):
    def __init__(self, config):
        super().__init__(config)
        super()._init_backbone(config)

        self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]
        self.embeddings = Dinov2Embeddings(config)
        self.encoder = Dinov2Encoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> Dinov2PatchEmbeddings:
        return self.embeddings.patch_embeddings

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
    ) -> BackboneOutput:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
        >>> model = AutoBackbone.from_pretrained(
        ...     "facebook/dinov2-base", out_features=["stage2", "stage5", "stage8", "stage11"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 16, 16]
        ```"""
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states

        embedding_output = self.embeddings(pixel_values)

        outputs = self.encoder(embedding_output, output_hidden_states=True)

        hidden_states = outputs.hidden_states

        feature_maps = []
        for stage, hidden_state in zip(self.stage_names, hidden_states):
            if stage in self.out_features:
                if self.config.apply_layernorm:
                    hidden_state = self.layernorm(hidden_state)
                if self.config.reshape_hidden_states:
                    hidden_state = hidden_state[:, 1:]
                    # this was actually a bug in the original implementation that we copied here,
                    # cause normally the order is height, width
                    batch_size, _, height, width = pixel_values.shape
                    patch_size = self.config.patch_size
                    hidden_state = hidden_state.reshape(batch_size, height // patch_size, width // patch_size, -1)
                    hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
                feature_maps.append(hidden_state)

        return BackboneOutput(
            feature_maps=tuple(feature_maps),
            hidden_states=hidden_states if output_hidden_states else None,
        )


__all__ = ["Dinov2ForImageClassification", "Dinov2Model", "Dinov2PreTrainedModel", "Dinov2Backbone"]