
    cCi              	          S r SSKrSSKJrJr  SSKrSSKJr  SSKJr  SSK	J
r
  SSKJr  SS	KJrJrJrJr  SS
KJr  SSKJrJr  SSKJrJrJr  SSKJr  \R:                  " \5      rS>S\ S\ S\\    S\ 4S jjr! " S S\RD                  5      r# " S S\RD                  5      r$ " S S\RD                  5      r% " S S\RD                  5      r& " S S\RD                  5      r' " S S\RD                  5      r( " S  S!\RD                  5      r) " S" S#\RD                  5      r* " S$ S%\RD                  5      r+ " S& S'\RD                  5      r, " S( S)\5      r- " S* S+\RD                  5      r.\ " S, S-\5      5       r/\ " S. S/\/5      5       r0\" S0S19 " S2 S3\/5      5       r1 " S4 S5\RD                  5      r2 " S6 S7\RD                  5      r3 " S8 S9\RD                  5      r4\" S:S19 " S; S<\/5      5       r5/ S=Qr6g)?zPyTorch MobileViT model.    N)OptionalUnion)nn)CrossEntropyLoss   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttentionSemanticSegmenterOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging	torch_int   )MobileViTConfigvaluedivisor	min_valuereturnc                 |    Uc  Un[        U[        XS-  -   5      U-  U-  5      nUSU -  :  a  X1-  n[        U5      $ )z
Ensure that all layers have a channel count that is divisible by `divisor`. This function is taken from the
original TensorFlow repo. It can be seen here:
https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
   g?)maxint)r   r   r   	new_values       j/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/mobilevit/modeling_mobilevit.pymake_divisibler    +   sO     	Is5Q;#677BWLMI3;	y>    c                      ^  \ rS rSr      SS\S\S\S\S\S\S\S	\S
\S\\\4   SS4U 4S jjjr	S\
R                  S\
R                  4S jrSrU =r$ )MobileViTConvLayer:   configin_channelsout_channelskernel_sizestridegroupsbiasdilationuse_normalizationuse_activationr   Nc                 D  > [         TU ]  5         [        US-
  S-  5      U-  nX&-  S:w  a  [        SU SU S35      eX6-  S:w  a  [        SU SU S35      e[        R
                  " UUUUUUUUSS	9	U l        U	(       a  [        R                  " US
SSSS9U l        OS U l        U
(       an  [        U
[        5      (       a  [        U
   U l        g [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g S U l        g )Nr   r   r   zInput channels (z) are not divisible by z groups.zOutput channels (zeros)	r&   r'   r(   r)   paddingr,   r*   r+   padding_modegh㈵>g?T)num_featuresepsmomentumaffinetrack_running_stats)super__init__r   
ValueErrorr   Conv2dconvolutionBatchNorm2dnormalization
isinstancestrr   
activation
hidden_act)selfr%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r1   	__class__s               r   r9   MobileViTConvLayer.__init__;   s,    	{Q!+,x71$/}<STZS[[cdee A%0>UV\U]]efgg99#%# 

 !#)$("D "&D.#.."("8F--s33"():):";"("3"3"DOr!   featuresc                     U R                  U5      nU R                  b  U R                  U5      nU R                  b  U R                  U5      nU$ N)r<   r>   rA   )rC   rF   s     r   forwardMobileViTConvLayer.forwardq   sK    ##H-)))(3H??&x0Hr!   )rA   r<   r>   )r   r   Fr   TT)__name__
__module____qualname____firstlineno__r   r   boolr   r@   r9   torchTensorrI   __static_attributes____classcell__rD   s   @r   r#   r#   :   s     "&+/4#4# 4# 	4#
 4# 4# 4# 4# 4#  4# dCi(4# 
4# 4#l   r!   r#   c                      ^  \ rS rSrSr SS\S\S\S\S\SS	4U 4S
 jjjrS\R                  S\R                  4S jr
SrU =r$ )MobileViTInvertedResidualz   zQ
Inverted residual block (MobileNetv2): https://huggingface.co/papers/1801.04381
r%   r&   r'   r)   r,   r   Nc           
      6  > [         TU ]  5         [        [        [	        X!R
                  -  5      5      S5      nUS;  a  [        SU S35      eUS:H  =(       a    X#:H  U l        [        XUSS9U l	        [        UUUSUUUS9U l
        [        UUUSS	S
9U l        g )N   )r   r   zInvalid stride .r   r&   r'   r(   r   )r&   r'   r(   r)   r*   r,   Fr&   r'   r(   r.   )r8   r9   r    r   roundexpand_ratior:   use_residualr#   
expand_1x1conv_3x3
reduce_1x1)rC   r%   r&   r'   r)   r,   expanded_channelsrD   s          r   r9   "MobileViTInvertedResidual.__init__   s     	*3u[CVCV5V/W+XZ[\vha899#q[K{/J,:KYZ
 +)*$
 -)% 
r!   rF   c                     UnU R                  U5      nU R                  U5      nU R                  U5      nU R                  (       a  X!-   $ U$ rH   )r`   ra   rb   r_   )rC   rF   residuals      r   rI   !MobileViTInvertedResidual.forward   sG    ??8,==*??8,&*&7&7x"EXEr!   )ra   r`   rb   r_   r   )rK   rL   rM   rN   __doc__r   r   r9   rP   rQ   rI   rR   rS   rT   s   @r   rV   rV   z   sn    
 jk
%
47
GJ
TW
cf
	
 
BF F F Fr!   rV   c                      ^  \ rS rSr SS\S\S\S\S\SS4U 4S	 jjjrS
\R                  S\R                  4S jr	Sr
U =r$ )MobileViTMobileNetLayer   r%   r&   r'   r)   
num_stagesr   Nc                    > [         TU ]  5         [        R                  " 5       U l        [        U5       H4  n[        UUUUS:X  a  UOSS9nU R                  R                  U5        UnM6     g )Nr   r   )r&   r'   r)   )r8   r9   r   
ModuleListlayerrangerV   append)	rC   r%   r&   r'   r)   rm   irp   rD   s	           r   r9    MobileViTMobileNetLayer.__init__   sc     	]]_
z"A-')!"avQ	E JJe$&K #r!   rF   c                 <    U R                    H  nU" U5      nM     U$ rH   rp   )rC   rF   layer_modules      r   rI   MobileViTMobileNetLayer.forward   s     JJL#H-H 'r!   rv   )r   r   rK   rL   rM   rN   r   r   r9   rP   rQ   rI   rR   rS   rT   s   @r   rk   rk      s`    op'%'47'GJ'TW'il'	' '    r!   rk   c                   r   ^  \ rS rSrS\S\SS4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
MobileViTSelfAttention   r%   hidden_sizer   Nc                 r  > [         TU ]  5         X!R                  -  S:w  a  [        SU SUR                   S35      eUR                  U l        [	        X!R                  -  5      U l        U R                  U R
                  -  U l        [        R                  " X R                  UR                  S9U l
        [        R                  " X R                  UR                  S9U l        [        R                  " X R                  UR                  S9U l        [        R                  " UR                  5      U l        g )Nr   zThe hidden size z4 is not a multiple of the number of attention heads rZ   )r+   )r8   r9   num_attention_headsr:   r   attention_head_sizeall_head_sizer   Linearqkv_biasquerykeyr   Dropoutattention_probs_dropout_probdropoutrC   r%   r}   rD   s      r   r9   MobileViTSelfAttention.__init__   s    333q8";- 0334A7 
 $*#=#= #&{5O5O'O#P !558P8PPYY{,>,>V__U
99[*<*<6??SYY{,>,>V__U
zz&"E"EFr!   hidden_statesc                    UR                   u  p#nU R                  U5      R                  USU R                  U R                  5      R                  SS5      nU R                  U5      R                  USU R                  U R                  5      R                  SS5      nU R                  U5      R                  USU R                  U R                  5      R                  SS5      n[        R                  " XVR                  SS5      5      nU[        R                  " U R                  5      -  n[        R                  R                  USS9n	U R                  U	5      n	[        R                  " X5      n
U
R!                  SSSS5      R#                  5       n
U
R%                  5       S S U R&                  4-   nU
R                  " U6 n
U
$ )Nr   r   dimr   r   )shaper   viewr   r   	transposer   r   rP   matmulmathsqrtr   
functionalsoftmaxr   permute
contiguoussizer   )rC   r   
batch_size
seq_length_query_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapes               r   rI   MobileViTSelfAttention.forward   s   $1$7$7!
JJ}%T*b$":":D<T<TUYq!_ 	 HH]#T*b$":":D<T<TUYq!_ 	 JJ}%T*b$":":D<T<TUYq!_ 	 !<<5H5HR5PQ+dii8P8P.QQ --//0@b/I ,,7_B%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CDr!   )r   r   r   r   r   r   r   ry   rT   s   @r   r{   r{      sA    G GS GT G&"U\\ "ell " "r!   r{   c                   r   ^  \ rS rSrS\S\SS4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
MobileViTSelfOutput   r%   r}   r   Nc                    > [         TU ]  5         [        R                  " X"5      U l        [        R
                  " UR                  5      U l        g rH   r8   r9   r   r   denser   hidden_dropout_probr   r   s      r   r9   MobileViTSelfOutput.__init__   s4    YY{8
zz&"<"<=r!   r   c                 J    U R                  U5      nU R                  U5      nU$ rH   r   r   rC   r   s     r   rI   MobileViTSelfOutput.forward   s$    

=1]3r!   r   ry   rT   s   @r   r   r      s=    > >S >T >
U\\ ell  r!   r   c                      ^  \ rS rSrS\S\SS4U 4S jjrS\\   SS4S jrS	\	R                  S\	R                  4S
 jrSrU =r$ )MobileViTAttentioni  r%   r}   r   Nc                    > [         TU ]  5         [        X5      U l        [	        X5      U l        [        5       U l        g rH   )r8   r9   r{   	attentionr   outputsetpruned_headsr   s      r   r9   MobileViTAttention.__init__  s0    /D)&>Er!   headsc                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   r   )lenr   r   r   r   r   r   r   r   r   r   r   r   union)rC   r   indexs      r   prune_headsMobileViTAttention.prune_heads  s   u:?7>>55t~~7Y7Y[_[l[l

  2$..2F2FN/0B0BEJ1$..2F2FN.t{{/@/@%QO .2^^-O-ORUV[R\-\*'+~~'I'IDNNLnLn'n$ --33E:r!   r   c                 J    U R                  U5      nU R                  U5      nU$ rH   )r   r   )rC   r   self_outputsattention_outputs       r   rI   MobileViTAttention.forward  s%    ~~m4;;|4r!   )r   r   r   )rK   rL   rM   rN   r   r   r9   r   r   rP   rQ   rI   rR   rS   rT   s   @r   r   r     sT    " "S "T ";S ;d ;$ U\\  ell    r!   r   c                   v   ^  \ rS rSrS\S\S\SS4U 4S jjrS\R                  S\R                  4S	 jr	S
r
U =r$ )MobileViTIntermediatei%  r%   r}   intermediate_sizer   Nc                    > [         TU ]  5         [        R                  " X#5      U l        [        UR                  [        5      (       a  [        UR                     U l	        g UR                  U l	        g rH   )
r8   r9   r   r   r   r?   rB   r@   r   intermediate_act_fnrC   r%   r}   r   rD   s       r   r9   MobileViTIntermediate.__init__&  sR    YY{>
f''--'-f.?.?'@D$'-'8'8D$r!   r   c                 J    U R                  U5      nU R                  U5      nU$ rH   r   r   r   s     r   rI   MobileViTIntermediate.forward.  s&    

=100?r!   r   ry   rT   s   @r   r   r   %  sF    9 9S 9UX 9]a 9U\\ ell  r!   r   c                      ^  \ rS rSrS\S\S\SS4U 4S jjrS\R                  S	\R                  S\R                  4S
 jr	Sr
U =r$ )MobileViTOutputi4  r%   r}   r   r   Nc                    > [         TU ]  5         [        R                  " X25      U l        [        R
                  " UR                  5      U l        g rH   r   r   s       r   r9   MobileViTOutput.__init__5  s5    YY0>
zz&"<"<=r!   r   input_tensorc                 R    U R                  U5      nU R                  U5      nX-   nU$ rH   r   )rC   r   r   s      r   rI   MobileViTOutput.forward:  s,    

=1]3%4r!   r   ry   rT   s   @r   r   r   4  sT    > >S >UX >]a >
U\\  RWR^R^  r!   r   c                   v   ^  \ rS rSrS\S\S\SS4U 4S jjrS\R                  S\R                  4S	 jr	S
r
U =r$ )MobileViTTransformerLayeriA  r%   r}   r   r   Nc                   > [         TU ]  5         [        X5      U l        [	        XU5      U l        [        XU5      U l        [        R                  " X!R                  S9U l        [        R                  " X!R                  S9U l        g )Nr4   )r8   r9   r   r   r   intermediater   r   r   	LayerNormlayer_norm_epslayernorm_beforelayernorm_afterr   s       r   r9   "MobileViTTransformerLayer.__init__B  sg    +F@1&GXY%f;LM "[>S>S T!||K=R=RSr!   r   c                     U R                  U R                  U5      5      nX!-   nU R                  U5      nU R                  U5      nU R	                  X15      nU$ rH   )r   r   r   r   r   )rC   r   r   layer_outputs       r   rI   !MobileViTTransformerLayer.forwardJ  sX    >>$*?*?*NO(8++M:((6{{<?r!   )r   r   r   r   r   ry   rT   s   @r   r   r   A  sK    T TS TUX T]a TU\\ ell  r!   r   c                   v   ^  \ rS rSrS\S\S\SS4U 4S jjrS\R                  S\R                  4S	 jr	S
r
U =r$ )MobileViTTransformeriT  r%   r}   rm   r   Nc           	         > [         TU ]  5         [        R                  " 5       U l        [        U5       H>  n[        UU[        X!R                  -  5      S9nU R                  R                  U5        M@     g )N)r}   r   )
r8   r9   r   ro   rp   rq   r   r   	mlp_ratiorr   )rC   r%   r}   rm   r   transformer_layerrD   s         r   r9   MobileViTTransformer.__init__U  sa    ]]_
z"A 9'"%k4D4D&D"E!
 JJ/0 #r!   r   c                 <    U R                    H  nU" U5      nM     U$ rH   rv   )rC   r   rw   s      r   rI   MobileViTTransformer.forwarda  s      JJL(7M 'r!   rv   ry   rT   s   @r   r   r   T  sE    
1 
1S 
1c 
1VZ 
1U\\ ell  r!   r   c                     ^  \ rS rSrSr SS\S\S\S\S\S\S	\S
S4U 4S jjjrS\R                  S
\
\R                  \4   4S jrS\R                  S\S
\R                  4S jrS\R                  S
\R                  4S jrSrU =r$ )MobileViTLayerig  z;
MobileViT block: https://huggingface.co/papers/2110.02178
r%   r&   r'   r)   r}   rm   r,   r   Nc           	        > [         TU ]  5         UR                  U l        UR                  U l        US:X  a(  [        UUUUS:X  a  UOSUS:  a  US-  OSS9U l        UnOS U l        [        UUUUR                  S9U l	        [        UUUSSSS9U l
        [        UUUS9U l        [        R                  " XQR                  S9U l        [        XUSS9U l        [        USU-  X!R                  S9U l        g )	Nr   r   )r&   r'   r)   r,   r[   F)r&   r'   r(   r-   r.   )r}   rm   r   )r8   r9   
patch_sizepatch_widthpatch_heightrV   downsampling_layerr#   conv_kernel_sizeconv_kxkconv_1x1r   transformerr   r   r   	layernormconv_projectionfusion)	rC   r%   r&   r'   r)   r}   rm   r,   rD   s	           r   r9   MobileViTLayer.__init__l  s    	!,,"--Q;&?')!)QvA*2Q,QA'D# 'K&*D#*#$//	
 +#$# 
 0#!
 k7L7LM1+ST 
 )KkWnWn
r!   rF   c                 n   U R                   U R                  p2[        X#-  5      nUR                  u  pVpx[        R
                  R                  5       (       a$  [        [        R                  " Xs-  5      U-  5      O#[        [        R                  " Xs-  5      U-  5      n	[        R
                  R                  5       (       a$  [        [        R                  " X-  5      U-  5      O#[        [        R                  " X-  5      U-  5      n
SnX:w  d  X:w  a#  [        R                  R                  XU
4SSS9nSnX-  nX-  nX-  nUR                  XV-  U-  X<U5      nUR                  SS5      nUR                  XVX5      nUR                  SS5      nUR                  XT-  US5      nXx4UUUUUUS	.nUU4$ )
NFbilinearr   modealign_cornersTr   r   r   r   )	orig_sizer   channelsinterpolatenum_patchesnum_patches_widthnum_patches_height)r   r   r   r   rP   jit
is_tracingr   ceilr   r   r   r  reshaper   )rC   rF   r   r   
patch_arear   r  orig_height
orig_width
new_height	new_widthr  num_patch_widthnum_patch_heightr  patches	info_dicts                    r   	unfoldingMobileViTLayer.unfolding  s   $($4$4d6G6G\34
8@5
k yy##%% ejj!;<|KLTYY{9:\IJ 	 yy##%% ejj!9:[HITYYz78;FG 	 "j&?}}00I6ZW\ 1 H K $2%5&8 ""!$44lU`
 ##Aq)//*P##Aq)//*"9;K &2$ &&!0"2
	 	!!r!   r  r  c                    U R                   U R                  pC[        X4-  5      nUS   nUS   nUS   nUS   n	US   n
UR                  5       R	                  XeUS5      nUR                  SS5      nUR                  Xg-  U	-  XU5      nUR                  SS	5      nUR                  XgX-  X-  5      nUS
   (       a"  [        R                  R                  XS   SSS9nU$ )Nr   r  r  r  r  r   r   r   r   r  r   r   Fr   )
r   r   r   r   r   r   r	  r   r   r  )rC   r  r  r   r   r
  r   r  r  r  r  rF   s               r   foldingMobileViTLayer.folding  s   $($4$4d6G6G\34
|,
Z(.$%9:#$78 %%',,Z[RTU%%a+##!$44oU`
 %%a+##"2"A?C`
 ]#}}005JV[ 1 H r!   c                    U R                   (       a  U R                  U5      nUnU R                  U5      nU R                  U5      nU R                  U5      u  p4U R	                  U5      nU R                  U5      nU R                  X45      nU R                  U5      nU R                  [        R                  " X!4SS95      nU$ Nr   r   )r   r   r   r  r   r   r  r   r   rP   cat)rC   rF   rf   r  r  s        r   rI   MobileViTLayer.forward  s    ""..x8H ==*==* "^^H5 ""7+..) <<3''1;;uyy()=1EFr!   )	r   r   r   r   r   r   r   r   r   rh   )rK   rL   rM   rN   ri   r   r   r9   rP   rQ   tupledictr  r  rI   rR   rS   rT   s   @r   r   r   g  s     8
8
 8
 	8

 8
 8
 8
 8
 
8
 8
t1"%,, 1"5t9K3L 1"fu||   :   r!   r   c                   t   ^  \ rS rSrS\SS4U 4S jjr  SS\R                  S\S\S\	\
\4   4S	 jjrS
rU =r$ )MobileViTEncoderi  r%   r   Nc           
        > [         T
U ]  5         Xl        [        R                  " 5       U l        SU l        S=p#UR                  S:X  a  SnSnOUR                  S:X  a  SnSn[        UUR                  S   UR                  S   SSS9nU R
                  R                  U5        [        UUR                  S   UR                  S   SS	S9nU R
                  R                  U5        [        UUR                  S   UR                  S	   SUR                  S   SS
9nU R
                  R                  U5        U(       a  US-  n[        UUR                  S	   UR                  S   SUR                  S   SUS9nU R
                  R                  U5        U(       a  US-  n[        UUR                  S   UR                  S   SUR                  S   S	US9n	U R
                  R                  U	5        g )NFrY   T   r   r   )r&   r'   r)   rm   r   r   )r&   r'   r)   r}   rm      )r&   r'   r)   r}   rm   r,      )r8   r9   r%   r   ro   rp   gradient_checkpointingoutput_striderk   neck_hidden_sizesrr   r   hidden_sizes)rC   r%   dilate_layer_4dilate_layer_5r,   layer_1layer_2layer_3layer_4layer_5rD   s             r   r9   MobileViTEncoder.__init__  s   ]]_
&+# +0/1$!N!N!!R'!N)00311!4
 	

'")00311!4
 	

'" 00311!4++A.
 	

'"MH 00311!4++A.
 	

'"MH 00311!4++A.
 	

'"r!   r   output_hidden_statesreturn_dictc                     U(       a  SOS n[        U R                  5       H  u  pVU" U5      nU(       d  M  XA4-   nM     U(       d  [        S X4 5       5      $ [        XS9$ )N c              3   .   #    U  H  oc  M  Uv   M     g 7frH   r3  ).0vs     r   	<genexpr>+MobileViTEncoder.forward.<locals>.<genexpr>j  s     X$Fq$Fs   	)last_hidden_stater   )	enumeraterp   r  r
   )rC   r   r0  r1  all_hidden_statesrs   rw   s          r   rI   MobileViTEncoder.forward[  sc     #7BD(4OA(7M##$58H$H!	  5 X]$FXXX-oor!   )r%   r$  rp   )FT)rK   rL   rM   rN   r   r9   rP   rQ   rO   r   r  r
   rI   rR   rS   rT   s   @r   r  r    sg    H# H#4 H#Z &+ 	p||p #p 	p
 
u44	5p pr!   r  c                   X    \ rS rSr% \\S'   SrSrSrS/r	S\
R                  SS	4S
 jrSrg	)MobileViTPreTrainedModelio  r%   	mobilevitpixel_valuesTr   moduler   Nc                 (   [        U[        R                  [        R                  [        R                  45      (       ak  UR
                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR
                  R                  R                  S5        gg)zInitialize the weightsg        )meanstdNg      ?)r?   r   r   r;   r=   weightdatanormal_r%   initializer_ranger+   zero_r   fill_)rC   rA  s     r   _init_weights&MobileViTPreTrainedModel._init_weightsw  s    fryy"))R^^DEE MM&&CT[[5R5R&S{{&  &&( '--KK""$MM$$S) .r!   r3  )rK   rL   rM   rN   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr   ModulerK  rR   r3  r!   r   r>  r>  o  s9    #$O&*#)*
*BII 
*$ 
*r!   r>  c                      ^  \ rS rSrSS\S\4U 4S jjjrS r\   SS\	\
R                     S\	\   S\	\   S	\\\4   4S
 jj5       rSrU =r$ )MobileViTModeli  r%   expand_outputc                 F  > [         TU ]  U5        Xl        X l        [	        UUR
                  UR                  S   SSS9U l        [        U5      U l	        U R                  (       a+  [	        UUR                  S   UR                  S   SS9U l
        U R                  5         g	)
a%  
expand_output (`bool`, *optional*, defaults to `True`):
    Whether to expand the output of the model using a 1x1 convolution. If `True`, the model will apply an additional
    1x1 convolution to expand the output channels from `config.neck_hidden_sizes[5]` to `config.neck_hidden_sizes[6]`.
r   r   r   )r&   r'   r(   r)   r#     r   r[   N)r8   r9   r%   rU  r#   num_channelsr&  	conv_stemr  encoderconv_1x1_exp	post_init)rC   r%   rU  rD   s      r   r9   MobileViTModel.__init__  s     	 *+++11!4
 (/ 2"44Q7#55a8	!D 	r!   c                    UR                  5        Hm  u  p#U R                  R                  U   n[        U[        5      (       d  M5  UR
                  R                   H  nUR                  R                  U5        M      Mo     g)zPrunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
N)itemsrZ  rp   r?   r   r   r   r   )rC   heads_to_prunelayer_indexr   mobilevit_layerr   s         r   _prune_headsMobileViTModel._prune_heads  sg     #1"6"6"8K"ll00=O/>::)8)D)D)J)J%%//;;EB *K #9r!   r@  r0  r1  r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUc  [        S5      eU R	                  U5      nU R                  UUUS9nU R                  (       a-  U R                  US   5      n[        R                  " USS/SS9nOUS   nS nU(       d  Ub  Xg4OU4nXSS  -   $ [        UUUR                  S	9$ )
Nz You have to specify pixel_valuesr0  r1  r   r   r   F)r   keepdimr   )r9  pooler_outputr   )r%   r0  use_return_dictr:   rY  rZ  rU  r[  rP   rC  r   r   )	rC   r@  r0  r1  embedding_outputencoder_outputsr9  pooled_outputr   s	            r   rI   MobileViTModel.forward  s    %9$D $++JjJj 	 &1%<k$++B]B]?@@>>,7,,!5# ' 
  $ 1 1/!2D E "JJ'8r2hPUVM / 2 M;H;T'7[lZnFAB///7/')77
 	
r!   )r%   r[  rY  rZ  rU  )T)NNN)rK   rL   rM   rN   r   rO   r9   rc  r   r   rP   rQ   r   r  r   rI   rR   rS   rT   s   @r   rT  rT    s     t  >C  04/3&*	'
u||,'
 'tn'
 d^	'

 
u>>	?'
 '
r!   rT  z
    MobileViT model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )custom_introc                      ^  \ rS rSrS\SS4U 4S jjr\    SS\\R                     S\\
   S\\R                     S	\\
   S\\\4   4
S
 jj5       rSrU =r$ )MobileViTForImageClassificationi  r%   r   Nc                 ~  > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  SS9U l        UR                  S:  a.  [
        R                  " UR                  S   UR                  5      O[
        R                  " 5       U l        U R                  5         g )NT)inplacer   r   )r8   r9   
num_labelsrT  r?  r   r   classifier_dropout_probr   r   r&  Identity
classifierr\  rC   r%   rD   s     r   r9   (MobileViTForImageClassification.__init__  s      ++'/ zz&"@"@$OJPJ[J[^_J_BIIf..r2F4E4EFegepeper 	
 	r!   r@  r0  labelsr1  c                 j   Ub  UOU R                   R                  nU R                  XUS9nU(       a  UR                  OUS   nU R	                  U R                  U5      5      nSnUb  U R                  X7U R                   5      nU(       d  U4USS -   n	Ub  U4U	-   $ U	$ [        UUUR                  S9$ )ab  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nrf  r   r   )losslogitsr   )	r%   ri  r?  rh  rv  r   loss_functionr   r   )
rC   r@  r0  ry  r1  outputsrl  r|  r{  r   s
             r   rI   'MobileViTForImageClassification.forward  s     &1%<k$++B]B]..fq.r1<--'!*m!<=%%fdkkBDY,F)-)9TGf$EvE3!//
 	
r!   )rv  r   r?  rs  NNNN)rK   rL   rM   rN   r   r9   r   r   rP   rQ   rO   r   r  r   rI   rR   rS   rT   s   @r   rp  rp    s     4   04/3)-&*!
u||,!
 'tn!
 &	!

 d^!
 
u::	;!
 !
r!   rp  c                   v   ^  \ rS rSrS\S\S\SS4U 4S jjrS\R                  S\R                  4S	 jr	S
r
U =r$ )MobileViTASPPPoolingi  r%   r&   r'   r   Nc           
      |   > [         TU ]  5         [        R                  " SS9U l        [        UUUSSSSS9U l        g )Nr   )output_sizeTrelu)r&   r'   r(   r)   r-   r.   )r8   r9   r   AdaptiveAvgPool2dglobal_poolr#   r   )rC   r%   r&   r'   rD   s       r   r9   MobileViTASPPPooling.__init__  sB    //A>*#%"!
r!   rF   c                     UR                   SS  nU R                  U5      nU R                  U5      n[        R                  R                  XSSS9nU$ )Nr   r   Fr   )r   r  r   r   r   r  )rC   rF   spatial_sizes      r   rI   MobileViTASPPPooling.forward%  sQ    ~~bc*##H-==*==,,Xzin,or!   )r   r  ry   rT   s   @r   r  r    sF    
 
S 
PS 
X\ 
   r!   r  c                   r   ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
MobileViTASPPi-  z{
ASPP module defined in DeepLab papers: https://huggingface.co/papers/1606.00915, https://huggingface.co/papers/1706.05587
r%   r   Nc                 p  > [         TU ]  5         UR                  S   nUR                  n[	        UR
                  5      S:w  a  [        S5      e[        R                  " 5       U l	        [        UUUSSS9nU R                  R                  U5        U R                  R                  UR
                   Vs/ s H  n[        UUUSUSS9PM     sn5        [        XU5      nU R                  R                  U5        [        USU-  USSS9U l        [        R                  " UR                   S	9U l        g s  snf )
Nr   r   z"Expected 3 values for atrous_ratesr   r  r\   )r&   r'   r(   r,   r.   r#  )p)r8   r9   r&  aspp_out_channelsr   atrous_ratesr:   r   ro   convsr#   rr   extendr  projectr   aspp_dropout_probr   )rC   r%   r&   r'   in_projectionrate
pool_layerrD   s          r   r9   MobileViTASPP.__init__2  s-   ..r2//v""#q(ABB]]_
*#%!
 	

-(

 #//
 0D # +!- !!#) 0
	
 *&|L


*%)L 0|YZkq
 zzF$<$<=)
s   4D3rF   c                     / nU R                    H  nUR                  U" U5      5        M     [        R                  " USS9nU R	                  U5      nU R                  U5      nU$ r  )r  rr   rP   r  r  r   )rC   rF   pyramidconvpooled_featuress        r   rI   MobileViTASPP.forward]  sW    JJDNN4>* ))G+,,w/,,7r!   )r  r   r  rK   rL   rM   rN   ri   r   r9   rP   rQ   rI   rR   rS   rT   s   @r   r  r  -  s<    )> )>4 )>V   r!   r  c                   r   ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
MobileViTDeepLabV3ih  zB
DeepLabv3 architecture: https://huggingface.co/papers/1706.05587
r%   r   Nc           
         > [         TU ]  5         [        U5      U l        [        R
                  " UR                  5      U l        [        UUR                  UR                  SSSSS9U l        g )Nr   FT)r&   r'   r(   r-   r.   r+   )r8   r9   r  asppr   	Dropout2drt  r   r#   r  rs  rv  rw  s     r   r9   MobileViTDeepLabV3.__init__m  s]    !&)	||F$B$BC,00**# 
r!   r   c                 r    U R                  US   5      nU R                  U5      nU R                  U5      nU$ )Nr   )r  r   rv  )rC   r   rF   s      r   rI   MobileViTDeepLabV3.forward}  s6    99]2./<<)??8,r!   )r  rv  r   r  rT   s   @r   r  r  h  s;    
 
4 
 U\\ ell  r!   r  zX
    MobileViT model with a semantic segmentation head on top, e.g. for Pascal VOC.
    c                      ^  \ rS rSrS\SS4U 4S jjr\    SS\\R                     S\\R                     S\\
   S	\\
   S\\\4   4
S
 jj5       rSrU =r$ ) MobileViTForSemanticSegmentationi  r%   r   Nc                    > [         TU ]  U5        UR                  U l        [        USS9U l        [        U5      U l        U R                  5         g )NF)rU  )r8   r9   rs  rT  r?  r  segmentation_headr\  rw  s     r   r9   )MobileViTForSemanticSegmentation.__init__  sD      ++'eD!3F!; 	r!   r@  ry  r0  r1  c                 z   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb%  U R                   R                  S:X  a  [	        S5      eU R                  USUS9nU(       a  UR                  OUS   nU R                  U5      nSnUbQ  [        R                  R                  XrR                  SS SSS	9n	[        U R                   R                  S
9n
U
" X5      nU(       d%  U(       a
  U4USS -   nO	U4USS -   nUb  U4U-   $ U$ [        UUU(       a  UR                  SS9$ SSS9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
    Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

Examples:

```python
>>> import requests
>>> import torch
>>> from PIL import Image
>>> from transformers import AutoImageProcessor, MobileViTForSemanticSegmentation

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> image_processor = AutoImageProcessor.from_pretrained("apple/deeplabv3-mobilevit-small")
>>> model = MobileViTForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevit-small")

>>> inputs = image_processor(images=image, return_tensors="pt")

>>> with torch.no_grad():
...     outputs = model(**inputs)

>>> # logits are of shape (batch_size, num_labels, height, width)
>>> logits = outputs.logits
```Nr   z/The number of labels should be greater than oneTrf  r   r   Fr   )ignore_indexr   )r{  r|  r   
attentions)r%   r0  ri  rs  r:   r?  r   r  r   r   r  r   r   semantic_loss_ignore_indexr   )rC   r@  ry  r0  r1  r~  encoder_hidden_statesr|  r{  upsampled_logitsloss_fctr   s               r   rI   (MobileViTForSemanticSegmentation.forward  sm   H %9$D $++JjJj 	 &1%<k$++B]B]$++"8"8A"=NOO..!%# ! 
 :E 5 5'RS*''(=>!}}88\\"#.Zu  9   (T[[5[5[\H,5D# WQR[0 WQR[0)-)9TGf$EvE&3G'//	
 	
 NR	
 	
r!   )r?  rs  r  r  )rK   rL   rM   rN   r   r9   r   r   rP   rQ   rO   r   r  r   rI   rR   rS   rT   s   @r   r  r    s     4   04)-/3&*I
u||,I
 &I
 'tn	I

 d^I
 
u--	.I
 I
r!   r  )rp  r  rT  r>  )rY   N)7ri   r   typingr   r   rP   r   torch.nnr   activationsr   modeling_layersr	   modeling_outputsr
   r   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   r   configuration_mobilevitr   
get_loggerrK   loggerr   r    rR  r#   rV   rk   r{   r   r   r   r   r   r   r   r  r>  rT  rp  r  r  r  r  __all__r3  r!   r   <module>r     s   "   "   % ! 9  . Q 7 7 4 
		H	%#  HSM UX = =@-F		 -F`bii .6RYY 6r	")) 	   >BII 
bii 
		 &299 &f/ fR\pryy \p~ * * *( R
- R
 R
j 2
&> 2
2
j299 08BII 8v 8 
U
'? U

U
pr!   