
    cCi{N              	       8   S r SSKrSSKJr  SSKJrJr  SSKrSSKJr  SSK	J
r
  SSKJrJrJr  SS	KJr  SS
KJrJrJrJr  SSKJr  \R0                  " \5      rSrSr/ SQrSrSrS5S\R@                  S\!S\"S\R@                  4S jjr# " S S\RH                  5      r% " S S\RH                  5      r& " S S\RH                  5      r' " S S\RH                  5      r( " S S\RH                  5      r) " S  S!\RH                  5      r* " S" S#\RH                  5      r+ " S$ S%\RH                  5      r, " S& S'\RH                  5      r- " S( S)\RH                  5      r. " S* S+\5      r/S,r0S-r1\" S.\05       " S/ S0\/5      5       r2\" S1\05       " S2 S3\/5      5       r3/ S4Qr4g)6z-PyTorch Visual Attention Network (VAN) model.    N)OrderedDict)OptionalUnion)nn   )ACT2FN)BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttention)PreTrainedModel)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )	VanConfigr   z!Visual-Attention-Network/van-base)r   i      r   ztabby, tabby catinput	drop_probtrainingreturnc                    US:X  d  U(       d  U $ SU-
  nU R                   S   4SU R                  S-
  -  -   nU[        R                  " X@R                  U R
                  S9-   nUR                  5         U R                  U5      U-  nU$ )a*  
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
        r   r   )r   )dtypedevice)shapendimtorchrandr   r   floor_div)r   r   r   	keep_probr   random_tensoroutputs          i/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/deprecated/van/modeling_van.py	drop_pathr&   1   s     CxII[[^

Q 77E

5ELL YYMYYy!M1FM    c                      ^  \ rS rSrSrSS\\   SS4U 4S jjjrS\R                  S\R                  4S jr
S\4S	 jrS
rU =r$ )VanDropPathE   zXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r   c                 .   > [         TU ]  5         Xl        g N)super__init__r   )selfr   	__class__s     r%   r.   VanDropPath.__init__H   s    "r'   hidden_statesc                 B    [        XR                  U R                  5      $ r,   )r&   r   r   )r/   r2   s     r%   forwardVanDropPath.forwardL   s    FFr'   c                      SU R                    3$ )Nzp=r   )r/   s    r%   
extra_reprVanDropPath.extra_reprO   s    DNN#$$r'   r7   r,   )__name__
__module____qualname____firstlineno____doc__r   floatr.   r   Tensorr4   strr8   __static_attributes____classcell__r0   s   @r%   r)   r)   E   sQ    b#(5/ #T # #GU\\ Gell G%C % %r'   r)   c            	       ~   ^  \ rS rSrSrSS\S\S\S\4U 4S jjjrS\R                  S	\R                  4S
 jr	Sr
U =r$ )VanOverlappingPatchEmbedderS   z
Downsamples the input using a patchify operation with a `stride` of 4 by default making adjacent windows overlap by
half of the area. From [PVTv2: Improved Baselines with Pyramid Vision
Transformer](https://huggingface.co/papers/2106.13797).
in_channelshidden_size
patch_sizestridec                    > [         TU ]  5         [        R                  " XX4US-  S9U l        [        R
                  " U5      U l        g )N   )kernel_sizerK   padding)r-   r.   r   Conv2dconvolutionBatchNorm2dnormalization)r/   rH   rI   rJ   rK   r0   s        r%   r.   $VanOverlappingPatchEmbedder.__init__Z   s@    99*U_cdUd
  ^^K8r'   r   r   c                 J    U R                  U5      nU R                  U5      nU$ r,   rQ   rS   )r/   r   hidden_states      r%   r4   #VanOverlappingPatchEmbedder.forwarda   s(    ''.)),7r'   rV   )r   r   r:   r;   r<   r=   r>   intr.   r   r@   r4   rB   rC   rD   s   @r%   rF   rF   S   sO    9C 9c 9s 9X[ 9 9U\\ ell  r'   rF   c                      ^  \ rS rSrSr  SS\S\S\S\S\4
U 4S jjjrS	\	R                  S
\	R                  4S jrSrU =r$ )VanMlpLayerg   z
MLP with depth-wise convolution, from [PVTv2: Improved Baselines with Pyramid Vision
Transformer](https://huggingface.co/papers/2106.13797).
rH   rI   out_channels
hidden_actdropout_ratec                 J  > [         TU ]  5         [        R                  " XSS9U l        [        R                  " X"SSUS9U l        [        U   U l        [        R                  " U5      U l	        [        R                  " X#SS9U l
        [        R                  " U5      U l        g )Nr   rN      rN   rO   groups)r-   r.   r   rP   in_dense
depth_wiser   
activationDropoutdropout1	out_densedropout2)r/   rH   rI   r^   r_   r`   r0   s         r%   r.   VanMlpLayer.__init__m   sx     			+J))K!UV_jk ,

<0;!L

<0r'   rW   r   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU R                  U5      nU$ r,   )rf   rg   rh   rj   rk   rl   r/   rW   s     r%   r4   VanMlpLayer.forward}   s\    }}\2|4|4}}\2~~l3}}\2r'   )rh   rg   rj   rl   rf   rk   )gelu      ?)r:   r;   r<   r=   r>   rZ   rA   r?   r.   r   r@   r4   rB   rC   rD   s   @r%   r\   r\   g   sj     !!11 1 	1
 1 1 1 ELL U\\  r'   r\   c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	VanLargeKernelAttention   z%
Basic Large Kernel Attention (LKA).
rI   c           	         > [         TU ]  5         [        R                  " XSSUS9U l        [        R                  " XSSSUS9U l        [        R                  " XSS	9U l        g )
N   rM   rd   r   rc   	   )rN   dilationrO   re   r   rb   )r-   r.   r   rP   rg   depth_wise_dilated
point_wiser/   rI   r0   s     r%   r.    VanLargeKernelAttention.__init__   sW    ))K!UV_jk"$))!aS^#
 ))K!Lr'   rW   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r,   rg   rz   r{   ro   s     r%   r4   VanLargeKernelAttention.forward   s4    |4..|<|4r'   r   rY   rD   s   @r%   rt   rt      s6    MC MELL U\\  r'   rt   c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	VanLargeKernelAttentionLayer   zN
Computes attention using Large Kernel Attention (LKA) and attends the input.
rI   c                 B   > [         TU ]  5         [        U5      U l        g r,   )r-   r.   rt   	attentionr|   s     r%   r.   %VanLargeKernelAttentionLayer.__init__   s    0=r'   rW   r   c                 0    U R                  U5      nX-  nU$ r,   r   )r/   rW   r   attendeds       r%   r4   $VanLargeKernelAttentionLayer.forward   s    NN<0	+r'   r   rY   rD   s   @r%   r   r      s4    >C >ELL U\\  r'   r   c                   v   ^  \ rS rSrSrS
S\S\4U 4S jjjrS\R                  S\R                  4S jr
S	rU =r$ )VanSpatialAttentionLayer   z
Van spatial attention layer composed by projection (via conv) -> act -> Large Kernel Attention (LKA) attention ->
projection (via conv) + residual connection.
rI   r_   c                    > [         TU ]  5         [        R                  " [	        S[        R
                  " XSS94S[        U   4/5      5      U l        [        U5      U l	        [        R
                  " XSS9U l
        g )Nconvr   rb   act)r-   r.   r   
Sequentialr   rP   r   pre_projectionr   attention_layerpost_projection)r/   rI   r_   r0   s      r%   r.   !VanSpatialAttentionLayer.__init__   sn     mmRYY{QOPF:./
  <KH!yyqQr'   rW   r   c                 x    UnU R                  U5      nU R                  U5      nU R                  U5      nX-   nU$ r,   )r   r   r   r/   rW   residuals      r%   r4    VanSpatialAttentionLayer.forward   sE    **<8++L9++L9#.r'   )r   r   r   )rq   )r:   r;   r<   r=   r>   rZ   rA   r.   r   r@   r4   rB   rC   rD   s   @r%   r   r      sD    
RC RS R RELL U\\  r'   r   c                   v   ^  \ rS rSrSrS
S\S\4U 4S jjjrS\R                  S\R                  4S jr
S	rU =r$ )VanLayerScaling   zL
Scales the inputs by a learnable parameter initialized by `initial_value`.
rI   initial_valuec                    > [         TU ]  5         [        R                  " U[        R
                  " U5      -  SS9U l        g )NT)requires_grad)r-   r.   r   	Parameterr   onesweight)r/   rI   r   r0   s      r%   r.   VanLayerScaling.__init__   s/    ll=5::k3J#JZ^_r'   rW   r   c                 `    U R                   R                  S5      R                  S5      U-  nU$ )N)r   	unsqueezero   s     r%   r4   VanLayerScaling.forward   s,    {{,,R0::2>Mr'   )r   )g{Gz?)r:   r;   r<   r=   r>   rZ   r?   r.   r   r@   r4   rB   rC   rD   s   @r%   r   r      sD    `C ` ` `ELL U\\  r'   r   c            	          ^  \ rS rSrSr  SS\S\S\S\4U 4S jjjrS\	R                  S	\	R                  4S
 jrSrU =r$ )VanLayer   zn
Van layer composed by normalization layers, large kernel attention (LKA) and a multi layer perceptron (MLP).
configrI   	mlp_ratiodrop_path_ratec                   > [         TU ]  5         US:  a  [        U5      O[        R                  " 5       U l        [        R                  " U5      U l        [        X!R                  5      U l
        [        X!R                  5      U l        [        R                  " U5      U l        [        X"U-  X!R                  UR                   5      U l        [        X!R                  5      U l        g )Nr   )r-   r.   r)   r   Identityr&   rR   pre_normomalizationr   r_   r   r   layer_scale_init_valueattention_scalingpost_normalizationr\   r`   mlpmlp_scaling)r/   r   rI   r   r   r0   s        r%   r.   VanLayer.__init__   s     	8F8L^4RTR]R]R_#%>>+#> 1+?P?PQ!0>[>[!\"$.."=y0+?P?PRXReRe
 +;8U8UVr'   rW   r   c                 .   UnU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nX!-   nUnU R	                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nX!-   nU$ r,   )r   r   r   r&   r   r   r   r   s      r%   r4   VanLayer.forward   s    //=~~l3--l;~~l3...|<xx-''5~~l3.r'   )r   r   r&   r   r   r   r   )r   rr   r:   r;   r<   r=   r>   r   rZ   r?   r.   r   r@   r4   rB   rC   rD   s   @r%   r   r      sf      #WW W 	W
 W W$ELL U\\  r'   r   c                      ^  \ rS rSrSr  SS\S\S\S\S\S\S	\S
\4U 4S jjjrS\	R                  S\	R                  4S jrSrU =r$ )VanStage   z*
VanStage, consisting of multiple layers.
r   rH   rI   rJ   rK   depthr   r   c	                   > [         T
U ]  5         [        X#XE5      U l        [        R
                  " [        U5       V	s/ s H  n	[        UUUUS9PM     sn	6 U l        [        R                  " X1R                  S9U l        g s  sn	f )N)r   r   eps)r-   r.   rF   
embeddingsr   r   ranger   layers	LayerNormlayer_norm_epsrS   )r/   r   rH   rI   rJ   rK   r   r   r   _r0   s             r%   r.   VanStage.__init__  s     	5kPZcmm u &A '#1	 &

  \\+;P;PQs   BrW   r   c                    U R                  U5      nU R                  U5      nUR                  u  p#pEUR                  S5      R	                  SS5      nU R                  U5      nUR                  X$XS5      R                  SSSS5      nU$ )NrM   r   r   rc   )r   r   r   flatten	transposerS   viewpermute)r/   rW   
batch_sizerI   heightwidths         r%   r4   VanStage.forward  s    |4{{<01=1C1C.
#++A.88A>)),7#((UPXXYZ\]_`bcdr'   )r   r   rS   )r   r   r   rD   s   @r%   r   r      s      #RR R 	R
 R R R R R R4	ELL 	U\\ 	 	r'   r   c                      ^  \ rS rSrSrS\4U 4S jjr  SS\R                  S\	\
   S\	\
   S\\\4   4S	 jjrS
rU =r$ )
VanEncoderi+  z,
VanEncoder, consisting of multiple stages.
r   c                 T  > [         TU ]  5         [        R                  " / 5      U l        UR
                  nUR                  nUR                  nUR                  nUR                  n[        R                  " SUR                  [        UR                  5      SS9 Vs/ s H  owR                  5       PM     nn[        [!        X#XEXh5      5       HR  u  n	u  pppU	S:H  nXIS-
     nU(       a  UR"                  nU R                  R%                  ['        UUUU
UUUUS95        MT     g s  snf )Nr   cpu)r   r   )rJ   rK   r   r   r   )r-   r.   r   
ModuleListstagespatch_sizesstrideshidden_sizesdepths
mlp_ratiosr   linspacer   sumitem	enumeratezipnum_channelsappendr   )r/   r   r   r   r   r   r   xdrop_path_rates	num_stagerJ   rK   rI   r   mlp_expansionr   is_first_stagerH   r0   s                     r%   r.   VanEncoder.__init__0  s   mmB'((..**&&
#nnQ0E0Es6==GYbgh
hFFHh 	 
 cllJXc
^I^
K '!^N&1}5K$11KK)!+#1	c
	
s   D%rW   output_hidden_statesreturn_dictr   c                     U(       a  SOS n[        U R                  5       H  u  pVU" U5      nU(       d  M  XA4-   nM     U(       d  [        S X4 5       5      $ [        XS9$ )N c              3   .   #    U  H  oc  M  Uv   M     g 7fr,   r   ).0vs     r%   	<genexpr>%VanEncoder.forward.<locals>.<genexpr>_  s     W$Eq$Es   	)last_hidden_stater2   )r   r   tupler	   )r/   rW   r   r   all_hidden_statesr   stage_modules          r%   r4   VanEncoder.forwardP  sb     #7BD(5OA'5L##$5$G!	  6 W\$EWWW-nnr'   )r   )FT)r:   r;   r<   r=   r>   r   r.   r   r@   r   boolr   r   r	   r4   rB   rC   rD   s   @r%   r   r   +  sj    y F 05&*	ollo 'tno d^	o
 
u44	5o or'   r   c                   6    \ rS rSr% Sr\\S'   SrSrSr	S r
Srg	)
VanPreTrainedModelid  zz
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
r   vanpixel_valuesTc                    [        U[        R                  5      (       a  [        R                  R	                  UR
                  U R                  R                  S9  [        U[        R                  5      (       a9  UR                  b+  [        R                  R                  UR                  S5        ggg[        U[        R                  5      (       aU  [        R                  R                  UR                  S5        [        R                  R                  UR
                  S5        g[        U[        R                  5      (       a  UR                  S   UR                  S   -  UR                  -  nX!R                  -  nUR
                  R                  R!                  S["        R$                  " SU-  5      5        UR                  b%  UR                  R                  R'                  5         ggg)zInitialize the weights)stdNr   g      ?r   g       @)
isinstancer   Linearinittrunc_normal_r   r   initializer_rangebias	constant_r   rP   rN   r^   re   datanormal_mathsqrtzero_)r/   modulefan_outs      r%   _init_weights VanPreTrainedModel._init_weightso  sM   fbii((GG!!&--T[[5R5R!S&")),,1H!!&++q1 2I,--GGfkk1-GGfmmS1		**((+f.@.@.CCfFYFYYG%GMM&&q$))C'M*BC{{&  &&( '	 +r'   r   N)r:   r;   r<   r=   r>   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointingr  rB   r   r'   r%   r   r   d  s%    
 $O&*#)r'   r   aE  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`VanConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
aF  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`ConvNextImageProcessor.__call__`] for details.

        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all stages. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zxThe bare VAN model outputting raw features without any specific head on top. Note, VAN does not have an embedding layer.c                      ^  \ rS rSrU 4S jr\" \5      \" \\	\
S\S9  SS\\R                     S\\   S\\   S\\\	4   4S	 jj5       5       rS
rU =r$ )VanModeli  c                    > [         TU ]  U5        Xl        [        U5      U l        [
        R                  " UR                  S   UR                  S9U l	        U R                  5         g )Nr   r   )r-   r.   r   r   encoderr   r   r   r   	layernorm	post_initr/   r   r0   s     r%   r.   VanModel.__init__  sN     !&)f&9&9"&=6CXCXYr'   vision)
checkpointoutput_typeconfig_classmodalityexpected_outputr   r   r   r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nU R                  UUUS9nUS   nUR	                  SS/S9nU(       d	  XV4USS  -   $ [        UUUR                  S9$ )Nr   r   r   r   )dimr   )r   pooler_outputr2   )r   r   use_return_dictr  meanr
   r2   )r/   r   r   r   encoder_outputsr   pooled_outputs          r%   r4   VanModel.forward  s     %9$D $++JjJj 	 &1%<k$++B]B],,!5# ' 

 ,A.)..B8.<%58KKK7/')77
 	
r'   )r   r  r  )NN)r:   r;   r<   r=   r.   r   VAN_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr
   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr   r   FloatTensorr   r   r   r4   rB   rC   rD   s   @r%   r  r    s     ++?@&<$. 04&*	
u001
 'tn
 d^	

 
u>>	?
 A
r'   r  z
    VAN Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    c                      ^  \ rS rSrU 4S jr\" \5      \" \\	\
\S9    SS\\R                     S\\R                     S\\   S\\   S\\\	4   4
S	 jj5       5       rS
rU =r$ )VanForImageClassificationi  c                   > [         TU ]  U5        [        U5      U l        UR                  S:  a.  [
        R                  " UR                  S   UR                  5      O[
        R                  " 5       U l	        U R                  5         g )Nr   r   )r-   r.   r  r   
num_labelsr   r   r   r   
classifierr  r  s     r%   r.   "VanForImageClassification.__init__  sl     F# FLEVEVYZEZBIIf))"-v/@/@A`b`k`k`m 	
 	r'   )r  r  r  r  r   labelsr   r   r   c                 J   Ub  UOU R                   R                  nU R                  XUS9nU(       a  UR                  OUS   nU R	                  U5      nSnUb  U R                  X'U R                   5      nU(       d  U4USS -   n	Ub  U4U	-   $ U	$ [        XUR                  S9$ )ab  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr!  r   rM   )losslogitsr2   )r   r%  r   r$  r3  loss_functionr   r2   )
r/   r   r5  r   r   outputsr(  r8  r7  r$   s
             r%   r4   !VanForImageClassification.forward  s    ( &1%<k$++B]B]((<`k(l1<--'!*/%%fdkkBDY,F)-)9TGf$EvE3\c\q\qrrr'   )r3  r   )NNNN)r:   r;   r<   r=   r.   r   r*  r   _IMAGE_CLASS_CHECKPOINTr   r,  _IMAGE_CLASS_EXPECTED_OUTPUTr   r   r.  
LongTensorr   r   r   r4   rB   rC   rD   s   @r%   r0  r0    s    	 ++?@*8$4	 59-1/3&*su001s ))*s 'tn	s
 d^s 
u::	;s Asr'   r0  )r0  r  r   )r   F)5r>   r  collectionsr   typingr   r   r   r   activationsr   modeling_outputsr	   r
   r   modeling_utilsr   utilsr   r   r   r   configuration_vanr   
get_loggerr:   loggerr,  r+  r-  r<  r=  r@   r?   r   r&   Moduler)   rF   r\   rt   r   r   r   r   r   r   r   VAN_START_DOCSTRINGr*  r  r0  __all__r   r'   r%   <module>rK     s   4  # "   " 
 / v v ( 
		H	%  : '  > 1 U\\ e T V[VbVb (%")) %")) (")) @bii (299 ryy 8bii (ryy (V(ryy (V6o 6or) )8	   
-
! -

-
`  0s 2 0s0sf Jr'   