
    cCiF              	       z   S r SSKrSSKJr  SSKJrJr  SSKrSSKJr  SSK	J
r
  SSKJr  SS	KJr  SS
KJrJr  SSKJrJrJrJrJrJrJrJrJr  SSKJr  SSKJr  \" 5       (       a	  SSK J!r!J"r"  OS r"S r!\RF                  " \$5      r%Sr&Sr'/ SQr(Sr)Sr*\ " S S\5      5       r+\ " S S\5      5       r,\ " S S\5      5       r- " S S\R\                  5      r/ " S S\R\                  5      r0 " S  S!\R\                  5      r1SGS"\Rd                  S#\3S$\4S%\Rd                  4S& jjr5 " S' S(\R\                  5      r6 " S) S*\R\                  5      r7 " S+ S,\R\                  5      r8 " S- S.\R\                  5      r9 " S/ S0\R\                  5      r: " S1 S2\R\                  5      r; " S3 S4\R\                  5      r< " S5 S6\R\                  5      r= " S7 S8\R\                  5      r> " S9 S:\5      r?S;r@S<rA\" S=\@5       " S> S?\?5      5       rB\" S@\@5       " SA SB\?5      5       rC\" SC\@5       " SD SE\?\5      5       rD/ SFQrEg)Hz1PyTorch Neighborhood Attention Transformer model.    N)	dataclass)OptionalUnion)nn   )ACT2FN)BackboneOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)	ModelOutputOptionalDependencyNotAvailableadd_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardis_natten_availableloggingreplace_return_docstringsrequires_backends)BackboneMixin   )	NatConfig)
natten2davnatten2dqkrpbc                      [        5       eNr   argskwargss     i/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/deprecated/nat/modeling_nat.pyr   r   /       ,..    c                      [        5       er   r   r   s     r!   r   r   2   r"   r#   r   zshi-labs/nat-mini-in1k-224)r      r%   i   z	tiger catc                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   S	rg)
NatEncoderOutputG   a3  
Nat encoder's outputs, with potential hidden states and attentions.

Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
Nlast_hidden_state.hidden_states
attentionsreshaped_hidden_states )__name__
__module____qualname____firstlineno____doc__r)   r   torchFloatTensor__annotations__r*   tupler+   r,   __static_attributes__r-   r#   r!   r'   r'   G   s}    2 6:x 1 129=AM8E%"3"3S"89:A:>Ju00#567>FJHU5+<+<c+A%BCJr#   r'   c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S	'   S
rg)NatModelOutputh   a  
Nat model's outputs that also contains a pooling of the last hidden states.

Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
        Average pooling of the last layer hidden-state.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
Nr)   pooler_output.r*   r+   r,   r-   )r.   r/   r0   r1   r2   r)   r   r3   r4   r5   r;   r*   r6   r+   r,   r7   r-   r#   r!   r9   r9   h   s    6 6:x 1 12915M8E--.5=AM8E%"3"3S"89:A:>Ju00#567>FJHU5+<+<c+A%BCJr#   r9   c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S	'   S
rg)NatImageClassifierOutput   a  
Nat outputs for image classification.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification (or regression if config.num_labels==1) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
Nlosslogits.r*   r+   r,   r-   )r.   r/   r0   r1   r2   r?   r   r3   r4   r5   r@   r*   r6   r+   r,   r7   r-   r#   r!   r=   r=      s    6 )-D(5$$
%,*.FHU&&'.=AM8E%"3"3S"89:A:>Ju00#567>FJHU5+<+<c+A%BCJr#   r=   c                   r   ^  \ rS rSrSrU 4S jrS\\R                     S\	\R                     4S jrSrU =r$ )NatEmbeddings   z.
Construct the patch and position embeddings.
c                    > [         TU ]  5         [        U5      U l        [        R
                  " UR                  5      U l        [        R                  " UR                  5      U l
        g r   )super__init__NatPatchEmbeddingspatch_embeddingsr   	LayerNorm	embed_dimnormDropouthidden_dropout_probdropoutselfconfig	__class__s     r!   rF   NatEmbeddings.__init__   sG     26 :LL!1!12	zz&"<"<=r#   pixel_valuesreturnc                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )rH   rK   rN   )rP   rT   
embeddingss      r!   forwardNatEmbeddings.forward   s4    **<8
YYz*
\\*-
r#   )rN   rK   rH   )r.   r/   r0   r1   r2   rF   r   r3   r4   r6   TensorrX   r7   __classcell__rR   s   @r!   rB   rB      s9    >HU->->$? E%,,DW  r#   rB   c                   l   ^  \ rS rSrSrU 4S jrS\\R                     S\R                  4S jr
SrU =r$ )rG      z
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, height, width, hidden_size)` to be consumed by a
Transformer.
c                 H  > [         TU ]  5         UR                  nUR                  UR                  pCX0l        US:X  a  O[        S5      e[        R                  " [        R                  " U R                  US-  SSSS9[        R                  " US-  USSSS95      U l	        g )Nr   z2Dinat only supports patch size of 4 at the moment.      rb   r`   r`   r   r   )kernel_sizestridepadding)
rE   rF   
patch_sizenum_channelsrJ   
ValueErrorr   
SequentialConv2d
projection)rP   rQ   rh   ri   hidden_sizerR   s        r!   rF   NatPatchEmbeddings.__init__   s    &&
$*$7$79I9Ik(? QRR--IId'')9vV\flmIIkQ&PV`fg
r#   rT   rU   c                     UR                   u  p#pEX0R                  :w  a  [        S5      eU R                  U5      nUR	                  SSSS5      nU$ )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r`   rb   r   )shaperi   rj   rm   permute)rP   rT   _ri   heightwidthrW   s          r!   rX   NatPatchEmbeddings.forward   sZ    )5););&,,,w  __\2
''1a3
r#   )ri   rm   )r.   r/   r0   r1   r2   rF   r   r3   r4   rZ   rX   r7   r[   r\   s   @r!   rG   rG      s4    
"	HU->->$? 	ELL 	 	r#   rG   c                      ^  \ rS rSrSr\R                  4S\S\R                  SS4U 4S jjjr	S\
R                  S\
R                  4S	 jrS
rU =r$ )NatDownsampler   z
Convolutional Downsampling Layer.

Args:
    dim (`int`):
        Number of input channels.
    norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
        Normalization layer class.
dim
norm_layerrU   Nc           	         > [         TU ]  5         Xl        [        R                  " USU-  SSSSS9U l        U" SU-  5      U l        g )Nr`   ra   rc   rd   F)re   rf   rg   bias)rE   rF   rz   r   rl   	reductionrK   )rP   rz   r{   rR   s      r!   rF   NatDownsampler.__init__   sC    3CVF\binoq3w'	r#   input_featurec                     U R                  UR                  SSSS5      5      R                  SSSS5      nU R                  U5      nU$ )Nr   rb   r   r`   )r~   rr   rK   )rP   r   s     r!   rX   NatDownsampler.forward   sJ    }'<'<Q1a'HIQQRSUVXY[\]		-0r#   )rz   rK   r~   )r.   r/   r0   r1   r2   r   rI   intModulerF   r3   rZ   rX   r7   r[   r\   s   @r!   rx   rx      sT     :< (C (RYY ($ ( (U\\ ell  r#   rx   input	drop_probtrainingrU   c                    US:X  d  U(       d  U $ SU-
  nU R                   S   4SU R                  S-
  -  -   nU[        R                  " X@R                  U R
                  S9-   nUR                  5         U R                  U5      U-  nU$ )a*  
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
        r   r   )r   )dtypedevice)rq   ndimr3   randr   r   floor_div)r   r   r   	keep_probrq   random_tensoroutputs          r!   	drop_pathr     s     CxII[[^

Q 77E

5ELL YYMYYy!M1FMr#   c                      ^  \ rS rSrSrSS\\   SS4U 4S jjjrS\R                  S\R                  4S jr
S\4S	 jrS
rU =r$ )NatDropPathi  zXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   rU   c                 .   > [         TU ]  5         Xl        g r   )rE   rF   r   )rP   r   rR   s     r!   rF   NatDropPath.__init__  s    "r#   r*   c                 B    [        XR                  U R                  5      $ r   )r   r   r   rP   r*   s     r!   rX   NatDropPath.forward  s    FFr#   c                      SU R                    3$ )Nzp=r   rP   s    r!   
extra_reprNatDropPath.extra_repr  s    DNN#$$r#   r   r   )r.   r/   r0   r1   r2   r   floatrF   r3   rZ   rX   strr   r7   r[   r\   s   @r!   r   r     sQ    b#(5/ #T # #GU\\ Gell G%C % %r#   r   c                   ~   ^  \ rS rSrU 4S jrS r S	S\R                  S\\	   S\
\R                     4S jjrSrU =r$ )
NeighborhoodAttentioni#  c                   > [         TU ]  5         X#-  S:w  a  [        SU SU S35      eX0l        [	        X#-  5      U l        U R                  U R
                  -  U l        X@l        [        R                  " [        R                  " USU R                  -  S-
  SU R                  -  S-
  5      5      U l        [        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        [        R$                  " UR&                  5      U l        g )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r`   r   )r}   )rE   rF   rj   num_attention_headsr   attention_head_sizeall_head_sizere   r   	Parameterr3   zerosrpbLinearqkv_biasquerykeyvaluerL   attention_probs_dropout_probrN   rP   rQ   rz   	num_headsre   rR   s        r!   rF   NeighborhoodAttention.__init__$  s5   ?a#C5(^_h^iijk  $- #&s#7 !558P8PP& <<ID<L<L8Lq8PTUX\XhXhThklTl noYYt1143E3EFOO\
99T//1C1C&//ZYYt1143E3EFOO\
zz&"E"EFr#   c                     UR                  5       S S U R                  U R                  4-   nUR                  U5      nUR	                  SSSSS5      $ )Nr   rb   r   r`   r   )sizer   r   viewrr   )rP   xnew_x_shapes      r!   transpose_for_scores*NeighborhoodAttention.transpose_for_scores9  sN    ffhsmt'?'?AYAY&ZZFF;yyAq!Q''r#   r*   output_attentionsrU   c                    U R                  U R                  U5      5      nU R                  U R                  U5      5      nU R                  U R                  U5      5      nU[        R
                  " U R                  5      -  n[        X4U R                  U R                  S5      n[        R                  R                  USS9nU R                  U5      n[        XuU R                  S5      nUR                  SSSSS5      R!                  5       nUR#                  5       S S U R$                  4-   n	UR'                  U	5      nU(       a  X4n
U
$ U4n
U
$ )	Nr   r   rz   r   r`   rb   r   )r   r   r   r   mathsqrtr   r   r   re   r   
functionalsoftmaxrN   r   rr   
contiguousr   r   r   )rP   r*   r   query_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputss              r!   rX   NeighborhoodAttention.forward>  s?   
 //

=0IJ--dhh}.EF	//

=0IJ
 "DIId.F.F$GG )4K[K[]^_ --//0@b/I ,,7"?AQAQSTU%--aAq!<GGI"/"4"4"6s";t?Q?Q>S"S%**+BC6G=2 O\M]r#   )	r   r   rN   re   r   r   r   r   r   F)r.   r/   r0   r1   rF   r   r3   rZ   r   boolr6   rX   r7   r[   r\   s   @r!   r   r   #  sJ    G*( -2|| $D> 
u||		 r#   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )NeighborhoodAttentionOutputi`  c                    > [         TU ]  5         [        R                  " X"5      U l        [        R
                  " UR                  5      U l        g r   )rE   rF   r   r   denserL   r   rN   rP   rQ   rz   rR   s      r!   rF   $NeighborhoodAttentionOutput.__init__a  s4    YYs(
zz&"E"EFr#   r*   input_tensorrU   c                 J    U R                  U5      nU R                  U5      nU$ r   r   rN   )rP   r*   r   s      r!   rX   #NeighborhoodAttentionOutput.forwardf  s$    

=1]3r#   r   
r.   r/   r0   r1   rF   r3   rZ   rX   r7   r[   r\   s   @r!   r   r   `  s7    G
U\\  RWR^R^  r#   r   c                   ~   ^  \ rS rSrU 4S jrS r S	S\R                  S\\	   S\
\R                     4S jjrSrU =r$ )
NeighborhoodAttentionModuleim  c                    > [         TU ]  5         [        XX45      U l        [	        X5      U l        [        5       U l        g r   )rE   rF   r   rP   r   r   setpruned_headsr   s        r!   rF   $NeighborhoodAttentionModule.__init__n  s2    )&yN	1&>Er#   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   r   )lenr   rP   r   r   r   r   r   r   r   r   r   r   union)rP   headsindexs      r!   prune_heads'NeighborhoodAttentionModule.prune_headst  s   u:?79900$))2O2OQUQbQb

 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r#   r*   r   rU   c                 d    U R                  X5      nU R                  US   U5      nU4USS  -   nU$ Nr   r   )rP   r   )rP   r*   r   self_outputsattention_outputr   s         r!   rX   #NeighborhoodAttentionModule.forward  s@    
 yyB;;|AF#%QR(88r#   )r   r   rP   r   )r.   r/   r0   r1   rF   r   r3   rZ   r   r   r6   rX   r7   r[   r\   s   @r!   r   r   m  sI    ";* -2|| $D> 
u||		 r#   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )NatIntermediatei  c                   > [         TU ]  5         [        R                  " U[	        UR
                  U-  5      5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )rE   rF   r   r   r   	mlp_ratior   
isinstance
hidden_actr   r   intermediate_act_fnr   s      r!   rF   NatIntermediate.__init__  sd    YYsC(8(83(>$?@
f''--'-f.?.?'@D$'-'8'8D$r#   r*   rU   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r   r   s     r!   rX   NatIntermediate.forward  s&    

=100?r#   r   r   r\   s   @r!   r   r     s(    9U\\ ell  r#   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )	NatOutputi  c                    > [         TU ]  5         [        R                  " [	        UR
                  U-  5      U5      U l        [        R                  " UR                  5      U l	        g r   )
rE   rF   r   r   r   r   r   rL   rM   rN   r   s      r!   rF   NatOutput.__init__  sF    YYs6#3#3c#9:C@
zz&"<"<=r#   r*   rU   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r   s     r!   rX   NatOutput.forward  s$    

=1]3r#   r   r   r\   s   @r!   r   r     s(    >
U\\ ell  r#   r   c            	          ^  \ rS rSrS	U 4S jjrS r S
S\R                  S\\	   S\
\R                  \R                  4   4S jjrSrU =r$ )NatLayeri  c                 d  > [         TU ]  5         UR                  U l        UR                  U l        [        R
                  " X!R                  S9U l        [        XX0R                  S9U l	        US:  a  [        U5      O[        R                  " 5       U l        [        R
                  " X!R                  S9U l        [        X5      U l        [!        X5      U l        UR$                  S:  a>  [        R&                  " UR$                  [(        R*                  " SU45      -  SS9U l        g S U l        g )Neps)re   r   r   r`   T)requires_grad)rE   rF   chunk_size_feed_forwardre   r   rI   layer_norm_epslayernorm_beforer   	attentionr   Identityr   layernorm_afterr   intermediater   r   layer_scale_init_valuer   r3   oneslayer_scale_parameters)rP   rQ   rz   r   drop_path_raterR   s        r!   rF   NatLayer.__init__  s    '-'E'E$!-- "S6K6K L4V)YiYij8F8L^4RTR]R]R_!||C5J5JK+F8, ,,q0 LL66QH9MM]ab 	#  	#r#   c                     U R                   nSnX$:  d  X4:  aD  S=pg[        SXC-
  5      n[        SXB-
  5      n	SSXhXy4n[        R                  R	                  X5      nX4$ )N)r   r   r   r   r   r   r   )re   maxr   r   pad)
rP   r*   rt   ru   window_size
pad_valuespad_lpad_tpad_rpad_bs
             r!   	maybe_padNatLayer.maybe_pad  sn    &&'
5#6E;./E;/0EQe;JMM--mHM((r#   r*   r   rU   c                    UR                  5       u  p4pVUnU R                  U5      nU R                  XU5      u  pUR                  u  ppU R	                  XS9nUS   nUS   S:  =(       d    US   S:  nU(       a  US S 2S U2S U2S S 24   R                  5       nU R                  b  U R                  S   U-  nXpR                  U5      -   nU R                  U5      nU R                  U R                  U5      5      nU R                  b  U R                  S   U-  nXR                  U5      -   nU(       a  XS   4nU$ U4nU$ )N)r   r   rb      r   )r   r  r  rq   r  r   r  r   r
  r   r  )rP   r*   r   
batch_sizert   ru   channelsshortcutr  rs   
height_pad	width_padattention_outputsr   
was_paddedlayer_outputlayer_outputss                    r!   rX   NatLayer.forward  sf   
 /<.@.@.B+
E --m<$(NN=%$P!&3&9&9#y NN=N^,Q/]Q&;*Q-!*;
/7F7FUFA0EFQQS&&2#::1=@PP >>2B#CC++M:{{4#4#4\#BC&&266q9LHL$~~l'CC@Q';< YeWfr#   )	r  r  r   r  re   r  r
  r  r   )r   r   )r.   r/   r0   r1   rF   r  r3   rZ   r   r   r6   rX   r7   r[   r\   s   @r!   r   r     sR    
 	) -2$||$ $D>$ 
u||U\\)	*	$ $r#   r   c                   x   ^  \ rS rSrU 4S jr SS\R                  S\\   S\	\R                     4S jjr
SrU =r$ )	NatStagei  c                   > [         TU ]  5         Xl        X l        [        R
                  " [        U5       Vs/ s H  n[        UUUXW   S9PM     sn5      U l        Ub  U" U[        R                  S9U l
        OS U l
        SU l        g s  snf )N)rQ   rz   r   r  )rz   r{   F)rE   rF   rQ   rz   r   
ModuleListranger   layersrI   
downsamplepointing)	rP   rQ   rz   depthr   r  r.  irR   s	           r!   rF   NatStage.__init__  s    mm u &A !'#1#4	 &

 !(SR\\JDO"DO#s   B
r*   r   rU   c                     UR                  5       u  p4pS[        U R                  5       H  u  pgU" X5      nUS   nM     Un	U R                  b  U R                  U	5      nX4n
U(       a  U
WSS  -  n
U
$ r   )r   	enumerater-  r.  )rP   r*   r   rs   rt   ru   r1  layer_moduler&  !hidden_states_before_downsamplingstage_outputss              r!   rX   NatStage.forward  s    
 ,0025(5OA(JM)!,M  6 -:)??& OO,MNM&J]12..Mr#   )rQ   rz   r.  r-  r/  r   )r.   r/   r0   r1   rF   r3   rZ   r   r   r6   rX   r7   r[   r\   s   @r!   r)  r)    sD    6 -2|| $D> 
u||		 r#   r)  c                      ^  \ rS rSrU 4S jr    SS\R                  S\\   S\\   S\\   S\\   S\	\
\4   4S	 jjrS
rU =r$ )
NatEncoderi  c                   > [         TU ]  5         [        UR                  5      U l        Xl        [        R                  " SUR                  [        UR                  5      SS9 Vs/ s H  o"R                  5       PM     nn[        R                  " [        U R                  5       Vs/ s H  n[        U[        UR                   SU-  -  5      UR                  U   UR"                  U   U[        UR                  S U 5      [        UR                  S US-    5       X@R                  S-
  :  a  [$        OS S9PM     sn5      U l        g s  snf s  snf )Nr   cpu)r   r`   r   )rQ   rz   r0  r   r  r.  )rE   rF   r   depths
num_levelsrQ   r3   linspacer  sumitemr   r+  r,  r)  r   rJ   r   rx   levels)rP   rQ   r   dpri_layerrR   s        r!   rF   NatEncoder.__init__  s    fmm,!&63H3H#fmmJ\ej!kl!kAvvx!klmm  %T__5
  6G !F,,q'z9: --0$..w7#&s6=='+B'Cc&--XeZadeZeJfFg#h29OOa<O2O~VZ  6

 m
s   &E
(BEr*   r   output_hidden_states(output_hidden_states_before_downsamplingreturn_dictrU   c                    U(       a  SOS nU(       a  SOS nU(       a  SOS nU(       a  UR                  SSSS5      n	Xa4-  nXy4-  n[        U R                  5       H  u  pU" X5      nUS   nUS   nU(       a&  U(       a  UR                  SSSS5      n	Xm4-  nXy4-  nO,U(       a%  U(       d  UR                  SSSS5      n	Xa4-  nXy4-  nU(       d  My  XSS  -  nM     U(       d  [        S XU4 5       5      $ [	        UUUUS9$ )Nr-   r   rb   r   r`   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r-   ).0vs     r!   	<genexpr>%NatEncoder.forward.<locals>.<genexpr>X  s     m$[q$[s   	)r)   r*   r+   r,   )rr   r4  rB  r6   r'   )rP   r*   r   rF  rG  rH  all_hidden_statesall_reshaped_hidden_statesall_self_attentionsreshaped_hidden_stater1  r5  r&  r6  s                 r!   rX   NatEncoder.forward1  sA    #7BD+?RT"$5b4$1$9$9!Q1$E!!11&*BB&(5OA(JM)!,M0=a0@-#(P(I(Q(QRSUVXY[\(]%!%II!*.FF*%.V(5(=(=aAq(I%!%55!*.FF*  #QR'88#%  6( m]GZ$[mmm++*#=	
 	
r#   )rQ   rB  r>  )FFFT)r.   r/   r0   r1   rF   r3   rZ   r   r   r   r6   r'   rX   r7   r[   r\   s   @r!   r:  r:    sy    
, -2/4CH&*.
||.
 $D>.
 'tn	.

 3;4..
 d^.
 
u&&	'.
 .
r#   r:  c                   2    \ rS rSr% Sr\\S'   SrSrS r	Sr
g)	NatPreTrainedModelib  zz
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
rQ   natrT   c                 
   [        U[        R                  [        R                  45      (       ak  UR                  R
                  R                  SU R                  R                  S9  UR                  b%  UR                  R
                  R                  5         gg[        U[        R                  5      (       aJ  UR                  R
                  R                  5         UR                  R
                  R                  S5        gg)zInitialize the weightsr   )meanstdNg      ?)r   r   r   rl   weightdatanormal_rQ   initializer_ranger}   zero_rI   fill_)rP   modules     r!   _init_weights NatPreTrainedModel._init_weightsl  s    fryy"))455 MM&&CT[[5R5R&S{{&  &&( '--KK""$MM$$S) .r#   r-   N)r.   r/   r0   r1   r2   r   r5   base_model_prefixmain_input_namera  r7   r-   r#   r!   rU  rU  b  s    
 $O
*r#   rU  aF  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`NatConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
            for details.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
z]The bare Nat Model transformer outputting raw hidden-states without any specific head on top.c                      ^  \ rS rSrSU 4S jjrS rS r\" \5      \	" \
\\S\S9    SS\\R                      S\\   S	\\   S
\\   S\\\4   4
S jj5       5       rSrU =r$ )NatModeli  c                   > [         TU ]  U5        [        U S/5        Xl        [	        UR
                  5      U l        [        UR                  SU R                  S-
  -  -  5      U l	        [        U5      U l        [        U5      U l        [        R                  " U R                  UR                   S9U l        U(       a  [        R$                  " S5      OS U l        U R)                  5         g )Nnattenr`   r   r  )rE   rF   r   rQ   r   r=  r>  r   rJ   num_featuresrB   rW   r:  encoderr   rI   r  	layernormAdaptiveAvgPool1dpooler	post_init)rP   rQ   add_pooling_layerrR   s      r!   rF   NatModel.__init__  s     $
+fmm, 0 0119L3M MN'/!&)d&7&7V=R=RS1Bb**1- 	r#   c                 .    U R                   R                  $ r   rW   rH   r   s    r!   get_input_embeddingsNatModel.get_input_embeddings      ///r#   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsrj  layerr  r   )rP   heads_to_prunerx  r   s       r!   _prune_headsNatModel._prune_heads  s<    
 +002LELLu%//;;EB 3r#   vision)
checkpointoutput_typeconfig_classmodalityexpected_outputrT   r   rF  rH  rU   c                 Z   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [	        S5      eU R                  U5      nU R                  UUUUS9nUS   nU R                  U5      nS nU R                  bH  U R                  UR                  SS5      R                  SS5      5      n[        R                  " US5      nU(       d  Xx4USS  -   n	U	$ [        UUUR                  UR                  UR                  S9$ )Nz You have to specify pixel_valuesr   rF  rH  r   r   r`   )r)   r;   r*   r+   r,   )rQ   r   rF  use_return_dictrj   rW   rj  rk  rm  flatten	transposer3   r9   r*   r+   r,   )
rP   rT   r   rF  rH  embedding_outputencoder_outputssequence_outputpooled_outputr   s
             r!   rX   NatModel.forward  s?    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@??<8,,/!5#	 ' 
 *!,..9;;" KK(?(?1(E(O(OPQST(UVM!MM-;M%58KKFM-')77&11#2#I#I
 	
r#   )rQ   rW   rj  rk  ri  r>  rm  )T)NNNN)r.   r/   r0   r1   rF   rs  rz  r   NAT_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr9   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr   r3   r4   r   r   r6   rX   r7   r[   r\   s   @r!   rf  rf    s    
$0C ++?@&"$. 59,0/3&*,
u001,
 $D>,
 'tn	,

 d^,
 
un$	%,
 A,
r#   rf  z
    Nat Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    c                      ^  \ rS rSrU 4S jr\" \5      \" \\	\
\S9     SS\\R                     S\\R                     S\\   S\\   S\\   S	\\\	4   4S
 jj5       5       rSrU =r$ )NatForImageClassificationi  c                 ^  > [         TU ]  U5        [        U S/5        UR                  U l        [	        U5      U l        UR                  S:  a5  [        R                  " U R
                  R                  UR                  5      O[        R                  " 5       U l
        U R                  5         g )Nrh  r   )rE   rF   r   
num_labelsrf  rV  r   r   ri  r	  
classifierrn  rO   s     r!   rF   "NatForImageClassification.__init__  s     $
+ ++F# DJCTCTWXCXBIIdhh++V->->?^`^i^i^k 	
 	r#   )r}  r~  r  r  rT   labelsr   rF  rH  rU   c                 V   Ub  UOU R                   R                  nU R                  UUUUS9nUS   nU R                  U5      nSn	Ub  U R	                  X(U R                   5      n	U(       d  U4USS -   n
U	b  U	4U
-   $ U
$ [        U	UUR                  UR                  UR                  S9$ )ab  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr  r   r`   )r?   r@   r*   r+   r,   )	rQ   r  rV  r  loss_functionr=   r*   r+   r,   )rP   rT   r  r   rF  rH  r   r  r@   r?   r   s              r!   rX   !NatForImageClassification.forward  s    * &1%<k$++B]B]((/!5#	  
  
/%%fdkkBDY,F)-)9TGf$EvE'!//))#*#A#A
 	
r#   )r  rV  r  )NNNNN)r.   r/   r0   r1   rF   r   r  r   _IMAGE_CLASS_CHECKPOINTr=   r  _IMAGE_CLASS_EXPECTED_OUTPUTr   r3   r4   
LongTensorr   r   r6   rX   r7   r[   r\   s   @r!   r  r    s      ++?@*,$4	 59-1,0/3&*)
u001)
 ))*)
 $D>	)

 'tn)
 d^)
 
u..	/)
 A)
r#   r  zBNAT backbone, to be used with frameworks like DETR and MaskFormer.c                      ^  \ rS rSrU 4S jrS r\" \5      \" \	\
S9   SS\R                  S\\   S\\   S\\   S	\	4
S
 jj5       5       rSrU =r$ )NatBackbonei:  c           	      8  > [         TU ]  U5        [         TU ]	  U5        [        U S/5        [	        U5      U l        [        U5      U l        UR                  /[        [        UR                  5      5       Vs/ s H  n[        UR                  SU-  -  5      PM      sn-   U l        0 n[        U R                  U R                   5       H  u  pE["        R$                  " U5      X4'   M     ["        R&                  " U5      U l        U R+                  5         g s  snf )Nrh  r`   )rE   rF   _init_backboner   rB   rW   r:  rj  rJ   r,  r   r=  r   ri  zipout_featuresr  r   rI   
ModuleDicthidden_states_normsrn  )rP   rQ   r1  r  stageri   rR   s         r!   rF   NatBackbone.__init__?  s     v&$
+'/!&)#--.X]^abhbobo^pXq1rXqST#f6F6FA6M2NXq1rr !#&t'8'8$--#HE)+l)C& $I#%==1D#E  	 2ss   9%Dc                 .    U R                   R                  $ r   rr  r   s    r!   rs   NatBackbone.get_input_embeddingsR  ru  r#   )r~  r  rT   rF  r   rH  rU   c                 $   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  U5      nU R                  UUSSSS9nUR                  nSn[        U R                  U5       H  u  pXR                  ;   d  M  U
R                  u  ppU
R                  SSSS5      R                  5       n
U
R                  XU-  U5      n
U R                  U	   " U
5      n
U
R                  XX5      n
U
R                  SSSS5      R                  5       n
X4-  nM     U(       d  U4nU(       a  XR                  4-  nU$ [!        UU(       a  UR                  OSUR"                  S	9$ )
a  
Returns:

Examples:

```python
>>> from transformers import AutoImageProcessor, AutoBackbone
>>> import torch
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> processor = AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224")
>>> model = AutoBackbone.from_pretrained(
...     "shi-labs/nat-mini-in1k-224", out_features=["stage1", "stage2", "stage3", "stage4"]
... )

>>> inputs = processor(image, return_tensors="pt")

>>> outputs = model(**inputs)

>>> feature_maps = outputs.feature_maps
>>> list(feature_maps[-1].shape)
[1, 512, 7, 7]
```NT)r   rF  rG  rH  r-   r   r`   rb   r   )feature_mapsr*   r+   )rQ   r  rF  r   rW   rj  r,   r  stage_namesr  rq   rr   r   r   r  r*   r	   r+   )rP   rT   rF  r   rH  r  r   r*   r  r  hidden_stater  ri   rt   ru   r   s                   r!   rX   NatBackbone.forwardU  s   H &1%<k$++B]B]$8$D $++JjJj 	 2C1N-TXT_T_TqTq??<8,,/!%59  
  66#&t'7'7#GE))):F:L:L7
&+33Aq!Q?JJL+00e^\Z#77>|L+00UY+33Aq!Q?JJL/ $H "_F#0022M%3G'//T))
 	
r#   )rW   rj  r  ri  )NNN)r.   r/   r0   r1   rF   rs  r   r  r   r	   r  r3   rZ   r   r   rX   r7   r[   r\   s   @r!   r  r  :  s    
&0 ++?@>X 04,0&*J
llJ
 'tnJ
 $D>	J

 d^J
 
J
 Y AJ
r#   r  )r  rf  rU  r  )r   F)Fr2   r   dataclassesr   typingr   r   r3   r   activationsr   modeling_outputsr	   modeling_utilsr
   pytorch_utilsr   r   utilsr   r   r   r   r   r   r   r   r   utils.backbone_utilsr   configuration_natr   natten.functionalr   r   
get_loggerr.   loggerr  r  r  r  r  r'   r9   r=   r   rB   rG   rx   rZ   r   r   r   r   r   r   r   r   r   r   r)  r:  rU  NAT_START_DOCSTRINGr  rf  r  r  __all__r-   r#   r!   <module>r     s   8  ! "   " / . R
 
 
 3 ( ;;// 
		H	%  3 '  7 *  K{ K K@  K[  K  KF  K{  K  KFBII ,! !HRYY .U\\ e T V[VbVb (%")) %:BII :z
")) 
!")) !Hbii 			 	@ryy @F+ryy +\B
 B
J* *.	  " cR
! R
	R
j  A
 2 A
A
H Hc
$m c
	c
L Yr#   