
    cCi               	       t   S r SSKrSSKJr  SSKJrJr  SSKrSSKJr  SSK	J
r
  SSKJr  SS	KJr  SS
KJrJr  SSKJrJrJrJrJrJr  SSKJr  SSKJr  \" 5       (       a	  SSKJrJr  OS rS r\R@                  " \!5      r"\\" SS9 " S S\5      5       5       r#\\" SS9 " S S\5      5       5       r$\\" SS9 " S S\5      5       5       r% " S S\RL                  5      r' " S S\RL                  5      r( " S  S!\RL                  5      r)SDS"\RT                  S#\+S$\,S%\RT                  4S& jjr- " S' S(\RL                  5      r. " S) S*\RL                  5      r/ " S+ S,\RL                  5      r0 " S- S.\RL                  5      r1 " S/ S0\RL                  5      r2 " S1 S2\RL                  5      r3 " S3 S4\RL                  5      r4 " S5 S6\RL                  5      r5 " S7 S8\RL                  5      r6\ " S9 S:\5      5       r7\ " S; S<\75      5       r8\" S=S9 " S> S?\75      5       r9\" S@S9 " SA SB\7\5      5       r:/ SCQr;g)Ez9PyTorch Dilated Neighborhood Attention Transformer model.    N)	dataclass)OptionalUnion)nn   )ACT2FN)BackboneOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputOptionalDependencyNotAvailableauto_docstringis_natten_availableloggingrequires_backends)BackboneMixin   )DinatConfig)
natten2davnatten2dqkrpbc                      [        5       eNr   argskwargss     b/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/dinat/modeling_dinat.pyr   r   ,       ,..    c                      [        5       er   r   r   s     r   r   r   /   r   r    zO
    Dinat encoder's outputs, with potential hidden states and attentions.
    )custom_introc                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   S	rg)
DinatEncoderOutput9   a  
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
    shape `(batch_size, hidden_size, height, width)`.

    Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
    include the spatial dimensions.
Nlast_hidden_state.hidden_states
attentionsreshaped_hidden_states )__name__
__module____qualname____firstlineno____doc__r&   r   torchFloatTensor__annotations__r'   tupler(   r)   __static_attributes__r*   r    r   r$   r$   9   s}     6:x 1 129=AM8E%"3"3S"89:A:>Ju00#567>FJHU5+<+<c+A%BCJr    r$   zW
    Dinat model's outputs that also contains a pooling of the last hidden states.
    c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S	'   S
rg)DinatModelOutputO   a  
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
    Average pooling of the last layer hidden-state.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
    shape `(batch_size, hidden_size, height, width)`.

    Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
    include the spatial dimensions.
Nr&   pooler_output.r'   r(   r)   r*   )r+   r,   r-   r.   r/   r&   r   r0   r1   r2   r8   r'   r3   r(   r)   r4   r*   r    r   r6   r6   O   s    	 6:x 1 12915M8E--.5=AM8E%"3"3S"89:A:>Ju00#567>FJHU5+<+<c+A%BCJr    r6   z1
    Dinat outputs for image classification.
    c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S	'   S
rg)DinatImageClassifierOutputh   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Classification (or regression if config.num_labels==1) loss.
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
    Classification (or regression if config.num_labels==1) scores (before SoftMax).
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
    shape `(batch_size, hidden_size, height, width)`.

    Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
    include the spatial dimensions.
Nlosslogits.r'   r(   r)   r*   )r+   r,   r-   r.   r/   r<   r   r0   r1   r2   r=   r'   r3   r(   r)   r4   r*   r    r   r:   r:   h   s     )-D(5$$
%,*.FHU&&'.=AM8E%"3"3S"89:A:>Ju00#567>FJHU5+<+<c+A%BCJr    r:   c                   r   ^  \ rS rSrSrU 4S jrS\\R                     S\	\R                     4S jrSrU =r$ )DinatEmbeddings   z.
Construct the patch and position embeddings.
c                    > [         TU ]  5         [        U5      U l        [        R
                  " UR                  5      U l        [        R                  " UR                  5      U l
        g r   )super__init__DinatPatchEmbeddingspatch_embeddingsr   	LayerNorm	embed_dimnormDropouthidden_dropout_probdropoutselfconfig	__class__s     r   rC   DinatEmbeddings.__init__   sG     4V <LL!1!12	zz&"<"<=r    pixel_valuesreturnc                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )rE   rH   rK   )rM   rQ   
embeddingss      r   forwardDinatEmbeddings.forward   s4    **<8
YYz*
\\*-
r    )rK   rH   rE   )r+   r,   r-   r.   r/   rC   r   r0   r1   r3   TensorrU   r4   __classcell__rO   s   @r   r?   r?      s9    >HU->->$? E%,,DW  r    r?   c                   l   ^  \ rS rSrSrU 4S jrS\\R                     S\R                  4S jr
SrU =r$ )rD      z
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, height, width, hidden_size)` to be consumed by a
Transformer.
c                 H  > [         TU ]  5         UR                  nUR                  UR                  pCX0l        US:X  a  O[        S5      e[        R                  " [        R                  " U R                  US-  SSSS9[        R                  " US-  USSSS95      U l	        g )N   z2Dinat only supports patch size of 4 at the moment.   r   r   r^   r^   r   r   )kernel_sizestridepadding)
rB   rC   
patch_sizenum_channelsrG   
ValueErrorr   
SequentialConv2d
projection)rM   rN   re   rf   hidden_sizerO   s        r   rC   DinatPatchEmbeddings.__init__   s    &&
$*$7$79I9Ik(? QRR--IId'')9vV\flmIIkQ&PV`fg
r    rQ   rR   c                     UR                   u  p#pEX0R                  :w  a  [        S5      eU R                  U5      nUR	                  SSSS5      nU$ )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r^   r   r   )shaperf   rg   rj   permute)rM   rQ   _rf   heightwidthrT   s          r   rU   DinatPatchEmbeddings.forward   sZ    )5););&,,,w  __\2
''1a3
r    )rf   rj   )r+   r,   r-   r.   r/   rC   r   r0   r1   rW   rU   r4   rX   rY   s   @r   rD   rD      s4    
"	HU->->$? 	ELL 	 	r    rD   c                      ^  \ rS rSrSr\R                  4S\S\R                  SS4U 4S jjjr	S\
R                  S\
R                  4S	 jrS
rU =r$ )DinatDownsampler   z
Convolutional Downsampling Layer.

Args:
    dim (`int`):
        Number of input channels.
    norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
        Normalization layer class.
dim
norm_layerrR   Nc           	         > [         TU ]  5         Xl        [        R                  " USU-  SSSSS9U l        U" SU-  5      U l        g )Nr^   r_   r`   ra   F)rb   rc   rd   bias)rB   rC   rw   r   ri   	reductionrH   )rM   rw   rx   rO   s      r   rC   DinatDownsampler.__init__   sC    3CVF\binoq3w'	r    input_featurec                     U R                  UR                  SSSS5      5      R                  SSSS5      nU R                  U5      nU$ )Nr   r   r   r^   )r{   ro   rH   )rM   r}   s     r   rU   DinatDownsampler.forward   sJ    }'<'<Q1a'HIQQRSUVXY[\]		-0r    )rw   rH   r{   )r+   r,   r-   r.   r/   r   rF   intModulerC   r0   rW   rU   r4   rX   rY   s   @r   ru   ru      sT     :< (C (RYY ($ ( (U\\ ell  r    ru   input	drop_probtrainingrR   c                    US:X  d  U(       d  U $ SU-
  nU R                   S   4SU R                  S-
  -  -   nU[        R                  " X@R                  U R
                  S9-   nUR                  5         U R                  U5      U-  nU$ )a*  
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
        r   r   )r   )dtypedevice)rn   ndimr0   randr   r   floor_div)r   r   r   	keep_probrn   random_tensoroutputs          r   	drop_pathr      s     CxII[[^

Q 77E

5ELL YYMYYy!M1FMr    c                      ^  \ rS rSrSrSS\\   SS4U 4S jjjrS\R                  S\R                  4S jr
S\4S	 jrS
rU =r$ )DinatDropPath   zXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   rR   c                 .   > [         TU ]  5         Xl        g r   )rB   rC   r   )rM   r   rO   s     r   rC   DinatDropPath.__init__   s    "r    r'   c                 B    [        XR                  U R                  5      $ r   )r   r   r   rM   r'   s     r   rU   DinatDropPath.forward   s    FFr    c                      SU R                    3$ )Nzp=r   rM   s    r   
extra_reprDinatDropPath.extra_repr   s    DNN#$$r    r   r   )r+   r,   r-   r.   r/   r   floatrC   r0   rW   rU   strr   r4   rX   rY   s   @r   r   r      sQ    b#(5/ #T # #GU\\ Gell G%C % %r    r   c                   x   ^  \ rS rSrU 4S jr SS\R                  S\\   S\	\R                     4S jjr
SrU =r$ )	NeighborhoodAttention   c                   > [         TU ]  5         X#-  S:w  a  [        SU SU S35      eX0l        [	        X#-  5      U l        U R                  U R
                  -  U l        X@l        XPl        [        R                  " [        R                  " USU R                  -  S-
  SU R                  -  S-
  5      5      U l        [        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        [        R&                  " UR(                  5      U l        g )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r^   r   )rz   )rB   rC   rg   num_attention_headsr   attention_head_sizeall_head_sizerb   dilationr   	Parameterr0   zerosrpbLinearqkv_biasquerykeyvaluerI   attention_probs_dropout_probrK   rM   rN   rw   	num_headsrb   r   rO   s         r   rC   NeighborhoodAttention.__init__   s:   ?a#C5(^_h^iijk  $- #&s#7 !558P8PP&  <<ID<L<L8Lq8PTUX\XhXhThklTl noYYt1143E3EFOO\
99T//1C1C&//ZYYt1143E3EFOO\
zz&"E"EFr    r'   output_attentionsrR   c                    UR                   u  p4nU R                  U5      R                  USU R                  U R                  5      R                  SS5      nU R                  U5      R                  USU R                  U R                  5      R                  SS5      nU R                  U5      R                  USU R                  U R                  5      R                  SS5      nU[        R                  " U R                  5      -  n[        XgU R                  U R                  U R                  5      n	[        R                  R!                  U	SS9n
U R#                  U
5      n
[%        XU R                  U R                  5      nUR'                  SSSSS5      R)                  5       nUR+                  5       S S U R,                  4-   nUR                  U5      nU(       a  X4nU$ U4nU$ )	Nr   r^   rw   r   r   r]   )rn   r   viewr   r   	transposer   r   mathsqrtr   r   rb   r   r   
functionalsoftmaxrK   r   ro   
contiguoussizer   )rM   r'   r   
batch_size
seq_lengthrp   query_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputss                 r   rU   NeighborhoodAttention.forward  s   
 %2$7$7!
JJ}%T*b$":":D<T<TUYq!_ 	 HH]#T*b$":":D<T<TUYq!_ 	 JJ}%T*b$":":D<T<TUYq!_ 	 "DIId.F.F$GG )4K[K[]a]j]jk --//0@b/I ,,7"?AQAQSWS`S`a%--aAq!<GGI"/"4"4"6s";t?Q?Q>S"S%**+BC6G=2 O\M]r    )
r   r   r   rK   rb   r   r   r   r   r   Fr+   r,   r-   r.   rC   r0   rW   r   boolr3   rU   r4   rX   rY   s   @r   r   r      sE    G2 -2,||, $D>, 
u||		, ,r    r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )NeighborhoodAttentionOutputi>  c                    > [         TU ]  5         [        R                  " X"5      U l        [        R
                  " UR                  5      U l        g r   )rB   rC   r   r   denserI   r   rK   rM   rN   rw   rO   s      r   rC   $NeighborhoodAttentionOutput.__init__?  s4    YYs(
zz&"E"EFr    r'   input_tensorrR   c                 J    U R                  U5      nU R                  U5      nU$ r   r   rK   )rM   r'   r   s      r   rU   #NeighborhoodAttentionOutput.forwardD  s$    

=1]3r    r   
r+   r,   r-   r.   rC   r0   rW   rU   r4   rX   rY   s   @r   r   r   >  s7    G
U\\  RWR^R^  r    r   c                   ~   ^  \ rS rSrU 4S jrS r S	S\R                  S\\	   S\
\R                     4S jjrSrU =r$ )
NeighborhoodAttentionModuleiK  c                    > [         TU ]  5         [        XX4U5      U l        [	        X5      U l        [        5       U l        g r   )rB   rC   r   rM   r   r   setpruned_headsr   s         r   rC   $NeighborhoodAttentionModule.__init__L  s4    )&yxX	1&>Er    c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   r   )lenr   rM   r   r   r   r   r   r   r   r   r   r   union)rM   headsindexs      r   prune_heads'NeighborhoodAttentionModule.prune_headsR  s   u:?79900$))2O2OQUQbQb

 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r    r'   r   rR   c                 d    U R                  X5      nU R                  US   U5      nU4USS  -   nU$ Nr   r   )rM   r   )rM   r'   r   self_outputsattention_outputr   s         r   rU   #NeighborhoodAttentionModule.forwardd  s@    
 yyB;;|AF#%QR(88r    )r   r   rM   r   )r+   r,   r-   r.   rC   r   r0   rW   r   r   r3   rU   r4   rX   rY   s   @r   r   r   K  sI    ";* -2|| $D> 
u||		 r    r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )DinatIntermediateio  c                   > [         TU ]  5         [        R                  " U[	        UR
                  U-  5      5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )rB   rC   r   r   r   	mlp_ratior   
isinstance
hidden_actr   r   intermediate_act_fnr   s      r   rC   DinatIntermediate.__init__p  sd    YYsC(8(83(>$?@
f''--'-f.?.?'@D$'-'8'8D$r    r'   rR   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r   r   s     r   rU   DinatIntermediate.forwardx  s&    

=100?r    r   r   rY   s   @r   r   r   o  s(    9U\\ ell  r    r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )DinatOutputi~  c                    > [         TU ]  5         [        R                  " [	        UR
                  U-  5      U5      U l        [        R                  " UR                  5      U l	        g r   )
rB   rC   r   r   r   r   r   rI   rJ   rK   r   s      r   rC   DinatOutput.__init__  sF    YYs6#3#3c#9:C@
zz&"<"<=r    r'   rR   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r   s     r   rU   DinatOutput.forward  s$    

=1]3r    r   r   rY   s   @r   r   r   ~  s(    >
U\\ ell  r    r   c            	          ^  \ rS rSrS	U 4S jjrS r S
S\R                  S\\	   S\
\R                  \R                  4   4S jjrSrU =r$ )
DinatLayeri  c                   > [         TU ]  5         UR                  U l        UR                  U l        X@l        U R                  U R                  -  U l        [        R                  " X!R                  S9U l	        [        XX0R                  U R                  S9U l        US:  a  [        U5      O[        R                  " 5       U l        [        R                  " X!R                  S9U l        [!        X5      U l        [%        X5      U l        UR(                  S:  a>  [        R*                  " UR(                  [,        R.                  " SU45      -  SS9U l        g S U l        g )Neps)rb   r   r   r   r^   T)requires_grad)rB   rC   chunk_size_feed_forwardrb   r   window_sizer   rF   layer_norm_epslayernorm_beforer   	attentionr   Identityr   layernorm_afterr   intermediater   r   layer_scale_init_valuer   r0   oneslayer_scale_parameters)rM   rN   rw   r   r   drop_path_raterO   s         r   rC   DinatLayer.__init__  s   '-'E'E$!-- ++dmm; "S6K6K L40@0@4==
 ;I3:N~6TVT_T_Ta!||C5J5JK-f:!&. ,,q0 LL66QH9MM]ab 	#  	#r    c                     U R                   nSnX$:  d  X4:  aD  S=pg[        SXC-
  5      n[        SXB-
  5      n	SSXhXy4n[        R                  R	                  X5      nX4$ )N)r   r   r   r   r   r   r   )r  maxr   r   pad)
rM   r'   rq   rr   r  
pad_valuespad_lpad_tpad_rpad_bs
             r   	maybe_padDinatLayer.maybe_pad  sn    &&'
5#6E;./E;/0EQe;JMM--mHM((r    r'   r   rR   c                    UR                  5       u  p4pVUnU R                  U5      nU R                  XU5      u  pUR                  u  ppU R	                  XS9nUS   nUS   S:  =(       d    US   S:  nU(       a  US S 2S U2S U2S S 24   R                  5       nU R                  b  U R                  S   U-  nXpR                  U5      -   nU R                  U5      nU R                  U R                  U5      5      nU R                  b  U R                  S   U-  nXR                  U5      -   nU(       a  XS   4nU$ U4nU$ )N)r   r   r      r   )r   r  r  rn   r  r   r  r   r
  r   r  )rM   r'   r   r   rq   rr   channelsshortcutr  rp   
height_pad	width_padattention_outputsr   
was_paddedlayer_outputlayer_outputss                    r   rU   DinatLayer.forward  sf   
 /<.@.@.B+
E --m<$(NN=%$P!&3&9&9#y NN=N^,Q/]Q&;*Q-!*;
/7F7FUFA0EFQQS&&2#::1=@PP >>2B#CC++M:{{4#4#4\#BC&&266q9LHL$~~l'CC@Q';< YeWfr    )r  r  r   r   r  rb   r  r
  r  r   r  )r   r   )r+   r,   r-   r.   rC   r  r0   rW   r   r   r3   rU   r4   rX   rY   s   @r   r   r     sR    
(	) -2$||$ $D>$ 
u||U\\)	*	$ $r    r   c                   x   ^  \ rS rSrU 4S jr SS\R                  S\\   S\	\R                     4S jjr
SrU =r$ )	
DinatStagei  c                 $  > [         T	U ]  5         Xl        X l        [        R
                  " [        U5       Vs/ s H  n[        UUUXX   Xh   S9PM     sn5      U l        Ub  U" U[        R                  S9U l
        OS U l
        SU l        g s  snf )N)rN   rw   r   r   r  )rw   rx   F)rB   rC   rN   rw   r   
ModuleListranger   layersrF   
downsamplepointing)
rM   rN   rw   depthr   	dilationsr  r,  irO   s
            r   rC   DinatStage.__init__  s    mm u	 &A !'&\#1#4 &	
 !(SR\\JDO"DO%	s   Br'   r   rR   c                     UR                  5       u  p4pS[        U R                  5       H  u  pgU" X5      nUS   nM     Un	U R                  b  U R                  U	5      nX4n
U(       a  U
WSS  -  n
U
$ r   )r   	enumerater+  r,  )rM   r'   r   rp   rq   rr   r0  layer_moduler$  !hidden_states_before_downsamplingstage_outputss              r   rU   DinatStage.forward  s    
 ,0025(5OA(JM)!,M  6 -:)??& OO,MNM&J]12..Mr    )rN   rw   r,  r+  r-  r   r   rY   s   @r   r'  r'    sD    8 -2|| $D> 
u||		 r    r'  c                      ^  \ rS rSrU 4S jr    SS\R                  S\\   S\\   S\\   S\\   S\	\
\4   4S	 jjrS
rU =r$ )DinatEncoderi   c                   > [         TU ]  5         [        UR                  5      U l        Xl        [        R                  " SUR                  [        UR                  5      SS9 Vs/ s H  o"R                  5       PM     nn[        R                  " [        U R                  5       Vs/ s H  n[        U[        UR                   SU-  -  5      UR                  U   UR"                  U   UR$                  U   U[        UR                  S U 5      [        UR                  S US-    5       X@R                  S-
  :  a  [&        OS S9PM     sn5      U l        g s  snf s  snf )Nr   cpu)r   r^   r   )rN   rw   r.  r   r/  r  r,  )rB   rC   r   depths
num_levelsrN   r0   linspacer  sumitemr   r)  r*  r'  r   rG   r   r/  ru   levels)rM   rN   xdpri_layerrO   s        r   rC   DinatEncoder.__init__  s0   fmm,!&63H3H#fmmJ\ej!kl!kAvvx!klmm  %T__5  6G !F,,q'z9: --0$..w7$..w7#&s6=='+B'Cc&--XeZadeZeJfFg#h4;ooPQ>Q4Q/X\  6
 ms   &E(B#Er'   r   output_hidden_states(output_hidden_states_before_downsamplingreturn_dictrR   c                    U(       a  SOS nU(       a  SOS nU(       a  SOS nU(       a  UR                  SSSS5      n	Xa4-  nXy4-  n[        U R                  5       H  u  pU" X5      nUS   nUS   nU(       a&  U(       a  UR                  SSSS5      n	Xm4-  nXy4-  nO,U(       a%  U(       d  UR                  SSSS5      n	Xa4-  nXy4-  nU(       d  My  XSS  -  nM     U(       d  [        S XU4 5       5      $ [	        UUUUS9$ )Nr*   r   r   r   r^   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r*   ).0vs     r   	<genexpr>'DinatEncoder.forward.<locals>.<genexpr><  s     m$[q$[s   	)r&   r'   r(   r)   )ro   r3  rA  r3   r$   )rM   r'   r   rF  rG  rH  all_hidden_statesall_reshaped_hidden_statesall_self_attentionsreshaped_hidden_stater0  r4  r$  r5  s                 r   rU   DinatEncoder.forward  sA    #7BD+?RT"$5b4$1$9$9!Q1$E!!11&*BB&(5OA(JM)!,M0=a0@-#(P(I(Q(QRSUVXY[\(]%!%II!*.FF*%.V(5(=(=aAq(I%!%55!*.FF*  #QR'88#%  6( m]GZ$[mmm!++*#=	
 	
r    )rN   rA  r=  )FFFT)r+   r,   r-   r.   rC   r0   rW   r   r   r   r3   r$   rU   r4   rX   rY   s   @r   r9  r9     sy    
. -2/4CH&*.
||.
 $D>.
 'tn	.

 3;4..
 d^.
 
u((	).
 .
r    r9  c                   .    \ rS rSr% \\S'   SrSrS rSr	g)DinatPreTrainedModeliF  rN   dinatrQ   c                 
   [        U[        R                  [        R                  45      (       ak  UR                  R
                  R                  SU R                  R                  S9  UR                  b%  UR                  R
                  R                  5         gg[        U[        R                  5      (       aJ  UR                  R
                  R                  5         UR                  R
                  R                  S5        gg)zInitialize the weightsr   )meanstdNg      ?)r   r   r   ri   weightdatanormal_rN   initializer_rangerz   zero_rF   fill_)rM   modules     r   _init_weights"DinatPreTrainedModel._init_weightsL  s    fryy"))455 MM&&CT[[5R5R&S{{&  &&( '--KK""$MM$$S) .r    r*   N)
r+   r,   r-   r.   r   r2   base_model_prefixmain_input_namera  r4   r*   r    r   rU  rU  F  s    $O
*r    rU  c                      ^  \ rS rSrSU 4S jjrS rS r\    SS\\	R                     S\\   S\\   S\\   S	\\\4   4
S
 jj5       rSrU =r$ )
DinatModeliY  c                   > [         TU ]  U5        [        U S/5        Xl        [	        UR
                  5      U l        [        UR                  SU R                  S-
  -  -  5      U l	        [        U5      U l        [        U5      U l        [        R                  " U R                  UR                   S9U l        U(       a  [        R$                  " S5      OSU l        U R)                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
nattenr^   r   r  N)rB   rC   r   rN   r   r<  r=  r   rG   num_featuresr?   rT   r9  encoderr   rF   r  	layernormAdaptiveAvgPool1dpooler	post_init)rM   rN   add_pooling_layerrO   s      r   rC   DinatModel.__init__[  s    
 	 $
+fmm, 0 0119L3M MN)&1#F+d&7&7V=R=RS1Bb**1- 	r    c                 .    U R                   R                  $ r   rT   rE   r   s    r   get_input_embeddingsDinatModel.get_input_embeddingsq      ///r    c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsrj  layerr  r   )rM   heads_to_prunerx  r   s       r   _prune_headsDinatModel._prune_headst  s<    
 +002LELLu%//;;EB 3r    rQ   r   rF  rH  rR   c                 Z   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [	        S5      eU R                  U5      nU R                  UUUUS9nUS   nU R                  U5      nS nU R                  bH  U R                  UR                  SS5      R                  SS5      5      n[        R                  " US5      nU(       d  Xx4USS  -   n	U	$ [        UUUR                  UR                  UR                  S9$ )Nz You have to specify pixel_valuesr   rF  rH  r   r   r^   )r&   r8   r'   r(   r)   )rN   r   rF  use_return_dictrg   rT   rj  rk  rm  flattenr   r0   r6   r'   r(   r)   )
rM   rQ   r   rF  rH  embedding_outputencoder_outputssequence_outputpooled_outputr   s
             r   rU   DinatModel.forward|  s?    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@??<8,,/!5#	 ' 
 *!,..9;;" KK(?(?1(E(O(OPQST(UVM!MM-;M%58KKFM-')77&11#2#I#I
 	
r    )rN   rT   rj  rk  ri  r=  rm  )T)NNNN)r+   r,   r-   r.   rC   rs  rz  r   r   r0   r1   r   r   r3   r6   rU   r4   rX   rY   s   @r   rf  rf  Y  s    ,0C  59,0/3&*,
u001,
 $D>,
 'tn	,

 d^,
 
u&&	',
 ,
r    rf  z
    Dinat Model transformer with an image classification head on top (a linear layer on top of the final hidden state
    of the [CLS] token) e.g. for ImageNet.
    c                      ^  \ rS rSrU 4S jr\     SS\\R                     S\\R                     S\\
   S\\
   S\\
   S\\\4   4S	 jj5       rS
rU =r$ )DinatForImageClassificationi  c                 ^  > [         TU ]  U5        [        U S/5        UR                  U l        [	        U5      U l        UR                  S:  a5  [        R                  " U R
                  R                  UR                  5      O[        R                  " 5       U l
        U R                  5         g )Nrh  r   )rB   rC   r   
num_labelsrf  rV  r   r   ri  r	  
classifierrn  rL   s     r   rC   $DinatForImageClassification.__init__  s     $
+ ++'
 FLEVEVYZEZBIIdjj--v/@/@A`b`k`k`m 	
 	r    rQ   labelsr   rF  rH  rR   c                 V   Ub  UOU R                   R                  nU R                  UUUUS9nUS   nU R                  U5      nSn	Ub  U R	                  X(U R                   5      n	U(       d  U4USS -   n
U	b  U	4U
-   $ U
$ [        U	UUR                  UR                  UR                  S9$ )ab  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr}  r   r^   )r<   r=   r'   r(   r)   )	rN   r~  rV  r  loss_functionr:   r'   r(   r)   )rM   rQ   r  r   rF  rH  r   r  r=   r<   r   s              r   rU   #DinatForImageClassification.forward  s     &1%<k$++B]B]**/!5#	  
  
/%%fdkkBDY,F)-)9TGf$EvE)!//))#*#A#A
 	
r    )r  rV  r  )NNNNN)r+   r,   r-   r.   rC   r   r   r0   r1   
LongTensorr   r   r3   r:   rU   r4   rX   rY   s   @r   r  r    s       59-1,0/3&*)
u001)
 ))*)
 $D>	)

 'tn)
 d^)
 
u00	1)
 )
r    r  zL
    NAT backbone, to be used with frameworks like DETR and MaskFormer.
    c                      ^  \ rS rSrU 4S jrS r\   SS\R                  S\	\
   S\	\
   S\	\
   S\4
S	 jj5       rS
rU =r$ )DinatBackbonei  c           	      8  > [         TU ]  U5        [         TU ]	  U5        [        U S/5        [	        U5      U l        [        U5      U l        UR                  /[        [        UR                  5      5       Vs/ s H  n[        UR                  SU-  -  5      PM      sn-   U l        0 n[        U R                  U R                   5       H  u  pE["        R$                  " U5      X4'   M     ["        R&                  " U5      U l        U R+                  5         g s  snf )Nrh  r^   )rB   rC   _init_backboner   r?   rT   r9  rj  rG   r*  r   r<  r   ri  zip_out_featuresr  r   rF   
ModuleDicthidden_states_normsrn  )rM   rN   r0  r  stagerf   rO   s         r   rC   DinatBackbone.__init__  s     v&$
+)&1#F+#--.X]^abhbobo^pXq1rXqST#f6F6FA6M2NXq1rr !#&t'9'94==#IE)+l)C& $J#%==1D#E  	 2ss   9%Dc                 .    U R                   R                  $ r   rr  r   s    r   rs  "DinatBackbone.get_input_embeddings	  ru  r    rQ   rF  r   rH  rR   c                 $   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  U5      nU R                  UUSSSS9nUR                  nSn[        U R                  U5       H  u  pXR                  ;   d  M  U
R                  u  ppU
R                  SSSS5      R                  5       n
U
R                  XU-  U5      n
U R                  U	   " U
5      n
U
R                  XX5      n
U
R                  SSSS5      R                  5       n
X4-  nM     U(       d  U4nU(       a  XR                  4-  nU$ [!        UU(       a  UR                  OSUR"                  S	9$ )
a  
Examples:

```python
>>> from transformers import AutoImageProcessor, AutoBackbone
>>> import torch
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> processor = AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224")
>>> model = AutoBackbone.from_pretrained(
...     "shi-labs/nat-mini-in1k-224", out_features=["stage1", "stage2", "stage3", "stage4"]
... )

>>> inputs = processor(image, return_tensors="pt")

>>> outputs = model(**inputs)

>>> feature_maps = outputs.feature_maps
>>> list(feature_maps[-1].shape)
[1, 512, 7, 7]
```NT)r   rF  rG  rH  r*   r   r^   r   r   )feature_mapsr'   r(   )rN   r~  rF  r   rT   rj  r)   r  stage_namesout_featuresrn   ro   r   r   r  r'   r	   r(   )rM   rQ   rF  r   rH  r  r   r'   r  r  hidden_stater   rf   rq   rr   r   s                   r   rU   DinatBackbone.forward  s   B &1%<k$++B]B]$8$D $++JjJj 	 2C1N-TXT_T_TqTq??<8,,/!%59  
  66#&t'7'7#GE))):F:L:L7
&+33Aq!Q?JJL+00e^\Z#77>|L+00UY+33Aq!Q?JJL/ $H "_F#0022M%3G'//T))
 	
r    )rT   rj  r  ri  )NNN)r+   r,   r-   r.   rC   rs  r   r0   rW   r   r   r	   rU   r4   rX   rY   s   @r   r  r    ss    &0  04,0&*G
llG
 'tnG
 $D>	G

 d^G
 
G
 G
r    r  )r  rf  rU  r  )r   F)<r/   r   dataclassesr   typingr   r   r0   r   activationsr   modeling_outputsr	   modeling_utilsr
   pytorch_utilsr   r   utilsr   r   r   r   r   r   utils.backbone_utilsr   configuration_dinatr   natten.functionalr   r   
get_loggerr+   loggerr$   r6   r:   r   r?   rD   ru   rW   r   r   r   r   r   r   r   r   r   r   r'  r9  rU  rf  r  r  __all__r*   r    r   <module>r     s   @  ! "   ! . - Q  2 , ;;// 
		H	% 
K K K  
K{ K K& 
K K K*bii ,!299 !Hryy 0U\\ e T V[VbVb *%BII %CBII CL
")) 
!")) !H		 	")) 	D DN, ,^C
299 C
L *? * *$ O
% O
 O
d ;
"6 ;
;
| 
_
(- _

_
D ar    