
    bCitH              	          S r SSKJr  SSKrSSKJr  SSKJr  SSKJrJ	r	J
r
Jr  SSKJr  SS	KJrJr  SS
KJr  SSKJr  SSKJr  \R.                  " \5      rS-S\R4                  S\S\S\R4                  4S jjr " S S\R<                  5      r " S S\R<                  5      r  " S S\RB                  5      r" " S S\R<                  5      r# " S S\R<                  5      r$ " S S\R<                  5      r% " S S \R<                  5      r&\ " S! S"\5      5       r'\ " S# S$\'5      5       r(\" S%S&9 " S' S(\'5      5       r)\" S)S&9 " S* S+\'\5      5       r*/ S,Qr+g).zPyTorch ConvNextV2 model.    )OptionalN)nn   )ACT2FN)BackboneOutputBaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttention)PreTrainedModel)auto_docstringlogging)BackboneMixin)can_return_tuple   )ConvNextV2Configinput	drop_probtrainingreturnc                    US:X  d  U(       d  U $ SU-
  nU R                   S   4SU R                  S-
  -  -   nU[        R                  " X@R                  U R
                  S9-   nUR                  5         U R                  U5      U-  nU$ )a*  
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
        r   r   )r   )dtypedevice)shapendimtorchrandr   r   floor_div)r   r   r   	keep_probr   random_tensoroutputs          l/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/convnextv2/modeling_convnextv2.py	drop_pathr$   (   s     CxII[[^

Q 77E

5ELL YYMYYy!M1FM    c                      ^  \ rS rSrSrSS\\   SS4U 4S jjjrS\R                  S\R                  4S jr
S\4S	 jrS
rU =r$ )ConvNextV2DropPath=   zXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r   c                 .   > [         TU ]  5         Xl        g N)super__init__r   )selfr   	__class__s     r#   r,   ConvNextV2DropPath.__init__@   s    "r%   hidden_statesc                 B    [        XR                  U R                  5      $ r*   )r$   r   r   )r-   r0   s     r#   forwardConvNextV2DropPath.forwardD   s    FFr%   c                      SU R                    3$ )Nzp=r   )r-   s    r#   
extra_reprConvNextV2DropPath.extra_reprG   s    DNN#$$r%   r5   r*   )__name__
__module____qualname____firstlineno____doc__r   floatr,   r   Tensorr2   strr6   __static_attributes____classcell__r.   s   @r#   r'   r'   =   sQ    b#(5/ #T # #GU\\ Gell G%C % %r%   r'   c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	ConvNextV2GRNK   z)GRN (Global Response Normalization) layerdimc                    > [         TU ]  5         [        R                  " [        R
                  " SSSU5      5      U l        [        R                  " [        R
                  " SSSU5      5      U l        g )Nr   )r+   r,   r   	Parameterr   zerosweightbias)r-   rF   r.   s     r#   r,   ConvNextV2GRN.__init__N   sL    ll5;;q!Q#<=LLQ1c!:;	r%   r0   r   c                     [         R                  R                  USSSS9nX"R                  SSS9S-   -  nU R                  X-  -  U R
                  -   U-   nU$ )N   )r   rN   T)ordrF   keepdim)rF   rP   ư>)r   linalgvector_normmeanrJ   rK   )r-   r0   global_featuresnorm_featuress       r#   r2   ConvNextV2GRN.forwardS   se    ,,22=aV]a2b'+?+?BPT+?+UX\+\]}'DE		QTaar%   )rK   rJ   )r8   r9   r:   r;   r<   intr,   r   FloatTensorr2   r@   rA   rB   s   @r#   rD   rD   K   s6    3<C <
U%6%6 5;L;L  r%   rD   c                   v   ^  \ rS rSrSrSSS.U 4S jjrS\R                  S\R                  4U 4S	 jjrS
r	U =r
$ )ConvNextV2LayerNorm]   a5  LayerNorm that supports two data formats: channels_last (default) or channels_first.
The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
rR   channels_lastepsdata_formatc                `   > [         TU ]  " U4SU0UD6  US;  a  [        SU 35      eX0l        g )Nr`   )r^   channels_firstzUnsupported data format: )r+   r,   NotImplementedErrorra   )r-   normalized_shaper`   ra   kwargsr.   s        r#   r,   ConvNextV2LayerNorm.__init__c   s=    )=s=f=AA%(A+&OPP&r%   featuresr   c                    > U R                   S:X  a9  UR                  SSSS5      n[        TU ]  U5      nUR                  SSSS5      nU$ [        TU ]  U5      nU$ )zt
Args:
    features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels)
rc   r   rN   r   r   )ra   permuter+   r2   )r-   rh   r.   s     r#   r2   ConvNextV2LayerNorm.forwardi   sj    
 //''1a3Hwx0H''1a3H  wx0Hr%   ra   r8   r9   r:   r;   r<   r,   r   r>   r2   r@   rA   rB   s   @r#   r\   r\   ]   s9    
 15/ ' '   r%   r\   c                   f   ^  \ rS rSrSrU 4S jrS\R                  S\R                  4S jr	Sr
U =r$ )ConvNextV2Embeddingsx   zThis class is comparable to (and inspired by) the SwinEmbeddings class
found in src/transformers/models/swin/modeling_swin.py.
c                   > [         TU ]  5         [        R                  " UR                  UR
                  S   UR                  UR                  S9U l        [        UR
                  S   SSS9U l	        UR                  U l        g )Nr   kernel_sizestriderR   rc   r_   )
r+   r,   r   Conv2dnum_channelshidden_sizes
patch_sizepatch_embeddingsr\   	layernormr-   configr.   s     r#   r,   ConvNextV2Embeddings.__init__}   sr     "		!4!4Q!7VEVEV_e_p_p!
 -V-@-@-C[kl"//r%   pixel_valuesr   c                     UR                   S   nX R                  :w  a  [        S5      eU R                  U5      nU R	                  U5      nU$ )Nr   zeMake sure that the channel dimension of the pixel values match with the one set in the configuration.)r   rv   
ValueErrorry   rz   )r-   r~   rv   
embeddingss       r#   r2   ConvNextV2Embeddings.forward   sT    #))!,,,,w  **<8
^^J/
r%   )rz   rv   ry   )r8   r9   r:   r;   r<   r,   r   rZ   r>   r2   r@   rA   rB   s   @r#   ro   ro   x   s/    0E$5$5 %,,  r%   ro   c                   j   ^  \ rS rSrSrSU 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )	ConvNextV2Layer   a  This corresponds to the `Block` class in the original implementation.

There are two equivalent implementations: [DwConv, LayerNorm (channels_first), Conv, GELU,1x1 Conv]; all in (N, C,
H, W) (2) [DwConv, Permute to (N, H, W, C), LayerNorm (channels_last), Linear, GELU, Linear]; Permute back

The authors used (2) as they find it slightly faster in PyTorch.

Args:
    config ([`ConvNextV2Config`]): Model configuration class.
    dim (`int`): Number of input channels.
    drop_path (`float`): Stochastic depth rate. Default: 0.0.
c                   > [         TU ]  5         [        R                  " X"SSUS9U l        [        USS9U l        [        R                  " USU-  5      U l        [        UR                     U l        [        SU-  5      U l        [        R                  " SU-  U5      U l        US:  a  [        U5      U l        g [        R                   " 5       U l        g )N   r   )rs   paddinggroupsrR   r`      r   )r+   r,   r   ru   dwconvr\   rz   Linearpwconv1r   
hidden_actactrD   grnpwconv2r'   Identityr$   )r-   r|   rF   r$   r.   s       r#   r,   ConvNextV2Layer.__init__   s    iia3O,Sd;yya#g.&++, S)yyS#.:Cc/+I6r{{}r%   rh   r   c                 L   UnU R                  U5      nUR                  SSSS5      nU R                  U5      nU R                  U5      nU R	                  U5      nU R                  U5      nU R                  U5      nUR                  SSSS5      nX R                  U5      -   nU$ )Nr   rN   r   r   )r   rj   rz   r   r   r   r   r$   )r-   rh   residuals      r#   r2   ConvNextV2Layer.forward   s    ;;x(##Aq!Q/>>(+<<)88H%88H%<<)##Aq!Q/nnX66r%   )r   r$   r   r   rz   r   r   )r   rm   rB   s   @r#   r   r      s.    
]   r%   r   c                   j   ^  \ rS rSrSrSU 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )	ConvNextV2Stage   a  ConvNeXTV2 stage, consisting of an optional downsampling layer + multiple residual blocks.

Args:
    config ([`ConvNextV2Config`]): Model configuration class.
    in_channels (`int`): Number of input channels.
    out_channels (`int`): Number of output channels.
    depth (`int`): Number of residual blocks.
    drop_path_rates(`list[float]`): Stochastic depth rates for each layer.
c                   > [         T	U ]  5         X#:w  d  US:  a:  [        R                  " [	        USSS9[        R
                  " X#XES9/5      U l        O[        R                  " 5       U l        U=(       d    S/U-  n[        R                  " [        U5       Vs/ s H  n[        XXx   S9PM     sn5      U l	        g s  snf )Nr   rR   rc   r_   rr   r   )rF   r$   )
r+   r,   r   
ModuleListr\   ru   downsampling_layerranger   layers)
r-   r|   in_channelsout_channelsrs   rt   depthdrop_path_ratesjr.   s
            r#   r,   ConvNextV2Stage.__init__   s    &&1*&(mm'K[\IIk[`'D# ')mmoD#):cUU]mm^cdi^jk^jYZ_VAST^jk
ks   B>rh   r   c                 r    U R                    H  nU" U5      nM     U R                   H  nU" U5      nM     U$ r*   r   r   )r-   rh   layers      r#   r2   ConvNextV2Stage.forward   s7    ,,EXH -[[EXH !r%   r   )rN   rN   rN   Nrm   rB   s   @r#   r   r      s-    
"   r%   r   c                   ^   ^  \ rS rSrU 4S jr SS\R                  S\\   S\	4S jjr
SrU =r$ )	ConvNextV2Encoder   c           
      .  > [         TU ]  5         [        R                  " 5       U l        [
        R                  " SUR                  [        UR                  5      SS9R                  UR                  5       Vs/ s H  nUR                  5       PM     nnUR                  S   n[        UR                  5       HT  nUR                  U   n[        UUUUS:  a  SOSUR                  U   X5   S9nU R                  R!                  U5        UnMV     g s  snf )Nr   cpu)r   rN   r   )r   r   rt   r   r   )r+   r,   r   r   stagesr   linspacedrop_path_ratesumdepthssplittolistrw   r   
num_stagesr   append)	r-   r|   xr   prev_chsiout_chsstager.   s	           r#   r,   ConvNextV2Encoder.__init__   s    mmo ^^Av'<'<c&-->PY^_eeflfsfst
t HHJt 	 
 &&q)v(()A))!,G#$$EqqmmA& / 2E KKu%H *
s   9Dr0   output_hidden_statesr   c                     U(       a  U/OS nU R                    H!  nU" U5      nUc  M  UR                  U5        M#     [        XS9$ )N)last_hidden_stater0   )r   r   r   )r-   r0   r   all_hidden_stateslayer_modules        r#   r2   ConvNextV2Encoder.forward   sJ     0D]O KKL(7M ,!((7 (
 .oor%   )r   )F)r8   r9   r:   r;   r,   r   r>   r   boolr   r2   r@   rA   rB   s   @r#   r   r      s>    , SX
p"\\
pAI$
p	'
p 
pr%   r   c                   4    \ rS rSr% \\S'   SrSrS/rS r	Sr
g)	ConvNextV2PreTrainedModeli  r|   
convnextv2r~   r   c                    [        U[        R                  [        R                  45      (       ak  UR                  R
                  R                  SU R                  R                  S9  UR                  b%  UR                  R
                  R                  5         gg[        U[        R                  [        45      (       aJ  UR                  R
                  R                  5         UR                  R
                  R                  S5        g[        U[        5      (       aI  UR                  R
                  R                  5         UR                  R
                  R                  5         gg)zInitialize the weightsr   )rU   stdNg      ?)
isinstancer   r   ru   rJ   datanormal_r|   initializer_rangerK   zero_	LayerNormr\   fill_rD   )r-   modules     r#   _init_weights'ConvNextV2PreTrainedModel._init_weights  s    fryy"))455 MM&&CT[[5R5R&S{{&  &&( '/B CDDKK""$MM$$S)..MM$$&KK""$ /r%    N)r8   r9   r:   r;   r   __annotations__base_model_prefixmain_input_name_no_split_modulesr   r@   r   r%   r#   r   r     s!    $$O*+%r%   r   c            	       x   ^  \ rS rSrU 4S jr\\ SS\\R                     S\\
   S\4S jj5       5       rSrU =r$ )	ConvNextV2Modeli  c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        R                  " UR                  S   UR                  S9U l        U R                  5         g )NrQ   r   )r+   r,   r|   ro   r   r   encoderr   r   rw   layer_norm_epsrz   	post_initr{   s     r#   r,   ConvNextV2Model.__init__  s^     .v6(0 f&9&9"&=6CXCXY 	r%   r~   r   r   c                    Uc  U R                   R                  nUc  [        S5      eU R                  U5      nU R	                  X2S9nUR
                  nU R                  UR                  SS/5      5      n[        UUUR                  S9$ )Nz You have to specify pixel_valuesr   rQ   )r   pooler_outputr0   )
r|   r   r   r   r   r   rz   rU   r	   r0   )r-   r~   r   embedding_outputencoder_outputsr   pooled_outputs          r#   r2   ConvNextV2Model.forward+  s    
  '#';;#C#C ?@@??<8:>,, ;G ;
 ,== '8'='=r2h'GH7/')77
 	
r%   )r|   r   r   rz   NN)r8   r9   r:   r;   r,   r   r   r   r   rZ   r   r	   r2   r@   rA   rB   s   @r#   r   r     sP     gk
$U%6%67
V^_cVd
	1
  
r%   r   z
    ConvNextV2 Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )custom_introc            	          ^  \ rS rSrSrU 4S jr\\ S	S\\	R                     S\\	R                     S\4S jj5       5       rSrU =r$ )
 ConvNextV2ForImageClassificationiF  Fc                 B  > [         TU ]  U5        UR                  U l        [        U5      U l        UR                  S:  a4  [
        R                  " UR                  S   UR                  5      U l        O[
        R                  " 5       U l        U R                  5         g )Nr   rQ   )r+   r,   
num_labelsr   r   r   r   rw   
classifierr   r   r{   s     r#   r,   )ConvNextV2ForImageClassification.__init__P  su      ++)&1 q  ii(;(;B(?ARARSDO kkmDO 	r%   r~   labelsr   c                     U R                   " U40 UD6nUR                  nU R                  U5      nSnUb  U R                  X&U R                  S9n[        UUUR                  S9$ )ab  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
N)r   pooled_logitsr|   )losslogitsr0   )r   r   r   loss_functionr|   r
   r0   )r-   r~   r   rf   outputsr   r   r   s           r#   r2   (ConvNextV2ForImageClassification.forward_  su     =AOOL<c\b<c--/%%VRVR]R]%^D3!//
 	
r%   )r   r   r   r   )r8   r9   r:   r;   accepts_loss_kwargsr,   r   r   r   r   rZ   
LongTensorr
   r2   r@   rA   rB   s   @r#   r   r   F  s^       ei
$U%6%67
HPQVQaQaHb
	-
  
r%   r   zT
    ConvNeXT V2 backbone, to be used with frameworks like DETR and MaskFormer.
    c            	       v   ^  \ rS rSrSrU 4S jr\\ S	S\R                  S\
\   S\4S jj5       5       rSrU =r$ )
ConvNextV2Backboneiy  Fc                   > [         TU ]  U5        [         TU ]	  U5        [        U5      U l        [        U5      U l        UR                  S   /UR                  -   U l        0 n[        U R                  U R                  5       H  u  p4[        USS9X#'   M     [        R                  " U5      U l        U R!                  5         g )Nr   rc   rl   )r+   r,   _init_backbonero   r   r   r   rw   num_featureszip_out_featureschannelsr\   r   
ModuleDicthidden_states_normsr   )r-   r|   r  r   rv   r.   s        r#   r,   ConvNextV2Backbone.__init__  s     v&.v6(0#0034v7J7JJ !#&t'9'94==#IE)<\Wg)h& $J#%==1D#E  	r%   r~   r   r   c                 z   Uc  U R                   R                  nU R                  U5      nU R                  USS9nUR                  n/ n[        U R                  U5       H<  u  pxXpR                  ;   d  M  U R                  U   " U5      nUR                  U5        M>     [        [        U5      U(       a  US9$ SS9$ )a
  
Examples:

```python
>>> from transformers import AutoImageProcessor, AutoBackbone
>>> import torch
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> processor = AutoImageProcessor.from_pretrained("facebook/convnextv2-tiny-1k-224")
>>> model = AutoBackbone.from_pretrained("facebook/convnextv2-tiny-1k-224")

>>> inputs = processor(image, return_tensors="pt")
>>> outputs = model(**inputs)
```NTr   )feature_mapsr0   )r|   r   r   r   r0   r   stage_namesout_featuresr  r   r   tuple)	r-   r~   r   r   r   r0   r  r   hidden_states	            r#   r2   ConvNextV2Backbone.forward  s    2  '#';;#C#C ??<8<@LLIYptL<u--#&t'7'7#GE)))#77>|L##L1 $H
 |,+?-
 	
EI
 	
r%   )r   r   r  r   r*   )r8   r9   r:   r;   has_attentionsr,   r   r   r   r>   r   r   r   r2   r@   rA   rB   s   @r#   r   r   y  sT     N"  04'
ll'
 'tn'
 
	'
  '
r%   r   )r   r   r   r   )r   F),r<   typingr   r   r   activationsr   modeling_outputsr   r   r	   r
   modeling_utilsr   utilsr   r   utils.backbone_utilsr   utils.genericr   configuration_convnextv2r   
get_loggerr8   loggerr>   r=   r   r$   Moduler'   rD   r   r\   ro   r   r   r   r   r   r   r   __all__r   r%   r#   <module>r     st        !  . , 1 - 6 
		H	%U\\ e T V[VbVb *% %BII $",, 6299 0(bii (X!bii !J p		  pF % % %, &
/ &
 &
R )
'@ )
)
X =
2M =
=
@ ur%   