
    h                   J   S r SSKJr  SSKrSSKJr  SSKJs  Jr  SSK	J
r
  SSKJrJrJrJrJrJr  SSKJr  Sr " S	 S
\R*                  5      r " S S\R*                  5      r " S S\R*                  5      r " S S\R*                  5      r " S S\R*                  5      r " S S\R*                  5      r " S S\R*                  5      r " S S\R*                  5      r " S S\R*                  5      r " S S\R*                  5      r " S S\5      r  " S S \R*                  5      r! " S! S"\5      r" " S# S$\5      r# " S% S&\R*                  5      r$ " S' S(\R*                  5      r% " S) S*\R*                  5      r& " S+ S,\R*                  5      r' " S- S.\R*                  5      r( " S/ S0\R*                  5      r) " S1 S2\R*                  5      r* " S3 S4\R*                  5      r+ " S5 S6\R*                  5      r, " S7 S8\R*                  5      r- " S9 S:\%5      r. " S; S<\5      r/ " S= S>\R*                  5      r0 " S? S@\05      r1 " SA SB\R*                  5      r2 " SC SD\R*                  5      r3 " SE SF\R*                  5      r4 " SG SH\R*                  5      r5 " SI SJ\R*                  5      r6 " SK SL\R*                  5      r7 " SM SN\5      r8 " SO SP\5      r9 " SQ SR\R
                  R*                  5      r: " SS ST\R*                  5      r; " SU SV\5      r< " SW SX\R*                  5      r= " SY SZ\R*                  5      r> " S[ S\\R*                  5      r? " S] S^\R*                  5      r@ " S_ S`\5      rA " Sa Sb\R*                  5      rB " Sc Sd\R*                  5      rC " Se Sf\R*                  5      rD " Sg Sh\R*                  5      rE " Si Sj\R*                  5      rF " Sk Sl\R*                  5      rG " Sm Sn\R*                  5      rH " So Sp\R*                  5      rIg)qzBlock modules.    )annotationsN)fuse_conv_and_bn   )ConvDWConv	GhostConv	LightConvRepConvautopad)TransformerBlock)'DFLHGBlockHGStemSPPSPPFC1C2C3C2fC2fAttnImagePoolingAttnContrastiveHeadBNContrastiveHeadC3xC3TRC3GhostGhostBottleneck
BottleneckBottleneckCSPProtoRepC3ResNetLayerRepNCSPELAN4ELAN1ADownAConvSPPELANCBFuseCBLinearC3k2C2fPSAC2PSARepVGGDWCIBC2fCIB	AttentionPSASCDownTorchVisionc                  >   ^  \ rS rSrSrSSU 4S jjjrSS jrSrU =r$ )	r   :   z
Integral module of Distribution Focal Loss (DFL).

Proposed in Generalized Focal Loss https://ieeexplore.ieee.org/document/9792391
c                \  > [         TU ]  5         [        R                  " USSSS9R	                  S5      U l        [        R                  " U[        R                  S9n[        R                  " UR                  SUSS5      5      U R
                  R                  R                  SS& Xl        g)zx
Initialize a convolutional layer with a given number of input channels.

Args:
    c1 (int): Number of input channels.
r   Fbias)dtypeN)super__init__nnConv2drequires_grad_convtorcharangefloat	Parameterviewweightdatac1)selfrG   x	__class__s      V/home/james-whalen/.local/lib/python3.13/site-packages/ultralytics/nn/modules/block.pyr;   DFL.__init__A   s|     	IIb!QU3BB5I	LL5;;/#%<<q"a0C#D		a     c                    UR                   u  p#nU R                  UR                  USU R                  U5      R	                  SS5      R                  S5      5      R                  USU5      $ )zCApply the DFL module to input tensor and return transformed output.      r   )shaper?   rD   rG   	transposesoftmax)rH   rI   b_as        rK   forwardDFL.forwardN   s[    ''ayy1dggq1;;AqAII!LMRRSTVWYZ[[rM   )rG   r?   )   )rG   intrI   torch.Tensorreturnr\   	__name__
__module____qualname____firstlineno____doc__r;   rW   __static_attributes____classcell__rJ   s   @rK   r   r   :   s     \ \rM   r   c                  >   ^  \ rS rSrSrSSU 4S jjjrSS jrSrU =r$ )	r    U   zBUltralytics YOLO models mask Proto module for segmentation models.c           	        > [         TU ]  5         [        XSS9U l        [        R
                  " X"SSSSS9U l        [        X"SS9U l        [        X#5      U l        g)z
Initialize the Ultralytics YOLO models mask Proto module with specified number of protos and masks.

Args:
    c1 (int): Input channels.
    c_ (int): Intermediate channels.
    c2 (int): Output channels (number of protos).
   )krP   r   Tr7   N)	r:   r;   r   cv1r<   ConvTranspose2dupsamplecv2cv3)rH   rG   c_c2rJ   s       rK   r;   Proto.__init__X   sQ     	!$**21aF!$<rM   c           	     ~    U R                  U R                  U R                  U R                  U5      5      5      5      $ )zEPerform a forward pass through layers using an upsampled input image.)rp   ro   rn   rl   rH   rI   s     rK   rW   Proto.forwardg   s+    xxtxx{!;<==rM   )rl   ro   rp   rn   )       )rG   rZ   rq   rZ   rr   rZ   r[   r^   rf   s   @rK   r    r    U   s    L   > >rM   r    c                  :   ^  \ rS rSrSrSU 4S jjrSS jrSrU =r$ )r   l   z
StemBlock of PPHGNetV2 with 5 convolutions and one maxpool2d.

https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
c           	       > [         TU ]  5         [        XSS[        R                  " 5       S9U l        [        X"S-  SSS[        R                  " 5       S9U l        [        US-  USSS[        R                  " 5       S9U l        [        US-  USS[        R                  " 5       S9U l        [        X#SS[        R                  " 5       S9U l	        [        R                  " SSSSS9U l        g)	z
Initialize the StemBlock of PPHGNetV2.

Args:
    c1 (int): Input channels.
    cm (int): Middle channels.
    c2 (int): Output channels.
rj   rP   actr   r   T)kernel_sizestridepadding	ceil_modeN)r:   r;   r   r<   ReLUstem1stem2astem2bstem3stem4	MaxPool2dpool)rH   rG   cmrr   rJ   s       rK   r;   HGStem.__init__s   s     	"!QBGGI6
2Qw1aRWWY?27B1aRWWY?"q&"a	:
"!QBGGI6
LLQq!tT	rM   c                b   U R                  U5      n[        R                  " U/ SQ5      nU R                  U5      n[        R                  " U/ SQ5      nU R	                  U5      nU R                  U5      n[        R                  " X2/SS9nU R                  U5      nU R                  U5      nU$ )+Forward pass of a PPHGNetV2 backbone layer.)r   r   r   r   r   dim)
r   Fpadr   r   r   r@   catr   r   )rH   rI   x2x1s       rK   rW   HGStem.forward   s    JJqMEE!\"[[^UU2|$[[_YYq\IIrhA&JJqMJJqMrM   )r   r   r   r   r   r   )rG   rZ   r   rZ   rr   rZ   r[   r^   rf   s   @rK   r   r   l   s    U" rM   r   c                     ^  \ rS rSrSrSSSS\R                  " 5       4               S	U 4S jjjrS
S jrSr	U =r
$ )r      z
HG_Block of PPHGNetV2 with 2 convolutions and LightConv.

https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
rj      Fc	                <  >^^^^^	 [         T
U ]  5         U(       a  [        O[        m	[        R
                  " UU	UUU4S j[        U5       5       5      U l        [        TUT-  -   US-  SSTS9U l        [        US-  USSTS9U l	        U=(       a    TU:H  U l
        g)ap  
Initialize HGBlock with specified parameters.

Args:
    c1 (int): Input channels.
    cm (int): Middle channels.
    c2 (int): Output channels.
    k (int): Kernel size.
    n (int): Number of LightConv or Conv blocks.
    lightconv (bool): Whether to use LightConv.
    shortcut (bool): Whether to use shortcut connection.
    act (nn.Module): Activation function.
c              3  F   >#    U  H  nT" US :X  a  TOTTTTS9v   M     g7f)r   rk   r}   N ).0ir}   blockrG   r   rk   s     rK   	<genexpr>#HGBlock.__init__.<locals>.<genexpr>   s'     _V^QRu16Rr2LV^   !rP   r   r|   N)r:   r;   r	   r   r<   
ModuleListrangemscecadd)rH   rG   r   rr   rk   n	lightconvshortcutr}   r   rJ   s    `` `   `@rK   r;   HGBlock.__init__   s    0 	&	D__V[\]V^__rAF{B!GQs;rQwAqc2(brM   c                   ^ U/mTR                  U4S jU R                   5       5        U R                  U R                  [        R
                  " TS5      5      5      mU R                  (       a  TU-   $ T$ )r   c              3  8   >#    U  H  o" TS    5      v   M     g7fNr   r   r   ys     rK   r   "HGBlock.forward.<locals>.<genexpr>        *6a1R56   r   )extendr   r   r   r@   r   r   rH   rI   r   s     @rK   rW   HGBlock.forward   sV    C	*466**GGDGGEIIaO,-q1u'a'rM   )r   r   r   r   )rG   rZ   r   rZ   rr   rZ   rk   rZ   r   rZ   r   boolr   r   r}   	nn.Moduler[   )r_   r`   ra   rb   rc   r<   r   r;   rW   rd   re   rf   s   @rK   r   r      s     )) ) 	)
 ) ) ) ) ) )>( (rM   r   c                  >   ^  \ rS rSrSrSSU 4S jjjrSS jrSrU =r$ )	r      zDSpatial Pyramid Pooling (SPP) layer https://arxiv.org/abs/1406.4729.c                $  > [         TU ]  5         US-  n[        XSS5      U l        [        U[	        U5      S-   -  USS5      U l        [        R                  " U Vs/ s H  n[        R                  " USUS-  S9PM     sn5      U l	        gs  snf )z
Initialize the SPP layer with input/output channels and pooling kernel sizes.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    k (tuple): Kernel sizes for max pooling.
rP   r   r~   r   r   N)
r:   r;   r   rl   lenro   r<   r   r   r   )rH   rG   rr   rk   rq   rI   rJ   s         rK   r;   SPP.__init__   s     	1W1%c!fqj)2q!4_`a_`Z[1aSTf U_`abas   #Bc                    U R                  U5      nU R                  [        R                  " U/U R                   Vs/ s H
  o"" U5      PM     sn-   S5      5      $ s  snf )zBForward pass of the SPP layer, performing spatial pyramid pooling.r   )rl   ro   r@   r   r   )rH   rI   r   s      rK   rW   SPP.forward   sJ    HHQKxx		1#tvv(>v!1v(>">BCC(>s   Arl   ro   r   ))   	      )rG   rZ   rr   rZ   rk   tuple[int, ...]r[   r^   rf   s   @rK   r   r      s    Nc cD DrM   r   c                  >   ^  \ rS rSrSrSSU 4S jjjrSS jrSrU =r$ )	r      zGSpatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher.c                   > [         TU ]  5         US-  n[        XSS5      U l        [        US-  USS5      U l        [
        R                  " USUS-  S9U l        g)z
Initialize the SPPF layer with given input/output channels and kernel size.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    k (int): Kernel size.

Notes:
    This module is equivalent to SPP(k=(5, 9, 13)).
rP   r   rO   r   N)r:   r;   r   rl   ro   r<   r   r   )rH   rG   rr   rk   rq   rJ   s        rK   r;   SPPF.__init__   sW     	1W1%QAq)!AqAvFrM   c                   ^ ^ T R                  U5      /mTR                  U U4S j[        S5       5       5        T R                  [        R
                  " TS5      5      $ )zRApply sequential pooling operations to input and return concatenated feature maps.c              3  L   >#    U  H  nTR                  TS    5      v   M     g7fr   r   )r   rU   rH   r   s     rK   r   SPPF.forward.<locals>.<genexpr>   s     11"s   !$rj   r   )rl   r   r   ro   r@   r   r   s   ` @rK   rW   SPPF.forward   sA    XXa[M	1a11xx		!Q((rM   r   r   )rG   rZ   rr   rZ   rk   rZ   r[   r^   rf   s   @rK   r   r      s    QG G$) )rM   r   c                  >   ^  \ rS rSrSrSSU 4S jjjrSS jrSrU =r$ )	r      z"CSP Bottleneck with 1 convolution.c                   >^ [         TU ]  5         [        UTSS5      U l        [        R
                  " U4S j[        U5       5       6 U l        g)z
Initialize the CSP Bottleneck with 1 convolution.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    n (int): Number of convolutions.
r   c              3  >   >#    U  H  n[        TTS 5      v   M     g7f)rj   N)r   )r   rU   rr   s     rK   r   C1.__init__.<locals>.<genexpr>  s      C(Qb"a(   N)r:   r;   r   rl   r<   
Sequentialr   r   )rH   rG   rr   r   rJ   s     ` rK   r;   C1.__init__   s<     	B1% C%( CDrM   c                L    U R                  U5      nU R                  U5      U-   $ )z:Apply convolution and residual connection to input tensor.rl   r   r   s      rK   rW   
C1.forward  s!    HHQKvvay1}rM   r   r   )rG   rZ   rr   rZ   r   rZ   r[   r^   rf   s   @rK   r   r      s    ,E E rM   r   c                  >   ^  \ rS rSrSrSSU 4S jjjrSS jrSrU =r$ )	r   i
  z#CSP Bottleneck with 2 convolutions.c                $  >^ ^^ [         TT ]  5         [        X&-  5      T l        [	        UST R                  -  SS5      T l        [	        ST R                  -  US5      T l        [        R                  " UU U4S j[        U5       5       6 T l
        g)a   
Initialize a CSP Bottleneck with 2 convolutions.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    n (int): Number of Bottleneck blocks.
    shortcut (bool): Whether to use shortcut connections.
    g (int): Groups for convolutions.
    e (float): Expansion ratio.
rP   r   c           
   3  h   >#    U  H'  n[        TR                  TR                  TTS SS9v   M)     g7f)rj   rj   r         ?rk   eNr   cr   rU   grH   r   s     rK   r   C2.__init__.<locals>.<genexpr>  s-      vmuhiDFFDFFHaK[_b!cmu   /2Nr:   r;   rZ   r   r   rl   ro   r<   r   r   r   rH   rG   rr   r   r   r   r   rJ   s   `   `` rK   r;   C2.__init__  sl     	RVAJ1-DFF
B* vmrstmu vwrM   c                    U R                  U5      R                  SS5      u  p#U R                  [        R                  " U R                  U5      U4S5      5      $ )z<Forward pass through the CSP bottleneck with 2 convolutions.rP   r   )rl   chunkro   r@   r   r   rH   rI   rV   rT   s       rK   rW   
C2.forward   sD    xx{  A&xx		466!9a.!455rM   r   rl   ro   r   r   Tr         ?rG   rZ   rr   rZ   r   rZ   r   r   r   rZ   r   rB   r[   r^   rf   s   @rK   r   r   
  s    -x x&6 6rM   r   c                  H   ^  \ rS rSrSrSSU 4S jjjrS	S jrS	S jrSrU =r	$ )
r   i&  <Faster Implementation of CSP Bottleneck with 2 convolutions.c                0  >^ ^^ [         TT ]  5         [        X&-  5      T l        [	        UST R                  -  SS5      T l        [	        SU-   T R                  -  US5      T l        [        R                  " UU U4S j[        U5       5       5      T l
        g)a   
Initialize a CSP bottleneck with 2 convolutions.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    n (int): Number of Bottleneck blocks.
    shortcut (bool): Whether to use shortcut connections.
    g (int): Groups for convolutions.
    e (float): Expansion ratio.
rP   r   c           
   3  h   >#    U  H'  n[        TR                  TR                  TTS SS9v   M)     g7fr   r   r   s     rK   r   C2f.__init__.<locals>.<genexpr>9  -     tksfgz$&&$&&(AIY]`aksr   N)r:   r;   rZ   r   r   rl   ro   r<   r   r   r   r   s   `   `` rK   r;   C2f.__init__)  sq     	RVAJ1-Q$&&("a0tkpqrksttrM   c                   ^ [        U R                  U5      R                  SS5      5      mTR                  U4S jU R                   5       5        U R                  [        R                  " TS5      5      $ )zForward pass through C2f layer.rP   r   c              3  8   >#    U  H  o" TS    5      v   M     g7fr   r   r   s     rK   r   C2f.forward.<locals>.<genexpr>>  r   r   )listrl   r   r   r   ro   r@   r   r   s     @rK   rW   C2f.forward;  sQ    !""1a()	*466**xx		!Q((rM   c                  ^ U R                  U5      R                  U R                  U R                  4S5      mTS   TS   /mTR                  U4S jU R                   5       5        U R                  [        R                  " TS5      5      $ ).Forward pass using split() instead of chunk().r   r   c              3  8   >#    U  H  o" TS    5      v   M     g7fr   r   r   s     rK   r   $C2f.forward_split.<locals>.<genexpr>E  r   r   )rl   splitr   r   r   ro   r@   r   r   s     @rK   forward_splitC2f.forward_splitA  sj    HHQKtvvtvv.2qT1Q4L	*466**xx		!Q((rM   r   r   Fr   r   r   r[   
r_   r`   ra   rb   rc   r;   rW   r  rd   re   rf   s   @rK   r   r   &  s!    Fu u$)) )rM   r   c                  >   ^  \ rS rSrSrSSU 4S jjjrSS jrSrU =r$ )	r   iI  z#CSP Bottleneck with 3 convolutions.c                  >^^^ [         TU ]  5         [        X&-  5      m[        UTSS5      U l        [        UTSS5      U l        [        ST-  US5      U l        [        R                  " UUU4S j[        U5       5       6 U l
        g)a"  
Initialize the CSP Bottleneck with 3 convolutions.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    n (int): Number of Bottleneck blocks.
    shortcut (bool): Whether to use shortcut connections.
    g (int): Groups for convolutions.
    e (float): Expansion ratio.
r   rP   c           
   3  @   >#    U  H  n[        TTTTS SS9v   M     g7f)))r   r   r   r   r   Nr   r   rU   rq   r   r   s     rK   r   C3.__init__.<locals>.<genexpr>]  s%      nem`aBHaCSWZ![em   N)r:   r;   rZ   r   rl   ro   rp   r<   r   r   r   	rH   rG   rr   r   r   r   r   rq   rJ   s	       `` @rK   r;   C3.__init__L  sp     	[B1%B1%BA& nejklem norM   c           	         U R                  [        R                  " U R                  U R	                  U5      5      U R                  U5      4S5      5      $ )z<Forward pass through the CSP bottleneck with 3 convolutions.r   )rp   r@   r   r   rl   ro   ru   s     rK   rW   
C3.forward_  s:    xx		466$((1+#6"DaHIIrM   rl   ro   rp   r   r   r   r[   r^   rf   s   @rK   r   r   I  s    -p p&J JrM   r   c                  4   ^  \ rS rSrSrSSU 4S jjjrSrU =r$ )r   id  z"C3 module with cross-convolutions.c                   >^ ^^ [         TT ]  XUTTU5        [        X&-  5      T l        [        R
                  " UU U4S j[        U5       5       6 T l        g)a  
Initialize C3 module with cross-convolutions.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    n (int): Number of Bottleneck blocks.
    shortcut (bool): Whether to use shortcut connections.
    g (int): Groups for convolutions.
    e (float): Expansion ratio.
c           
   3  h   >#    U  H'  n[        TR                  TR                  TTS SS9v   M)     g7f)))r   rj   rj   r   r   r   N)r   rq   r   s     rK   r   C3x.__init__.<locals>.<genexpr>u  s-      vmuhiDGGTWWhM]ab!cmur   N)r:   r;   rZ   rq   r<   r   r   r   r   s   `   `` rK   r;   C3x.__init__g  sD     	Ha3bf+ vmrstmu vwrM   )rq   r   r   r   r_   r`   ra   rb   rc   r;   rd   re   rf   s   @rK   r   r   d  s    ,x xrM   r   c                  >   ^  \ rS rSrSrSSU 4S jjjrSS jrSrU =r$ )	r!   ix  zRep C3.c           	     h  > [         TU ]  5         [        X$-  5      n[        XSS5      U l        [        XSS5      U l        [        R                  " [        U5       Vs/ s H  n[        XU5      PM     sn6 U l
        XR:w  a  [        XRSS5      U l        g[        R                  " 5       U l        gs  snf )z
Initialize CSP Bottleneck with a single convolution.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    n (int): Number of RepConv blocks.
    e (float): Expansion ratio.
r   N)r:   r;   rZ   r   rl   ro   r<   r   r   r
   r   Identityrp   )rH   rG   rr   r   r   rq   rU   rJ   s          rK   r;   RepC3.__init__{  s     	[1%1%%( C(Q( CD)+41%r{{} !Ds   B/c                    U R                  U R                  U R                  U5      5      U R                  U5      -   5      $ )zForward pass of RepC3 module.)rp   r   rl   ro   ru   s     rK   rW   RepC3.forward  s/    xxtxx{+dhhqk9::rM   r  )rj   r   rG   rZ   rr   rZ   r   rZ   r   rB   r[   r^   rf   s   @rK   r!   r!   x  s    E E"; ;rM   r!   c                  4   ^  \ rS rSrSrSSU 4S jjjrSrU =r$ )r   i  z"C3 module with TransformerBlock().c                f   > [         TU ]  XX4XV5        [        X&-  5      n[        XwSU5      U l        g)a  
Initialize C3 module with TransformerBlock.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    n (int): Number of Transformer blocks.
    shortcut (bool): Whether to use shortcut connections.
    g (int): Groups for convolutions.
    e (float): Expansion ratio.
rO   N)r:   r;   rZ   r   r   r  s	           rK   r;   C3TR.__init__  s1     	a3[!"!Q/rM   r   r   r   r  rf   s   @rK   r   r     s    ,0 0rM   r   c                  4   ^  \ rS rSrSrSSU 4S jjjrSrU =r$ )r   i  z!C3 module with GhostBottleneck().c                   >^ [         TU ]  XX4XV5        [        X&-  5      m[        R                  " U4S j[        U5       5       6 U l        g)a   
Initialize C3 module with GhostBottleneck.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    n (int): Number of Ghost bottleneck blocks.
    shortcut (bool): Whether to use shortcut connections.
    g (int): Groups for convolutions.
    e (float): Expansion ratio.
c              3  <   >#    U  H  n[        TT5      v   M     g 7f)N)r   )r   rU   rq   s     rK   r   #C3Ghost.__init__.<locals>.<genexpr>  s      K(QR!8!8(s   Nr:   r;   rZ   r<   r   r   r   r  s	          @rK   r;   C3Ghost.__init__  s;     	a3[ K%( KLrM   r   r   r   r  rf   s   @rK   r   r     s    +M MrM   r   c                  >   ^  \ rS rSrSrSSU 4S jjjrSS jrSrU =r$ )	r   i  zGGhost Bottleneck https://github.com/huawei-noah/Efficient-AI-Backbones.c                x  > [         TU ]  5         US-  n[        R                  " [	        XSS5      US:X  a  [        XUX4SS9O[        R                  " 5       [	        XRSSSS95      U l        US:X  a0  [        R                  " [        XX4SS9[        XSSSS95      U l	        g[        R                  " 5       U l	        g)z
Initialize Ghost Bottleneck module.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    k (int): Kernel size.
    s (int): Stride.
rP   r   Fr|   N)
r:   r;   r<   r   r   r   r!  r?   r   r   )rH   rG   rr   rk   srq   rJ   s         rK   r;   GhostBottleneck.__init__  s     	1WMMba#/0AvF21U+2;;=ba.
	 ^_bc]cBMM&5941RW;XY 	ikititiv 	rM   c                H    U R                  U5      U R                  U5      -   $ )z8Apply skip connection and concatenation to input tensor.r?   r   ru   s     rK   rW   GhostBottleneck.forward  s    yy|dmmA...rM   r4  r  rG   rZ   rr   rZ   rk   rZ   r1  rZ   r[   r^   rf   s   @rK   r   r     s    Q
 
*/ /rM   r   c                  V   ^  \ rS rSrSr S           SU 4S jjjrSS jrSrU =r$ )	r   i  zStandard bottleneck.c                   > [         TU ]  5         [        X&-  5      n[        XUS   S5      U l        [        XrUS   SUS9U l        U=(       a    X:H  U l        g)a  
Initialize a standard bottleneck module.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    shortcut (bool): Whether to use shortcut connection.
    g (int): Groups for convolutions.
    k (tuple): Kernel sizes for convolutions.
    e (float): Expansion ratio.
r   r   r   N)r:   r;   rZ   r   rl   ro   r   	rH   rG   rr   r   r   rk   r   rq   rJ   s	           rK   r;   Bottleneck.__init__  sS     	[!a(!a1-(rM   c                    U R                   (       a"  XR                  U R                  U5      5      -   $ U R                  U R                  U5      5      $ )z3Apply bottleneck with optional shortcut connection.)r   ro   rl   ru   s     rK   rW   Bottleneck.forward  s8    ,0HHq88DHHQK((O$((488A;:OOrM   )r   rl   ro   Tr   r   r   rG   rZ   rr   rZ   r   r   r   rZ   rk   ztuple[int, int]r   rB   r[   r^   rf   s   @rK   r   r     sS     lo)))*.):=)FU)ch) )(P PrM   r   c                  >   ^  \ rS rSrSrSSU 4S jjjrSS jrSrU =r$ )	r   i  zGCSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks.c                  >^^^ [         TU ]  5         [        X&-  5      m[        UTSS5      U l        [
        R                  " UTSSSS9U l        [
        R                  " TTSSSS9U l        [        ST-  USS5      U l	        [
        R                  " ST-  5      U l        [
        R                  " 5       U l        [
        R                  " UUU4S j[        U5       5       6 U l        g)a
  
Initialize CSP Bottleneck.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    n (int): Number of Bottleneck blocks.
    shortcut (bool): Whether to use shortcut connections.
    g (int): Groups for convolutions.
    e (float): Expansion ratio.
r   Fr7   rP   c           	   3  >   >#    U  H  n[        TTTTS S9v   M     g7fr   r   Nr  r  s     rK   r   )BottleneckCSP.__init__.<locals>.<genexpr>	  s       ZQYABHa3!GQYr   N)r:   r;   rZ   r   rl   r<   r=   ro   rp   cv4BatchNorm2dbnSiLUr}   r   r   r   r  s	       `` @rK   r;   BottleneckCSP.__init__  s     	[B1%99RQ699RQ6BAq)..R(779 ZQVWXQY Z[rM   c           
        U R                  U R                  U R                  U5      5      5      nU R                  U5      nU R	                  U R                  U R                  [        R                  " X#4S5      5      5      5      $ )z)Apply CSP bottleneck with 3 convolutions.r   )	rp   r   rl   ro   rF  r}   rH  r@   r   )rH   rI   y1y2s       rK   rW   BottleneckCSP.forward  s\    XXdffTXXa[)*XXa[xxB8Q)?!@ABBrM   )r}   rH  rl   ro   rp   rF  r   r   r   r[   r^   rf   s   @rK   r   r     s    Q\ \,C CrM   r   c                  >   ^  \ rS rSrSrSSU 4S jjjrSS jrSrU =r$ )	ResNetBlocki  z.ResNet block with standard convolution layers.c           
     &  > [         TU ]  5         XB-  n[        XSSSS9U l        [        X"SUSSS9U l        [        X%SSS9U l        US:w  d  X:w  a&  [        R                  " [        XSUSS95      U l	        g[        R                  " 5       U l	        g)	z
Initialize ResNet block.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    s (int): Stride.
    e (int): Expansion ratio.
r   Trk   r1  r}   rj   rk   r1  pr}   Fr   N)
r:   r;   r   rl   ro   rp   r<   r   r!  r   )rH   rG   rr   r1  r   c3rJ   s         rK   r;   ResNetBlock.__init__  s     	V!qd3!qA48!/LMQRFVXV^d2Q!&GHdfdododqrM   c           	         [         R                  " U R                  U R                  U R	                  U5      5      5      U R                  U5      -   5      $ )z&Forward pass through the ResNet block.)r   relurp   ro   rl   r   ru   s     rK   rW   ResNetBlock.forward&  s9    vvdhhtxx45a8HHIIrM   )rl   ro   rp   r   )r   rO   )rG   rZ   rr   rZ   r1  rZ   r   rZ   r[   r^   rf   s   @rK   rP  rP    s    8r r"J JrM   rP  c                  >   ^  \ rS rSrSrSSU 4S jjjrSS jrSrU =r$ )	r"   i+  z)ResNet layer with multiple ResNet blocks.c                  > [         T	U ]  5         X@l        U R                  (       a<  [        R                  " [        XSSSSS9[        R                  " SSSS95      U l        g	[        XX6S9/nUR                  [        US-
  5       Vs/ s H  n[        Xb-  USUS9PM     sn5        [        R                  " U6 U l        g	s  snf )
z
Initialize ResNet layer.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    s (int): Stride.
    is_first (bool): Whether this is the first layer.
    n (int): Number of ResNet blocks.
    e (int): Expansion ratio.
   rP   rj   TrS  r   r   rD  N)r:   r;   is_firstr<   r   r   r   layerrP  r   r   )
rH   rG   rr   r1  r]  r   r   blocksrU   rJ   s
            rK   r;   ResNetLayer.__init__.  s     	 ==RqA5r||PQZ[ef7gDJ ""!12FMME!a%LQLq;qvr1:LQR/DJ Rs   	Cc                $    U R                  U5      $ )z&Forward pass through the ResNet layer.)r^  ru   s     rK   rW   ResNetLayer.forwardF  s    zz!}rM   )r]  r^  )r   Fr   rO   )rG   rZ   rr   rZ   r1  rZ   r]  r   r   rZ   r   rZ   r[   r^   rf   s   @rK   r"   r"   +  s    30 00 rM   r"   c                  >   ^  \ rS rSrSrSSU 4S jjjrSS jrSrU =r$ )	MaxSigmoidAttnBlockiK  zMax Sigmoid attention block.c                  > [         TU ]  5         X0l        X#-  U l        X:w  a  [	        XSSS9OSU l        [        R                  " XT5      U l        [        R                  " [        R                  " U5      5      U l        [	        XSSSS9U l        U(       a3  [        R                  " [        R                  " SUSS5      5      U l        gSU l        g)a   
Initialize MaxSigmoidAttnBlock.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    nh (int): Number of heads.
    ec (int): Embedding channels.
    gc (int): Guide channels.
    scale (bool): Whether to use learnable scale parameter.
r   Fr   Nrj   rR  r   )r:   r;   nhhcr   r   r<   LinearglrC   r@   zerosr8   	proj_convonesscale)rH   rG   rr   rf  r   gcrm  rJ   s          rK   r;   MaxSigmoidAttnBlock.__init__N  s     	(24($r.))B#LLR1	bQE:>CR\\%**QAq"9:

rM   c                   UR                   u  p4pVU R                  U5      nUR                  X2R                   S   U R                  U R                  5      nU R
                  b  U R                  U5      OUnUR                  X0R                  U R                  XV5      n[        R                  " SXr5      nUR                  SS9S   nXR                  S-  -  nXR                  SSS2SS4   -   nUR                  5       U R                  -  nU R                  U5      nUR                  X0R                  SXV5      nXR                  S5      -  nUR                  USXV5      $ )	z
Forward pass of MaxSigmoidAttnBlock.

Args:
    x (torch.Tensor): Input tensor.
    guide (torch.Tensor): Guide tensor.

Returns:
    (torch.Tensor): Output tensor after attention.
r   Nzbmchw,bnmc->bmhwnr   r   r   r   rP   )rQ   ri  rD   rf  rg  r   r@   einsummaxr8   sigmoidrm  rk  	unsqueeze)	rH   rI   guidebsrU   hwembedaws	            rK   rW   MaxSigmoidAttnBlock.forwardc  s    ggq

2{{1~tww@"gg1
q

2ww6\\-u<VVV^A77C< ))D!T4/00ZZ\DJJ&NN1FF2wwA)Qvvb"a##rM   )r8   r   ri  rg  rf  rk  rm  )r         F)rG   rZ   rr   rZ   rf  rZ   r   rZ   rn  rZ   rm  r   rI   r\   ru  r\   r]   r\   r^   rf   s   @rK   rd  rd  K  s    &M M*$ $rM   rd  c                  x   ^  \ rS rSrSr       S                 SU 4S jjjrS	S jrS	S jrSrU =r	$ )
r   i  z*C2f module with an additional attn module.c
                z  >^ ^^ [         T
T ]  5         [        X)-  5      T l        [	        UST R                  -  SS5      T l        [	        SU-   T R                  -  US5      T l        [        R                  " UU U4S j[        U5       5       5      T l
        [        T R                  T R                  XdUS9T l        g)a  
Initialize C2f module with attention mechanism.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    n (int): Number of Bottleneck blocks.
    ec (int): Embedding channels for attention.
    nh (int): Number of heads for attention.
    gc (int): Guide channels for attention.
    shortcut (bool): Whether to use shortcut connections.
    g (int): Groups for convolutions.
    e (float): Expansion ratio.
rP   r   rj   c           
   3  h   >#    U  H'  n[        TR                  TR                  TTS SS9v   M)     g7fr   r   r   s     rK   r   #C2fAttn.__init__.<locals>.<genexpr>  r   r   )rn  r   rf  N)r:   r;   rZ   r   r   rl   ro   r<   r   r   r   rd  attn)rH   rG   rr   r   r   rf  rn  r   r   r   rJ   s   `      `` rK   r;   C2fAttn.__init__  s    4 	RVAJ1-Q$&&("a0tkpqrkstt'2L	rM   c                6  ^ [        U R                  U5      R                  SS5      5      mTR                  U4S jU R                   5       5        TR                  U R                  TS   U5      5        U R                  [        R                  " TS5      5      $ )z
Forward pass through C2f layer with attention.

Args:
    x (torch.Tensor): Input tensor.
    guide (torch.Tensor): Guide tensor for attention.

Returns:
    (torch.Tensor): Output tensor after processing.
rP   r   c              3  8   >#    U  H  o" TS    5      v   M     g7fr   r   r   s     rK   r   "C2fAttn.forward.<locals>.<genexpr>  r   r   r   )
r  rl   r   r   r   appendr  ro   r@   r   rH   rI   ru  r   s      @rK   rW   C2fAttn.forward  sn     !""1a()	*466**	1R5%()xx		!Q((rM   c                b  ^ [        U R                  U5      R                  U R                  U R                  4S5      5      mTR	                  U4S jU R
                   5       5        TR                  U R                  TS   U5      5        U R                  [        R                  " TS5      5      $ )z
Forward pass using split() instead of chunk().

Args:
    x (torch.Tensor): Input tensor.
    guide (torch.Tensor): Guide tensor for attention.

Returns:
    (torch.Tensor): Output tensor after processing.
r   c              3  8   >#    U  H  o" TS    5      v   M     g7fr   r   r   s     rK   r   (C2fAttn.forward_split.<locals>.<genexpr>  r   r   r   )r  rl   r  r   r   r   r  r  ro   r@   r   r  s      @rK   r  C2fAttn.forward_split  s{     !""DFFDFF#3Q78	*466**	1R5%()xx		!Q((rM   )r  r   rl   ro   r   )r   r|  r   r}  Fr   r   )rG   rZ   rr   rZ   r   rZ   r   rZ   rf  rZ   rn  rZ   r   r   r   rZ   r   rB   r~  r  rf   s   @rK   r   r     s    4 MM M 	M
 M M M M M M MB) ) )rM   r   c                  V   ^  \ rS rSrSr S           SU 4S jjjrSS jrSrU =r$ )	r   i  zKImagePoolingAttn: Enhance the text embeddings with image-aware information.c                  > [         T
U ]  5         [        U5      n[        R                  " [        R
                  " U5      [        R                  " X15      5      U l        [        R                  " [        R
                  " U5      [        R                  " X5      5      U l        [        R                  " [        R
                  " U5      [        R                  " X5      5      U l	        [        R                  " X5      U l
        U(       a*  [        R                  " [        R                  " S/5      SS9OSU l        [        R                  " U Vs/ s H  n[        R                   " XSS9PM     sn5      U l        [        R                  " [%        U5       V	s/ s H  n	[        R&                  " XU45      PM     sn	5      U l        Xl        X@l        Xpl        X-  U l        XPl        gs  snf s  sn	f )aC  
Initialize ImagePoolingAttn module.

Args:
    ec (int): Embedding channels.
    ch (tuple): Channel dimensions for feature maps.
    ct (int): Channel dimension for text embeddings.
    nh (int): Number of attention heads.
    k (int): Kernel size for pooling.
    scale (bool): Whether to use learnable scale parameter.
g        Trequires_gradr   r   )r~   N)r:   r;   r   r<   r   	LayerNormrh  querykeyvalueprojrC   r@   tensorrm  r   r=   projectionsr   AdaptiveMaxPool2dim_poolsr   rf  nfrg  rk   )rH   r   chctrf  rk   rm  r  in_channelsrU   rJ   s             rK   r;   ImagePoolingAttn.__init__  s>    	W]]2<<#3RYYr5FG
==b!1299R3DE]]2<<#3RYYr5FG
IIb%	NSR\\%,,u"5TJY\
==gi)jgiXc"))KQR*Sgi)jkUSUY&WYr';';QF'CY&WX( *k&Ws   G%!G*c           
        US   R                   S   n[        U5      U R                  :X  d   eU R                  S-  n[	        XR
                  U R                  5       VVVs/ s H$  u  pof" U" U5      5      R                  USU5      PM&     snnnn[        R                  " USS9R                  SS5      nU R                  U5      nU R                  U5      nU R                  U5      n	UR                  USU R                  U R                   5      nUR                  USU R                  U R                   5      nU	R                  USU R                  U R                   5      n	[        R"                  " SXx5      n
XR                   S-  -  n
[$        R&                  " U
SS9n
[        R"                  " SX5      nU R)                  UR                  USU R*                  5      5      nXR,                  -  U-   $ s  snnnf )	z
Forward pass of ImagePoolingAttn.

Args:
    x (list[torch.Tensor]): List of input feature maps.
    text (torch.Tensor): Text embeddings.

Returns:
    (torch.Tensor): Enhanced text embeddings.
r   rP   r   r   r   zbnmc,bkmc->bmnkr   zbmnk,bkmc->bnmc)rQ   r   r  rk   zipr  r  rD   r@   r   rR   r  r  r  reshaperf  rg  rq  r   rS   r  r   rm  )rH   rI   textrv  num_patchesr  r   qrk   vrz  s              rK   rW   ImagePoolingAttn.forward  s    qTZZ]1v   ffaiLOPQScSceiererLstLs!4T$q']B4LstIIaR **1a0JJtHHQKJJqM IIb"dggtww/IIb"dggtww/IIb"dggtww/\\+Q277C< YYrr"LL*B2IIaiiB01::~$$# us   !+G2)r   rg  r  rk   r  r  rf  r  r  r  rm  r  )rw   r   r}     rj   F)r   rZ   r  r   r  rZ   rf  rZ   rk   rZ   rm  r   )rI   list[torch.Tensor]r  r\   r]   r\   r^   rf   s   @rK   r   r     sQ    U ns!0;>JMVYfj <% %rM   r   c                  6   ^  \ rS rSrSrU 4S jrSS jrSrU =r$ )r   i	  zZImplements contrastive learning head for region-text similarity in vision-language models.c                *  > [         TU ]  5         [        R                  " [        R
                  " S/5      5      U l        [        R                  " [        R                  " / 5      [        R
                  " S5      R                  5       -  5      U l	        g)zBInitialize ContrastiveHead with region-text similarity parameters.      $g$I$I,@N)
r:   r;   r<   rC   r@   r  r8   rl  loglogit_scale)rH   rJ   s    rK   r;   ContrastiveHead.__init__  sY    LLug!67	<<

2h9O9S9S9U(UVrM   c                    [         R                  " USSS9n[         R                  " USSS9n[        R                  " SX5      nXR                  R                  5       -  U R                  -   $ )z
Forward function of contrastive learning.

Args:
    x (torch.Tensor): Image features.
    w (torch.Tensor): Text features.

Returns:
    (torch.Tensor): Similarity scores.
r   rP   r   rT  r   bchw,bkc->bkhw)r   	normalizer@   rq  r  expr8   rH   rI   rx  s      rK   rW   ContrastiveHead.forward  sZ     KKqA&KKrQ'LL)10##''))DII55rM   )r8   r  rI   r\   rx  r\   r]   r\   r^   rf   s   @rK   r   r   	  s    dW6 6rM   r   c                  J   ^  \ rS rSrSrSU 4S jjrS rS	S jrS	S jrSr	U =r
$ )
r   i$  z
Batch Norm Contrastive Head using batch norm instead of l2-normalization.

Args:
    embed_dims (int): Embed dimensions of text and image features.
c                  > [         TU ]  5         [        R                  " U5      U l        [        R
                  " [        R                  " S/5      5      U l        [        R
                  " S[        R                  " / 5      -  5      U l
        g)z_
Initialize BNContrastiveHead.

Args:
    embed_dims (int): Embedding dimensions for features.
r  g      N)r:   r;   r<   rG  normrC   r@   r  r8   rl  r  )rH   
embed_dimsrJ   s     rK   r;   BNContrastiveHead.__init__,  sY     	NN:.	LLug!67	<<uzz"~(=>rM   c                2    U ? U ?U ?U R                  U l        g)zCFuse the batch normalization layer in the BNContrastiveHead module.N)r  r8   r  forward_fuserW   )rH   s    rK   fuseBNContrastiveHead.fuse:  s    II((rM   c                    U$ )zPasses input out unchanged.r   r  s      rK   r  BNContrastiveHead.forward_fuseA  s    rM   c                    U R                  U5      n[        R                  " USSS9n[        R                  " SX5      nXR
                  R                  5       -  U R                  -   $ )z
Forward function of contrastive learning with batch normalization.

Args:
    x (torch.Tensor): Image features.
    w (torch.Tensor): Text features.

Returns:
    (torch.Tensor): Similarity scores.
r   rP   r  r  )r  r   r  r@   rq  r  r  r8   r  s      rK   rW   BNContrastiveHead.forwardE  sU     IIaLKKrQ'LL)10##''))DII55rM   )r8   rW   r  r  )r  rZ   r  )r_   r`   ra   rb   rc   r;   r  r  rW   rd   re   rf   s   @rK   r   r   $  s!    ?)6 6rM   r   c                  L   ^  \ rS rSrSr S           SU 4S jjjrSrU =r$ )RepBottleneckiW  zRep bottleneck.c                l   > [         TU ]  XX4XV5        [        X&-  5      n[        XUS   S5      U l        g)a  
Initialize RepBottleneck.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    shortcut (bool): Whether to use shortcut connection.
    g (int): Groups for convolutions.
    k (tuple): Kernel sizes for convolutions.
    e (float): Expansion ratio.
r   r   N)r:   r;   rZ   r
   rl   r:  s	           rK   r;   RepBottleneck.__init__Z  s5     	a3[21Q4+rM   rl   r>  r?  r  rf   s   @rK   r  r  W  sG     lo,,,*.,:=,FU,ch, ,rM   r  c                  4   ^  \ rS rSrSrSSU 4S jjjrSrU =r$ )RepCSPim  zXRepeatable Cross Stage Partial Network (RepCSP) module for efficient feature extraction.c                   >^^^ [         TU ]  XUTTU5        [        X&-  5      m[        R                  " UUU4S j[        U5       5       6 U l        g)a  
Initialize RepCSP layer.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    n (int): Number of RepBottleneck blocks.
    shortcut (bool): Whether to use shortcut connections.
    g (int): Groups for convolutions.
    e (float): Expansion ratio.
c           	   3  >   >#    U  H  n[        TTTTS S9v   M     g7frC  )r  r  s     rK   r   "RepCSP.__init__.<locals>.<genexpr>~  s       ]T\qr2xc!JT\r   Nr-  r  s	       `` @rK   r;   RepCSP.__init__p  sB     	Ha3[ ]TYZ[T\ ]^rM   r   r   r   r  rf   s   @rK   r  r  m  s    b_ _rM   r  c                  H   ^  \ rS rSrSrSSU 4S jjjrS	S jrS	S jrSrU =r	$ )
r#   i  z	CSP-ELAN.c           	     V  > [         TU ]  5         US-  U l        [        XSS5      U l        [
        R                  " [        US-  XE5      [        XDSS5      5      U l        [
        R                  " [        XDU5      [        XDSS5      5      U l	        [        USU-  -   USS5      U l
        g)z
Initialize CSP-ELAN layer.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    c3 (int): Intermediate channels.
    c4 (int): Intermediate channels for RepCSP.
    n (int): Number of RepCSP blocks.
rP   r   rj   N)r:   r;   r   r   rl   r<   r   r  ro   rp   rF  )rH   rG   rr   rU  c4r   rJ   s         rK   r;   RepNCSPELAN4.__init__  s     	q1%==a!7ba9KL==!2DA4FGa"fr1a0rM   c                  ^ [        U R                  U5      R                  SS5      5      mTR                  U4S jU R                  U R
                  4 5       5        U R                  [        R                  " TS5      5      $ )z(Forward pass through RepNCSPELAN4 layer.rP   r   c              3  8   >#    U  H  o" TS    5      v   M     g7fr   r   r   s     rK   r   'RepNCSPELAN4.forward.<locals>.<genexpr>  s     :%9!AbE((%9r   )	r  rl   r   r   ro   rp   rF  r@   r   r   s     @rK   rW   RepNCSPELAN4.forward  sZ    !""1a()	:dhh%9::xx		!Q((rM   c                2  ^ [        U R                  U5      R                  U R                  U R                  4S5      5      mTR	                  U4S jU R
                  U R                  4 5       5        U R                  [        R                  " TS5      5      $ )r  r   c              3  8   >#    U  H  o" TS    5      v   M     g7fr   r   r   s     rK   r   -RepNCSPELAN4.forward_split.<locals>.<genexpr>  s     8#7a1R5#7r   )
r  rl   r  r   r   ro   rp   rF  r@   r   r   s     @rK   r  RepNCSPELAN4.forward_split  sg    !""DFFDFF#3Q78	8DHHdhh#788xx		!Q((rM   r   rl   ro   rp   rF  r   )
rG   rZ   rr   rZ   rU  rZ   r  rZ   r   rZ   r[   r  rf   s   @rK   r#   r#     s    1 1$)) )rM   r#   c                  0   ^  \ rS rSrSrSU 4S jjrSrU =r$ )r$   i  z!ELAN1 module with 4 convolutions.c                   > [         TU ]  XX45        US-  U l        [        XSS5      U l        [        US-  USS5      U l        [        XDSS5      U l        [        USU-  -   USS5      U l        g)z
Initialize ELAN1 layer.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    c3 (int): Intermediate channels.
    c4 (int): Intermediate channels for convolutions.
rP   r   rj   N)r:   r;   r   r   rl   ro   rp   rF  )rH   rG   rr   rU  r  rJ   s        rK   r;   ELAN1.__init__  so     	(q1%aQ*1%a"fr1a0rM   r  )rG   rZ   rr   rZ   rU  rZ   r  rZ   r  rf   s   @rK   r$   r$     s    +1 1rM   r$   c                  :   ^  \ rS rSrSrSU 4S jjrSS jrSrU =r$ )r&   i  zAConv.c                H   > [         TU ]  5         [        XSSS5      U l        g)z^
Initialize AConv module.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
rj   rP   r   N)r:   r;   r   rl   rH   rG   rr   rJ   s      rK   r;   AConv.__init__  s"     	1a(rM   c                    [         R                  R                  R                  USSSSS5      nU R	                  U5      $ )z!Forward pass through AConv layer.rP   r   r   FT)r@   r<   
functional
avg_pool2drl   ru   s     rK   rW   AConv.forward  s4    HH**1aAudCxx{rM   r  rG   rZ   rr   rZ   r[   r^   rf   s   @rK   r&   r&     s    	) rM   r&   c                  :   ^  \ rS rSrSrSU 4S jjrSS jrSrU =r$ )r%   i  zADown.c                   > [         TU ]  5         US-  U l        [        US-  U R                  SSS5      U l        [        US-  U R                  SSS5      U l        g)z^
Initialize ADown module.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
rP   rj   r   r   N)r:   r;   r   r   rl   ro   r  s      rK   r;   ADown.__init__  sS     	qaAq1aAq1rM   c                R   [         R                  R                  R                  USSSSS5      nUR	                  SS5      u  p#U R                  U5      n[         R                  R                  R                  USSS5      nU R                  U5      n[         R                  " X#4S5      $ )z!Forward pass through ADown layer.rP   r   r   FTrj   )	r@   r<   r  r  r   rl   
max_pool2dro   r   )rH   rI   r   r   s       rK   rW   ADown.forward  s    HH**1aAudCAXXb\XX  ++B1a8XXb\yy"1%%rM   )r   rl   ro   r  r[   r^   rf   s   @rK   r%   r%     s    2& &rM   r%   c                  >   ^  \ rS rSrSrSSU 4S jjjrSS jrSrU =r$ )	r'   i  z	SPP-ELAN.c                2  > [         TU ]  5         X0l        [        XSS5      U l        [
        R                  " USUS-  S9U l        [
        R                  " USUS-  S9U l        [
        R                  " USUS-  S9U l	        [        SU-  USS5      U l
        g)z
Initialize SPP-ELAN block.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    c3 (int): Intermediate channels.
    k (int): Kernel size for max pooling.
r   rP   r   rO   N)r:   r;   r   r   rl   r<   r   ro   rp   rF  cv5)rH   rG   rr   rU  rk   rJ   s        rK   r;   SPPELAN.__init__  s     	1%<<AaaH<<AaaH<<AaaHBAq)rM   c                   ^ U R                  U5      /mTR                  U4S jU R                  U R                  U R                  4 5       5        U R                  [        R                  " TS5      5      $ )z#Forward pass through SPPELAN layer.c              3  8   >#    U  H  o" TS    5      v   M     g7fr   r   r   s     rK   r   "SPPELAN.forward.<locals>.<genexpr>  s     B#Aa1R5#Ar   r   )rl   r   ro   rp   rF  r  r@   r   r   s     @rK   rW   SPPELAN.forward  sP    XXa[M	BDHHdhh#ABBxx		!Q((rM   )r   rl   ro   rp   rF  r  r   )rG   rZ   rr   rZ   rU  rZ   rk   rZ   r[   r^   rf   s   @rK   r'   r'     s    * *$) )rM   r'   c                  >   ^  \ rS rSrSrSSU 4S jjjrSS jrSrU =r$ )	r)   i  z	CBLinear.c           
        > [         TU ]  5         X l        [        R                  " U[        U5      X4[        X55      USS9U l        g)z
Initialize CBLinear module.

Args:
    c1 (int): Input channels.
    c2s (list[int]): List of output channel sizes.
    k (int): Kernel size.
    s (int): Stride.
    p (int | None): Padding.
    g (int): Groups.
T)groupsr8   N)r:   r;   c2sr<   r=   sumr   r?   )rH   rG   r  rk   r1  rT  r   rJ   s          rK   r;   CBLinear.__init__  s8     	IIb#c(A'!-PTU	rM   c                T    U R                  U5      R                  U R                  SS9$ )z$Forward pass through CBLinear layer.r   r   )r?   r  r  ru   s     rK   rW   CBLinear.forward  s$    yy|!!$((!22rM   )r  r?   )r   r   Nr   )rG   rZ   r  	list[int]rk   rZ   r1  rZ   rT  z
int | Noner   rZ   )rI   r\   r]   r  r^   rf   s   @rK   r)   r)     s    V V 3 3rM   r)   c                  :   ^  \ rS rSrSrSU 4S jjrSS jrSrU =r$ )r(   i  zCBFuse.c                .   > [         TU ]  5         Xl        g)zV
Initialize CBFuse module.

Args:
    idx (list[int]): Indices for feature selection.
N)r:   r;   idx)rH   r  rJ   s     rK   r;   CBFuse.__init__  s     	rM   c           
        US   R                   SS n[        USS 5       VVs/ s H*  u  p4[        R                  " X@R                  U      USS9PM,     nnn[
        R                  " [
        R                  " XQSS -   5      SS9$ s  snnf )z
Forward pass through CBFuse layer.

Args:
    xs (list[torch.Tensor]): List of input tensors.

Returns:
    (torch.Tensor): Fused output tensor.
r   rP   Nnearest)sizemoder   r   )rQ   	enumerater   interpolater  r@   r  stack)rH   xstarget_sizer   rI   ress         rK   rW   CBFuse.forward'  s     fll12&[deghkikel[mn[mSWSTq}}Qxx{^+IN[mnyySbc7]3;; os   1B)r  )r  r   )r  r  r]   r\   r^   rf   s   @rK   r(   r(     s    < <rM   r(   c                  >   ^  \ rS rSrSrSSU 4S jjjrSS jrSrU =r$ )	C3fi6  r   c                  >^^^ [         TU ]  5         [        X&-  5      m[        UTSS5      U l        [        UTSS5      U l        [        SU-   T-  US5      U l        [        R                  " UUU4S j[        U5       5       5      U l
        g)a&  
Initialize CSP bottleneck layer with two convolutions.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    n (int): Number of Bottleneck blocks.
    shortcut (bool): Whether to use shortcut connections.
    g (int): Groups for convolutions.
    e (float): Expansion ratio.
r   rP   c           
   3  @   >#    U  H  n[        TTTTS SS9v   M     g7fr   r  r  s     rK   r   C3f.__init__.<locals>.<genexpr>J  s%     lck^_z"b(AAQUXYckr  N)r:   r;   rZ   r   rl   ro   rp   r<   r   r   r   r  s	       `` @rK   r;   C3f.__init__9  st     	[B1%B1%Q"b!,lchijckllrM   c                   ^ U R                  U5      U R                  U5      /mTR                  U4S jU R                   5       5        U R	                  [
        R                  " TS5      5      $ )zForward pass through C3f layer.c              3  8   >#    U  H  o" TS    5      v   M     g7fr   r   r   s     rK   r   C3f.forward.<locals>.<genexpr>O  r   r   r   )ro   rl   r   r   rp   r@   r   r   s     @rK   rW   C3f.forwardL  sL    XXa[$((1+&	*466**xx		!Q((rM   r  r
  r   r[   r^   rf   s   @rK   r  r  6  s    Fm m&) )rM   r  c                  P   ^  \ rS rSrSr S             SU 4S jjjrSrU =r$ )r*   iS  r   c                   >^ ^^^ [         TT ]  XUTTU5        [        R                  " UUU U4S j[	        U5       5       5      T l        g)a'  
Initialize C3k2 module.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    n (int): Number of blocks.
    c3k (bool): Whether to use C3k blocks.
    e (float): Expansion ratio.
    g (int): Groups for convolutions.
    shortcut (bool): Whether to use shortcut connections.
c              3     >#    U  HQ  nT(       a#  [        TR                  TR                  S TT5      O![        TR                  TR                  TT5      v   MS     g7f)rP   N)C3kr   r   )r   rU   c3kr   rH   r   s     rK   r    C3k2.__init__.<locals>.<genexpr>f  sG      
muhi3C8Q/JtvvtvvW_ab<ccmus   AANr:   r;   r<   r   r   r   )	rH   rG   rr   r   r  r   r   r   rJ   s	   `   ` ``rK   r;   C3k2.__init__V  s=     	Ha3 
mrstmu
 
rM   r   )r   Fr   r   T)rG   rZ   rr   rZ   r   rZ   r  r   r   rB   r   rZ   r   r   r  rf   s   @rK   r*   r*   S  sO    F mq


#&
15
BG
RU
ei
 
rM   r*   c                  4   ^  \ rS rSrSrSSU 4S jjjrSrU =r$ )r  ik  zhC3k is a CSP bottleneck module with customizable kernel sizes for feature extraction in neural networks.c                   >^^^^ [         T	U ]  XUTTU5        [        X&-  5      m[        R                  " UUUU4S j[        U5       5       6 U l        g)a   
Initialize C3k module.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    n (int): Number of Bottleneck blocks.
    shortcut (bool): Whether to use shortcut connections.
    g (int): Groups for convolutions.
    e (float): Expansion ratio.
    k (int): Kernel size.
c           
   3  D   >#    U  H  n[        TTTTTT4S S9v   M     g7f)r   r   Nr  )r   rU   rq   r   rk   r   s     rK   r   C3k.__init__.<locals>.<genexpr>~  s'      d[cVWBHaAq6S!Q[cs    Nr-  )
rH   rG   rr   r   r   r   r   rk   rq   rJ   s
       `` `@rK   r;   C3k.__init__n  sB     	Ha3[ d[`ab[c derM   r   )r   Tr   r   rj   )rG   rZ   rr   rZ   r   rZ   r   r   r   rZ   r   rB   rk   rZ   r  rf   s   @rK   r  r  k  s    rf frM   r  c                  r   ^  \ rS rSrSrSU 4S jjrS	S jrS	S jr\R                  " 5       S 5       r
SrU =r$ )
r-   i  zfRepVGGDW is a class that represents a depth wise separable convolutional block in RepVGG architecture.c           
        > [         TU ]  5         [        XSSSUSS9U l        [        XSSSUSS9U l        Xl        [        R                  " 5       U l        g)zM
Initialize RepVGGDW module.

Args:
    ed (int): Input and output channels.
r\  r   rj   Fr   r}   N)	r:   r;   r   r?   conv1r   r<   rI  r}   )rH   edrJ   s     rK   r;   RepVGGDW.__init__  sN     	AqBE:	"!QRU;
779rM   c                f    U R                  U R                  U5      U R                  U5      -   5      $ )z
Perform a forward pass of the RepVGGDW block.

Args:
    x (torch.Tensor): Input tensor.

Returns:
    (torch.Tensor): Output tensor after applying the depth wise separable convolution.
)r}   r?   r*  ru   s     rK   rW   RepVGGDW.forward  s(     xx		!tzz!}455rM   c                B    U R                  U R                  U5      5      $ )z
Perform a forward pass of the RepVGGDW block without fusing the convolutions.

Args:
    x (torch.Tensor): Input tensor.

Returns:
    (torch.Tensor): Output tensor after applying the depth wise separable convolution.
)r}   r?   ru   s     rK   r  RepVGGDW.forward_fuse  s     xx		!%%rM   c                @   [        U R                  R                  U R                  R                  5      n[        U R                  R                  U R                  R                  5      nUR                  nUR
                  nUR                  nUR
                  n[        R                  R                  R                  U/ SQ5      nX5-   nXF-   nUR                  R                  R                  U5        UR
                  R                  R                  U5        Xl        U ?g)z
Fuse the convolutional layers in the RepVGGDW block.

This method fuses the convolutional layers and updates the weights and biases accordingly.
)rP   rP   rP   rP   N)r   r?   rH  r*  rE   r8   r@   r<   r  r   rF   copy_)	rH   r?   r*  conv_wconv_bconv1_wconv1_bfinal_conv_wfinal_conv_bs	            rK   r  RepVGGDW.fuse  s      				= $**--@,,**((%%))'<@''|,		\*	JrM   )r}   r?   r*  r   )r+  rZ   r]   Noner[   )r_   r`   ra   rb   rc   r;   rW   r  r@   no_gradr  rd   re   rf   s   @rK   r-   r-     s/    p
6
& ]]_ rM   r-   c                  >   ^  \ rS rSrSrSSU 4S jjjrSS jrSrU =r$ )	r.   i  a  
Conditional Identity Block (CIB) module.

Args:
    c1 (int): Number of input channels.
    c2 (int): Number of output channels.
    shortcut (bool, optional): Whether to add a shortcut connection. Defaults to True.
    e (float, optional): Scaling factor for the hidden channels. Defaults to 0.5.
    lk (bool, optional): Whether to use RepVGGDW for the third convolutional layer. Defaults to False.
c                J  > [         TU ]  5         [        X$-  5      n[        R                  " [        XSUS9[        USU-  S5      U(       a  [        SU-  5      O[        SU-  SU-  SSU-  S9[        SU-  US5      [        X"SUS95      U l        U=(       a    X:H  U l        g)z
Initialize the CIB module.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    shortcut (bool): Whether to use shortcut connection.
    e (float): Expansion ratio.
    lk (bool): Whether to use RepVGGDW.
rj   r9  rP   r   N)	r:   r;   rZ   r<   r   r   r-   rl   r   )rH   rG   rr   r   r   lkrq   rJ   s          rK   r;   CIB.__init__  s     	[==b!QVQ "HQVQVQVQ!b&(IRQb!
 (rM   c                l    U R                   (       a  XR                  U5      -   $ U R                  U5      $ )zy
Forward pass of the CIB module.

Args:
    x (torch.Tensor): Input tensor.

Returns:
    (torch.Tensor): Output tensor.
r   rl   ru   s     rK   rW   CIB.forward  s'     #'((q88A;;;rM   rA  )Tr   F)
rG   rZ   rr   rZ   r   r   r   rB   r>  r   r[   r^   rf   s   @rK   r.   r.     s    	) ).
< 
<rM   r.   c                  P   ^  \ rS rSrSr S             SU 4S jjjrSrU =r$ )r/   i  a)  
C2fCIB class represents a convolutional block with C2f and CIB modules.

Args:
    c1 (int): Number of input channels.
    c2 (int): Number of output channels.
    n (int, optional): Number of CIB modules to stack. Defaults to 1.
    shortcut (bool, optional): Whether to use shortcut connection. Defaults to False.
    lk (bool, optional): Whether to use local key connection. Defaults to False.
    g (int, optional): Number of groups for grouped convolution. Defaults to 1.
    e (float, optional): Expansion ratio for CIB modules. Defaults to 0.5.
c                   >^ ^^ [         TT ]  XUTXg5        [        R                  " UU U4S j[	        U5       5       5      T l        g)a6  
Initialize C2fCIB module.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    n (int): Number of CIB modules.
    shortcut (bool): Whether to use shortcut connection.
    lk (bool): Whether to use local key connection.
    g (int): Groups for convolutions.
    e (float): Expansion ratio.
c           	   3  f   >#    U  H&  n[        TR                  TR                  TS TS9v   M(     g7f)r   )r   r>  N)r.   r   )r   rU   r>  rH   r   s     rK   r   "C2fCIB.__init__.<locals>.<genexpr>  s(     ]T\qs4664668srJT\s   .1Nr   )	rH   rG   rr   r   r   r>  r   r   rJ   s	   `   ``  rK   r;   C2fCIB.__init__  s5     	Ha3]TYZ[T\]]rM   r   )r   FFr   r   )rG   rZ   rr   rZ   r   rZ   r   r   r>  r   r   rZ   r   rB   r  rf   s   @rK   r/   r/     sZ     nq^^^#&^6:^HL^Y\^ej^ ^rM   r/   c                  >   ^  \ rS rSrSrSSU 4S jjjrSS jrSrU =r$ )	r0   i  a  
Attention module that performs self-attention on the input tensor.

Args:
    dim (int): The input tensor dimension.
    num_heads (int): The number of attention heads.
    attn_ratio (float): The ratio of the attention key dimension to the head dimension.

Attributes:
    num_heads (int): The number of attention heads.
    head_dim (int): The dimension of each attention head.
    key_dim (int): The dimension of the attention key.
    scale (float): The scaling factor for the attention scores.
    qkv (Conv): Convolutional layer for computing the query, key, and value.
    proj (Conv): Convolutional layer for projecting the attended values.
    pe (Conv): Convolutional layer for positional encoding.
c           	     2  > [         TU ]  5         X l        X-  U l        [	        U R                  U-  5      U l        U R
                  S-  U l        U R
                  U-  nXS-  -   n[        XSSS9U l        [        XSSS9U l	        [        XSSUSS9U l
        g)	z
Initialize multi-head attention module.

Args:
    dim (int): Input dimension.
    num_heads (int): Number of attention heads.
    attn_ratio (float): Attention ratio for key dimension.
      rP   r   Fr|   rj   r)  N)r:   r;   	num_headshead_dimrZ   key_dimrm  r   qkvr  pe)rH   r   rK  
attn_rationh_kdrw  rJ   s         rK   r;   Attention.__init__(  s     	"(4==:56\\4'
y(!)Ou-1%0	sA%8rM   c           	     0   UR                   u  p#pEXE-  nU R                  U5      nUR                  X R                  U R                  S-  U R
                  -   U5      R                  U R                  U R                  U R
                  /SS9u  pn
UR                  SS5      U	-  U R                  -  nUR                  SS9nXR                  SS5      -  R                  X#XE5      U R                  U
R                  X#XE5      5      -   nU R                  U5      nU$ )z
Forward pass of the Attention module.

Args:
    x (torch.Tensor): The input tensor.

Returns:
    (torch.Tensor): The output tensor after self-attention.
rP   r   r   )rQ   rN  rD   rK  rM  rL  r  rR   rm  rS   rO  r  r  )rH   rI   BCHWNrN  r  rk   r  r  s               rK   rW   Attention.forward<  s     WW
aEhhqk((1nndllQ.>.NPQRXX\\4<<7Q Y 
a B#a'4::5|||#B''--aA9DGGAIIaTUDY<ZZIIaLrM   )rL  rM  rK  rO  r  rN  rm  )r  r   )r   rZ   rK  rZ   rP  rB   r[   r^   rf   s   @rK   r0   r0     s    $9 9( rM   r0   c                  >   ^  \ rS rSrSrSSU 4S jjjrSS jrSrU =r$ )	PSABlockiT  a  
PSABlock class implementing a Position-Sensitive Attention block for neural networks.

This class encapsulates the functionality for applying multi-head attention and feed-forward neural network layers
with optional shortcut connections.

Attributes:
    attn (Attention): Multi-head attention module.
    ffn (nn.Sequential): Feed-forward neural network module.
    add (bool): Flag indicating whether to add shortcut connections.

Methods:
    forward: Performs a forward pass through the PSABlock, applying attention and feed-forward layers.

Examples:
    Create a PSABlock and perform a forward pass
    >>> psablock = PSABlock(c=128, attn_ratio=0.5, num_heads=4, shortcut=True)
    >>> input_tensor = torch.randn(1, 128, 32, 32)
    >>> output_tensor = psablock(input_tensor)
c           
        > [         TU ]  5         [        XUS9U l        [        R
                  " [        XS-  S5      [        US-  USSS95      U l        X@l        g)z
Initialize the PSABlock.

Args:
    c (int): Input and output channels.
    attn_ratio (float): Attention ratio for key dimension.
    num_heads (int): Number of attention heads.
    shortcut (bool): Whether to use shortcut connections.
rP  rK  rP   r   Fr|   N)	r:   r;   r0   r  r<   r   r   ffnr   )rH   r   rP  rK  r   rJ   s        rK   r;   PSABlock.__init__j  sO     	a)L	==aQ!2DQ1%4PQrM   c                    U R                   (       a  XR                  U5      -   OU R                  U5      nU R                   (       a  XR                  U5      -   nU$ U R                  U5      nU$ )z
Execute a forward pass through PSABlock.

Args:
    x (torch.Tensor): Input tensor.

Returns:
    (torch.Tensor): Output tensor after attention and feed-forward processing.
r   r  r_  ru   s     rK   rW   PSABlock.forwardz  sR     !%A		!diil#xxAO .2XXa[rM   rb  )r   rO   T)
r   rZ   rP  rB   rK  rZ   r   r   r]   r:  r[   r^   rf   s   @rK   r\  r\  T  s    *   rM   r\  c                  >   ^  \ rS rSrSrSSU 4S jjjrSS jrSrU =r$ )	r1   i  a  
PSA class for implementing Position-Sensitive Attention in neural networks.

This class encapsulates the functionality for applying position-sensitive attention and feed-forward networks to
input tensors, enhancing feature extraction and processing capabilities.

Attributes:
    c (int): Number of hidden channels after applying the initial convolution.
    cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
    cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
    attn (Attention): Attention module for position-sensitive attention.
    ffn (nn.Sequential): Feed-forward network for further processing.

Methods:
    forward: Applies position-sensitive attention and feed-forward network to the input tensor.

Examples:
    Create a PSA module and apply it to an input tensor
    >>> psa = PSA(c1=128, c2=128, e=0.5)
    >>> input_tensor = torch.randn(1, 128, 64, 64)
    >>> output_tensor = psa.forward(input_tensor)
c           
       > [         TU ]  5         X:X  d   e[        X-  5      U l        [	        USU R                  -  SS5      U l        [	        SU R                  -  US5      U l        [        U R                  SU R                  S-  S9U l        [        R                  " [	        U R                  U R                  S-  S5      [	        U R                  S-  U R                  SSS95      U l        g)	z|
Initialize PSA module.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    e (float): Expansion ratio.
rP   r   r   @   r^  Fr|   N)r:   r;   rZ   r   r   rl   ro   r0   r  r<   r   r_  )rH   rG   rr   r   rJ   s       rK   r;   PSA.__init__  s     	xxRVAJ1-DFF
B*dff"M	==dffdffqj!!<d466A:tvvWX^c>derM   c                
   U R                  U5      R                  U R                  U R                  4SS9u  p#X0R                  U5      -   nX0R	                  U5      -   nU R                  [        R                  " X#4S5      5      $ )z
Execute forward pass in PSA module.

Args:
    x (torch.Tensor): Input tensor.

Returns:
    (torch.Tensor): Output tensor after attention and feed-forward processing.
r   r   )rl   r  r   r  r_  ro   r@   r   r   s       rK   rW   PSA.forward  sk     xx{  $&&$&&!1q 9		!Oxx		1&!,--rM   )r  r   rl   ro   r_  )r   )rG   rZ   rr   rZ   r   rB   r[   r^   rf   s   @rK   r1   r1     s    .f f$. .rM   r1   c                  >   ^  \ rS rSrSrSSU 4S jjjrSS jrSrU =r$ )	r,   i  a  
C2PSA module with attention mechanism for enhanced feature extraction and processing.

This module implements a convolutional block with attention mechanisms to enhance feature extraction and processing
capabilities. It includes a series of PSABlock modules for self-attention and feed-forward operations.

Attributes:
    c (int): Number of hidden channels.
    cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
    cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
    m (nn.Sequential): Sequential container of PSABlock modules for attention and feed-forward operations.

Methods:
    forward: Performs a forward pass through the C2PSA module, applying attention and feed-forward operations.

Notes:
    This module essentially is the same as PSA module, but refactored to allow stacking more PSABlock modules.

Examples:
    >>> c2psa = C2PSA(c1=256, c2=256, n=3, e=0.5)
    >>> input_tensor = torch.randn(1, 256, 64, 64)
    >>> output_tensor = c2psa(input_tensor)
c                *  >^  [         TT ]  5         X:X  d   e[        X-  5      T l        [	        UST R                  -  SS5      T l        [	        ST R                  -  US5      T l        [        R                  " U 4S j[        U5       5       6 T l
        g)z
Initialize C2PSA module.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    n (int): Number of PSABlock modules.
    e (float): Expansion ratio.
rP   r   c              3  h   >#    U  H'  n[        TR                  S TR                  S-  S9v   M)     g7fr   rf  r^  Nr\  r   r   rU   rH   s     rK   r   !C2PSA.__init__.<locals>.<genexpr>  s*      lck^_$&&SDFFVXL!Yckr   Nr   rH   rG   rr   r   r   rJ   s   `    rK   r;   C2PSA.__init__  su     	xxRVAJ1-DFF
B* lchijck lmrM   c                    U R                  U5      R                  U R                  U R                  4SS9u  p#U R                  U5      nU R	                  [
        R                  " X#4S5      5      $ )z
Process the input tensor through a series of PSA blocks.

Args:
    x (torch.Tensor): Input tensor.

Returns:
    (torch.Tensor): Output tensor after processing.
r   r   )rl   r  r   r   ro   r@   r   r   s       rK   rW   C2PSA.forward  sY     xx{  $&&$&&!1q 9FF1Ixx		1&!,--rM   r   r   r   r%  r[   r^   rf   s   @rK   r,   r,     s    0n n$. .rM   r,   c                  4   ^  \ rS rSrSrSSU 4S jjjrSrU =r$ )r+   i  aW  
C2fPSA module with enhanced feature extraction using PSA blocks.

This class extends the C2f module by incorporating PSA blocks for improved attention mechanisms and feature extraction.

Attributes:
    c (int): Number of hidden channels.
    cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
    cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
    m (nn.ModuleList): List of PSA blocks for feature extraction.

Methods:
    forward: Performs a forward pass through the C2fPSA module.
    forward_split: Performs a forward pass using split() instead of chunk().

Examples:
    >>> import torch
    >>> from ultralytics.models.common import C2fPSA
    >>> model = C2fPSA(c1=64, c2=64, n=3, e=0.5)
    >>> x = torch.randn(1, 64, 128, 128)
    >>> output = model(x)
    >>> print(output.shape)
c                   >^  X:X  d   e[         TT ]  XX4S9  [        R                  " U 4S j[	        U5       5       5      T l        g)z
Initialize C2fPSA module.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    n (int): Number of PSABlock modules.
    e (float): Expansion ratio.
)r   r   c              3  h   >#    U  H'  n[        TR                  S TR                  S-  S9v   M)     g7frm  rn  ro  s     rK   r   "C2fPSA.__init__.<locals>.<genexpr>"  s*     jai\]x3$&&TV,Wair   Nr   rq  s   `    rK   r;   C2fPSA.__init__  s=     xx1*jafghaijjrM   r   ru  r%  r  rf   s   @rK   r+   r+     s    0k krM   r+   c                  :   ^  \ rS rSrSrSU 4S jjrSS jrSrU =r$ )r2   i%  a  
SCDown module for downsampling with separable convolutions.

This module performs downsampling using a combination of pointwise and depthwise convolutions, which helps in
efficiently reducing the spatial dimensions of the input tensor while maintaining the channel information.

Attributes:
    cv1 (Conv): Pointwise convolution layer that reduces the number of channels.
    cv2 (Conv): Depthwise convolution layer that performs spatial downsampling.

Methods:
    forward: Applies the SCDown module to the input tensor.

Examples:
    >>> import torch
    >>> from ultralytics import SCDown
    >>> model = SCDown(c1=64, c2=128, k=3, s=2)
    >>> x = torch.randn(1, 64, 128, 128)
    >>> y = model(x)
    >>> print(y.shape)
    torch.Size([1, 128, 64, 64])
c           	     h   > [         TU ]  5         [        XSS5      U l        [        X"X4USS9U l        g)z
Initialize SCDown module.

Args:
    c1 (int): Input channels.
    c2 (int): Output channels.
    k (int): Kernel size.
    s (int): Stride.
r   F)rk   r1  r   r}   N)r:   r;   r   rl   ro   )rH   rG   rr   rk   r1  rJ   s        rK   r;   SCDown.__init__=  s2     	1%!BE:rM   c                B    U R                  U R                  U5      5      $ )z
Apply convolution and downsampling to the input tensor.

Args:
    x (torch.Tensor): Input tensor.

Returns:
    (torch.Tensor): Downsampled output tensor.
)ro   rl   ru   s     rK   rW   SCDown.forwardK  s     xx$$rM   )rl   ro   r6  r[   r^   rf   s   @rK   r2   r2   %  s    .;
% 
%rM   r2   c                  R   ^  \ rS rSrSr S         SU 4S jjjrSS jrSrU =r$ )	r3   iX  a.  
TorchVision module to allow loading any torchvision model.

This class provides a way to load a model from the torchvision library, optionally load pre-trained weights, and customize the model by truncating or unwrapping layers.

Attributes:
    m (nn.Module): The loaded torchvision model, possibly truncated and unwrapped.

Args:
    model (str): Name of the torchvision model to load.
    weights (str, optional): Pre-trained weights to load. Default is "DEFAULT".
    unwrap (bool, optional): If True, unwraps the model to a sequential containing all but the last `truncate` layers. Default is True.
    truncate (int, optional): Number of layers to truncate from the end if `unwrap` is True. Default is 2.
    split (bool, optional): Returns output from intermediate child modules as list. Default is False.
c                  > SSK n[        TU ]	  5         [        UR                  S5      (       a  UR                  R                  XS9U l        O+UR                  R                  U   " [        U5      S9U l        U(       a  [        U R                  R                  5       5      n[        US   [        R                  5      (       a#  / [        US   R                  5       5      QUSS Qn[        R                  " U(       a  USU*  OU6 U l        XPl        gSU l        [        R                  " 5       =U R                  l        U R                  l        g)a.  
Load the model and weights from torchvision.

Args:
    model (str): Name of the torchvision model to load.
    weights (str): Pre-trained weights to load.
    unwrap (bool): Whether to unwrap the model.
    truncate (int): Number of layers to truncate.
    split (bool): Whether to split the output.
r   N	get_model)weights)
pretrainedr   F)torchvisionr:   r;   hasattrmodelsr  r   __dict__r   r  children
isinstancer<   r   r  r!  headheads)	rH   modelr  unwraptruncater  r  layersrJ   s	           rK   r;   TorchVision.__init__i  s     	;%%{33 ''11%1IDF ''0074=QDF$&&//+,F&)R]]33C4q	 2 2 45Cqr
C]]8VJhY%7QDFJDJ)+6DFFK$&&,rM   c                   ^ U R                   (       a*  U/mTR                  U4S jU R                   5       5        T$ U R                  U5      mT$ )z
Forward pass through the model.

Args:
    x (torch.Tensor): Input tensor.

Returns:
    (torch.Tensor | list[torch.Tensor]): Output tensor or list of tensors.
c              3  8   >#    U  H  o" TS    5      v   M     g7fr   r   r   s     rK   r   &TorchVision.forward.<locals>.<genexpr>  s     .v!QquXXvr   )r  r   r   r   s     @rK   rW   TorchVision.forward  sD     ::AHH.tvv..  q	ArM   )r   r  )DEFAULTTrP   F)
r  strr  r  r  r   r  rZ   r  r   r[   r^   rf   s   @rK   r3   r3   X  sK    " kp77#&7<@7SV7cg7 7< rM   r3   c                  >   ^  \ rS rSrSrSSU 4S jjjrSS jrSrU =r$ )	AAttni  a  
Area-attention module for YOLO models, providing efficient attention mechanisms.

This module implements an area-based attention mechanism that processes input features in a spatially-aware manner,
making it particularly effective for object detection tasks.

Attributes:
    area (int): Number of areas the feature map is divided.
    num_heads (int): Number of heads into which the attention mechanism is divided.
    head_dim (int): Dimension of each attention head.
    qkv (Conv): Convolution layer for computing query, key and value tensors.
    proj (Conv): Projection convolution layer.
    pe (Conv): Position encoding convolution layer.

Methods:
    forward: Applies area-attention to input tensor.

Examples:
    >>> attn = AAttn(dim=256, num_heads=8, area=4)
    >>> x = torch.randn(1, 256, 32, 32)
    >>> output = attn(x)
    >>> print(output.shape)
    torch.Size([1, 256, 32, 32])
c           
        > [         TU ]  5         X0l        X l        X-  =U l        nX@R                  -  n[        XS-  SSS9U l        [        XQSSS9U l        [        XQSSSUSS9U l        g)z
Initialize an Area-attention module for YOLO models.

Args:
    dim (int): Number of hidden channels.
    num_heads (int): Number of heads into which the attention mechanism is divided.
    area (int): Number of areas the feature map is divided.
rj   r   Fr|   r\  r)  N)	r:   r;   arearK  rL  r   rN  r  rO  )rH   r   rK  r  rL  all_head_dimrJ   s         rK   r;   AAttn.__init__  sp     		"#&#33..0A-qe<A59	|!QSeDrM   c                   UR                   u  p#pEXE-  nU R                  U5      R                  S5      R                  SS5      nU R                  S:  a=  UR                  X R                  -  X`R                  -  US-  5      nUR                   u  p&nUR                  X&U R                  U R                  S-  5      R                  SSSS5      R                  U R                  U R                  U R                  /SS9u  pnU	R                  SS5      U
-  U R                  S-  -  nUR                  SS9nXR                  SS5      -  nUR                  SSSS5      nUR                  SSSS5      nU R                  S:  ae  UR                  X R                  -  X`R                  -  U5      nUR                  X R                  -  X`R                  -  U5      nUR                   u  p&nUR                  X$XS5      R                  SSSS5      R                  5       nUR                  X$XS5      R                  SSSS5      R                  5       nXR                  U5      -   nU R                  U5      $ )	z
Process the input tensor through the area-attention.

Args:
    x (torch.Tensor): Input tensor.

Returns:
    (torch.Tensor): Output tensor after area-attention.
rP   r   rj   r   r   rT  r   rJ  )rQ   rN  flattenrR   r  r  rD   rK  rL  permuter  rS   
contiguousrO  r  )rH   rI   rU  rV  rW  rX  rY  rN  rU   r  rk   r  r  s                rK   rW   AAttn.forward  s    WW
aEhhqk!!!$..q!499q=++a))mQ))^QUCCiiGA!HHQ4>>4==1+<=WQ1a UDMM4==$--@aUH 	a
 B#a'DMM4,?@|||#r2&&IIaAq!IIaAq!99q=		!yy.!ii-;A		!yy.!ii-;AggGA!IIaA!))!Q15@@BIIaA!))!Q15@@B
Nyy|rM   )r  rL  rK  rO  r  rN  r   )r   rZ   rK  rZ   r  rZ   r[   r^   rf   s   @rK   r  r    s    2E E(% %rM   r  c                  H   ^  \ rS rSrSrSSU 4S jjjrS	S jrS
S jrSrU =r	$ )ABlocki  ah  
Area-attention block module for efficient feature extraction in YOLO models.

This module implements an area-attention mechanism combined with a feed-forward network for processing feature maps.
It uses a novel area-based attention approach that is more efficient than traditional self-attention while
maintaining effectiveness.

Attributes:
    attn (AAttn): Area-attention module for processing spatial features.
    mlp (nn.Sequential): Multi-layer perceptron for feature transformation.

Methods:
    _init_weights: Initializes module weights using truncated normal distribution.
    forward: Applies area-attention and feed-forward processing to input tensor.

Examples:
    >>> block = ABlock(dim=256, num_heads=8, mlp_ratio=1.2, area=1)
    >>> x = torch.randn(1, 256, 32, 32)
    >>> output = block(x)
    >>> print(output.shape)
    torch.Size([1, 256, 32, 32])
c           
        > [         TU ]  5         [        XUS9U l        [	        X-  5      n[
        R                  " [        XS5      [        XQSSS95      U l        U R                  U R                  5        g)a-  
Initialize an Area-attention block module.

Args:
    dim (int): Number of input channels.
    num_heads (int): Number of heads into which the attention mechanism is divided.
    mlp_ratio (float): Expansion ratio for MLP hidden dimension.
    area (int): Number of areas the feature map is divided.
)rK  r  r   Fr|   N)r:   r;   r  r  rZ   r<   r   r   mlpapply_init_weights)rH   r   rK  	mlp_ratior  mlp_hidden_dimrJ   s         rK   r;   ABlock.__init__  s`     	#>	S_-==c1!=tNYZ`e?fg

4%%&rM   c                   [        U[        R                  5      (       aa  [        R                  R	                  UR
                  SS9  UR                  b+  [        R                  R                  UR                  S5        ggg)zk
Initialize weights using a truncated normal distribution.

Args:
    m (nn.Module): Module to initialize.
g{Gz?)stdNr   )r  r<   r=   inittrunc_normal_rE   r8   	constant_)rH   r   s     rK   r  ABlock._init_weights  s\     a##GG!!!((!5vv!!!!&&!, " $rM   c                N    XR                  U5      -   nXR                  U5      -   $ )z
Forward pass through ABlock.

Args:
    x (torch.Tensor): Input tensor.

Returns:
    (torch.Tensor): Output tensor after area-attention and feed-forward processing.
r  r  ru   s     rK   rW   ABlock.forward%  s$     		!88A;rM   r  )g333333?r   )r   rZ   rK  rZ   r  rB   r  rZ   )r   r   r[   )
r_   r`   ra   rb   rc   r;   r  rW   rd   re   rf   s   @rK   r  r    s!    .' '$
- rM   r  c                  t   ^  \ rS rSrSr        S                   SU 4S jjjrSS jrSrU =r$ )	A2C2fi3  a  
Area-Attention C2f module for enhanced feature extraction with area-based attention mechanisms.

This module extends the C2f architecture by incorporating area-attention and ABlock layers for improved feature
processing. It supports both area-attention and standard convolution modes.

Attributes:
    cv1 (Conv): Initial 1x1 convolution layer that reduces input channels to hidden channels.
    cv2 (Conv): Final 1x1 convolution layer that processes concatenated features.
    gamma (nn.Parameter | None): Learnable parameter for residual scaling when using area attention.
    m (nn.ModuleList): List of either ABlock or C3k modules for feature processing.

Methods:
    forward: Processes input through area-attention or standard convolution pathway.

Examples:
    >>> m = A2C2f(512, 512, n=1, a2=True, area=1)
    >>> x = torch.randn(1, 512, 32, 32)
    >>> output = m(x)
    >>> print(output.shape)
    torch.Size([1, 512, 32, 32])
c                  >^^^^	^
^ [         TU ]  5         [        X(-  5      mTS-  S:X  d   S5       e[        UTSS5      U l        [        SU-   T-  US5      U l        T(       a3  U(       a,  [        R                  " S[        R                  " U5      -  SS9OSU l
        [        R                  " UUUU	UU
4S	 j[        U5       5       5      U l        g)
a  
Initialize Area-Attention C2f module.

Args:
    c1 (int): Number of input channels.
    c2 (int): Number of output channels.
    n (int): Number of ABlock or C3k modules to stack.
    a2 (bool): Whether to use area attention blocks. If False, uses C3k blocks instead.
    area (int): Number of areas the feature map is divided.
    residual (bool): Whether to use residual connections with learnable gamma parameter.
    mlp_ratio (float): Expansion ratio for MLP hidden dimension.
    e (float): Channel expansion ratio for hidden channels.
    g (int): Number of groups for grouped convolutions.
    shortcut (bool): Whether to use shortcut connections in C3k blocks.
rx   r   z(Dimension of ABlock be a multiple of 32.r   g{Gz?Tr  Nc              3     >#    U  HC  nT(       a(  [         R                  " UUU4S  j[        S5       5       6 O[        TTSTT5      v   ME     g7f)c              3  F   >#    U  H  n[        TTS -  TT5      v   M     g7f)rx   N)r  )r   rU   r  rq   r  s     rK   r   +A2C2f.__init__.<locals>.<genexpr>.<genexpr>p  s#     T8aF2rRxDAA8r   rP   N)r<   r   r   r  )r   rU   a2r  rq   r   r  r   s     rK   r   !A2C2f.__init__.<locals>.<genexpr>o  sJ      
   MMT5QR8TURQ!,- s   AA)r:   r;   rZ   r   rl   ro   r<   rC   r@   rl  gammar   r   r   )rH   rG   rr   r   r  r  residualr  r   r   r   rq   rJ   s       `` ` ``@rK   r;   A2C2f.__init__K  s    8 	[Bw!|GGG|B1%Q"b!,PRW_R\\$B"7tLei
 
 
 1X	
 
rM   c                P  ^ U R                  U5      /mTR                  U4S jU R                   5       5        U R                  [        R
                  " TS5      5      mU R                  b:  XR                  R                  SU R                  R                  S   SS5      T-  -   $ T$ )z
Forward pass through A2C2f layer.

Args:
    x (torch.Tensor): Input tensor.

Returns:
    (torch.Tensor): Output tensor after processing.
c              3  8   >#    U  H  o" TS    5      v   M     g7fr   r   r   s     rK   r    A2C2f.forward.<locals>.<genexpr>  r   r   r   r   r   )	rl   r   r   ro   r@   r   r  rD   rQ   r   s     @rK   rW   A2C2f.forwardv  s     XXa[M	*466**HHUYYq!_%::!zzr4::+;+;A+>1EIIIrM   )rl   ro   r  r   )r   Tr   Fg       @r   r   T)rG   rZ   rr   rZ   r   rZ   r  r   r  rZ   r  r   r  rB   r   rB   r   rZ   r   r   r[   r^   rf   s   @rK   r  r  3  s    6 )
)
 )
 	)

 )
 )
 )
 )
 )
 )
 )
 )
V rM   r  c                  >   ^  \ rS rSrSrSSU 4S jjjrSS jrSrU =r$ )		SwiGLUFFNi  z@SwiGLU Feed-Forward Network for transformer-based architectures.c                   > [         TU ]  5         [        R                  " XU-  5      U l        [        R                  " X2-  S-  U5      U l        g)z
Initialize SwiGLU FFN with input dimension, output dimension, and expansion factor.

Args:
    gc (int): Guide channels.
    ec (int): Embedding channels.
    e (int): Expansion factor.
rP   N)r:   r;   r<   rh  w12w3)rH   rn  r   r   rJ   s       rK   r;   SwiGLUFFN.__init__  s<     	99RR())AFaK,rM   c                    U R                  U5      nUR                  SSS9u  p4[        R                  " U5      U-  nU R	                  U5      $ )z.Apply SwiGLU transformation to input features.rP   r   r   )r  r   r   silur  )rH   rI   x12r   r   hiddens         rK   rW   SwiGLUFFN.forward  sB    hhqk1"%bwwvrM   )r  r  )rO   )rn  rZ   r   rZ   r   rZ   r]   r:  r[   r^   rf   s   @rK   r  r    s    J- - rM   r  c                  :   ^  \ rS rSrSrSU 4S jjrSS jrSrU =r$ )Residuali  z7Residual connection wrapper for neural network modules.c                "  > [         TU ]  5         Xl        [        R                  R                  U R                  R                  R                  5        [        R                  R                  U R                  R                  R                  5        g)zx
Initialize residual module with the wrapped module.

Args:
    m (nn.Module): Module to wrap with residual connection.
N)	r:   r;   r   r<   r  zeros_r  r8   rE   )rH   r   rJ   s     rK   r;   Residual.__init__  sQ     	
tvvyy~~& 	tvvyy''(rM   c                (    XR                  U5      -   $ )z,Apply residual connection to input features.r   ru   s     rK   rW   Residual.forward  s    66!9}rM   r   )r   r   r]   r:  r[   r^   rf   s   @rK   r  r    s    A) rM   r  c                  :   ^  \ rS rSrSrSU 4S jjrSS jrSrU =r$ )SAVPEi  zESpatial-Aware Visual Prompt Embedding module for feature enhancement.c           
       >^ [         TU ]  5         [        R                  " U4S j[	        U5       5       5      U l        [        R                  " U4S j[	        U5       5       5      U l        SU l        [        R                  " ST-  US5      U l	        [        R                  " ST-  U R                  SSS9U l
        [        R                  " SU R                  SSS9U l        [        R                  " [        SU R                  -  U R                  S5      [        R                  " U R                  U R                  SSS95      U l        g)	z
Initialize SAVPE module with channels, intermediate channels, and embedding dimension.

Args:
    ch (list[int]): List of input channel dimensions.
    c3 (int): Intermediate channels.
    embed (int): Embedding dimension.
c           	   3     >#    U  He  u  p[         R                  " [        UTS 5      [        TTS 5      US;   a  [         R                  " US-  S9O[         R                  " 5       5      v   Mg     g7f)rj      r   rP   rP   scale_factorNr<   r   r   Upsampler!  r   r   rI   rU  s      rK   r   !SAVPE.__init__.<locals>.<genexpr>  se      !
 & MMQARQTUY_T_!a%1Pegepeper  &s   A-A0c              3     >#    U  HY  u  p[         R                  " [        UTS 5      US;   a  [         R                  " US-  S9O[         R                  " 5       5      v   M[     g7f)r   r  rP   r  Nr  r  s      rK   r   r    sP      !
% MM$q"a.QRX["++1q5*I^`^i^i^kll%s   A!A$rY   rj   r   )r   rP   N)r:   r;   r<   r   r	  rl   ro   r   r=   rp   rF  r  r   r   cv6)rH   r  rU  ry  rJ   s     ` rK   r;   SAVPE.__init__  s     	== !
 ""	!
 
 == !
!"!
 

 99QVUA.99QVTVVQ:99Q15==a$&&j$&&!!<biiPTPVPVXYcd>efrM   c                n   [        U5       VVs/ s H  u  p4U R                  U   " U5      PM     nnnU R                  [        R                  " USS95      n[        U5       VVs/ s H  u  p4U R
                  U   " U5      PM     nnnU R                  [        R                  " USS95      nUR                  u  pgpUR                  S   n
UR                  XgS5      nUR                  USU R                  X5      R                  SU
SSS5      R                  Xj-  U R                  X5      nUR                  XjSX5      R                  Xj-  SX5      nU R                  [        R                  " XPR                  U5      4SS95      nUR                  XjU R                  S5      nUR                  XjSS5      nXR-  [        R                  " U5      [        R                  " UR                   5      R"                  -  -   n[$        R&                  " USS9R)                  UR                   5      nUR+                  SS5      UR                  X`R                  XpR                  -  S5      R+                  SS5      -  n[$        R,                  " UR+                  SS5      R                  XjS5      SSS9$ s  snnf s  snnf )zJProcess input features and visual prompts to generate enhanced embeddings.r   r   r   rT  rP   r  )r	  ro   rF  r@   r   rl   rp   rQ   rD   r  r   expandr  r  logical_notfinfor9   minr   rS   torR   r  )rH   rI   vpr   xir   rU  rV  rW  rX  Qscore
aggregateds                rK   rW   SAVPE.forward  s   *3A,7,TXXa[_,7HHUYYqa()*3A,7,TXXa[_,7HHUYYqa()WW
aHHQKFF1IIaDFFA)00QBCKKAESWSYSY[\`ZZa&..qua>HHUYY88B<0a89IIaDFFB'ZZa$**2.QWW1E1I1III		%R(++AGG4__R,qyyFFAKQS/T/^/^_ace/ff
{{://B7??bIrUVWW1 8 8s   !J+%!J1)r   rl   ro   rp   rF  r  r  )r  r   rU  rZ   ry  rZ   )rI   r  r  r\   r]   r\   r^   rf   s   @rK   r  r    s    Og8X XrM   r  )Jrc   
__future__r   r@   torch.nnr<   torch.nn.functionalr  r   ultralytics.utils.torch_utilsr   r?   r   r   r   r	   r
   r   transformerr   __all__Moduler   r    r   r   r   r   r   r   r   r   r   r!   r   r   r   r   r   rP  r"   rd  r   r   r   r   r  r  r#   r$   r&   r%   r'   r)   r(   r  r*   r  r-   r.   r/   r0   r\  r1   r,   r+   r2   r3   r  r  r  r  r  r  r   rM   rK   <module>r     s    "     : F F )(V\")) \6>BII >.#RYY #L+(bii +(\D")) D0)299 )8 ,6 68 )"))  )FJ J6x" x(;BII ;202 0(Mb M(/bii /:P P8CBII C@J")) J2")) @3$")) 3$lB)bii B)J@%ryy @%F6bii 6606		 06f,J ,,_R _()299 )D1L 1*BII (&BII &4)bii )83ryy 30<RYY <8)")) ):
3 
0f" f,@uxx @F-<")) -<`^S ^B<		 <~2ryy 2j7.")) 7.t7.BII 7.t%kS %kP0%RYY 0%f>")) >BSBII SlARYY AHRBII Rj		 0ryy ,9XBII 9XrM   